Scraping Reddit's r/ProgrammerHumor with Node.js using Puppeteer

Solutions on MaxInterview for scraping Reddit's r/ProgrammerHumor with Node.js using Puppeteer, by the best coders in the world

showing results for - "scraping reddit programmer humor with node js using puppeteer"
Cyril
16 Aug 2018
/*
    This code comes from Vincent Lab
    And it has a video version linked here: https://www.youtube.com/watch?v=Zb--639XePw
*/
5
6// Import dependencies
7const puppeteer = require("puppeteer");
8const fs = require("fs");
9
/**
 * Entry point: scrape r/ProgrammerHumor.
 *
 * Opens the subreddit in a Puppeteer-controlled browser, repeatedly scrolls
 * while collecting post elements, reads each post's title and image URL,
 * de-duplicates by title and writes the result to "jokes.json" as an array
 * of `{ title, image }` objects.
 *
 * The browser is always closed, even when a step fails, and any failure is
 * reported on stderr with a non-zero exit code instead of surfacing as an
 * unhandled promise rejection.
 */
(async () => {

    // Number of scroll-and-collect passes (NOT the number of posts).
    // Per the original author, 25 passes yields roughly 100 posts.
    const amount = 25;

    // The location / URL
    const url = "https://www.reddit.com/r/ProgrammerHumor/";

    // Image URL keyed by post title, so a post collected on several passes
    // collapses into one entry. A Map is used instead of a plain object so
    // titles such as "__proto__" cannot collide with Object.prototype and
    // numeric-looking titles keep their insertion order.
    const jokesByTitle = new Map();

    console.log("Getting posts from Reddit");

    // Create the browser
    const browser = await puppeteer.launch({
        headless: false
    });

    try {

        // Navigate to the website
        const page = await browser.newPage();
        await page.goto(url, { waitUntil: "load" });

        // Get the root element of all the posts. The class name is generated
        // by Reddit's front end, so fail loudly if it is no longer present.
        const rootSelector = ".rpBJOHq2PR60pnwJlUyP0";
        const root = await page.$(rootSelector);
        if (root === null) {
            throw new Error(`Could not find the post container "${rootSelector}" on ${url}`);
        }

        // All the post element handles gathered across every pass
        const posts = [];

        for (let i = 0; i < amount; i++) {

            // Every pass re-queries all posts currently under the root, so
            // the same post may be pushed more than once; duplicates are
            // collapsed by title further down.
            const chunk = await root.$$("._1poyrkZ7g36PawDueRza-J");
            posts.push(...chunk);

            // Wait for 1 second (presumably to let the page load more
            // posts before scrolling again — TODO confirm)
            await sleep(1000);

            // Scroll to the next chunk. 632 * 12 is the original author's
            // scroll distance in pixels.
            await page.evaluate(() => {
                window.scrollBy(0, (632 * 12));
            });

        }

        console.log("Extracting jokes from posts");

        // Posts lacking either a title or an image element make getProperty
        // throw; those posts are skipped on purpose, but counted so the
        // skip is visible rather than silent.
        let skipped = 0;
        let lastError = null;

        for (const post of posts) {
            try {
                const title = await getProperty(post, "textContent", "_eYtD2XCVieq6emjKBH3m");
                const image = await getProperty(post, "src", "ImageBox-image");
                jokesByTitle.set(title, image);
            } catch (error) {
                skipped++;
                lastError = error;
            }
        }

        if (skipped > 0) {
            console.warn(`Skipped ${skipped} of ${posts.length} collected post(s); last error: ${lastError.message}`);
        }

        console.log("Converting jokes into an array");

        // Convert the title -> image map into an array of joke records
        const jokes = Array.from(jokesByTitle, ([title, image]) => ({ title, image }));

        console.log("Saving jokes");

        // Save the jokes to a file
        fs.writeFileSync("jokes.json", JSON.stringify(jokes));

    } finally {

        // Close the browser whether or not scraping succeeded
        await browser.close();

    }

})().catch((error) => {
    console.error("Scraping failed:", error);
    process.exitCode = 1;
});
99
/**
 * Read a property from the first descendant of `rootElement` that carries
 * the given CSS class.
 *
 * @param {object} rootElement - Element handle to search within; must
 *     provide an async `$(selector)` method (as Puppeteer's ElementHandle does).
 * @param {string} property - Name of the DOM property to read (e.g. "src").
 * @param {string} className - CSS class name, without the leading dot.
 * @returns {Promise<*>} The JSON-serialised value of the property.
 * @throws {Error} If no descendant with that class exists.
 */
async function getProperty(rootElement, property, className) {
    const element = await rootElement.$(`.${className}`);

    // Fail with a message that names what was missing instead of an opaque
    // "Cannot read properties of undefined" TypeError.
    if (element == null) {
        throw new Error(`No element with class "${className}" found while reading "${property}"`);
    }

    const handle = await element.getProperty(property);
    return handle.jsonValue();
}
105
/**
 * Pause for a given duration.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that fulfils once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}