scrape.js
// Scrape news articles from my friend's production-level application.
const fs = require("fs");
const cheerio = require("cheerio");
const puppeteer = require("puppeteer");

async function scrapeNewsArticles() {
  try {
    console.log("Starting the scraping process...");
    const browser = await puppeteer.launch({ headless: "old" });
    const page = await browser.newPage();
    await page.goto("https://dnnews.in", { waitUntil: "domcontentloaded" });

    // Get the HTML content of the page
    const content = await page.content();

    // Load the HTML content into Cheerio
    const $ = cheerio.load(content);

    // Select every article marked up with the NewsArticle schema
    const articles = $("li[itemtype='https://schema.org/NewsArticle']");
    const scrapedData = [];

    articles.each((i, element) => {
      // Extract the headline, publication date, category, and image URL
      const title = $(element).find("h2[itemprop='headline'] span").text();
      const date = new Date(
        $(element).find("p[itemprop='datePublished']").attr("datetime")
      ).toLocaleDateString();
      const category = $(element)
        .find("p[class='text-sm text-[#808080]']")
        .text();
      const imageUrl = $(element).find("img[itemprop='image']").attr("src");

      // Push the extracted fields into the results array
      scrapedData.push({
        title,
        date,
        category,
        imageUrl,
      });
    });

    // Serialize the results as JSON
    const jsonData = JSON.stringify(scrapedData, null, 2);
    console.log(scrapedData);

    // Save the JSON data to a file
    fs.writeFileSync("scrapedData.json", jsonData, "utf-8");

    await browser.close();
    console.log("Scraping done! Data saved in 'scrapedData.json'");
  } catch (error) {
    console.error("An error occurred while scraping news articles:", error);
  }
}

scrapeNewsArticles();
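
// Usage note (a sketch, assuming Node.js is installed and https://dnnews.in is reachable):
//   npm install cheerio puppeteer
//   node scrape.js
// The scraped articles are written to scrapedData.json in the current working directory.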