-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoffline_webscraper.R
51 lines (42 loc) · 1.25 KB
/
offline_webscraper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#install.packages("rvest")
#install.packages("xml2")
library(rvest)
library(xml2)
# Parse the locally saved term-project page; every extraction below reads
# from the resulting `html_file` document.
html_file <- xml2::read_html(
  x = "C:\\Users\\akunna1\\Desktop\\Projects\\neptune_technologies\\project_directory\\front_end\\term_project.html"
)
# Page title: text of the node selected by "title".
title <- html_text(html_node(html_file, "title"))
# Headings: text of every node selected at levels h1 through h6.
headings <- html_text(
  html_nodes(html_file, "h1, h2, h3, h4, h5, h6")
)
# Paragraphs: text of every node selected by "p".
paragraphs <- html_text(html_nodes(html_file, "p"))
# Links: the "href" attribute of every node selected by "a".
links <- html_attr(html_nodes(html_file, "a"), "href")
# Return the whitespace-trimmed text of the node that `selector` picks out of
# `doc`.
#
# doc:      parsed HTML document to search (the script passes `html_file`).
# selector: CSS selector identifying the tab container, e.g. "#about-us".
#
# The extracted text is returned unchanged. If the result is NA (which is
# what rvest reports when the selector matches nothing), a warning names the
# selector so a renamed or missing tab id does not go unnoticed; the NA is
# still returned, so callers see the same value as before.
extract_tab_text <- function(doc, selector) {
  tab_text <- html_text(html_node(doc, selector), trim = TRUE)
  if (anyNA(tab_text)) {
    warning(
      "No element matches selector '", selector, "'; returning NA.",
      call. = FALSE
    )
  }
  tab_text
}

# Extracting the content of the "About Us" tab
about_us_content <- extract_tab_text(html_file, "#about-us")
# Extracting the content of the "About Role" tab
about_role_content <- extract_tab_text(html_file, "#about-role")
# Extracting the content of the "Apply Here" tab
apply_here_content <- extract_tab_text(html_file, "#apply-here")
# Output ----
# Only the "About Us" text is printed. The other print() calls are kept
# commented out; uncomment whichever extracted value needs inspecting.
# print(title)
# print(headings)
# print(paragraphs)
# print(links)
print(x = about_us_content)
# print(about_role_content)
# print(apply_here_content)