#### Main scraping document
#### Load libraries
library(RCurl)
library(XML)
library(stringr)

### Scrape the board.netdoktor.de start page to get the topic boards (e.g. eyes, ...)
netDoktorScraper <- function(url){
  SOURCE <- getURL(url, encoding = "UTF-8") # specify encoding when dealing with non-Latin characters
  PARSED <- htmlParse(SOURCE)
  title <- xpathSApply(PARSED, "//span[contains(@itemprop, 'itemListElement')]", xmlValue)
  link  <- xpathSApply(PARSED, "//span[contains(@itemprop, 'itemListElement')]/a/@href")
  # further fields that could be extracted if needed:
  # time <- xpathSApply(PARSED, "//time[@itemprop='datePublished']/@datetime")
  # tags <- unique(xpathSApply(PARSED, "//a[@rel='tag']", xmlValue))
  # text <- xpathSApply(PARSED, "//div[@id='article-body-blocks']/p", xmlValue)
  return(list(title = title, link = link))
}
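
## Usage sketch, assuming the forum's start page URL (not given in this file):
# topics <- netDoktorScraper("http://board.netdoktor.de")
# topics$title  # names of the topic boards
# topics$link   # relative links, to be fed into scrapeThematic()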
### Scrape one topic board (e.g. eye); 'postfix' is the board's relative path.
### Follows the "next page" arrow recursively and collects all thread titles/links.
scrapeThematic <- function(url, postfix){
  SOURCE <- getURL(paste(url, "/", postfix, sep = ""), encoding = "UTF-8") # specify encoding when dealing with non-Latin characters
  PARSED <- htmlParse(SOURCE)
  ## recurse into the following result pages, if any
  nextPage <- xpathSApply(PARSED, "//div[@class='pageNavArrowsRight']/a/@href")
  previous <- NULL
  if(!is.null(nextPage)){ previous <- scrapeThematic(url, nextPage[1]) }
  title <- xpathSApply(PARSED, "//div[@class='title']/a[contains(@itemprop, 'itemListElement')]", xmlValue)
  link  <- xpathSApply(PARSED, "//div[@class='title']/a[contains(@itemprop, 'itemListElement')]/@href")
  return(list(title = c(title, previous$title),
              link  = c(link, previous$link)))
}
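
## Usage sketch ("auge" is a hypothetical board path, not taken from this file):
# threads <- scrapeThematic("http://board.netdoktor.de", "auge")
# length(threads$title)  # thread titles collected across all result pages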
### The actual function to get the content (all posts) of one thread
# TODO should be refactored
scrapeContent <- function(url){
  SOURCE <- tryCatch({
    getURL(url, encoding = "UTF-8") # specify encoding when dealing with non-Latin characters
  }, error = function(e){
    print(e)
    return("")
  })
  if(SOURCE == "") return("")
  PARSED <- htmlParse(SOURCE)
  title  <- xpathSApply(PARSED, "//div[@class='titleBar']/h1", xmlValue)
  author <- xpathSApply(PARSED, "//div[@class='userText']/a", xmlValue)
  entrycount <- length(author)
  ## length control: text, date and author must line up one-to-one per post
  text <- xpathSApply(PARSED, "//div[@class='messageContent']/article/blockquote", xmlValue)
  textcount <- length(text)
  text <- gsub("[\t\n]", "", x = text, useBytes = TRUE)
  ## guest authors appear in a <span> instead of an <a>;
  ## NOTE: appending them breaks the original post order
  if(entrycount < textcount){
    author <- c(author, xpathSApply(PARSED, "//div[@class='userText']/span", xmlValue))
    entrycount <- length(author)
  }
  date <- xpathSApply(PARSED, "//span[@itemprop='dateCreated']/span[@class='DateTime']/@title")
  names(date) <- NULL
  ## recent posts carry the date in an <abbr> tag instead; rebuild the missing
  ## entries in the same "d. Month YYYY um HH:MM Uhr" format
  if(length(date) != entrycount){
    datestring <- xpathSApply(PARSED, "//span[@itemprop='dateCreated']/abbr[@class='DateTime']/@data-datestring")
    timestring <- xpathSApply(PARSED, "//span[@itemprop='dateCreated']/abbr[@class='DateTime']/@data-timestring")
    if(is.null(datestring)) date <- rep(NA_character_, entrycount) # no date information available
    else{
      for(i in 1:(entrycount - length(date))){
        date <- c(date, paste(datestring[i], " um ", timestring[i], " Uhr", sep = ""))
      }
      if(length(date) != entrycount){
        print("error: date count still does not match post count")
      }
    }
  }
  ## %B expects German month names here, so a German LC_TIME locale is required,
  ## e.g. Sys.setlocale("LC_TIME", "de_DE.UTF-8")
  date <- as.POSIXct(date, tz = "CET", format = "%d. %B %Y um %H:%M Uhr")
  relatedEntries <- xpathSApply(PARSED, "//div[@class='title']/a/@href") # currently unused
  posts <- list()
  for(i in seq_along(text)){
    posts[[i]] <- list(author = author[i], text = text[i], date = date[i])
  }
  return(list(title = title, posts = posts))
  # alternative flat return:
  # list(title=title, author=author, text=text, date=date, relatedEntries=relatedEntries)
}
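
## Putting it together, a minimal sketch: fetch the topic boards, walk one
## board's threads, then scrape every thread. Whether the thread links need
## the host prefixed depends on the page markup; assumed relative here.
# topics  <- netDoktorScraper("http://board.netdoktor.de")
# threads <- scrapeThematic("http://board.netdoktor.de", topics$link[1])
# content <- lapply(paste("http://board.netdoktor.de", threads$link, sep = "/"), scrapeContent)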