anansi_psu_scraper.py
import urllib.request
import bs4
import re
import time
from collections import defaultdict
import json
import os
import requests  # used by writePDF() below but missing from the original imports

def createProjectFolder():  # creates the local folder meant to hold the downloaded files
    folder_location = r'E:\psu_files'
    if not os.path.exists(folder_location):
        os.mkdir(folder_location)

def createUrlList(url, page_num):  # creates a url for every page of the psu catalog
    url_index = []  # container for the urls
    for i in range(1, page_num):  # note: range() stops before page_num, so this covers pages 1 through page_num - 1
        url_index.append(url + str(i))  # creates a link for every page and stores it in the list
    return url_index

def getResponse(url):  # fetches each catalog page
    header = {}
    header['User-Agent'] = "Mozilla/5.0 (X11; Linux x86_64)"  # the key must be 'User-Agent' for the custom agent to be sent
    header["From"] = "tdenn007@odu.edu"
    req = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(req)
    time.sleep(10)  # delays the program 10 secs so it doesn't get blocked
    response = response.read()
    return response

def getLink(response, domain):  # extracts the catalog links from each page
    links = []
    soup = bs4.BeautifulSoup(response, "html.parser")
    for link in soup.find_all("a", attrs={'href': re.compile("^/catalog/")}):  # grabs the links with a regular expression
        catalog = link.get('href')
        links.append(domain + str(catalog))  # creates a full url for the program to follow
    return links

def parseMetadata(link, id):  # grabs all the metadata for each paper
    psu_papers = defaultdict(dict)  # a nested dict for all the information to be stored
    response = urllib.request.urlopen(link)  # opens each catalog link to be parsed
    response = response.read()
    time.sleep(5)  # delays the program
    link_soup = bs4.BeautifulSoup(response, "html.parser")
    fields = {  # parsing the metadata: field name -> element on the catalog page
        'title': link_soup.find('h1', attrs={"itemprop": "name"}),
        'author': link_soup.find('dd', class_="blacklight-author_name_tesi"),
        'program': link_soup.find('dd', class_="blacklight-program_name_ssi"),
        'degree': link_soup.find('dd', class_="blacklight-degree_description_ssi"),
        'doc_type': link_soup.find('dd', class_="blacklight-degree_type_ssi"),
        'date': link_soup.find('dd', class_="blacklight-defended_at_dtsi"),
        'members': link_soup.find('dd', class_="blacklight-committee_member_and_role_tesim"),
        'keywords': link_soup.find('dd', class_="blacklight-keyword_ssim"),
        'abstract': link_soup.find('dd', class_="blacklight-abstract_tesi"),
    }
    psu_papers[id]['id'] = str(id)  # storing the metadata
    for name, element in fields.items():  # "NaN" marks metadata missing from the page
        psu_papers[id][name] = element.getText().strip('\n') if element is not None else "NaN"
    psu_papers[id]['link'] = link if link is not None else "NaN"
    return psu_papers

def writePDF(pdf_file_name, pdf_link):  # downloads a pdf with requests, following redirects
    with open(pdf_file_name, 'wb') as f:
        f.write(requests.get(pdf_link, allow_redirects=True).content)

def extractPDF(link, domain, id):  # finds the pdf link on the catalog page and downloads the file
    psu_pdf = defaultdict(dict)  # nested dict for the pdf info
    response = urllib.request.urlopen(link)
    response = response.read()
    time.sleep(5)
    link_soup = bs4.BeautifulSoup(response, "html.parser")
    pdf_link = link_soup.find("a", attrs={'href': re.compile("^/files/")})  # grabs the link for the pdf
    pdf_name = "psu_pdf_" + str(id) + ".pdf"  # names the pdf with a unique ID
    if pdf_link is not None:  # check whether beautifulsoup found a pdf link
        pdf = domain + pdf_link['href']  # creates the full url
        print("exporting...." + pdf_name)
        # urllib.request.urlretrieve() sends Python's default User-Agent and was failing here;
        # downloading through writePDF() (requests, redirects followed) is a likely fix.
        writePDF(pdf_name, pdf)
        psu_pdf[id]["id"] = str(id)  # stores info on the pdf
        psu_pdf[id]["PDF_filename"] = pdf_name
        psu_pdf[id]["PDF_link"] = pdf
    else:
        psu_pdf[id]["id"] = str(id)
        psu_pdf[id]["PDF_filename"] = pdf_name
        psu_pdf[id]["PDF_link"] = "NaN"  # "NaN" marks a page where no pdf link was found
    return psu_pdf

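# A minimal alternative download path (a sketch, not part of the original script): it sends the
# same custom User-Agent that getResponse() uses, since Python's default agent is one common
# reason a plain urlretrieve() download gets rejected. The name downloadWithHeaders is assumed.
def downloadWithHeaders(pdf_url, pdf_file_name):
    req = urllib.request.Request(pdf_url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"})
    with urllib.request.urlopen(req) as response, open(pdf_file_name, 'wb') as f:
        f.write(response.read())  # write the full pdf body to the local file
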
def writeFile(metadata, id):  # writes the metadata to a json file
    metadata_filename = "psu_metadata_" + str(id) + ".txt"
    with open(metadata_filename, 'w') as output_file:
        json.dump(metadata, output_file)
    print("exporting ... " + metadata_filename)

def getWebsite(url, page_num, domain):  # ties all the functions together
    id = 1
    url_list = createUrlList(url, page_num)
    for item in url_list:
        response = getResponse(item)
        links_list = getLink(response, domain)
        for link in links_list:
            metadata = parseMetadata(link, id)
            extractPDF(link, domain, id)
            writeFile(metadata, id)
            id = id + 1

if __name__ == '__main__':  # execution of the program
    domain = "https://etda.libraries.psu.edu"  # the base domain
    url = "https://etda.libraries.psu.edu/catalog?page="  # the url without the page number
    page_num = 1567  # number of pages to scrape
    getWebsite(url, page_num, domain)  # runs the scraper
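    # Usage note (an assumption, not part of the original script): for a quick smoke test,
    # call getWebsite(url, 3, domain) instead, so only the first couple of catalog pages are
    # crawled before committing to the full multi-hour run.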