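"""Scrape NASA SBIR topic descriptions.

Reads topic-page URLs from column A of 'NASA Scrape.xlsx' (starting at
row 2), converts each page's description to Markdown-ish plain text,
writes the text back to column B, and also dumps it to a per-topic
'<last five characters of the URL>.txt' file.

Usage:  python sbir_nasa.py
"""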
import time
from itertools import count

import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook

file_name = 'NASA Scrape.xlsx'

DEBUG = False
if DEBUG:
    # Cache HTTP responses on disk so repeated debug runs don't re-hit the site.
    import requests_cache
    requests_cache.install_cache()
class Crawler:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.8',
        })
        self.url = None  # last URL requested; reported if the crawl crashes
    def get(self, url, data=None):
        """GET `url` (or POST it, when `data` is given) and return the parsed
        soup, retrying indefinitely on failure with exponential backoff."""
        self.url = url
        for i_try in count(1):
            try:
                if data:
                    resp = self.session.post(url, data=data, verify=False, timeout=30)
                else:
                    resp = self.session.get(url, verify=False, timeout=30)
                resp.raise_for_status()
                return BeautifulSoup(resp.text, 'lxml')
            except requests.RequestException as e:
                print(url)
                print(e)
                time.sleep(min(2 ** i_try, 3600))  # exponential backoff, capped at one hour
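    # Backoff example: if a request fails three times in a row, get() sleeps
    # 2 s, 4 s, then 8 s before the fourth attempt; sleeps never exceed 3600 s.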
    def main(self):
        wb = load_workbook(file_name)
        ws = wb.active
        for row in range(2, ws.max_row + 1):  # row 1 is assumed to be a header
            url = ws.cell(row=row, column=1).value
            if not url:
                continue
            soup = self.get(url)
            # The description body is the run of sibling elements following the
            # block that contains the 'Lead Center:' label.
            p = soup.find('span', string='Lead Center:').parent.previous_sibling
            t = []
            for el in p.next_siblings:
                process(el, t)
                t.append('\n\n')
            t = ''.join(t).strip()
            # Dump to a file named by the last five characters of the URL
            # (the topic code), and into column B of the workbook.
            with open(url[-5:] + '.txt', 'w', encoding='utf_8') as f:
                f.write(t)
            ws.cell(row=row, column=2, value=t)
            wb.save(file_name)  # save after every row so progress survives a crash
def process(el, t):
    """Recursively flatten the HTML under `el` into Markdown-ish text,
    appending string fragments to the list `t`."""
    for child in el.children:
        if isinstance(child, str):
            if not child.isspace():
                t.append(child.strip('\n').replace('\xa0', ' '))
        elif ((child.name in ('span', 'strong') and not (child.string or '').isspace())
              or (child.name == 'em' and child.string == 'References:')):
            # A bold/emphasised run at the start of a line becomes a heading.
            if not t or t[-1].endswith('\n'):
                t.append('#### ')
            process(child, t)
        elif child.name in ('p', 'div'):
            process(child, t)
            t.append('\n\n')
        elif child.name == 'a' and child.string in ('Read more>>', 'Read less>>'):
            pass  # skip the page's expand/collapse widget links
        elif child.name == 'li':
            # Indent one space per nesting level; -2 accounts for the item's
            # own <ul> plus the extra wrapper <ul> the page puts on top.
            level = len(child.find_parents(['ul', 'blockquote'])) - 2
            t.append(' ' * level + '* ')
            process(child, t)
            t.append('\n')
        elif child.name == 'br':
            if child.next_sibling:  # drop a trailing <br>
                t.append('\n')
        elif child.name == 'ul':
            process(child, t)
            t.append('\n')
        else:
            # Transparent containers just recurse; anything else is unexpected.
            if child.name not in ('blockquote', 'sub', 'sup', 'a', 'em', 'span', 'strong'):
                raise RuntimeError(child.name)
            process(child, t)
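# A minimal sketch of what process() emits (hypothetical HTML, not taken from
# the live site):
#
#     soup = BeautifulSoup('<div><p><strong>Scope</strong></p>'
#                          '<p>Details here.</p></div>', 'lxml')
#     t = []
#     process(soup.div, t)
#     ''.join(t)  # -> '#### Scope\n\nDetails here.\n\n'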
if __name__ == '__main__':
    # verify=False is used above, so silence urllib3's insecure-request warnings.
    requests.packages.urllib3.disable_warnings()
    c = None
    try:
        c = Crawler()
        c.main()
    except Exception:
        print()
        if c is not None:
            print(c.url)  # show which URL was being processed when we crashed
        raise