-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_deps.py
42 lines (33 loc) · 1.08 KB
/
get_deps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
import time
import lxml.html
from tqdm import tqdm
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# Single Firefox WebDriver session created at import time; get_deps() drives
# this one browser for every repository, and the script shuts it down at the end.
driver = webdriver.Firefox()
def get_deps(repo):
    """Scrape the dependency links listed on a repository's GitHub page.

    Loads ``https://github.com<repo>/network/dependencies`` in the shared
    module-level ``driver``, keeps clicking the pagination button inside
    ``#dependencies`` until it is no longer present, and then parses the
    fully expanded page.

    Args:
        repo: Repository path appended directly after ``https://github.com``
            (no separator is inserted, so it is expected to start with
            ``/`` -- presumably ``/owner/name``; verify against the data).

    Returns:
        A list with the ``href`` attribute of every element matching
        ``#dependencies .js-dependency [data-octo-click=dep_graph_package]``.

    Raises:
        KeyError: If a matched element has no ``href`` attribute.
    """
    # Imported locally so the block is self-contained; ``find_element(By...)``
    # replaces ``find_element_by_css_selector``, which Selenium 4 removed.
    from selenium.webdriver.common.by import By

    url = 'https://github.com{}/network/dependencies'.format(repo)
    driver.get(url)
    # Fixed pause to give the page time to render before querying it.
    time.sleep(2)

    while True:
        # Only the lookup is guarded: a missing button is the normal
        # "no more pages" signal, anything else should surface.
        try:
            more_button = driver.find_element(
                By.CSS_SELECTOR, '#dependencies .ajax-pagination-btn')
        except NoSuchElementException:
            break
        more_button.click()
        # Wait for the next batch of dependencies to be appended.
        time.sleep(5)

    html = lxml.html.fromstring(driver.page_source)
    links = html.cssselect(
        '#dependencies .js-dependency [data-octo-click=dep_graph_package]')
    return [a.attrib['href'] for a in links]
# Load the repository records; each one is a dict with at least a 'url' key
# and gains a 'dependencies' key once it has been scraped.
with open('data/repos.json', 'r') as f:
    repos = json.load(f)

try:
    # Wrap the list itself (not the enumerate iterator) so tqdm can read its
    # length and show a real percentage / ETA.
    for i, repo in enumerate(tqdm(repos)):
        # Already scraped on a previous run: skip, which makes reruns resume.
        if 'dependencies' in repo:
            continue
        repo['dependencies'] = get_deps(repo['url'])
        # Periodic checkpoint so a long run can be resumed.
        if i % 10 == 0:
            with open('data/repos.json', 'w') as f:
                json.dump(repos, f)
finally:
    # Runs on success and on failure/interrupt alike: persist whatever was
    # collected, then always release the browser.
    try:
        with open('data/repos.json', 'w') as f:
            json.dump(repos, f)
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process); close() would only close the current window.
        driver.quit()