# paper_citations_from_scholar.py
import requests
from lxml import html, etree
import click
import os
import re
import json
from tqdm import tqdm
def get_citations_from_scholar(paper_title):
    """Search Google Scholar for a paper and return a number scraped from the first hit.

    The number is the first run of digits found in one element of the first
    search result, located purely by its position in the page markup
    (presumably the "Cited by N" link -- verify against the live page layout).

    Args:
        paper_title: Title of the paper; used verbatim as the search query.

    Returns:
        The matched digits as a string (callers convert it with ``int``).

    Raises:
        Exception: Any error raised while fetching or parsing the page is
            re-raised unchanged after a "NOT FOUND" message has been printed.
    """
    # Let requests build the query string so that characters such as "&", "#"
    # or "+" inside a title are percent-encoded instead of corrupting the URL.
    query = {
        "hl": "en",
        "as_sdt": "0,5",
        "q": paper_title,
        "btnG": "",
    }
    try:
        response = requests.get(
            "https://scholar.google.com/scholar",
            params=query,
            timeout=30,  # never hang forever on a stalled connection
        )
        # Fail with a clear HTTP error instead of a confusing parse error on
        # an error page.
        response.raise_for_status()
        content = response.content.decode("latin-1")
        tree = html.document_fromstring(content)
        main_div = tree.body.get_element_by_id("gs_res_ccl_mid")
        # Positional walk through the markup: first child of the results
        # container -> its second child -> that element's last child -> its
        # third child. Child indexing replaces the deprecated getchildren().
        paper_citation_txt = main_div[0][1][-1][2].text_content()
        citations = re.findall(r'\d+', paper_citation_txt)[0]
    except Exception:
        print("Citation for", paper_title, "NOT FOUND")
        raise
    return citations
def _iter_track_dirs(folder):
    """Yield the path of every ``<folder>/<conference>/<year>/<track>`` entry."""
    for conf_base_name in os.listdir(folder):
        conf_dir = os.path.join(folder, conf_base_name)
        for year in os.listdir(conf_dir):
            year_dir = os.path.join(conf_dir, year)
            # Only sub-directories of a conference are treated as years.
            if not os.path.isdir(year_dir):
                continue
            for track in os.listdir(year_dir):
                yield os.path.join(year_dir, track)


@click.command()
@click.option("--folder", default="conferences")
@click.option("--update", default=False)
@click.option("--progress", default=True)
def main(folder, update, progress):
    """Add citation counts to every ``papers_ids_titles.json`` under FOLDER.

    Each index file maps a paper id either to a plain title string (no
    citation count yet) or to a ``{"title", "citations"}`` record. Plain
    titles are always looked up; existing records are only refreshed when
    ``update`` is true. The index file is rewritten in place.

    Args:
        folder: Root directory laid out as ``<conference>/<year>/<track>/``.
        update: When true, re-fetch citations for papers that already have a
            record; otherwise those papers are skipped.
        progress: When true, count the papers up front so the progress bar
            has a total.
    """
    total = None
    if progress:
        # Every entry of a track directory except one is counted as a paper
        # (presumably the one excluded is papers_ids_titles.json itself --
        # TODO confirm).
        total = sum(
            len(os.listdir(track_dir)) - 1
            for track_dir in _iter_track_dirs(folder)
        )
        print(total)

    with tqdm(total=total) as pBar:
        for track_dir in _iter_track_dirs(folder):
            index_path = os.path.join(track_dir, "papers_ids_titles.json")
            with open(index_path, encoding="utf-8") as f:
                proceedings_papers = json.load(f)

            try:
                for paper_id, paper_data in proceedings_papers.items():
                    if isinstance(paper_data, dict):
                        if not update:
                            # Already has a record: skip the lookup but still
                            # advance the bar so it can reach its total.
                            pBar.update(1)
                            continue
                        paper_title = paper_data["title"]
                    else:
                        paper_title = paper_data

                    # Store the record under the paper's own id. Rebinding
                    # the whole mapping here would drop every other paper
                    # from the file. Replacing the value of an existing key
                    # is safe while iterating.
                    proceedings_papers[paper_id] = {
                        "title": paper_title,
                        "citations": int(get_citations_from_scholar(paper_title)),
                    }
                    pBar.update(1)
            finally:
                # Write the changes to file even when a lookup fails, so the
                # citations fetched so far are kept and a later run can
                # resume from the remaining plain-title entries.
                with open(index_path, "w", encoding="utf-8") as fOut:
                    json.dump(proceedings_papers, fOut)
if __name__ == "__main__":
    # NOTE(review): the call to main() is commented out, so running this file
    # as a script currently does nothing. Confirm whether that is deliberate
    # before re-enabling it.
    # main()
    pass