-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetadata_bot.py
101 lines (82 loc) · 3.93 KB
/
metadata_bot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time
url = "https://arxiv.org/abs/2212.11181"
options = Options()
options.add_experimental_option("detach", True) #keeps window open
driver = webdriver.Chrome(service = Service(ChromeDriverManager().install()), options=options)
driver.get(url)
session = requests.Session()
response = session.get(url)
#enable Bibliogrpahic Explorer
class_name = driver.find_element("xpath", "//div[@class='tab labs-display-bib']//div[@class='toggle']//div[1]//div[1]//label[1]//span[1]")
class_name.click()
time.sleep(1)
soup = BeautifulSoup(response.text, 'html.parser')
# Extract the authors -------------------------
authors = [a.text for a in soup.find_all('div', class_='authors')]
# Extract the abstract -------------------------
abstract = soup.find('blockquote', class_='abstract mathjax')
# Extract the citations -------------------------
references = []
nonMLA_references = []
bibTeX_references = []
#number of references
total_references = 1
while (True):
#checks to see how many citations are on the page
try:
#checks for all citations
references_containers = driver.find_element("xpath", "/html/body/div[2]/main/div/div/div[3]/div/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[%s]" % total_references)
#outputs all citations on the page that can be shown in MLA format
try:
#click on reference
references_containers = driver.find_element("xpath", "//div[@class='bib-col2']//div[%s]//div[1]//div[1]//span[1]//a[1]//img[1]" % total_references)
references_containers.click()
time.sleep(1)
#click MLA button
references_containers = driver.find_element("xpath", "//input[@id='mla']")
if references_containers.is_enabled():
references_containers.click()
time.sleep(1)
#If MLA button is unavailable -> send to bibTeX_references
else:
references_containers = driver.find_element("xpath", "/html/body/div[2]/main/div/div/div[2]/div[7]/div/div/div[3]/textarea")
bibTeX_references.append(references_containers.text)
#get citation
references_containers = driver.find_element("xpath", "/html/body/div[2]/main/div/div/div[2]/div[7]/div/div/div[3]/textarea")
references.append(references_containers.text)
#close citation
references_containers = driver.find_element("xpath", "/html/body/div[2]/main/div/div/div[2]/div[7]")
driver.execute_script("""
var element = arguments[0];
element.parentNode.removeChild(element);
""", references_containers)
total_references += 1
#runs when reference is unavailable
except:
#outputs non MLA citation
references_containers = driver.find_element("xpath", "/html/body/div[2]/main/div/div/div[3]/div/div[1]/div[2]/div/div[2]/div[1]/div[3]/div[%s]/div[2]/div[1]/a" % total_references)
nonMLA_references.append(references_containers.text)
total_references += 1
#outputs total number of ciations
except:
total_references = 1
try:
page_number = driver.find_element("xpath", "/html/body/div[2]/main/div/div/div[3]/div/div[1]/div[2]/div/div[2]/div[1]/div[2]/div[3]/div/span/ul/li[6]/a")
page_number.click()
except:
break
print('Authors:', authors)
print('#############################')
print('Abstract:', abstract.text)
print('#############################')
print('References:', references)
print('#############################')
print('Non MLA References:', nonMLA_references)
print('#############################')
print('BibTeX References:', bibTeX_references)