-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathemaness.py
55 lines (49 loc) · 1.4 KB
/
emaness.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json, re, gzip
import requests
from urlextract import URLExtract
# URL prefix for each entry type. entries_for_type() builds a README URL as
# prefix + entry id + suffix (see `post` / `postGH` below).
url_base = {
'model': 'https://huggingface.co/',
'data': 'https://huggingface.co/datasets/',
'source': 'https://',
}
# README path suffix appended for every entry type other than 'source'.
post = '/raw/main/README.md'
# README path suffix appended for 'source' entries.
# NOTE(review): unlike `post`, this has no leading '/', so the id taken from
# input/emaness_source presumably already ends with '/' -- confirm.
# NOTE(review): a `blob/` path usually serves a rendered HTML page rather than
# the raw Markdown -- confirm this is the intended document to harvest.
postGH = 'blob/master/README.md'
# Shared URLExtract instance used by extract_urls().
extU = URLExtract()
# Regular expression for DOIs ("10.<registrant>/<suffix>").
# The case-insensitive flag is given inline as `(?i)`: Python has no
# `/pattern/i` literal syntax, so a trailing `/i` would be matched as the
# literal characters "/i" and almost no real DOI would ever be found.
DOIpattern = r'(?i)\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b'


def extract_dois(str):
    """Return every DOI found in the given text.

    Args:
        str: Text to scan. (The parameter name shadows the builtin; it is
            kept so existing keyword callers continue to work.)

    Returns:
        A list of DOI strings in order of appearance, without any
        surrounding URL prefix or trailing punctuation. Empty when the text
        contains no DOI.
    """
    return re.findall(DOIpattern, str)
def extract_urls(text):
    """Return the URLs that the shared ``extU`` extractor finds in ``text``."""
    found = extU.find_urls(text)
    return found
def entries_for_type(type, timeout=30):
    """Download the README of every entry listed in ``input/emaness_<type>``.

    Each non-empty line of the input file names one entry. For ``'source'``
    the entry id is the second ``;``-separated field of the line; for every
    other type the whole (stripped) line is the id. The README URL is built
    as ``url_base[type] + id + suffix``.

    Harvesting is best-effort: blank or malformed lines, failed requests and
    non-200 responses are skipped rather than aborting the run.

    Args:
        type: Entry type; must be a key of ``url_base``. (The name shadows
            the builtin; it is kept for backward compatibility.)
        timeout: Seconds to wait for each HTTP request before giving up on
            that entry.

    Returns:
        A list of dicts with the keys ``id``, ``type``, ``url``, ``content``
        (README text with newlines replaced by spaces), ``links``, ``dois``
        and ``bibs`` (always empty).

    Raises:
        OSError: If the input file cannot be opened.
        KeyError: If ``type`` is not a key of ``url_base``.
    """
    is_source = type == 'source'
    # The suffix depends only on the type, so pick it once up front.
    suffix = postGH if is_source else post
    entries = []
    with open(f"input/emaness_{type}", 'r', encoding='utf-8') as of:
        for line in of:
            line = line.strip()
            if not line:
                # A blank line would only produce a request to the bare base URL.
                continue
            if is_source:
                fields = line.split(';')
                if len(fields) < 2:
                    # Malformed line without a second field: skip it instead
                    # of letting an IndexError abort the whole harvest.
                    continue
                entry_id = fields[1]
            else:
                entry_id = line
            url = f"{url_base[type]}{entry_id}{suffix}"
            try:
                # Without a timeout a single stalled server hangs the run.
                r = requests.get(url, timeout=timeout)
            except requests.RequestException:
                # Network-level failure: skip this entry. Narrower than a
                # bare `except`, so Ctrl-C still interrupts the script.
                continue
            if r.status_code != 200:
                continue
            # Flatten the document onto one line before extraction.
            content = r.text.replace('\n', ' ')
            entries.append({
                'id': entry_id,
                'type': type,
                'url': url,
                'content': content,
                'links': extract_urls(content),
                'dois': extract_dois(content),
                'bibs': [],  # bib extraction unimplemented
            })
    return entries
# Harvest each entry type in turn and stream the records to a gzip-compressed
# file, one JSON object per line.
with gzip.open("output/emaness.json.gz", 'wt', encoding='utf-8') as of:
    for type in ('model', 'data', 'source'):
        for entry in entries_for_type(type):
            of.write(json.dumps(entry) + '\n')