# pmasani.py
import json
import re
import requests
from urlextract import URLExtract
import gzip
from tqdm import tqdm
import os
utid = 'pmasani'
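
# Input lists are read from input/<utid>_<type> (e.g. input/pmasani_model);
# results are appended to output/<utid>.json.gz, one JSON record per line.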
# Base URLs for models, datasets, and source repositories
base = {
    'model': 'https://huggingface.co/',
    'data': 'https://huggingface.co/datasets/',
    'source': 'https://'
}
post = '/raw/main/README.md'
postGH = '/blob/master/README.md'
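
# Illustrative URL construction (hypothetical ids, not from the input files):
#   base['model'] + 'someuser/somemodel' + post
#     -> https://huggingface.co/someuser/somemodel/raw/main/README.md
#   base['source'] + 'github.com/someuser/somerepo' + postGH
#     -> https://github.com/someuser/somerepo/blob/master/README.md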
# URL and DOI extractors
extU = URLExtract()
DOIpattern = r'\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b'
# To extract URLs from content
def extractURLs(content):
    return extU.find_urls(content)
# To extract DOIs from content (matched case-insensitively, since DOIs are case-insensitive)
def extractDOIs(content):
    return re.findall(DOIpattern, content, re.IGNORECASE)
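# Illustrative check (hypothetical DOI):
#   extractDOIs("see https://doi.org/10.1000/xyz123") -> ['10.1000/xyz123']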
# To correct GitHub URLs
def correct_github_url(url):
    if "githubcom" in url:
        url = url.replace("githubcom", "github.com")
    return url
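# e.g. correct_github_url('githubcom/someuser/somerepo') -> 'github.com/someuser/somerepo'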
# To fetch content with error handling
def fetch_content(url):
    try:
        # A timeout keeps one slow host from hanging the whole run
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        print(f"Successfully fetched {url}")
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return None
# Create the output directory if it does not exist
os.makedirs("output", exist_ok=True)
# To run the scraping process with tqdm for progress
def run(tp):
    # Check if input file exists
    input_file = f"input/{utid}_{tp}"
    if not os.path.exists(input_file):
        print(f"Input file {input_file} not found. Skipping {tp} scraping.")
        return
    post0 = post if tp in ['model', 'data'] else postGH
    # Open the input file and the compressed output file in append mode,
    # so the successive run() calls for model, data and source do not overwrite each other
    with open(input_file, 'r') as f, gzip.open(f"output/{utid}.json.gz", 'at', encoding='utf-8') as fo:
        lines = f.readlines()
        for line in tqdm(lines, desc=f"Processing {tp} URLs"):
            line = line.strip()
            if not line:
                continue
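            # Illustrative input line formats handled below (hypothetical ids):
            #   model/data lines: "someuser/somemodel"
            #   source lines:     "42;githubcom/someuser/somerepo"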
            # Handle source-specific URL correction
            if tp == 'source':
                try:
                    number, url_part = line.split(';')
                    url_part = correct_github_url(url_part)
                except ValueError:
                    print(f"Skipping malformed line: {line}")
                    continue
                url = base[tp] + url_part + post0
            else:
                # Prepend the appropriate base URL for models and data
                url = base[tp] + line + post0
            # Fetch the content
            content = fetch_content(url)
            if content is None:
                print(f"Failed to fetch content for {line}")
                continue  # Skip if content couldn't be fetched
            # Extract URLs and DOIs
            urls = extractURLs(content)
            dois = extractDOIs(content)
            # Create a dictionary for the scraped data
            res = {
                'id': line,
                'type': tp,
                'url': url,
                'content': content.replace('\n', ' '),
                'links': urls,
                'dois': dois,
                'bibs': []  # For BibTeX entries
            }
            # Write the result to the compressed JSON file
            fo.write(json.dumps(res, ensure_ascii=False) + "\n")
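            # Each written line is one JSON record, e.g. (illustrative values):
            #   {"id": "someuser/somemodel", "type": "model",
            #    "url": "https://huggingface.co/someuser/somemodel/raw/main/README.md",
            #    "content": "...", "links": ["https://..."], "dois": ["10.1000/xyz123"], "bibs": []}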
# Run the script for model, data and source
run('model')
run('data')
run('source')
print("Data scraping and extraction done!")