import time
import datetime
from datetime import date, timedelta

import requests


def get_counts(title, start, end, email, language_edition="en"):
"""
Get the daily pageviews of an wikipedia article (uses wikipedia API).
Returns the daily pageviews of the wikipedia article ``title`` between day ``start`` and day ``end`` in the language ``language_edition``.
Notes
-----
1.) Do not make more than one api function call per second due to given request limits by wikipedia.
2.) If the function returns an empty list (but the article exists online), then most likely the article was created after day ``end```.
3.) We only consider English wikipedia articles in our analysis.
Parameters
----------
title : str
Title of the wikipedia article (https://en.wikipedia.org/wiki/``title``).
start : datetime.date
First day of pageview statistic.
end : datetime.date
Last day of pageview statistic.
email : str
Your personal or institutional e-mail address for the request header (providing the possibility to be contacted by Wikipedia)
language_edition : str
Selector for the language edition of wikipedia (default is ``en`` for English, see Notes).
Returns
-------
list of datetime.date
list of daily timestamps.
list of int
list of pageview counts for that day (same position as in first list).
"""
time.sleep(1)
timestamp_list = []
viewcount_list = []
url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{}.wikipedia/all-access/all-agents/{}/daily/{}/{}" # https://www.mediawiki.org/wiki/API:Etiquette
header = {
'User-Agent': 'WikipediaNews 1.0',
'From': email
}
url_start = str(start).replace("-","")
url_end = str(end).replace("-","")
response = requests.get(url.format(language_edition, title, url_start, url_end), headers=header)
if response.status_code == 200:
try:
for item in response.json()["items"]:
timestamp_str = item["timestamp"]
                day = date(int(timestamp_str[:4]), int(timestamp_str[4:6]), int(timestamp_str[6:8]))  # renamed so it does not shadow the datetime module
                timestamp_list.append(day)
viewcount_list.append(item["views"])
except KeyError:
return [], []
else:
        # print(response.status_code, response.reason)
        return None, None  # two values, so callers can always unpack the result
return timestamp_list, viewcount_list
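
# A minimal usage sketch for get_counts(). The article title, date range, and e-mail
# address are illustrative assumptions, not values from the original script:
#
#   days, views = get_counts("September_11_attacks", date(2021, 9, 1), date(2021, 9, 30), "you@example.com")
#   if days is not None:
#       print(f"{sum(views)} pageviews across {len(days)} days")
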
def search_articles(keywords, email, nmax=10, date=None, timeout=10):
"""
Get ranked search results of wikipedia articles for given keywords belonging to the same topic (uses wikipedia API).
Returns a ranked dict with ``nmax`` entries of wikipedia articles as search result for the query ``keywords``. If multiple keywords are
used, then all ``keywords`` should belong to the same topic.
Notes
-----
1.) Do not make more than one api function call per second due to given request limits by wikipedia.
2.) This implementation is not deterministic, the API returns different results within hours.
3.) If optional parameter ``date`` gets used, only articles which already existed at ``date`` will be returned, which increases the runtime
in certain circumstances significantly. Make sure to use the parameter ``timeout`` in this case, to limit the runtime for a single query.
Parameters
----------
keywords : str or list of str
Keyword(s) for the query (argument for the search).
email : str
Your personal or institutional e-mail address for the request header (providing the possibility to be contacted by Wikipedia)
nmax : int
Maximum number of wikipedia articles which should be returned (can be less depending on number of found results).
date : datetime.date
Results are limited to articles which already existed at ``date`` (default is None, see Notes).
timeout : int
Limits the runtime for this query to ``timeout`` seconds. When exceeding the threshold, intermediate results will be returned.
Returns
-------
dict
Results of the API search (key represents ranking: starting at 1 for best match, up to ``nmax``)
"""
start = datetime.datetime.now()
url = "https://en.wikipedia.org/w/api.php"
    if isinstance(keywords, list):
keywords = " ".join(keywords)
params = {
"action": "query",
"format": "json",
"list": "search",
"srsearch": keywords
}
header = {
'User-Agent': 'WikipediaNews 1.0',
'From': email
}
response = requests.get(url=url, params=params, headers=header)
if response.status_code == 200:
ranking = {}
i = 1
try:
for item in response.json()["query"]["search"]:
now = datetime.datetime.now()
if start + timedelta(seconds=timeout) < now:
break
elif "disambiguation" not in item["title"] and i <= nmax:
                    if date is None:
                        ranking[i] = item["title"].replace(" ", "_")
                        i += 1
                    else:
                        time.sleep(1)
                        # get_creationdate() also needs the contact e-mail; guard against a failed lookup
                        creation = get_creationdate(item["title"], email)
                        if creation is not None and creation <= date:
                            ranking[i] = item["title"].replace(" ", "_")
                            i += 1
except KeyError:
return {}
else:
print(response.status_code, response.reason)
return {}
return ranking
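
# A usage sketch for search_articles() with the optional date filter. The keywords,
# date, and e-mail below are illustrative assumptions; only articles that already
# existed on the given day are ranked, at the cost of one extra API call (plus a
# one-second sleep) per candidate title:
#
#   hits = search_articles("moon landing", "you@example.com", nmax=5, date=date(2019, 7, 20))
#   for rank, title in hits.items():
#       print(rank, title)
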
def get_creationdate(title, email):
"""
Get the date of creation of a given wikipedia article.
Notes
-----
Do not make more than one api function call per second due to given request limits by wikipedia.
Parameters
----------
title : str
Title of the wikipedia article (https://en.wikipedia.org/wiki/``title``).
email : str
Your personal or institutional e-mail address for the request header (providing the possibility to be contacted by Wikipedia)
Returns
-------
datetime.date
creation date of the article.
"""
url = "https://en.wikipedia.org/w/api.php"
params = {
"action": "query",
"prop": "revisions",
"titles": title,
"rvdir": "newer",
"rvlimit": 1,
"rvprop": "timestamp|user",
"rvslots": "main",
"formatversion": "2",
"format": "json"
}
header = {
'User-Agent': 'WikipediaNews 1.0',
'From': email
}
response = requests.get(url=url, params=params, headers=header)
if response.status_code == 200:
try:
timestamp = response.json()["query"]["pages"][0]["revisions"][0]["timestamp"]
return datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").date()
except KeyError:
return None
else:
print(response.status_code, response.reason)
return None


if __name__ == "__main__":
    # Minimal demo. The e-mail address below is a placeholder; replace it with your own contact address.
    email = "you@example.com"
    print(search_articles("9/11", email))
    # print(search_articles(["9/11", "attacks", "New", "York"], email))
    # print(get_creationdate("September_11_attacks", email))
    # print(get_creationdate("Aftermath_of_the_September_11_attacks", email))
    # print(search_articles(["9/11", "attacks", "New", "York"], email, date=date(2001, 11, 22)))  # e.g. on 2001-11-22, 9/11 was mentioned in the news, but not all related articles had been created yet
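
    # A hypothetical end-to-end sketch: feed the top-ranked search result into
    # get_counts() to fetch a week of pageviews (the dates are illustrative only).
    # results = search_articles("9/11", email)
    # if results:
    #     days, views = get_counts(results[1], date(2021, 9, 5), date(2021, 9, 11), email)
    #     if days is not None:
    #         for day, views_on_day in zip(days, views):
    #             print(day, views_on_day)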