# createTables.py
import multiprocessing as mp  # used by the commented-out demo runs below
from datetime import date

import api_wikipedia as wiki
import manage_articles as mng
import matching


def createYearlyDatabase(path_source, path_result, year):
"""
    Creates a database at ``path_result`` of world news articles for a given ``year``, using data from ``path_source``.
    Notes
    -----
    1. Discards all items that do not belong to the world news section.
    2. Works with both NYT and The Guardian data.
Parameters
----------
path_source : str
Path on local machine where article database is located.
path_result : str
Path on local machine where resulting article dataset should be stored.
year : int
Year of publication of targeted articles (filter parameter).
    Returns
    -------
    None
        The filtered articles are written to ``path_result``.
    """
data = mng.load_articles(path_source)
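    # The two sources label world news differently, so the metadata filter is chosen by source.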
if "guardian" in path_source.lower():
data = mng.filter_articles(data, {"document_type": ["article"], "section_name": ["World news"]}, start=date(year,1,1), end=date(year,12,31))
else:
data = mng.filter_articles(data, {"document_type": ["article"], "type_of_material": ["News"], "section_name": ["World"]}, start=date(year,1,1), end=date(year,12,31))
mng.store_articles(data, path_result)


def compute_topKeywordsTable(path_source, path_result, n=50):
    """
    Creates a Markdown table at ``path_result`` with the ``n`` most frequent (top) keywords of the database at ``path_source``.
Parameters
----------
path_source : str
Path on local machine where article database is located.
path_result : str
Path on local machine where resulting Markdown table should be stored.
n : int
Number of top keywords in the table (default is 50).
Returns
-------
None
"""
articles = mng.load_articles(path_source)
# Get a dict of dicts for each calendar week with word frequencies from getWordCounts
wordCounts = mng.getWordCounts(articles)
# List of distinct words
distinctWords = mng.getDistinctWords(wordCounts)
# List of lists of tuples containing weekly word frequency
countsPerWeek = []
for w in distinctWords:
countsPerWeek.append((w,mng.getCountPerWeek(wordCounts,w)))
    # Sum the weekly counts into a yearly total for each keyword
    words = []
    for word, weekly in countsPerWeek:
        words.append([word, sum(count for _, count in weekly)])
    # Sort by yearly total and keep the n most frequent keywords
    ts_sorted = sorted(words, key=lambda x: x[1], reverse=True)
    keywords = [entry[0] for entry in ts_sorted[:n]]
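    # Advanced matching: groupmatch derives a Wikipedia query for each keyword
    # (using the articles for context) and returns the matched link(s) per keyword.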
m = matching.groupmatch(keywords, articles)
    # Append mode: repeated runs extend an existing table file.
    with open(path_result, 'a', encoding="utf-8") as writer:
        writer.write("| # | keyword | matching result (simple) | computed query (advanced) | matching result (advanced) |")
        writer.write("\n|---|---|---|---|---|")
        for i, kw in enumerate(keywords):
            # Simple matching: a plain Wikipedia search on the raw keyword.
            simple = wiki.search_articles(kw, nmax=1)
            simple_res = simple[1] if len(simple) > 0 else "EMPTY MATCHING"
            advanced_res = m[kw]["link"][1] if len(m[kw]["link"]) > 0 else "EMPTY MATCHING"
            writer.write("\n| {}. | {} | {} | {} | {} |".format(i + 1, kw, simple_res, m[kw]["query"], advanced_res))


def compute_mostInterestingKeywordsTable(path_source, path_result, min_weektotal=10, min_changerate=5):
    """
    Creates a Markdown table at ``path_result`` with the most interesting keywords of the database at ``path_source``.
    Keywords must fulfill both of the following conditions in at least one (and the same) week:
    1. The keyword was mentioned at least ``min_weektotal`` times in that week.
    2. The keyword's mentions changed at a relative rate of at least ``min_changerate`` compared to the previous week.
Parameters
----------
path_source : str
Path on local machine where article database is located.
path_result : str
Path on local machine where resulting Markdown table should be stored.
min_weektotal : int
Minimum number of total mentions in at least one week (default is 10).
min_changerate : int
Minimum number for the relative changerate of mentions compared to the previous week (default is 5).
Returns
-------
None
"""
    articles = mng.load_articles(path_source)
    # filter_interestingness operates directly on the article data, so no
    # per-week word counts need to be precomputed here.
# Compute interestingness
interestingWords = mng.filter_interestingness(articles, min_weektotal=min_weektotal, min_changerate=min_changerate)
res = {}
keywords = []
for k in interestingWords:
keywords.append(k)
        # The keys of interestingWords[k] are (year, week) tuples; keep only the
        # calendar week and round the changerate to two decimals:
res[k] = {}
for el in interestingWords[k]:
res[k][el[1]] = [interestingWords[k][el][0], round(interestingWords[k][el][1],2)]
m = matching.groupmatch(keywords, articles)
    # Append mode: repeated runs extend an existing table file.
    with open(path_result, 'a', encoding="utf-8") as writer:
        writer.write("| row | keyword | week: [ total , changerate ] | computed query (advanced) | matching result (advanced) |")
        writer.write("\n|---|---|---|---|---|")
        for i, kw in enumerate(keywords):
            if len(m[kw]["link"]) > 0:
                row = "\n| {}. | {} | {} | {} | {} |".format(i + 1, kw, res[kw], m[kw]["query"], m[kw]["link"][1])
            else:
                row = "\n| {}. | {} | {} | {} | EMPTY MATCHING |".format(i + 1, kw, res[kw], m[kw]["query"])
            # Strip the dict braces from the rendered res[kw] for a cleaner Markdown cell.
            writer.write(row.replace("{", "").replace("}", ""))


'''
# Create yearly database of world news articles for NYT and TheGuardian:
for year in range(2001,2021):
createYearlyDatabase("/home/lmoldon/forschungspraktikum/nyt_reduced.json", "/home/lmoldon/forschungspraktikum/data/nyt{}.json".format(year), year)
compute_topKeywordsTable("/home/lmoldon/forschungspraktikum/data/nyt{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/nytTop{}.md".format(year))
compute_mostInterestingKeywordsTable("/home/lmoldon/forschungspraktikum/data/nyt{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/nytMostInteresting{}.md".format(year))
createYearlyDatabase("/home/lmoldon/forschungspraktikum/theguardian_reduced.json", "/home/lmoldon/forschungspraktikum/theguardian{}.json".format(year), year)
compute_topKeywordsTable("/home/lmoldon/forschungspraktikum/data/theguardian{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/theguardianTop{}.md".format(year))
compute_mostInterestingKeywordsTable("/home/lmoldon/forschungspraktikum/data/theguardian{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/theguardianMostInteresting{}.md".format(year))
'''

'''
# WITH MULTIPROCESSING: Create yearly database of world news articles for NYT and TheGuardian:
def single_run(year, nyt):
if nyt:
createYearlyDatabase("/home/lmoldon/forschungspraktikum/nyt_reduced.json", "/home/lmoldon/forschungspraktikum/data/nyt{}.json".format(year), year)
compute_topKeywordsTable("/home/lmoldon/forschungspraktikum/data/nyt{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/nytTop{}.md".format(year))
compute_mostInterestingKeywordsTable("/home/lmoldon/forschungspraktikum/data/nyt{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/nytMostInteresting{}.md".format(year))
else:
createYearlyDatabase("/home/lmoldon/forschungspraktikum/theguardian_reduced.json", "/home/lmoldon/forschungspraktikum/data/theguardian{}.json".format(year), year)
compute_topKeywordsTable("/home/lmoldon/forschungspraktikum/data/theguardian{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/theguardianTop{}.md".format(year))
compute_mostInterestingKeywordsTable("/home/lmoldon/forschungspraktikum/data/theguardian{}.json".format(year), "/home/lmoldon/forschungspraktikum/results/theguardianMostInteresting{}.md".format(year))
tasks = []
for year in range(2001,2021):
for flag in range(0,2):
tasks.append((year, flag))
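# Note: this starts one process per (year, source) task at once (40 in total);
# the Pool-based sketch after this block bounds the parallelism instead.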
processes = [mp.Process(target=single_run, args=(task[0], task[1])) for task in tasks]
for p in processes:
p.start()
for p in processes:
p.join()
'''
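
# A minimal alternative sketch (an addition, not part of the original runs): the
# same yearly tasks driven through multiprocessing.Pool, which caps the number of
# concurrent worker processes instead of launching all 40 at once. It assumes the
# single_run(year, nyt) helper from the demo above is defined at module level.
'''
if __name__ == "__main__":
    tasks = [(year, flag) for year in range(2001, 2021) for flag in (0, 1)]
    with mp.Pool(processes=4) as pool:  # 4 workers; tune to the machine
        pool.starmap(single_run, tasks)  # starmap unpacks each (year, nyt) pair
'''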