-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgather_data.py
163 lines (146 loc) · 5.94 KB
/
gather_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import json
import os
import time
import string
import datetime
import ijson
import re
from pprint import pprint
# mindshare!
# Output file for the per-day language scores; get_data() truncates it and
# then appends one JSON record per day.
RESULTS_FILE = "results.txt"
# Output file for the per-day "top story" records, written alongside
# RESULTS_FILE by get_data().
STORIES_FILE = "top.txt"
# When True, get_data() returns early if RESULTS_FILE already exists, so an
# earlier run's output is not overwritten.
SAFE = False
def get_data(filepath="D:\\Inbox\\HackerNewsStoriesAndCommentsDump\\HNStoriesAll.json"):
    """Scan the Hacker News story dump and write per-day language scores.

    Streams the dump with ijson, scores every story against each language in
    ``languages.json`` via ``find``, and accumulates the scores per calendar
    day. Whenever the story date changes, the finished day's record is
    appended to RESULTS_FILE and that day's top story to STORIES_FILE.

    Each record is a JSON object keyed by the string form of the
    ``(day, month, year)`` tuple, e.g. ``{"(29, 5, 2014)": {language: score}}``.
    A new record is started on every date change, so a dump that is not
    grouped by day yields several records for the same day.

    Args:
        filepath: Path to the JSON dump. Defaults to the original hard-coded
            location so existing callers keep working.

    Returns:
        None. Results are written to RESULTS_FILE and STORIES_FILE. Nothing
        is written when SAFE is set and RESULTS_FILE already exists.
    """
    if SAFE and os.path.exists(RESULTS_FILE):
        print("%s already exists" % RESULTS_FILE)
        return

    # load static language list
    with open("languages.json") as fi:
        languages = json.load(fi)

    # init results files (truncating any previous run) and per-day state
    writeData('[', RESULTS_FILE, append=False)
    writeData('[', STORIES_FILE, append=False)
    store = initStore('init', languages)
    topStory = {}
    currentDate = ()
    primary_key = ''

    # initialize some counters
    top_score = 0
    stories = 0
    days = 0

    # we will store each day as a line in the file
    # format is {"(day, month, year)": {language: score, language: score, ...}}
    print("Loading file: %s" % filepath)
    start = time.time()
    # Binary mode: ijson decodes the bytes itself, so the result does not
    # depend on the platform's default text encoding.
    with open(filepath, 'rb') as fi:
        # each "hits" list is 1000 items
        for hits in ijson.items(fi, 'item.hits'):
            for story in hits:
                stories += 1

                # check what day the story is from, init new primary key if necessary
                newDate = parseTimestamp(story.get('created_at'))
                if newDate != currentDate:
                    primary_key = str(newDate)
                    if currentDate != ():
                        # the previous day is complete: flush it
                        writeLine(store, RESULTS_FILE)
                        writeLine(topStory, STORIES_FILE)
                    store = initStore(primary_key, languages)
                    topStory = {primary_key: ""}
                    currentDate = newDate
                    top_score = 0
                    days += 1

                if stories % 1000 == 0:
                    # print some occasional status
                    print("Processed: %s stories, %s days" % (stories, days))

                for item in languages:
                    # check if any of this language's target words are present
                    score = find(story, item['words'])
                    if score:
                        # if we found a match, store the associated score
                        store[primary_key][item['name']] += score
                        if score >= top_score:
                            # Remember the new best score; without this every
                            # match would replace the day's top story.
                            top_score = score
                            topStory[primary_key] = story

    # Flush the final day: the loop only writes a day when the *next* day
    # begins, so the last one would otherwise be lost.
    if currentDate != ():
        writeLine(store, RESULTS_FILE)
        writeLine(topStory, STORIES_FILE)

    # wrap it up
    # NOTE(review): writeLine appends ',' after every record, so this closing
    # bracket follows a trailing comma and strict JSON parsers will reject the
    # file. Left unchanged because downstream readers may rely on the format.
    writeData(']', RESULTS_FILE, append=True)
    writeData(']', STORIES_FILE, append=True)

    end = time.time()
    print("total time: %s min" % ((end - start) / 60) )
    if stories:
        # avoid ZeroDivisionError on an empty dump
        print("time per query: %s sec" % ((end - start) / stories) )
def parseTimestamp(dateString):
    """Turn a timestamp such as ``2014-05-29T08:25:40Z`` into a date tuple.

    Args:
        dateString: Timestamp text in ``%Y-%m-%dT%H:%M:%SZ`` form.

    Returns:
        A ``(day, month, year)`` tuple of ints; the time of day is discarded.

    Raises:
        ValueError: If ``dateString`` does not match the expected format.
    """
    stamp = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
    day, month, year = stamp.day, stamp.month, stamp.year
    return (day, month, year)
def writeLine(data, filename):
    """Append ``data`` to ``filename`` as one JSON record.

    The record is serialized with ``json.dumps`` and terminated with a comma
    and a newline, so successive calls produce one record per line.
    """
    record = json.dumps(data)
    writeData(record + ',\n', filename)
def writeData(data, filename, append=True):
    """Write the string ``data`` to ``filename`` verbatim.

    Args:
        data: Text to write; no separator or newline is added.
        filename: Path of the file to write.
        append: When true (the default) the text is appended; otherwise the
            file is truncated first.
    """
    if append:
        mode = 'a'
    else:
        mode = 'w'
    with open(filename, mode) as handle:
        handle.write(data)
def initStore(primary_key, languages):
    """Build an empty score table for one day.

    Args:
        primary_key: Key identifying the day the table belongs to.
        languages: Iterable of dicts, each carrying a ``'name'`` entry.

    Returns:
        ``{primary_key: {name: 0, ...}}`` with one zeroed entry per language
        name.
    """
    zeroed_scores = {entry['name']: 0 for entry in languages}
    return {primary_key: zeroed_scores}
def find(data, targets):
    """Score one story against a list of target words.

    The story's title and text are normalized with ``normalizeString`` and
    each target is searched for as a whole, space-delimited word. Every hit
    (title and text count separately, per target) that also passes
    ``verifySingle`` adds the story's ``points`` to the score.

    Args:
        data: Story dict; the ``'title'``, ``'text'`` and ``'points'`` keys
            are read. Missing or null values are treated as empty / zero.
        targets: Iterable of words to look for. They are compared against
            lowercased text, so presumably they are stored lowercase --
            verify against languages.json.

    Returns:
        ``points * number_of_hits`` as an int; 0 when nothing matched.
    """
    # ``or ''`` also covers keys that are present but null in the dump, which
    # would otherwise crash normalizeString.
    title = data.get('title') or ''
    text = data.get('text') or ''
    norm_title = normalizeString(title)
    norm_text = normalizeString(text)

    # Count hits starting from zero so a story with no matching target always
    # yields 0 and is never credited to a language by accident.
    hits = 0
    for target in targets:
        # pad with spaces so we only match on word boundaries
        padded = " %s " % target
        if title and (padded in norm_title) and verifySingle(padded, title):
            hits += 1
        if text and (padded in norm_text) and verifySingle(padded, text):
            hits += 1

    if not hits:
        return 0
    # points may be missing or null; treat both as zero
    return hits * int(data.get('points') or 0)
def verifySingle(target, text):
    """Apply extra checks before accepting a single-character target.

    Targets longer than one character (after stripping whitespace) are always
    accepted. A single-character target is accepted only if its uppercase
    form appears in ``text`` as a standalone word, and it does not look like
    an initial: when ``X.`` occurs in the text, at least one occurrence must
    be followed by a space.

    Note that a sentence-final ``X.`` at the very end of ``text`` has no
    trailing space and is therefore rejected, unless ``X. `` appears earlier.

    Args:
        target: The target word, optionally padded with whitespace.
        text: The original (non-normalized, case-preserved) text.

    Returns:
        True if the match should count, False otherwise.
    """
    target = target.strip()
    if len(target) != 1:
        return True

    target = target.upper()
    # Escape the character so regex metacharacters such as '+' or '(' are
    # matched literally instead of raising re.error.
    if not re.search(r"\b(%s)\b" % re.escape(target), text):
        return False

    # "X." not followed by a space anywhere looks like an initial (e.g. "C.S.")
    if target + '.' in text and target + '. ' not in text:
        return False

    return True
def normalizeString(sentence):
    """
    Prepare text for whole-word matching.

    Letters, digits and the characters ``+``, ``#`` and ``-`` (needed for
    names such as c++, c# and objective-c) are kept. Any other printable
    character, including punctuation and whitespace, becomes a space, and
    anything non-printable becomes ``~``. The result is lowercased and padded
    with one space on each side, so a target wrapped in spaces matches
    "i like python." but 'c' does not match inside 'computer'.
    """
    keep_as_is = '+#-'
    pieces = []
    for ch in sentence:
        if ch in string.ascii_letters or ch in keep_as_is or ch.isnumeric():
            pieces.append(ch)
        elif ch in string.printable:
            pieces.append(" ")
        else:
            pieces.append("~")
    return " " + "".join(pieces).lower() + " "
# Run the full data-gathering pass only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    get_data()