-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf.py
137 lines (120 loc) · 4.98 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Event registry summary statistics
#imports
import numpy as np
import math
import json
from collections import Counter
import time
from eventregistry import *
from datetime import date, datetime
# Progress marker: reaching this line means every import above succeeded.
# A single parenthesised argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('imports completed')

# Reference time for the elapsed-time figures printed while files are read.
startTime = datetime.now()

file_counter = 876  # number of event files to read (all files)
#file_counter = 10 #realistic
# Document count used as the numerator of the IDF ratio; assumes every
# event file holds 1000 events.
total_files = file_counter * 1000
files = 0  # index of the event file currently being read
# Running event index; its string form is the key used to look an event up
# in the loaded file.
event = 0
output = []  # one flattened dict per processed event
concept_main_list = {}  # word -> count gathered from English event summaries
max_sum = 0  # largest un-reduced hash total produced by hashing() so far


def hashing(word):
    """Return a position-weighted character hash of *word* in [0, 2000000).

    Each character contributes ``ord(char) * position``, with positions
    counted from 1; the total is then reduced modulo 2000000.

    Side effect: the module-level ``max_sum`` is raised to the un-reduced
    total whenever that total exceeds its current value.
    """
    global max_sum
    # Named ``total`` rather than ``sum`` so the builtin is not shadowed.
    total = 0
    for position, char in enumerate(word, 1):
        total += ord(char) * position
    # The running total never decreases, so one comparison after the loop
    # records the same maximum as comparing on every step.
    if total > max_sum:
        max_sum = total
    return total % 2000000
def list_check(list, index):
    """Look up ``list[index]``; an out-of-range index yields 0 instead."""
    try:
        found = list[index]
    except IndexError:
        # Positions outside the sequence read as zero.
        return 0
    else:
        return found
#count = 0
#while count < 2000000: #initialise the hash table
# concept_main_list.append(0)
# count = count + 1
def _summary_words(summary):
    """Return the distinct words of *summary*, punctuation-stripped, in order.

    The text is split on whitespace and '.', ',' and '"' are removed from
    every token.  Duplicates are dropped AFTER stripping, so "rain" and
    "rain." count as one word; a set makes the membership test O(1).
    """
    seen = set()
    words = []
    for token in summary.split():
        cleaned = token.replace(".", "").replace(",", "").replace("\"", "")
        if cleaned not in seen:
            seen.add(cleaned)
            words.append(cleaned)
    return words


# Main pass: read every event file, flatten each non-merged event into a
# small dict, and record in concept_main_list how many English event
# summaries contain each word (at most one count per word per event).
while files < file_counter:
    concept_list = []
    file_name = 'events-00' + '{0:03d}'.format(files) + '000.json'
    print(file_name)
    print(datetime.now() - startTime)
    print(str(round(float(files) / float(file_counter), 4) * 100) + '%')
    # Only the load needs the file handle; close it before processing.
    with open('events/' + file_name) as data_file:
        data = json.load(data_file)

    # The upper bound below treats file N as holding the keys N*1000,
    # N*1000 + 1, ...  Re-align the running index to that start so a file
    # with fewer than 1000 events cannot leave the index behind and make the
    # next file's first lookup fail.
    event = files * 1000
    count = len(data) + (files * 1000)
    while event < count:
        event_dict = {}
        print(event)
        the_event = data[str(event)]  # one event record
        if 'info' in the_event:  # the event isn't a merge with another event
            the_event = the_event['info']
            if 'uri' in the_event:
                event_dict['ID'] = the_event['uri']
            if 'concepts' in the_event:
                # Per-concept extraction is disabled, so this is the
                # (empty) per-file list.
                event_dict['concepts'] = concept_list
            if 'eventDate' in the_event and the_event['eventDate'] != "":
                # Parsed only to validate the format: a malformed date
                # still raises ValueError, as it did before.
                datetime.strptime(the_event['eventDate'], "%Y-%m-%d")
                event_dict['event_date'] = the_event['eventDate']
            if 'multiLingInfo' in the_event:
                for key in the_event['multiLingInfo']:
                    # Ends up holding whichever language key is seen last.
                    event_dict['event_lang'] = key
                    if key == "eng":
                        summary = the_event['multiLingInfo'][key]['summary']
                        for word in _summary_words(summary):
                            print(word)
                            concept_main_list[word] = concept_main_list.get(word, 0) + 1
            # NOTE(review): the original indentation was lost; recording only
            # non-merged events is an assumption -- confirm against the
            # intended contents of `output`.
            output.append(event_dict)
        event = event + 1
    files = files + 1  # increment the file counter
# Final stage: turn the per-word counts into log(total_files / count),
# round-trip the table through concept_hash_table.json, then answer one
# interactive lookup.
# NOTE(review): startTime is reassigned here, so the 'This took ...' figure
# printed at the end covers only this stage (including the wait for user
# input), not the file-reading pass -- confirm that is intended.
startTime = datetime.now()
print(concept_main_list)

for term in concept_main_list:
    doc_count = int(concept_main_list[term])
    if doc_count > 0:
        concept_main_list[term] = math.log(total_files / float(doc_count))
    else:
        # Non-positive counts are stored back unchanged (as an int).
        concept_main_list[term] = doc_count

# The with-blocks close the file on exit; no explicit close() is needed.
with open("concept_hash_table.json", "w") as myfile:
    myfile.write(json.dumps(concept_main_list, indent=0))
with open("concept_hash_table.json", "r") as myfile:
    reading = json.load(myfile)

# raw_input exists only on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

word = read_line('What is the word? ')
if word in reading:
    print(reading[word])
else:
    # An unseen word used to abort with KeyError before the summary lines
    # below were printed.
    print('No value recorded for ' + repr(word))
print('There are ' + str(event) + ' events.')
print('This took ' + str(datetime.now() - startTime) + ' to run')