# relevance_feedback_1.py
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
import os
import numpy
import re
import sys
import math
import pickle
# Class to store document (node) information
class node_data:
    # A node holds the raw text of a document/query plus its tf and idf dictionaries
    def __init__(self, str_sentence):
        self.sentence = str_sentence
        self.tf = {}
        self.idf = {}


# vocabulary: maps each word to the number of nodes (documents) it appears in
words_database = {}
# maps each document file name to an integer id
doc_id = {}
# # Processing line by line
# def word_by_word_processing(line):
#     line = re.sub("[^a-zA-Z]+", " ", line)
#     set_of_stop_words = nltk.corpus.stopwords.words('english')
#     # stem_ob = nltk.stem.porter.PorterStemmer()
#     # tokenset = nltk.tokenize.word_tokenize(line)
#     # final_set_of_tokens = []
#     final_line = ""
#     for token in tokenset:
#         token = token.lower()
#         if token not in set_of_stop_words:
#             # final_line += stem_ob.stem(token)
#             final_line += token
#             final_line += " "
#     return final_line
# This text processing module runs over every document in the folder given on the
# command line (e.g. a folder such as "alldocs")
def Doc_processing_module():
    file_node_list = []
    # file_list = os.listdir("/home/sid/Downloads/Assignement2_IR/Topic"+str(i+1))
    file_list = os.listdir(dir)
    temp_tkn = nltk.data.load('tokenizers/punkt/english.pickle')  # loaded but not used below
    i = 0
    # iterate through every file in the folder
    for file in file_list:
        if not file.endswith('.xml'):
            continue
        # print(file)
        doc_id[file] = i
        # file_ob = open("/home/sid/Downloads/Assignement2_IR/Topic"+str(i+1)+"/"+file,"r")
        # read the whole document into one string
        with open(dir + "/" + file, "r") as file_ob:
            file_text = file_ob.read()
        # final_text = word_by_word_processing(file_text)
        # print(file_text + "\n\n\n")
        node = node_data(file_text)
        file_node_list.append(node)
        i += 1
    # persist the file-name -> id mapping
    with open("doc_id_data.p", "wb") as doc1_data:
        pickle.dump(doc_id, doc1_data, protocol=2)
    return file_node_list
# End of function
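# Illustrative note: after this module runs, doc_id maps each .xml file name to a
# consecutive integer id, e.g. {"doc001.xml": 0, "doc002.xml": 1} (file names here
# are made up), and that mapping is pickled to "doc_id_data.p".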
# This text processing module runs over every query in "query.txt"
def Query_processing_module():
    query_node_list = []
    # file_list = os.listdir("/home/sid/Downloads/Assignement2_IR/Topic"+str(i+1))
    queries = open("query.txt", 'r')
    # iterate through every query (one per line)
    for query in queries:
        node = node_data(query)
        query_node_list.append(node)
    queries.close()
    # print(query_node_list)
    return query_node_list
# End of function
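# Note: query.txt is read line by line, so each query is expected on its own line,
# for example (contents are illustrative only):
#   cocoa price increase
#   coffee export regulations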
# Get the word list from the text stored in a node
def getwordlist(node):
    sent = node.sentence
    # sent = sent[5:]
    sent = sent.lower()
    # keep letters only; everything else becomes a space
    sent = re.sub("[^a-zA-Z]+", " ", sent)
    # print(sent + "\n")
    sent = sent.strip()
    word_list = sent.split(" ")
    stop_words = nltk.corpus.stopwords.words('english')  # computed but not applied (filter below is commented out)
    # word_list1 = [x for x in word_list if x not in stop_words]
    # drop empty tokens; return a list (not a lazy filter object) so callers can
    # iterate over it more than once
    word_list2 = [x for x in word_list if x != '']
    return word_list2
# end of function
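# Illustrative example of what getwordlist returns: for node.sentence equal to
# "The Quick, brown Fox!" it yields ['the', 'quick', 'brown', 'fox']; note that
# stop words are NOT removed, since the stop-word filter above is commented out.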
# Module to generate tf-idf vectors corresponding to the documents
def generate_tf_idf_vectors(node_list):
    # words_database is the global vocabulary dictionary: it stores, for each word,
    # the number of nodes in which that word appears
    # Calculation of tf
    for node in node_list:
        word_list = getwordlist(node)
        word_set = set(word_list)
        for word in word_set:
            node.tf[word] = 0
            if word not in words_database:
                words_database[word] = 1
            else:
                words_database[word] += 1
        # finding the tf-vector of the node
        for word in word_list:
            node.tf[word] += 1
    # Calculation of idf
    i = 0
    # N is taken as the vocabulary size (number of distinct words seen so far)
    N = len(words_database)
    nodes_to_be_removed = []
    for node in node_list:
        word_list = getwordlist(node)
        word_set = set(word_list)
        if len(word_set) == 0:
            nodes_to_be_removed.append(i)
        for word in word_set:
            ni = words_database[word]
            # print("word = " + word + " N = " + str(N) + " ni = " + str(ni))
            node.idf[word] = math.log(N * 1.0 / ni)
        i = i + 1
    # end of for loop
    print("size of nodes to be removed = " + str(len(nodes_to_be_removed)))
    # Removing invalid nodes (nodes containing no valid words)
    # final_node_list = []
    # l = len(node_list)
    # for i in range(0, l):
    #     if i not in nodes_to_be_removed:
    #         final_node_list.append(node_list[i])
    with open("doc_data.p", "wb") as doc_data:
        pickle.dump(node_list, doc_data, protocol=2)
    return node_list
# End of function
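# Worked example (numbers are illustrative): if the word "cocoa" occurs 3 times in a
# document and appears in 5 documents overall, then for that document's node
# tf["cocoa"] == 3 and idf["cocoa"] == log(N / 5), where N is len(words_database),
# i.e. the vocabulary size as used by this script (not the number of documents).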
# Module to generate tf-idf vectors corresponding to the queries
def generate_tf_idf_vectors_for_query(node_list):
    # Calculation of tf
    for node in node_list:
        word_list = getwordlist(node)
        # print(str(word_list[0]) + " is the index")
        # word_list.pop(0)
        word_set = set(word_list)
        for word in word_set:
            node.tf[word] = 0
        # finding the tf-vector of the node
        for word in word_list:
            node.tf[word] += 1
    # Calculation of idf
    i = 0
    N = len(words_database)
    nodes_to_be_removed = []
    for node in node_list:
        word_list = getwordlist(node)
        word_set = set(word_list)
        for word in word_set:
            if word in words_database:
                ni = words_database[word]
                # print(str(ni) + "\n")
                node.idf[word] = math.log(N * 1.0 / ni)
            else:
                # query words never seen in the document collection get a large constant idf
                node.idf[word] = 10000
        i = i + 1
    # print("Size of vocabulary : " + str(len(words_database)) + "\n\n")
    return node_list
# End of function
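# Illustrative sketch (an assumption, not something this script calls): the pickled
# tf/idf dictionaries are presumably consumed by a later relevance-feedback/ranking
# stage. A minimal cosine-similarity scorer over these per-node dictionaries could
# look like the helper below; the function name and the tf*idf weighting are
# illustrative choices, not part of the original pipeline.
def cosine_similarity_sketch(query_node, doc_node):
    # tf-idf weight of word w in a node: tf[w] * idf[w]
    q_weights = {w: query_node.tf[w] * query_node.idf.get(w, 0.0) for w in query_node.tf}
    d_weights = {w: doc_node.tf[w] * doc_node.idf.get(w, 0.0) for w in doc_node.tf}
    dot = sum(q_weights[w] * d_weights.get(w, 0.0) for w in q_weights)
    q_norm = math.sqrt(sum(v * v for v in q_weights.values()))
    d_norm = math.sqrt(sum(v * v for v in d_weights.values()))
    if q_norm == 0.0 or d_norm == 0.0:
        return 0.0
    return dot / (q_norm * d_norm)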
# main function
if __name__ == '__main__':
    # check the command line before reading the documents directory from it
    if len(sys.argv) < 2:
        print("Usage: python relevance_feedback_1.py <documents_directory>")
        sys.exit(1)
    dir = sys.argv[1]
    # Documents processing and tf vector and idf vector generation
    doc_list = Doc_processing_module()
    doc_node_list = generate_tf_idf_vectors(doc_list)
    # print('this is doc node list:', doc_node_list)
    # Query processing and tf vector and idf vector generation
    query_list = Query_processing_module()
    # print('this is query list: ', query_list)
    query_node_list = generate_tf_idf_vectors_for_query(query_list)
    # print('query node list', query_node_list)
    # Storing word vocabulary in a file
    with open("vocabulary.p", "wb") as voc_data:
        pickle.dump(words_database, voc_data, protocol=2)
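# Example usage (the folder name is illustrative; "query.txt" must be in the working
# directory):
#   python relevance_feedback_1.py alldocs
# The script pickles doc_id_data.p, doc_data.p and vocabulary.p for the later
# relevance-feedback stages.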