# rocchio_algorithm_new.py
# Import the necessary packages
import sys
import re
import math
import _pickle as pickle
from os import path, listdir

import nltk
from nltk.tokenize import word_tokenize

import lucene
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.store import RAMDirectory
# Indexer imports:
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
# Retriever imports:
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser

# ---------------------------- global constants ----------------------------- #
BASE_DIR = path.dirname(path.abspath(sys.argv[0]))
INPUT_DIR = BASE_DIR + "/data_without_titles/"
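
# Overview of this script (a sketch of the pipeline it implements):
#   1. Index every file under INPUT_DIR into an in-memory Lucene index.
#   2. Run the raw queries from query.txt and record the rank of each
#      query's known relevant document (search_loop).
#   3. Build log-tf * idf vectors for the queries, expand each vector with
#      its top-ranked documents (Rocchio, relevant documents only), and
#      keep the 10 highest-weighted terms as the updated query.
#   4. Re-run the updated queries and report the new average rank
#      (modified_search_loop).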
# Class to store document (node) information
class node_data:
    # A node holds one sentence plus its tf and idf maps
    def __init__(self, str_sentence):
        self.sentence = str_sentence
        self.tf = {}
        self.idf = {}
# Load the vocabulary dictionary from its pickle file
with open("vocabulary.p", "rb") as x:
    words_database = pickle.load(x, encoding='utf-8')
# Load the per-document node list from its pickle file
with open("doc_data.p", "rb") as y:
    doc_node_list = pickle.load(y, encoding='utf-8')
# Load the document-id hashmap from its pickle file
with open("doc_id_data.p", "rb") as z:
    doc_id = pickle.load(z, encoding='utf-8')
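
# Assumed (from how they are used below) structure of the pickled objects:
#   words_database: dict mapping word -> number of documents containing it
#   doc_node_list:  list of node_data objects, one per indexed document
#   doc_id:         dict mapping document title -> index into doc_node_list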
# Documents returned by the Lucene search, keyed by query number
lucene_output_docs = {}
# tf-idf vectors of the queries, keyed by query index
query_tf_idf = {}
def create_document(file_name):
    file_path = INPUT_DIR + file_name  # assemble the file path
    with open(file_path) as file:      # open in read mode
        doc = Document()               # create a new document
        # add the title field
        doc.add(StringField("title", file_name, Field.Store.YES))
        # add the whole book
        doc.add(TextField("text", file.read(), Field.Store.YES))
    return doc
# Initialize lucene and the JVM
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# Create a new directory. As a SimpleFSDirectory is rather slow ...
directory = RAMDirectory()  # ... we'll use a RAMDirectory!
# Get and configure an IndexWriter
analyzer = StandardAnalyzer()
analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
config = IndexWriterConfig(analyzer)
writer = IndexWriter(directory, config)
print("Number of indexed documents: %d\n" % writer.numDocs())
for input_file in listdir(INPUT_DIR):  # iterate over all input files
    doc = create_document(input_file)  # build a Lucene document for the file
    writer.addDocument(doc)            # add the document to the IndexWriter
writer.close()
# Text processing: wrap every query in "query_for_updated_query.txt" in a node
def Query_processing_module():
    query_node_list = []
    with open("query_for_updated_query.txt", 'r') as queries:
        for query in queries:
            node = node_data(query)
            query_node_list.append(node)
    return query_node_list
# End of function
# Get the word list for the text stored in a node
def getwordlist(node):
    sent = node.sentence
    sent = sent.lower()
    sent = re.sub("[^a-zA-Z]+", " ", sent)  # keep letters only
    sent = sent.strip()
    word_list = sent.split(" ")
    # NOTE: stopword removal is currently disabled
    # stop_words = nltk.corpus.stopwords.words('english')
    # word_list = [x for x in word_list if x not in stop_words]
    # materialise as a list so callers can iterate over it more than once
    word_list2 = [x for x in word_list if x != '']
    return word_list2
# end of function
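
# For example (illustrative input, not taken from the data set):
#   getwordlist(node_data("The Quick-Brown Fox!")) == ['the', 'quick', 'brown', 'fox']
# Digits and punctuation are replaced by spaces, so query numbers vanish here.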
# Module to generate tf-idf vectors corresponding to the sentences
def generate_tf_idf_vectors_for_query(node_list):
    # Calculation of tf
    for node in node_list:
        word_list = getwordlist(node)
        word_set = set(word_list)
        for word in word_set:
            node.tf[word] = 0
        # finding out the tf-vector of the node
        for word in word_list:
            node.tf[word] += 1
    # Calculation of idf
    N = len(words_database)
    for node in node_list:
        word_list = getwordlist(node)
        word_set = set(word_list)
        for word in word_set:
            if word in words_database:
                ni = words_database[word]
                node.idf[word] = math.log(N * 1.0 / ni)
            else:
                # sentinel idf for out-of-vocabulary words; such terms are
                # skipped when the query vector is built below
                node.idf[word] = 100000
    return node_list
# End of function
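
# The query weights built from these maps in the main block follow the
# standard log-scaled tf-idf scheme:
#   w(t, q) = log(1 + tf(t, q)) * log(N / n_t)
# where N is taken as the vocabulary size above and n_t is the number of
# documents containing t (per the assumed words_database layout).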
# Map from query number to that query's known relevant document
query_to_file = {}
# Run the baseline Lucene search and report the average rank
def search_loop(searcher, analyzer):
    rank_sum = 0
    with open("query.txt", 'r') as queries:
        # each line is expected to be: <query_no> <file_name> <query terms ...>
        for command in queries:
            x = word_tokenize(command)
            query_no = float(x[0])
            file_name = str(x[1])
            query_to_file[query_no] = file_name
            print('FILE NAME : ' + file_name)
            lucene_output_docs[query_no] = []
            # strip the line's prefix up to just past the first 'l'
            # (original heuristic for dropping the query number and file name)
            com_length = command.find('l') + 2
            temp_q = command[com_length:]
            print("search loop: " + temp_q + "\n")
            query = QueryParser("text", analyzer).parse(temp_q)
            # retrieving the top 50 results for each query
            scoreDocs = searcher.search(query, 50).scoreDocs
            # writing the output to the file
            with open("lucene_output.txt", "a") as output_file2:
                for scoreDoc in scoreDocs:
                    doc = searcher.doc(scoreDoc.doc)
                    temp_str = str(doc.get("title"))
                    lucene_output_docs[query_no].append(temp_str)
                    output_file2.write(str(int(query_no)) + " " + temp_str + "\n")
            # Results retrieved
            print('list of docs', lucene_output_docs[query_no])
            if file_name in lucene_output_docs[query_no]:
                index = lucene_output_docs[query_no].index(file_name)
                print('The rank of the file name is:', index)
                rank_sum = rank_sum + index
            else:
                # relevant document not retrieved: charge the maximum rank
                print('not in the list')
                rank_sum = rank_sum + 50
    # average rank over the (assumed) 100 queries in query.txt
    average_position = rank_sum / 100
    print('Average position before Rocchio:', average_position)
# End of function
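
# The metric reported above is a mean rank: the 0-based position of each
# query's relevant document in the top-50 list, with 50 charged when it is
# missing, averaged over the assumed 100 queries. Lower is better.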
# Re-run the search with the Rocchio-updated queries and report the average rank
def modified_search_loop(searcher, analyzer, query_list):
    rank_sum = 0
    print(query_to_file)
    # reading every updated query from the list
    for command in query_list:
        x = word_tokenize(command)
        query_no = int(x[0])
        filename = query_to_file.get(query_no)
        print('The relevant file name is:', filename)
        lucene_output_docs[query_no] = []
        # drop the leading query number, keeping only the query terms
        temp_q = command.split(" ", 1)[1]
        query = QueryParser("text", analyzer).parse(temp_q)
        # retrieving the top 50 results for each query
        scoreDocs = searcher.search(query, 50).scoreDocs
        # writing the output to the file
        with open("lucene_output_for_updated_queries.txt", "a") as output_file2:
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                temp_str = str(doc.get("title"))
                lucene_output_docs[query_no].append(temp_str)
                output_file2.write(str(query_no) + " " + temp_str + "\n")
        if filename in lucene_output_docs[query_no]:
            index = lucene_output_docs[query_no].index(filename)
            print('The rank of the updated file name is:', index)
            rank_sum = rank_sum + index
        else:
            # relevant document not retrieved: charge the maximum rank
            print('not in the list')
            rank_sum = rank_sum + 50
    # average rank over the (assumed) 100 queries
    average_position = rank_sum / 100
    print('Average position after Rocchio:', average_position)
# End of function
# main function
if __name__ == '__main__':
    # Create a searcher for the above defined Directory
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(BM25Similarity())
    # Create a new retrieving analyzer
    analyzer = StandardAnalyzer()
    search_loop(searcher, analyzer)
    print("Lucene output generated...")
    # text processing and tf-idf for the queries
    query_list = Query_processing_module()
    query_node_list = generate_tf_idf_vectors_for_query(query_list)
    print("Query processing and tf-idf over\n")
    updated_query_list = []
    i = 0
    for query_node in query_node_list:
        query_wordlist = getwordlist(query_node)
        query_wordset = set(query_wordlist)
        query_tf_idf[i] = {}
        # the first three characters of the raw sentence hold the query number
        query_no = int(query_node.sentence[0:3])
        # Calculating the tf-idf vector for the query, skipping
        # out-of-vocabulary terms (sentinel idf)
        for word in query_wordset:
            if query_node.idf[word] != 100000:
                query_tf_idf[i][word] = math.log(1 + query_node.tf[word]) * query_node.idf[word]
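        # Rocchio query update (relevant documents only, so gamma = 0):
        #   q' = alpha * q + (beta / |Dr|) * sum over d in Dr of d
        # Here alpha is implicitly 1 and beta / |Dr| is folded into the single
        # constant below; Dr is the top 10 documents retrieved by the
        # baseline search for this query.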
        b_by_delta_dr = 0.65
        # Use only the top 10 documents from the Lucene output
        j = 0
        # Implementing the Rocchio algorithm for each query (query vector update)
        for doc in lucene_output_docs[query_no]:
            str_doc = str(doc)
            doc_index = doc_id[str_doc]
            cur_doc_node = doc_node_list[doc_index]
            doc_word_list = getwordlist(cur_doc_node)
            doc_word_set = set(doc_word_list)
            # Rocchio update: add the weighted document vector to the query vector
            for word in doc_word_set:
                if word in query_tf_idf[i]:
                    query_tf_idf[i][word] += b_by_delta_dr * math.log(cur_doc_node.tf[word] + 1) * cur_doc_node.idf[word]
                else:
                    query_tf_idf[i][word] = b_by_delta_dr * math.log(cur_doc_node.tf[word] + 1) * cur_doc_node.idf[word]
            j += 1
            # Only the top 10 docs of the 50 retrieved
            if j == 10:
                break
        # Sort the expanded vector by weight and keep the top 10 terms
        new_query = str(query_no) + " "
        sorted_terms = sorted(query_tf_idf[i], key=query_tf_idf[i].get, reverse=True)
        k = 0
        for r in sorted_terms:
            new_query += str(r) + " "
            k += 1
            if k == 10:
                break
        print("original query = " + query_node.sentence)
        print("updated query = " + new_query)
        updated_query_list.append(new_query)
        print("---------------------------------------------------------\n\n")
        i += 1
    modified_search_loop(searcher, analyzer, updated_query_list)