-
Notifications
You must be signed in to change notification settings - Fork 1
/
nqc.py
126 lines (105 loc) · 5.03 KB
/
nqc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#! /usr/bin/env python
import argparse
import xml.etree.ElementTree as eT
from collections import defaultdict
import numpy as np
import pandas as pd
# TODO: remove the queries file, or implement an option to add it just for testing (ensure all queries are present)
# TODO: switch the code to work with the global qpputils
# Command-line interface of the NQC predictor script.
parser = argparse.ArgumentParser(description='NQC predictor',
                                 usage='Input QL(q|d) scores and queries files',
                                 epilog='Prints the NQC predictor scores')
parser.add_argument('results', metavar='QL(q|d)_results_file', help='The QL results file for the documents scores')
parser.add_argument('corp_scores', metavar='QLC', help='logqlc QL Corpus scores of the queries')
# nargs='?' is required for the default to take effect: argparse ignores `default`
# on a mandatory positional argument.
parser.add_argument('queries', metavar='queries_xml_file', nargs='?', default='data/ROBUST/queries.xml',
                    help='The queries xml file')
parser.add_argument('-t', '--textqueries', metavar='queries_txt_file', default='data/ROBUST/queries.txt',
                    help='The queries txt file')
# type=int so a value given on the command line has the same type as the default.
parser.add_argument('-d', '--docs', metavar='fbDocs', type=int, default=20, help='Number of documents')
class QueriesParser:
    """Parse a queries XML file and keep per-query lookups keyed by query number.

    Every ``<query>`` element is expected to contain a ``<number>`` and a
    ``<text>`` child element.
    """

    def __init__(self, query_file):
        """Parse *query_file* and fill the per-query dictionaries.

        :param query_file: the queries XML source, handed to ``ElementTree.parse``
        """
        self.file = query_file
        self.tree = eT.parse(self.file)
        self.root = self.tree.getroot()
        # query number -> full content of the <text> element ("full command")
        self.full_queries = defaultdict(str)
        # query number -> list of the query terms
        self.text_queries = defaultdict(str)
        # query number -> number of query terms
        self.query_length = defaultdict(int)
        # query number -> list of feedback document ids (see add_feedback_docs)
        self.fb_docs = defaultdict(list)
        self.__parse_queries()

    @staticmethod
    def _extract_terms(query_str):
        """Return the whitespace-separated terms between the first '(' and the last ')'.

        When a parenthesis is missing, the corresponding end of the string is
        used instead, so a plain-text query is not truncated.
        """
        start = query_str.find("(") + 1  # 0 when there is no '('
        end = query_str.rfind(")")
        if end < start:
            # No closing parenthesis after the opening one: keep the rest of the string.
            end = len(query_str)
        return query_str[start:end].split()

    def __parse_queries(self):
        """Read every <query> element into the lookup dictionaries."""
        for query in self.root.iter('query'):
            qid_ = query.find('number').text
            qstr_ = query.find('text').text
            qtxt_ = self._extract_terms(qstr_)
            self.full_queries[qid_] = qstr_
            self.text_queries[qid_] = qtxt_
            self.query_length[qid_] = len(qtxt_)

    def add_feedback_docs(self, num_docs, res):
        """
        Adds the fbDocs from results file to the original queries
        :parameter: num_docs: number of fbDocs to add to each query
        :parameter: res: results table indexed by query number, with a 'docID' column
        :raises KeyError: if a query number has no rows in *res*
        """
        for qid in self.full_queries:
            # Selecting with a list of labels always yields a Series, even when the
            # query has a single result row (a scalar label would return a bare string).
            docs = res.loc[[qid], 'docID'].head(num_docs)
            self.fb_docs[qid] = list(docs)

    def write_to_file(self):
        """Append a <feedbackDocno> element per stored feedback document and dump the tree.

        Despite the name, the XML is written with ``ElementTree.dump``, i.e. to stdout.
        """
        for query in self.root.iter('query'):
            qid = query.find('number').text
            for doc in self.fb_docs[qid]:
                temp = eT.SubElement(query, 'feedbackDocno')
                temp.text = doc
        eT.dump(self.tree)
class NQC:
    """This class implements the QPP method as described in:
    'Predicting Query Performance by Query-Drift Estimation'
    The predictor is implemented to work with log(QL) scores (not -CE)"""

    def __init__(self, queries_obj, results_df, corpus_scores_df):
        """Store the inputs of the predictor.

        :param queries_obj: object exposing a ``query_length`` mapping keyed by query id
        :param results_df: results table indexed by query id, with a 'docScore' column
        :param corpus_scores_df: corpus scores table indexed by query id, with a 'score' column
        """
        self.qdb = queries_obj
        self.res = results_df
        self.corp = corpus_scores_df
        # query id -> predictor score, filled by calc_results
        self.predictions = defaultdict(float)

    def _calc_denominator(self, qid):
        """Return the absolute value of the corpus score of *qid*."""
        _score = self.corp.loc[qid]['score']
        return abs(_score)

    def _calc_numerator(self, qid, num_docs):
        """Return the standard deviation of the first *num_docs* document scores of *qid*.

        For -CE scores each document score would additionally be multiplied by the
        query length before taking the standard deviation; that variant is not used here.
        """
        # Selecting with a list of labels always yields a Series, even when the
        # query has a single result row (a scalar label would return a bare float).
        _scores = self.res.loc[[qid], 'docScore'].head(num_docs)
        return np.std(_scores.values)

    def calc_results(self, number_of_docs):
        """Compute, store and print the predictor score of every query.

        Each score is printed as ``<qid> <score>`` with four decimal places and
        also kept in ``self.predictions``.

        :param number_of_docs: how many top rows of each query's results are used
        """
        for qid in self.qdb.query_length:
            _denominator = self._calc_denominator(qid)
            _numerator = self._calc_numerator(qid, number_of_docs)
            _score = _numerator / _denominator
            self.predictions[qid] = _score
            print('{} {:0.4f}'.format(qid, _score))
        # predictions_df = pd.Series(self.predictions)
        # predictions_df.to_json('wig-predictions-{}.res'.format(number_of_docs))
def main(args):
    """Load the input files and print the NQC score of every query.

    :param args: parsed command-line namespace; reads ``results``, ``queries``,
                 ``docs`` and ``corp_scores``
    """
    results_file = args.results
    query_file = args.queries
    number_of_docs = int(args.docs)
    logqlc_file = args.corp_scores
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
    corp_scores_df = pd.read_table(logqlc_file, sep=r'\s+', header=None, index_col=0, names=['qid', 'score'],
                                   dtype={'qid': str, 'score': float})
    results_df = pd.read_table(results_file, sep=r'\s+', header=None, index_col=0,
                               names=['qid', 'Q0', 'docID', 'docRank', 'docScore', 'ind'],
                               dtype={'qid': str, 'Q0': str, 'docID': str, 'docRank': int, 'docScore': float,
                                      'ind': str})
    # Force string query ids on both tables so they match the ids parsed from the XML.
    results_df.index = results_df.index.map(str)
    corp_scores_df.index = corp_scores_df.index.map(str)
    qdb = QueriesParser(query_file)
    nqc = NQC(qdb, results_df, corp_scores_df)
    nqc.calc_results(number_of_docs)
if __name__ == '__main__':
    # Script entry point: parse the command line and run the predictor.
    cli_args = parser.parse_args()
    main(cli_args)