
Commit

Merge pull request #1 from mam10eks/qpptk-dev
Qpptk dev
Zendelo authored Feb 21, 2024
2 parents e35aaca + 88b7ef7 commit a5900e8
Showing 89 changed files with 7,617 additions and 342 deletions.
8 changes: 8 additions & 0 deletions .devcontainer.json
@@ -0,0 +1,8 @@
{
"image": "mam10eks/qpptk:0.0.2-dev",
"customizations": {
"vscode": {
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "ms-toolsai.jupyter"]
}
}
}
25 changes: 25 additions & 0 deletions .github/workflows/run-all-tests.yml
@@ -0,0 +1,25 @@
name: Unit Tests

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Run tests
        working-directory: ./code/qpptk
        run: |
          mkdir ~/repos/
          pip3 install -r requirements.txt
          pytest
14 changes: 14 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,14 @@
{
"python.testing.unittestArgs": [
"-v",
"-s",
"./code",
"-p",
"*test*.py"
],
"python.testing.pytestEnabled": true,
"python.testing.unittestEnabled": false,
"python.testing.pytestArgs": [
"code"
]
}
41 changes: 41 additions & 0 deletions README.md
@@ -1,3 +1,44 @@
# QPP-EnhancedEval
Code to reproduce the ECIR 2021 paper "An Enhanced Evaluation Framework for Query Performance Prediction".


## Run it locally with TIRA

Please ensure that you have Python >= 3.7, Docker, and tira-run installed (`pip3 install tira`).

```
tira-run \
--input-dataset workshop-on-open-web-search/query-processing-20231027-training \
--input-run 'workshop-on-open-web-search/tira-ir-starter/Index (tira-ir-starter-pyterrier)' \
--image mam10eks/qpptk:0.0.1 \
--command 'python3 /qpptk_main.py -ti $inputRun/index/ --jsonl_queries $inputDataset/queries.jsonl --predict --retrieve --output $outputDir --cleanOutput --stats_index_path /tmp'
```

File "/workspaces/QPP-EnhancedEval/code/qpptk/qpptk/global_manager.py", line 33, in run_pre_prediction_process
max_idf = process.calc_max_idf()
File "/workspaces/QPP-EnhancedEval/code/qpptk/qpptk/pre_retrieval_predictors.py", line 30, in calc_max_idf
return np.log(np.array(self.total_docs) / self.terms_df).max()
File "/usr/local/lib/python3.10/dist-packages/numpy/core/_methods.py", line 40, in _amax
return umr_maximum(a, axis, None, out, keepdims, initial, where)
ValueError: zero-size array to reduction operation maximum which has no identity
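
The `ValueError` comes from taking a maximum over a zero-size array. A minimal defensive sketch (a hypothetical helper, not part of this commit, mirroring the expression in `pre_retrieval_predictors.py`):

```python
import numpy as np

def calc_max_idf_safe(total_docs, terms_df):
    """Hypothetical guard around the max-IDF computation from
    pre_retrieval_predictors.py: returns 0.0 instead of raising
    when no term statistics are available for the query."""
    terms_df = np.asarray(terms_df, dtype=float)
    if terms_df.size == 0:
        # np.max over a zero-size array has no identity -> ValueError
        return 0.0
    return float(np.log(np.asarray(total_docs, dtype=float) / terms_df).max())
```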


## Build the Docker Images

Build the Docker image via:
```
docker build -f docker/Dockerfile -t mam10eks/qpptk:0.0.1 .
```

If you update any dependencies, please rebuild the dev container via:
```
docker build -f docker/Dockerfile.dev -t mam10eks/qpptk:0.0.1-dev .
```

## Upload to TIRA

```
docker tag mam10eks/qpptk:0.0.1 registry.webis.de/code-research/tira/tira-user-qpptk/qpptk:0.0.1
docker push registry.webis.de/code-research/tira/tira-user-qpptk/qpptk:0.0.1
```

2,844 changes: 2,844 additions & 0 deletions code/maik-idea-in-progress.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions code/qpptk/.vscode/settings.json
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
21 changes: 11 additions & 10 deletions code/qpptk/qpptk/__init__.py
@@ -1,19 +1,20 @@
from .utility_functions import *
from .config import *
from .load_text_index import IndexText, parse_posting_list
from .load_ciff_index import IndexCiff, parse_index_file
from .load_db_index import IndexDB
from .parse_queries import QueryParserText, QueryParserCiff
from .load_terrier_index import IndexTerrier
from .parse_queries import QueryParserText, QueryParserCiff, QueryParserJsonl
from .retrieval_local_manager import LocalManagerRetrieval
# from .parse_ciff_queries import QueryParserCiff, parse_ciff_queries_file
from .pre_retrieval_predictors import LocalManagerPredictorPre
from .post_retrieval_predictors import LocalManagerPredictorPost
from .index_to_db import parse_index_to_db
from .qpptk_main import parse_args, main, get_queries_object
from .score_replacement_prediction import replace_scores_in_run_file_with_reference_scores

__all__ = ['Config', 'Posting', 'TermPosting', 'TermRecord', 'TermFrequency', 'DocRecord', 'ResultPair', 'get_file_len',
           'read_line', 'parse_posting_list', 'binary_search', 'IndexText', 'IndexCiff', 'parse_index_file', 'IndexDB',
           'QueryParserText', 'QueryParserCiff', 'LocalManagerRetrieval', 'LocalManagerPredictorPre', 'ensure_dir',
           'ensure_file', 'LocalManagerPredictorPost', 'read_message', 'plot_roc',
           'transform_list_to_counts_dict', 'jaccard_similarity', 'overlap_coefficient',
           'set_index_dump_paths', 'add_topic_to_qdf', 'msgpack_encode', 'msgpack_decode',
           'parse_index_to_db', 'read_trec_res_file']
           'read_line', 'parse_posting_list', 'binary_search', 'IndexText', 'IndexDB', 'IndexTerrier',
           'QueryParserText', 'QueryParserCiff', 'QueryParserJsonl', 'LocalManagerRetrieval',
           'LocalManagerPredictorPre', 'ensure_dir', 'ensure_file', 'LocalManagerPredictorPost', 'read_message',
           'plot_roc', 'transform_list_to_counts_dict', 'jaccard_similarity', 'overlap_coefficient',
           'sorensen_dice_similarity', 'calc_ndcg', 'set_index_dump_paths', 'add_topic_to_qdf', 'msgpack_encode',
           'msgpack_decode', 'read_trec_res_file', 'parse_args', 'main', 'get_queries_object',
           'replace_scores_in_run_file_with_reference_scores']
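
With `parse_args`, `main`, and `get_queries_object` now exported, the pipeline can also be driven from Python. A minimal sketch, assuming `parse_args` follows the usual argparse convention of accepting an argv-style list and that `main` takes the parsed arguments (the flags mirror the `tira-run` command in the README; all paths are illustrative):

```python
from qpptk import parse_args, main

# Hypothetical programmatic invocation of the qpptk_main entry point.
args = parse_args(['-ti', '/path/to/terrier-index/',
                   '--jsonl_queries', '/path/to/queries.jsonl',
                   '--predict', '--retrieve',
                   '--output', '/tmp/qpptk-output',
                   '--cleanOutput', '--stats_index_path', '/tmp'])
main(args)
```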
87 changes: 73 additions & 14 deletions code/qpptk/qpptk/config.py
@@ -6,7 +6,19 @@

from qpptk import ensure_dir, ensure_file

CONFIG_FILE = './qpptk/config.toml'
# def __init_logger(self, logger):
#     if logger:
#         return logger
#     logger = logging.getLogger(__name__)
#     logger.setLevel(logging.INFO)
#     if not logger.hasHandlers():
#         formatter = logging.Formatter('{asctime} - {message}', datefmt="%H:%M:%S", style="{")
#         handler = logging.StreamHandler()
#         handler.setFormatter(formatter)
#         logger.addHandler(handler)
#     return logger

CONFIG_FILE = os.path.dirname(os.path.realpath(__file__)) + '/config.toml'


def set_index_dump_paths(index_dir):
@@ -31,25 +43,37 @@ class Config:
    WORKING_SET_SIZE = parameters.get('working_set_size', None)
    FB_TERMS = parameters.get('fb_terms')
    NUM_DOCS = parameters.get('max_result_size')
    logging_level = parameters.get('logging_level', 'DEBUG')

    N_PROC = parameters.get('num_processes', 1)
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%d/%m/%Y %H:%M:%S',
                        level=logging_level)
    logger = logging.getLogger(__name__)

    prediction_parameters = parameters.get('prediction')
    WIG_LIST_SIZE = prediction_parameters.get('wig_list_size')
    NQC_LIST_SIZE = prediction_parameters.get('nqc_list_size')
    SMV_LIST_SIZE = prediction_parameters.get('smv_list_size')

    CLARITY_FB_TERMS = prediction_parameters.get('clarity_fb_terms')
    CLARITY_LIST_SIZE = prediction_parameters.get('clarity_list_size')
    uef_parameters = prediction_parameters.get('uef')
    UEF_RM_FB_PARAM = uef_parameters.get('rm_fb_size')
    UEF_SIM_PARAM = uef_parameters.get('re_rank_list_size')

    QF_FB_TERMS = prediction_parameters.get('qf_fb_terms')
    QF_LIST_SIZE = prediction_parameters.get('qf_list_size')
    QF_OVERLAP_SIZE = prediction_parameters.get('qf_overlap_size')

    UEF_FB_TERMS = prediction_parameters.get('uef_fb_terms')
    UEF_LIST_SIZE = prediction_parameters.get('uef_list_size')
    UEF_RANKING_SIZE = prediction_parameters.get('uef_ranking_size')

    logging_level = parameters.get('logging_level', 'DEBUG')
    # logging_level = logging.DEBUG

    # uef_parameters = prediction_parameters.get('uef')
    # UEF_RM_FB_PARAM = uef_parameters.get('rm_fb_size')
    # UEF_SIM_PARAM = uef_parameters.get('re_rank_list_size')

    env = config.get('environment')

    executables = env.get('executables')
    TREC_EVAL = executables.get('trec_eval')
    RBP_EVAL = executables.get('rbp_eval')

    env_paths = env.get('paths')
    _root_dir = env_paths.get('root_dir')
@@ -58,7 +82,24 @@ class Config:
    _root_dir = ensure_dir(_root_dir, False)
    INDEX_DIR = env_paths.get('text_index_dir')
    CIFF_INDEX = env_paths.get('ciff_index_file')
    assert not (INDEX_DIR and CIFF_INDEX), f"Only one type of Index can be specified in the configurations file"
    TERRIER_INDEX = env_paths.get('terrier_index_dir')

    BATCH_NAME = env_paths.get('batch_name', '')

    assert sum((bool(TERRIER_INDEX), bool(CIFF_INDEX), bool(INDEX_DIR))) <= 1, \
        f"Only one type of Index can be specified in the configurations file"

    RESULTS_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('results_dir')), True)

    log_file = env_paths.get('log_file')
    if log_file:
        log_file = os.path.join(RESULTS_DIR, log_file)
    logging.basicConfig(filename=log_file, format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%d/%m/%Y %H:%M:%S',
                        level=logging_level)
    logger = logging.getLogger(__name__)

    DB_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('db_dir')), True)

    if INDEX_DIR:
        try:
@@ -76,11 +117,25 @@
            logger.warning(err)
            logger.warning(f"The setting 'ciff_index_file={CIFF_INDEX}' in the config file was skipped")
            CIFF_INDEX = None
    elif TERRIER_INDEX:
        try:
            # Index dump paths
            TERRIER_INDEX = ensure_dir(os.path.join(_root_dir, TERRIER_INDEX), create_if_not=False)
            ensure_file(os.path.join(TERRIER_INDEX, 'data.properties'))
        except FileNotFoundError as err:
            logger.warning(err)
            logger.warning(f"The setting 'terrier_index_dir={TERRIER_INDEX}' "
                           f"in the config file was skipped, the data.properties file is missing")
            TERRIER_INDEX = None

    TEXT_QUERIES = env_paths.get('text_queries_file')
    CIFF_QUERIES = env_paths.get('ciff_queries_file')
    assert not (TEXT_QUERIES and CIFF_QUERIES), f"Only a single type of queries file can be specified" \
                                                f" in the configurations file"
    JSONL_QUERIES = env_paths.get('jsonl_queries_file')
    QREL_FILE = os.path.join(_root_dir, env_paths.get('qrel_file'))

    assert sum((bool(TEXT_QUERIES), bool(CIFF_QUERIES), bool(JSONL_QUERIES))) == 1, \
        f"Only one type of queries file can be specified in the configurations file"

    if TEXT_QUERIES:
        try:
            TEXT_QUERIES = ensure_file(os.path.join(_root_dir, TEXT_QUERIES))
@@ -95,9 +150,13 @@
            logger.warning(err)
            logger.warning(f"The setting 'ciff_queries_file={CIFF_QUERIES}' in the config file was skipped")
            CIFF_QUERIES = None

    RESULTS_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('results_dir')), True)
    DB_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('db_dir')), True)
    elif JSONL_QUERIES:
        try:
            JSONL_QUERIES = ensure_file(os.path.join(_root_dir, JSONL_QUERIES))
        except FileNotFoundError as err:
            logger.warning(err)
            logger.warning(f"The setting 'jsonl_queries_file={JSONL_QUERIES}' in the config file was skipped")
            JSONL_QUERIES = None

    @staticmethod
    def get_logger():
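
`config.py` leans on `ensure_dir` and `ensure_file` from `utility_functions`. Their contract, as observed at the call sites above (raise `FileNotFoundError` for missing files, optionally create directories), could look roughly like this sketch — a reconstruction inferred from usage, not the actual implementation:

```python
import os

def ensure_file(path):
    # Used for data.properties and the query files; Config catches the
    # FileNotFoundError and skips the corresponding setting.
    path = os.path.abspath(os.path.expanduser(path))
    if not os.path.isfile(path):
        raise FileNotFoundError(f"The file {path} doesn't exist")
    return path

def ensure_dir(path, create_if_not=True):
    # Used with create_if_not=True for RESULTS_DIR and DB_DIR, and with
    # create_if_not=False for the index directories.
    path = os.path.abspath(os.path.expanduser(path))
    if not os.path.isdir(path):
        if create_if_not:
            os.makedirs(path, exist_ok=True)
        else:
            raise FileNotFoundError(f"The directory {path} doesn't exist")
    return path
```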
97 changes: 63 additions & 34 deletions code/qpptk/qpptk/config.toml
@@ -3,44 +3,73 @@ title = "Global configurations"
[environment]
[environment.paths]
# The root directory which will be used as the base path for all the environment paths
root_dir = "./"
# text_index_dir = "ROBUST04/dump/"
ciff_index_file = "ciff_indexes/robust04/robust04_Lucene_indri_porter.ciff"
# text_queries_file = "data/robust04.stemmed.qry"
ciff_queries_file = "ciff_query_indexes/robust04_Lucene_query_indri_porter.ciff"
results_dir = "anova_qpp"
# Directory for the DB files,
root_dir = "~/repos/"

# text_index_dir = "mini_dump/"
# ciff_index_file = "ciff_indexes/robust04/robust04_Lucene_indri_nostem.ciff"
terrier_index_dir = "qpp-Maik/docker/pyterrier-index/index"

# text_queries_file = "data/cw12b.stemmed.stopped.qry"
jsonl_queries_file = "qpp-Maik/docker/sample-input-full-rank/queries.jsonl"
# ciff_queries_file = "ciff_query_indexes/robust04_Lucene_query_indri_nostem.ciff"

# results_dir = "eval_qpp_results"
results_dir = "testing_docker"

log_file = 'qpptk_robust_retr.log'

# Directory for the DB files
db_dir = "qpptk/qpptk_db"
qrel_file = 'qpp-Maik/docker/sample-input-full-rank/sample.qrels'

[environment.executables]
# path to trec_eval executable
trec_eval = '~/trec_eval-9.0.7/trec_eval'
# trec_eval = '/research/remote/petabyte/users/oleg/trec_eval-9.0.7/trec_eval'
trec_eval = '/home/s3806763/Downloads/trec_eval/trec_eval'
# rbp_eval = '/research/remote/petabyte/users/oleg/eval/rbp_eval'


[parameters]
mu = 1000
# Number of docs to use for the RM construction
fb_docs = 100
# The maximum number of documents to use for the re-ranking, comment out to re-rank all docs in initial list
working_set_size = 100
# Number of top terms to use, *after* RM construction
fb_terms = 100
max_result_size = 1000
# predefined logging levels: CRITICAL, ERROR, WARNING, INFO, DEBUG
logging_level = 'DEBUG'
num_processes = 1

[parameters.prediction]
wig_list_size = 10 # Good for Robust04
nqc_list_size = 100 # Good for Robust04
smv_list_size = 100 # Good for Robust04
# Number of top terms from RM to use in Clarity
clarity_fb_terms = 100
# Number of docs to use for the RM construction in Clarity
clarity_list_size = 1000

# Number of top terms from RM to use in QF
qf_fb_terms = 100
# Number of docs to use for the RM construction in QF
qf_list_size = 1000
# Number of docs to use for the overlap calc in QF
qf_overlap_size = 25

# Number of top terms from RM to use in UEF
uef_fb_terms = 100
# Number of docs to use for the RM construction in UEF
uef_list_size = 1000
# Number of docs to use for the re-ranking and comparison in UEF
uef_ranking_size = 100

#[parameters.prediction.uef]

#rm_fb_size = 100
# Number of docs to re-rank with the RM, and calc the similarity
#re_rank_list_size = 150

mu = 1000
# Number of docs to use for the RM construction
fb_docs = 100
# The maximum number of documents to use for the re-ranking, comment out to re-rank all docs in initial list
working_set_size = 100
# Number of top terms to use, *after* RM construction
fb_terms = 100
max_result_size = 1000
# predefined logging levels: CRITICAL, ERROR, WARNING, INFO, DEBUG
logging_level = 'DEBUG'
num_processes = 25

[parameters.prediction]
wig_list_size = 10
nqc_list_size = 100
smv_list_size = 100
# Number of top terms from RM to use in Clarity
clarity_fb_terms = 100
# Number of docs to use for the RM construction in Clarity
clarity_list_size = 1000

[parameters.prediction.uef]

#rm_fb_size = 100
# Number of docs to re-rank with the RM, and calc the similarity
#re_rank_list_size = 150
#[logging]
#output=
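
For reference, `Config` (in `config.py` above) reads this file into nested dictionaries and looks values up with `.get`. A minimal sketch of that pattern, assuming the third-party `toml` package:

```python
import toml

config = toml.load('qpptk/config.toml')          # path relative to the package
parameters = config['parameters']
mu = parameters.get('mu')                        # 1000
env_paths = config['environment']['paths']
root_dir = env_paths.get('root_dir')             # "~/repos/"
prediction = parameters.get('prediction')
wig_list_size = prediction.get('wig_list_size')  # 10
```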
