
Commit

Merge pull request #1 from mam10eks/qpptk-dev
Qpptk dev
Zendelo authored Feb 21, 2024
2 parents e35aaca + 88b7ef7 commit a5900e8
Showing 89 changed files with 7,617 additions and 342 deletions.
8 changes: 8 additions & 0 deletions .devcontainer.json
@@ -0,0 +1,8 @@
{
"image": "mam10eks/qpptk:0.0.2-dev",
"customizations": {
"vscode": {
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "ms-toolsai.jupyter"]
}
}
}
25 changes: 25 additions & 0 deletions .github/workflows/run-all-tests.yml
@@ -0,0 +1,25 @@
name: Unit Tests

on: [push]

jobs:
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Run tests
        working-directory: ./code/qpptk
        run: |
          mkdir ~/repos/
          pip3 install -r requirements.txt
          pytest
14 changes: 14 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,14 @@
{
"python.testing.unittestArgs": [
"-v",
"-s",
"./code",
"-p",
"*test*.py"
],
"python.testing.pytestEnabled": true,
"python.testing.unittestEnabled": false,
"python.testing.pytestArgs": [
"code"
]
}
41 changes: 41 additions & 0 deletions README.md
@@ -1,3 +1,44 @@
# QPP-EnhancedEval
Code to reproduce the ECIR 2021 paper "An Enhanced Evaluation Framework for Query Performance Prediction".


## Run it locally with TIRA

Please ensure that you have Python >= 3.7, Docker, and tira-run installed (`pip3 install tira`).

```
tira-run \
--input-dataset workshop-on-open-web-search/query-processing-20231027-training \
--input-run 'workshop-on-open-web-search/tira-ir-starter/Index (tira-ir-starter-pyterrier)' \
--image mam10eks/qpptk:0.0.1 \
--command 'python3 /qpptk_main.py -ti $inputRun/index/ --jsonl_queries $inputDataset/queries.jsonl --predict --retrieve --output $outputDir --cleanOutput --stats_index_path /tmp'
```

File "/workspaces/QPP-EnhancedEval/code/qpptk/qpptk/global_manager.py", line 33, in run_pre_prediction_process
max_idf = process.calc_max_idf()
File "/workspaces/QPP-EnhancedEval/code/qpptk/qpptk/pre_retrieval_predictors.py", line 30, in calc_max_idf
return np.log(np.array(self.total_docs) / self.terms_df).max()
File "/usr/local/lib/python3.10/dist-packages/numpy/core/_methods.py", line 40, in _amax
return umr_maximum(a, axis, None, out, keepdims, initial, where)
ValueError: zero-size array to reduction operation maximum which has no identity
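
The `ValueError` comes from taking a maximum over a zero-size array. A minimal defensive sketch (a hypothetical helper, not part of this commit, mirroring the expression in `pre_retrieval_predictors.py`):

```python
import numpy as np

def calc_max_idf_safe(total_docs, terms_df):
    """Hypothetical guard around the max-IDF computation from
    pre_retrieval_predictors.py: returns 0.0 instead of raising
    when no term statistics are available for the query."""
    terms_df = np.asarray(terms_df, dtype=float)
    if terms_df.size == 0:
        # np.max over a zero-size array has no identity -> ValueError
        return 0.0
    return float(np.log(np.asarray(total_docs, dtype=float) / terms_df).max())
```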


## Build the Docker Images

Build the Docker image via:
```
docker build -f docker/Dockerfile -t mam10eks/qpptk:0.0.1 .
```

If you update any dependencies, please rebuild the dev container via:
```
docker build -f docker/Dockerfile.dev -t mam10eks/qpptk:0.0.1-dev .
```

## Upload to TIRA

```
docker tag mam10eks/qpptk:0.0.1 registry.webis.de/code-research/tira/tira-user-qpptk/qpptk:0.0.1
docker push registry.webis.de/code-research/tira/tira-user-qpptk/qpptk:0.0.1
```

2,844 changes: 2,844 additions & 0 deletions code/maik-idea-in-progress.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions code/qpptk/.vscode/settings.json
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
21 changes: 11 additions & 10 deletions code/qpptk/qpptk/__init__.py
@@ -1,19 +1,20 @@
from .utility_functions import *
from .config import *
from .load_text_index import IndexText, parse_posting_list
from .load_ciff_index import IndexCiff, parse_index_file
from .load_db_index import IndexDB
from .parse_queries import QueryParserText, QueryParserCiff
from .load_terrier_index import IndexTerrier
from .parse_queries import QueryParserText, QueryParserCiff, QueryParserJsonl
from .retrieval_local_manager import LocalManagerRetrieval
# from .parse_ciff_queries import QueryParserCiff, parse_ciff_queries_file
from .pre_retrieval_predictors import LocalManagerPredictorPre
from .post_retrieval_predictors import LocalManagerPredictorPost
from .index_to_db import parse_index_to_db
from .qpptk_main import parse_args, main, get_queries_object
from .score_replacement_prediction import replace_scores_in_run_file_with_reference_scores

__all__ = ['Config', 'Posting', 'TermPosting', 'TermRecord', 'TermFrequency', 'DocRecord', 'ResultPair', 'get_file_len',
           'read_line', 'parse_posting_list', 'binary_search', 'IndexText', 'IndexCiff', 'parse_index_file', 'IndexDB',
           'QueryParserText', 'QueryParserCiff', 'LocalManagerRetrieval', 'LocalManagerPredictorPre', 'ensure_dir',
           'ensure_file', 'LocalManagerPredictorPost', 'read_message', 'plot_roc',
           'transform_list_to_counts_dict', 'jaccard_similarity', 'overlap_coefficient',
           'set_index_dump_paths', 'add_topic_to_qdf', 'msgpack_encode', 'msgpack_decode',
           'parse_index_to_db', 'read_trec_res_file']
           'read_line', 'parse_posting_list', 'binary_search', 'IndexText', 'IndexDB', 'IndexTerrier',
           'QueryParserText', 'QueryParserCiff', 'QueryParserJsonl', 'LocalManagerRetrieval',
           'LocalManagerPredictorPre', 'ensure_dir', 'ensure_file', 'LocalManagerPredictorPost', 'read_message',
           'plot_roc', 'transform_list_to_counts_dict', 'jaccard_similarity', 'overlap_coefficient',
           'sorensen_dice_similarity', 'calc_ndcg', 'set_index_dump_paths', 'add_topic_to_qdf', 'msgpack_encode',
           'msgpack_decode', 'read_trec_res_file', 'parse_args', 'main', 'get_queries_object',
           'replace_scores_in_run_file_with_reference_scores']
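
With `parse_args`, `main`, and `get_queries_object` now exported, the pipeline can also be driven from Python. A minimal sketch, assuming `parse_args` follows the usual argparse convention of accepting an argv-style list and that `main` takes the parsed arguments (the flags mirror the `tira-run` command in the README; all paths are illustrative):

```python
from qpptk import parse_args, main

# Hypothetical programmatic invocation of the qpptk_main entry point.
args = parse_args(['-ti', '/path/to/terrier-index/',
                   '--jsonl_queries', '/path/to/queries.jsonl',
                   '--predict', '--retrieve',
                   '--output', '/tmp/qpptk-output',
                   '--cleanOutput', '--stats_index_path', '/tmp'])
main(args)
```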
87 changes: 73 additions & 14 deletions code/qpptk/qpptk/config.py
@@ -6,7 +6,19 @@

from qpptk import ensure_dir, ensure_file

CONFIG_FILE = './qpptk/config.toml'
# def __init_logger(self, logger):
#     if logger:
#         return logger
#     logger = logging.getLogger(__name__)
#     logger.setLevel(logging.INFO)
#     if not logger.hasHandlers():
#         formatter = logging.Formatter('{asctime} - {message}', datefmt="%H:%M:%S", style="{")
#         handler = logging.StreamHandler()
#         handler.setFormatter(formatter)
#         logger.addHandler(handler)
#     return logger

CONFIG_FILE = os.path.dirname(os.path.realpath(__file__)) + '/config.toml'


def set_index_dump_paths(index_dir):
@@ -31,25 +43,37 @@ class Config:
    WORKING_SET_SIZE = parameters.get('working_set_size', None)
    FB_TERMS = parameters.get('fb_terms')
    NUM_DOCS = parameters.get('max_result_size')
    logging_level = parameters.get('logging_level', 'DEBUG')

    N_PROC = parameters.get('num_processes', 1)
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%d/%m/%Y %H:%M:%S',
                        level=logging_level)
    logger = logging.getLogger(__name__)

    prediction_parameters = parameters.get('prediction')
    WIG_LIST_SIZE = prediction_parameters.get('wig_list_size')
    NQC_LIST_SIZE = prediction_parameters.get('nqc_list_size')
    SMV_LIST_SIZE = prediction_parameters.get('smv_list_size')

    CLARITY_FB_TERMS = prediction_parameters.get('clarity_fb_terms')
    CLARITY_LIST_SIZE = prediction_parameters.get('clarity_list_size')
    uef_parameters = prediction_parameters.get('uef')
    UEF_RM_FB_PARAM = uef_parameters.get('rm_fb_size')
    UEF_SIM_PARAM = uef_parameters.get('re_rank_list_size')

    QF_FB_TERMS = prediction_parameters.get('qf_fb_terms')
    QF_LIST_SIZE = prediction_parameters.get('qf_list_size')
    QF_OVERLAP_SIZE = prediction_parameters.get('qf_overlap_size')

    UEF_FB_TERMS = prediction_parameters.get('uef_fb_terms')
    UEF_LIST_SIZE = prediction_parameters.get('uef_list_size')
    UEF_RANKING_SIZE = prediction_parameters.get('uef_ranking_size')

    logging_level = parameters.get('logging_level', 'DEBUG')
    # logging_level = logging.DEBUG

    # uef_parameters = prediction_parameters.get('uef')
    # UEF_RM_FB_PARAM = uef_parameters.get('rm_fb_size')
    # UEF_SIM_PARAM = uef_parameters.get('re_rank_list_size')

    env = config.get('environment')

    executables = env.get('executables')
    TREC_EVAL = executables.get('trec_eval')
    RBP_EVAL = executables.get('rbp_eval')

    env_paths = env.get('paths')
    _root_dir = env_paths.get('root_dir')
@@ -58,7 +82,24 @@ class Config:
    _root_dir = ensure_dir(_root_dir, False)
    INDEX_DIR = env_paths.get('text_index_dir')
    CIFF_INDEX = env_paths.get('ciff_index_file')
    assert not (INDEX_DIR and CIFF_INDEX), f"Only one type of Index can be specified in the configurations file"
    TERRIER_INDEX = env_paths.get('terrier_index_dir')

    BATCH_NAME = env_paths.get('batch_name', '')

    assert sum((bool(TERRIER_INDEX), bool(CIFF_INDEX), bool(INDEX_DIR))) <= 1, \
        f"Only one type of Index can be specified in the configurations file"

    RESULTS_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('results_dir')), True)

    log_file = env_paths.get('log_file')
    if log_file:
        log_file = os.path.join(RESULTS_DIR, log_file)
    logging.basicConfig(filename=log_file, format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%d/%m/%Y %H:%M:%S',
                        level=logging_level)
    logger = logging.getLogger(__name__)

    DB_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('db_dir')), True)

    if INDEX_DIR:
        try:
@@ -76,11 +117,25 @@
            logger.warning(err)
            logger.warning(f"The setting 'ciff_index_file={CIFF_INDEX}' in the config file was skipped")
            CIFF_INDEX = None
    elif TERRIER_INDEX:
        try:
            # Index dump paths
            TERRIER_INDEX = ensure_dir(os.path.join(_root_dir, TERRIER_INDEX), create_if_not=False)
            ensure_file(os.path.join(TERRIER_INDEX, 'data.properties'))
        except FileNotFoundError as err:
            logger.warning(err)
            logger.warning(f"The setting 'terrier_index_dir={TERRIER_INDEX}' "
                           f"in the config file was skipped, the data.properties file is missing")
            TERRIER_INDEX = None

    TEXT_QUERIES = env_paths.get('text_queries_file')
    CIFF_QUERIES = env_paths.get('ciff_queries_file')
    assert not (TEXT_QUERIES and CIFF_QUERIES), f"Only a single type of queries file can be specified" \
                                                f" in the configurations file"
    JSONL_QUERIES = env_paths.get('jsonl_queries_file')
    QREL_FILE = os.path.join(_root_dir, env_paths.get('qrel_file'))

    assert sum((bool(TEXT_QUERIES), bool(CIFF_QUERIES), bool(JSONL_QUERIES))) == 1, \
        f"Only one type of queries file can be specified in the configurations file"

    if TEXT_QUERIES:
        try:
            TEXT_QUERIES = ensure_file(os.path.join(_root_dir, TEXT_QUERIES))
@@ -95,9 +150,13 @@
            logger.warning(err)
            logger.warning(f"The setting 'ciff_queries_file={CIFF_QUERIES}' in the config file was skipped")
            CIFF_QUERIES = None

    RESULTS_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('results_dir')), True)
    DB_DIR = ensure_dir(os.path.join(_root_dir, env_paths.get('db_dir')), True)
    elif JSONL_QUERIES:
        try:
            JSONL_QUERIES = ensure_file(os.path.join(_root_dir, JSONL_QUERIES))
        except FileNotFoundError as err:
            logger.warning(err)
            logger.warning(f"The setting 'jsonl_queries_file={JSONL_QUERIES}' in the config file was skipped")
            JSONL_QUERIES = None

    @staticmethod
    def get_logger():
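
`config.py` leans on `ensure_dir` and `ensure_file` from `utility_functions`. Their contract, as observed at the call sites above (raise `FileNotFoundError` for missing files, optionally create directories), could look roughly like this sketch — a reconstruction inferred from usage, not the actual implementation:

```python
import os

def ensure_file(path):
    # Used for data.properties and the query files; Config catches the
    # FileNotFoundError and skips the corresponding setting.
    path = os.path.abspath(os.path.expanduser(path))
    if not os.path.isfile(path):
        raise FileNotFoundError(f"The file {path} doesn't exist")
    return path

def ensure_dir(path, create_if_not=True):
    # Used with create_if_not=True for RESULTS_DIR and DB_DIR, and with
    # create_if_not=False for the index directories.
    path = os.path.abspath(os.path.expanduser(path))
    if not os.path.isdir(path):
        if create_if_not:
            os.makedirs(path, exist_ok=True)
        else:
            raise FileNotFoundError(f"The directory {path} doesn't exist")
    return path
```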
97 changes: 63 additions & 34 deletions code/qpptk/qpptk/config.toml
@@ -3,44 +3,73 @@ title = "Global configurations"
[environment]
[environment.paths]
# The root directory which will be used as the base path for all the environment paths
root_dir = "./"
# text_index_dir = "ROBUST04/dump/"
ciff_index_file = "ciff_indexes/robust04/robust04_Lucene_indri_porter.ciff"
# text_queries_file = "data/robust04.stemmed.qry"
ciff_queries_file = "ciff_query_indexes/robust04_Lucene_query_indri_porter.ciff"
results_dir = "anova_qpp"
# Directory for the DB files,
root_dir = "~/repos/"

# text_index_dir = "mini_dump/"
# ciff_index_file = "ciff_indexes/robust04/robust04_Lucene_indri_nostem.ciff"
terrier_index_dir = "qpp-Maik/docker/pyterrier-index/index"

# text_queries_file = "data/cw12b.stemmed.stopped.qry"
jsonl_queries_file = "qpp-Maik/docker/sample-input-full-rank/queries.jsonl"
# ciff_queries_file = "ciff_query_indexes/robust04_Lucene_query_indri_nostem.ciff"

# results_dir = "eval_qpp_results"
results_dir = "testing_docker"

log_file = 'qpptk_robust_retr.log'

# Directory for the DB files
db_dir = "qpptk/qpptk_db"
qrel_file = 'qpp-Maik/docker/sample-input-full-rank/sample.qrels'

[environment.executables]
# path to trec_eval executable
trec_eval = '~/trec_eval-9.0.7/trec_eval'
# trec_eval = '/research/remote/petabyte/users/oleg/trec_eval-9.0.7/trec_eval'
trec_eval = '/home/s3806763/Downloads/trec_eval/trec_eval'
# rbp_eval = '/research/remote/petabyte/users/oleg/eval/rbp_eval'


[parameters]
mu = 1000
# Number of docs to use for the RM construction
fb_docs = 100
# The maximum number of documents to use for the re-ranking, comment out to re-rank all docs in initial list
working_set_size = 100
# Number of top terms to use, *after* RM construction
fb_terms = 100
max_result_size = 1000
# predefined logging levels: CRITICAL, ERROR, WARNING, INFO, DEBUG
logging_level = 'DEBUG'
num_processes = 1

[parameters.prediction]
wig_list_size = 10 # Good for Robust04
nqc_list_size = 100 # Good for Robust04
smv_list_size = 100 # Good for Robust04
# Number of top terms from RM to use in Clarity
clarity_fb_terms = 100
# Number of docs to use for the RM construction in Clarity
clarity_list_size = 1000

# Number of top terms from RM to use in QF
qf_fb_terms = 100
# Number of docs to use for the RM construction in QF
qf_list_size = 1000
# Number of docs to use for the overlap calc in QF
qf_overlap_size = 25

# Number of top terms from RM to use in UEF
uef_fb_terms = 100
# Number of docs to use for the RM construction in UEF
uef_list_size = 1000
# Number of docs to use for the re-ranking and comparison in UEF
uef_ranking_size = 100

#[parameters.prediction.uef]

#rm_fb_size = 100
# Number of docs to re-rank with the RM, and calc the similarity
#re_rank_list_size = 150

mu = 1000
# Number of docs to use for the RM construction
fb_docs = 100
# The maximum number of documents to use for the re-ranking, comment out to re-rank all docs in initial list
working_set_size = 100
# Number of top terms to use, *after* RM construction
fb_terms = 100
max_result_size = 1000
# predefined logging levels: CRITICAL, ERROR, WARNING, INFO, DEBUG
logging_level = 'DEBUG'
num_processes = 25

[parameters.prediction]
wig_list_size = 10
nqc_list_size = 100
smv_list_size = 100
# Number of top terms from RM to use in Clarity
clarity_fb_terms = 100
# Number of docs to use for the RM construction in Clarity
clarity_list_size = 1000

[parameters.prediction.uef]

#rm_fb_size = 100
# Number of docs to re-rank with the RM, and calc the similarity
#re_rank_list_size = 150
#[logging]
#output=
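
For reference, `Config` (in `config.py` above) reads this file into nested dictionaries and looks values up with `.get`. A minimal sketch of that pattern, assuming the third-party `toml` package:

```python
import toml

config = toml.load('qpptk/config.toml')          # path relative to the package
parameters = config['parameters']
mu = parameters.get('mu')                        # 1000
env_paths = config['environment']['paths']
root_dir = env_paths.get('root_dir')             # "~/repos/"
prediction = parameters.get('prediction')
wig_list_size = prediction.get('wig_list_size')  # 10
```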
