Skip to content

Commit

Permalink
finalized DPH_QE
Browse files Browse the repository at this point in the history
  • Loading branch information
FrancisDrake69 committed Jun 27, 2024
1 parent 4b62353 commit 7f7a7d5
Show file tree
Hide file tree
Showing 13 changed files with 204,148 additions and 204,086 deletions.
62 changes: 36 additions & 26 deletions evaluation/evaluation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 1,
"metadata": {},
"outputs": [
{
Expand All @@ -11,28 +11,28 @@
"text": [
"Requirement already satisfied: nbformat in /usr/local/lib/python3.10/dist-packages (5.9.2)\n",
"Requirement already satisfied: ipython in /usr/local/lib/python3.10/dist-packages (8.18.1)\n",
"Requirement already satisfied: jupyter-core in /usr/local/lib/python3.10/dist-packages (from nbformat) (5.5.0)\n",
"Requirement already satisfied: fastjsonschema in /usr/local/lib/python3.10/dist-packages (from nbformat) (2.19.0)\n",
"Requirement already satisfied: jupyter-core in /usr/local/lib/python3.10/dist-packages (from nbformat) (5.5.0)\n",
"Requirement already satisfied: traitlets>=5.1 in /usr/local/lib/python3.10/dist-packages (from nbformat) (5.14.0)\n",
"Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat) (4.20.0)\n",
"Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.10/dist-packages (from ipython) (0.19.1)\n",
"Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.10/dist-packages (from ipython) (4.9.0)\n",
"Requirement already satisfied: pygments>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from ipython) (2.17.2)\n",
"Requirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from ipython) (5.1.1)\n",
"Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython) (0.1.6)\n",
"Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /usr/local/lib/python3.10/dist-packages (from ipython) (3.0.41)\n",
"Requirement already satisfied: pygments>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from ipython) (2.17.2)\n",
"Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.10/dist-packages (from ipython) (0.19.1)\n",
"Requirement already satisfied: stack-data in /usr/local/lib/python3.10/dist-packages (from ipython) (0.6.3)\n",
"Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /usr/local/lib/python3.10/dist-packages (from ipython) (3.0.41)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from ipython) (1.2.0)\n",
"Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.10/dist-packages (from ipython) (4.9.0)\n",
"Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython) (0.1.6)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.3 in /usr/local/lib/python3.10/dist-packages (from jedi>=0.16->ipython) (0.8.3)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat) (2023.11.2)\n",
"Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat) (23.1.0)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat) (0.13.2)\n",
"Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat) (23.1.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat) (2023.11.2)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat) (0.31.1)\n",
"Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.10/dist-packages (from pexpect>4.3->ipython) (0.7.0)\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython) (0.2.12)\n",
"Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core->nbformat) (4.0.0)\n",
"Requirement already satisfied: executing>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from stack-data->ipython) (2.0.1)\n",
"Requirement already satisfied: pure-eval in /usr/local/lib/python3.10/dist-packages (from stack-data->ipython) (0.2.2)\n",
"Requirement already satisfied: executing>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from stack-data->ipython) (2.0.1)\n",
"Requirement already satisfied: asttokens>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from stack-data->ipython) (2.4.1)\n",
"Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from asttokens>=2.1.0->stack-data->ipython) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
Expand All @@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -58,9 +58,19 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n",
"\n",
"No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.\n"
]
}
],
"source": [
"# Create a REST client to the TIRA platform for retrieving the pre-indexed data.\n",
"ensure_pyterrier_is_loaded()\n",
Expand All @@ -69,7 +79,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -80,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -121,9 +131,9 @@
" <tr>\n",
" <th>0</th>\n",
" <td>bestSoFar</td>\n",
" <td>0.419425</td>\n",
" <td>0.628554</td>\n",
" <td>0.834000</td>\n",
" <td>0.430218</td>\n",
" <td>0.633236</td>\n",
" <td>0.849022</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
Expand All @@ -135,9 +145,9 @@
" <tr>\n",
" <th>2</th>\n",
" <td>Another Experiment</td>\n",
" <td>0.419425</td>\n",
" <td>0.628554</td>\n",
" <td>0.834000</td>\n",
" <td>0.451618</td>\n",
" <td>0.657313</td>\n",
" <td>0.849263</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
Expand All @@ -152,13 +162,13 @@
],
"text/plain": [
" name ndcg_cut.10 recip_rank recall_1000\n",
"0 bestSoFar 0.419425 0.628554 0.834000 \n",
"1 BM25 0.374041 0.579877 0.825376 \n",
"2 Another Experiment 0.419425 0.628554 0.834000 \n",
"3 bm25_qe 0.346940 0.549149 0.828398 "
"0 bestSoFar 0.430218 0.633236 0.849022\n",
"1 BM25 0.374041 0.579877 0.825376\n",
"2 Another Experiment 0.451618 0.657313 0.849263\n",
"3 bm25_qe 0.346940 0.549149 0.828398"
]
},
"execution_count": 15,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
88 changes: 57 additions & 31 deletions newRetrievalSystem/DPH_QE.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"--2024-06-25 11:45:40-- https://files.webis.de/software/pyterrier-plugins/custom-terrier-token-processing-1.0-SNAPSHOT-jar-with-dependencies.jar\n",
"--2024-06-27 10:01:47-- https://files.webis.de/software/pyterrier-plugins/custom-terrier-token-processing-1.0-SNAPSHOT-jar-with-dependencies.jar\n",
"Resolving files.webis.de (files.webis.de)... 141.54.132.200\n",
"Connecting to files.webis.de (files.webis.de)|141.54.132.200|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 499865236 (477M) [application/java-archive]\n",
"Saving to: ‘/root/.pyterrier/custom-terrier-token-processing-0.0.1.jar’\n",
"\n",
"/root/.pyterrier/cu 100%[===================>] 476.71M 112MB/s in 4.5s \n",
"/root/.pyterrier/cu 100%[===================>] 476.71M 91.0MB/s in 5.5s \n",
"\n",
"2024-06-25 11:45:45 (105 MB/s) - ‘/root/.pyterrier/custom-terrier-token-processing-0.0.1.jar’ saved [499865236/499865236]\n",
"2024-06-27 10:01:53 (87.3 MB/s) - ‘/root/.pyterrier/custom-terrier-token-processing-0.0.1.jar’ saved [499865236/499865236]\n",
"\n"
]
}
Expand Down Expand Up @@ -59,27 +59,6 @@
" from jnius import autoclass"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"stopwords= [\"I\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\",\n",
" \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \n",
" \"her\", \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \n",
" \"theirs\", \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \n",
" \"these\", \"those\", \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\",\n",
" \"have\", \"has\", \"had\", \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\",\n",
" \"the\", \"and\", \"but\", \"if\", \"or\", \"because\", \"as\", \"until\", \"while\", \"of\",\n",
" \"at\", \"by\", \"for\", \"with\", \"about\", \"against\", \"between\", \"into\", \"through\", \n",
" \"during\", \"before\", \"after\", \"above\", \"below\", \"to\", \"from\", \"up\", \"down\", \"in\",\n",
" \"out\", \"on\", \"off\", \"over\", \"under\", \"again\", \"further\", \"then\", \"once\", \"here\", \n",
" \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\", \"each\", \"few\", \"more\",\n",
" \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \"own\", \"same\", \"so\", \n",
" \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
Expand All @@ -98,36 +77,53 @@
"name": "stdout",
"output_type": "stream",
"text": [
"start\n"
"start\n",
"Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-inputs.zip?download=1\n",
"\tThis is only used for last spot checks before archival to Zenodo.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Download: 100%|██████████| 39.4M/39.4M [00:00<00:00, 44.2MiB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Download finished. Extract...\n",
"Extraction finished: /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 72%|███████▏ | 90856/126958 [00:22<00:05, 6978.82it/s]"
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 72%|███████▏ | 91419/126958 [00:22<00:05, 6834.37it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"11:46:16.506 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (2008.ecir_conference-2008.5) - further warnings are suppressed\n"
"10:02:23.994 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Adding an empty document to the index (2008.ecir_conference-2008.5) - further warnings are suppressed\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:27<00:00, 4622.28it/s] \n",
"100%|██████████| 126958/126958 [00:27<00:00, 4622.53it/s]\n"
"ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:27<00:00, 4630.24it/s] \n",
"100%|██████████| 126958/126958 [00:27<00:00, 4630.50it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"11:46:25.920 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2 empty documents\n",
"10:02:33.109 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 2 empty documents\n",
"end\n"
]
}
Expand All @@ -138,7 +134,7 @@
"tokenizer = pt.TerrierTokeniser.english\n",
"\n",
"\n",
"iter_indexer = pt.IterDictIndexer(\"./passage_index\", overwrite=True, verbose= True, stopwords =stopwords, tokeniser=tokenizer, stemmer =stemmer, meta={'docno': 100, 'text': 4096})\n",
"iter_indexer = pt.IterDictIndexer(\"./passage_index\", overwrite=True, verbose= True, tokeniser=tokenizer, stemmer =stemmer, meta={'docno': 100, 'text': 4096})\n",
"\n",
"print('start')\n",
"indexref = iter_indexer.index(tqdm(pt_dataset.get_corpus_iter()))\n",
Expand All @@ -159,6 +155,36 @@
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/ir-lab-sose2024/ir-acl-anthology-20240504-truth.zip?download=1\n",
"\tThis is only used for last spot checks before archival to Zenodo.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Download: 100%|██████████| 29.6k/29.6k [00:00<00:00, 1.52MiB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Download finished. Extract...\n",
"Extraction finished: /root/.tira/extracted_datasets/ir-lab-sose-2024/ir-acl-anthology-20240504-training/\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down
Loading

0 comments on commit 7f7a7d5

Please sign in to comment.