Skip to content

Commit

Permalink
Add metric evaluation to benchmark Docker
Browse files Browse the repository at this point in the history
The benchmark Docker image can now also evaluate metrics on Robust04.
  • Loading branch information
elshize committed Jan 15, 2024
1 parent ec73c78 commit 8c8fb46
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 47 deletions.
7 changes: 7 additions & 0 deletions test/docker/benchmark/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ ARG USE_SANITIZERS=OFF

ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=America/New_York
ENV COLLECTION_PATH=/opt/disk45
ENV WORKDIR=/opt/workdir

COPY . /pisa
RUN ./pisa/test/docker/install-cmake.sh
Expand All @@ -22,5 +24,10 @@ RUN cmake \
&& cmake --build . --config Debug -- -j 4

COPY ./test/docker/benchmark/run.sh /pisa/build
COPY ./test/docker/benchmark/build.sh /pisa/build
COPY ./test/docker/benchmark/evaluate.sh /pisa/build
COPY ./test/docker/benchmark/bench.sh /pisa/build
COPY ./test/docker/benchmark/setup.sh /pisa/build
COPY ./test/docker/benchmark/expected-eval.txt /pisa/build

CMD ["bash", "/pisa/test/docker/benchmark/run.sh"]
14 changes: 14 additions & 0 deletions test/docker/benchmark/bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
#
# Run the ./bin/queries benchmark over the title queries stored in $WORKDIR.
#
# Required environment:
#   WORKDIR  directory containing inv.block_simdbp, inv.bm25.bmw,
#            fwd.termlex and topics.robust2004.title.
#
# Must be started from the directory that contains ./bin.

set -euo pipefail

# Without this guard an unset WORKDIR would silently turn every path below
# into a root-relative one (e.g. "/inv.block_simdbp").
: "${WORKDIR:?WORKDIR must be set to the benchmark working directory}"

./bin/queries \
  -e block_simdbp \
  -a block_max_wand \
  -i "$WORKDIR/inv.block_simdbp" \
  -w "$WORKDIR/inv.bm25.bmw" \
  -F lowercase -F porter2 \
  --terms "$WORKDIR/fwd.termlex" \
  -k 1000 \
  --scorer bm25 \
  -q "$WORKDIR/topics.robust2004.title"
32 changes: 32 additions & 0 deletions test/docker/benchmark/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
#
# Build the index artifacts used by the benchmark.
#
# Steps, each writing into $WORKDIR:
#   1. parse_collection           -> fwd (plus the lexicons read by later steps)
#   2. invert                     -> inv
#   3. reorder-docids --bp        -> inv.bp, fwd.bp.doclex
#   4. create_wand_data           -> inv.bm25.bmw
#   5. compress_inverted_index    -> inv.block_simdbp
#
# Required environment:
#   COLLECTION_PATH  root of the compressed collection (disk4/ and disk5/).
#   WORKDIR          output directory for all artifacts.
#
# Must be started from the directory that contains ./bin.

# pipefail matters here: without it a gzip/find failure in the first pipeline
# would be masked by parse_collection's exit status.
set -euo pipefail

: "${COLLECTION_PATH:?COLLECTION_PATH must point to the document collection}"
: "${WORKDIR:?WORKDIR must be set to the benchmark working directory}"

# Predicates selecting the compressed document files of the four
# sub-collections; kept in an array so both find calls share one definition.
collection_filter=(
  -type f -name '*.*z'
  '('
  -path '*/disk4/fr94/[0-9]*/*' -o -path '*/disk4/ft/ft*'
  -o -path '*/disk5/fbis/fb*' -o -path '*/disk5/latimes/la*'
  ')'
)

# Refuse to build an index from nothing: a wrong COLLECTION_PATH would
# otherwise only surface as a confusing failure in a later step.
if [[ -z "$(find "$COLLECTION_PATH" "${collection_filter[@]}" -print -quit)" ]]; then
  printf 'error: no collection files found under %s\n' "$COLLECTION_PATH" >&2
  exit 1
fi

# find hands the file names straight to gzip, so paths containing whitespace
# or glob characters are safe and the argument-length limit cannot be hit
# (find batches the invocations as needed).
find "$COLLECTION_PATH" "${collection_filter[@]}" -exec gzip -dc {} + \
  | ./bin/parse_collection -f trectext -b 10000 -F lowercase -F porter2 --html -o "$WORKDIR/fwd"

./bin/invert \
  -i "$WORKDIR/fwd" \
  -o "$WORKDIR/inv" \
  --batch-size 400000

./bin/reorder-docids \
  --bp \
  -c "$WORKDIR/inv" \
  -o "$WORKDIR/inv.bp" \
  --documents "$WORKDIR/fwd.doclex" \
  --reordered-documents "$WORKDIR/fwd.bp.doclex"

./bin/create_wand_data \
  -c "$WORKDIR/inv.bp" \
  -b 64 \
  -o "$WORKDIR/inv.bm25.bmw" \
  -s bm25

./bin/compress_inverted_index \
  -e block_simdbp \
  -c "$WORKDIR/inv.bp" \
  -o "$WORKDIR/inv.block_simdbp" \
  --check
22 changes: 22 additions & 0 deletions test/docker/benchmark/evaluate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
#
# Check retrieval quality against the recorded baseline.
#
# Runs ./bin/evaluate_queries, writes its output to $WORKDIR/results.txt,
# scores it with trec_eval (map, P.30, ndcg_cut.20) against
# $WORKDIR/qrels.robust2004.txt, and compares the scores with
# expected-eval.txt. Any difference makes diff, and therefore this script,
# exit non-zero.
#
# Required environment:
#   WORKDIR  directory containing the index, lexicons, topics and qrels.
#
# Must be started from the directory that contains ./bin and
# expected-eval.txt; eval.txt is written to that same directory.

set -euo pipefail

# Without this guard an unset WORKDIR would redirect the results to
# "/results.txt" and read the index from "/".
: "${WORKDIR:?WORKDIR must be set to the benchmark working directory}"

# Fail before the expensive query run rather than after it.
if ! command -v trec_eval >/dev/null; then
  printf 'error: trec_eval not found in PATH\n' >&2
  exit 1
fi

./bin/evaluate_queries \
  -e block_simdbp \
  -a block_max_wand \
  -i "$WORKDIR/inv.block_simdbp" \
  -w "$WORKDIR/inv.bm25.bmw" \
  -F lowercase -F porter2 \
  --terms "$WORKDIR/fwd.termlex" \
  --documents "$WORKDIR/fwd.bp.doclex" \
  -k 1000 \
  --scorer bm25 \
  -q "$WORKDIR/topics.robust2004.title" \
  > "$WORKDIR/results.txt"

trec_eval -m map -m P.30 -m ndcg_cut.20 "$WORKDIR/qrels.robust2004.txt" "$WORKDIR/results.txt" > 'eval.txt'

# Show the measured scores in the log even when they match the baseline.
cat 'eval.txt'

diff 'eval.txt' expected-eval.txt
3 changes: 3 additions & 0 deletions test/docker/benchmark/expected-eval.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
map all 0.2543
P_30 all 0.3139
ndcg_cut_20 all 0.4250
52 changes: 5 additions & 47 deletions test/docker/benchmark/run.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,50 +1,8 @@
#!/bin/bash
#!/usr/bin/env bash

set -e

collection_path='/opt/disk45'
workdir='/opt/workdir'

gzip -dc $(find "$collection_path" -type f -name '*.*z' \
\( -path '*/disk4/fr94/[0-9]*/*' -o -path '*/disk4/ft/ft*' \
-o -path '*/disk5/fbis/fb*' -o -path '*/disk5/latimes/la*' \)) \
| ./bin/parse_collection -f trectext -b 10000 -F lowercase -F porter2 --html -o "$workdir/fwd"

./bin/invert \
-i "$workdir/fwd" \
-o "$workdir/inv" \
--batch-size 400000

./bin/reorder-docids \
--bp \
-c "$workdir/inv" \
-o "$workdir/inv.bp" \
--documents "$workdir/fwd.doclex" \
--reordered-documents "$workdir/fwd.bp.doclex"

./bin/create_wand_data \
-c "$workdir/inv.bp" \
-b 64 \
-o "$workdir/inv.bm25.bmw" \
-s bm25

./bin/compress_inverted_index \
-e block_simdbp \
-c "$workdir/inv.bp" \
-o "$workdir/inv.block_simdbp" \
--check

wget http://trec.nist.gov/data/robust/04.testset.gz
gunzip 04.testset.gz
./bin/extract_topics -f trec -i 04.testset -o "$workdir/topics.robust2004"

./bin/queries \
-e block_simdbp \
-a block_max_wand \
-i "$workdir/inv.block_simdbp" \
-w "$workdir/inv.bm25.bmw" \
-F lowercase -F porter2 \
--terms "$workdir/fwd.termlex" \
-k 1000 \
--scorer bm25 \
-q "$workdir/topics.robust2004.title"
# Run the benchmark stages in order. Each script is looked up relative to the
# current working directory, and `set -e` above stops the run at the first
# stage that exits non-zero.
./setup.sh
./build.sh
./evaluate.sh
./bench.sh
16 changes: 16 additions & 0 deletions test/docker/benchmark/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# Fetch the evaluation inputs and install trec_eval.
#
#   * downloads 04.testset.gz and extracts its topics to
#     $WORKDIR/topics.robust2004 (output prefix passed to extract_topics),
#   * downloads qrels.robust2004.txt and copies it into $WORKDIR,
#   * downloads the trec_eval v9.0.8 sources and runs `make install`.
#
# Required environment:
#   WORKDIR  directory that receives the topics and qrels.
#
# Must be started from the directory that contains ./bin; downloads are
# stored in that same directory. Requires network access.

set -euo pipefail

# Check before downloading anything; an unset WORKDIR would otherwise write
# the topics and qrels to "/".
: "${WORKDIR:?WORKDIR must be set to the benchmark working directory}"

# -O pins the output name so a re-run overwrites the previous download
# instead of saving "04.testset.gz.1"; gunzip -f likewise replaces a stale
# 04.testset rather than refusing to overwrite it.
wget -O 04.testset.gz http://trec.nist.gov/data/robust/04.testset.gz
gunzip -f 04.testset.gz
./bin/extract_topics -f trec -i 04.testset -o "$WORKDIR/topics.robust2004"

wget -O qrels.robust2004.txt http://trec.nist.gov/data/robust/qrels.robust2004.txt
cp qrels.robust2004.txt "$WORKDIR/"

wget -O v9.0.8.tar.gz https://github.com/usnistgov/trec_eval/archive/refs/tags/v9.0.8.tar.gz
tar -xzvf v9.0.8.tar.gz
# make -C builds inside the source tree without changing this shell's
# working directory.
make -C trec_eval-9.0.8 install

0 comments on commit 8c8fb46

Please sign in to comment.