fixed conversion of Terrier indexes generated with two pass, local termid assignment removed
tonellotto committed May 27, 2020
1 parent 9a736e7 commit cc7a659
Showing 5 changed files with 18 additions and 16 deletions.
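In short: both compressors used to assign each term a fresh sequential `local_termid` while scanning the lexicon, and `CompressorReducer` later shifted those ids when concatenating partitions; after this commit the converted lexicon reuses the term id already stored in the source Terrier lexicon (`le.getTermId()`). The likely motivation, reading the commit title, is that indexes built with Terrier's two-pass indexer do not assign term ids in lexicographic order, so renumbering terms by their position in the sorted lexicon silently changed the id-to-term mapping. A toy sketch of the difference (illustrative names only; the real code uses `LexiconEntry`/`EFLexiconEntry` as shown in the diffs below):

```java
import java.util.Map;
import java.util.TreeMap;

public class TermIdSketch {
    public static void main(String[] args) {
        // A two-pass index may assign ids in first-occurrence order,
        // not in the lexicographic order in which the lexicon is scanned:
        Map<String, Integer> srcTermIds = new TreeMap<>();
        srcTermIds.put("apple", 2);
        srcTermIds.put("banana", 0);
        srcTermIds.put("cherry", 1);

        // Before the fix: sequential renumbering during the sorted scan.
        int localTermId = 0;
        for (String term : srcTermIds.keySet()) {
            System.out.println(term + " -> " + localTermId++); // banana becomes 1, not 0
        }

        // After the fix: the source id is carried over unchanged,
        // so the converted index preserves the original id->term mapping.
        for (Map.Entry<String, Integer> e : srcTermIds.entrySet()) {
            System.out.println(e.getKey() + " -> " + e.getValue());
        }
    }
}
```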
9 changes: 5 additions & 4 deletions src/main/java/it/cnr/isti/hpclab/ef/BasicCompressor.java
@@ -116,13 +116,14 @@ public void compress(final TermPartition terms) throws IOException
         LexiconEntry le = null;
         IterablePosting p = null;
 
-        int local_termid = 0;
+        // int local_termid = 0;
 
         while (!stop(lee, terms.end() - terms.begin())) {
             le = lee.getValue();
             p = src_index.getInvertedIndex().getPostings((BitIndexPointer)lee.getValue());
 
-            los.writeNextEntry(lee.getKey(), new EFLexiconEntry(local_termid, le.getDocumentFrequency(), le.getFrequency(), le.getMaxFrequencyInDocuments(), docidsOffset, freqsOffset));
+            // los.writeNextEntry(lee.getKey(), new EFLexiconEntry(local_termid, le.getDocumentFrequency(), le.getFrequency(), le.getMaxFrequencyInDocuments(), docidsOffset, freqsOffset));
+            los.writeNextEntry(lee.getKey(), new EFLexiconEntry(le.getTermId(), le.getDocumentFrequency(), le.getFrequency(), le.getMaxFrequencyInDocuments(), docidsOffset, freqsOffset));
 
             docidsAccumulator.init( le.getDocumentFrequency(), num_docs, false, true, LOG2QUANTUM );
             freqsAccumulator.init( le.getDocumentFrequency(), le.getFrequency(), true, false, LOG2QUANTUM );
@@ -136,11 +137,11 @@ public void compress(final TermPartition terms) throws IOException
 
             docidsOffset += docidsAccumulator.dump(docids);
             freqsOffset += freqsAccumulator.dump(freqs);
-            local_termid += 1;
+            // local_termid += 1;
             p.close();
 
             lee = lex_iter.hasNext() ? lex_iter.next() : null;
-            super.cnt++;
+            super.written_terms++;
         }
 
         docidsAccumulator.close();
11 changes: 6 additions & 5 deletions src/main/java/it/cnr/isti/hpclab/ef/BlockCompressor.java
@@ -121,7 +121,7 @@ public void compress(final TermPartition terms) throws IOException
         LexiconEntry le = null;
         IterablePosting p = null;
 
-        int local_termid = 0;
+        // int local_termid = 0;
 
         while (!stop(lee, terms.end() - terms.begin())) {
             le = lee.getValue();
@@ -146,7 +146,8 @@ public void compress(final TermPartition terms) throws IOException
             if (occurrency != le.getFrequency())
                 throw new IllegalStateException("Lexicon term occurencies (" + le.getFrequency() + ") different form positions-counted occurrencies (" + occurrency + ")");
 
-            los.writeNextEntry(lee.getKey(), new EFBlockLexiconEntry(local_termid, le.getDocumentFrequency(), le.getFrequency(), le.getMaxFrequencyInDocuments(), docidsOffset, freqsOffset, posOffset));
+            // los.writeNextEntry(lee.getKey(), new EFBlockLexiconEntry(local_termid, le.getDocumentFrequency(), le.getFrequency(), le.getMaxFrequencyInDocuments(), docidsOffset, freqsOffset, posOffset));
+            los.writeNextEntry(lee.getKey(), new EFBlockLexiconEntry(le.getTermId(), le.getDocumentFrequency(), le.getFrequency(), le.getMaxFrequencyInDocuments(), docidsOffset, freqsOffset, posOffset));
             // After computing sumMaxPos, we re-scan the posting list to encode the positions
             posAccumulator.init(le.getFrequency(), le.getDocumentFrequency() + sumMaxPos, true, false, LOG2QUANTUM );
 
@@ -161,7 +162,7 @@ public void compress(final TermPartition terms) throws IOException
             }
             p.close();
 
-            docidsOffset += docidsAccumulator.dump(docids);
+            docidsOffset += docidsAccumulator.dump(docids);
             freqsOffset += freqsAccumulator.dump(freqs);
 
             // Firstly we write decoding limits info
@@ -170,10 +171,10 @@ public void compress(final TermPartition terms) throws IOException
             // Secondly we dump the EF representation of the position encoding
             posOffset += posAccumulator.dump(pos);
 
-            local_termid += 1;
+            // local_termid += 1;
 
             lee = lex_iter.hasNext() ? lex_iter.next() : null;
-            super.cnt++;
+            super.written_terms++;
         }
 
         docidsAccumulator.close();
4 changes: 2 additions & 2 deletions src/main/java/it/cnr/isti/hpclab/ef/Compressor.java
@@ -7,7 +7,7 @@
 
 public abstract class Compressor
 {
-    protected int cnt = 0;
+    protected int written_terms = 0;
 
     abstract void compress(final TermPartition terms) throws IOException;
 
@@ -20,7 +20,7 @@ default boolean stop(final Entry<String, LexiconEntry> lee, final int end)
 
     final boolean stop(final Entry<String, LexiconEntry> lee, final int len)
     {
-        return (lee == null || cnt >= len);
+        return (lee == null || written_terms >= len);
     }
 
 }
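The field rename also makes the shared termination test read naturally: each compressor handles a slice of the lexicon, and the loop stops once `written_terms` reaches the slice length or the lexicon iterator is exhausted. A stripped-down sketch of that loop shape (types simplified; the `begin()`/`end()` semantics are inferred from the calls visible in the two compressors):

```java
import java.util.Iterator;
import java.util.List;

public class StopSketch {
    int written_terms = 0; // was: cnt

    boolean stop(String lee, int len) {
        return lee == null || written_terms >= len;
    }

    void compressSlice(Iterator<String> lex_iter, int sliceLen) {
        String lee = lex_iter.hasNext() ? lex_iter.next() : null;
        while (!stop(lee, sliceLen)) {
            // ... write one converted lexicon entry and its postings ...
            written_terms++;
            lee = lex_iter.hasNext() ? lex_iter.next() : null;
        }
    }

    public static void main(String[] args) {
        StopSketch s = new StopSketch();
        s.compressSlice(List.of("a", "b", "c", "d").iterator(), 2);
        System.out.println(s.written_terms); // 2: only the slice's terms are written
    }
}
```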
6 changes: 3 additions & 3 deletions src/main/java/it/cnr/isti/hpclab/ef/CompressorReducer.java
@@ -81,7 +81,7 @@ public TermPartition apply(TermPartition t1, TermPartition t2)
         FSOMapFileAppendLexiconOutputStream los1 = new FSOMapFileAppendLexiconOutputStream(this.dst_index_path + File.separator + t1.prefix() + ".lexicon" + FSOrderedMapFile.USUAL_EXTENSION,
                 new FixedSizeTextFactory(IndexUtil.DEFAULT_MAX_TERM_LENGTH),
                 (!with_pos) ? new EFLexiconEntry.Factory() : new EFBlockLexiconEntry.Factory());
-        final int num_terms_1 = (int) (Files.size(Paths.get(dst_index_path + File.separator + t1.prefix() + ".lexicon" + FSOrderedMapFile.USUAL_EXTENSION)) / los1.getEntrySize());
+        // final int num_terms_1 = (int) (Files.size(Paths.get(dst_index_path + File.separator + t1.prefix() + ".lexicon" + FSOrderedMapFile.USUAL_EXTENSION)) / los1.getEntrySize());
 
         Iterator<Entry<String, LexiconEntry>> lex_iter = null;
         Entry<String, LexiconEntry> lee = null;
@@ -102,13 +102,13 @@ public TermPartition apply(TermPartition t1, TermPartition t2)
                 le.docidOffset += Byte.SIZE * docid_offset;
                 le.freqOffset += Byte.SIZE * freq_offset;
                 le.posOffset += Byte.SIZE * pos_offset;
-                le.termId += num_terms_1;
+                // le.termId += num_terms_1;
                 los1.writeNextEntry(lee.getKey(), le);
             } else {
                 EFLexiconEntry le = (EFLexiconEntry) lee.getValue();
                 le.docidOffset += Byte.SIZE * docid_offset;
                 le.freqOffset += Byte.SIZE * freq_offset;
-                le.termId += num_terms_1;
+                // le.termId += num_terms_1;
                 los1.writeNextEntry(lee.getKey(), le);
             }
         }
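The reducer change follows directly: with partition-local ids, appending partition t2 behind t1 required shifting every `termId` in t2 by the number of entries already in t1's lexicon (`num_terms_1`, computed as the lexicon file size divided by the entry size) to keep ids unique; with source ids preserved, the ids are already globally consistent, and only the bit offsets of the docid/freq/position streams still need rebasing. A toy illustration of the old versus new merge arithmetic (all numbers invented):

```java
public class MergeSketch {
    public static void main(String[] args) {
        // Old scheme: t1 holds 3 terms with local ids {0,1,2}; t2's local id 1
        // had to become a global id by adding num_terms_1.
        int num_terms_1 = 3;
        int t2LocalId = 1;
        System.out.println("old merged termId = " + (t2LocalId + num_terms_1)); // 4

        // New scheme: termId comes from the source lexicon and is left alone.
        // Only stream offsets are rebased, exactly as in the surviving lines:
        long docid_offset = 128;  // bytes of docid data already written for t1
        long leDocidOffset = 96;  // entry's bit offset within t2's stream
        System.out.println("rebased docidOffset = " + (leDocidOffset + Byte.SIZE * docid_offset));
    }
}
```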
4 changes: 2 additions & 2 deletions src/test/java/it/cnr/isti/hpclab/ef/IndexReadingTest.java
@@ -61,8 +61,8 @@ public IndexReadingTest(int parallelism, int skipSize)
     @Parameters
     public static Collection<Object[]> getParameters()
     {
-        return Arrays.asList(new Object[][] { {3, 2} });
-        // return Arrays.asList(new Object[][] { {1,2}, {1,3}, {1,4}, {2,2}, {2,3}, {2,4}, {3,2}, {3,3}, {3,4}});
+        // return Arrays.asList(new Object[][] { {3, 2} });
+        return Arrays.asList(new Object[][] { {1,2}, {1,3}, {1,4}, {2,2}, {2,3}, {2,4}, {3,2}, {3,3}, {3,4}});
     }
 
     @Before
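The test change re-enables the full (parallelism, skipSize) grid in place of the single {3, 2} pair that had been left active, presumably during debugging. With JUnit 4's Parameterized runner, each Object[] returned by the @Parameters method is passed to the test-class constructor, so the nine pairs run every @Test method nine times. A minimal standalone version of that mechanism (hypothetical class name; the real test carries index-reading state):

```java
import java.util.Arrays;
import java.util.Collection;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

@RunWith(Parameterized.class)
public class GridSketchTest {
    private final int parallelism;
    private final int skipSize;

    public GridSketchTest(int parallelism, int skipSize) {
        this.parallelism = parallelism;
        this.skipSize = skipSize;
    }

    @Parameters
    public static Collection<Object[]> getParameters() {
        // Same grid as the re-enabled line in IndexReadingTest.
        return Arrays.asList(new Object[][] { {1,2}, {1,3}, {1,4}, {2,2}, {2,3}, {2,4}, {3,2}, {3,3}, {3,4} });
    }

    @Test
    public void runsOncePerPair() {
        System.out.println("parallelism=" + parallelism + ", skipSize=" + skipSize);
    }
}
```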
