Refactor the parsing of the text index builder #1695

Merged: 19 commits, merged on Jan 22, 2025. The diff below shows the changes from 6 of the commits.

Commits (19)
b93fde4
Extra classes for Words- and Docsfile parsing
Flixtastic Dec 28, 2024
9c40084
Added method to tokenize and normalize at the same time.
Flixtastic Dec 28, 2024
c365935
Added the tokenization to the ql_utility namespace
Flixtastic Dec 28, 2024
479b763
Revert "Added the tokenization to the ql_utility namespace"
Flixtastic Dec 28, 2024
d0ec708
Used the custom InputRangeMixin to lazily tokenize and normalize word…
Flixtastic Jan 2, 2025
a7823fb
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 4, 2025
5f28add
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 6, 2025
f129ecd
Added comments and necessary tests to WordsAndDocsFileParser
Flixtastic Jan 8, 2025
b699551
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 8, 2025
1642175
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 9, 2025
8c8a1a1
Added comments to WordsAndDcosFileParser.h. Improved useability of te…
Flixtastic Jan 9, 2025
0369de6
Rewrite the tokenizer as a view.
joka921 Jan 10, 2025
c412983
Improved comment, addressed small requested changes
Flixtastic Jan 10, 2025
46fbb98
Addressed sonar issues
Flixtastic Jan 10, 2025
1e0fc14
Removed the temporary localeManagers in WordsAndDocsFileParserTest.cpp
Flixtastic Jan 10, 2025
9f9738c
Addressed more SonarQube problems
Flixtastic Jan 11, 2025
a55f2be
For now excluding helper functions from code coverage since they coul…
Flixtastic Jan 11, 2025
bea5936
Reverting last commit
Flixtastic Jan 11, 2025
349be6d
Small improvement
Flixtastic Jan 15, 2025
src/index/IndexImpl.Text.cpp (89 changes: 55 additions & 34 deletions)
@@ -23,7 +23,7 @@

// _____________________________________________________________________________
cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals) {
std::string contextFile, bool addWordsFromLiterals) const {
auto localeManager = textVocab_.getLocaleManager();
// ROUND 1: If context file aka wordsfile is not empty, read words from there.
// Remember the last context id for the (optional) second round.
@@ -53,8 +53,7 @@
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
for (auto word : normalizedWords) {
for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
WordsFileLine wordLine{word, false, contextId, 1};
Review comment (Member): Do we benefit from a std::move(word) here?

co_yield wordLine;
}
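Regarding the review question above: whether std::move(word) helps depends on what tokenizeAndNormalizeText yields. If each word is a std::string that is not used after building the line, a move saves one string copy per token. A standalone sketch of the idea with a hypothetical Line struct (not QLever's WordsFileLine, which may differ):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for WordsFileLine, storing the word by value.
struct Line {
  std::string word_;
  bool isEntity_;
  uint64_t contextId_;
  int score_;
};

int main() {
  std::vector<std::string> tokens{"alpha", "beta", "gamma"};
  std::vector<Line> lines;
  for (auto word : tokens) {  // `word` is already a copy of each token
    // Without std::move the aggregate initialization copies `word` again;
    // with std::move the string buffer is transferred instead.
    lines.push_back(Line{std::move(word), false, 0, 1});
  }
  for (const auto& line : lines) std::cout << line.word_ << '\n';
}
```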
@@ -63,6 +62,56 @@
}
}

// _____________________________________________________________________________
void IndexImpl::processEntityCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
size_t& entityNotFoundErrorMsgCount) const {
VocabIndex eid;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
logEntityNotFound(line.word_, entityNotFoundErrorMsgCount);
}
}

// _____________________________________________________________________________
void IndexImpl::processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const {
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}

Codecov (codecov/patch) warning: added lines L94-L97 in src/index/IndexImpl.Text.cpp were not covered by tests.
wordsInContext[wid] += line.score_;
}

// _____________________________________________________________________________
void IndexImpl::logEntityNotFound(const string& word,
size_t& entityNotFoundErrorMsgCount) const {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << word << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
}

Codecov (codecov/patch) warning: added lines L107-L109 in src/index/IndexImpl.Text.cpp were not covered by tests.
} else {
entityNotFoundErrorMsgCount++;
}

Codecov (codecov/patch) warning: added lines L111-L112 in src/index/IndexImpl.Text.cpp were not covered by tests.
}

// _____________________________________________________________________________
void IndexImpl::addTextFromContextFile(const string& contextFile,
bool addWordsFromLiterals) {
@@ -235,39 +284,11 @@
}
if (line.isEntity_) {
++nofEntityPostings;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
VocabIndex eid;
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
}
} else {
entityNotFoundErrorMsgCount++;
}
}
processEntityCaseDuringInvertedListProcessing(
line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
} else {
++nofWordPostings;
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}
wordsInContext[wid] += line.score_;
processWordCaseDuringInvertedListProcessing(line, wordsInContext);
}
}
if (entityNotFoundErrorMsgCount > 0) {
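The logEntityNotFound helper extracted in the diff above logs only the first 20 missing entities and then prints a single suppression notice while continuing to count. A minimal standalone sketch of that throttling pattern, with hypothetical names and std::cerr in place of the project's LOG macros:

```cpp
#include <cstddef>
#include <iostream>
#include <string>

// Log the first kMaxWarnings occurrences, emit one "suppressing" notice at
// the threshold, and afterwards only keep counting.
constexpr std::size_t kMaxWarnings = 20;

void warnEntityNotFound(const std::string& word, std::size_t& count) {
  if (count < kMaxWarnings) {
    std::cerr << "Entity from text not in KB: " << word << '\n';
    if (++count == kMaxWarnings) {
      std::cerr << "There are more entities not in the KB..."
                << " suppressing further warnings...\n";
    }
  } else {
    ++count;
  }
}

int main() {
  std::size_t count = 0;
  for (int i = 0; i < 25; ++i) {
    warnEntityNotFound("entity_" + std::to_string(i), count);
  }
  std::cerr << "total misses: " << count << '\n';  // prints 25
}
```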
src/index/IndexImpl.h (14 changes: 13 additions & 1 deletion)
@@ -522,7 +522,19 @@ class IndexImpl {
// testing phase, once it works, it should be easy to include the IRIs and
// literals from the external vocabulary as well).
cppcoro::generator<WordsFileLine> wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals);
std::string contextFile, bool addWordsFromLiterals) const;

void processEntityCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<Id, Score>& entitiesInContxt, size_t& nofLiterals,
size_t& entityNotFoundErrorMsgCount) const;

void processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const;

void logEntityNotFound(const string& word,
size_t& entityNotFoundErrorMsgCount) const;

size_t processWordsForVocabulary(const string& contextFile,
bool addWordsFromLiterals);
src/parser/WordsAndDocsFileParser.cpp (34 changes: 18 additions & 16 deletions)
@@ -11,30 +11,32 @@
#include "util/StringUtils.h"

// _____________________________________________________________________________
WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager)
: in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {}
WordsAndDocsFileParser::WordsAndDocsFileParser(
const string& wordsOrDocsFile, const LocaleManager& localeManager)
: in_(wordsOrDocsFile), localeManager_(localeManager) {}

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
WordsFileLine line;
string l;
if (!std::getline(in_, l)) {
if (!std::getline(getInputStream(), l)) {
return std::nullopt;
};
size_t i = l.find('\t');
}
std::string_view lineView(l);
size_t i = lineView.find('\t');
assert(i != string::npos);
size_t j = i + 2;
assert(j + 3 < l.size());
size_t k = l.find('\t', j + 2);
assert(j + 3 < lineView.size());
size_t k = lineView.find('\t', j + 2);
assert(k != string::npos);
line.isEntity_ = (l[i + 1] == '1');
line.isEntity_ = (lineView[i + 1] == '1');
line.word_ =
(line.isEntity_ ? l.substr(0, i)
: localeManager_.getLowercaseUtf8(l.substr(0, i)));
(line.isEntity_
? lineView.substr(0, i)
: getLocaleManager().getLowercaseUtf8(lineView.substr(0, i)));
line.contextId_ =
TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str()));
line.score_ = static_cast<Score>(atol(l.substr(k + 1).c_str()));
TextRecordIndex::make(atol(lineView.substr(j + 1, k - j - 1).data()));
line.score_ = static_cast<Score>(atol(lineView.substr(k + 1).data()));
#ifndef NDEBUG
if (lastCId_ > line.contextId_) {
AD_THROW("ContextFile has to be sorted by context Id.");
@@ -46,11 +48,11 @@ ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
DocsFileLine line;
string l;
if (!std::getline(in_, l)) {
if (!std::getline(getInputStream(), l)) {
return std::nullopt;
};
}
DocsFileLine line;
size_t i = l.find('\t');
assert(i != string::npos);
line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
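For context on the parsing logic in WordsFileParser::get() above: each wordsfile line is tab-separated into a word, a 0/1 entity flag, a context id, and a score, and the DocsFileParser hunk similarly splits a document id off the front of each docsfile line. The following is a standalone sketch of that field layout using a hypothetical struct and parse function; it omits the real parser's lowercasing via the LocaleManager and its sortedness check on context ids.

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>

// Hypothetical stand-in for WordsFileLine; the field layout mirrors the
// parse in WordsFileParser::get(): word \t isEntity(0/1) \t contextId \t score.
struct ParsedWordsLine {
  std::string word;
  bool isEntity;
  uint64_t contextId;
  int score;
};

ParsedWordsLine parseWordsLine(std::string_view line) {
  auto t1 = line.find('\t');            // after the word
  auto t2 = line.find('\t', t1 + 1);    // after the entity flag
  auto t3 = line.find('\t', t2 + 1);    // after the context id
  ParsedWordsLine result;
  result.word = std::string(line.substr(0, t1));
  result.isEntity = line[t1 + 1] == '1';
  result.contextId = std::stoull(std::string(line.substr(t2 + 1, t3 - t2 - 1)));
  result.score = std::stoi(std::string(line.substr(t3 + 1)));
  return result;
}

int main() {
  // A plain word and an entity occurrence in the same text record (id 17).
  for (auto parsed : {parseWordsLine("broccoli\t0\t17\t1"),
                      parseWordsLine("<Broccoli>\t1\t17\t1")}) {
    std::cout << parsed.word << ' ' << parsed.isEntity << ' '
              << parsed.contextId << ' ' << parsed.score << '\n';
  }
}
```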