Refactor the parsing of the text index builder #1695

Merged: 19 commits, merged on Jan 22, 2025. The diff below shows the changes from 6 of the commits.

Commits (19)
b93fde4
Extra classes for Words- and Docsfile parsing
Flixtastic Dec 28, 2024
9c40084
Added method to tokenize and normalize at the same time.
Flixtastic Dec 28, 2024
c365935
Added the tokenization to the ql_utility namespace
Flixtastic Dec 28, 2024
479b763
Revert "Added the tokenization to the ql_utility namespace"
Flixtastic Dec 28, 2024
d0ec708
Used the custom InputRangeMixin to lazily tokenize and normalize word…
Flixtastic Jan 2, 2025
a7823fb
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 4, 2025
5f28add
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 6, 2025
f129ecd
Added comments and necessary tests to WordsAndDocsFileParser
Flixtastic Jan 8, 2025
b699551
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 8, 2025
1642175
Merge branch 'ad-freiburg:master' into words-and-docs-file-parsing
Flixtastic Jan 9, 2025
8c8a1a1
Added comments to WordsAndDcosFileParser.h. Improved useability of te…
Flixtastic Jan 9, 2025
0369de6
Rewrite the tokenizer as a view.
joka921 Jan 10, 2025
c412983
Improved comment, addressed small requested changes
Flixtastic Jan 10, 2025
46fbb98
Addressed sonar issues
Flixtastic Jan 10, 2025
1e0fc14
Removed the temporary localeManagers in WordsAndDocsFileParserTest.cpp
Flixtastic Jan 10, 2025
9f9738c
Addressed more SonarQube problems
Flixtastic Jan 11, 2025
a55f2be
For now excluding helper functions from code coverage since they coul…
Flixtastic Jan 11, 2025
bea5936
Reverting last commit
Flixtastic Jan 11, 2025
349be6d
Small improvement
Flixtastic Jan 15, 2025
src/index/IndexImpl.Text.cpp (89 changes: 55 additions & 34 deletions)
@@ -23,7 +23,7 @@

// _____________________________________________________________________________
cppcoro::generator<WordsFileLine> IndexImpl::wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals) {
std::string contextFile, bool addWordsFromLiterals) const {
auto localeManager = textVocab_.getLocaleManager();
// ROUND 1: If context file aka wordsfile is not empty, read words from there.
// Remember the last context id for the (optional) second round.
@@ -53,8 +53,7 @@
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
textView.remove_prefix(1);
auto normalizedWords = tokenizeAndNormalizeText(textView, localeManager);
for (auto word : normalizedWords) {
for (auto word : tokenizeAndNormalizeText(textView, localeManager)) {
WordsFileLine wordLine{word, false, contextId, 1};
Review comment (Member): Do we benefit from a std::move(word) here?

co_yield wordLine;
}
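Regarding the review question above: whether std::move(word) helps depends on what tokenizeAndNormalizeText yields. If each word is a std::string that is not used after building the line, a move saves one string copy per token. A standalone sketch of the idea with a hypothetical Line struct (not QLever's WordsFileLine, which may differ):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for WordsFileLine, storing the word by value.
struct Line {
  std::string word_;
  bool isEntity_;
  uint64_t contextId_;
  int score_;
};

int main() {
  std::vector<std::string> tokens{"alpha", "beta", "gamma"};
  std::vector<Line> lines;
  for (auto word : tokens) {  // `word` is already a copy of each token
    // Without std::move the aggregate initialization copies `word` again;
    // with std::move the string buffer is transferred instead.
    lines.push_back(Line{std::move(word), false, 0, 1});
  }
  for (const auto& line : lines) std::cout << line.word_ << '\n';
}
```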
@@ -63,6 +62,56 @@
}
}

// _____________________________________________________________________________
void IndexImpl::processEntityCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<Id, Score>& entitiesInContext, size_t& nofLiterals,
size_t& entityNotFoundErrorMsgCount) const {
VocabIndex eid;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
logEntityNotFound(line.word_, entityNotFoundErrorMsgCount);
}
}

// _____________________________________________________________________________
void IndexImpl::processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const {
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}

Codecov (codecov/patch) warning: added lines L94-L97 in src/index/IndexImpl.Text.cpp were not covered by tests.
wordsInContext[wid] += line.score_;
}

// _____________________________________________________________________________
void IndexImpl::logEntityNotFound(const string& word,
size_t& entityNotFoundErrorMsgCount) const {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << word << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
}

Codecov (codecov/patch) warning: added lines L107-L109 in src/index/IndexImpl.Text.cpp were not covered by tests.
} else {
entityNotFoundErrorMsgCount++;
}

Codecov (codecov/patch) warning: added lines L111-L112 in src/index/IndexImpl.Text.cpp were not covered by tests.
}

// _____________________________________________________________________________
void IndexImpl::addTextFromContextFile(const string& contextFile,
bool addWordsFromLiterals) {
@@ -235,39 +284,11 @@
}
if (line.isEntity_) {
++nofEntityPostings;
// TODO<joka921> Currently only IRIs and strings from the vocabulary can
// be tagged entities in the text index (no doubles, ints, etc).
VocabIndex eid;
if (getVocab().getId(line.word_, &eid)) {
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line.score_;
if (line.isLiteralEntity_) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line.word_ << '\n';
if (++entityNotFoundErrorMsgCount == 20) {
LOG(WARN) << "There are more entities not in the KB..."
<< " suppressing further warnings...\n";
}
} else {
entityNotFoundErrorMsgCount++;
}
}
processEntityCaseDuringInvertedListProcessing(
line, entitiesInContext, nofLiterals, entityNotFoundErrorMsgCount);
} else {
++nofWordPostings;
// TODO<joka921> Let the `textVocab_` return a `WordIndex` directly.
WordVocabIndex vid;
bool ret = textVocab_.getId(line.word_, &vid);
WordIndex wid = vid.get();
if (!ret) {
LOG(ERROR) << "ERROR: word \"" << line.word_ << "\" "
<< "not found in textVocab. Terminating\n";
AD_FAIL();
}
wordsInContext[wid] += line.score_;
processWordCaseDuringInvertedListProcessing(line, wordsInContext);
}
}
if (entityNotFoundErrorMsgCount > 0) {
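The logEntityNotFound helper extracted in the diff above logs only the first 20 missing entities and then prints a single suppression notice while continuing to count. A minimal standalone sketch of that throttling pattern, with hypothetical names and std::cerr in place of the project's LOG macros:

```cpp
#include <cstddef>
#include <iostream>
#include <string>

// Log the first kMaxWarnings occurrences, emit one "suppressing" notice at
// the threshold, and afterwards only keep counting.
constexpr std::size_t kMaxWarnings = 20;

void warnEntityNotFound(const std::string& word, std::size_t& count) {
  if (count < kMaxWarnings) {
    std::cerr << "Entity from text not in KB: " << word << '\n';
    if (++count == kMaxWarnings) {
      std::cerr << "There are more entities not in the KB..."
                << " suppressing further warnings...\n";
    }
  } else {
    ++count;
  }
}

int main() {
  std::size_t count = 0;
  for (int i = 0; i < 25; ++i) {
    warnEntityNotFound("entity_" + std::to_string(i), count);
  }
  std::cerr << "total misses: " << count << '\n';  // prints 25
}
```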
src/index/IndexImpl.h (14 changes: 13 additions & 1 deletion)
@@ -522,7 +522,19 @@ class IndexImpl {
// testing phase, once it works, it should be easy to include the IRIs and
// literals from the external vocabulary as well).
cppcoro::generator<WordsFileLine> wordsInTextRecords(
const std::string& contextFile, bool addWordsFromLiterals);
std::string contextFile, bool addWordsFromLiterals) const;

void processEntityCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<Id, Score>& entitiesInContxt, size_t& nofLiterals,
size_t& entityNotFoundErrorMsgCount) const;

void processWordCaseDuringInvertedListProcessing(
const WordsFileLine& line,
ad_utility::HashMap<WordIndex, Score>& wordsInContext) const;

void logEntityNotFound(const string& word,
size_t& entityNotFoundErrorMsgCount) const;

size_t processWordsForVocabulary(const string& contextFile,
bool addWordsFromLiterals);
src/parser/WordsAndDocsFileParser.cpp (34 changes: 18 additions & 16 deletions)
@@ -11,30 +11,32 @@
#include "util/StringUtils.h"

// _____________________________________________________________________________
WordsAndDocsFileParser::WordsAndDocsFileParser(const string& wordsOrDocsFile,
LocaleManager localeManager)
: in_(wordsOrDocsFile), localeManager_(std::move(localeManager)) {}
WordsAndDocsFileParser::WordsAndDocsFileParser(
const string& wordsOrDocsFile, const LocaleManager& localeManager)
: in_(wordsOrDocsFile), localeManager_(localeManager) {}

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {
WordsFileLine line;
string l;
if (!std::getline(in_, l)) {
if (!std::getline(getInputStream(), l)) {
return std::nullopt;
};
size_t i = l.find('\t');
}
std::string_view lineView(l);
size_t i = lineView.find('\t');
assert(i != string::npos);
size_t j = i + 2;
assert(j + 3 < l.size());
size_t k = l.find('\t', j + 2);
assert(j + 3 < lineView.size());
size_t k = lineView.find('\t', j + 2);
assert(k != string::npos);
line.isEntity_ = (l[i + 1] == '1');
line.isEntity_ = (lineView[i + 1] == '1');
line.word_ =
(line.isEntity_ ? l.substr(0, i)
: localeManager_.getLowercaseUtf8(l.substr(0, i)));
(line.isEntity_
? lineView.substr(0, i)
: getLocaleManager().getLowercaseUtf8(lineView.substr(0, i)));
line.contextId_ =
TextRecordIndex::make(atol(l.substr(j + 1, k - j - 1).c_str()));
line.score_ = static_cast<Score>(atol(l.substr(k + 1).c_str()));
TextRecordIndex::make(atol(lineView.substr(j + 1, k - j - 1).data()));
line.score_ = static_cast<Score>(atol(lineView.substr(k + 1).data()));
#ifndef NDEBUG
if (lastCId_ > line.contextId_) {
AD_THROW("ContextFile has to be sorted by context Id.");
@@ -46,11 +48,11 @@ ad_utility::InputRangeFromGet<WordsFileLine>::Storage WordsFileParser::get() {

// _____________________________________________________________________________
ad_utility::InputRangeFromGet<DocsFileLine>::Storage DocsFileParser::get() {
DocsFileLine line;
string l;
if (!std::getline(in_, l)) {
if (!std::getline(getInputStream(), l)) {
return std::nullopt;
};
}
DocsFileLine line;
size_t i = l.find('\t');
assert(i != string::npos);
line.docId_ = DocumentIndex::make(atol(l.substr(0, i).c_str()));
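For context on the parsing logic in WordsFileParser::get() above: each wordsfile line is tab-separated into a word, a 0/1 entity flag, a context id, and a score, and the DocsFileParser hunk similarly splits a document id off the front of each docsfile line. The following is a standalone sketch of that field layout using a hypothetical struct and parse function; it omits the real parser's lowercasing via the LocaleManager and its sortedness check on context ids.

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>

// Hypothetical stand-in for WordsFileLine; the field layout mirrors the
// parse in WordsFileParser::get(): word \t isEntity(0/1) \t contextId \t score.
struct ParsedWordsLine {
  std::string word;
  bool isEntity;
  uint64_t contextId;
  int score;
};

ParsedWordsLine parseWordsLine(std::string_view line) {
  auto t1 = line.find('\t');            // after the word
  auto t2 = line.find('\t', t1 + 1);    // after the entity flag
  auto t3 = line.find('\t', t2 + 1);    // after the context id
  ParsedWordsLine result;
  result.word = std::string(line.substr(0, t1));
  result.isEntity = line[t1 + 1] == '1';
  result.contextId = std::stoull(std::string(line.substr(t2 + 1, t3 - t2 - 1)));
  result.score = std::stoi(std::string(line.substr(t3 + 1)));
  return result;
}

int main() {
  // A plain word and an entity occurrence in the same text record (id 17).
  for (auto parsed : {parseWordsLine("broccoli\t0\t17\t1"),
                      parseWordsLine("<Broccoli>\t1\t17\t1")}) {
    std::cout << parsed.word << ' ' << parsed.isEntity << ' '
              << parsed.contextId << ' ' << parsed.score << '\n';
  }
}
```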