Skip to content

Commit

Permalink
Merge pull request #32 from pisa-engine/lexicons
Browse files Browse the repository at this point in the history
Add ability to write lexicons directly
  • Loading branch information
JMMackenzie authored Mar 7, 2022
2 parents 6dadfad + 5ec21bd commit 01d6af5
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ciff"
version = "0.1.1" # remember to update html_root_url
version = "0.2.0" # remember to update html_root_url
authors = ["Michal Siedlaczek <michal.siedlaczek@nyu.edu>"]
edition = "2018"
license = "Apache-2.0"
Expand Down
4 changes: 3 additions & 1 deletion src/ciff2pisa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ struct Args {
ciff_file: PathBuf,
#[structopt(short, long, help = "Output basename")]
output: PathBuf,
#[structopt(short, long, help = "Generate lexicon files?")]
generate_lexicons: bool,
}

/// CLI entry point: parses arguments, runs the CIFF → PISA conversion,
/// and exits with status 1 (after printing the error) on failure.
fn main() {
    let args = Args::from_args();
    // Forward the `--generate-lexicons` flag so the converter can also
    // emit the binary term/document lexicons alongside the index.
    if let Err(error) = ciff_to_pisa(&args.ciff_file, &args.output, args.generate_lexicons) {
        eprintln!("ERROR: {}", error);
        std::process::exit(1);
    }
Expand Down
19 changes: 16 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
//! For more information about PISA's internal storage formats, see the
//! [documentation](https://pisa.readthedocs.io/en/latest/index.html).
#![doc(html_root_url = "https://docs.rs/ciff/0.1.1")]
#![doc(html_root_url = "https://docs.rs/ciff/0.2.0")]
#![warn(
missing_docs,
trivial_casts,
Expand Down Expand Up @@ -46,7 +46,7 @@ pub use binary_collection::{
};

mod payload_vector;
pub use payload_vector::{PayloadIter, PayloadSlice, PayloadVector};
pub use payload_vector::{build_lexicon, PayloadIter, PayloadSlice, PayloadVector};

type Result<T> = anyhow::Result<T>;

Expand Down Expand Up @@ -193,6 +193,8 @@ struct PisaIndexPaths {
frequencies: PathBuf,
sizes: PathBuf,
titles: PathBuf,
termlex: PathBuf,
doclex: PathBuf,
}

impl PisaIndexPaths {
Expand All @@ -210,6 +212,8 @@ impl PisaIndexPaths {
frequencies: parent.join(format_name(file_name, ".freqs")),
sizes: parent.join(format_name(file_name, ".sizes")),
titles: parent.join(format_name(file_name, ".documents")),
termlex: parent.join(format_name(file_name, ".termlex")),
doclex: parent.join(format_name(file_name, ".doclex")),
})
}
}
Expand Down Expand Up @@ -258,7 +262,7 @@ fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> {
/// - reading protobuf format fails,
/// - data format is valid but any ID, frequency, or a count is negative,
- document records are out of order.
pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
pub fn ciff_to_pisa(input: &Path, output: &Path, generate_lexicons: bool) -> Result<()> {
let index_paths =
PisaIndexPaths::from_base_path(output).ok_or_else(|| anyhow!("invalid output path"))?;

Expand Down Expand Up @@ -300,6 +304,7 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
progress.set_style(pb_style());
progress.set_draw_delta(u64::from(header.num_documents) / 100);
sizes.write_all(&header.num_documents.to_le_bytes())?;
sizes.flush()?;

for docs_seen in 0..header.num_documents {
let doc_record = input.read_message::<DocRecord>()?;
Expand All @@ -325,12 +330,20 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
writeln!(trecids, "{}", trecid)?;
progress.inc(1);
}
trecids.flush()?;
progress.finish();

if !check_lines_sorted(BufReader::new(File::open(&index_paths.terms)?))? {
reorder_pisa_index(&index_paths)?;
}

if generate_lexicons {
eprintln!("Generating the document and term lexicons...");
drop(trecids);
build_lexicon(&index_paths.terms, &index_paths.termlex)?;
build_lexicon(&index_paths.titles, &index_paths.doclex)?;
}

Ok(())
}

Expand Down
34 changes: 25 additions & 9 deletions src/payload_vector.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::convert::TryInto;
use std::io::{self, Write};
use std::fs::File;
use std::io::{self, BufRead, BufReader, BufWriter, Write};
use std::ops::{Deref, Index};
use std::path::Path;

/// Owning variant of [`PayloadSlice`], in which the underlying bytes are fully
/// in memory within the struct. This is useful mainly for building the structure
Expand Down Expand Up @@ -249,25 +251,39 @@ impl<'a> Iterator for PayloadIter<'a> {
}
}

/// Builds a lexicon using the text file at `input` and writes it to `output`.
///
/// Each line of the input file becomes one payload entry of the resulting
/// lexicon, in the original line order.
///
/// # Errors
///
/// Returns an error if any failure occurs during reading the input
/// or writing to the output.
pub fn build_lexicon(input: &Path, output: &Path) -> io::Result<()> {
    // Collect all input lines into a payload vector; the first read
    // error, if any, short-circuits the collection.
    let reader = BufReader::new(File::open(input)?);
    let lexicon: PayloadVector = reader.lines().collect::<io::Result<_>>()?;
    // Serialize through a buffered writer and flush explicitly so write
    // errors surface here instead of being swallowed on drop.
    let mut writer = BufWriter::new(File::create(output)?);
    lexicon.write(&mut writer)?;
    writer.flush()?;
    Ok(())
}

#[cfg(test)]
mod test {
use super::*;
use std::io;
use std::path::PathBuf;
use tempfile::TempDir;

#[test]
#[cfg(not(miri))]
// Builds a lexicon from the `terms.txt` fixture via `build_lexicon` and
// checks the produced bytes match the pre-built `terms.lex` fixture.
fn test_write() -> io::Result<()> {
    let test_data_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/test_data");
    // Write into a temp dir so the fixture directory is never modified.
    let tmp = TempDir::new()?;
    let output = tmp.path().join("terms.lex");
    build_lexicon(&test_data_dir.join("terms.txt"), &output)?;
    let actual_lex_bytes = std::fs::read(output)?;
    let expected_lex_bytes = std::fs::read(test_data_dir.join("terms.lex"))?;
    assert_eq!(actual_lex_bytes, expected_lex_bytes);
    Ok(())
}

Expand Down
34 changes: 28 additions & 6 deletions tests/toy.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use ciff::{ciff_to_pisa, pisa_to_ciff};
use ciff::{ciff_to_pisa, pisa_to_ciff, PayloadSlice};
use std::fs::read;
use std::path::PathBuf;
use tempfile::TempDir;
Expand All @@ -9,13 +9,19 @@ fn test_toy_index() -> anyhow::Result<()> {
let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff");
let temp = TempDir::new().unwrap();
let output_path = temp.path().join("coll");
if let Err(err) = ciff_to_pisa(&input_path, &output_path) {
if let Err(err) = ciff_to_pisa(&input_path, &output_path, true) {
panic!("{}", err);
}
assert_eq!(
std::fs::read_to_string(temp.path().join("coll.documents"))?,
"WSJ_1\nTREC_DOC_1\nDOC222\n"
);
let bytes = std::fs::read(temp.path().join("coll.doclex"))?;
let actual_titles: Vec<_> = PayloadSlice::new(&bytes).iter().collect();
assert_eq!(
actual_titles,
vec![b"WSJ_1".as_ref(), b"TREC_DOC_1", b"DOC222"],
);
assert_eq!(
std::fs::read(temp.path().join("coll.sizes"))?,
vec![3, 0, 0, 0, 6, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0]
Expand All @@ -26,6 +32,22 @@ fn test_toy_index() -> anyhow::Result<()> {
.collect::<Vec<_>>(),
vec!["01", "03", "30", "content", "enough", "head", "simpl", "text", "veri"]
);
let bytes = std::fs::read(temp.path().join("coll.termlex"))?;
let actual_terms: Vec<_> = PayloadSlice::new(&bytes).iter().collect();
assert_eq!(
actual_terms,
vec![
b"01".as_ref(),
b"03",
b"30",
b"content",
b"enough",
b"head",
b"simpl",
b"text",
b"veri"
]
);
assert_eq!(
std::fs::read(temp.path().join("coll.docs"))?,
vec![
Expand Down Expand Up @@ -63,7 +85,7 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> {
let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff");
let temp = TempDir::new().unwrap();
let output_path = temp.path().join("coll");
if let Err(err) = ciff_to_pisa(&input_path, &output_path) {
if let Err(err) = ciff_to_pisa(&input_path, &output_path, false) {
panic!("{}", err);
}
let ciff_output_path = temp.path().join("ciff");
Expand All @@ -81,7 +103,7 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> {
// back to PISA to verify.

let pisa_copy = temp.path().join("copy");
ciff_to_pisa(&ciff_output_path, &pisa_copy)?;
ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?;

let coll_basename = output_path.display().to_string();
let copy_basename = pisa_copy.display().to_string();
Expand Down Expand Up @@ -115,7 +137,7 @@ fn test_reorder_terms() -> anyhow::Result<()> {
let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff");
let temp = TempDir::new().unwrap();
let pisa_path = temp.path().join("coll");
ciff_to_pisa(&input_path, &pisa_path)?;
ciff_to_pisa(&input_path, &pisa_path, false)?;

// Rewrite the terms; later, we will check if the posting lists are in reverse order.
std::fs::write(
Expand All @@ -137,7 +159,7 @@ fn test_reorder_terms() -> anyhow::Result<()> {

// Convert back to PISA to verify list order
let pisa_copy = temp.path().join("copy");
ciff_to_pisa(&ciff_output_path, &pisa_copy)?;
ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?;

assert_eq!(
std::fs::read_to_string(temp.path().join("copy.documents"))?,
Expand Down

0 comments on commit 01d6af5

Please sign in to comment.