diff --git a/Cargo.lock b/Cargo.lock index 6e2a517..9f7dff2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -57,7 +57,7 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciff" -version = "0.1.1" +version = "0.2.0" dependencies = [ "anyhow", "indicatif", diff --git a/Cargo.toml b/Cargo.toml index 89dc60a..a0a10ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ciff" -version = "0.1.1" # remember to update html_root_url +version = "0.2.0" # remember to update html_root_url authors = ["Michal Siedlaczek "] edition = "2018" license = "Apache-2.0" diff --git a/src/ciff2pisa.rs b/src/ciff2pisa.rs index e39e453..928b7ee 100644 --- a/src/ciff2pisa.rs +++ b/src/ciff2pisa.rs @@ -26,11 +26,13 @@ struct Args { ciff_file: PathBuf, #[structopt(short, long, help = "Output basename")] output: PathBuf, + #[structopt(short, long, help = "Generate lexicon files?")] + generate_lexicons: bool, } fn main() { let args = Args::from_args(); - if let Err(error) = ciff_to_pisa(&args.ciff_file, &args.output) { + if let Err(error) = ciff_to_pisa(&args.ciff_file, &args.output, args.generate_lexicons) { eprintln!("ERROR: {}", error); std::process::exit(1); } diff --git a/src/lib.rs b/src/lib.rs index d6696f3..7e0e7cb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ //! For more information about PISA's internal storage formats, see the //! [documentation](https://pisa.readthedocs.io/en/latest/index.html). -#![doc(html_root_url = "https://docs.rs/ciff/0.1.1")] +#![doc(html_root_url = "https://docs.rs/ciff/0.2.0")] #![warn( missing_docs, trivial_casts, @@ -46,7 +46,7 @@ pub use binary_collection::{ }; mod payload_vector; -pub use payload_vector::{PayloadIter, PayloadSlice, PayloadVector}; +pub use payload_vector::{build_lexicon, PayloadIter, PayloadSlice, PayloadVector}; type Result = anyhow::Result; @@ -193,6 +193,8 @@ struct PisaIndexPaths { frequencies: PathBuf, sizes: PathBuf, titles: PathBuf, + termlex: PathBuf, + doclex: PathBuf, } impl PisaIndexPaths { @@ -210,6 +212,8 @@ impl PisaIndexPaths { frequencies: parent.join(format_name(file_name, ".freqs")), sizes: parent.join(format_name(file_name, ".sizes")), titles: parent.join(format_name(file_name, ".documents")), + termlex: parent.join(format_name(file_name, ".termlex")), + doclex: parent.join(format_name(file_name, ".doclex")), }) } } @@ -258,7 +262,7 @@ fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> { /// - reading protobuf format fails, /// - data format is valid but any ID, frequency, or a count is negative, /// - document records is out of order. -pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> { +pub fn ciff_to_pisa(input: &Path, output: &Path, generate_lexicons: bool) -> Result<()> { let index_paths = PisaIndexPaths::from_base_path(output).ok_or_else(|| anyhow!("invalid output path"))?; @@ -300,6 +304,7 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> { progress.set_style(pb_style()); progress.set_draw_delta(u64::from(header.num_documents) / 100); sizes.write_all(&header.num_documents.to_le_bytes())?; + sizes.flush()?; for docs_seen in 0..header.num_documents { let doc_record = input.read_message::()?; @@ -325,12 +330,20 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> { writeln!(trecids, "{}", trecid)?; progress.inc(1); } + trecids.flush()?; progress.finish(); if !check_lines_sorted(BufReader::new(File::open(&index_paths.terms)?))? { reorder_pisa_index(&index_paths)?; } + if generate_lexicons { + eprintln!("Generating the document and term lexicons..."); + drop(trecids); + build_lexicon(&index_paths.terms, &index_paths.termlex)?; + build_lexicon(&index_paths.titles, &index_paths.doclex)?; + } + Ok(()) } diff --git a/src/payload_vector.rs b/src/payload_vector.rs index 765ceb4..a772f08 100644 --- a/src/payload_vector.rs +++ b/src/payload_vector.rs @@ -1,6 +1,8 @@ use std::convert::TryInto; -use std::io::{self, Write}; +use std::fs::File; +use std::io::{self, BufRead, BufReader, BufWriter, Write}; use std::ops::{Deref, Index}; +use std::path::Path; /// Owning variant of [`PayloadSlice`], in which the underlying bytes are fully /// in memory within the struct. This is useful mainly for building the structure @@ -249,25 +251,39 @@ impl<'a> Iterator for PayloadIter<'a> { } } +/// Builds a lexicon using the text file at `input` and writes it to `output`. +/// +/// # Errors +/// +/// Returns an error if any failure occurs during reading the input +/// or writing to the output. +pub fn build_lexicon(input: &Path, output: &Path) -> io::Result<()> { + let lex = BufReader::new(File::open(input)?) + .lines() + .collect::>()?; + let mut lex_path = BufWriter::new(File::create(output)?); + lex.write(&mut lex_path)?; + lex_path.flush()?; + Ok(()) +} + #[cfg(test)] mod test { use super::*; use std::io; use std::path::PathBuf; + use tempfile::TempDir; #[test] #[cfg(not(miri))] fn test_write() -> io::Result<()> { let test_data_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/test_data"); - let lex: PayloadVector = std::fs::read_to_string(test_data_dir.join("terms.txt"))? - .trim() - .split_whitespace() - .map(str::to_string) - .collect(); - let mut output = Vec::::new(); + let tmp = TempDir::new()?; + let output = tmp.path().join("terms.lex"); + build_lexicon(&test_data_dir.join("terms.txt"), &output)?; + let actual_lex_bytes = std::fs::read(output)?; let expected_lex_bytes = std::fs::read(test_data_dir.join("terms.lex"))?; - lex.write(&mut output)?; - assert_eq!(output, expected_lex_bytes); + assert_eq!(actual_lex_bytes, expected_lex_bytes); Ok(()) } diff --git a/tests/toy.rs b/tests/toy.rs index e7010d0..ede9ad1 100644 --- a/tests/toy.rs +++ b/tests/toy.rs @@ -1,4 +1,4 @@ -use ciff::{ciff_to_pisa, pisa_to_ciff}; +use ciff::{ciff_to_pisa, pisa_to_ciff, PayloadSlice}; use std::fs::read; use std::path::PathBuf; use tempfile::TempDir; @@ -9,13 +9,19 @@ fn test_toy_index() -> anyhow::Result<()> { let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); let temp = TempDir::new().unwrap(); let output_path = temp.path().join("coll"); - if let Err(err) = ciff_to_pisa(&input_path, &output_path) { + if let Err(err) = ciff_to_pisa(&input_path, &output_path, true) { panic!("{}", err); } assert_eq!( std::fs::read_to_string(temp.path().join("coll.documents"))?, "WSJ_1\nTREC_DOC_1\nDOC222\n" ); + let bytes = std::fs::read(temp.path().join("coll.doclex"))?; + let actual_titles: Vec<_> = PayloadSlice::new(&bytes).iter().collect(); + assert_eq!( + actual_titles, + vec![b"WSJ_1".as_ref(), b"TREC_DOC_1", b"DOC222"], + ); assert_eq!( std::fs::read(temp.path().join("coll.sizes"))?, vec![3, 0, 0, 0, 6, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0] @@ -26,6 +32,22 @@ fn test_toy_index() -> anyhow::Result<()> { .collect::>(), vec!["01", "03", "30", "content", "enough", "head", "simpl", "text", "veri"] ); + let bytes = std::fs::read(temp.path().join("coll.termlex"))?; + let actual_terms: Vec<_> = PayloadSlice::new(&bytes).iter().collect(); + assert_eq!( + actual_terms, + vec![ + b"01".as_ref(), + b"03", + b"30", + b"content", + b"enough", + b"head", + b"simpl", + b"text", + b"veri" + ] + ); assert_eq!( std::fs::read(temp.path().join("coll.docs"))?, vec![ @@ -63,7 +85,7 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> { let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); let temp = TempDir::new().unwrap(); let output_path = temp.path().join("coll"); - if let Err(err) = ciff_to_pisa(&input_path, &output_path) { + if let Err(err) = ciff_to_pisa(&input_path, &output_path, false) { panic!("{}", err); } let ciff_output_path = temp.path().join("ciff"); @@ -81,7 +103,7 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> { // back to PISA to verify. let pisa_copy = temp.path().join("copy"); - ciff_to_pisa(&ciff_output_path, &pisa_copy)?; + ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?; let coll_basename = output_path.display().to_string(); let copy_basename = pisa_copy.display().to_string(); @@ -115,7 +137,7 @@ fn test_reorder_terms() -> anyhow::Result<()> { let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); let temp = TempDir::new().unwrap(); let pisa_path = temp.path().join("coll"); - ciff_to_pisa(&input_path, &pisa_path)?; + ciff_to_pisa(&input_path, &pisa_path, false)?; // Rewrite the terms; later, we will check if the posting lists are in reverse order. std::fs::write( @@ -137,7 +159,7 @@ fn test_reorder_terms() -> anyhow::Result<()> { // Convert back to PISA to verify list order let pisa_copy = temp.path().join("copy"); - ciff_to_pisa(&ciff_output_path, &pisa_copy)?; + ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?; assert_eq!( std::fs::read_to_string(temp.path().join("copy.documents"))?,