diff --git a/Cargo.lock b/Cargo.lock index 98b9620..e76572b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -57,7 +57,7 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciff" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "indicatif", diff --git a/Cargo.toml b/Cargo.toml index 4e4349b..db6a8d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ciff" -version = "0.2.1" # remember to update html_root_url +version = "0.3.0" # remember to update html_root_url authors = ["Michal Siedlaczek ", "Joel Mackenzie "] edition = "2018" license = "Apache-2.0" diff --git a/src/ciff2pisa.rs b/src/ciff2pisa.rs index 928b7ee..97712a7 100644 --- a/src/ciff2pisa.rs +++ b/src/ciff2pisa.rs @@ -12,7 +12,7 @@ #![warn(clippy::all, clippy::pedantic)] #![allow(clippy::module_name_repetitions, clippy::default_trait_access)] -use ciff::ciff_to_pisa; +use ciff::CiffToPisa; use std::path::PathBuf; use structopt::StructOpt; @@ -32,7 +32,14 @@ struct Args { fn main() { let args = Args::from_args(); - if let Err(error) = ciff_to_pisa(&args.ciff_file, &args.output, args.generate_lexicons) { + let mut converter = CiffToPisa::default(); + converter + .input_path(args.ciff_file) + .output_paths(args.output); + if !args.generate_lexicons { + converter.skip_lexicons(); + } + if let Err(error) = converter.convert() { eprintln!("ERROR: {}", error); std::process::exit(1); } diff --git a/src/lib.rs b/src/lib.rs index ff29e43..5f93ce5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,8 +4,36 @@ //! //! For more information about PISA's internal storage formats, see the //! [documentation](https://pisa.readthedocs.io/en/latest/index.html). - -#![doc(html_root_url = "https://docs.rs/ciff/0.2.1")] +//! +//! # Examples +//! +//! Use [`PisaToCiff`] and [`CiffToPisa`] builders to convert from one format +//! to another. +//! +//! ``` +//! # use std::path::PathBuf; +//! # use tempfile::TempDir; +//! 
# use ciff::{PisaToCiff, CiffToPisa}; +//! # fn main() -> anyhow::Result<()> { +//! # let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); +//! # let ciff_file = dir.join("tests").join("test_data").join("toy-complete-20200309.ciff"); +//! # let temp = TempDir::new()?; +//! # let pisa_base_path = temp.path().join("pisa"); +//! # let output = temp.path().join("output"); +//! CiffToPisa::default() +//! .input_path(ciff_file) +//! .output_paths(&pisa_base_path) +//! .convert()?; +//! PisaToCiff::default() +//! .description("Hello, CIFF!") +//! .pisa_paths(&pisa_base_path) +//! .output_path(output) +//! .convert()?; +//! # Ok(()) +//! # } +//! ``` + +#![doc(html_root_url = "https://docs.rs/ciff/0.3.0")] #![warn( missing_docs, trivial_casts, @@ -30,7 +58,7 @@ use num_traits::ToPrimitive; use protobuf::{CodedInputStream, CodedOutputStream}; use std::borrow::Borrow; use std::convert::TryFrom; -use std::ffi::OsStr; +use std::ffi::{OsStr, OsString}; use std::fmt; use std::fs::File; use std::io::{self, BufRead, BufReader, BufWriter, Write}; @@ -187,34 +215,58 @@ fn check_lines_sorted(reader: R) -> io::Result { Ok(true) } +/// Concatenate two [`OsStr`]ings. +/// +/// Takes two arguments that can be used as a reference to [`OsStr`], and returns +/// a new [`OsString`] instance by concatenating them. +pub fn concat(path: S1, suffix: S2) -> OsString +where + S1: AsRef, + S2: AsRef, +{ + let mut path = path.as_ref().to_owned(); + path.push(suffix); + path +} + +/// Paths to an inverted index in an uncompressed PISA format. 
+#[derive(Debug, Clone, Default)] struct PisaIndexPaths { - terms: PathBuf, documents: PathBuf, frequencies: PathBuf, sizes: PathBuf, - titles: PathBuf, - termlex: PathBuf, - doclex: PathBuf, } impl PisaIndexPaths { - fn from_base_path(path: &Path) -> Option { - let file_name = path.file_name()?; - let parent = path.parent()?; - let format_name = |file: &OsStr, suffix| { - let mut full_name = file.to_owned(); - full_name.push(suffix); - full_name - }; - Some(Self { - terms: parent.join(format_name(file_name, ".terms")), - documents: parent.join(format_name(file_name, ".docs")), - frequencies: parent.join(format_name(file_name, ".freqs")), - sizes: parent.join(format_name(file_name, ".sizes")), - titles: parent.join(format_name(file_name, ".documents")), - termlex: parent.join(format_name(file_name, ".termlex")), - doclex: parent.join(format_name(file_name, ".doclex")), - }) + #[must_use] + fn from_base_path>(path: P) -> Self { + Self { + documents: PathBuf::from(concat(path.as_ref(), ".docs")), + frequencies: PathBuf::from(concat(path.as_ref(), ".freqs")), + sizes: PathBuf::from(concat(path.as_ref(), ".sizes")), + } + } +} + +#[derive(Debug, Clone, Default)] +struct PisaPaths { + index: PisaIndexPaths, + terms: PathBuf, + titles: PathBuf, + termlex: Option, + doclex: Option, +} + +impl PisaPaths { + #[must_use] + fn from_base_path>(path: P) -> Self { + Self { + index: PisaIndexPaths::from_base_path(&path), + terms: PathBuf::from(concat(&path, ".terms")), + titles: PathBuf::from(concat(&path, ".documents")), + termlex: Some(PathBuf::from(concat(&path, ".termlex"))), + doclex: Some(PathBuf::from(concat(&path, ".doclex"))), + } } } @@ -237,14 +289,14 @@ fn reorder_postings(path: &Path, order: &[usize], skip_first: bool) -> Result<() Ok(()) } -fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> { +fn reorder_pisa_index(paths: &PisaPaths) -> Result<()> { let terms = BufReader::new(File::open(&paths.terms)?) 
.lines() .collect::>>()?; let mut order: Vec<_> = (0..terms.len()).collect(); order.sort_by_key(|&i| &terms[i]); - reorder_postings(&paths.documents, &order, true)?; - reorder_postings(&paths.frequencies, &order, false)?; + reorder_postings(&paths.index.documents, &order, true)?; + reorder_postings(&paths.index.frequencies, &order, false)?; let mut term_writer = BufWriter::new(File::create(&paths.terms)?); for index in order { writeln!(&mut term_writer, "{}", terms[index])?; @@ -252,6 +304,99 @@ fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> { Ok(()) } +/// CIFF to PISA converter. +#[derive(Debug, Default, Clone)] +pub struct CiffToPisa { + input: Option, + documents_path: Option, + frequencies_path: Option, + sizes_path: Option, + terms_path: Option, + titles_path: Option, + termlex_path: Option, + doclex_path: Option, +} + +impl CiffToPisa { + /// Sets the CIFF path. Required. + pub fn input_path>(&mut self, path: P) -> &mut Self { + self.input = Some(path.into()); + self + } + + /// Sets PISA (uncompressed) inverted index paths. Required. + /// + /// Paths are constructed by appending file extensions to the base path: + /// - `.docs` for document postings, + /// - `.freqs` for frequency postings, + /// - `.sizes` for document sizes, + /// - `.terms` for terms text file, + /// - `.documents` for document titles text file, + /// - `.termlex` for term lexicon, + /// - `.doclex` for document lexicon. + pub fn output_paths>(&mut self, base_path: P) -> &mut Self { + let paths = PisaPaths::from_base_path(base_path); + self.documents_path = Some(paths.index.documents); + self.frequencies_path = Some(paths.index.frequencies); + self.sizes_path = Some(paths.index.sizes); + self.terms_path = Some(paths.terms); + self.titles_path = Some(paths.titles); + self.termlex_path = paths.termlex; + self.doclex_path = paths.doclex; + self + } + + /// Do not construct document and term lexicons. 
+ pub fn skip_lexicons(&mut self) -> &mut Self { + self.termlex_path = None; + self.doclex_path = None; + self + } + + /// Builds a PISA index using the previously defined parameters. + /// + /// # Errors + /// + /// Error will be returned if: + /// - some required parameters are not defined, + /// - any I/O error occurs during reading input files or writing to the output file, + /// - any input file is in an incorrect format. + pub fn convert(&self) -> Result<()> { + let input = self + .input + .as_ref() + .ok_or_else(|| anyhow!("input path undefined"))?; + let index_output = PisaIndexPaths { + documents: self + .documents_path + .clone() + .ok_or_else(|| anyhow!("document postings path undefined"))?, + frequencies: self + .frequencies_path + .clone() + .ok_or_else(|| anyhow!("frequency postings path undefined"))?, + sizes: self + .sizes_path + .clone() + .ok_or_else(|| anyhow!("document sizes path undefined"))?, + }; + let output = PisaPaths { + index: index_output, + terms: self + .terms_path + .clone() + .ok_or_else(|| anyhow!("terms path undefined"))?, + titles: self + .titles_path + .clone() + .ok_or_else(|| anyhow!("titles path undefined"))?, + termlex: self.termlex_path.clone(), + doclex: self.doclex_path.clone(), + }; + convert_to_pisa(input, &output) + } +} + /// Converts a CIFF index stored in `path` to a PISA "binary collection" (uncompressed inverted /// index) with a basename `output`. /// @@ -262,16 +407,24 @@ fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> { /// - reading protobuf format fails, /// - data format is valid but any ID, frequency, or a count is negative, /// - document records is out of order. 
+#[deprecated = "use CiffToPisa instead"] pub fn ciff_to_pisa(input: &Path, output: &Path, generate_lexicons: bool) -> Result<()> { - let index_paths = - PisaIndexPaths::from_base_path(output).ok_or_else(|| anyhow!("invalid output path"))?; + let mut converter = CiffToPisa::default(); + converter.input_path(input).output_paths(output); + if !generate_lexicons { + converter.skip_lexicons(); + } + converter.convert() +} +fn convert_to_pisa(input: &Path, output: &PisaPaths) -> Result<()> { + println!("{:?}", output); let mut ciff_reader = File::open(input).with_context(|| format!("Unable to open {}", input.display()))?; let mut input = CodedInputStream::new(&mut ciff_reader); - let mut documents = BufWriter::new(File::create(&index_paths.documents)?); - let mut frequencies = BufWriter::new(File::create(&index_paths.frequencies)?); - let mut terms = BufWriter::new(File::create(&index_paths.terms)?); + let mut documents = BufWriter::new(File::create(&output.index.documents)?); + let mut frequencies = BufWriter::new(File::create(&output.index.frequencies)?); + let mut terms = BufWriter::new(File::create(&output.terms)?); let header = Header::from_stream(&mut input)?; println!("{}", header); @@ -297,8 +450,8 @@ pub fn ciff_to_pisa(input: &Path, output: &Path, generate_lexicons: bool) -> Res terms.flush()?; eprintln!("Processing document lengths"); - let mut sizes = BufWriter::new(File::create(&index_paths.sizes)?); - let mut trecids = BufWriter::new(File::create(&index_paths.titles)?); + let mut sizes = BufWriter::new(File::create(&output.index.sizes)?); + let mut trecids = BufWriter::new(File::create(&output.titles)?); let progress = ProgressBar::new(u64::from(header.num_documents)); progress.set_style(pb_style()); @@ -333,15 +486,17 @@ pub fn ciff_to_pisa(input: &Path, output: &Path, generate_lexicons: bool) -> Res trecids.flush()?; progress.finish(); - if !check_lines_sorted(BufReader::new(File::open(&index_paths.terms)?))? 
{ - reorder_pisa_index(&index_paths)?; + if !check_lines_sorted(BufReader::new(File::open(&output.terms)?))? { + reorder_pisa_index(output)?; } - if generate_lexicons { - eprintln!("Generating the document and term lexicons..."); - drop(trecids); - build_lexicon(&index_paths.terms, &index_paths.termlex)?; - build_lexicon(&index_paths.titles, &index_paths.doclex)?; + eprintln!("Generating the document and term lexicons..."); + drop(trecids); + if let Some(termlex) = output.termlex.as_ref() { + build_lexicon(&output.terms, termlex)?; + } + if let Some(doclex) = output.doclex.as_ref() { + build_lexicon(&output.titles, doclex)?; } Ok(()) @@ -455,6 +610,110 @@ fn write_postings( Ok(()) } +/// PISA to CIFF converter. +#[derive(Debug, Default, Clone)] +pub struct PisaToCiff { + documents_path: Option, + frequencies_path: Option, + sizes_path: Option, + terms_path: Option, + titles_path: Option, + output_path: Option, + description: String, +} + +impl PisaToCiff { + /// Sets CIFF index description. + pub fn description>(&mut self, description: S) -> &mut Self { + self.description = description.into(); + self + } + + /// Sets PISA paths. Required. + /// + /// Paths are constructed by appending file extensions to the base path: + /// - `.docs` for document postings, + /// - `.freqs` for frequency postings, + /// - `.sizes` for document sizes, + /// - `.terms` for terms text file, + /// - `.documents` for document titles text file. + pub fn pisa_paths>(&mut self, base_path: P) -> &mut Self { + let paths = PisaPaths::from_base_path(base_path); + self.documents_path = Some(paths.index.documents); + self.frequencies_path = Some(paths.index.frequencies); + self.sizes_path = Some(paths.index.sizes); + self.terms_path = Some(paths.terms); + self.titles_path = Some(paths.titles); + self + } + + /// Sets PISA (uncompressed) inverted index paths. Required. + /// + /// Constructs paths using the given base path, appending suffixes: + /// `.docs`, `.freqs`, and `.sizes`. 
+ pub fn index_paths>(&mut self, base_path: P) -> &mut Self { + let PisaIndexPaths { + documents, + frequencies, + sizes, + } = PisaIndexPaths::from_base_path(base_path); + self.documents_path = Some(documents); + self.frequencies_path = Some(frequencies); + self.sizes_path = Some(sizes); + self + } + + /// Sets the path of the term file (newline-delimited text format). Required. + pub fn terms_path>(&mut self, path: P) -> &mut Self { + self.terms_path = Some(path.into()); + self + } + + /// Sets the path of the document titles file (newline-delimited text format). Required. + pub fn titles_path>(&mut self, path: P) -> &mut Self { + self.titles_path = Some(path.into()); + self + } + + /// Set the output file path. Required. + pub fn output_path>(&mut self, path: P) -> &mut Self { + self.output_path = Some(path.into()); + self + } + + /// Builds a CIFF index using the previously defined parameters. + /// + /// # Errors + /// + /// Error will be returned if: + /// - some required parameters are not defined, + /// - any I/O error occurs during reading input files or writing to the output file, + /// - any input file is in an incorrect format. + pub fn convert(&self) -> Result<()> { + pisa_to_ciff_from_paths( + self.documents_path + .as_ref() + .ok_or_else(|| anyhow!("undefined document postings path"))?, + self.frequencies_path + .as_ref() + .ok_or_else(|| anyhow!("undefined frequency postings path"))?, + self.sizes_path + .as_ref() + .ok_or_else(|| anyhow!("undefined document sizes path"))?, + self.terms_path + .as_ref() + .ok_or_else(|| anyhow!("undefined terms path"))?, + self.titles_path + .as_ref() + .ok_or_else(|| anyhow!("undefined titles path"))?, + self.output_path + .as_ref() + .ok_or_else(|| anyhow!("undefined output path"))?, + &self.description, + ) + } +} + /// Converts a a PISA "binary collection" (uncompressed inverted index) with a basename `input` /// to a CIFF index stored in `output`. 
/// @@ -463,6 +722,7 @@ fn write_postings( /// Returns an error when: /// - an IO error occurs, /// - writing protobuf format fails, +#[deprecated = "use PisaToCiff instead"] pub fn pisa_to_ciff( collection_input: &Path, terms_input: &Path, @@ -470,15 +730,13 @@ pub fn pisa_to_ciff( output: &Path, description: &str, ) -> Result<()> { - pisa_to_ciff_from_paths( - &PathBuf::from(format!("{}.docs", collection_input.display())), - &PathBuf::from(format!("{}.freqs", collection_input.display())), - &PathBuf::from(format!("{}.sizes", collection_input.display())), - terms_input, - titles_input, - output, - description, - ) + PisaToCiff::default() + .description(description) + .index_paths(collection_input) + .terms_path(terms_input) + .titles_path(titles_input) + .output_path(output) + .convert() } fn pisa_to_ciff_from_paths( diff --git a/src/pisa2ciff.rs b/src/pisa2ciff.rs index a451346..67076ac 100644 --- a/src/pisa2ciff.rs +++ b/src/pisa2ciff.rs @@ -12,7 +12,7 @@ #![warn(clippy::all, clippy::pedantic)] #![allow(clippy::module_name_repetitions, clippy::default_trait_access)] -use ciff::pisa_to_ciff; +use ciff::PisaToCiff; use std::path::PathBuf; use structopt::StructOpt; @@ -36,13 +36,14 @@ struct Args { fn main() { let args = Args::from_args(); - if let Err(error) = pisa_to_ciff( - &args.collection, - &args.terms, - &args.documents, - &args.output, - &args.description.unwrap_or_default(), - ) { + if let Err(error) = PisaToCiff::default() + .description(args.description.unwrap_or_default()) + .index_paths(args.collection) + .terms_path(args.terms) + .titles_path(args.documents) + .output_path(args.output) + .convert() + { eprintln!("ERROR: {}", error); std::process::exit(1); } diff --git a/tests/toy.rs b/tests/toy.rs index ede9ad1..6f155b1 100644 --- a/tests/toy.rs +++ b/tests/toy.rs @@ -1,5 +1,8 @@ -use ciff::{ciff_to_pisa, pisa_to_ciff, PayloadSlice}; -use std::fs::read; +#![allow(deprecated)] + +use ciff::{ciff_to_pisa, concat, pisa_to_ciff, CiffToPisa, PayloadSlice, 
PisaToCiff}; +use std::fs::{read, read_to_string}; +use std::path::Path; use std::path::PathBuf; use tempfile::TempDir; @@ -9,30 +12,32 @@ fn test_toy_index() -> anyhow::Result<()> { let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); let temp = TempDir::new().unwrap(); let output_path = temp.path().join("coll"); - if let Err(err) = ciff_to_pisa(&input_path, &output_path, true) { - panic!("{}", err); - } + CiffToPisa::default() + .input_path(input_path) + .output_paths(output_path) + .convert() + .unwrap(); assert_eq!( - std::fs::read_to_string(temp.path().join("coll.documents"))?, + read_to_string(temp.path().join("coll.documents"))?, "WSJ_1\nTREC_DOC_1\nDOC222\n" ); - let bytes = std::fs::read(temp.path().join("coll.doclex"))?; + let bytes = read(temp.path().join("coll.doclex"))?; let actual_titles: Vec<_> = PayloadSlice::new(&bytes).iter().collect(); assert_eq!( actual_titles, vec![b"WSJ_1".as_ref(), b"TREC_DOC_1", b"DOC222"], ); assert_eq!( - std::fs::read(temp.path().join("coll.sizes"))?, + read(temp.path().join("coll.sizes"))?, vec![3, 0, 0, 0, 6, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0] ); assert_eq!( - std::fs::read_to_string(temp.path().join("coll.terms"))? + read_to_string(temp.path().join("coll.terms"))? 
.lines() .collect::>(), vec!["01", "03", "30", "content", "enough", "head", "simpl", "text", "veri"] ); - let bytes = std::fs::read(temp.path().join("coll.termlex"))?; + let bytes = read(temp.path().join("coll.termlex"))?; let actual_terms: Vec<_> = PayloadSlice::new(&bytes).iter().collect(); assert_eq!( actual_terms, @@ -49,7 +54,7 @@ fn test_toy_index() -> anyhow::Result<()> { ] ); assert_eq!( - std::fs::read(temp.path().join("coll.docs"))?, + read(temp.path().join("coll.docs"))?, vec![ 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents 1, 0, 0, 0, 0, 0, 0, 0, // t0 @@ -64,7 +69,7 @@ fn test_toy_index() -> anyhow::Result<()> { ] ); assert_eq!( - std::fs::read(temp.path().join("coll.freqs"))?, + read(temp.path().join("coll.freqs"))?, vec![ 1, 0, 0, 0, 1, 0, 0, 0, // t0 1, 0, 0, 0, 1, 0, 0, 0, // t1 @@ -85,17 +90,18 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> { let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); let temp = TempDir::new().unwrap(); let output_path = temp.path().join("coll"); - if let Err(err) = ciff_to_pisa(&input_path, &output_path, false) { - panic!("{}", err); - } + CiffToPisa::default() + .input_path(input_path) + .output_paths(&output_path) + .convert() + .unwrap(); let ciff_output_path = temp.path().join("ciff"); - pisa_to_ciff( - &output_path, - &temp.path().join("coll.terms"), - &temp.path().join("coll.documents"), - &ciff_output_path, - "Export of toy 3-document collection from Anserini's io.anserini.integration.TrecEndToEndTest test case", - )?; + PisaToCiff::default() + .index_paths(&output_path) + .terms_path(&temp.path().join("coll.terms")) + .titles_path(&temp.path().join("coll.documents")) + .output_path(&ciff_output_path) + .convert()?; // NOTE: the constructed ciff file will not be exactly the same as the initial one. // The reason is that PISA index will be treated as a whole index while the statistics @@ -103,7 +109,11 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> { // back to PISA to verify. 
let pisa_copy = temp.path().join("copy"); - ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?; + CiffToPisa::default() + .input_path(&ciff_output_path) + .output_paths(&pisa_copy) + .convert() + .unwrap(); let coll_basename = output_path.display().to_string(); let copy_basename = pisa_copy.display().to_string(); @@ -137,7 +147,11 @@ fn test_reorder_terms() -> anyhow::Result<()> { let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); let temp = TempDir::new().unwrap(); let pisa_path = temp.path().join("coll"); - ciff_to_pisa(&input_path, &pisa_path, false)?; + CiffToPisa::default() + .input_path(input_path) + .output_paths(&pisa_path) + .convert() + .unwrap(); // Rewrite the terms; later, we will check if the posting lists are in reverse order. std::fs::write( @@ -149,34 +163,37 @@ fn test_reorder_terms() -> anyhow::Result<()> { )?; let ciff_output_path = temp.path().join("ciff"); - pisa_to_ciff( - &pisa_path, - &temp.path().join("coll.terms"), - &temp.path().join("coll.documents"), - &ciff_output_path, - "", - )?; + PisaToCiff::default() + .index_paths(&pisa_path) + .terms_path(&temp.path().join("coll.terms")) + .titles_path(&temp.path().join("coll.documents")) + .output_path(&ciff_output_path) + .convert()?; // Convert back to PISA to verify list order let pisa_copy = temp.path().join("copy"); - ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?; + CiffToPisa::default() + .input_path(ciff_output_path) + .output_paths(pisa_copy) + .convert() + .unwrap(); assert_eq!( - std::fs::read_to_string(temp.path().join("copy.documents"))?, + read_to_string(temp.path().join("copy.documents"))?, "WSJ_1\nTREC_DOC_1\nDOC222\n" ); assert_eq!( - std::fs::read(temp.path().join("coll.sizes"))?, + read(temp.path().join("coll.sizes"))?, vec![3, 0, 0, 0, 6, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0] ); assert_eq!( - std::fs::read_to_string(temp.path().join("copy.terms"))? + read_to_string(temp.path().join("copy.terms"))? 
.lines() .collect::>(), vec!["01", "03", "30", "content", "enough", "head", "simpl", "text", "veri"] ); assert_eq!( - std::fs::read(temp.path().join("copy.docs"))?, + read(temp.path().join("copy.docs"))?, vec![ 1, 0, 0, 0, 3, 0, 0, 0, // Number of documents 1, 0, 0, 0, 1, 0, 0, 0, // t8 @@ -191,7 +208,7 @@ fn test_reorder_terms() -> anyhow::Result<()> { ] ); assert_eq!( - std::fs::read(temp.path().join("copy.freqs"))?, + read(temp.path().join("copy.freqs"))?, vec![ 1, 0, 0, 0, 1, 0, 0, 0, // t8 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, // t7 @@ -207,3 +224,84 @@ fn test_reorder_terms() -> anyhow::Result<()> { Ok(()) } + +fn assert_files_eq, P2: AsRef>(lhs: P1, rhs: P2) { + if read(lhs.as_ref()).unwrap() != read(rhs.as_ref()).unwrap() { + panic!( + "Files not equal: {}, {}", + lhs.as_ref().display(), + rhs.as_ref().display() + ); + } +} + +#[test] +fn test_legacy_api() { + let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); + let temp = TempDir::new().unwrap(); + + let builder_output = temp.path().join("builder"); + let legacy_output = temp.path().join("legacy"); + + CiffToPisa::default() + .input_path(&input_path) + .output_paths(&builder_output) + .convert() + .unwrap(); + ciff_to_pisa(&input_path, &legacy_output, true).unwrap(); + + for suffix in [ + ".docs", + ".freqs", + ".sizes", + ".documents", + ".terms", + ".doclex", + ".termlex", + ] { + assert_files_eq( + concat(&builder_output, suffix), + concat(&legacy_output, suffix), + ); + } + + let builder_ciff = temp.path().join("builder.ciff"); + let legacy_ciff = temp.path().join("legacy.ciff"); + + PisaToCiff::default() + .description("description") + .pisa_paths(&builder_output) + .output_path(&builder_ciff) + .convert() + .unwrap(); + pisa_to_ciff( + &legacy_output, + &PathBuf::from(concat(&legacy_output, ".terms")), + &PathBuf::from(concat(&legacy_output, ".documents")), + &legacy_ciff, + "description", + ) + .unwrap(); + + assert_files_eq(builder_ciff, legacy_ciff); +} + 
+#[test] +fn test_skip_lexicons() { + let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff"); + let temp = TempDir::new().unwrap(); + let output = temp.path().join("builder"); + + ciff_to_pisa(&input_path, &output, false).unwrap(); + assert!(!PathBuf::from(concat(&output, ".termlex")).exists()); + assert!(!PathBuf::from(concat(&output, ".doclex")).exists()); + + CiffToPisa::default() + .input_path(&input_path) + .output_paths(&output) + .skip_lexicons() + .convert() + .unwrap(); + assert!(!PathBuf::from(concat(&output, ".termlex")).exists()); + assert!(!PathBuf::from(concat(&output, ".doclex")).exists()); +}