Skip to content

Commit

Permalink
Merge pull request #32 from pisa-engine/lexicons
Browse files Browse the repository at this point in the history
Add ability to write lexicons directly
  • Loading branch information
JMMackenzie authored Mar 7, 2022
2 parents 6dadfad + 5ec21bd commit 01d6af5
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 21 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ciff"
version = "0.1.1" # remember to update html_root_url
version = "0.2.0" # remember to update html_root_url
authors = ["Michal Siedlaczek <michal.siedlaczek@nyu.edu>"]
edition = "2018"
license = "Apache-2.0"
Expand Down
4 changes: 3 additions & 1 deletion src/ciff2pisa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ struct Args {
ciff_file: PathBuf,
#[structopt(short, long, help = "Output basename")]
output: PathBuf,
#[structopt(short, long, help = "Generate lexicon files?")]
generate_lexicons: bool,
}

/// CLI entry point: parses arguments, runs the CIFF → PISA conversion,
/// and exits with status 1 (after printing the error) on failure.
fn main() {
    let args = Args::from_args();
    // Forward the `--generate-lexicons` flag so the converter can also
    // emit the binary term/document lexicons alongside the index.
    if let Err(error) = ciff_to_pisa(&args.ciff_file, &args.output, args.generate_lexicons) {
        eprintln!("ERROR: {}", error);
        std::process::exit(1);
    }
Expand Down
19 changes: 16 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
//! For more information about PISA's internal storage formats, see the
//! [documentation](https://pisa.readthedocs.io/en/latest/index.html).
#![doc(html_root_url = "https://docs.rs/ciff/0.1.1")]
#![doc(html_root_url = "https://docs.rs/ciff/0.2.0")]
#![warn(
missing_docs,
trivial_casts,
Expand Down Expand Up @@ -46,7 +46,7 @@ pub use binary_collection::{
};

mod payload_vector;
pub use payload_vector::{PayloadIter, PayloadSlice, PayloadVector};
pub use payload_vector::{build_lexicon, PayloadIter, PayloadSlice, PayloadVector};

type Result<T> = anyhow::Result<T>;

Expand Down Expand Up @@ -193,6 +193,8 @@ struct PisaIndexPaths {
frequencies: PathBuf,
sizes: PathBuf,
titles: PathBuf,
termlex: PathBuf,
doclex: PathBuf,
}

impl PisaIndexPaths {
Expand All @@ -210,6 +212,8 @@ impl PisaIndexPaths {
frequencies: parent.join(format_name(file_name, ".freqs")),
sizes: parent.join(format_name(file_name, ".sizes")),
titles: parent.join(format_name(file_name, ".documents")),
termlex: parent.join(format_name(file_name, ".termlex")),
doclex: parent.join(format_name(file_name, ".doclex")),
})
}
}
Expand Down Expand Up @@ -258,7 +262,7 @@ fn reorder_pisa_index(paths: &PisaIndexPaths) -> Result<()> {
/// - reading protobuf format fails,
/// - data format is valid but any ID, frequency, or a count is negative,
- document records are out of order.
pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
pub fn ciff_to_pisa(input: &Path, output: &Path, generate_lexicons: bool) -> Result<()> {
let index_paths =
PisaIndexPaths::from_base_path(output).ok_or_else(|| anyhow!("invalid output path"))?;

Expand Down Expand Up @@ -300,6 +304,7 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
progress.set_style(pb_style());
progress.set_draw_delta(u64::from(header.num_documents) / 100);
sizes.write_all(&header.num_documents.to_le_bytes())?;
sizes.flush()?;

for docs_seen in 0..header.num_documents {
let doc_record = input.read_message::<DocRecord>()?;
Expand All @@ -325,12 +330,20 @@ pub fn ciff_to_pisa(input: &Path, output: &Path) -> Result<()> {
writeln!(trecids, "{}", trecid)?;
progress.inc(1);
}
trecids.flush()?;
progress.finish();

if !check_lines_sorted(BufReader::new(File::open(&index_paths.terms)?))? {
reorder_pisa_index(&index_paths)?;
}

if generate_lexicons {
eprintln!("Generating the document and term lexicons...");
drop(trecids);
build_lexicon(&index_paths.terms, &index_paths.termlex)?;
build_lexicon(&index_paths.titles, &index_paths.doclex)?;
}

Ok(())
}

Expand Down
34 changes: 25 additions & 9 deletions src/payload_vector.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::convert::TryInto;
use std::io::{self, Write};
use std::fs::File;
use std::io::{self, BufRead, BufReader, BufWriter, Write};
use std::ops::{Deref, Index};
use std::path::Path;

/// Owning variant of [`PayloadSlice`], in which the underlying bytes are fully
/// in memory within the struct. This is useful mainly for building the structure
Expand Down Expand Up @@ -249,25 +251,39 @@ impl<'a> Iterator for PayloadIter<'a> {
}
}

/// Builds a lexicon using the text file at `input` and writes it to `output`.
///
/// Each line of the input file becomes one payload entry of the resulting
/// lexicon, in the original line order.
///
/// # Errors
///
/// Returns an error if any failure occurs during reading the input
/// or writing to the output.
pub fn build_lexicon(input: &Path, output: &Path) -> io::Result<()> {
    // Collect all input lines into a payload vector; the first read
    // error, if any, short-circuits the collection.
    let reader = BufReader::new(File::open(input)?);
    let lexicon: PayloadVector = reader.lines().collect::<io::Result<_>>()?;
    // Serialize through a buffered writer and flush explicitly so write
    // errors surface here instead of being swallowed on drop.
    let mut writer = BufWriter::new(File::create(output)?);
    lexicon.write(&mut writer)?;
    writer.flush()?;
    Ok(())
}

#[cfg(test)]
mod test {
use super::*;
use std::io;
use std::path::PathBuf;
use tempfile::TempDir;

#[test]
#[cfg(not(miri))]
// Builds a lexicon from the `terms.txt` fixture via `build_lexicon` and
// checks the produced bytes match the pre-built `terms.lex` fixture.
fn test_write() -> io::Result<()> {
    let test_data_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/test_data");
    // Write into a temp dir so the fixture directory is never modified.
    let tmp = TempDir::new()?;
    let output = tmp.path().join("terms.lex");
    build_lexicon(&test_data_dir.join("terms.txt"), &output)?;
    let actual_lex_bytes = std::fs::read(output)?;
    let expected_lex_bytes = std::fs::read(test_data_dir.join("terms.lex"))?;
    assert_eq!(actual_lex_bytes, expected_lex_bytes);
    Ok(())
}

Expand Down
34 changes: 28 additions & 6 deletions tests/toy.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use ciff::{ciff_to_pisa, pisa_to_ciff};
use ciff::{ciff_to_pisa, pisa_to_ciff, PayloadSlice};
use std::fs::read;
use std::path::PathBuf;
use tempfile::TempDir;
Expand All @@ -9,13 +9,19 @@ fn test_toy_index() -> anyhow::Result<()> {
let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff");
let temp = TempDir::new().unwrap();
let output_path = temp.path().join("coll");
if let Err(err) = ciff_to_pisa(&input_path, &output_path) {
if let Err(err) = ciff_to_pisa(&input_path, &output_path, true) {
panic!("{}", err);
}
assert_eq!(
std::fs::read_to_string(temp.path().join("coll.documents"))?,
"WSJ_1\nTREC_DOC_1\nDOC222\n"
);
let bytes = std::fs::read(temp.path().join("coll.doclex"))?;
let actual_titles: Vec<_> = PayloadSlice::new(&bytes).iter().collect();
assert_eq!(
actual_titles,
vec![b"WSJ_1".as_ref(), b"TREC_DOC_1", b"DOC222"],
);
assert_eq!(
std::fs::read(temp.path().join("coll.sizes"))?,
vec![3, 0, 0, 0, 6, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0]
Expand All @@ -26,6 +32,22 @@ fn test_toy_index() -> anyhow::Result<()> {
.collect::<Vec<_>>(),
vec!["01", "03", "30", "content", "enough", "head", "simpl", "text", "veri"]
);
let bytes = std::fs::read(temp.path().join("coll.termlex"))?;
let actual_terms: Vec<_> = PayloadSlice::new(&bytes).iter().collect();
assert_eq!(
actual_terms,
vec![
b"01".as_ref(),
b"03",
b"30",
b"content",
b"enough",
b"head",
b"simpl",
b"text",
b"veri"
]
);
assert_eq!(
std::fs::read(temp.path().join("coll.docs"))?,
vec![
Expand Down Expand Up @@ -63,7 +85,7 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> {
let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff");
let temp = TempDir::new().unwrap();
let output_path = temp.path().join("coll");
if let Err(err) = ciff_to_pisa(&input_path, &output_path) {
if let Err(err) = ciff_to_pisa(&input_path, &output_path, false) {
panic!("{}", err);
}
let ciff_output_path = temp.path().join("ciff");
Expand All @@ -81,7 +103,7 @@ fn test_to_and_from_ciff() -> anyhow::Result<()> {
// back to PISA to verify.

let pisa_copy = temp.path().join("copy");
ciff_to_pisa(&ciff_output_path, &pisa_copy)?;
ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?;

let coll_basename = output_path.display().to_string();
let copy_basename = pisa_copy.display().to_string();
Expand Down Expand Up @@ -115,7 +137,7 @@ fn test_reorder_terms() -> anyhow::Result<()> {
let input_path = PathBuf::from("tests/test_data/toy-complete-20200309.ciff");
let temp = TempDir::new().unwrap();
let pisa_path = temp.path().join("coll");
ciff_to_pisa(&input_path, &pisa_path)?;
ciff_to_pisa(&input_path, &pisa_path, false)?;

// Rewrite the terms; later, we will check if the posting lists are in reverse order.
std::fs::write(
Expand All @@ -137,7 +159,7 @@ fn test_reorder_terms() -> anyhow::Result<()> {

// Convert back to PISA to verify list order
let pisa_copy = temp.path().join("copy");
ciff_to_pisa(&ciff_output_path, &pisa_copy)?;
ciff_to_pisa(&ciff_output_path, &pisa_copy, false)?;

assert_eq!(
std::fs::read_to_string(temp.path().join("copy.documents"))?,
Expand Down

0 comments on commit 01d6af5

Please sign in to comment.