diff --git a/crates/deepbiop-utils/src/io.rs b/crates/deepbiop-utils/src/io.rs index be2c92e..5a3a230 100644 --- a/crates/deepbiop-utils/src/io.rs +++ b/crates/deepbiop-utils/src/io.rs @@ -8,13 +8,28 @@ pub use json::*; pub use parquet::*; use std::fs::File; -use noodles::bgzf; use flate2::read::GzDecoder; +use noodles::bgzf; use pyo3::prelude::*; use pyo3_stub_gen::derive::*; use std::io; use std::io::Read; +/// Represents different types of file compression formats +/// +/// This enum is used to identify and handle various compression formats commonly used for files. +/// It can be used in Python through the deepbiop.utils module. +/// +/// # Variants +/// +/// * `Uncompress` - Uncompressed/raw file format +/// * `Gzip` - Standard gzip compression (.gz files) +/// * `Bgzip` - Blocked gzip format, commonly used in bioinformatics +/// * `Zip` - ZIP archive format +/// * `Bzip2` - bzip2 compression format +/// * `Xz` - XZ compression format (LZMA2) +/// * `Zstd` - Zstandard compression format +/// * `Unknown` - Unknown or unrecognized compression format #[gen_stub_pyclass_enum] #[pyclass(eq, eq_int, module = "deepbiop.utils")] #[derive(Debug, PartialEq, Clone, Eq, Hash)] @@ -29,6 +44,34 @@ pub enum CompressedType { Unknown, } +/// Determines the compression type of a file by examining its header/signature +/// +/// This function reads the first few bytes of a file and checks for known magic numbers +/// or file signatures to identify the compression format used. 
+/// +/// # Arguments +/// +/// * `file_path` - Path to the file to check, can be any type that converts to a Path +/// +/// # Returns +/// +/// * `Result` - The detected compression type wrapped in a Result +/// +/// # Errors +/// +/// Returns an error if: +/// * The file cannot be opened +/// * There are issues reading the file header +/// +/// # Examples +/// +/// ```no_run +/// use deepbiop_utils::io::check_compressed_type; +/// use std::path::Path; +/// +/// let file_path = Path::new("test.gz"); +/// let compression = check_compressed_type(file_path).unwrap(); +/// ``` pub fn check_compressed_type>(file_path: P) -> Result { let mut file = File::open(file_path)?; let mut buffer = [0u8; 18]; // Large enough for BGZF detection @@ -45,11 +88,11 @@ pub fn check_compressed_type>(file_path: P) -> Result= 18 => { // Check for BGZF extra field let xlen = u16::from_le_bytes([buffer[10], buffer[11]]) as usize; - if xlen >= 6 - && buffer[12] == 0x42 // B + if xlen >= 6 && buffer[12] == 0x42 // B && buffer[13] == 0x43 // C && buffer[14] == 0x02 // Length of subfield (2) - && buffer[15] == 0x00 // Length of subfield (2) + && buffer[15] == 0x00 + // Length of subfield (2) { Ok(CompressedType::Bgzip) } else { @@ -82,6 +125,23 @@ pub fn check_compressed_type>(file_path: P) -> Result>(file_path: P) -> Result { match check_compressed_type(file_path)? { CompressedType::Uncompress => Ok(false), @@ -90,7 +150,17 @@ pub fn is_compressed>(file_path: P) -> Result { } } - +/// Creates a reader for a file that may be compressed +/// +/// This function detects the compression type of the file and returns an appropriate reader. +/// Currently supports uncompressed files, gzip, and bgzip formats. 
+/// +/// # Arguments +/// * `file_path` - Path to the file to read, can be compressed or uncompressed +/// +/// # Returns +/// * `Ok(Box)` - A boxed reader appropriate for the file's compression +/// * `Err` - If the file cannot be opened or has an unsupported compression type pub fn create_reader>(file_path: P) -> Result> { let compressed_type = check_compressed_type(file_path.as_ref())?; let file = File::open(file_path)?; @@ -103,28 +173,33 @@ pub fn create_reader>(file_path: P) -> Result> }) } - #[cfg(test)] -mod tests{ +mod tests { use super::*; use std::io::Write; use tempfile::NamedTempFile; - #[test] + #[test] fn test_check_file_type() -> Result<()> { // Test gzip file let mut gzip_file = NamedTempFile::new()?; gzip_file.write_all(&[0x1f, 0x8b])?; - assert_eq!(check_compressed_type(gzip_file.path())?, CompressedType::Gzip); + assert_eq!( + check_compressed_type(gzip_file.path())?, + CompressedType::Gzip + ); - // Test bgzip file + // Test bgzip file let mut bgzip_file = NamedTempFile::new()?; let bgzip_header = [ - 0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x00, 0x00 + 0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x42, 0x43, + 0x02, 0x00, 0x00, 0x00, ]; bgzip_file.write_all(&bgzip_header)?; - assert_eq!(check_compressed_type(bgzip_file.path())?, CompressedType::Bgzip); + assert_eq!( + check_compressed_type(bgzip_file.path())?, + CompressedType::Bgzip + ); // Test zip file let mut zip_file = NamedTempFile::new()?; @@ -134,7 +209,10 @@ mod tests{ // Test bzip2 file let mut bzip2_file = NamedTempFile::new()?; bzip2_file.write_all(&[0x42, 0x5a, 0x68])?; - assert_eq!(check_compressed_type(bzip2_file.path())?, CompressedType::Bzip2); + assert_eq!( + check_compressed_type(bzip2_file.path())?, + CompressedType::Bzip2 + ); // Test xz file let mut xz_file = NamedTempFile::new()?; @@ -144,12 +222,18 @@ mod tests{ // Test zstd file let mut zstd_file = NamedTempFile::new()?; 
zstd_file.write_all(&[0x28, 0xb5, 0x2f, 0xfd])?; - assert_eq!(check_compressed_type(zstd_file.path())?, CompressedType::Zstd); + assert_eq!( + check_compressed_type(zstd_file.path())?, + CompressedType::Zstd + ); // Test normal file let mut normal_file = NamedTempFile::new()?; normal_file.write_all(b"Hello world")?; - assert_eq!(check_compressed_type(normal_file.path())?, CompressedType::Uncompress); + assert_eq!( + check_compressed_type(normal_file.path())?, + CompressedType::Uncompress + ); Ok(()) } @@ -169,7 +253,7 @@ mod tests{ Ok(()) } - #[test] + #[test] fn test_real_example() -> Result<()> { let test1 = "./tests/data/test.fastq.gz"; let test2 = "./tests/data/test.fastqbgz.gz"; @@ -180,4 +264,4 @@ mod tests{ assert_eq!(check_compressed_type(test3)?, CompressedType::Uncompress); Ok(()) } -} \ No newline at end of file +} diff --git a/crates/deepbiop-utils/src/python.rs b/crates/deepbiop-utils/src/python.rs index a0964eb..8b8db19 100644 --- a/crates/deepbiop-utils/src/python.rs +++ b/crates/deepbiop-utils/src/python.rs @@ -106,9 +106,19 @@ fn generate_unmaped_intervals( .collect() } +/// Check the compression type of a file. 
+/// +/// Args: +/// path: Path to the file to check +/// +/// Returns: +/// The compression type of the file as a CompressedType variant (Uncompress, Gzip, Bgzip, Zip, Bzip2, Xz, Zstd, or Unknown) +/// +/// Raises: +/// IOError: If the file cannot be opened or read #[gen_stub_pyfunction(module = "deepbiop.utils")] -#[pyfunction(name = "detect_compression")] -fn py_detect_compression(path: PathBuf) -> Result { +#[pyfunction(name = "check_compressed_type")] +fn py_check_compressed_type(path: PathBuf) -> Result { io::check_compressed_type(path) } @@ -129,7 +139,7 @@ pub fn register_utils_module(parent_module: &Bound<'_, PyModule>) -> PyResult<() &child_module )?)?; child_module.add_function(wrap_pyfunction!(generate_unmaped_intervals, &child_module)?)?; - child_module.add_function(wrap_pyfunction!(py_detect_compression, &child_module)?)?; + child_module.add_function(wrap_pyfunction!(py_check_compressed_type, &child_module)?)?; parent_module.add_submodule(&child_module)?; Ok(()) diff --git a/py-deepbiop/Makefile b/py-deepbiop/Makefile index 534fdaf..f4deeea 100644 --- a/py-deepbiop/Makefile +++ b/py-deepbiop/Makefile @@ -8,5 +8,6 @@ clean: build: clean uv sync + uv tool maturin develop -r cargo run --bin stub_gen ruff check --fix --unsafe-fixes diff --git a/py-deepbiop/tests/test_utils.py b/py-deepbiop/tests/test_utils.py index a421b56..7ed8406 100644 --- a/py-deepbiop/tests/test_utils.py +++ b/py-deepbiop/tests/test_utils.py @@ -8,13 +8,13 @@ def test_hight_targets(): def test_dectect_compressed_file(): file = "./tests/data/test.fastq" - result = utils.detect_compression(file) + result = utils.check_compressed_type(file) assert result == utils.CompressedType.Uncompress file = "./tests/data/test.fastq.gz" - result = utils.detect_compression(file) + result = utils.check_compressed_type(file) assert result == utils.CompressedType.Gzip file = "./tests/data/test.fastqbgz.gz" - result = utils.detect_compression(file) + result = utils.check_compressed_type(file) assert result == utils.CompressedType.Bgzip