Skip to content

Commit

Permalink
feat: Enhance compression handling with new functions and improved do…
Browse files Browse the repository at this point in the history
…cumentation for file type detection
  • Loading branch information
cauliyang committed Jan 16, 2025
1 parent 09a426b commit 0ea69aa
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 24 deletions.
120 changes: 102 additions & 18 deletions crates/deepbiop-utils/src/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,28 @@ pub use json::*;
pub use parquet::*;
use std::fs::File;

use noodles::bgzf;
use flate2::read::GzDecoder;
use noodles::bgzf;
use pyo3::prelude::*;
use pyo3_stub_gen::derive::*;
use std::io;
use std::io::Read;

/// Represents different types of file compression formats
///
/// This enum is used to identify and handle various compression formats commonly used for files.
/// It can be used in Python through the deepbiop.utils module.
///
/// # Variants
///
/// * `Uncompress` - Uncompressed/raw file format
/// * `Gzip` - Standard gzip compression (.gz files)
/// * `Bgzip` - Blocked gzip format, commonly used in bioinformatics
/// * `Zip` - ZIP archive format
/// * `Bzip2` - bzip2 compression format
/// * `Xz` - XZ compression format (LZMA2)
/// * `Zstd` - Zstandard compression format
/// * `Unknown` - Unknown or unrecognized compression format
#[gen_stub_pyclass_enum]
#[pyclass(eq, eq_int, module = "deepbiop.utils")]
#[derive(Debug, PartialEq, Clone, Eq, Hash)]
Expand All @@ -29,6 +44,34 @@ pub enum CompressedType {
Unknown,
}

/// Determines the compression type of a file by examining its header/signature
///
/// This function reads the first few bytes of a file and checks for known magic numbers
/// or file signatures to identify the compression format used.
///
/// # Arguments
///
/// * `file_path` - Path to the file to check; accepts any type that implements `AsRef<Path>`
///
/// # Returns
///
/// * `Result<CompressedType>` - The detected compression type wrapped in a Result
///
/// # Errors
///
/// Returns an error if:
/// * The file cannot be opened
/// * There are issues reading the file header
///
/// # Examples
///
/// ```no_run
/// use deepbiop_utils::io::check_compressed_type;
/// use std::path::Path;
///
/// let file_path = Path::new("test.gz");
/// let compression = check_compressed_type(file_path).unwrap();
/// ```
pub fn check_compressed_type<P: AsRef<Path>>(file_path: P) -> Result<CompressedType> {
let mut file = File::open(file_path)?;
let mut buffer = [0u8; 18]; // Large enough for BGZF detection
Expand All @@ -45,11 +88,11 @@ pub fn check_compressed_type<P: AsRef<Path>>(file_path: P) -> Result<CompressedT
[0x1f, 0x8b, 0x08, 0x04, ..] if bytes_read >= 18 => {
// Check for BGZF extra field
let xlen = u16::from_le_bytes([buffer[10], buffer[11]]) as usize;
if xlen >= 6
&& buffer[12] == 0x42 // B
if xlen >= 6 && buffer[12] == 0x42 // B
&& buffer[13] == 0x43 // C
&& buffer[14] == 0x02 // Length of subfield (2)
&& buffer[15] == 0x00 // Length of subfield (2)
&& buffer[15] == 0x00
// Length of subfield (2)
{
Ok(CompressedType::Bgzip)
} else {
Expand Down Expand Up @@ -82,6 +125,23 @@ pub fn check_compressed_type<P: AsRef<Path>>(file_path: P) -> Result<CompressedT
}
}

/// Checks if a file is compressed by examining its file signature/magic numbers
///
/// # Arguments
/// * `file_path` - Path to the file to check
///
/// # Returns
/// * `Ok(true)` if the file is compressed (gzip, bgzip, zip, bzip2, xz, zstd)
/// * `Ok(false)` if the file is uncompressed or compression type is unknown
/// * `Err` if there was an error reading the file
///
/// # Example
/// ```no_run
/// use deepbiop_utils::io;
///
/// let is_compressed = io::is_compressed("file.gz").unwrap();
/// assert!(is_compressed);
/// ```
pub fn is_compressed<P: AsRef<Path>>(file_path: P) -> Result<bool> {
match check_compressed_type(file_path)? {
CompressedType::Uncompress => Ok(false),
Expand All @@ -90,7 +150,17 @@ pub fn is_compressed<P: AsRef<Path>>(file_path: P) -> Result<bool> {
}
}


/// Creates a reader for a file that may be compressed
///
/// This function detects the compression type of the file and returns an appropriate reader.
/// Currently supports uncompressed files, gzip, and bgzip formats.
///
/// # Arguments
/// * `file_path` - Path to the file to read, can be compressed or uncompressed
///
/// # Returns
/// * `Ok(Box<dyn io::Read>)` - A boxed reader appropriate for the file's compression
/// * `Err` - If the file cannot be opened or has an unsupported compression type
pub fn create_reader<P: AsRef<Path>>(file_path: P) -> Result<Box<dyn io::Read>> {
let compressed_type = check_compressed_type(file_path.as_ref())?;
let file = File::open(file_path)?;
Expand All @@ -103,28 +173,33 @@ pub fn create_reader<P: AsRef<Path>>(file_path: P) -> Result<Box<dyn io::Read>>
})
}


#[cfg(test)]
mod tests{
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;

#[test]
#[test]
fn test_check_file_type() -> Result<()> {
// Test gzip file
let mut gzip_file = NamedTempFile::new()?;
gzip_file.write_all(&[0x1f, 0x8b])?;
assert_eq!(check_compressed_type(gzip_file.path())?, CompressedType::Gzip);
assert_eq!(
check_compressed_type(gzip_file.path())?,
CompressedType::Gzip
);

// Test bgzip file
// Test bgzip file
let mut bgzip_file = NamedTempFile::new()?;
let bgzip_header = [
0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x06, 0x00, 0x42, 0x43, 0x02, 0x00, 0x00, 0x00
0x1f, 0x8b, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x42, 0x43,
0x02, 0x00, 0x00, 0x00,
];
bgzip_file.write_all(&bgzip_header)?;
assert_eq!(check_compressed_type(bgzip_file.path())?, CompressedType::Bgzip);
assert_eq!(
check_compressed_type(bgzip_file.path())?,
CompressedType::Bgzip
);

// Test zip file
let mut zip_file = NamedTempFile::new()?;
Expand All @@ -134,7 +209,10 @@ mod tests{
// Test bzip2 file
let mut bzip2_file = NamedTempFile::new()?;
bzip2_file.write_all(&[0x42, 0x5a, 0x68])?;
assert_eq!(check_compressed_type(bzip2_file.path())?, CompressedType::Bzip2);
assert_eq!(
check_compressed_type(bzip2_file.path())?,
CompressedType::Bzip2
);

// Test xz file
let mut xz_file = NamedTempFile::new()?;
Expand All @@ -144,12 +222,18 @@ mod tests{
// Test zstd file
let mut zstd_file = NamedTempFile::new()?;
zstd_file.write_all(&[0x28, 0xb5, 0x2f, 0xfd])?;
assert_eq!(check_compressed_type(zstd_file.path())?, CompressedType::Zstd);
assert_eq!(
check_compressed_type(zstd_file.path())?,
CompressedType::Zstd
);

// Test normal file
let mut normal_file = NamedTempFile::new()?;
normal_file.write_all(b"Hello world")?;
assert_eq!(check_compressed_type(normal_file.path())?, CompressedType::Uncompress);
assert_eq!(
check_compressed_type(normal_file.path())?,
CompressedType::Uncompress
);

Ok(())
}
Expand All @@ -169,7 +253,7 @@ mod tests{
Ok(())
}

#[test]
#[test]
fn test_real_example() -> Result<()> {
let test1 = "./tests/data/test.fastq.gz";
let test2 = "./tests/data/test.fastqbgz.gz";
Expand All @@ -180,4 +264,4 @@ mod tests{
assert_eq!(check_compressed_type(test3)?, CompressedType::Uncompress);
Ok(())
}
}
}
16 changes: 13 additions & 3 deletions crates/deepbiop-utils/src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,19 @@ fn generate_unmaped_intervals(
.collect()
}

/// Check the compression type of a file.
///
/// Args:
/// path: Path to the file to check
///
/// Returns:
/// The detected `CompressedType` of the file (Uncompress, Gzip, Bgzip, Zip, Bzip2, Xz, Zstd, or Unknown)
///
/// Raises:
/// IOError: If the file cannot be opened or read
#[gen_stub_pyfunction(module = "deepbiop.utils")]
#[pyfunction(name = "detect_compression")]
fn py_detect_compression(path: PathBuf) -> Result<io::CompressedType> {
#[pyfunction(name = "check_compressed_type")]
fn py_check_compressed_type(path: PathBuf) -> Result<io::CompressedType> {
io::check_compressed_type(path)
}

Expand All @@ -129,7 +139,7 @@ pub fn register_utils_module(parent_module: &Bound<'_, PyModule>) -> PyResult<()
&child_module
)?)?;
child_module.add_function(wrap_pyfunction!(generate_unmaped_intervals, &child_module)?)?;
child_module.add_function(wrap_pyfunction!(py_detect_compression, &child_module)?)?;
child_module.add_function(wrap_pyfunction!(py_check_compressed_type, &child_module)?)?;

parent_module.add_submodule(&child_module)?;
Ok(())
Expand Down
1 change: 1 addition & 0 deletions py-deepbiop/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ clean:

build: clean
uv sync
uv tool maturin develop -r
cargo run --bin stub_gen
ruff check --fix --unsafe-fixes
6 changes: 3 additions & 3 deletions py-deepbiop/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ def test_hight_targets():

def test_dectect_compressed_file():
file = "./tests/data/test.fastq"
result = utils.detect_compression(file)
result = utils.check_compressed_type(file)
assert result == utils.CompressedType.Uncompress

file = "./tests/data/test.fastq.gz"
result = utils.detect_compression(file)
result = utils.check_compressed_type(file)
assert result == utils.CompressedType.Gzip

file = "./tests/data/test.fastqbgz.gz"
result = utils.detect_compression(file)
result = utils.check_compressed_type(file)
assert result == utils.CompressedType.Bgzip

0 comments on commit 0ea69aa

Please sign in to comment.