From ee575536ba5e9eae84cf9d444887b12b748aed5e Mon Sep 17 00:00:00 2001
From: Yangyang Li <yangyang.li@northwestern.edu>
Date: Thu, 16 Jan 2025 13:50:02 -0600
Subject: [PATCH] refactor: Clean up code formatting and improve function
 signatures for consistency

---
 py-deepbiop/Makefile           |   2 +-
 py-deepbiop/deepbiop/bam.pyi   |   7 +-
 py-deepbiop/deepbiop/core.pyi  |  13 ++--
 py-deepbiop/deepbiop/fa.pyi    |  63 ++++++++--------
 py-deepbiop/deepbiop/fq.pyi    | 128 ++++++++++++++++-----------------
 py-deepbiop/deepbiop/utils.pyi |  71 +++++++++++-------
 py-deepbiop/tests/test_fq.py   |   2 -
 7 files changed, 152 insertions(+), 134 deletions(-)

diff --git a/py-deepbiop/Makefile b/py-deepbiop/Makefile
index f4deeea..7a76878 100644
--- a/py-deepbiop/Makefile
+++ b/py-deepbiop/Makefile
@@ -8,6 +8,6 @@ clean:
 
 build: clean
 	uv sync
-	uv tool maturin develop -r
+	uv tool run maturin develop -r
 	cargo run --bin stub_gen
 	ruff check --fix --unsafe-fixes
diff --git a/py-deepbiop/deepbiop/bam.pyi b/py-deepbiop/deepbiop/bam.pyi
index 3b312de..5e92eef 100644
--- a/py-deepbiop/deepbiop/bam.pyi
+++ b/py-deepbiop/deepbiop/bam.pyi
@@ -5,12 +5,11 @@ import os
 import pathlib
 import typing
 
-def count_chimeric_reads_for_path(bam,threads = ...) -> int:
+def count_chimeric_reads_for_path(bam, threads=...) -> int:
     r"""Calculate the number of chimeric reads in a BAM file."""
 
-def count_chimeric_reads_for_paths(bams,threads = ...) -> dict[str, int]:
+def count_chimeric_reads_for_paths(bams, threads=...) -> dict[str, int]:
     r"""Calculate the number of chimeric reads in multiple BAM files."""
 
-def left_right_soft_clip(cigar_string:str) -> tuple[int, int]:
+def left_right_soft_clip(cigar_string: str) -> tuple[int, int]:
     r"""Calculate left and right soft clips from a cigar string."""
-
diff --git a/py-deepbiop/deepbiop/core.pyi b/py-deepbiop/deepbiop/core.pyi
index a17561d..fdc7af6 100644
--- a/py-deepbiop/deepbiop/core.pyi
+++ b/py-deepbiop/deepbiop/core.pyi
@@ -3,7 +3,7 @@
 
 import typing
 
-def generate_kmers(base:str,k:int) -> list[str]:
+def generate_kmers(base: str, k: int) -> list[str]:
     r"""
     Generate all possible k-mers from a set of base characters.
 
@@ -20,7 +20,7 @@ def generate_kmers(base:str,k:int) -> list[str]:
     A vector containing all possible k-mer combinations as strings
     """
 
-def generate_kmers_table(base:str,k:int) -> dict[list[int], int]:
+def generate_kmers_table(base: str, k: int) -> dict[list[int], int]:
     r"""
     Generate a lookup table mapping k-mers to unique IDs.
 
@@ -37,7 +37,7 @@ def generate_kmers_table(base:str,k:int) -> dict[list[int], int]:
     A HashMap mapping k-mer byte sequences to integer IDs
     """
 
-def kmers_to_seq(kmers:typing.Sequence[str]) -> str:
+def kmers_to_seq(kmers: typing.Sequence[str]) -> str:
     r"""
     Convert k-mers back into a DNA sequence.
 
@@ -53,7 +53,7 @@ def kmers_to_seq(kmers:typing.Sequence[str]) -> str:
     The reconstructed DNA sequence as a `String`, wrapped in a `Result`
     """
 
-def normalize_seq(seq:str,iupac:bool) -> str:
+def normalize_seq(seq: str, iupac: bool) -> str:
     r"""
     Normalize a DNA sequence by converting any non-standard nucleotides to standard ones.
 
@@ -70,7 +70,7 @@ def normalize_seq(seq:str,iupac:bool) -> str:
     A normalized DNA sequence as a `String`.
     """
 
-def reverse_complement(seq:str) -> str:
+def reverse_complement(seq: str) -> str:
     r"""
     Generate the reverse complement of a DNA sequence.
 
@@ -97,7 +97,7 @@ def reverse_complement(seq:str) -> str:
     ```
     """
 
-def seq_to_kmers(seq:str,k:int,overlap:bool) -> list[str]:
+def seq_to_kmers(seq: str, k: int, overlap: bool) -> list[str]:
     r"""
     Convert a DNA sequence into k-mers.
 
@@ -114,4 +114,3 @@ def seq_to_kmers(seq:str,k:int,overlap:bool) -> list[str]:
 
     A vector of k-mers as `String`s
     """
-
diff --git a/py-deepbiop/deepbiop/fa.pyi b/py-deepbiop/deepbiop/fa.pyi
index e0fd0f9..3f393bc 100644
--- a/py-deepbiop/deepbiop/fa.pyi
+++ b/py-deepbiop/deepbiop/fa.pyi
@@ -26,7 +26,7 @@ class EncoderOption:
     """
 
     bases: list[int]
-    def __new__(cls,bases): ...
+    def __new__(cls, bases): ...
 
 class ParquetEncoder:
     r"""
@@ -49,37 +49,38 @@ class ParquetEncoder:
     ```
     """
 
-    def __new__(cls,option:EncoderOption): ...
+    def __new__(cls, option: EncoderOption): ...
 
 class RecordData:
     id: str
     seq: str
-    def __new__(cls,id:str, seq:str): ...
-    def set_id(self, id:str) -> None:
-        ...
-
-    def set_seq(self, seq:str) -> None:
-        ...
-
-
-def convert_multiple_fas_to_one_fa(paths:typing.Sequence[str | os.PathLike | pathlib.Path],result_path:str | os.PathLike | pathlib.Path,parallel:bool) -> None:
-    ...
-
-def encode_fa_path_to_parquet(fa_path,bases,result_path = ...) -> None:
-    ...
-
-def encode_fa_path_to_parquet_chunk(fa_path:str | os.PathLike | pathlib.Path,chunk_size:int,parallel:bool,bases:str) -> None:
-    ...
-
-def encode_fa_paths_to_parquet(fa_path:typing.Sequence[str | os.PathLike | pathlib.Path],bases:str) -> None:
-    ...
-
-def select_record_from_fa(selected_reads:typing.Sequence[str],fq:str | os.PathLike | pathlib.Path,output:str | os.PathLike | pathlib.Path) -> None:
-    ...
-
-def write_fa(records_data,file_path = ...) -> None:
-    ...
-
-def write_fa_parallel(records_data:typing.Sequence[RecordData],file_path:str | os.PathLike | pathlib.Path,threads:int) -> None:
-    ...
-
+    def __new__(cls, id: str, seq: str): ...
+    def set_id(self, id: str) -> None: ...
+    def set_seq(self, seq: str) -> None: ...
+
+def convert_multiple_fas_to_one_fa(
+    paths: typing.Sequence[str | os.PathLike | pathlib.Path],
+    result_path: str | os.PathLike | pathlib.Path,
+    parallel: bool,
+) -> None: ...
+def encode_fa_path_to_parquet(fa_path, bases, result_path=...) -> None: ...
+def encode_fa_path_to_parquet_chunk(
+    fa_path: str | os.PathLike | pathlib.Path,
+    chunk_size: int,
+    parallel: bool,
+    bases: str,
+) -> None: ...
+def encode_fa_paths_to_parquet(
+    fa_path: typing.Sequence[str | os.PathLike | pathlib.Path], bases: str
+) -> None: ...
+def select_record_from_fa(
+    selected_reads: typing.Sequence[str],
+    fq: str | os.PathLike | pathlib.Path,
+    output: str | os.PathLike | pathlib.Path,
+) -> None: ...
+def write_fa(records_data, file_path=...) -> None: ...
+def write_fa_parallel(
+    records_data: typing.Sequence[RecordData],
+    file_path: str | os.PathLike | pathlib.Path,
+    threads: int,
+) -> None: ...
diff --git a/py-deepbiop/deepbiop/fq.pyi b/py-deepbiop/deepbiop/fq.pyi
index 826517c..a261301 100644
--- a/py-deepbiop/deepbiop/fq.pyi
+++ b/py-deepbiop/deepbiop/fq.pyi
@@ -9,10 +9,10 @@ class EncoderOption:
     qual_offset: int
     bases: list[int]
     threads: int
-    def __new__(cls,qual_offset,bases,threads = ...): ...
+    def __new__(cls, qual_offset, bases, threads=...): ...
 
 class ParquetEncoder:
-    def __new__(cls,option:EncoderOption): ...
+    def __new__(cls, option: EncoderOption): ...
 
 class Predict:
     r"""A struct to store the prediction result."""
@@ -22,20 +22,23 @@ class Predict:
     id: str
     is_truncated: bool
     qual: str | None
-    def __new__(cls,prediction,seq,id,is_truncated,qual = ...): ...
-    def __repr__(self) -> str:
-        ...
-
+    def __new__(cls, prediction, seq, id, is_truncated, qual=...): ...
+    def __repr__(self) -> str: ...
     def prediction_region(self) -> list[tuple[int, int]]:
         r"""Get the prediction region."""
 
-    def smooth_prediction(self, window_size:int) -> list[tuple[int, int]]:
+    def smooth_prediction(self, window_size: int) -> list[tuple[int, int]]:
         r"""Get the smooth prediction region."""
 
-    def smooth_label(self, window_size:int) -> list[int]:
+    def smooth_label(self, window_size: int) -> list[int]:
         r"""Get the smooth label."""
 
-    def smooth_and_select_intervals(self, smooth_window_size:int, min_interval_size:int, append_interval_number:int) -> list[tuple[int, int]]:
+    def smooth_and_select_intervals(
+        self,
+        smooth_window_size: int,
+        min_interval_size: int,
+        append_interval_number: int,
+    ) -> list[tuple[int, int]]:
         r"""Smooth and select intervals."""
 
     def seq_len(self) -> int:
@@ -44,67 +47,64 @@ class Predict:
     def qual_array(self) -> list[int]:
         r"""Get the quality score array."""
 
-    def show_info(self, smooth_interval,text_width = ...) -> str:
+    def show_info(self, smooth_interval, text_width=...) -> str:
         r"""Show the information of the prediction."""
 
-    def __getstate__(self) -> typing.Any:
-        ...
-
-    def __setstate__(self, state:typing.Any) -> None:
-        ...
-
+    def __getstate__(self) -> typing.Any: ...
+    def __setstate__(self, state: typing.Any) -> None: ...
 
 class RecordData:
     id: str
     seq: str
     qual: str
-    def __new__(cls,id:str, seq:str, qual:str): ...
-    def set_id(self, id:str) -> None:
-        ...
-
-    def set_seq(self, seq:str) -> None:
-        ...
-
-    def set_qual(self, qual:str) -> None:
-        ...
-
-
-def convert_multiple_fqs_to_one_fq(paths:typing.Sequence[str | os.PathLike | pathlib.Path],result_path:str | os.PathLike | pathlib.Path,parallel:bool) -> None:
-    ...
-
-def encode_fq_path_to_parquet(fq_path,bases,qual_offset,result_path = ...) -> None:
-    ...
-
-def encode_fq_path_to_parquet_chunk(fq_path:str | os.PathLike | pathlib.Path,chunk_size:int,parallel:bool,bases:str,qual_offset:int) -> None:
-    ...
-
-def encode_fq_paths_to_parquet(fq_path:typing.Sequence[str | os.PathLike | pathlib.Path],bases:str,qual_offset:int) -> None:
-    ...
-
-def encode_qual(qual:str,qual_offset:int) -> list[int]:
+    def __new__(cls, id: str, seq: str, qual: str): ...
+    def set_id(self, id: str) -> None: ...
+    def set_seq(self, seq: str) -> None: ...
+    def set_qual(self, qual: str) -> None: ...
+
+def convert_multiple_fqs_to_one_fq(
+    paths: typing.Sequence[str | os.PathLike | pathlib.Path],
+    result_path: str | os.PathLike | pathlib.Path,
+    parallel: bool,
+) -> None: ...
+def encode_fq_path_to_parquet(fq_path, bases, qual_offset, result_path=...) -> None: ...
+def encode_fq_path_to_parquet_chunk(
+    fq_path: str | os.PathLike | pathlib.Path,
+    chunk_size: int,
+    parallel: bool,
+    bases: str,
+    qual_offset: int,
+) -> None: ...
+def encode_fq_paths_to_parquet(
+    fq_path: typing.Sequence[str | os.PathLike | pathlib.Path],
+    bases: str,
+    qual_offset: int,
+) -> None: ...
+def encode_qual(qual: str, qual_offset: int) -> list[int]:
     r"""Convert ASCII quality to Phred score for Phred+33 encoding."""
 
-def fastq_to_fasta(fastq_path:str | os.PathLike | pathlib.Path,fasta_path:str | os.PathLike | pathlib.Path) -> None:
-    ...
-
-def get_label_region(labels:typing.Sequence[int]) -> list[tuple[int, int]]:
-    ...
-
-def load_predicts_from_batch_pt(pt_path:str | os.PathLike | pathlib.Path,ignore_label:int,id_table:typing.Mapping[int, str]) -> dict[str, Predict]:
-    ...
-
-def load_predicts_from_batch_pts(pt_path,ignore_label,id_table,max_predicts = ...) -> dict[str, Predict]:
-    ...
-
-def select_record_from_fq(selected_reads:typing.Sequence[str],fq:str | os.PathLike | pathlib.Path,output:str | os.PathLike | pathlib.Path) -> None:
-    ...
-
-def test_predicts(predicts:typing.Sequence[Predict]) -> None:
-    ...
-
-def write_fq(records_data,file_path = ...) -> None:
-    ...
-
-def write_fq_parallel(records_data:typing.Sequence[RecordData],file_path:str | os.PathLike | pathlib.Path,threads:int) -> None:
-    ...
-
+def fastq_to_fasta(
+    fastq_path: str | os.PathLike | pathlib.Path,
+    fasta_path: str | os.PathLike | pathlib.Path,
+) -> None: ...
+def get_label_region(labels: typing.Sequence[int]) -> list[tuple[int, int]]: ...
+def load_predicts_from_batch_pt(
+    pt_path: str | os.PathLike | pathlib.Path,
+    ignore_label: int,
+    id_table: typing.Mapping[int, str],
+) -> dict[str, Predict]: ...
+def load_predicts_from_batch_pts(
+    pt_path, ignore_label, id_table, max_predicts=...
+) -> dict[str, Predict]: ...
+def select_record_from_fq(
+    selected_reads: typing.Sequence[str],
+    fq: str | os.PathLike | pathlib.Path,
+    output: str | os.PathLike | pathlib.Path,
+) -> None: ...
+def test_predicts(predicts: typing.Sequence[Predict]) -> None: ...
+def write_fq(records_data, file_path=...) -> None: ...
+def write_fq_parallel(
+    records_data: typing.Sequence[RecordData],
+    file_path: str | os.PathLike | pathlib.Path,
+    threads: int,
+) -> None: ...
diff --git a/py-deepbiop/deepbiop/utils.pyi b/py-deepbiop/deepbiop/utils.pyi
index d2e4eac..cd7fd8a 100644
--- a/py-deepbiop/deepbiop/utils.pyi
+++ b/py-deepbiop/deepbiop/utils.pyi
@@ -15,16 +15,10 @@ class GenomicInterval:
     start: int
     end: int
     chr: str
-    def __new__(cls,chr:str, start:int, end:int): ...
-    def set_chr(self, chr:str) -> None:
-        ...
-
-    def overlap(self, other:GenomicInterval) -> bool:
-        ...
-
-    def __repr__(self) -> str:
-        ...
-
+    def __new__(cls, chr: str, start: int, end: int): ...
+    def set_chr(self, chr: str) -> None: ...
+    def overlap(self, other: GenomicInterval) -> bool: ...
+    def __repr__(self) -> str: ...
 
 class PslAlignment:
     qname: str
@@ -37,11 +31,27 @@ class PslAlignment:
     tstart: int
     tend: int
     identity: float
-    def __repr__(self) -> str:
-        ...
-
+    def __repr__(self) -> str: ...
 
 class CompressedType(Enum):
+    r"""
+    Represents different types of file compression formats.
+
+    This enum is used to identify and handle various compression formats commonly used for files.
+    It can be used in Python through the deepbiop.utils module.
+
+    # Variants
+
+    * `Uncompress` - Uncompressed/raw file format
+    * `Gzip` - Standard gzip compression (.gz files)
+    * `Bgzip` - Blocked gzip format, commonly used in bioinformatics
+    * `Zip` - ZIP archive format
+    * `Bzip2` - bzip2 compression format
+    * `Xz` - XZ compression format (LZMA2)
+    * `Zstd` - Zstandard compression format
+    * `Unknown` - Unknown or unrecognized compression format
+    """
+
     Uncompress = auto()
     Gzip = auto()
     Bgzip = auto()
@@ -51,21 +61,32 @@ class CompressedType(Enum):
     Zstd = auto()
     Unknown = auto()
 
-def detect_compression(path:str | os.PathLike | pathlib.Path) -> CompressedType:
-    ...
+def check_compressed_type(path: str | os.PathLike | pathlib.Path) -> CompressedType:
+    r"""
+    Check the compression type of a file.
 
-def generate_unmaped_intervals(input:typing.Sequence[tuple[int, int]],total_length:int) -> list[tuple[int, int]]:
-    ...
+    Args:
+        path: Path to the file to check
 
-def highlight_targets(sequence,targets,text_width = ...) -> str:
-    ...
+    Returns
+    -------
+        The detected `CompressedType` variant (e.g. Uncompress, Gzip, Bzip2, Xz, Unknown)
 
-def majority_voting(labels:typing.Sequence[int],window_size:int) -> list[int]:
-    ...
+    Raises
+    ------
+        IOError: If the file cannot be opened or read
+    """
 
-def parse_psl_by_qname(file_path:str | os.PathLike | pathlib.Path) -> dict[str, list[PslAlignment]]:
+def generate_unmaped_intervals(
+    input: typing.Sequence[tuple[int, int]], total_length: int
+) -> list[tuple[int, int]]: ...
+def highlight_targets(sequence, targets, text_width=...) -> str: ...
+def majority_voting(labels: typing.Sequence[int], window_size: int) -> list[int]: ...
+def parse_psl_by_qname(
+    file_path: str | os.PathLike | pathlib.Path,
+) -> dict[str, list[PslAlignment]]:
     r"""Parse PSL file by query name."""
 
-def remove_intervals_and_keep_left(seq:str,intervals:typing.Sequence[tuple[int, int]]) -> tuple[list[str], list[tuple[int, int]]]:
-    ...
-
+def remove_intervals_and_keep_left(
+    seq: str, intervals: typing.Sequence[tuple[int, int]]
+) -> tuple[list[str], list[tuple[int, int]]]: ...
diff --git a/py-deepbiop/tests/test_fq.py b/py-deepbiop/tests/test_fq.py
index cfd8e15..f1456e8 100644
--- a/py-deepbiop/tests/test_fq.py
+++ b/py-deepbiop/tests/test_fq.py
@@ -1,4 +1,2 @@
-
-
 def test_read_fq():
     pass