From ee575536ba5e9eae84cf9d444887b12b748aed5e Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Thu, 16 Jan 2025 13:50:02 -0600 Subject: [PATCH] refactor: Clean up code formatting and improve function signatures for consistency --- py-deepbiop/Makefile | 2 +- py-deepbiop/deepbiop/bam.pyi | 7 +- py-deepbiop/deepbiop/core.pyi | 13 ++-- py-deepbiop/deepbiop/fa.pyi | 63 ++++++++-------- py-deepbiop/deepbiop/fq.pyi | 128 ++++++++++++++++----------------- py-deepbiop/deepbiop/utils.pyi | 71 +++++++++++------- py-deepbiop/tests/test_fq.py | 2 - 7 files changed, 152 insertions(+), 134 deletions(-) diff --git a/py-deepbiop/Makefile b/py-deepbiop/Makefile index f4deeea..7a76878 100644 --- a/py-deepbiop/Makefile +++ b/py-deepbiop/Makefile @@ -8,6 +8,6 @@ clean: build: clean uv sync - uv tool maturin develop -r + uv tool run maturin develop -r cargo run --bin stub_gen ruff check --fix --unsafe-fixes diff --git a/py-deepbiop/deepbiop/bam.pyi b/py-deepbiop/deepbiop/bam.pyi index 3b312de..5e92eef 100644 --- a/py-deepbiop/deepbiop/bam.pyi +++ b/py-deepbiop/deepbiop/bam.pyi @@ -5,12 +5,11 @@ import os import pathlib import typing -def count_chimeric_reads_for_path(bam,threads = ...) -> int: +def count_chimeric_reads_for_path(bam, threads=...) -> int: r"""Calculate the number of chimeric reads in a BAM file.""" -def count_chimeric_reads_for_paths(bams,threads = ...) -> dict[str, int]: +def count_chimeric_reads_for_paths(bams, threads=...) -> dict[str, int]: r"""Calculate the number of chimeric reads in multiple BAM files.""" -def left_right_soft_clip(cigar_string:str) -> tuple[int, int]: +def left_right_soft_clip(cigar_string: str) -> tuple[int, int]: r"""Calculate left and right soft clips from a cigar string.""" - diff --git a/py-deepbiop/deepbiop/core.pyi b/py-deepbiop/deepbiop/core.pyi index a17561d..fdc7af6 100644 --- a/py-deepbiop/deepbiop/core.pyi +++ b/py-deepbiop/deepbiop/core.pyi @@ -3,7 +3,7 @@ import typing -def generate_kmers(base:str,k:int) -> list[str]: +def generate_kmers(base: str, k: int) -> list[str]: r""" Generate all possible k-mers from a set of base characters. @@ -20,7 +20,7 @@ def generate_kmers(base:str,k:int) -> list[str]: A vector containing all possible k-mer combinations as strings """ -def generate_kmers_table(base:str,k:int) -> dict[list[int], int]: +def generate_kmers_table(base: str, k: int) -> dict[list[int], int]: r""" Generate a lookup table mapping k-mers to unique IDs. @@ -37,7 +37,7 @@ def generate_kmers_table(base:str,k:int) -> dict[list[int], int]: A HashMap mapping k-mer byte sequences to integer IDs """ -def kmers_to_seq(kmers:typing.Sequence[str]) -> str: +def kmers_to_seq(kmers: typing.Sequence[str]) -> str: r""" Convert k-mers back into a DNA sequence. @@ -53,7 +53,7 @@ def kmers_to_seq(kmers:typing.Sequence[str]) -> str: The reconstructed DNA sequence as a `String`, wrapped in a `Result` """ -def normalize_seq(seq:str,iupac:bool) -> str: +def normalize_seq(seq: str, iupac: bool) -> str: r""" Normalize a DNA sequence by converting any non-standard nucleotides to standard ones. @@ -70,7 +70,7 @@ def normalize_seq(seq:str,iupac:bool) -> str: A normalized DNA sequence as a `String`. """ -def reverse_complement(seq:str) -> str: +def reverse_complement(seq: str) -> str: r""" Generate the reverse complement of a DNA sequence. @@ -97,7 +97,7 @@ def reverse_complement(seq:str) -> str: ``` """ -def seq_to_kmers(seq:str,k:int,overlap:bool) -> list[str]: +def seq_to_kmers(seq: str, k: int, overlap: bool) -> list[str]: r""" Convert a DNA sequence into k-mers. @@ -114,4 +114,3 @@ def seq_to_kmers(seq:str,k:int,overlap:bool) -> list[str]: A vector of k-mers as `String`s """ - diff --git a/py-deepbiop/deepbiop/fa.pyi b/py-deepbiop/deepbiop/fa.pyi index e0fd0f9..3f393bc 100644 --- a/py-deepbiop/deepbiop/fa.pyi +++ b/py-deepbiop/deepbiop/fa.pyi @@ -26,7 +26,7 @@ class EncoderOption: """ bases: list[int] - def __new__(cls,bases): ... + def __new__(cls, bases): ... class ParquetEncoder: r""" @@ -49,37 +49,38 @@ class ParquetEncoder: ``` """ - def __new__(cls,option:EncoderOption): ... + def __new__(cls, option: EncoderOption): ... class RecordData: id: str seq: str - def __new__(cls,id:str, seq:str): ... - def set_id(self, id:str) -> None: - ... - - def set_seq(self, seq:str) -> None: - ... - - -def convert_multiple_fas_to_one_fa(paths:typing.Sequence[str | os.PathLike | pathlib.Path],result_path:str | os.PathLike | pathlib.Path,parallel:bool) -> None: - ... - -def encode_fa_path_to_parquet(fa_path,bases,result_path = ...) -> None: - ... - -def encode_fa_path_to_parquet_chunk(fa_path:str | os.PathLike | pathlib.Path,chunk_size:int,parallel:bool,bases:str) -> None: - ... - -def encode_fa_paths_to_parquet(fa_path:typing.Sequence[str | os.PathLike | pathlib.Path],bases:str) -> None: - ... - -def select_record_from_fa(selected_reads:typing.Sequence[str],fq:str | os.PathLike | pathlib.Path,output:str | os.PathLike | pathlib.Path) -> None: - ... - -def write_fa(records_data,file_path = ...) -> None: - ... - -def write_fa_parallel(records_data:typing.Sequence[RecordData],file_path:str | os.PathLike | pathlib.Path,threads:int) -> None: - ... - + def __new__(cls, id: str, seq: str): ... + def set_id(self, id: str) -> None: ... + def set_seq(self, seq: str) -> None: ... + +def convert_multiple_fas_to_one_fa( + paths: typing.Sequence[str | os.PathLike | pathlib.Path], + result_path: str | os.PathLike | pathlib.Path, + parallel: bool, +) -> None: ... +def encode_fa_path_to_parquet(fa_path, bases, result_path=...) -> None: ... +def encode_fa_path_to_parquet_chunk( + fa_path: str | os.PathLike | pathlib.Path, + chunk_size: int, + parallel: bool, + bases: str, +) -> None: ... +def encode_fa_paths_to_parquet( + fa_path: typing.Sequence[str | os.PathLike | pathlib.Path], bases: str +) -> None: ... +def select_record_from_fa( + selected_reads: typing.Sequence[str], + fq: str | os.PathLike | pathlib.Path, + output: str | os.PathLike | pathlib.Path, +) -> None: ... +def write_fa(records_data, file_path=...) -> None: ... +def write_fa_parallel( + records_data: typing.Sequence[RecordData], + file_path: str | os.PathLike | pathlib.Path, + threads: int, +) -> None: ... diff --git a/py-deepbiop/deepbiop/fq.pyi b/py-deepbiop/deepbiop/fq.pyi index 826517c..a261301 100644 --- a/py-deepbiop/deepbiop/fq.pyi +++ b/py-deepbiop/deepbiop/fq.pyi @@ -9,10 +9,10 @@ class EncoderOption: qual_offset: int bases: list[int] threads: int - def __new__(cls,qual_offset,bases,threads = ...): ... + def __new__(cls, qual_offset, bases, threads=...): ... class ParquetEncoder: - def __new__(cls,option:EncoderOption): ... + def __new__(cls, option: EncoderOption): ... class Predict: r"""A struct to store the prediction result.""" @@ -22,20 +22,23 @@ class Predict: id: str is_truncated: bool qual: str | None - def __new__(cls,prediction,seq,id,is_truncated,qual = ...): ... - def __repr__(self) -> str: - ... - + def __new__(cls, prediction, seq, id, is_truncated, qual=...): ... + def __repr__(self) -> str: ... def prediction_region(self) -> list[tuple[int, int]]: r"""Get the prediction region.""" - def smooth_prediction(self, window_size:int) -> list[tuple[int, int]]: + def smooth_prediction(self, window_size: int) -> list[tuple[int, int]]: r"""Get the smooth prediction region.""" - def smooth_label(self, window_size:int) -> list[int]: + def smooth_label(self, window_size: int) -> list[int]: r"""Get the smooth label.""" - def smooth_and_select_intervals(self, smooth_window_size:int, min_interval_size:int, append_interval_number:int) -> list[tuple[int, int]]: + def smooth_and_select_intervals( + self, + smooth_window_size: int, + min_interval_size: int, + append_interval_number: int, + ) -> list[tuple[int, int]]: r"""Smooth and select intervals.""" def seq_len(self) -> int: @@ -44,67 +47,64 @@ class Predict: def qual_array(self) -> list[int]: r"""Get the quality score array.""" - def show_info(self, smooth_interval,text_width = ...) -> str: + def show_info(self, smooth_interval, text_width=...) -> str: r"""Show the information of the prediction.""" - def __getstate__(self) -> typing.Any: - ... - - def __setstate__(self, state:typing.Any) -> None: - ... - + def __getstate__(self) -> typing.Any: ... + def __setstate__(self, state: typing.Any) -> None: ... class RecordData: id: str seq: str qual: str - def __new__(cls,id:str, seq:str, qual:str): ... - def set_id(self, id:str) -> None: - ... - - def set_seq(self, seq:str) -> None: - ... - - def set_qual(self, qual:str) -> None: - ... - - -def convert_multiple_fqs_to_one_fq(paths:typing.Sequence[str | os.PathLike | pathlib.Path],result_path:str | os.PathLike | pathlib.Path,parallel:bool) -> None: - ... - -def encode_fq_path_to_parquet(fq_path,bases,qual_offset,result_path = ...) -> None: - ... - -def encode_fq_path_to_parquet_chunk(fq_path:str | os.PathLike | pathlib.Path,chunk_size:int,parallel:bool,bases:str,qual_offset:int) -> None: - ... - -def encode_fq_paths_to_parquet(fq_path:typing.Sequence[str | os.PathLike | pathlib.Path],bases:str,qual_offset:int) -> None: - ... - -def encode_qual(qual:str,qual_offset:int) -> list[int]: + def __new__(cls, id: str, seq: str, qual: str): ... + def set_id(self, id: str) -> None: ... + def set_seq(self, seq: str) -> None: ... + def set_qual(self, qual: str) -> None: ... + +def convert_multiple_fqs_to_one_fq( + paths: typing.Sequence[str | os.PathLike | pathlib.Path], + result_path: str | os.PathLike | pathlib.Path, + parallel: bool, +) -> None: ... +def encode_fq_path_to_parquet(fq_path, bases, qual_offset, result_path=...) -> None: ... +def encode_fq_path_to_parquet_chunk( + fq_path: str | os.PathLike | pathlib.Path, + chunk_size: int, + parallel: bool, + bases: str, + qual_offset: int, +) -> None: ... +def encode_fq_paths_to_parquet( + fq_path: typing.Sequence[str | os.PathLike | pathlib.Path], + bases: str, + qual_offset: int, +) -> None: ... +def encode_qual(qual: str, qual_offset: int) -> list[int]: r"""Convert ASCII quality to Phred score for Phred+33 encoding.""" -def fastq_to_fasta(fastq_path:str | os.PathLike | pathlib.Path,fasta_path:str | os.PathLike | pathlib.Path) -> None: - ... - -def get_label_region(labels:typing.Sequence[int]) -> list[tuple[int, int]]: - ... - -def load_predicts_from_batch_pt(pt_path:str | os.PathLike | pathlib.Path,ignore_label:int,id_table:typing.Mapping[int, str]) -> dict[str, Predict]: - ... - -def load_predicts_from_batch_pts(pt_path,ignore_label,id_table,max_predicts = ...) -> dict[str, Predict]: - ... - -def select_record_from_fq(selected_reads:typing.Sequence[str],fq:str | os.PathLike | pathlib.Path,output:str | os.PathLike | pathlib.Path) -> None: - ... - -def test_predicts(predicts:typing.Sequence[Predict]) -> None: - ... - -def write_fq(records_data,file_path = ...) -> None: - ... - -def write_fq_parallel(records_data:typing.Sequence[RecordData],file_path:str | os.PathLike | pathlib.Path,threads:int) -> None: - ... - +def fastq_to_fasta( + fastq_path: str | os.PathLike | pathlib.Path, + fasta_path: str | os.PathLike | pathlib.Path, +) -> None: ... +def get_label_region(labels: typing.Sequence[int]) -> list[tuple[int, int]]: ... +def load_predicts_from_batch_pt( + pt_path: str | os.PathLike | pathlib.Path, + ignore_label: int, + id_table: typing.Mapping[int, str], +) -> dict[str, Predict]: ... +def load_predicts_from_batch_pts( + pt_path, ignore_label, id_table, max_predicts=... +) -> dict[str, Predict]: ... +def select_record_from_fq( + selected_reads: typing.Sequence[str], + fq: str | os.PathLike | pathlib.Path, + output: str | os.PathLike | pathlib.Path, +) -> None: ... +def test_predicts(predicts: typing.Sequence[Predict]) -> None: ... +def write_fq(records_data, file_path=...) -> None: ... +def write_fq_parallel( + records_data: typing.Sequence[RecordData], + file_path: str | os.PathLike | pathlib.Path, + threads: int, +) -> None: ... diff --git a/py-deepbiop/deepbiop/utils.pyi b/py-deepbiop/deepbiop/utils.pyi index d2e4eac..cd7fd8a 100644 --- a/py-deepbiop/deepbiop/utils.pyi +++ b/py-deepbiop/deepbiop/utils.pyi @@ -15,16 +15,10 @@ class GenomicInterval: start: int end: int chr: str - def __new__(cls,chr:str, start:int, end:int): ... - def set_chr(self, chr:str) -> None: - ... - - def overlap(self, other:GenomicInterval) -> bool: - ... - - def __repr__(self) -> str: - ... - + def __new__(cls, chr: str, start: int, end: int): ... + def set_chr(self, chr: str) -> None: ... + def overlap(self, other: GenomicInterval) -> bool: ... + def __repr__(self) -> str: ... class PslAlignment: qname: str @@ -37,11 +31,27 @@ class PslAlignment: tstart: int tend: int identity: float - def __repr__(self) -> str: - ... - + def __repr__(self) -> str: ... class CompressedType(Enum): + r""" + Represents different types of file compression formats. + + This enum is used to identify and handle various compression formats commonly used for files. + It can be used in Python through the deepbiop.utils module. + + # Variants + + * `Uncompress` - Uncompressed/raw file format + * `Gzip` - Standard gzip compression (.gz files) + * `Bgzip` - Blocked gzip format, commonly used in bioinformatics + * `Zip` - ZIP archive format + * `Bzip2` - bzip2 compression format + * `Xz` - XZ compression format (LZMA2) + * `Zstd` - Zstandard compression format + * `Unknown` - Unknown or unrecognized compression format + """ + Uncompress = auto() Gzip = auto() Bgzip = auto() @@ -51,21 +61,32 @@ class CompressedType(Enum): Zstd = auto() Unknown = auto() -def detect_compression(path:str | os.PathLike | pathlib.Path) -> CompressedType: - ... +def check_compressed_type(path: str | os.PathLike | pathlib.Path) -> CompressedType: + r""" + Check the compression type of a file. -def generate_unmaped_intervals(input:typing.Sequence[tuple[int, int]],total_length:int) -> list[tuple[int, int]]: - ... + Args: + path: Path to the file to check -def highlight_targets(sequence,targets,text_width = ...) -> str: - ... + Returns + ------- + The compression type of the file (None, Gzip, Bzip2, Xz) -def majority_voting(labels:typing.Sequence[int],window_size:int) -> list[int]: - ... + Raises + ------ + IOError: If the file cannot be opened or read + """ -def parse_psl_by_qname(file_path:str | os.PathLike | pathlib.Path) -> dict[str, list[PslAlignment]]: +def generate_unmaped_intervals( + input: typing.Sequence[tuple[int, int]], total_length: int +) -> list[tuple[int, int]]: ... +def highlight_targets(sequence, targets, text_width=...) -> str: ... +def majority_voting(labels: typing.Sequence[int], window_size: int) -> list[int]: ... +def parse_psl_by_qname( + file_path: str | os.PathLike | pathlib.Path, +) -> dict[str, list[PslAlignment]]: r"""Parse PSL file by query name.""" -def remove_intervals_and_keep_left(seq:str,intervals:typing.Sequence[tuple[int, int]]) -> tuple[list[str], list[tuple[int, int]]]: - ... - +def remove_intervals_and_keep_left( + seq: str, intervals: typing.Sequence[tuple[int, int]] +) -> tuple[list[str], list[tuple[int, int]]]: ... diff --git a/py-deepbiop/tests/test_fq.py b/py-deepbiop/tests/test_fq.py index cfd8e15..f1456e8 100644 --- a/py-deepbiop/tests/test_fq.py +++ b/py-deepbiop/tests/test_fq.py @@ -1,4 +1,2 @@ - - def test_read_fq(): pass