From e5bd212a8a1a9be3bd843e6109795c628d134816 Mon Sep 17 00:00:00 2001 From: dagou Date: Fri, 13 Sep 2024 09:29:06 +0800 Subject: [PATCH] refactor --- .github/workflows/rust.yml | 139 ------ .gitignore | 1 + .vscode/settings.json | 3 +- Cargo.toml | 43 +- README.md | 17 +- {kr2r/docs => docs}/KunPeng.png | Bin {kr2r/docs => docs}/Picture1.png | Bin {kr2r/docs => docs}/Picture2.png | Bin .../build_and_classify.rs | 5 +- kr2r/Cargo.toml | 40 -- kr2r/README.md | 394 ------------------ seqkmer/Cargo.toml | 21 - seqkmer/src/fasta.rs | 258 ------------ seqkmer/src/fastq.rs | 190 --------- seqkmer/src/fastx.rs | 67 --- seqkmer/src/feat.rs | 202 --------- seqkmer/src/lib.rs | 20 - seqkmer/src/mmscanner.rs | 266 ------------ seqkmer/src/parallel.rs | 236 ----------- seqkmer/src/reader.rs | 202 --------- seqkmer/src/seq.rs | 37 -- seqkmer/src/utils.rs | 94 ----- {kr2r/src => src}/args.rs | 13 +- {kr2r/src => src}/bin/annotate.rs | 0 {kr2r/src => src}/bin/build_k2_db.rs | 0 {kr2r/src => src}/bin/chunk_db.rs | 0 {kr2r/src => src}/bin/direct.rs | 0 {kr2r/src => src}/bin/estimate_capacity.rs | 0 {kr2r/src => src}/bin/hashshard.rs | 0 {kr2r/src => src}/bin/kun.rs | 0 {kr2r/src => src}/bin/merge_fna.rs | 0 {kr2r/src => src}/bin/resolve.rs | 0 {kr2r/src => src}/bin/splitr.rs | 0 {kr2r/src => src}/classify.rs | 51 +++ {kr2r/src => src}/compact_hash.rs | 159 ++++++- {kr2r/src => src}/db.rs | 90 +++- {kr2r/src => src}/kr2r_data.rs | 75 +++- {kr2r/src => src}/kv_store.rs | 37 +- {kr2r/src => src}/lib.rs | 0 {kr2r/src => src}/readcounts.rs | 16 - {kr2r/src => src}/report.rs | 127 +++++- {kr2r/src => src}/taxonomy.rs | 195 ++++++--- {kr2r/src => src}/utils.rs | 95 +++-- 43 files changed, 740 insertions(+), 2353 deletions(-) delete mode 100644 .github/workflows/rust.yml rename {kr2r/docs => docs}/KunPeng.png (100%) rename {kr2r/docs => docs}/Picture1.png (100%) rename {kr2r/docs => docs}/Picture2.png (100%) rename {kr2r/examples => examples}/build_and_classify.rs (97%) delete mode 100644 kr2r/Cargo.toml delete mode 100644 kr2r/README.md delete mode 100644 seqkmer/Cargo.toml delete mode 100644 seqkmer/src/fasta.rs delete mode 100644 seqkmer/src/fastq.rs delete mode 100644 seqkmer/src/fastx.rs delete mode 100644 seqkmer/src/feat.rs delete mode 100644 seqkmer/src/lib.rs delete mode 100644 seqkmer/src/mmscanner.rs delete mode 100644 seqkmer/src/parallel.rs delete mode 100644 seqkmer/src/reader.rs delete mode 100644 seqkmer/src/seq.rs delete mode 100644 seqkmer/src/utils.rs rename {kr2r/src => src}/args.rs (95%) rename {kr2r/src => src}/bin/annotate.rs (100%) rename {kr2r/src => src}/bin/build_k2_db.rs (100%) rename {kr2r/src => src}/bin/chunk_db.rs (100%) rename {kr2r/src => src}/bin/direct.rs (100%) rename {kr2r/src => src}/bin/estimate_capacity.rs (100%) rename {kr2r/src => src}/bin/hashshard.rs (100%) rename {kr2r/src => src}/bin/kun.rs (100%) rename {kr2r/src => src}/bin/merge_fna.rs (100%) rename {kr2r/src => src}/bin/resolve.rs (100%) rename {kr2r/src => src}/bin/splitr.rs (100%) rename {kr2r/src => src}/classify.rs (58%) rename {kr2r/src => src}/compact_hash.rs (78%) rename {kr2r/src => src}/db.rs (71%) rename {kr2r/src => src}/kr2r_data.rs (61%) rename {kr2r/src => src}/kv_store.rs (74%) rename {kr2r/src => src}/lib.rs (100%) rename {kr2r/src => src}/readcounts.rs (89%) rename {kr2r/src => src}/report.rs (67%) rename {kr2r/src => src}/taxonomy.rs (76%) rename {kr2r/src => src}/utils.rs (76%) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml deleted file mode 100644 index fa7ab65..0000000 
--- a/.github/workflows/rust.yml +++ /dev/null @@ -1,139 +0,0 @@ -name: "publish" - -# Commenting out the current trigger -# on: -# create: -# tags: -# - '*' - -on: - workflow_dispatch: - inputs: - version: - description: 'Release version (e.g., v1.2.3)' - required: true - type: string - -# This is the example from the readme. -# On each push to the `release` branch it will create or update a GitHub release, build your app, and upload the artifacts to the release. -env: - CARGO_TERM_COLOR: always - BINARIES_LIST: 'kun_peng' - PROJECT_PREFIX: 'Kun-peng-' - -jobs: - build-and-release: - permissions: - contents: write - strategy: - fail-fast: false - matrix: - platform: [macos-latest, ubuntu-20.04, windows-latest, macos-13] - - runs-on: ${{ matrix.platform }} - steps: - - uses: actions/checkout@v4 - - name: Set Version Number - shell: bash - run: echo "VERSION=$(echo $GITHUB_REF | sed -e 's|refs/tags/||')" >> $GITHUB_ENV - - - name: Build - run: cargo build --release - - # - name: Install Rust (macOS) - # if: matrix.platform == 'macos-latest' - # run: | - # rustup target add x86_64-apple-darwin - - # - name: Build (macOS) - # if: matrix.platform == 'macos-latest' - # run: | - # cargo build --release --target x86_64-apple-darwin - - # Set up the GitHub CLI - - name: Install GitHub CLI - run: | - brew install gh - if: matrix.platform == 'macos-latest' || matrix.platform == 'macos-13' - - - name: Install GitHub CLI - run: | - sudo apt install -y gh - if: matrix.platform == 'ubuntu-20.04' - - - name: Install GitHub CLI - run: | - choco install gh - if: matrix.platform == 'windows-latest' - - # Log in to the GitHub CLI - - name: Login to GitHub CLI - run: echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token - - # # Create a release - # - name: Create Release - # id: create_release - # run: | - # gh release create ${{ github.ref_name }} \ - # --title "Release ${{ github.ref_name }}" \ - # --notes "Release notes for ${{ github.ref_name }}" \ - # --draft - # shell: bash - - - name: Prepare asset name - run: | - PLATFORM_TAG=$(echo ${{ matrix.platform }} | sed -e 's/macos-latest/macos-arm64/' -e 's/macos-13/macos-x86_64/' -e 's/ubuntu-20.04/linux-x86_64/' -e 's/windows-latest/windows-x86_64/') - echo "ASSET_NAME=${PROJECT_PREFIX}${VERSION}-${PLATFORM_TAG}.tar.gz" >> $GITHUB_ENV - shell: bash - - - name: Create tar.gz archive - run: | - mkdir -p ./target/release/packaged - for binary in ${{ env.BINARIES_LIST }}; do - cp "./target/release/$binary" "./target/release/packaged/" - done - tar czvf "./target/release/${{ env.ASSET_NAME }}" -C "./target/release/packaged/" . 
- shell: bash - - - name: Upload Release Asset - run: | - gh release upload ${{ github.ref_name }} \ - ./target/release/${{ env.ASSET_NAME }} \ - --clobber - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - shell: bash - - # This step is for building on CentOS 7, only run on ubuntu-latest - - name: Build on CentOS 7 - if: matrix.platform == 'ubuntu-20.04' - run: | - docker run --name centos7-container -v $GITHUB_WORKSPACE:/github/workspace -w /github/workspace centos:7 \ - /bin/bash -c "echo '[base]' > /etc/yum.repos.d/CentOS-Base.repo; \ - echo 'name=CentOS-7 - Base' >> /etc/yum.repos.d/CentOS-Base.repo; \ - echo 'baseurl=http://vault.centos.org/centos/7/os/x86_64/' >> /etc/yum.repos.d/CentOS-Base.repo; \ - echo 'gpgcheck=1' >> /etc/yum.repos.d/CentOS-Base.repo; \ - echo 'enabled=1' >> /etc/yum.repos.d/CentOS-Base.repo; \ - echo 'gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7' >> /etc/yum.repos.d/CentOS-Base.repo; \ - yum update -y && yum install -y gcc make openssl openssl-devel && \ - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && export PATH=\$HOME/.cargo/bin:\$PATH && cd /github/workspace && cargo build --release" - - mkdir -p ./target/release/packaged_centos7 - for binary in $BINARIES_LIST; do - docker cp centos7-container:/github/workspace/target/release/$binary ./target/release/packaged_centos7/ - done - - tar czvf ./${PROJECT_PREFIX}${VERSION}-centos7.tar.gz -C ./target/release/packaged_centos7 . - - docker rm centos7-container - - - name: Upload Release Asset for CentOS 7 - if: matrix.platform == 'ubuntu-20.04' - run: | - gh release upload ${{ github.ref_name }} \ - ./${PROJECT_PREFIX}${VERSION}-centos7.tar.gz \ - --clobber - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - shell: bash - diff --git a/.gitignore b/.gitignore index 263cc94..0eeb4b8 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ out_dir/ slurm.sh downloads/ test_database/ +chunk/ diff --git a/.vscode/settings.json b/.vscode/settings.json index ba293da..2feb4e8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,5 @@ { "rust-analyzer.linkedProjects": [ - "./kr2r/Cargo.toml", - "./kr2r/Cargo.toml" + "./Cargo.toml", ] } diff --git a/Cargo.toml b/Cargo.toml index dc3daef..9eb630f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,43 @@ -[workspace] -members = ["kr2r", "seqkmer"] -resolver = "2" +[package] +name = "kun_peng" +version = "0.7.1" +edition = "2021" +authors = ["eric9n@gmail.com"] +description = "Kun-peng: an ultra-fast, low-memory footprint and accurate taxonomy classifier for all" +license = "MIT" +repository = "https://github.com/eric9n/Kun-peng" +keywords = ["bioinformatics", "metagenomics", "microbiome", "exposome"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[[bin]] +name = "kun_peng" +path = "src/bin/kun.rs" + +[features] +double_hashing = [] +exact_counting = [] + +[dependencies] +seqkmer = "0.1.1" +clap = { version = "4.4.10", features = ["derive"] } +hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } +seahash = "4.1.0" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +byteorder = "1.4" +walkdir = "2" +rayon = "1.8" +libc = "0.2" +regex = "1.5.4" +flate2 = "1.0" +dashmap = { version = "6.0.1", features = ["rayon"] } +num_cpus = "1.13.1" + +[dev-dependencies] +criterion = "0.5.1" +twox-hash = "1.6.3" +farmhash = { version = "1.1.5" } [profile.release] lto = true diff --git a/README.md b/README.md index 44de5d3..2ab9320 100644 --- 
a/README.md +++ b/README.md @@ -1,12 +1,12 @@ -# Kun-peng Kun-peng Logo +# Kun-peng Kun-peng Logo -[![](https://img.shields.io/badge/doi-waiting-yellow.svg)]() [![](https://img.shields.io/badge/release%20version-0.7.0-green.svg)](https://github.com/eric9n/Kun-peng/releases) +[![](https://img.shields.io/badge/doi-waiting-yellow.svg)]() [![](https://img.shields.io/badge/release%20version-0.7.1-green.svg)](https://github.com/eric9n/Kun-peng/releases) Here, we introduce Kun-peng, an ultra-memory-efficient metagenomic classification tool (Fig. 1). Inspired by Kraken2's k-mer-based approach, Kun-peng employs algorithms for minimizer generation, hash table querying, and classification. The cornerstone of Kun-peng's memory efficiency lies in its unique ordered block design for reference database. This strategy dramatically reduces memory usage without compromising speed, enabling Kun-peng to be executed on both personal computers and HPCP for most databases. Moreover, Kun-peng incorporates an advanced sliding window algorithm for sequence classifications to reduce the false-positive rates. Finally, Kun-peng supports parallel processing algorithms to further bolster its speed. Kun-peng offers two classification modes: Memory-Efficient Mode (Kun-peng-M) and Full-Speed Mode (Kun-peng-F). Remarkably, Kun-peng-M achieves a comparable processing time to Kraken2 while using less than 10% of its memory. Kun-peng-F loads all the database blocks simultaneously, matching Kraken2's memory usage while surpassing its speed. Notably, Kun-peng is compatible with the reference database built by Kraken2 and the associated abundance estimate tool Bracken1, making the transition from Kraken2 effortless. The name "Kun-peng" was derived from Chinese mythology and refers to a creature transforming between a giant fish (Kun) and a giant bird (Peng), reflecting the software's flexibility in navigating complex metagenomic data landscapes.
- Workflow of Kun-peng + Workflow of Kun-peng

Fig. 1. Overview of the algorithms of Kun-peng.

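Kun-peng's minimizer scanner (provided by the `seqkmer` dependency) selects, for each window of candidate l-mers, the smallest one using a monotonic queue. The sketch below is only a simplified illustration of that sliding-window minimum, not the actual `MinimizerWindow` code; the function name, the plain `u64` candidate codes, and the toy inputs are placeholders for this example.

```rust
use std::collections::VecDeque;

/// Sliding-window minimum over candidate l-mer codes (simplified sketch).
/// For every window of `w` consecutive candidates, emit the smallest one.
/// A monotonic deque of (position, value) keeps the work amortized O(1).
fn window_minimizers(candidates: &[u64], w: usize) -> Vec<u64> {
    let mut queue: VecDeque<(usize, u64)> = VecDeque::new();
    let mut out = Vec::new();
    for (pos, &val) in candidates.iter().enumerate() {
        // Drop candidates that can never be a window minimum again.
        while queue.back().map_or(false, |&(_, v)| v > val) {
            queue.pop_back();
        }
        queue.push_back((pos, val));
        // Drop the front candidate once it falls out of the current window.
        while queue.front().map_or(false, |&(p, _)| p + w <= pos) {
            queue.pop_front();
        }
        if pos + 1 >= w {
            out.push(queue.front().unwrap().1);
        }
    }
    out
}

fn main() {
    // Window of 3 candidate l-mers per position (toy numbers).
    assert_eq!(window_minimizers(&[9, 4, 7, 8, 2, 6], 3), vec![4, 4, 2, 2]);
}
```

The real scanner additionally canonicalizes each l-mer, applies the toggle mask, and only emits a minimizer when it changes, but the window bookkeeping follows the same idea.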
@@ -19,7 +19,7 @@ We constructed a standard database using the complete RefSeq genomes of archaeal Kun-peng offers two modes for taxonomy classification: Memory-Efficient Mode (Kun-peng-M) and Full-Speed Mode (Kun-peng-F), with identical classification results. Kun-peng-M matches Kraken2's processing time and uses 57.0 ± 2.25 % of Centrifuge's time (Fig. 2d). However, Kun-peng-M requires only 4.5 ± 1.1 GB peak memory, which is 7.96 ± 0.19 % and 6.31 ± 0.15 % of Kraken2 and Centrifuge's peak memory, respectively (Fig. 2d). Compared to Kraken2, Kun-peng-F consumes the same memory but requires only 67.2 ± 4.57 % of the processing time. Compared to Centrifuge, Kun-peng-F uses 77.9 ± 0.22 % memory while requiring only 38.8 ± 4.25 % of its processing time (Fig. 2d). Remarkably, with an ultra-low memory requirement, Kun-peng-M can even operate on most personal computers when the standard reference database is used (Fig. 2e).
- Workflow of Kun-peng + Workflow of Kun-peng

Fig. 2. Performance benchmark of Kun-peng against other metagenomic classifiers.

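To make the two modes concrete, the sketch below drives the `kun_peng` binary from Rust with `std::process::Command`, in the spirit of `examples/build_and_classify.rs`. The binary path, database directory, chunk/output directories, and input file are placeholders borrowed from the repository's toy example; `classify` corresponds to the chunked, memory-efficient mode and `direct` to the full-speed mode that loads every hash table at once.

```rust
use std::fs;
use std::process::Command;

fn main() {
    // Placeholder paths -- adjust to your own layout.
    let kun_peng = "./target/release/kun_peng";
    let db = "test_database";

    // The chunk directory holds intermediate files; the output directory
    // receives the Kraken-style results. Both must exist beforehand.
    fs::create_dir_all("temp_chunk").expect("create chunk dir");
    fs::create_dir_all("test_out").expect("create output dir");

    // Memory-efficient mode (Kun-peng-M): database blocks are processed
    // chunk by chunk instead of being loaded all at once.
    let status = Command::new(kun_peng)
        .args([
            "classify",
            "--db", db,
            "--chunk-dir", "temp_chunk",
            "--output-dir", "test_out",
            "data/COVID_19.fa",
        ])
        .status()
        .expect("failed to run kun_peng classify");
    println!("chunk mode exited with {status}");

    // Full-speed mode (Kun-peng-F): all hash tables are loaded up front,
    // trading memory for throughput.
    let status = Command::new(kun_peng)
        .args(["direct", "--db", db, "data/COVID_19.fa"])
        .status()
        .expect("failed to run kun_peng direct");
    println!("direct mode exited with {status}");
}
```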
@@ -57,6 +57,13 @@ source ~/.bashrc For macOS users: +### Homebrew +```bash +brew install eric9n/tap/kun_peng +``` + +### Donwload binary + ```bash # Replace X.Y.Z with the latest version number VERSION=vX.Y.Z @@ -169,7 +176,7 @@ This will build the kr2r and ncbi project in release mode. Next, run the example script that demonstrates how to use the `kun_peng` binary. Execute the following command from the root of the workspace: ``` sh -cargo run --release --example build_and_classify --package kun_peng +cargo run --release --example build_and_classify ``` This will run the build_and_classify.rs example located in the kr2r project's examples directory. diff --git a/kr2r/docs/KunPeng.png b/docs/KunPeng.png similarity index 100% rename from kr2r/docs/KunPeng.png rename to docs/KunPeng.png diff --git a/kr2r/docs/Picture1.png b/docs/Picture1.png similarity index 100% rename from kr2r/docs/Picture1.png rename to docs/Picture1.png diff --git a/kr2r/docs/Picture2.png b/docs/Picture2.png similarity index 100% rename from kr2r/docs/Picture2.png rename to docs/Picture2.png diff --git a/kr2r/examples/build_and_classify.rs b/examples/build_and_classify.rs similarity index 97% rename from kr2r/examples/build_and_classify.rs rename to examples/build_and_classify.rs index b3cbd13..fe893ef 100644 --- a/kr2r/examples/build_and_classify.rs +++ b/examples/build_and_classify.rs @@ -4,10 +4,7 @@ use std::process::Command; fn main() { // Define the paths and directories - let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .parent() - .unwrap() - .to_path_buf(); + let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).to_path_buf(); let kr2r_binary = workspace_root.join("target/release/kun_peng"); let data_dir = workspace_root.join("data"); let test_dir = workspace_root.join("test_database"); diff --git a/kr2r/Cargo.toml b/kr2r/Cargo.toml deleted file mode 100644 index 2e18cc7..0000000 --- a/kr2r/Cargo.toml +++ /dev/null @@ -1,40 +0,0 @@ -[package] -name = "kun_peng" -version = "0.7.0" -edition = "2021" -authors = ["eric9n@gmail.com"] -description = "Kun-peng: an ultra-fast, low-memory footprint and accurate taxonomy classifier for all" -license = "MIT" -repository = "https://github.com/eric9n/Kun-peng" -keywords = ["bioinformatics", "metagenomics", "microbiome", "exposome"] - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[[bin]] -name = "kun_peng" -path = "src/bin/kun.rs" - -[features] -double_hashing = [] -exact_counting = [] - -[dependencies] -seqkmer = { version = "0.1.0", path = "../seqkmer" } -clap = { version = "4.4.10", features = ["derive"] } -hyperloglogplus = { version = "0.4.1", features = ["const-loop"] } -seahash = "4.1.0" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -byteorder = "1.4" -walkdir = "2" -rayon = "1.8" -libc = "0.2" -regex = "1.5.4" -flate2 = "1.0" -dashmap = { version = "6.0.1", features = ["rayon"] } -num_cpus = "1.13.1" - -[dev-dependencies] -criterion = "0.5.1" -twox-hash = "1.6.3" -farmhash = { version = "1.1.5" } diff --git a/kr2r/README.md b/kr2r/README.md deleted file mode 100644 index 07016ce..0000000 --- a/kr2r/README.md +++ /dev/null @@ -1,394 +0,0 @@ -# Kun-peng - - - -[![](https://img.shields.io/badge/doi-waiting-yellow.svg)]() [![](https://img.shields.io/badge/release%20version-0.6.15-green.svg)](https://github.com/eric9n/Kun-peng/releases) - - -Here, we introduce Kun-peng, an ultra-memory-efficient metagenomic classification tool (Fig. 1). 
Inspired by Kraken2's k-mer-based approach, Kun-peng employs algorithms for minimizer generation, hash table querying, and classification. The cornerstone of Kun-peng's memory efficiency lies in its unique ordered block design for reference database. This strategy dramatically reduces memory usage without compromising speed, enabling Kun-peng to be executed on both personal computers and HPCP for most databases. Moreover, Kun-peng incorporates an advanced sliding window algorithm for sequence classifications to reduce the false-positive rates. Finally, Kun-peng supports parallel processing algorithms to further bolster its speed. Kun-peng offers two classification modes: Memory-Efficient Mode (Kun-peng-M) and Full-Speed Mode (Kun-peng-F). Remarkably, Kun-peng-M achieves a comparable processing time to Kraken2 while using less than 10% of its memory. Kun-peng-F loads all the database blocks simultaneously, matching Kraken2's memory usage while surpassing its speed. Notably, Kun-peng is compatible with the reference database built by Kraken2 and the associated abundance estimate tool Bracken1, making the transition from Kraken2 effortless. The name "Kun-peng" was derived from Chinese mythology and refers to a creature transforming between a giant fish (Kun) and a giant bird (Peng), reflecting the software's flexibility in navigating complex metagenomic data landscapes. - -
- Workflow of Kun-peng -

Fig. 1. Overview of the algorithms of Kun-peng.

-
- -To assess Kun-peng's accuracy and performance, we used two datasets comprising 14 mock metagenomes 2,3. We processed these data with Kraken2 4 and Centrifuge 5, both requiring relatively lower memory and supporting custom databases. The classification processes were executed with default parameters, generating reports of identified taxa and their abundance estimated by Bracken 1. As most classifiers considered below 0.01% abundance false positives, we removed these taxa for simplicity 6. - -Critical metrics for metagenomic classification include precision, recall, and the area under the precision-recall curve (AUPRC). After filtering the low-abundance species, these metrics were calculated at the genus or species level. All tools performed better at the genus level, with performance decreasing at the species level (Fig. 2a). At the genus level, Centrifuge's precision was 25.5 ± 12.4 % lower than Kun-peng's (Fig. 2a). At the species level, Kun-peng significantly outperformed Kraken2 and Centrifuge, showing 11.2 ± 8.08 % and 23.6 ± 12.3 % higher precision, respectively (Fig. 2a). We focused on the genus level due to higher overall performance. Kun-peng's increased precision resulted from significantly lower false positives compared to Kraken2 and Centrifuge, which showed 16.9 ± 10.0 % and 61.7 ± 40.2 % higher false positives, respectively (Fig. 2b). - -We constructed a standard database using the complete RefSeq genomes of archaeal, bacterial, and viral domains. The database contains 123GB of fasta sequences, generating a 56GB hash index for Kraken2 and Kun-peng and a 72GB database index for Centrifuge. Database construction time for Kun-peng was noticeably longer due to sub-hashing calculations (Fig. 2c). However, Kun-peng required only 4.6GB of peak memory, roughly 8.19% and 1.50% of Kraken2 and Centrifuge's peak memory, for database construction (Fig. 2c). - -Kun-peng offers two modes for taxonomy classification: Memory-Efficient Mode (Kun-peng-M) and Full-Speed Mode (Kun-peng-F), with identical classification results. Kun-peng-M matches Kraken2's processing time and uses 57.0 ± 2.25 % of Centrifuge's time (Fig. 2d). However, Kun-peng-M requires only 4.5 ± 1.1 GB peak memory, which is 7.96 ± 0.19 % and 6.31 ± 0.15 % of Kraken2 and Centrifuge's peak memory, respectively (Fig. 2d). Compared to Kraken2, the Kun-peng-F consumes the same memory but requires only of the 67.2 ± 4.57 % processing time. Compared to Centrifuge, Kun-peng-F uses 77.9 ± 0.22 % memory while requiring only 38.8 ± 4.25 % of its processing time (Fig. 2d). Remarkably, with an ultra-low memory requirement, Kun-peng-M can even operate on most personal computers when the standard reference database is used (Fig. 2e). - -
- Workflow of Kun-peng -

Fig. 2. Performance benchmark of Kun-peng against other metagenomic classifiers.

-
- -References: - -1. Lu, J., Breitwieser, F. P., Thielen, P. & Salzberg, S. L. Bracken: Estimating species abundance in metagenomics data. PeerJ Comput. Sci. 2017, 1–17 (2017). -2. Amos, G. C. A. et al. Developing standards for the microbiome field. Microbiome 8, 1–13 (2020). -3. Kralj, J., Vallone, P., Kralj, J., Hunter, M. & Jackson, S. Reference Material 8376 Microbial Pathogen DNA Standards for Detection and Identification NIST Special Publication 260-225 Reference Material 8376 Microbial Pathogen DNA Standards for Detection and Identification. -4. Wood, D. E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol. 20, (2019). -5. Kim, D., Song, L., Breitwieser, F. P. & Salzberg, S. L. Centrifuge: Rapid and sensitive classification of metagenomic sequences. Genome Res. 26, 1721–1729 (2016). -6. Ye, S. H., Siddle, K. J., Park, D. J. & Sabeti, P. C. Benchmarking Metagenomics Tools for Taxonomic Classification. Cell 178, 779–794 (2019). - - -## Get Started - -Follow these steps to install Kun-peng and run the examples. - -### Method 1: Download Pre-built Binaries (Recommended) - -If you prefer not to build from source, you can download the pre-built binaries for your platform from the GitHub [releases page](https://github.com/eric9n/kraken2-rust/releases). - -``` bash -mkdir kun_peng_v0.6.15 -tar -xvf Kun-peng-v0.6.15-centos7.tar.gz -C kun_peng_v0.6.15 -# Add environment variable -echo 'export PATH=$PATH:~/biosoft/kun_peng_v0.6.15' >> ~/.bashrc -source ~/.bashrc -``` - -#### Run the `kun_peng` example - -We will use a very small virus database on the GitHub homepage as an example: - -1. clone the repository - -``` sh -git clone https://github.com/eric9n/kun_peng.git -cd Kun-peng -``` - -2. build database - -``` sh -kun_peng build --download-dir data/ --db test_database -``` - -``` -merge fna start... -merge fna took: 29.998258ms -estimate start... -estimate count: 14080, required capacity: 31818.0, Estimated hash table requirement: 124.29KB -convert fna file "test_database/library.fna" -process chunk file 1/1: duration: 29.326627ms -build k2 db took: 30.847894ms -``` - -3. classify - -``` sh -# temp_chunk is used to store intermediate files -mkdir temp_chunk -# test_out is used to store output files -mkdir test_out -kun_peng classify --db test_database --chunk-dir temp_chunk --output-dir test_out data/COVID_19.fa -``` - -``` -hash_config HashConfig { value_mask: 31, value_bits: 5, capacity: 31818, size: 13051, hash_capacity: 1073741824 } -splitr start... -splitr took: 18.212452ms -annotate start... -chunk_file "temp_chunk/sample_1.k2" -load table took: 548.911µs -annotate took: 12.006329ms -resolve start... -resolve took: 39.571515ms -Classify took: 92.519365ms -``` - -### Method 2: Clone the Repository and Build the project - -#### Prerequisites - -1. **Rust**: This project requires the Rust programming environment if you plan to build from source. - -#### Build the Projects - -First, clone this repository to your local machine: - -``` sh -git clone https://github.com/eric9n/kun_peng.git -cd kun_peng -``` - -Ensure that both projects are built. You can do this by running the following command from the root of the workspace: - -``` sh -cargo build --release -``` - -This will build the kr2r and ncbi project in release mode. - -#### Run the `kun_peng` example - -Next, run the example script that demonstrates how to use the `kun_peng` binary. 
Execute the following command from the root of the workspace: - -``` sh -cargo run --release --example build_and_classify --package kr2r -``` - -This will run the build_and_classify.rs example located in the kr2r project's examples directory. - -Example Output You should see output similar to the following: - -``` txt -Executing command: /path/to/workspace/target/release/kun_peng build --download-dir data/ --db test_database -kun_peng build output: [build output here] -kun_peng build error: [any build errors here] - -Executing command: /path/to/workspace/target/release/kun_peng direct --db test_database data/COVID_19.fa -kun_peng direct output: [direct output here] -kun_peng direct error: [any direct errors here] -``` - -This output confirms that the `kun_peng` commands were executed successfully and the files were processed as expected. - -## kun_peng tool - -``` sh -Usage: kun_peng - -Commands: - estimate estimate capacity - build build `k2d` files - hashshard Convert Kraken2 database files to Kun-peng database format for efficient processing and analysis. - splitr Split fast(q/a) file into ranges - annotate annotate a set of sequences - resolve resolve taxonomy tree - classify Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences - direct Directly load all hash tables for classification annotation - merge-fna A tool for processing genomic files - help Print this message or the help of the given subcommand(s) - -Options: - -h, --help Print help - -V, --version Print version -``` - -### build database - -Build the kun_peng database like Kraken2, specifying the directory for the data files downloaded from NCBI, as well as the database directory. - -``` sh -./target/release/kun_peng build -h -build database - -Usage: kun_peng build [OPTIONS] --download-dir --db - -Options: - -d, --download-dir - Directory to store downloaded files - --db - ncbi library fna database directory - -k, --k-mer - Set length of k-mers, k must be positive integer, k=35, k cannot be less than l [default: 35] - -l, --l-mer - Set length of minimizers, 1 <= l <= 31 [default: 31] - --minimizer-spaces - Number of characters in minimizer that are ignored in comparisons [default: 7] - -T, --toggle-mask - Minimizer ordering toggle mask [default: 16392584516609989165] - --min-clear-hash-value - - -r, --requested-bits-for-taxid - Bit storage requested for taxid 0 <= r < 31 [default: 0] - -p, --threads - Number of threads [default: 10] - --cache - estimate capacity from cache if exists - --max-n - Set maximum qualifying hash code [default: 4] - --load-factor - Proportion of the hash table to be populated (build task only; def: 0.7, must be between 0 and 1) [default: 0.7] - -h, --help - Print help - -V, --version - Print version -``` - -### Convert Kraken2 database - -This tool converts Kraken2 database files into Kun-peng database format for more efficient processing and analysis. By specifying the database directory and the hash file capacity, users can control the size of the resulting database index files. - -```sh -./target/release/kun_peng hashshard -h -Convert Kraken2 database files to Kun-peng database format for efficient processing and analysis. - -Usage: kun_peng hashshard [OPTIONS] --db - -Options: - --db The database directory for the Kraken 2 index. contains index files(hash.k2d opts.k2d taxo.k2d) - --hash-capacity Specifies the hash file capacity. 
- Acceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K'). - Note: The specified capacity affects the index size, with a factor of 4 applied. - For example, specifying '1G' results in an index size of '4G'. - Default: 1G (capacity 1G = file size 4G) [default: 1G] - -h, --help Print help - -V, --version Print version - -``` - - -### classify - -The classification process is divided into three modes: - -1. Direct Processing Mode: - -- Description: In this mode, all database files are loaded simultaneously, which requires a significant amount of memory. Before running this mode, you need to check the total size of hash\_\*.k2d files in the database directory using the provided script. Ensure that your available memory meets or exceeds this size. - -``` sh -bash cal_memory.sh $database_dir -``` - -- Characteristics: - - High memory requirements - - Fast performance - -Command Help - -``` sh -./target/release/kun_peng direct -h -Directly load all hash tables for classification annotation - -Usage: kun_peng direct [OPTIONS] --db [INPUT_FILES]... - -Arguments: - [INPUT_FILES]... A list of input file paths (FASTA/FASTQ) to be processed by the classify program. Supports fasta or fastq format files (e.g., .fasta, .fastq) and gzip compressed files (e.g., .fasta.gz, .fastq.gz) - -Options: - --db - database hash chunk directory and other files - -P, --paired-end-processing - Enable paired-end processing - -S, --single-file-pairs - Process pairs with mates in the same file - -Q, --minimum-quality-score - Minimum quality score for FASTQ data [default: 0] - -T, --confidence-threshold - Confidence score threshold [default: 0] - -K, --report-kmer-data - In comb. w/ -R, provide minimizer information in report - -z, --report-zero-counts - In comb. w/ -R, report taxa w/ 0 count - -g, --minimum-hit-groups - The minimum number of hit groups needed for a call [default: 2] - -p, --num-threads - The number of threads to use [default: 10] - --output-dir - File path for outputting normal Kraken output - -h, --help - Print help (see more with '--help') - -V, --version - Print version -``` - -2. Chunk Processing Mode: - -- Description: This mode processes the sample data in chunks, loading only a small portion of the database files at a time. This reduces the memory requirements, needing a minimum of 4GB of memory plus the size of one pair of sample files. -- Characteristics: - - Low memory consumption - - Slower performance compared to Direct Processing Mode - -Command Help - -``` sh -./target/release/kun_peng classify -h -Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences - -Usage: kun_peng classify [OPTIONS] --db --chunk-dir [INPUT_FILES]... - -Arguments: - [INPUT_FILES]... A list of input file paths (FASTA/FASTQ) to be processed by the classify program. 
Supports fasta or fastq format files (e.g., .fasta, .fastq) and gzip compressed files (e.g., .fasta.gz, .fastq.gz) - -Options: - --db - - --chunk-dir - chunk directory - --output-dir - File path for outputting normal Kraken output - -P, --paired-end-processing - Enable paired-end processing - -S, --single-file-pairs - Process pairs with mates in the same file - -Q, --minimum-quality-score - Minimum quality score for FASTQ data [default: 0] - -p, --num-threads - The number of threads to use [default: 10] - --buffer-size - [default: 16777216] - --batch-size - The size of each batch for processing taxid match results, used to control memory usage - [default: 16] - -T, --confidence-threshold - Confidence score threshold [default: 0] - -g, --minimum-hit-groups - The minimum number of hit groups needed for a call [default: 2] - --kraken-db-type - Enables use of a Kraken 2 compatible shared database - -K, --report-kmer-data - In comb. w/ -R, provide minimizer information in report - -z, --report-zero-counts - In comb. w/ -R, report taxa w/ 0 count - -h, --help - Print help (see more with '--help') - -V, --version - Print version -``` - -3. Step-by-Step Processing Mode: - -- Description: This mode breaks down the chunk processing mode into individual steps, providing greater flexibility in managing the entire classification process. -- Characteristics: - - Flexible processing steps - - Similar memory consumption to Chunk Processing Mode - - Performance varies based on execution steps - -### Output - -- test_out/output_1.txt: - -Standard Kraken Output Format: - -1. “C”/“U”: a one letter code indicating that the sequence was either classified or unclassified. -2. The sequence ID, obtained from the FASTA/FASTQ header. -3. The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence is unclassified. -4. The length of the sequence in bp. In the case of paired read data, this will be a string containing the lengths of the two sequences in bp, separated by a pipe character, e.g. “98\|94”. -5. A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s). For example, “562:13 561:4 A:31 0:1 562:3” would indicate that: - - the first 13 k-mers mapped to taxonomy ID #562 - - the next 4 k-mers mapped to taxonomy ID #561 - - the next 31 k-mers contained an ambiguous nucleotide - - the next k-mer was not in the database - - the last 3 k-mers mapped to taxonomy ID #562 - Note that paired read data will contain a “`|:|`” token in this list to indicate the end of one read and the beginning of another. - -- test_out/output_1.kreport2: - -``` -100.00 1 0 R 1 root -100.00 1 0 D 10239 Viruses -100.00 1 0 D1 2559587 Riboviria -100.00 1 0 O 76804 Nidovirales -100.00 1 0 O1 2499399 Cornidovirineae -100.00 1 0 F 11118 Coronaviridae -100.00 1 0 F1 2501931 Orthocoronavirinae -100.00 1 0 G 694002 Betacoronavirus -100.00 1 0 G1 2509511 Sarbecovirus -100.00 1 0 S 694009 Severe acute respiratory syndrome-related coronavirus -100.00 1 1 S1 2697049 Severe acute respiratory syndrome coronavirus 2 -``` - -Sample Report Output Formats: - -1. Percentage of fragments covered by the clade rooted at this taxon -2. Number of fragments covered by the clade rooted at this taxon -3. Number of fragments assigned directly to this taxon -4. A rank code, indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. 
Taxa that are not at any of these 10 ranks have a rank code that is formed by using the rank code of the closest ancestor rank with a number indicating the distance from that rank. E.g., “G2” is a rank code indicating a taxon is between genus and species and the grandparent taxon is at the genus rank. -5. NCBI taxonomic ID number -6. Indented scientific name diff --git a/seqkmer/Cargo.toml b/seqkmer/Cargo.toml deleted file mode 100644 index 1d432d5..0000000 --- a/seqkmer/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "seqkmer" -version = "0.1.0" -edition = "2021" -authors = ["eric9n@gmail.com"] -description = "sequence kmer" -license = "MIT" -repository = "https://github.com/eric9n/Kun-peng" -keywords = ["bioinformatics", "sequence", "kmer", "dna", "genomics"] - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -crossbeam-channel = "0.5" -scoped_threadpool = "0.1.9" -flate2 = "1.0" - -[features] -default = ["dna"] -dna = [] -protein = [] diff --git a/seqkmer/src/fasta.rs b/seqkmer/src/fasta.rs deleted file mode 100644 index d0e5384..0000000 --- a/seqkmer/src/fasta.rs +++ /dev/null @@ -1,258 +0,0 @@ -use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE}; -use crate::seq::{Base, SeqFormat, SeqHeader}; -use crate::utils::OptionPair; -use std::io::{BufRead, BufReader, Read, Result}; -use std::path::Path; - -const SEQ_LIMIT: u64 = u64::pow(2, 32); - -/// FastaReader -pub struct FastaReader -where - R: Read + Send, -{ - reader: BufReader, - file_index: usize, - reads_index: usize, - header: Vec, - seq: Vec, - - // 批量读取 - batch_size: usize, -} - -impl FastaReader -where - R: Read + Send, -{ - pub fn new(reader: R, file_index: usize) -> Self { - Self::with_capacity(reader, file_index, BUFSIZE, 30) - } - - pub fn with_capacity(reader: R, file_index: usize, capacity: usize, batch_size: usize) -> Self { - assert!(capacity >= 3); - Self { - reader: BufReader::with_capacity(capacity, reader), - file_index, - reads_index: 0, - header: Vec::new(), - seq: Vec::new(), - batch_size, - } - } - - pub fn read_next(&mut self) -> Result> { - // 读取fastq文件header部分 - self.header.clear(); - if self.reader.read_until(b'\n', &mut self.header)? == 0 { - return Ok(None); - } - // 读取fasta文件seq部分 - self.seq.clear(); - if self.reader.read_until(b'>', &mut self.seq)? == 0 { - return Ok(None); - } - trim_end(&mut self.seq); - Ok(Some(())) - } - - pub fn _next(&mut self) -> Result>)>> { - if self.read_next()?.is_none() { - return Ok(None); - } - - let seq_len = self.seq.len(); - // 检查seq的长度是否大于2的32次方 - if seq_len as u64 > SEQ_LIMIT { - eprintln!("Sequence length exceeds 2^32, which is not handled."); - return Ok(None); - } - - let seq_id = unsafe { - let slice = if self.header.starts_with(b">") { - &self.header[1..] - } else { - &self.header[..] 
- }; - - let s = std::str::from_utf8_unchecked(slice); - let first_space_index = s - .find(|c: char| c.is_whitespace() || c == '\u{1}') - .unwrap_or(s.len()); - - // 直接从原始切片创建第一个单词的切片 - &s[..first_space_index] - }; - self.reads_index += 1; - - let seq_header = SeqHeader { - file_index: self.file_index, - reads_index: self.reads_index, - format: SeqFormat::Fasta, - id: seq_id.to_owned(), - }; - Ok(Some(( - seq_len, - Base::new(seq_header, OptionPair::Single(self.seq.to_owned())), - ))) - } -} - -impl FastaReader> { - #[inline] - pub fn from_path>(path: P, file_index: usize) -> Result { - let reader = dyn_reader(path)?; - Ok(Self::new(reader, file_index)) - } -} - -impl Reader for FastaReader { - fn next(&mut self) -> Result>>>> { - let mut seqs = Vec::new(); - let mut total_bytes = 0; - let max_bytes = 10 * 1024 * 1024; - - for _ in 0..self.batch_size { - if let Some((seq_len, seq)) = self._next()? { - seqs.push(seq); - total_bytes += seq_len; - if total_bytes > max_bytes { - break; - } - } else { - break; - } - } - - Ok(if seqs.is_empty() { None } else { Some(seqs) }) - } -} - -/// BufferFastaReader -pub struct BufferFastaReader -where - R: Read + Send, -{ - reader: BufReader, - file_index: usize, - reads_index: usize, - header: Vec, - seq: Vec, - line_num: usize, - - // 批量读取 - batch_size: usize, -} - -impl BufferFastaReader -where - R: Read + Send, -{ - pub fn new(reader: R, file_index: usize) -> Self { - Self::with_capacity(reader, file_index, BUFSIZE, 60) - } - - pub fn with_capacity(reader: R, file_index: usize, capacity: usize, batch_size: usize) -> Self { - assert!(capacity >= 3); - Self { - reader: BufReader::with_capacity(capacity, reader), - file_index, - reads_index: 0, - line_num: 0, - header: Vec::new(), - seq: Vec::new(), - batch_size, - } - } - - pub fn read_next(&mut self) -> Result> { - // 读取fastq文件header部分 - if self.header.is_empty() { - if self.reader.read_until(b'\n', &mut self.header)? == 0 { - return Ok(None); - } - } - - if self.reader.read_until(b'\n', &mut self.seq)? == 0 { - return Ok(None); - } - if self.seq.starts_with(&[b'>']) { - self.header = self.seq.clone(); - self.seq.clear(); - if self.reader.read_until(b'\n', &mut self.seq)? == 0 { - return Ok(None); - } - } - self.line_num += 1; - trim_end(&mut self.seq); - Ok(Some(())) - } - - pub fn _next(&mut self) -> Result>>> { - self.seq.clear(); - for _ in 0..self.batch_size { - if self.read_next()?.is_none() { - return Ok(None); - } - } - - let seq_len = self.seq.len(); - // 检查seq的长度是否大于2的32次方 - if seq_len as u64 > SEQ_LIMIT { - eprintln!("Sequence length exceeds 2^32, which is not handled."); - return Ok(None); - } - - let seq_id = unsafe { - let slice = if self.header.starts_with(b">") { - &self.header[1..] - } else { - &self.header[..] 
- }; - - let s = std::str::from_utf8_unchecked(slice); - let first_space_index = s - .find(|c: char| c.is_whitespace() || c == '\u{1}') - .unwrap_or(s.len()); - // let first_space_index = s - // .as_bytes() - // .iter() - // .position(|&c| c == b' ') - // .unwrap_or(s.len()); - - // 直接从原始切片创建第一个单词的切片 - &s[..first_space_index] - }; - self.reads_index += 1; - - let seq_header = SeqHeader { - file_index: self.file_index, - reads_index: self.reads_index, - format: SeqFormat::Fasta, - id: seq_id.to_owned(), - }; - Ok(Some(Base::new( - seq_header, - OptionPair::Single(self.seq.to_owned()), - ))) - } -} - -impl BufferFastaReader> { - #[inline] - pub fn from_path>(path: P, file_index: usize) -> Result { - let reader = dyn_reader(path)?; - Ok(Self::new(reader, file_index)) - } -} - -impl Reader for BufferFastaReader { - fn next(&mut self) -> Result>>>> { - let mut seqs = Vec::new(); - if let Some(seq) = self._next()? { - seqs.push(seq); - } - - Ok(if seqs.is_empty() { None } else { Some(seqs) }) - } -} diff --git a/seqkmer/src/fastq.rs b/seqkmer/src/fastq.rs deleted file mode 100644 index a17c633..0000000 --- a/seqkmer/src/fastq.rs +++ /dev/null @@ -1,190 +0,0 @@ -use crate::reader::{dyn_reader, trim_end, trim_pair_info, Reader, BUFSIZE}; -use crate::seq::{Base, SeqFormat, SeqHeader}; -use crate::utils::OptionPair; -use std::io::{BufRead, BufReader, Read, Result}; -use std::path::Path; - -struct QReader { - reader: BufReader, - quality_score: i32, - - header: Vec, - seq: Vec, - plus: Vec, - quals: Vec, -} - -impl QReader -where - R: Read + Send, -{ - pub fn with_capacity(reader: R, capacity: usize, quality_score: i32) -> Self { - assert!(capacity >= 3); - Self { - reader: BufReader::with_capacity(capacity, reader), - header: Vec::new(), - seq: Vec::new(), - plus: Vec::new(), - quals: Vec::new(), - quality_score, - } - } - - pub fn read_next(&mut self) -> Result> { - // 读取fastq文件header部分 - self.header.clear(); - if self.reader.read_until(b'\n', &mut self.header)? == 0 { - return Ok(None); - } - // 读取fastq文件seq部分 - self.seq.clear(); - if self.reader.read_until(b'\n', &mut self.seq)? == 0 { - return Ok(None); - } - trim_end(&mut self.seq); - - // 读取fastq文件+部分 - self.plus.clear(); - if self.reader.read_until(b'\n', &mut self.plus)? == 0 { - return Ok(None); - } - - // 读取fastq文件quals部分 - self.quals.clear(); - if self.reader.read_until(b'\n', &mut self.quals)? == 0 { - return Ok(None); - } - trim_end(&mut self.quals); - - if self.quality_score > 0 { - for (base, &qscore) in self.seq.iter_mut().zip(self.quals.iter()) { - if (qscore as i32 - '!' 
as i32) < self.quality_score { - *base = b'x'; - } - } - } - - Ok(Some(())) - } -} - -pub struct FastqReader { - inner: OptionPair>, - file_index: usize, - reads_index: usize, - // 批量读取 - batch_size: usize, -} - -impl FastqReader -where - R: Read + Send, -{ - pub fn new(readers: OptionPair, file_index: usize, quality_score: i32) -> Self { - Self::with_capacity(readers, file_index, BUFSIZE, quality_score, 30) - } - - pub fn with_capacity<'a>( - readers: OptionPair, - file_index: usize, - capacity: usize, - quality_score: i32, - batch_size: usize, - ) -> Self { - assert!(capacity >= 3); - let inner = match readers { - OptionPair::Single(reader) => { - OptionPair::Single(QReader::with_capacity(reader, capacity, quality_score)) - } - OptionPair::Pair(reader1, reader2) => OptionPair::Pair( - QReader::with_capacity(reader1, capacity, quality_score), - QReader::with_capacity(reader2, capacity, quality_score), - ), - }; - Self { - inner, - file_index, - reads_index: 0, - batch_size, - } - } - - fn create_seq_header(reader: &QReader, file_index: usize, reads_index: usize) -> SeqHeader { - let seq_id = unsafe { - let s = std::str::from_utf8_unchecked(&reader.header[1..]); - let first_space_index = s - .find(|c: char| c.is_whitespace() || c == '\u{1}') - .unwrap_or(s.len()); - - // 直接从原始切片创建第一个单词的切片 - &s[..first_space_index] - }; - SeqHeader { - file_index, - reads_index, - format: SeqFormat::Fastq, - id: trim_pair_info(seq_id), - } - } - - pub fn read_next(&mut self) -> Result>>> { - match &mut self.inner { - OptionPair::Single(reader) => { - if reader.read_next()?.is_none() { - return Ok(None); - } - - self.reads_index += 1; - - let seq_header = - Self::create_seq_header(&reader, self.file_index, self.reads_index); - Ok(Some(Base::new( - seq_header, - OptionPair::Single(reader.seq.to_owned()), - ))) - } - OptionPair::Pair(reader1, reader2) => { - if reader1.read_next()?.is_none() { - return Ok(None); - } - if reader2.read_next()?.is_none() { - return Ok(None); - } - - self.reads_index += 1; - let seq_header = - Self::create_seq_header(&reader1, self.file_index, self.reads_index); - - Ok(Some(Base::new( - seq_header, - OptionPair::Pair(reader1.seq.to_owned(), reader2.seq.to_owned()), - ))) - } - } - } -} - -impl FastqReader> { - #[inline] - pub fn from_path>( - paths: OptionPair
<P>
, - file_index: usize, - quality_score: i32, - ) -> Result { - let readers = paths.map(|path| dyn_reader(path))?; - Ok(Self::new(readers, file_index, quality_score)) - } -} - -impl Reader for FastqReader -where - R: Read + Send, -{ - fn next(&mut self) -> Result>>>> { - let seqs: Vec>> = (0..self.batch_size) - .filter_map(|_| self.read_next().transpose()) - .collect::>>()?; - - Ok(Some(seqs).filter(|v| !v.is_empty())) - } -} diff --git a/seqkmer/src/fastx.rs b/seqkmer/src/fastx.rs deleted file mode 100644 index 5400728..0000000 --- a/seqkmer/src/fastx.rs +++ /dev/null @@ -1,67 +0,0 @@ -use crate::fasta::{BufferFastaReader, FastaReader}; -use crate::fastq::FastqReader; -use crate::reader::{detect_file_format, Reader}; -use crate::seq::{Base, SeqFormat}; -use crate::utils::OptionPair; -use std::io::Result; -use std::path::Path; - -pub struct FastxReader { - inner: R, -} - -impl FastxReader { - pub fn new(inner: R) -> Self { - Self { inner } - } -} - -impl Reader for FastxReader { - fn next(&mut self) -> Result>>>> { - self.inner.next() - } -} -impl FastxReader> { - pub fn from_paths>( - paths: OptionPair
<P>
, - file_index: usize, - quality_score: i32, - ) -> Result { - let file_format = paths.map(|path: &P| detect_file_format(path)); - - match file_format? { - OptionPair::Single(SeqFormat::Fasta) => { - let reader = FastaReader::from_path(paths.single().unwrap().as_ref(), file_index)?; - Ok(Self::new(Box::new(reader) as Box)) - } - OptionPair::Single(SeqFormat::Fastq) - | OptionPair::Pair(SeqFormat::Fastq, SeqFormat::Fastq) => { - let reader = FastqReader::from_path(paths, file_index, quality_score)?; - Ok(Self::new(Box::new(reader) as Box)) - } - _ => panic!("Unsupported file format combination"), - } - } - - pub fn from_buffer_reader>( - paths: OptionPair
<P>
, - file_index: usize, - quality_score: i32, - ) -> Result { - let file_format = paths.map(|path: &P| detect_file_format(path)); - - match file_format? { - OptionPair::Single(SeqFormat::Fasta) => { - let reader = - BufferFastaReader::from_path(paths.single().unwrap().as_ref(), file_index)?; - Ok(Self::new(Box::new(reader) as Box)) - } - OptionPair::Single(SeqFormat::Fastq) - | OptionPair::Pair(SeqFormat::Fastq, SeqFormat::Fastq) => { - let reader = FastqReader::from_path(paths, file_index, quality_score)?; - Ok(Self::new(Box::new(reader) as Box)) - } - _ => panic!("Unsupported file format combination"), - } - } -} diff --git a/seqkmer/src/feat.rs b/seqkmer/src/feat.rs deleted file mode 100644 index 9cd0991..0000000 --- a/seqkmer/src/feat.rs +++ /dev/null @@ -1,202 +0,0 @@ -#[cfg(feature = "dna")] -pub mod constants { - pub const DEFAULT_KMER_LENGTH: u64 = 35; - pub const DEFAULT_MINIMIZER_LENGTH: u8 = 31; - pub const DEFAULT_MINIMIZER_SPACES: u8 = 7; - - pub const BITS_PER_CHAR: usize = 2; -} - -#[cfg(feature = "protein")] -pub mod constants { - pub const DEFAULT_KMER_LENGTH: u64 = 15; - pub const DEFAULT_MINIMIZER_LENGTH: u8 = 12; - pub const DEFAULT_MINIMIZER_SPACES: u8 = 0; - - pub const BITS_PER_CHAR: usize = 4; -} - -#[cfg(feature = "dna")] -#[inline] -pub fn char_to_value(c: u8) -> Option { - match c { - b'A' | b'a' => Some(0x00), - b'C' | b'c' => Some(0x01), - b'G' | b'g' => Some(0x02), - b'T' | b't' => Some(0x03), - _ => None, - } -} - -#[cfg(feature = "protein")] -#[inline] -pub fn char_to_value(c: u8) -> Option<64> { - match c { - // stop codons/rare amino acids - b'*' | b'U' | b'u' | b'O' | b'o' => Some(0x00), - // alanine - b'A' | b'a' => Some(0x01), - // asparagine, glutamine, serine - b'N' | b'n' | b'Q' | b'q' | b'S' | b's' => Some(0x02), - // cysteine - b'C' | b'c' => Some(0x03), - // aspartic acid, glutamic acid - b'D' | b'd' | b'E' | b'e' => Some(0x04), - // phenylalanine - b'F' | b'f' => Some(0x05), - // glycine - b'G' | b'g' => Some(0x06), - // histidine - b'H' | b'h' => Some(0x07), - // isoleucine, leucine - b'I' | b'i' | b'L' | b'l' => Some(0x08), - // lysine - b'K' | b'k' => Some(0x09), - // proline - b'P' | b'p' => Some(0x0a), - // arginine - b'R' | b'r' => Some(0x0b), - // methionine, valine - b'M' | b'm' | b'V' | b'v' => Some(0x0c), - // threonine - b'T' | b't' => Some(0x0d), - // tryptophan - b'W' | b'w' => Some(0x0e), - // tyrosine - b'Y' | b'y' => Some(0x0f), - _ => None, - } -} - -#[inline] -fn reverse_complement(mut kmer: u64, n: usize) -> u64 { - // Reverse bits while leaving bit pairs (nucleotides) intact. 
- - // Swap consecutive pairs of bits - kmer = (kmer >> 2 & 0x3333333333333333) | (kmer << 2 & 0xCCCCCCCCCCCCCCCC); - - // Swap consecutive nibbles (4-bit groups) - kmer = (kmer >> 4 & 0x0F0F0F0F0F0F0F0F) | (kmer << 4 & 0xF0F0F0F0F0F0F0F0); - - // Swap consecutive bytes - kmer = (kmer >> 8 & 0x00FF00FF00FF00FF) | (kmer << 8 & 0xFF00FF00FF00FF00); - - // Swap consecutive pairs of bytes - kmer = (kmer >> 16 & 0x0000FFFF0000FFFF) | (kmer << 16 & 0xFFFF0000FFFF0000); - - // Swap the two halves of the 64-bit word - kmer = (kmer >> 32) | (kmer << 32); - - // Complement the bits, shift to the right length, and mask to get the desired length - (!kmer >> (64 - n * 2)) & ((1u64 << (n * 2)) - 1) - - // if revcom_version == 0 { - // // Complement the bits and mask to get the desired length - // !kmer & ((1u64 << (n * 2)) - 1) - // } else { - // // Complement the bits, shift to the right length, and mask to get the desired length - // (!kmer >> (64 - n * 2)) & ((1u64 << (n * 2)) - 1) - // } -} - -#[cfg(feature = "dna")] -#[inline] -pub fn canonical_representation(kmer: u64, n: usize) -> u64 { - let revcom = reverse_complement(kmer, n); - if kmer < revcom { - kmer - } else { - revcom - } -} - -#[cfg(feature = "protein")] -#[inline] -pub fn canonical_representation(kmer: u64, n: usize, revcom_version: u8) -> u64 { - kmer -} - -pub const DEFAULT_TOGGLE_MASK: u64 = 0xe37e28c4271b5a2d; -pub const DEFAULT_SPACED_SEED_MASK: u64 = 0; -pub const CURRENT_REVCOM_VERSION: u8 = 1; - -// 声明常量 -const M1: u64 = 0xff51afd7ed558ccd; -const M2: u64 = 0xc4ceb9fe1a85ec53; - -/// -/// # Examples -/// -/// ``` -/// # use seqkmer::fmix64; -/// let key: u64 = 123; -/// let hash = fmix64(key); -/// assert_eq!(hash, 9208534749291869864); -/// ``` -#[inline] -pub fn fmix64(key: u64) -> u64 { - let mut k = key; - k ^= k >> 33; - k = k.wrapping_mul(M1); - k ^= k >> 33; - k = k.wrapping_mul(M2); - k ^= k >> 33; - k -} - -/// minimizer config -#[derive(Copy, Debug, Clone)] -pub struct Meros { - pub k_mer: usize, - pub l_mer: usize, - pub mask: u64, - pub spaced_seed_mask: u64, - pub toggle_mask: u64, - pub min_clear_hash_value: Option, -} - -impl Meros { - pub fn new( - k_mer: usize, - l_mer: usize, - spaced_seed_mask: Option, - toggle_mask: Option, - min_clear_hash_value: Option, - ) -> Self { - let mut mask = 1u64; - mask <<= l_mer * constants::BITS_PER_CHAR; - mask -= 1; - - Self { - k_mer, - l_mer, - mask, - spaced_seed_mask: spaced_seed_mask.unwrap_or(DEFAULT_SPACED_SEED_MASK), - toggle_mask: toggle_mask.unwrap_or(DEFAULT_TOGGLE_MASK) & mask, - min_clear_hash_value, - } - } - - pub fn window_size(&self) -> usize { - self.k_mer - self.l_mer - } -} - -impl Default for Meros { - fn default() -> Self { - let l_mer = constants::DEFAULT_MINIMIZER_LENGTH as usize; - let k_mer = constants::DEFAULT_KMER_LENGTH as usize; - let mut mask = 1u64; - mask <<= l_mer * constants::BITS_PER_CHAR; - mask -= 1; - - Self { - k_mer, - l_mer, - mask, - spaced_seed_mask: DEFAULT_SPACED_SEED_MASK, - toggle_mask: DEFAULT_TOGGLE_MASK & mask, - min_clear_hash_value: None, - } - } -} diff --git a/seqkmer/src/lib.rs b/seqkmer/src/lib.rs deleted file mode 100644 index 3beae3e..0000000 --- a/seqkmer/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -mod fasta; -mod fastq; -mod fastx; -mod feat; -mod mmscanner; -mod parallel; -mod reader; -mod seq; -mod utils; - -pub use fasta::*; -pub use fastq::*; -pub use fastx::*; -pub use feat::constants::*; -pub use feat::*; -pub use mmscanner::MinimizerIterator; -pub use parallel::*; -pub use reader::*; -pub use seq::*; -pub use 
utils::OptionPair; diff --git a/seqkmer/src/mmscanner.rs b/seqkmer/src/mmscanner.rs deleted file mode 100644 index b1a64be..0000000 --- a/seqkmer/src/mmscanner.rs +++ /dev/null @@ -1,266 +0,0 @@ -// kraken 2 使用的是murmur_hash3 算法的 fmix64作为 hash -use crate::seq::Base; -use crate::utils::OptionPair; -use crate::{ - canonical_representation, char_to_value, fmix64 as murmur_hash3, Meros, BITS_PER_CHAR, -}; -use std::collections::VecDeque; - -#[inline] -fn to_candidate_lmer(meros: &Meros, lmer: u64) -> u64 { - let mut canonical_lmer = canonical_representation(lmer, meros.l_mer); - if meros.spaced_seed_mask > 0 { - canonical_lmer &= meros.spaced_seed_mask; - } - canonical_lmer ^ meros.toggle_mask -} - -#[derive(Debug)] -pub struct MinimizerData { - pos: usize, - candidate_lmer: u64, -} - -impl MinimizerData { - fn new(candidate_lmer: u64, pos: usize) -> Self { - Self { - candidate_lmer, - pos, - } - } -} - -pub struct MinimizerWindow { - queue: VecDeque, - queue_pos: usize, - /// 窗口队列的大小 - capacity: usize, - /// 队列计数 - count: usize, -} - -impl MinimizerWindow { - fn new(capacity: usize) -> Self { - Self { - queue: VecDeque::with_capacity(capacity), - capacity, - count: 0, - queue_pos: 0, - } - } - - #[inline] - fn next(&mut self, candidate_lmer: u64) -> Option { - // 无需比较,直接返回 - if self.capacity == 1 { - return Some(candidate_lmer); - } - - let data = MinimizerData::new(candidate_lmer, self.count); - - // 移除队列中所有比当前元素大的元素的索引 - // 因为它们不可能是当前窗口的最小值 - while let Some(m_data) = self.queue.back() { - if m_data.candidate_lmer > candidate_lmer { - self.queue.pop_back(); - } else { - break; - } - } - let mut changed = false; - - if (self.queue.is_empty() && self.count >= self.capacity) || self.count == self.capacity { - changed = true - } - // 将当前元素的索引添加到队列 - self.queue.push_back(data); - - while !self.queue.is_empty() - && self.queue.front().map_or(false, |front| { - self.count >= self.capacity && front.pos < self.count - self.capacity - }) - { - self.queue.pop_front(); - changed = true; - } - - self.count += 1; - if changed { - self.queue.front().map(|front| front.candidate_lmer) - } else { - None - } - } - - fn clear(&mut self) { - self.count = 0; - self.queue_pos = 0; - self.queue.clear(); - } -} - -#[derive(Clone, Copy)] -pub struct Cursor { - pos: usize, - capacity: usize, - value: u64, - mask: u64, -} - -impl Cursor { - fn new(capacity: usize, mask: u64) -> Self { - Self { - pos: 0, - value: 0, - capacity, - mask, - } - } - - fn next_lmer(&mut self, item: u64) -> Option { - self.value = ((self.value << BITS_PER_CHAR) | item) & self.mask; - // 更新当前位置 - self.pos += 1; - // 检查是否达到了容量 - if self.pos >= self.capacity { - return Some(self.value); - } - None - } - - // 清除元素 - #[inline] - fn clear(&mut self) { - self.pos = 0; - self.value = 0; - } -} - -pub struct MinimizerIterator<'a> { - cursor: Cursor, - window: MinimizerWindow, - seq: &'a [u8], - meros: &'a Meros, - pos: usize, - end: usize, - pub size: usize, -} - -impl<'a> MinimizerIterator<'a> { - pub fn new(seq: &'a [u8], cursor: Cursor, window: MinimizerWindow, meros: &'a Meros) -> Self { - MinimizerIterator { - cursor, - window, - seq, - meros, - pos: 0, - size: 0, - end: seq.len(), - } - } - - fn clear_state(&mut self) { - self.cursor.clear(); - self.window.clear(); - } - - pub fn seq_size(&self) -> usize { - self.end - } -} - -impl<'a> Iterator for MinimizerIterator<'a> { - type Item = (usize, u64); - - fn next(&mut self) -> Option { - while self.pos < self.end { - let ch = self.seq[self.pos]; - self.pos += 1; - if ch == b'\n' || ch == b'\r' { - 
continue; - } else { - let data = match char_to_value(ch) { - Some(code) => self.cursor.next_lmer(code).and_then(|lmer| { - let candidate_lmer = to_candidate_lmer(&self.meros, lmer); - self.window - .next(candidate_lmer) - .map(|minimizer| murmur_hash3(minimizer ^ self.meros.toggle_mask)) - }), - None => { - self.clear_state(); - None - } - }; - if data.is_some() { - self.size += 1; - return Some((self.size, data.unwrap())); - } - } - } - None - } -} - -impl<'a> Base> { - pub fn seq_size_str(&self) -> OptionPair { - self.body.apply(|m_iter| m_iter.seq_size().to_string()) - } - - pub fn fmt_seq_size(&self) -> String { - self.body - .reduce_str("|", |m_iter| m_iter.seq_size().to_string()) - } - - pub fn fmt_size(&self) -> String { - self.body.reduce_str("|", |m_iter| m_iter.size.to_string()) - } - - pub fn fold(&mut self, mut f: F) -> Vec - where - F: FnMut(&mut Vec, &mut MinimizerIterator<'a>, usize) -> usize, - T: Clone, - { - let mut init = Vec::new(); - match &mut self.body { - OptionPair::Single(m_iter) => { - f(&mut init, m_iter, 0); - } - OptionPair::Pair(m_iter1, m_iter2) => { - let offset = f(&mut init, m_iter1, 0); - f(&mut init, m_iter2, offset); - } - } - init - } - - pub fn range(&self) -> OptionPair<(usize, usize)> { - match &self.body { - OptionPair::Single(m_iter) => OptionPair::Single((0, m_iter.size)), - OptionPair::Pair(m_iter1, m_iter2) => { - let size1 = m_iter1.size; - OptionPair::Pair((0, size1), (size1, m_iter2.size + size1)) - } - } - } -} - -pub fn scan_sequence<'a>( - sequence: &'a Base>, - meros: &'a Meros, -) -> Base> { - let func = |seq: &'a Vec| { - let cursor = Cursor::new(meros.l_mer, meros.mask); - let window = MinimizerWindow::new(meros.window_size()); - MinimizerIterator::new(seq, cursor, window, meros) - }; - - match &sequence.body { - OptionPair::Pair(seq1, seq2) => Base::new( - sequence.header.clone(), - OptionPair::Pair(func(&seq1), func(&seq2)), - ), - OptionPair::Single(seq1) => { - Base::new(sequence.header.clone(), OptionPair::Single(func(&seq1))) - } - } -} diff --git a/seqkmer/src/parallel.rs b/seqkmer/src/parallel.rs deleted file mode 100644 index 1218a9f..0000000 --- a/seqkmer/src/parallel.rs +++ /dev/null @@ -1,236 +0,0 @@ -use crate::mmscanner::{scan_sequence, MinimizerIterator}; -use crate::reader::Reader; -use crate::seq::{Base, SeqFormat}; -use crate::{detect_file_format, FastaReader, FastqReader, Meros}; -use crossbeam_channel::{bounded, Receiver}; -use scoped_threadpool::Pool; -use std::collections::HashMap; -use std::io::Result; -use std::sync::Arc; - -pub struct ParallelItem

<P>(P); - -impl<P> ParallelItem<P> { - pub fn unwrap(self) -> P { - self.0 - } -} - -pub struct ParallelResult<P> -where - P: Send, -{ - recv: Receiver<P>, -} - -impl<P> ParallelResult<P>
-where - P: Send, -{ - #[inline] - pub fn next(&mut self) -> Option> { - self.recv.recv().ok().map(ParallelItem) - } -} - -pub fn create_reader( - file_pair: &[String], - file_index: usize, - score: i32, -) -> Result> { - // let mut files_iter = file_pair.iter(); - let paths = crate::OptionPair::from_slice(file_pair); - - match detect_file_format(&file_pair[0])? { - SeqFormat::Fastq => Ok(Box::new(FastqReader::from_path(paths, file_index, score)?)), - SeqFormat::Fasta => Ok(Box::new(FastaReader::from_path(&file_pair[0], file_index)?)), - } -} - -pub fn read_parallel( - reader: &mut R, - n_threads: usize, - meros: &Meros, - work: W, - func: F, -) -> Result<()> -where - R: Reader, - O: Send, - Out: Send + Default, - W: Send + Sync + Fn(&mut Vec>) -> O, - F: FnOnce(&mut ParallelResult) -> Out + Send, -{ - assert!(n_threads > 2); - let buffer_len = n_threads + 2; - let (sender, receiver) = bounded::>>>(buffer_len); - let (done_send, done_recv) = bounded::(buffer_len); - let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver - let done_send = Arc::new(done_send); - let mut pool = Pool::new(n_threads as u32); - - let mut parallel_result = ParallelResult { recv: done_recv }; - - pool.scoped(|pool_scope| { - // 生产者线程 - pool_scope.execute(move || { - while let Ok(Some(seqs)) = reader.next() { - sender.send(seqs).expect("Failed to send sequences"); - } - }); - - // 消费者线程 - for _ in 0..n_threads - 2 { - let receiver = Arc::clone(&receiver); - let work = &work; - let done_send = Arc::clone(&done_send); - pool_scope.execute(move || { - while let Ok(mut seqs) = receiver.recv() { - let mut markers: Vec>> = seqs - .iter_mut() - .map(|seq| scan_sequence(seq, &meros)) - .collect(); - let output = work(&mut markers); - done_send.send(output).expect("Failed to send outputs"); - } - }); - } - - // 引用计数减掉一个,这样都子线程结束时, done_send还能完全释放 - drop(done_send); - pool_scope.execute(move || { - let _ = func(&mut parallel_result); - }); - - pool_scope.join_all(); - }); - - Ok(()) -} - -pub fn buffer_read_parallel( - reader: &mut R, - n_threads: usize, - buffer_size: usize, - work: W, - func: F, -) -> Result<()> -where - D: Send + Sized + Sync + Clone, - R: std::io::Read + Send, - O: Send, - Out: Send + Default, - W: Send + Sync + Fn(Vec) -> O, - F: FnOnce(&mut ParallelResult) -> Out + Send, -{ - assert!(n_threads > 2); - let buffer_len = n_threads + 2; - let (sender, receiver) = bounded::>(buffer_len); - let (done_send, done_recv) = bounded::(buffer_len); - let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver - let done_send = Arc::new(done_send); - let mut pool = Pool::new(n_threads as u32); - - let slot_size = std::mem::size_of::(); - let mut parallel_result = ParallelResult { recv: done_recv }; - - pool.scoped(|pool_scope| { - // 生产者线程 - pool_scope.execute(move || { - let mut batch_buffer = vec![0u8; slot_size * buffer_size]; - - while let Ok(bytes_read) = reader.read(&mut batch_buffer) { - if bytes_read == 0 { - break; - } // 文件末尾 - - let slots_in_batch = bytes_read / slot_size; - let slots = unsafe { - std::slice::from_raw_parts(batch_buffer.as_ptr() as *const D, slots_in_batch) - }; - sender - .send(slots.to_vec()) - .expect("Failed to send sequences"); - } - }); - - // 消费者线程 - for _ in 0..n_threads - 2 { - let receiver = Arc::clone(&receiver); - let work = &work; - let done_send = Arc::clone(&done_send); - pool_scope.execute(move || { - while let Ok(seqs) = receiver.recv() { - let output = work(seqs); - done_send.send(output).expect("Failed to send outputs"); - } - }); - } - - // 引用计数减掉一个,这样都子线程结束时, 
done_send还能完全释放 - drop(done_send); - pool_scope.execute(move || { - let _ = func(&mut parallel_result); - }); - - pool_scope.join_all(); - }); - - Ok(()) -} - -pub fn buffer_map_parallel( - map: &HashMap>, - n_threads: usize, - work: W, - func: F, -) -> Result<()> -where - D: Send + Sized + Sync, - O: Send, - Out: Send + Default, - W: Send + Sync + Fn((&u32, &Vec)) -> O, - F: FnOnce(&mut ParallelResult) -> Out + Send, -{ - assert!(n_threads > 2); - let buffer_len = n_threads + 2; - let (sender, receiver) = bounded::<(&u32, &Vec)>(buffer_len); - let (done_send, done_recv) = bounded::(buffer_len); - let receiver = Arc::new(receiver); // 使用 Arc 来共享 receiver - let done_send = Arc::new(done_send); - let mut pool = Pool::new(n_threads as u32); - - let mut parallel_result = ParallelResult { recv: done_recv }; - - pool.scoped(|pool_scope| { - // 生产者线程 - pool_scope.execute(move || { - for entry in map { - sender.send(entry).expect("Failed to send sequences"); - } - }); - - // 消费者线程 - for _ in 0..n_threads - 2 { - let receiver = Arc::clone(&receiver); - let work = &work; - let done_send = Arc::clone(&done_send); - pool_scope.execute(move || { - while let Ok(seqs) = receiver.recv() { - let output = work(seqs); - done_send.send(output).expect("Failed to send outputs"); - } - }); - } - - // 引用计数减掉一个,这样都子线程结束时, done_send还能完全释放 - drop(done_send); - pool_scope.execute(move || { - let _ = func(&mut parallel_result); - }); - - pool_scope.join_all(); - }); - - Ok(()) -} diff --git a/seqkmer/src/reader.rs b/seqkmer/src/reader.rs deleted file mode 100644 index 334ea63..0000000 --- a/seqkmer/src/reader.rs +++ /dev/null @@ -1,202 +0,0 @@ -use crate::seq::{Base, SeqFormat}; -use crate::utils::OptionPair; -use flate2::read::GzDecoder; -use std::fmt; -use std::fs::File; -use std::io::{self, BufRead, BufReader, Read, Result, Seek}; -use std::path::Path; - -pub(crate) fn dyn_reader>(path: P) -> Result> { - let mut file = open_file(path)?; - if is_gzipped(&mut file)? 
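The deleted read_parallel, buffer_read_parallel and buffer_map_parallel helpers above all share one topology: a single producer feeds a bounded channel of capacity n_threads + 2, n_threads - 2 workers drain it, and one collector consumes the outputs through a second bounded channel (hence the assert!(n_threads > 2) and the explicit drop(done_send) so the collector sees the channel close). The following is a minimal, self-contained sketch of that shape only; it uses std::thread::scope and toy String items instead of the scoped_threadpool pool and sequence readers of the original, so names and the work closure are illustrative, not the crate's API.

use crossbeam_channel::bounded;
use std::thread;

// One producer -> (n_threads - 2) workers -> one collector, all over bounded channels.
fn pipeline(items: Vec<String>, n_threads: usize) -> usize {
    assert!(n_threads > 2);
    let (task_tx, task_rx) = bounded::<String>(n_threads + 2);
    let (out_tx, out_rx) = bounded::<usize>(n_threads + 2);

    thread::scope(|s| {
        // Producer: feed every item, then close the channel by dropping the sender.
        s.spawn(move || {
            for item in items {
                task_tx.send(item).expect("send task");
            }
        });

        // Workers: share the multi-consumer receiver and do the per-item work.
        for _ in 0..n_threads - 2 {
            let rx = task_rx.clone();
            let tx = out_tx.clone();
            s.spawn(move || {
                while let Ok(item) = rx.recv() {
                    tx.send(item.len()).expect("send result");
                }
            });
        }
        // Drop the originals so the output channel closes once every worker
        // is done (this mirrors the `drop(done_send)` in the code above).
        drop(task_rx);
        drop(out_tx);

        // Collector: the single consumer of the result channel.
        s.spawn(move || out_rx.iter().sum::<usize>()).join().unwrap()
    })
}

fn main() {
    let items = vec!["ACGT".to_string(), "ACGTACGT".to_string()];
    assert_eq!(pipeline(items, 4), 12);
}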
{ - let decoder = GzDecoder::new(file); - Ok(Box::new(decoder)) - } else { - Ok(Box::new(file)) - } -} - -pub(crate) fn is_gzipped(file: &mut File) -> Result { - let mut buffer = [0; 2]; - file.read_exact(&mut buffer)?; - file.rewind()?; // 重置文件指针到开头 - Ok(buffer == [0x1F, 0x8B]) -} - -pub fn trim_pair_info(id: &str) -> String { - let sz = id.len(); - if sz <= 2 { - return id.to_string(); - } - if id.ends_with("/1") || id.ends_with("/2") { - return id[0..sz - 2].to_string(); - } - id.to_string() -} - -pub fn open_file>(path: P) -> Result { - File::open(&path).map_err(|e| { - if e.kind() == io::ErrorKind::NotFound { - io::Error::new(e.kind(), format!("File not found: {:?}", path.as_ref())) - } else { - e - } - }) -} - -pub(crate) fn detect_file_format>(path: P) -> io::Result { - // let mut file = open_file(path)?; - let read1: Box = dyn_reader(path)?; - let reader = BufReader::new(read1); - let mut lines = reader.lines(); - - if let Some(first_line) = lines.next() { - let line = first_line?; - - if line.starts_with('>') { - return Ok(SeqFormat::Fasta); - } else if line.starts_with('@') { - let _ = lines.next(); - if let Some(third_line) = lines.next() { - let line: String = third_line?; - if line.starts_with('+') { - return Ok(SeqFormat::Fastq); - } - } - } else { - return Err(io::Error::new( - io::ErrorKind::Other, - "Unrecognized fasta(fastq) file format", - )); - } - } - - Err(io::Error::new( - io::ErrorKind::Other, - "Unrecognized fasta(fastq) file format", - )) -} - -pub(crate) fn trim_end(buffer: &mut Vec) { - while let Some(&b'\n' | &b'\r' | &b'>' | &b'@') = buffer.last() { - buffer.pop(); - } -} - -pub const BUFSIZE: usize = 16 * 1024 * 1024; - -pub trait Reader: Send { - fn next(&mut self) -> Result>>>>; -} - -impl Reader for Box { - fn next(&mut self) -> Result>>>> { - (**self).next() - } -} - -#[derive(Debug)] -pub struct PosData { - /// 外部 taxonomy id - pub ext_code: u64, - /// 连续命中次数 - pub count: usize, -} - -impl PosData { - pub fn new(ext_code: u64, count: usize) -> Self { - Self { ext_code, count } - } -} - -impl fmt::Display for PosData { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}:{}", self.ext_code, self.count) - } -} - -/// 序列的命中分布 -#[derive(Debug)] -pub struct SpaceDist { - pub value: Vec, - /// example: (0, 10], 左开右闭 - pub range: (usize, usize), - pos: usize, -} - -impl SpaceDist { - pub fn new(range: (usize, usize)) -> Self { - Self { - value: Vec::new(), - range, - pos: range.0, - } - } - - fn fill_with_zeros(&mut self, gap: usize) { - if gap > 0 { - self.value.push(PosData::new(0, gap)); - } - } - - pub fn add(&mut self, ext_code: u64, pos: usize) { - if pos <= self.pos || pos > self.range.1 { - return; // 早期返回,不做任何处理 - } - let gap = pos - self.pos - 1; - - if gap > 0 { - self.fill_with_zeros(gap); - } - - if let Some(last) = self.value.last_mut() { - if last.ext_code == ext_code { - last.count += 1; - } else { - self.value.push(PosData::new(ext_code, 1)); - } - } else { - self.value.push(PosData::new(ext_code, 1)); - } - self.pos = pos; - } - - /// Fills the end of the distribution with zeros if there is remaining space. 
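SpaceDist above is essentially a run-length encoding of per-position hits over a left-open, right-closed range: consecutive positions that hit the same external taxid collapse into one `ext_code:count` pair, gaps become `0:<gap>` runs, and the remaining space at the end is padded with zeros by fill_tail_with_zeros below. A small self-contained sketch of that encoding, assuming hits arrive as (position, taxid) pairs in increasing position order (it does not use the PosData/SpaceDist types being deleted here, only the same rules):

// Run-length encode sparse hits over the half-open range (start, end].
fn encode_dist(hits: &[(usize, u64)], range: (usize, usize)) -> String {
    let (start, end) = range;
    let mut runs: Vec<(u64, usize)> = Vec::new();
    let mut pos = start;

    for &(p, code) in hits {
        if p <= pos || p > end {
            continue; // out of range or not strictly increasing: ignored, as in add()
        }
        let gap = p - pos - 1;
        if gap > 0 {
            runs.push((0, gap)); // unhit positions become a zero run
        }
        let same_as_last = runs.last().map_or(false, |&(c, _)| c == code);
        if same_as_last {
            runs.last_mut().unwrap().1 += 1;
        } else {
            runs.push((code, 1));
        }
        pos = p;
    }
    if pos < end {
        runs.push((0, end - pos)); // tail padding, like fill_tail_with_zeros
    }

    runs.iter()
        .map(|(code, count)| format!("{}:{}", code, count))
        .collect::<Vec<_>>()
        .join(" ")
}

fn main() {
    assert_eq!(encode_dist(&[(2, 9), (3, 9), (6, 7)], (0, 8)), "0:1 9:2 0:2 7:1 0:2");
}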
- pub fn fill_tail_with_zeros(&mut self) { - if self.pos < self.range.1 { - self.fill_with_zeros(self.range.1 - self.pos); - self.pos = self.range.1; - } - } -} - -impl fmt::Display for SpaceDist { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for (i, data) in self.value.iter().enumerate() { - if i > 0 { - write!(f, " ")?; - } - write!(f, "{}", data)?; - } - write!(f, "") - } -} - -impl OptionPair { - pub fn add(&mut self, ext_code: u64, pos: usize) { - match self { - OptionPair::Single(sd) => sd.add(ext_code, pos), - OptionPair::Pair(sd1, sd2) => { - if pos > sd1.range.1 { - sd2.add(ext_code, pos) - } else { - sd1.add(ext_code, pos) - } - } - } - } - - pub fn fill_tail_with_zeros(&mut self) { - self.apply_mut(|sd| sd.fill_tail_with_zeros()); - } -} diff --git a/seqkmer/src/seq.rs b/seqkmer/src/seq.rs deleted file mode 100644 index aacfbaf..0000000 --- a/seqkmer/src/seq.rs +++ /dev/null @@ -1,37 +0,0 @@ -use crate::utils::OptionPair; - -#[derive(Debug, Clone, PartialEq, Eq, Copy)] -pub enum SeqFormat { - Fasta, - Fastq, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct SeqHeader { - pub id: String, - pub file_index: usize, - pub reads_index: usize, - pub format: SeqFormat, -} - -#[derive(Debug)] -pub struct Base { - pub header: SeqHeader, - pub body: OptionPair, -} - -impl Base { - pub fn new(header: SeqHeader, body: OptionPair) -> Self { - Self { header, body } - } - - pub fn map(&self, mut f: F) -> Result, E> - where - F: FnMut(&T) -> Result, - { - self.body.map(|t| f(&t)).map(|body| Base { - header: self.header.clone(), - body, - }) - } -} diff --git a/seqkmer/src/utils.rs b/seqkmer/src/utils.rs deleted file mode 100644 index 10138ac..0000000 --- a/seqkmer/src/utils.rs +++ /dev/null @@ -1,94 +0,0 @@ -#[derive(Debug, Clone)] -pub enum OptionPair { - Single(T), - Pair(T, T), -} - -impl OptionPair { - pub fn single(&self) -> Option<&T> { - match self { - OptionPair::Single(value) => Some(value), - _ => None, - } - } - - // 它接受一个泛型闭包 F,并返回一个新的 OptionPair - pub fn map(&self, mut f: F) -> Result, E> - where - F: FnMut(&T) -> Result, - { - match self { - OptionPair::Single(t) => f(t).map(OptionPair::Single), - OptionPair::Pair(t1, t2) => { - let u1 = f(t1)?; - let u2 = f(t2)?; - Ok(OptionPair::Pair(u1, u2)) - } - } - } - - pub fn reduce(&self, init: U, mut f: F) -> U - where - F: FnMut(U, &T) -> U, - { - match self { - OptionPair::Single(t) => f(init, t), - OptionPair::Pair(t1, t2) => { - let result = f(init, t1); - f(result, t2) - } - } - } - - pub fn reduce_str(&self, sep: &str, mut f: F) -> String - where - F: FnMut(&T) -> String, - { - self.reduce(String::new(), |acc, t| { - if acc.is_empty() { - f(t) - } else { - format!("{}{}{}", acc, sep, f(t)) - } - }) - } - - pub fn apply(&self, mut f: F) -> OptionPair - where - F: FnMut(&T) -> U, - { - match self { - OptionPair::Single(t) => OptionPair::Single(f(t)), - OptionPair::Pair(t1, t2) => OptionPair::Pair(f(t1), f(t2)), - } - } - - pub fn apply_mut(&mut self, mut f: F) -> OptionPair - where - F: FnMut(&mut T) -> U, - { - match self { - OptionPair::Single(t) => OptionPair::Single(f(t)), - OptionPair::Pair(t1, t2) => OptionPair::Pair(f(t1), f(t2)), - } - } -} - -impl OptionPair { - pub fn from_slice(slice: &[T]) -> OptionPair { - match slice { - [a, b] => OptionPair::Pair(a.clone(), b.clone()), - [a] => OptionPair::Single(a.clone()), - _ => unreachable!(), - } - } -} - -impl From<(T, Option)> for OptionPair { - fn from(tuple: (T, Option)) -> Self { - match tuple { - (a, Some(b)) => OptionPair::Pair(a, b), - (a, None) => 
OptionPair::Single(a), - } - } -} diff --git a/kr2r/src/args.rs b/src/args.rs similarity index 95% rename from kr2r/src/args.rs rename to src/args.rs index 0c166e0..2294a90 100644 --- a/kr2r/src/args.rs +++ b/src/args.rs @@ -1,4 +1,3 @@ -// 使用时需要引用模块路径 use crate::utils::expand_spaced_seed_mask; use crate::{construct_seed_template, parse_binary}; use clap::Parser; @@ -168,6 +167,18 @@ impl KLMTArgs { } } +/// Parse size string to usize +/// +/// # Examples +/// +/// ``` +/// use kun_peng::args::parse_size; +/// +/// assert_eq!(parse_size("1G"), Ok(1_073_741_824)); +/// assert_eq!(parse_size("1M"), Ok(1_048_576)); +/// assert_eq!(parse_size("1K"), Ok(1_024)); +/// assert!(parse_size("1B").is_err()); +/// ``` pub fn parse_size(s: &str) -> Result { let len = s.len(); if len < 2 { diff --git a/kr2r/src/bin/annotate.rs b/src/bin/annotate.rs similarity index 100% rename from kr2r/src/bin/annotate.rs rename to src/bin/annotate.rs diff --git a/kr2r/src/bin/build_k2_db.rs b/src/bin/build_k2_db.rs similarity index 100% rename from kr2r/src/bin/build_k2_db.rs rename to src/bin/build_k2_db.rs diff --git a/kr2r/src/bin/chunk_db.rs b/src/bin/chunk_db.rs similarity index 100% rename from kr2r/src/bin/chunk_db.rs rename to src/bin/chunk_db.rs diff --git a/kr2r/src/bin/direct.rs b/src/bin/direct.rs similarity index 100% rename from kr2r/src/bin/direct.rs rename to src/bin/direct.rs diff --git a/kr2r/src/bin/estimate_capacity.rs b/src/bin/estimate_capacity.rs similarity index 100% rename from kr2r/src/bin/estimate_capacity.rs rename to src/bin/estimate_capacity.rs diff --git a/kr2r/src/bin/hashshard.rs b/src/bin/hashshard.rs similarity index 100% rename from kr2r/src/bin/hashshard.rs rename to src/bin/hashshard.rs diff --git a/kr2r/src/bin/kun.rs b/src/bin/kun.rs similarity index 100% rename from kr2r/src/bin/kun.rs rename to src/bin/kun.rs diff --git a/kr2r/src/bin/merge_fna.rs b/src/bin/merge_fna.rs similarity index 100% rename from kr2r/src/bin/merge_fna.rs rename to src/bin/merge_fna.rs diff --git a/kr2r/src/bin/resolve.rs b/src/bin/resolve.rs similarity index 100% rename from kr2r/src/bin/resolve.rs rename to src/bin/resolve.rs diff --git a/kr2r/src/bin/splitr.rs b/src/bin/splitr.rs similarity index 100% rename from kr2r/src/bin/splitr.rs rename to src/bin/splitr.rs diff --git a/kr2r/src/classify.rs b/src/classify.rs similarity index 58% rename from kr2r/src/classify.rs rename to src/classify.rs index 671c121..0d497ec 100644 --- a/kr2r/src/classify.rs +++ b/src/classify.rs @@ -6,6 +6,20 @@ use seqkmer::SpaceDist; use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; +/// Resolves the taxonomic classification based on hit counts and taxonomy. +/// +/// This function determines the most likely taxonomic classification for a sequence +/// based on the hit counts for different taxa and the taxonomic hierarchy. +/// +/// # Arguments +/// +/// * `hit_counts` - A HashMap containing the hit counts for each taxon. +/// * `taxonomy` - The Taxonomy object representing the taxonomic hierarchy. +/// * `required_score` - The minimum score required for a classification to be considered valid. +/// +/// # Returns +/// +/// Returns the taxon ID of the resolved classification. pub fn resolve_tree( hit_counts: &HashMap, taxonomy: &Taxonomy, @@ -49,6 +63,22 @@ pub fn resolve_tree( max_taxon } +/// Processes hit statistics for a group of hits. 
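resolve_tree, documented in classify.rs above, makes the Kraken 2-style per-read call: every taxon that received hits is scored by the hits on its root-to-taxon path, the best-scoring taxon wins, and if that score does not reach required_score the call falls back towards the root (or ends up unclassified). The following is a deliberately simplified, self-contained model of that idea over a toy child-to-parent map; it omits the LCA tie-breaking and the exact fallback bookkeeping of the real function, so treat it as an illustration rather than the implementation.

use std::collections::HashMap;

// Toy taxonomy: child -> parent, with 1 as the root (the root's parent is 0 / absent).
fn is_ancestor(parents: &HashMap<u32, u32>, a: u32, b: u32) -> bool {
    let mut cur = b;
    while cur != 0 {
        if cur == a {
            return true;
        }
        cur = *parents.get(&cur).unwrap_or(&0);
    }
    false
}

fn resolve(parents: &HashMap<u32, u32>, hits: &HashMap<u32, u64>, required_score: u64) -> u32 {
    // Phase 1: best taxon by path score (hits on itself plus all of its ancestors).
    let (mut best, mut best_score) = (0u32, 0u64);
    for (&taxon, _) in hits {
        let score: u64 = hits
            .iter()
            .filter(|(&t, _)| is_ancestor(parents, t, taxon))
            .map(|(_, &c)| c)
            .sum();
        if score > best_score {
            best = taxon;
            best_score = score;
        }
    }
    // Phase 2: if the score is below the threshold, fall back towards the root,
    // using the clade score (hits anywhere below the candidate) at each step.
    while best != 0 && best_score < required_score {
        best_score = hits
            .iter()
            .filter(|(&t, _)| is_ancestor(parents, best, t))
            .map(|(_, &c)| c)
            .sum();
        if best_score < required_score {
            best = *parents.get(&best).unwrap_or(&0);
        }
    }
    best
}

fn main() {
    // toy tree: 1 (root) -> 2 -> 3, and 1 -> 4
    let parents = HashMap::from([(2, 1), (3, 2), (4, 1)]);
    let hits = HashMap::from([(3u32, 5u64), (4, 1)]);
    assert_eq!(resolve(&parents, &hits, 5), 3);
}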
+/// +/// This function calculates various statistics for a group of hits, including +/// updating hit counts, taxon counters, and computing space distribution. +/// +/// # Arguments +/// +/// * `hits` - The HitGroup to process. +/// * `counts` - A mutable reference to a HashMap to store hit counts. +/// * `value_mask` - A mask used for processing hit values. +/// * `taxonomy` - The Taxonomy object representing the taxonomic hierarchy. +/// * `cur_taxon_counts` - A mutable reference to TaxonCounters to update. +/// +/// # Returns +/// +/// Returns a String representing the space distribution of the hits. fn stat_hits<'a>( hits: &HitGroup, counts: &mut HashMap, @@ -77,6 +107,27 @@ fn stat_hits<'a>( space_dist.reduce_str(" |:| ", |str| str.to_string()) } +/// Processes a hit group to determine classification and gather statistics. +/// +/// This function takes a hit group, processes it to determine the taxonomic +/// classification, and collects various statistics about the hits. +/// +/// # Arguments +/// +/// * `hits` - The HitGroup to process. +/// * `taxonomy` - The Taxonomy object representing the taxonomic hierarchy. +/// * `classify_counter` - An atomic counter for tracking classifications. +/// * `required_score` - The minimum score required for a classification to be considered valid. +/// * `minimum_hit_groups` - The minimum number of hit groups required for a valid classification. +/// * `value_mask` - A mask used for processing hit values. +/// +/// # Returns +/// +/// Returns a tuple containing: +/// 1. A String indicating the classification result ("C" for classified, "U" for unclassified). +/// 2. The external ID of the classified taxon. +/// 3. A String representing the hit statistics. +/// 4. The updated TaxonCounters. pub fn process_hitgroup( hits: &HitGroup, taxonomy: &Taxonomy, diff --git a/kr2r/src/compact_hash.rs b/src/compact_hash.rs similarity index 78% rename from kr2r/src/compact_hash.rs rename to src/compact_hash.rs index 58f5035..d5c3b96 100644 --- a/kr2r/src/compact_hash.rs +++ b/src/compact_hash.rs @@ -1,19 +1,95 @@ use byteorder::{ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt}; use std::cmp::Ordering as CmpOrdering; +use std::fmt::{self, Debug}; use std::fs::File; use std::fs::OpenOptions; use std::io::{BufWriter, Read, Result, Write}; use std::path::Path; -/// 1101010101 => left: 11010, right: 10101; +/// Trait for compact hash operations pub trait Compact: Default + PartialEq + Clone + Copy + Eq + Sized + Send + Sync + Debug { + /// Creates a compacted value from a hash key + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Compact; + /// + /// let compacted = u32::compacted(0x1234567890ABCDEF, 16); + /// assert_eq!(compacted, 0x1234); + /// ``` fn compacted(hash_key: u64, value_bits: usize) -> Self; + + /// Creates a hash value from a hash key and a value + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Compact; + /// + /// let hash_value = u32::hash_value(0x1234567890ABCDEF, 16, 0xABCD); + /// assert_eq!(hash_value, 0x1234ABCD); + /// ``` fn hash_value(hash_key: u64, value_bits: usize, value: Self) -> Self; + /// Returns the left part of the value + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Compact; + /// + /// let value: u32 = 0x1234ABCD; + /// assert_eq!(value.left(16), 0x1234); + /// ``` fn left(&self, value_bits: usize) -> Self; + + /// Returns the right part of the value + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Compact; + /// + /// let 
value: u32 = 0x1234ABCD; + /// assert_eq!(value.right(0xFFFF), 0xABCD); + /// ``` fn right(&self, value_mask: usize) -> Self; + + /// Combines left and right parts into a single value + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Compact; + /// + /// let combined = u32::combined(0x1234, 0xABCD, 16); + /// assert_eq!(combined, 0x1234ABCD); + /// ``` fn combined(left: Self, right: Self, value_bits: usize) -> Self; + + /// Converts the value to u32 + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Compact; + /// + /// let value: u32 = 0x1234ABCD; + /// assert_eq!(value.to_u32(), 0x1234ABCD); + /// ``` fn to_u32(&self) -> u32; + + /// Creates a value from u32 + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Compact; + /// + /// let value = u32::from_u32(0x1234ABCD); + /// assert_eq!(value, 0x1234ABCD); + /// ``` fn from_u32(value: u32) -> Self; } @@ -82,6 +158,18 @@ pub struct Row { } impl Row { + /// Creates a new Row + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::Row; + /// + /// let row = Row::new(0x1234, 1, 2); + /// assert_eq!(row.value, 0x1234); + /// assert_eq!(row.seq_id, 1); + /// assert_eq!(row.kmer_id, 2); + /// ``` pub fn new(value: u32, seq_id: u32, kmer_id: u32) -> Self { Self { value, @@ -96,14 +184,14 @@ impl Row { } } -// 实现 PartialOrd,只比较 index 字段 +// Implement PartialOrd, comparing only the kmer_id field impl PartialOrd for Row { fn partial_cmp(&self, other: &Self) -> Option { self.kmer_id.partial_cmp(&other.kmer_id) } } -// 实现 Ord,只比较 index 字段 +// Implement Ord, comparing only the kmer_id field impl Ord for Row { fn cmp(&self, other: &Self) -> CmpOrdering { self.kmer_id.cmp(&other.kmer_id) @@ -124,6 +212,17 @@ impl Slot where B: Compact, { + /// Creates a new Slot + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::{Slot, Compact}; + /// + /// let slot = Slot::::new(1, 0x1234); + /// assert_eq!(slot.idx, 1); + /// assert_eq!(slot.value, 0x1234); + /// ``` pub fn new(idx: usize, value: B) -> Self { Self { idx, value } } @@ -136,12 +235,22 @@ where } impl Slot { - pub fn get_seq_id(&self) -> u64 { - self.value.right(0xFFFFFFFF) + /// Returns the sequence ID (lower 32 bits) for a u64 Slot + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::{Slot, Compact}; + /// + /// let slot = Slot::::new(1, 0x1234567890ABCDEF); + /// assert_eq!(slot.get_seq_id(), 0x90ABCDEF); + /// ``` + pub fn get_seq_id(&self) -> u32 { + self.value.right(0) as u32 } } -// 实现 PartialOrd,只比较 index 字段 +// Implement PartialOrd, comparing only the idx field impl PartialOrd for Slot where B: Compact, @@ -151,7 +260,7 @@ where } } -// 实现 Ord,只比较 index 字段 +// Implement Ord, comparing only the idx field impl Ord for Slot where B: Compact, @@ -161,27 +270,25 @@ where } } -use std::fmt::{self, Debug}; - #[derive(Clone, Copy)] pub struct HashConfig { // value_mask = ((1 << value_bits) - 1); pub value_mask: usize, - // 值的位数 + // Number of bits for the value pub value_bits: usize, - // 哈希表的容量 + // Capacity of the hash table pub capacity: usize, - // 哈希表中当前存储的元素数量。 + // Current number of elements stored in the hash table pub size: usize, - // 分区数 + // Number of partitions pub partition: usize, - // 分块大小 + // Size of each hash chunk pub hash_capacity: usize, - // 数据库版本 0是kraken 2 database转换过来的 + // Database version (0 is converted from Kraken 2 database) pub version: usize, } -// 为HashConfig手动实现Debug trait +// Manually implement Debug trait for HashConfig impl fmt::Debug for 
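The Compact trait examples in compact_hash.rs above all follow one bit-layout convention: a u32 cell keeps a fragment of the 64-bit minimizer hash in its upper 32 - value_bits bits and the taxid value in its lower value_bits bits, with value_mask = (1 << value_bits) - 1. A free-standing sketch that reproduces the documented values; the real logic lives in the trait impls, so these helpers are only illustrative.

// Upper bits: compacted hash-key fragment; lower `value_bits` bits: value.
fn compacted(hash_key: u64, value_bits: usize) -> u32 {
    // keep the top (32 - value_bits) bits of the 64-bit hash as the stored key part
    (hash_key >> (32 + value_bits)) as u32
}

fn combined(left: u32, right: u32, value_bits: usize) -> u32 {
    (left << value_bits) | right
}

fn left(cell: u32, value_bits: usize) -> u32 {
    cell >> value_bits
}

fn right(cell: u32, value_mask: u32) -> u32 {
    cell & value_mask
}

fn main() {
    let value_bits = 16;
    let value_mask = (1u32 << value_bits) - 1;

    let key_part = compacted(0x1234567890ABCDEF, value_bits);
    let cell = combined(key_part, 0xABCD, value_bits);

    assert_eq!(key_part, 0x1234);
    assert_eq!(cell, 0x1234ABCD);
    assert_eq!(left(cell, value_bits), 0x1234);
    assert_eq!(right(cell, value_mask), 0xABCD);
}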
HashConfig { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("HashConfig") @@ -192,12 +299,22 @@ impl fmt::Debug for HashConfig { .field("size", &self.size) .field("value_bits", &self.value_bits) .field("value_mask", &self.value_mask) - // 注意,我们没有包括_phantom字段 .finish() } } impl HashConfig { + /// Creates a new HashConfig + /// + /// # Examples + /// + /// ``` + /// use kun_peng::compact_hash::HashConfig; + /// + /// let config = HashConfig::new(1, 1000, 16, 500, 10, 100); + /// assert_eq!(config.capacity, 1000); + /// assert_eq!(config.value_bits, 16); + /// ``` pub fn new( version: usize, capacity: usize, @@ -219,7 +336,7 @@ impl HashConfig { } pub fn write_to_file>(&self, file_path: P) -> Result<()> { - // 打开文件用于写入 + // Open the file for writing let file = File::create(file_path)?; let mut writer = BufWriter::new(file); writer.write_u64::(self.version as u64)?; @@ -372,12 +489,12 @@ fn read_large_page_from_file>(large_page: &mut Page, filename: P) let current_len = large_page.data.capacity(); if capacity > current_len { - // 如果文件中的容量大于当前页的容量,则扩展内存 + // If the capacity in the file is greater than the current page's capacity, extend the memory large_page.data.resize(capacity, 0); } else if capacity < current_len { - // 如果文件中的容量小于当前页的容量,则截断并清零多余的部分 + // If the capacity in the file is smaller than the current page's capacity, truncate and zero out the excess large_page.data.truncate(capacity); - large_page.data.shrink_to_fit(); // 释放多余的内存 + large_page.data.shrink_to_fit(); // Free up excess memory } read_page_data(&mut file, &mut large_page.data)?; diff --git a/kr2r/src/db.rs b/src/db.rs similarity index 71% rename from kr2r/src/db.rs rename to src/db.rs index 0850f33..e5734a2 100644 --- a/kr2r/src/db.rs +++ b/src/db.rs @@ -1,4 +1,3 @@ -// 使用时需要引用模块路径 use crate::compact_hash::{Compact, HashConfig, Slot}; // use crate::mmscanner::MinimizerScanner; use crate::taxonomy::{NCBITaxonomy, Taxonomy}; @@ -12,9 +11,20 @@ use std::fs::File; use std::io::{BufReader, BufWriter, Read, Result as IOResult, Write}; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering}; -// 定义每批次处理的 Cell 数量 + +// Define the number of Cells processed per batch const BATCH_SIZE: usize = 81920; +/// Sets a cell in the page with the given item, handling collisions and LCA calculations +/// +/// # Arguments +/// +/// * `taxonomy` - The taxonomy used for LCA calculations +/// * `page` - The page of AtomicU32 cells +/// * `item` - The Slot item to be set +/// * `page_size` - The size of the page +/// * `value_bits` - The number of bits used for the value +/// * `value_mask` - The mask used to extract the value fn set_page_cell( taxonomy: &Taxonomy, page: &[AtomicU32], @@ -39,7 +49,7 @@ fn set_page_cell( let new_taxid = taxonomy.lca(item_taxid, current_taxid); Some(u32::combined(compact_key, new_taxid, value_bits)) } else { - None // 当前值不匹配,尝试下一个索引 + None // Current value doesn't match, try the next index } }); @@ -54,13 +64,25 @@ fn set_page_cell( } } +/// Writes the hash table to a file +/// +/// # Arguments +/// +/// * `page` - The page of AtomicU32 cells to write +/// * `file_path` - The path to the output file +/// * `page_index` - The index of the current page +/// * `capacity` - The capacity of the page +/// +/// # Returns +/// +/// The number of non-zero items written to the file fn write_hashtable_to_file( page: &Vec, file_path: &PathBuf, page_index: u64, capacity: u64, ) -> IOResult { - // 打开文件用于写入 + // Open the file for writing let file = 
File::create(file_path)?; let mut writer = BufWriter::new(file); let mut count = 0; @@ -76,10 +98,24 @@ fn write_hashtable_to_file( writer.write_u32::(value)?; } - writer.flush()?; // 确保所有内容都被写入文件 + writer.flush()?; // Ensure all content is written to the file Ok(count) } +/// Processes a k2 file and updates the hash table +/// +/// # Arguments +/// +/// * `config` - The HashConfig for the process +/// * `database` - The path to the database +/// * `chunk_file` - The path to the chunk file +/// * `taxonomy` - The taxonomy used for processing +/// * `page_size` - The size of each page +/// * `page_index` - The index of the current page +/// +/// # Returns +/// +/// The number of items processed pub fn process_k2file( config: HashConfig, database: &PathBuf, @@ -111,9 +147,9 @@ pub fn process_k2file( while let Ok(bytes_read) = reader.read(&mut batch_buffer) { if bytes_read == 0 { break; - } // 文件末尾 + } // End of file - // 处理读取的数据批次 + // Process the read data batch let cells_in_batch = bytes_read / cell_size; let cells = unsafe { @@ -130,7 +166,17 @@ pub fn process_k2file( Ok(size_count) } -/// 生成taxonomy树文件 +/// Generates a taxonomy tree file +/// +/// # Arguments +/// +/// * `ncbi_taxonomy_directory` - The directory containing NCBI taxonomy files +/// * `taxonomy_filename` - The output filename for the generated taxonomy +/// * `id_map` - A map of string IDs to u64 IDs +/// +/// # Returns +/// +/// The generated Taxonomy pub fn generate_taxonomy( ncbi_taxonomy_directory: &PathBuf, taxonomy_filename: &PathBuf, @@ -151,15 +197,24 @@ pub fn generate_taxonomy( Ok(taxo) } -/// 获取需要存储最大内部taxid的bit数量 +/// Calculates the number of bits required to store the maximum internal taxid +/// +/// # Arguments +/// +/// * `requested_bits_for_taxid` - The requested number of bits for taxid storage +/// * `node_count` - The number of nodes in the taxonomy +/// +/// # Returns +/// +/// The number of bits required for taxid storage pub fn get_bits_for_taxid( requested_bits_for_taxid: usize, node_count: f64, ) -> Result { - // 计算存储taxonomy节点数量所需的最小位数 + // Calculate the minimum number of bits needed to store the taxonomy node count let bits_needed_for_value = (node_count.log2().ceil() as usize).max(1); - // 检查是否需要更多位来存储taxid + // Check if more bits are required for storing taxid if requested_bits_for_taxid > 0 && bits_needed_for_value > requested_bits_for_taxid as usize { return Err("more bits required for storing taxid".to_string()); } @@ -167,7 +222,18 @@ pub fn get_bits_for_taxid( Ok(bits_needed_for_value.max(requested_bits_for_taxid)) } -/// 将fna文件转换成k2格式的临时文件 +/// Converts an FNA file to the k2 format temporary file +/// +/// # Arguments +/// +/// * `fna_file` - The input FNA file path +/// * `meros` - The Meros instance for k-mer processing +/// * `taxonomy` - The taxonomy used for processing +/// * `id_to_taxon_map` - A map of string IDs to taxon IDs +/// * `hash_config` - The HashConfig for the process +/// * `writers` - A vector of BufWriters for output +/// * `chunk_size` - The size of each chunk +/// * `threads` - The number of threads to use for processing pub fn convert_fna_to_k2_format>( fna_file: P, meros: Meros, diff --git a/kr2r/src/kr2r_data.rs b/src/kr2r_data.rs similarity index 61% rename from kr2r/src/kr2r_data.rs rename to src/kr2r_data.rs index 06b4770..6288c26 100644 --- a/kr2r/src/kr2r_data.rs +++ b/src/kr2r_data.rs @@ -1,6 +1,5 @@ use crate::compact_hash::Row; use crate::utils::open_file; -// use crate::{Meros, CURRENT_REVCOM_VERSION}; use seqkmer::Meros; use seqkmer::OptionPair; use 
seqkmer::CURRENT_REVCOM_VERSION; @@ -9,10 +8,29 @@ use std::io::{Read, Result as IoResult, Write}; use std::mem; use std::path::Path; +/// Parses a binary string into a u64 +/// +/// # Arguments +/// +/// * `src` - The binary string to parse +/// +/// # Returns +/// +/// A Result containing the parsed u64 value or a parsing error pub fn parse_binary(src: &str) -> Result { u64::from_str_radix(src, 2) } +/// Constructs a seed template based on minimizer length and spaces +/// +/// # Arguments +/// +/// * `minimizer_len` - The length of the minimizer +/// * `minimizer_spaces` - The number of spaces in the minimizer +/// +/// # Returns +/// +/// A String representing the constructed seed template pub fn construct_seed_template(minimizer_len: usize, minimizer_spaces: usize) -> String { if minimizer_len / 4 < minimizer_spaces { panic!( @@ -27,32 +45,44 @@ pub fn construct_seed_template(minimizer_len: usize, minimizer_spaces: usize) -> format!("{}{}", core, spaces) } -/// 判断u64的值是否为0,并将其转换为Option类型 +/// Converts a u64 value to Option, filtering out zero values +/// +/// # Arguments +/// +/// * `value` - The u64 value to convert +/// +/// # Returns +/// +/// An Option containing the value if it's non-zero, or None otherwise pub fn u64_to_option(value: u64) -> Option { Option::from(value).filter(|&x| x != 0) } +/// Represents a group of hits with associated rows and range pub struct HitGroup { pub rows: Vec, - /// example: (0..10], 左开右闭 + /// Range example: (0..10], left-open right-closed pub range: OptionPair<(usize, usize)>, } impl HitGroup { + /// Creates a new HitGroup pub fn new(rows: Vec, range: OptionPair<(usize, usize)>) -> Self { Self { rows, range } } + /// Calculates the capacity of the HitGroup pub fn capacity(&self) -> usize { self.range.reduce(0, |acc, range| acc + range.1 - range.0) } + /// Calculates the required score based on a confidence threshold pub fn required_score(&self, confidence_threshold: f64) -> u64 { (confidence_threshold * self.capacity() as f64).ceil() as u64 } } -/// 顺序不能错 +/// Represents options for indexing #[repr(C)] #[derive(Debug)] pub struct IndexOptions { @@ -62,12 +92,13 @@ pub struct IndexOptions { pub toggle_mask: u64, pub dna_db: bool, pub minimum_acceptable_hash_value: u64, - pub revcom_version: i32, // 如果等于 0,就报错 - pub db_version: i32, // 为未来的数据库结构变化预留 - pub db_type: i32, // 为未来使用其他数据结构预留 + pub revcom_version: i32, // Throws an error if equal to 0 + pub db_version: i32, // Reserved for future database structure changes + pub db_type: i32, // Reserved for future use of other data structures } impl IndexOptions { + /// Creates a new IndexOptions instance pub fn new( k: usize, l: usize, @@ -89,29 +120,47 @@ impl IndexOptions { } } + /// Reads IndexOptions from a file + /// + /// # Arguments + /// + /// * `file_path` - The path to the file containing IndexOptions + /// + /// # Returns + /// + /// An IoResult containing the read IndexOptions pub fn read_index_options>(file_path: P) -> IoResult { let mut file = open_file(file_path)?; let mut buffer = vec![0; std::mem::size_of::()]; file.read_exact(&mut buffer)?; let idx_opts = unsafe { - // 确保这种转换是安全的,这依赖于数据的确切布局和来源 + // Ensure this conversion is safe, depending on the exact layout and source of the data std::ptr::read(buffer.as_ptr() as *const Self) }; if idx_opts.revcom_version != CURRENT_REVCOM_VERSION as i32 { - // 如果版本为0,直接触发 panic + // Trigger a panic if the version is 0 panic!("Unsupported version (revcom_version == 0)"); } Ok(idx_opts) } + /// Writes IndexOptions to a file + /// + /// # Arguments + 
/// + /// * `file_path` - The path to the file where IndexOptions will be written + /// + /// # Returns + /// + /// An IoResult indicating success or failure of the write operation pub fn write_to_file>(&self, file_path: P) -> IoResult<()> { let mut file = File::create(file_path)?; - // 将结构体转换为字节切片。这是不安全的操作,因为我们正在 - // 强制将内存内容解释为字节,这要求IndexOptions是#[repr(C)], - // 且所有字段都可以安全地以其原始内存表示形式进行复制。 + // Convert the struct to a byte slice. This is an unsafe operation as we're + // forcing memory content to be interpreted as bytes. This requires IndexOptions + // to be #[repr(C)], and all fields must be safely copyable in their raw memory representation. let bytes: &[u8] = unsafe { std::slice::from_raw_parts( (self as *const IndexOptions) as *const u8, @@ -123,6 +172,7 @@ impl IndexOptions { Ok(()) } + /// Creates IndexOptions from a Meros instance pub fn from_meros(meros: Meros) -> Self { Self::new( meros.k_mer, @@ -134,6 +184,7 @@ impl IndexOptions { ) } + /// Converts IndexOptions to a Meros instance pub fn as_meros(&self) -> Meros { Meros::new( self.k, diff --git a/kr2r/src/kv_store.rs b/src/kv_store.rs similarity index 74% rename from kr2r/src/kv_store.rs rename to src/kv_store.rs index bd3a736..c3f8778 100644 --- a/kr2r/src/kv_store.rs +++ b/src/kv_store.rs @@ -29,9 +29,19 @@ pub fn fmix64(key: u64) -> u64 { const C1: u64 = 0x87c37b91114253d5; const C2: u64 = 0x4cf5ad432745937f; -/// MurmurHash3 函数的优化 Rust 实现 -/// keys = u64.to_be_bytes();这个函数只能处理u64. +/// Optimized Rust implementation of the MurmurHash3 function +/// This function can only process u64 values (keys = u64.to_be_bytes()). /// Performs the MurmurHash3 hash computation on a given key. +/// +/// # Examples +/// +/// ``` +/// use kun_peng::murmur_hash3; +/// +/// let key: u64 = 123; +/// let hash = murmur_hash3(key); +/// assert_ne!(hash, key); // The hash should be different from the input +/// ``` #[inline] pub fn murmur_hash3(key: u64) -> u64 { let k1 = key.wrapping_mul(C1).rotate_left(31).wrapping_mul(C2); @@ -47,6 +57,17 @@ pub fn murmur_hash3(key: u64) -> u64 { h1 } +/// Computes the SeaHash for a given u64 key. +/// +/// # Examples +/// +/// ``` +/// use kun_peng::sea_hash; +/// +/// let key: u64 = 123; +/// let hash = sea_hash(key); +/// assert_ne!(hash, key); // The hash should be different from the input +/// ``` #[inline] pub fn sea_hash(key: u64) -> u64 { let mut hasher = SeaHasher::default(); @@ -56,6 +77,18 @@ pub fn sea_hash(key: u64) -> u64 { /// KHasher is designed to hash only u64 values. /// +/// # Examples +/// +/// ``` +/// use std::hash::{Hasher, BuildHasher}; +/// use kun_peng::{KHasher, KBuildHasher}; +/// +/// let mut hasher = KBuildHasher.build_hasher(); +/// hasher.write_u64(123); +/// let hash = hasher.finish(); +/// assert_eq!(hash, 123); // KHasher simply returns the input value +/// ``` +/// /// # Safety /// This hasher assumes that the input slice to `write` is exactly 8 bytes long, /// representing a `u64`. 
Using this hasher with input that is not 8 bytes, diff --git a/kr2r/src/lib.rs b/src/lib.rs similarity index 100% rename from kr2r/src/lib.rs rename to src/lib.rs diff --git a/kr2r/src/readcounts.rs b/src/readcounts.rs similarity index 89% rename from kr2r/src/readcounts.rs rename to src/readcounts.rs index 8b1e40c..a0aba39 100644 --- a/kr2r/src/readcounts.rs +++ b/src/readcounts.rs @@ -8,10 +8,8 @@ use std::sync::atomic::{AtomicU64, Ordering}; type TaxId = u32; pub const TAXID_MAX: TaxId = TaxId::MAX; -// 定义 taxon_counts_t 类型 pub type TaxonCounts = HashMap; -// 定义一个错误类型 #[derive(Debug)] pub struct UnionError; @@ -75,19 +73,6 @@ where kmers: T, } -// impl Clone for ReadCounts -// where -// T: Unionable + Clone, -// { -// fn clone(&self) -> Self { -// ReadCounts { -// n_reads: AtomicU64::new(self.n_reads.load(Ordering::Relaxed)), -// n_kmers: AtomicU64::new(self.n_kmers.load(Ordering::Relaxed)), -// kmers: self.kmers.clone(), -// } -// } -// } - impl ReadCounts where T: Unionable, @@ -130,7 +115,6 @@ where } } -// 定义 READCOUNTER 类型 #[cfg(feature = "exact_counting")] pub type ReadCounter = ReadCounts>; diff --git a/kr2r/src/report.rs b/src/report.rs similarity index 67% rename from kr2r/src/report.rs rename to src/report.rs index f5049b7..2d1027c 100644 --- a/kr2r/src/report.rs +++ b/src/report.rs @@ -6,6 +6,16 @@ use std::fs::File; use std::io::{self, Write}; use std::path::Path; +/// Calculates clade counts based on the taxonomy and call counts +/// +/// # Arguments +/// +/// * `taxonomy` - The taxonomy structure +/// * `call_counts` - A HashMap of taxon IDs to their counts +/// +/// # Returns +/// +/// A HashMap of taxon IDs to their clade counts pub fn get_clade_counts(taxonomy: &Taxonomy, call_counts: &HashMap) -> HashMap { let mut clade_counts = HashMap::new(); @@ -20,6 +30,16 @@ pub fn get_clade_counts(taxonomy: &Taxonomy, call_counts: &HashMap) -> clade_counts } +/// Calculates clade counters based on the taxonomy and call counters +/// +/// # Arguments +/// +/// * `taxonomy` - The taxonomy structure +/// * `call_counters` - TaxonCounters containing call counts for each taxon +/// +/// # Returns +/// +/// TaxonCounters containing clade counts for each taxon pub fn get_clade_counters(taxonomy: &Taxonomy, call_counters: &TaxonCounters) -> TaxonCounters { let mut clade_counters = TaxonCounters::new(); @@ -37,6 +57,16 @@ pub fn get_clade_counters(taxonomy: &Taxonomy, call_counters: &TaxonCounters) -> clade_counters } +/// Extracts a string from a byte slice starting at the given offset +/// +/// # Arguments +/// +/// * `data` - The byte slice to extract from +/// * `offset` - The starting offset in the byte slice +/// +/// # Returns +/// +/// A string slice extracted from the byte slice fn extract_string_from_offset(data: &[u8], offset: usize) -> &str { let end = data[offset..] 
.iter() @@ -46,6 +76,17 @@ fn extract_string_from_offset(data: &[u8], offset: usize) -> &str { std::str::from_utf8(&data[offset..end]).unwrap_or("") } +/// Prints a line in MPA-style report format +/// +/// # Arguments +/// +/// * `file` - The file to write to +/// * `clade_count` - The count for the clade +/// * `taxonomy_line` - The taxonomy line to print +/// +/// # Returns +/// +/// An io::Result indicating success or failure of the write operation fn print_mpa_style_report_line( file: &mut File, clade_count: u64, @@ -54,11 +95,25 @@ fn print_mpa_style_report_line( writeln!(file, "{}\t{}", taxonomy_line, clade_count) } +/// Performs a depth-first search to generate an MPA-style report +/// +/// # Arguments +/// +/// * `taxid` - The current taxon ID +/// * `file` - The file to write the report to +/// * `report_zeros` - Whether to report zero counts +/// * `taxonomy` - The taxonomy structure +/// * `clade_counts` - A HashMap of taxon IDs to their clade counts +/// * `taxonomy_names` - A vector to store the taxonomy names +/// +/// # Returns +/// +/// An io::Result indicating success or failure of the operation fn mpa_report_dfs( taxid: u64, file: &mut File, report_zeros: bool, - taxonomy: &Taxonomy, // 假设你有一个 `Taxonomy` 结构 + taxonomy: &Taxonomy, clade_counts: &HashMap, taxonomy_names: &mut Vec, ) -> io::Result<()> { @@ -66,7 +121,7 @@ fn mpa_report_dfs( return Ok(()); } - let node = &taxonomy.nodes[taxid as usize]; // 假设 nodes 是 Vec + let node = &taxonomy.nodes[taxid as usize]; let rank = extract_string_from_offset(&taxonomy.rank_data, node.rank_offset as usize); let rank_code = match rank { @@ -126,6 +181,18 @@ fn mpa_report_dfs( Ok(()) } +/// Generates an MPA-style report +/// +/// # Arguments +/// +/// * `filename` - The path to the output file +/// * `report_zeros` - Whether to report zero counts +/// * `taxonomy` - The taxonomy structure +/// * `call_counters` - A HashMap of taxon IDs to their ReadCounters +/// +/// # Returns +/// +/// An io::Result indicating success or failure of the operation pub fn report_mpa_style>( filename: P, report_zeros: bool, @@ -152,6 +219,23 @@ pub fn report_mpa_style>( ) } +/// Prints a line in Kraken-style report format +/// +/// # Arguments +/// +/// * `file` - The file to write to +/// * `report_kmer_data` - Whether to report k-mer data +/// * `total_seqs` - The total number of sequences +/// * `clade_counter` - The ReadCounter for the clade +/// * `taxon_counter` - The ReadCounter for the taxon +/// * `rank_str` - The rank string +/// * `taxid` - The taxon ID +/// * `sci_name` - The scientific name +/// * `depth` - The depth in the taxonomy tree +/// +/// # Returns +/// +/// An io::Result indicating success or failure of the write operation pub fn print_kraken_style_report_line( file: &mut File, report_kmer_data: bool, @@ -192,6 +276,25 @@ pub fn print_kraken_style_report_line( writeln!(file, "{}", sci_name) } +/// Performs a depth-first search to generate a Kraken-style report +/// +/// # Arguments +/// +/// * `taxid` - The current taxon ID +/// * `file` - The file to write the report to +/// * `report_zeros` - Whether to report zero counts +/// * `report_kmer_data` - Whether to report k-mer data +/// * `taxonomy` - The taxonomy structure +/// * `clade_counters` - A mutable reference to TaxonCounters for clade counts +/// * `call_counters` - A reference to TaxonCounters for call counts +/// * `total_seqs` - The total number of sequences +/// * `rank_code` - The current rank code +/// * `rank_depth` - The current rank depth +/// * `depth` - The 
current depth in the taxonomy tree +/// +/// # Returns +/// +/// An io::Result indicating success or failure of the operation pub fn kraken_report_dfs( taxid: u64, file: &mut File, @@ -240,7 +343,6 @@ pub fn kraken_report_dfs( .next() .unwrap_or(""); - // let mut clade_counter = clade_counters.get_mut(&taxid).unwrap(); let mut clade_counter = clade_counters .entry(taxid) .or_insert_with(ReadCounter::default); @@ -287,6 +389,21 @@ pub fn kraken_report_dfs( Ok(()) } +/// Generates a Kraken-style report +/// +/// # Arguments +/// +/// * `filename` - The path to the output file +/// * `report_zeros` - Whether to report zero counts +/// * `report_kmer_data` - Whether to report k-mer data +/// * `taxonomy` - The taxonomy structure +/// * `call_counters` - A HashMap of taxon IDs to their ReadCounters +/// * `total_seqs` - The total number of sequences +/// * `total_unclassified` - The total number of unclassified sequences +/// +/// # Returns +/// +/// An io::Result indicating success or failure of the operation pub fn report_kraken_style>( filename: P, report_zeros: bool, @@ -300,7 +417,7 @@ pub fn report_kraken_style>( let mut file = File::create(filename)?; - // 处理未分类序列的特殊情况 + // Handle the special case for unclassified sequences if total_unclassified != 0 || report_zeros { let mut rc = ReadCounter::new(total_unclassified, 0); let trc = ReadCounter::new(total_unclassified, 0); @@ -317,7 +434,7 @@ pub fn report_kraken_style>( )?; } - // 通过 DFS 遍历分类树 + // Traverse the taxonomy tree using DFS kraken_report_dfs( 1, &mut file, diff --git a/kr2r/src/taxonomy.rs b/src/taxonomy.rs similarity index 76% rename from kr2r/src/taxonomy.rs rename to src/taxonomy.rs index 0de63a8..5b8a268 100644 --- a/kr2r/src/taxonomy.rs +++ b/src/taxonomy.rs @@ -5,7 +5,19 @@ use std::fs::File; use std::io::{BufRead, BufReader, Error, ErrorKind, Read, Result, Write}; use std::path::Path; -/// 解析 ncbi 文件的 taxonomy nodes 文件 +/// Parse the NCBI taxonomy nodes file +/// +/// # Arguments +/// +/// * `nodes_filename` - Path to the nodes file +/// +/// # Returns +/// +/// A tuple containing: +/// - HashMap of node ID to parent ID +/// - HashMap of parent ID to set of child IDs +/// - HashMap of node ID to rank +/// - HashSet of known ranks pub fn parse_nodes_file>( nodes_filename: P, ) -> Result<( @@ -59,7 +71,15 @@ pub fn parse_nodes_file>( Ok((parent_map, child_map, rank_map, known_ranks)) } -/// 解析 ncbi 文件的 taxonomy names 文件 +/// Parse the NCBI taxonomy names file +/// +/// # Arguments +/// +/// * `names_filename` - Path to the names file +/// +/// # Returns +/// +/// A HashMap of node ID to scientific name pub fn parse_names_file>(names_filename: P) -> Result> { let names_file = open_file(names_filename)?; let reader = BufReader::new(names_file); @@ -68,22 +88,22 @@ pub fn parse_names_file>(names_filename: P) -> Result = line.split("\t|\t").collect(); if fields.len() < 4 { - continue; // 如果不满足预期的字段数量,则跳过此行 + continue; // Skip if the line doesn't have the expected number of fields } - // 解析节点 ID 和名称类型 + // Parse node ID and name type let node_id = fields[0].parse::().unwrap_or(0); let name = fields[1].to_string(); let name_type = fields[3].to_string(); - // 仅当类型为 "scientific name" 时,将名称添加到 map 中 + // Only add the name to the map if it's a "scientific name" if name_type == "scientific name" { name_map.insert(node_id, name); } @@ -92,7 +112,7 @@ pub fn parse_names_file>(names_filename: P) -> Result, name_map: HashMap, @@ -129,14 +149,23 @@ pub struct NCBITaxonomy { } impl NCBITaxonomy { - // 构造函数等实现 + /// Create a new 
NCBITaxonomy from NCBI taxonomy files + /// + /// # Arguments + /// + /// * `nodes_filename` - Path to the nodes file + /// * `names_filename` - Path to the names file + /// + /// # Returns + /// + /// A Result containing the new NCBITaxonomy or an error pub fn from_ncbi>(nodes_filename: P, names_filename: P) -> Result { let mut marked_nodes = HashSet::new(); let (parent_map, child_map, rank_map, known_ranks) = parse_nodes_file(nodes_filename)?; let name_map = parse_names_file(names_filename)?; - marked_nodes.insert(1); // 标记根节点 + marked_nodes.insert(1); // Mark the root node Ok(NCBITaxonomy { parent_map, @@ -148,6 +177,11 @@ impl NCBITaxonomy { }) } + /// Mark a node and all its ancestors in the taxonomy + /// + /// # Arguments + /// + /// * `taxid` - The ID of the node to mark pub fn mark_node(&mut self, taxid: u64) { let mut current_taxid = taxid; while !self.marked_nodes.contains(¤t_taxid) { @@ -160,6 +194,13 @@ impl NCBITaxonomy { } } + /// Get rank offset data for the taxonomy + /// + /// # Returns + /// + /// A tuple containing: + /// - HashMap of rank to offset + /// - String containing all ranks pub fn get_rank_offset_data(&self) -> (HashMap, String) { let mut rank_data = String::new(); let mut rank_offsets = HashMap::new(); @@ -176,9 +217,14 @@ impl NCBITaxonomy { (rank_offsets, rank_data) } + /// Convert the NCBITaxonomy to a Kraken-style Taxonomy + /// + /// # Returns + /// + /// A new Taxonomy object pub fn convert_to_kraken_taxonomy(&self) -> Taxonomy { let mut taxo = Taxonomy::default(); - // 预分配内存 + // Preallocate memory taxo.nodes.reserve(self.marked_nodes.len() + 1); taxo.nodes.push(TaxonomyNode::default()); @@ -232,13 +278,13 @@ impl NCBITaxonomy { } } -// Taxonomy 类型定义 +// Taxonomy struct definition #[derive(Debug)] pub struct Taxonomy { pub path_cache: HashMap>, pub nodes: Vec, - pub name_data: Vec, // 字符串数据以 Vec 存储 - pub rank_data: Vec, // 字符串数据以 Vec 存储 + pub name_data: Vec, // String data stored as Vec + pub rank_data: Vec, // String data stored as Vec external_to_internal_id_map: HashMap, } @@ -255,8 +301,17 @@ impl Default for Taxonomy { } impl Taxonomy { - const MAGIC: &'static [u8] = b"K2TAXDAT"; // 替换为实际的 magic bytes - + const MAGIC: &'static [u8] = b"K2TAXDAT"; // Replace with actual magic bytes + + /// Create a new Taxonomy from a file + /// + /// # Arguments + /// + /// * `filename` - Path to the taxonomy file + /// + /// # Returns + /// + /// A Result containing the new Taxonomy or an error pub fn from_file + Debug>(filename: P) -> Result { let mut file = open_file(&filename)?; @@ -305,38 +360,40 @@ impl Taxonomy { Ok(taxo) } - pub fn _is_a_ancestor_of_b(&self, a: u32, b: u32) -> bool { - if a == 0 || b == 0 { - return false; - } - - let mut current = b; - - while current > a { - current = match self.nodes.get(current as usize) { - Some(node) => node.parent_id as u32, - None => return false, - }; - } - - current == a - } - + /// Check if node A is an ancestor of node B + /// + /// # Arguments + /// + /// * `a` - The potential ancestor node ID + /// * `b` - The potential descendant node ID + /// + /// # Returns + /// + /// A boolean indicating whether A is an ancestor of B pub fn is_a_ancestor_of_b(&self, a: u32, b: u32) -> bool { if a == 0 || b == 0 { return false; } - // 尝试从path_cache中获取b的祖先路径 + // Try to get the ancestor path of B from the path cache if let Some(path) = self.path_cache.get(&b) { - // 检查路径中是否包含a + // Check if the path contains A return path.contains(&a); } false } - // 查找两个节点的最低公共祖先 + /// Find the lowest common ancestor of two nodes + /// + 
/// # Arguments + /// + /// * `a` - The first node ID + /// * `b` - The second node ID + /// + /// # Returns + /// + /// The ID of the lowest common ancestor pub fn lca(&self, a: u32, b: u32) -> u32 { if a == 0 || b == 0 || a == b { return if a != 0 { a } else { b }; @@ -355,39 +412,16 @@ impl Taxonomy { return 0; } - // 返回最后一个共同的祖先 + // Return the last common ancestor *path_a.get(i - 1).unwrap_or(&0) } - pub fn lowest_common_ancestor(&self, mut a: u32, mut b: u32) -> u32 { - // 如果任何一个节点是 0,返回另一个节点 - if a == 0 || b == 0 || a == b { - return if a != 0 { a } else { b }; - } - - // 遍历节点直到找到共同的祖先 - while a != b { - if a > b { - a = self - .nodes - .get(a as usize) - .map_or(0, |node| node.parent_id as u32); - } else { - b = self - .nodes - .get(b as usize) - .map_or(0, |node| node.parent_id as u32); - } - } - - a - } - + /// Build the path cache for efficient ancestor lookups pub fn build_path_cache(&mut self) { let mut cache: HashMap> = HashMap::new(); let root_external_id = 1u64; if let Some(&root_internal_id) = self.external_to_internal_id_map.get(&root_external_id) { - // 开始从根节点遍历 + // Start traversing from the root node self.build_path_for_node(root_internal_id, &mut cache, Vec::new()); } self.path_cache = cache; @@ -399,27 +433,40 @@ impl Taxonomy { path_cache: &mut HashMap>, mut current_path: Vec, ) { - current_path.push(node_id); // 将当前节点添加到路径中 - // 存储当前节点的路径 + current_path.push(node_id); // Add the current node to the path + // Store the current node's path path_cache.insert(node_id, current_path.clone()); - // 获取当前节点的信息 + // Get the current node's information let node = &self.nodes[node_id as usize]; let first_child_id = node.first_child as u32; let child_count = node.child_count as u32; - // 遍历所有子节点 + // Traverse all children for i in 0..child_count { - let child_internal_id = first_child_id + i; // 这里假设子节点的ID是连续的 + let child_internal_id = first_child_id + i; // Assume child IDs are consecutive self.build_path_for_node(child_internal_id, path_cache, current_path.clone()); } } + /// Get the number of nodes in the taxonomy + /// + /// # Returns + /// + /// The number of nodes pub fn node_count(&self) -> usize { self.nodes.len() } - // get_internal_id 函数的优化 + /// Get the internal ID for a given external ID + /// + /// # Arguments + /// + /// * `external_id` - The external ID to look up + /// + /// # Returns + /// + /// The corresponding internal ID, or 0 if not found pub fn get_internal_id(&self, external_id: u64) -> u32 { *self .external_to_internal_id_map @@ -427,6 +474,7 @@ impl Taxonomy { .unwrap_or(&0) } + /// Generate the mapping from external to internal IDs pub fn generate_external_to_internal_id_map(&mut self) { self.external_to_internal_id_map.clear(); self.external_to_internal_id_map.insert(0, 0); @@ -437,6 +485,15 @@ impl Taxonomy { } } + /// Write the taxonomy to disk + /// + /// # Arguments + /// + /// * `filename` - Path to write the taxonomy file + /// + /// # Returns + /// + /// A Result indicating success or failure pub fn write_to_disk>(&self, filename: P) -> Result<()> { let mut file = File::create(filename)?; diff --git a/kr2r/src/utils.rs b/src/utils.rs similarity index 76% rename from kr2r/src/utils.rs rename to src/utils.rs index 734b746..580d691 100644 --- a/kr2r/src/utils.rs +++ b/src/utils.rs @@ -4,7 +4,19 @@ use std::io::{self, BufRead, BufReader, BufWriter, Result}; use std::path::{Path, PathBuf}; use walkdir::WalkDir; -/// 读取 seqid2taxid.map 文件。为了裁剪 ncbi 的 taxonomy 树 +/// Reads the seqid2taxid.map file to create a mapping for trimming the NCBI taxonomy 
tree. +/// +/// This function reads a file containing sequence IDs and their corresponding taxon IDs, +/// and creates a HashMap mapping sequence IDs to taxon IDs. +/// +/// # Arguments +/// +/// * `filename` - A path-like object representing the file to be read. +/// +/// # Returns +/// +/// Returns a `Result` containing a `HashMap` where the keys are sequence IDs +/// and the values are taxon IDs, or an error if the file cannot be read or parsed. pub fn read_id_to_taxon_map>(filename: P) -> Result> { let file = open_file(filename)?; let reader = BufReader::new(file); @@ -51,7 +63,7 @@ pub fn read_id_to_taxon_map>(filename: P) -> Result u64 { - // 检查 bit_expansion_factor 是否在有效范围内 + // Check if bit_expansion_factor is within the valid range if bit_expansion_factor == 0 || bit_expansion_factor > 64 { return spaced_seed_mask; } @@ -109,21 +121,34 @@ extern crate libc; #[cfg(unix)] use libc::{getrlimit, rlimit, setrlimit, RLIMIT_NOFILE}; +/// Get the current file descriptor limit for the process. +/// +/// This function retrieves the soft limit on the number of open file descriptors +/// allowed for the current process on Unix-like systems. +/// +/// # Returns +/// +/// Returns the current soft limit as a `usize`, or 0 if the limit couldn't be retrieved. +/// +/// # Platform-specific behavior +/// +/// This function is only available on Unix-like systems. On other platforms, +/// it may return a default value or not be compiled. #[cfg(unix)] pub fn get_file_limit() -> usize { let mut limits = rlimit { - rlim_cur: 0, // 当前(软)限制 - rlim_max: 0, // 最大(硬)限制 + rlim_cur: 0, // Current (soft) limit + rlim_max: 0, // Maximum (hard) limit }; - // 使用unsafe块调用getrlimit,因为这是一个外部C函数 + // Use an unsafe block to call getrlimit, as it's an external C function let result = unsafe { getrlimit(RLIMIT_NOFILE, &mut limits) }; if result == 0 { - // 如果成功,返回当前软限制转换为usize + // If successful, return the current soft limit converted to usize limits.rlim_cur as usize } else { - // 如果失败,输出错误并可能返回一个默认值或panic + // If failed, print an error and return 0 eprintln!("Failed to get file limit"); 0 } @@ -166,11 +191,11 @@ pub fn create_partition_writers(partition_files: &Vec) -> Vec) -> Vec>(filename: P) -> BufWriter { let file = OpenOptions::new() .write(true) - .append(true) // 确保以追加模式打开文件 - .create(true) // 如果文件不存在,则创建 + .append(true) // Ensure opening the file in append mode + .create(true) // Create the file if it doesn't exist .open(filename) .unwrap(); BufWriter::new(file) @@ -196,19 +221,19 @@ pub fn find_and_trans_bin_files( suffix: &str, check: bool, ) -> io::Result>> { - // 改为聚合相同数字的文件路径 - // 构建正则表达式以匹配文件名中的第一个数字 + // Aggregate file paths with the same number + // Build a regular expression to match the first number in the filename let pattern = format!(r"{}_(\d+)_\d+{}", prefix, suffix); let re = Regex::new(&pattern).expect("Invalid regex pattern"); - // 读取指定目录下的所有条目 + // Read all entries in the specified directory let mut map_entries = Map::new(); for entry in fs::read_dir(directory)? 
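read_id_to_taxon_map in utils.rs above consumes the seqid2taxid.map format: one whitespace-separated "sequence_id  taxid" pair per line, collected into a HashMap. A small illustrative parser over an in-memory string, so it can run standalone; the accession/taxid pairs are made-up example data, and the real function reads from a file via open_file.

use std::collections::HashMap;

fn parse_id_to_taxon_map(contents: &str) -> HashMap<String, u64> {
    let mut map = HashMap::new();
    for line in contents.lines() {
        let mut parts = line.split_whitespace();
        // first column: sequence id, second column: taxid; malformed lines are skipped
        if let (Some(seq_id), Some(taxid)) = (parts.next(), parts.next()) {
            if let Ok(taxid) = taxid.parse::<u64>() {
                map.insert(seq_id.to_string(), taxid);
            }
        }
    }
    map
}

fn main() {
    let map = parse_id_to_taxon_map("NC_000913.3\t511145\nNC_002695.2\t386585\n");
    assert_eq!(map.get("NC_000913.3"), Some(&511145));
    assert_eq!(map.len(), 2);
}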
{ let path = entry?.path(); if path.is_file() { if let Some(file_name) = path.file_name().and_then(|name| name.to_str()) { - // 使用正则表达式匹配文件名,并提取第一个数字部分 + // Use the regular expression to match the filename and extract the first number part if let Some(cap) = re.captures(file_name) { if let Some(m) = cap.get(1) { if let Ok(num) = m.as_str().parse::() { @@ -221,7 +246,7 @@ pub fn find_and_trans_bin_files( } if check { - // 检查数字是否从1开始连续 + // Check if the numbers are continuous starting from 1 let mut keys: Vec<_> = map_entries.keys().cloned().collect(); keys.sort_unstable(); for (i, &key) in keys.iter().enumerate() { @@ -234,7 +259,7 @@ pub fn find_and_trans_bin_files( } } - // 返回聚合后的文件路径 + // Return the aggregated file paths Ok(map_entries) } @@ -244,11 +269,11 @@ pub fn find_and_trans_files( suffix: &str, check: bool, ) -> io::Result> { - // 构建正则表达式以匹配文件名中的数字 + // Build a regular expression to match the number in the filename let pattern = format!(r"{}_(\d+){}", prefix, suffix); let re = Regex::new(&pattern).unwrap(); - // 读取指定目录下的所有条目 + // Read all entries in the specified directory let entries = fs::read_dir(directory)? .filter_map(Result::ok) .map(|entry| entry.path()) @@ -262,7 +287,7 @@ pub fn find_and_trans_files( }) .collect::>(); - // 使用正则表达式提取数字,并将它们存入BTreeMap + // Extract numbers using the regular expression and store them in a BTreeMap let mut map_entries = Map::new(); for path in entries { if let Some(fname) = path.file_name().and_then(|name| name.to_str()) { @@ -277,7 +302,7 @@ pub fn find_and_trans_files( } if check { - // 检查数字是否从0开始连续 + // Check if the numbers are continuous starting from 0 let mut keys: Vec<_> = map_entries.keys().cloned().collect(); keys.sort_unstable(); for (i, key) in keys.iter().enumerate() { @@ -290,22 +315,22 @@ pub fn find_and_trans_files( } } - // 返回排序后的文件路径 + // Return the sorted file paths Ok(map_entries) } -// 函数定义 +// Function definition pub fn find_and_sort_files( directory: &Path, prefix: &str, suffix: &str, check: bool, ) -> io::Result> { - // 构建正则表达式以匹配文件名中的数字 + // Build a regular expression to match the number in the filename let pattern = format!(r"{}_(\d+){}", prefix, suffix); let re = Regex::new(&pattern).unwrap(); - // 读取指定目录下的所有条目 + // Read all entries in the specified directory let entries = fs::read_dir(directory)? 
.filter_map(Result::ok) .map(|entry| entry.path()) @@ -319,7 +344,7 @@ pub fn find_and_sort_files( }) .collect::>(); - // 使用正则表达式提取数字并排序 + // Extract numbers using the regular expression and sort let mut sorted_entries = entries .into_iter() .filter_map(|path| { @@ -333,7 +358,7 @@ pub fn find_and_sort_files( sorted_entries.sort_by_key(|k| k.1); if check { - // 检查数字是否从0开始连续 + // Check if the numbers are continuous starting from 0 for (i, (_, num)) in sorted_entries.iter().enumerate() { let a_idx = i + 1; if a_idx != *num { @@ -345,7 +370,7 @@ pub fn find_and_sort_files( } } - // 返回排序后的文件路径 + // Return the sorted file paths Ok(sorted_entries .iter() .map(|(path, _)| path.clone()) @@ -362,18 +387,18 @@ pub fn open_file>(path: P) -> io::Result { }) } -/// 获取最新的文件序号 +/// Get the latest file index pub fn get_lastest_file_index(file_path: &PathBuf) -> Result { let file_content = fs::read_to_string(&file_path)?; - // 如果文件内容为空,则默认最大值为0 + // If the file content is empty, default the maximum value to 0 let index = if file_content.is_empty() { 0 } else { file_content - .lines() // 将内容按行分割 - .filter_map(|line| line.split('\t').next()) // 获取每行的第一列 - .filter_map(|num_str| num_str.parse::().ok()) // 尝试将第一列的字符串转换为整型 - .max() // 找到最大值 + .lines() // Split the content by lines + .filter_map(|line| line.split('\t').next()) // Get the first column of each line + .filter_map(|num_str| num_str.parse::().ok()) // Try to convert the first column string to integer + .max() // Find the maximum value .unwrap_or(1) }; Ok(index)
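get_lastest_file_index above expects a bookkeeping file with one "<index>\t<payload>" record per line and returns the largest first-column index (0 for an empty file). A quick standalone illustration of that convention; the path and file contents are made up purely for the example.

use std::fs;
use std::io::Write;
use std::path::PathBuf;

fn main() -> std::io::Result<()> {
    let path = PathBuf::from("sample_file_index.tsv");
    let mut f = fs::File::create(&path)?;
    writeln!(f, "1\texample_genome_a.fna")?;
    writeln!(f, "2\texample_genome_b.fna")?;
    drop(f);

    // Same extraction logic as get_lastest_file_index: first tab-separated
    // column of each line, parsed as usize, maximum taken.
    let max_index = fs::read_to_string(&path)?
        .lines()
        .filter_map(|line| line.split('\t').next())
        .filter_map(|s| s.parse::<usize>().ok())
        .max()
        .unwrap_or(0);
    assert_eq!(max_index, 2);

    fs::remove_file(&path)?;
    Ok(())
}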