From 5bf41742a622d16cf6c087f5fc45205b7dd3e192 Mon Sep 17 00:00:00 2001 From: dagou Date: Thu, 4 Jul 2024 15:20:31 +0800 Subject: [PATCH] bug fix --- kr2r/src/args.rs | 4 --- kr2r/src/bin/annotate.rs | 12 +-------- kr2r/src/bin/build_k2_db.rs | 23 ++++++----------- kr2r/src/bin/direct.rs | 6 +---- kr2r/src/bin/hashshard.rs | 19 ++++---------- kr2r/src/bin/kun.rs | 1 - kr2r/src/bin/splitr.rs | 2 +- kr2r/src/compact_hash.rs | 50 +++++++++++++++++++++++++++++-------- kr2r/src/db.rs | 22 ---------------- 9 files changed, 54 insertions(+), 85 deletions(-) diff --git a/kr2r/src/args.rs b/kr2r/src/args.rs index b1bf46c..1be9129 100644 --- a/kr2r/src/args.rs +++ b/kr2r/src/args.rs @@ -105,10 +105,6 @@ pub struct ClassifyArgs { )] pub minimum_hit_groups: usize, - /// Enables use of a Kraken 2 compatible shared database. - #[clap(long, default_value_t = false)] - pub kraken_db_type: bool, - /// In comb. w/ -R, provide minimizer information in report #[clap(short = 'K', long, value_parser, default_value_t = false)] pub report_kmer_data: bool, diff --git a/kr2r/src/bin/annotate.rs b/kr2r/src/bin/annotate.rs index d69f75d..9f9dfd4 100644 --- a/kr2r/src/bin/annotate.rs +++ b/kr2r/src/bin/annotate.rs @@ -26,10 +26,6 @@ pub struct Args { #[arg(long = "db", required = true)] pub database: PathBuf, - /// Enables use of a Kraken 2 compatible shared database. Default is false. - #[clap(long, default_value_t = false)] - pub kraken_db_type: bool, - /// chunk directory #[clap(long)] pub chunk_dir: PathBuf, @@ -178,13 +174,7 @@ fn process_chunk_file>( println!("start load table..."); let config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?; - let chtm = CHTable::from_range( - config, - hash_files, - page_index, - page_index + 1, - args.kraken_db_type, - )?; + let chtm = CHTable::from_range(config, hash_files, page_index, page_index + 1)?; // 计算持续时间 let duration = start.elapsed(); diff --git a/kr2r/src/bin/build_k2_db.rs b/kr2r/src/bin/build_k2_db.rs index 55264a3..d1281c8 100644 --- a/kr2r/src/bin/build_k2_db.rs +++ b/kr2r/src/bin/build_k2_db.rs @@ -2,10 +2,7 @@ use clap::Parser; use kr2r::args::{parse_size, Build}; use kr2r::compact_hash::HashConfig; -use kr2r::db::{ - convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file, - write_config_to_file, -}; +use kr2r::db::{convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file}; use kr2r::utils::{ create_partition_files, create_partition_writers, find_library_fna_files, get_file_limit, read_id_to_taxon_map, set_fd_limit, @@ -58,7 +55,8 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<()> { let start = Instant::now(); let meros = idx_opts.as_meros(); let hash_files = find_and_sort_files(&args.database, "hash", ".k2d")?; - let chtable = CHTable::from_hash_files(hash_config, &hash_files, args.kraken_db_type)?; + let chtable = CHTable::from_hash_files(hash_config, &hash_files)?; process_files(args, meros, hash_config, &chtable, &taxo)?; let duration = start.elapsed(); diff --git a/kr2r/src/bin/hashshard.rs b/kr2r/src/bin/hashshard.rs index a097ca1..c0b94ce 100644 --- a/kr2r/src/bin/hashshard.rs +++ b/kr2r/src/bin/hashshard.rs @@ -37,11 +37,6 @@ fn mmap_read_write, Q: AsRef>( // 将读取的数据写入目标文件 dest_file.write_all(&buffer)?; - // let file = OpenOptions::new().read(true).open(&source_path)?; - // let mmap = unsafe { MmapOptions::new().offset(offset).len(length).map(&file)? 
};
-
-    // // 将内存映射的数据写入目标文件
-    // dest_file.write_all(&mmap)?;
 
     Ok(())
 }
@@ -68,9 +63,12 @@ pub struct Args {
 }
 
 pub fn run(args: Args) -> IOResult<()> {
     let index_filename = &args.database.join("hash.k2d");
-    let hash_config = HashConfig::from_hash_header(index_filename)?;
+    let mut hash_config = HashConfig::from_kraken2_header(index_filename)?;
     let partition = (hash_config.capacity + args.hash_capacity - 1) / args.hash_capacity;
+    hash_config.partition = partition;
+    hash_config.hash_capacity = args.hash_capacity;
+
     println!("hashshard start...");
     // 开始计时
     let start = Instant::now();
@@ -87,14 +85,7 @@ pub fn run(args: Args) -> IOResult<()> {
         panic!("hash config is exists!!!");
     }
 
-    mmap_read_write(
-        &index_filename,
-        config_file,
-        partition,
-        args.hash_capacity,
-        0,
-        32,
-    )?;
+    hash_config.write_to_file(config_file)?;
 
     for i in 1..=partition {
         let chunk_file = k2d_dir.join(format!("hash_{}.k2d", i));
diff --git a/kr2r/src/bin/kun.rs b/kr2r/src/bin/kun.rs
index 01713df..9333c6d 100644
--- a/kr2r/src/bin/kun.rs
+++ b/kr2r/src/bin/kun.rs
@@ -76,7 +76,6 @@ impl From<ClassifyArgs> for annotate::Args {
             database: item.database,
             chunk_dir: item.chunk_dir,
             batch_size: item.batch_size,
-            kraken_db_type: item.kraken_db_type,
         }
     }
 }
diff --git a/kr2r/src/bin/splitr.rs b/kr2r/src/bin/splitr.rs
index bb777a1..e9bd94d 100644
--- a/kr2r/src/bin/splitr.rs
+++ b/kr2r/src/bin/splitr.rs
@@ -250,7 +250,7 @@ pub fn run(args: Args) -> Result<()> {
     }
 
     let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?;
-    println!("hash_config {:?}", hash_config);
+    println!("{:?}", hash_config);
     if hash_config.hash_capacity == 0 {
         panic!("`hash_capacity` can't be zero!");
     }
diff --git a/kr2r/src/compact_hash.rs b/kr2r/src/compact_hash.rs
index a51197d..7a4dfa3 100644
--- a/kr2r/src/compact_hash.rs
+++ b/kr2r/src/compact_hash.rs
@@ -1,7 +1,8 @@
-use byteorder::{ByteOrder, LittleEndian, ReadBytesExt};
+use byteorder::{ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::cmp::Ordering as CmpOrdering;
+use std::fs::File;
 use std::fs::OpenOptions;
-use std::io::{Read, Result};
+use std::io::{BufWriter, Read, Result, Write};
 use std::path::Path;
 
 /// 1101010101 => left: 11010, right: 10101;
@@ -176,17 +177,21 @@ pub struct HashConfig {
     pub partition: usize,
     // 分块大小
     pub hash_capacity: usize,
+    // database version: 0 means the database was converted from a Kraken 2 index
+    pub version: usize,
 }
 
 // 为HashConfig手动实现Debug trait
 impl fmt::Debug for HashConfig {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("HashConfig")
-            .field("value_mask", &self.value_mask)
-            .field("value_bits", &self.value_bits)
+            .field("version", &self.version)
+            .field("partition", &self.partition)
+            .field("hash_capacity", &self.hash_capacity)
             .field("capacity", &self.capacity)
             .field("size", &self.size)
-            .field("hash_capacity", &self.hash_capacity)
+            .field("value_bits", &self.value_bits)
+            .field("value_mask", &self.value_mask)
             // 注意,我们没有包括_phantom字段
             .finish()
     }
@@ -193,7 +198,8 @@ impl fmt::Debug for HashConfig {
 }
 
 impl HashConfig {
     pub fn new(
+        version: usize,
         capacity: usize,
         value_bits: usize,
         size: usize,
@@ -208,20 +214,44 @@ impl HashConfig {
             size,
             partition,
             hash_capacity,
+            version,
         }
     }
+
+    pub fn write_to_file<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
+        // open the file for writing
+        let file = File::create(file_path)?;
+        let mut writer = BufWriter::new(file);
+        writer.write_u64::<LittleEndian>(self.version as u64)?;
+        writer.write_u64::<LittleEndian>(self.partition as u64)?;
+        writer.write_u64::<LittleEndian>(self.hash_capacity as u64)?;
+        writer.write_u64::<LittleEndian>(self.capacity as u64)?;
+        writer.write_u64::<LittleEndian>(self.size as u64)?;
+        writer.write_u64::<LittleEndian>(self.value_bits as u64)?;
+        writer.flush()?;
+        Ok(())
+    }
+
+    pub fn from_kraken2_header<P: AsRef<Path>>(filename: P) -> Result<Self> {
+        let mut file = OpenOptions::new().read(true).open(&filename)?;
+        let capacity = file.read_u64::<LittleEndian>()? as usize;
+        let size = file.read_u64::<LittleEndian>()? as usize;
+        let _ = file.read_u64::<LittleEndian>()? as usize;
+        let value_bits = file.read_u64::<LittleEndian>()? as usize;
+        Ok(Self::new(0, capacity, value_bits, size, 0, 0))
+    }
 
     pub fn from_hash_header<P: AsRef<Path>>(filename: P) -> Result<Self> {
         let mut file = OpenOptions::new().read(true).open(&filename)?;
+        let version = file.read_u64::<LittleEndian>()? as usize;
         let partition = file.read_u64::<LittleEndian>()? as usize;
         let hash_capacity = file.read_u64::<LittleEndian>()? as usize;
-
         let capacity = file.read_u64::<LittleEndian>()? as usize;
         let size = file.read_u64::<LittleEndian>()? as usize;
-        let _ = file.read_u64::<LittleEndian>()? as usize;
         let value_bits = file.read_u64::<LittleEndian>()? as usize;
 
         Ok(Self::new(
+            version,
             capacity,
             value_bits,
             size,
@@ -406,26 +436,24 @@ impl CHTable {
     pub fn from_hash_files<P: AsRef<Path> + Debug>(
         config: HashConfig,
         hash_sorted_files: &Vec<P>,
-        kd_type: bool,
     ) -> Result<Self> {
         let end = hash_sorted_files.len();
-        Self::from_range(config, hash_sorted_files, 0, end, kd_type)
+        Self::from_range(config, hash_sorted_files, 0, end)
     }
 
     pub fn from_range<P: AsRef<Path> + Debug>(
         config: HashConfig,
         hash_sorted_files: &Vec<P>,
         start: usize,
         end: usize,
-        kd_type: bool,
     ) -> Result<Self> {
         let mut pages = vec![Page::default(); start];
         let parition = hash_sorted_files.len();
         for i in start..end {
             let mut hash_file = &hash_sorted_files[i];
             let mut page = read_page_from_file(&hash_file)?;
             let next_page = if page.data.last().map_or(false, |&x| x != 0) {
-                if kd_type {
+                if config.version < 1 {
                     hash_file = &hash_sorted_files[(i + 1) % parition]
                 }
                 read_first_block_from_file(&hash_file)?
diff --git a/kr2r/src/db.rs b/kr2r/src/db.rs
index 971600d..6c9f997 100644
--- a/kr2r/src/db.rs
+++ b/kr2r/src/db.rs
@@ -78,28 +78,6 @@ fn write_hashtable_to_file(
     Ok(count)
 }
 
-pub fn write_config_to_file(
-    file_path: &PathBuf,
-    partition: u64,
-    hash_capacity: u64,
-    capacity: u64,
-    size: u64,
-    key_bits: u64,
-    value_bits: u64,
-) -> IOResult<()> {
-    // 打开文件用于写入
-    let file = File::create(file_path)?;
-    let mut writer = BufWriter::new(file);
-    writer.write_u64::<LittleEndian>(partition)?;
-    writer.write_u64::<LittleEndian>(hash_capacity)?;
-    writer.write_u64::<LittleEndian>(capacity)?;
-    writer.write_u64::<LittleEndian>(size)?;
-    writer.write_u64::<LittleEndian>(key_bits)?;
-    writer.write_u64::<LittleEndian>(value_bits)?;
-    writer.flush()?;
-    Ok(())
-}
-
 pub fn process_k2file(
     config: HashConfig,
     database: &PathBuf,
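
Not part of the patch above, just a reader's sketch: the new hash_config.k2d header that HashConfig::write_to_file emits (and from_hash_header parses) is six little-endian u64 fields, written in the order version, partition, hash_capacity, capacity, size, value_bits, with version 0 reserved for databases converted from a Kraken 2 index. The code below only round-trips that layout under those assumptions; HeaderSketch, write_header and read_header are made-up names for illustration and are not part of the kr2r API.

// Illustrative sketch of the six-field header round trip (assumed layout, ad hoc names).
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::fs::File;
use std::io::{BufReader, BufWriter, Result, Write};
use std::path::Path;

#[derive(Debug, PartialEq)]
struct HeaderSketch {
    version: u64,       // 0 = converted from a Kraken 2 database
    partition: u64,     // number of hash_{i}.k2d shards
    hash_capacity: u64, // slots per shard
    capacity: u64,      // total slots across all shards
    size: u64,          // occupied slots
    value_bits: u64,    // bits used for the value part of each cell
}

fn write_header<P: AsRef<Path>>(path: P, h: &HeaderSketch) -> Result<()> {
    let mut w = BufWriter::new(File::create(path)?);
    // Field order must match what the reader expects.
    for field in [h.version, h.partition, h.hash_capacity, h.capacity, h.size, h.value_bits] {
        w.write_u64::<LittleEndian>(field)?;
    }
    w.flush()
}

fn read_header<P: AsRef<Path>>(path: P) -> Result<HeaderSketch> {
    let mut r = BufReader::new(File::open(path)?);
    // Struct fields are evaluated top to bottom, so the reads stay in header order.
    Ok(HeaderSketch {
        version: r.read_u64::<LittleEndian>()?,
        partition: r.read_u64::<LittleEndian>()?,
        hash_capacity: r.read_u64::<LittleEndian>()?,
        capacity: r.read_u64::<LittleEndian>()?,
        size: r.read_u64::<LittleEndian>()?,
        value_bits: r.read_u64::<LittleEndian>()?,
    })
}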