Commit 5bf4174: bug fix

dagou committed Jul 4, 2024
1 parent cca04e7
Showing 9 changed files with 54 additions and 85 deletions.
4 changes: 0 additions & 4 deletions kr2r/src/args.rs
@@ -105,10 +105,6 @@ pub struct ClassifyArgs {
    )]
    pub minimum_hit_groups: usize,

-    /// Enables use of a Kraken 2 compatible shared database.
-    #[clap(long, default_value_t = false)]
-    pub kraken_db_type: bool,
-
    /// In comb. w/ -R, provide minimizer information in report
    #[clap(short = 'K', long, value_parser, default_value_t = false)]
    pub report_kmer_data: bool,
12 changes: 1 addition & 11 deletions kr2r/src/bin/annotate.rs
@@ -26,10 +26,6 @@ pub struct Args {
    #[arg(long = "db", required = true)]
    pub database: PathBuf,

-    /// Enables use of a Kraken 2 compatible shared database. Default is false.
-    #[clap(long, default_value_t = false)]
-    pub kraken_db_type: bool,
-
    /// chunk directory
    #[clap(long)]
    pub chunk_dir: PathBuf,

@@ -178,13 +174,7 @@ fn process_chunk_file<P: AsRef<Path>>(

    println!("start load table...");
    let config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?;
-    let chtm = CHTable::from_range(
-        config,
-        hash_files,
-        page_index,
-        page_index + 1,
-        args.kraken_db_type,
-    )?;
+    let chtm = CHTable::from_range(config, hash_files, page_index, page_index + 1)?;

    // Measure elapsed time
    let duration = start.elapsed();
23 changes: 7 additions & 16 deletions kr2r/src/bin/build_k2_db.rs
@@ -2,10 +2,7 @@
 use clap::Parser;
 use kr2r::args::{parse_size, Build};
 use kr2r::compact_hash::HashConfig;
-use kr2r::db::{
-    convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file,
-    write_config_to_file,
-};
+use kr2r::db::{convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file};
 use kr2r::utils::{
     create_partition_files, create_partition_writers, find_library_fna_files, get_file_limit,
     read_id_to_taxon_map, set_fd_limit,

@@ -58,7 +55,8 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro

    let capacity = required_capacity;
    let partition = (capacity + args.hash_capacity - 1) / args.hash_capacity;
-    let hash_config = HashConfig::new(capacity, value_bits, 0, partition, args.hash_capacity);
+    let mut hash_config =
+        HashConfig::new(1, capacity, value_bits, 0, partition, args.hash_capacity);

    // Start timing
    let start = Instant::now();

@@ -91,7 +89,7 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro

    let hash_filename = k2d_dir.join("hash_config.k2d");
    let partition = chunk_files.len();
-    let mut size: u64 = 0;
+    let mut size: usize = 0;

    println!("start process k2 files...");
    for i in 1..=partition {

@@ -104,23 +102,16 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
            chunk_size,
            i,
        )?;
-        size += count as u64;
+        size += count;
        let duration = start.elapsed();
        println!(
            "process chunk file {:?}/{:}: duration: {:?}",
            i, partition, duration
        );
    }

-    write_config_to_file(
-        &hash_filename,
-        partition as u64,
-        args.hash_capacity as u64,
-        capacity as u64,
-        size,
-        32 - hash_config.value_bits as u64,
-        hash_config.value_bits as u64,
-    )?;
+    hash_config.size = size;
+    hash_config.write_to_file(&hash_filename)?;

    // Measure elapsed time
    let duration = start.elapsed();
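
For orientation, here is a minimal sketch of the new build flow, assuming the HashConfig::new signature and write_to_file method introduced in the compact_hash.rs hunks further down; the output path and the placeholder size are hypothetical:

use kr2r::compact_hash::HashConfig;

fn build_sketch(capacity: usize, value_bits: usize, hash_capacity: usize) -> std::io::Result<()> {
    // Ceiling division: the number of fixed-size chunks needed to cover `capacity`.
    let partition = (capacity + hash_capacity - 1) / hash_capacity;
    // Version 1 marks a natively built database; version 0 is reserved for
    // databases converted from a Kraken 2 index (see from_kraken2_header).
    let mut hash_config = HashConfig::new(1, capacity, value_bits, 0, partition, hash_capacity);
    // ... process the chunk files here, accumulating the total entry count ...
    hash_config.size = 0; // placeholder; the real code assigns the accumulated count
    hash_config.write_to_file("k2d/hash_config.k2d")?;
    Ok(())
}
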
6 changes: 1 addition & 5 deletions kr2r/src/bin/direct.rs
@@ -77,10 +77,6 @@ pub struct Args {
    #[clap(short = 'p', long = "num-threads", value_parser, default_value_t = num_cpus::get())]
    pub num_threads: usize,

-    /// Enables use of a Kraken 2 compatible shared database. Default is false.
-    #[clap(long, default_value_t = false)]
-    pub kraken_db_type: bool,
-
    /// A list of input file paths (FASTA/FASTQ) to be processed by the classify program.
    /// Supports fasta or fastq format files (e.g., .fasta, .fastq) and gzip compressed files (e.g., .fasta.gz, .fastq.gz).
    // #[clap(short = 'F', long = "files")]

@@ -352,7 +348,7 @@ pub fn run(args: Args) -> Result<()> {
    let start = Instant::now();
    let meros = idx_opts.as_meros();
    let hash_files = find_and_sort_files(&args.database, "hash", ".k2d")?;
-    let chtable = CHTable::from_hash_files(hash_config, &hash_files, args.kraken_db_type)?;
+    let chtable = CHTable::from_hash_files(hash_config, &hash_files)?;

    process_files(args, meros, hash_config, &chtable, &taxo)?;
    let duration = start.elapsed();
19 changes: 5 additions & 14 deletions kr2r/src/bin/hashshard.rs
@@ -37,11 +37,6 @@ fn mmap_read_write<P: AsRef<Path>, Q: AsRef<Path>>(

    // Write the data that was read into the destination file
    dest_file.write_all(&buffer)?;
-    // let file = OpenOptions::new().read(true).open(&source_path)?;
-    // let mmap = unsafe { MmapOptions::new().offset(offset).len(length).map(&file)? };
-
-    // // Write the memory-mapped data into the destination file
-    // dest_file.write_all(&mmap)?;

    Ok(())
}

@@ -68,9 +63,12 @@ pub struct Args {

pub fn run(args: Args) -> IOResult<()> {
    let index_filename = &args.database.join("hash.k2d");
-    let hash_config = HashConfig::from_hash_header(index_filename)?;

+    let mut hash_config = HashConfig::from_kraken2_header(index_filename)?;
    let partition = (hash_config.capacity + args.hash_capacity - 1) / args.hash_capacity;
+    hash_config.partition = partition;
+    hash_config.hash_capacity = args.hash_capacity;
+
    println!("hashshard start...");
    // Start timing
    let start = Instant::now();

@@ -87,14 +85,7 @@ pub fn run(args: Args) -> IOResult<()> {
        panic!("hash config is exists!!!");
    }

-    mmap_read_write(
-        &index_filename,
-        config_file,
-        partition,
-        args.hash_capacity,
-        0,
-        32,
-    )?;
+    hash_config.write_to_file(config_file)?;

    for i in 1..=partition {
        let chunk_file = k2d_dir.join(format!("hash_{}.k2d", i));
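
For context, from_kraken2_header (added in compact_hash.rs below) reads the four leading little-endian u64 fields of a Kraken 2 hash.k2d header: capacity, size, a discarded field (key_bits in the removed write_config_to_file), and value_bits. A self-contained sketch of that parsing, assuming only the byteorder crate; the function name and return shape are illustrative:

use byteorder::{LittleEndian, ReadBytesExt};
use std::fs::File;
use std::io::Result;

// Peek at the leading header fields of a Kraken 2 hash.k2d file,
// mirroring what HashConfig::from_kraken2_header reads.
fn peek_kraken2_header(path: &str) -> Result<(usize, usize, usize)> {
    let mut file = File::open(path)?;
    let capacity = file.read_u64::<LittleEndian>()? as usize;
    let size = file.read_u64::<LittleEndian>()? as usize;
    let _key_bits = file.read_u64::<LittleEndian>()?; // read and discarded
    let value_bits = file.read_u64::<LittleEndian>()? as usize;
    Ok((capacity, size, value_bits))
}
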
1 change: 0 additions & 1 deletion kr2r/src/bin/kun.rs
@@ -76,7 +76,6 @@ impl From<ClassifyArgs> for annotate::Args {
            database: item.database,
            chunk_dir: item.chunk_dir,
            batch_size: item.batch_size,
-            kraken_db_type: item.kraken_db_type,
        }
    }
}
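
For reference, a trimmed-down, hypothetical sketch of the conversion pattern this hunk touches: classify-level arguments are projected onto the annotate subcommand's Args via From, and the kraken_db_type field is simply no longer forwarded (struct fields reduced for illustration):

use std::path::PathBuf;

struct ClassifyArgs { database: PathBuf, batch_size: usize }
struct AnnotateArgs { database: PathBuf, batch_size: usize }

impl From<ClassifyArgs> for AnnotateArgs {
    fn from(item: ClassifyArgs) -> Self {
        // The per-run kraken_db_type flag is gone; the database version stored
        // in hash_config.k2d now carries this information instead.
        AnnotateArgs {
            database: item.database,
            batch_size: item.batch_size,
        }
    }
}
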
2 changes: 1 addition & 1 deletion kr2r/src/bin/splitr.rs
@@ -250,7 +250,7 @@ pub fn run(args: Args) -> Result<()> {
    }
    let hash_config = HashConfig::from_hash_header(&args.database.join("hash_config.k2d"))?;

-    println!("hash_config {:?}", hash_config);
+    println!("{:?}", hash_config);
    if hash_config.hash_capacity == 0 {
        panic!("`hash_capacity` can't be zero!");
    }
50 changes: 39 additions & 11 deletions kr2r/src/compact_hash.rs
@@ -1,7 +1,8 @@
-use byteorder::{ByteOrder, LittleEndian, ReadBytesExt};
+use byteorder::{ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt};
 use std::cmp::Ordering as CmpOrdering;
+use std::fs::File;
 use std::fs::OpenOptions;
-use std::io::{Read, Result};
+use std::io::{BufWriter, Read, Result, Write};
 use std::path::Path;

/// 1101010101 => left: 11010, right: 10101;

@@ -176,24 +177,29 @@ pub struct HashConfig {
    pub partition: usize,
    // chunk size
    pub hash_capacity: usize,
+    // database version; 0 means the database was converted from Kraken 2
+    pub version: usize,
}

// Manual Debug implementation for HashConfig
impl fmt::Debug for HashConfig {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("HashConfig")
-            .field("value_mask", &self.value_mask)
-            .field("value_bits", &self.value_bits)
+            .field("version", &self.version)
+            .field("partition", &self.partition)
+            .field("hash_capacity", &self.hash_capacity)
            .field("capacity", &self.capacity)
            .field("size", &self.size)
-            .field("hash_capacity", &self.hash_capacity)
+            .field("value_bits", &self.value_bits)
+            .field("value_mask", &self.value_mask)
            // note that the _phantom field is not included
            .finish()
    }
}

impl HashConfig {
    pub fn new(
+        version: usize,
        capacity: usize,
        value_bits: usize,
        size: usize,

@@ -208,20 +214,44 @@ impl HashConfig {
            size,
            partition,
            hash_capacity,
+            version,
        }
    }

+    pub fn write_to_file<P: AsRef<Path>>(&self, file_path: P) -> Result<()> {
+        // Open the file for writing
+        let file = File::create(file_path)?;
+        let mut writer = BufWriter::new(file);
+        writer.write_u64::<LittleEndian>(self.version as u64)?;
+        writer.write_u64::<LittleEndian>(self.partition as u64)?;
+        writer.write_u64::<LittleEndian>(self.hash_capacity as u64)?;
+        writer.write_u64::<LittleEndian>(self.capacity as u64)?;
+        writer.write_u64::<LittleEndian>(self.size as u64)?;
+        writer.write_u64::<LittleEndian>(self.value_bits as u64)?;
+        writer.flush()?;
+        Ok(())
+    }
+
+    pub fn from_kraken2_header<P: AsRef<Path>>(filename: P) -> Result<Self> {
+        let mut file = OpenOptions::new().read(true).open(&filename)?;
+        let capacity = file.read_u64::<LittleEndian>()? as usize;
+        let size = file.read_u64::<LittleEndian>()? as usize;
+        let _ = file.read_u64::<LittleEndian>()? as usize;
+        let value_bits = file.read_u64::<LittleEndian>()? as usize;
+        Ok(Self::new(0, capacity, value_bits, size, 0, 0))
+    }
+
    pub fn from_hash_header<P: AsRef<Path>>(filename: P) -> Result<Self> {
        let mut file = OpenOptions::new().read(true).open(&filename)?;
+        let version = file.read_u64::<LittleEndian>()? as usize;
        let partition = file.read_u64::<LittleEndian>()? as usize;
        let hash_capacity = file.read_u64::<LittleEndian>()? as usize;

        let capacity = file.read_u64::<LittleEndian>()? as usize;
        let size = file.read_u64::<LittleEndian>()? as usize;
-        let _ = file.read_u64::<LittleEndian>()? as usize;
        let value_bits = file.read_u64::<LittleEndian>()? as usize;

        Ok(Self::new(
+            version,
            capacity,
            value_bits,
            size,

@@ -406,26 +436,24 @@ impl CHTable {
    pub fn from_hash_files<P: AsRef<Path> + Debug>(
        config: HashConfig,
        hash_sorted_files: &Vec<P>,
-        kd_type: bool,
    ) -> Result<CHTable> {
        let end = hash_sorted_files.len();
-        Self::from_range(config, hash_sorted_files, 0, end, kd_type)
+        Self::from_range(config, hash_sorted_files, 0, end)
    }

    pub fn from_range<P: AsRef<Path> + Debug>(
        config: HashConfig,
        hash_sorted_files: &Vec<P>,
        start: usize,
        end: usize,
-        kd_type: bool,
    ) -> Result<CHTable> {
        let mut pages = vec![Page::default(); start];
        let parition = hash_sorted_files.len();
        for i in start..end {
            let mut hash_file = &hash_sorted_files[i];
            let mut page = read_page_from_file(&hash_file)?;
            let next_page = if page.data.last().map_or(false, |&x| x != 0) {
-                if kd_type {
+                if config.version < 1 {
                    hash_file = &hash_sorted_files[(i + 1) % parition]
                }
                read_first_block_from_file(&hash_file)?
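
Taken together, write_to_file and from_hash_header define a fixed 48-byte header for hash_config.k2d: six little-endian u64 fields in the order version, partition, hash_capacity, capacity, size, value_bits. A minimal in-memory round-trip sketch of that layout using the byteorder crate; the field values are arbitrary examples:

use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io::Cursor;

fn main() -> std::io::Result<()> {
    // Header fields in the order written by HashConfig::write_to_file.
    let fields: [u64; 6] = [
        1,       // version (0 = converted from a Kraken 2 database)
        4,       // partition
        1 << 20, // hash_capacity
        4 << 20, // capacity
        0,       // size
        21,      // value_bits
    ];

    let mut buf = Vec::new();
    for v in fields {
        buf.write_u64::<LittleEndian>(v)?;
    }
    assert_eq!(buf.len(), 48); // 6 x u64 = fixed 48-byte header

    // Read the fields back in the same order, as from_hash_header does.
    let mut rdr = Cursor::new(buf);
    for expected in fields {
        assert_eq!(rdr.read_u64::<LittleEndian>()?, expected);
    }
    Ok(())
}
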
22 changes: 0 additions & 22 deletions kr2r/src/db.rs
@@ -78,28 +78,6 @@ fn write_hashtable_to_file(
    Ok(count)
}

-pub fn write_config_to_file(
-    file_path: &PathBuf,
-    partition: u64,
-    hash_capacity: u64,
-    capacity: u64,
-    size: u64,
-    key_bits: u64,
-    value_bits: u64,
-) -> IOResult<()> {
-    // Open the file for writing
-    let file = File::create(file_path)?;
-    let mut writer = BufWriter::new(file);
-    writer.write_u64::<LittleEndian>(partition)?;
-    writer.write_u64::<LittleEndian>(hash_capacity)?;
-    writer.write_u64::<LittleEndian>(capacity)?;
-    writer.write_u64::<LittleEndian>(size)?;
-    writer.write_u64::<LittleEndian>(key_bits)?;
-    writer.write_u64::<LittleEndian>(value_bits)?;
-    writer.flush()?;
-    Ok(())
-}
-
pub fn process_k2file(
    config: HashConfig,
    database: &PathBuf,
