Skip to content

Commit

Permalink
refactor: update hash types and add hasher trait
Browse files Browse the repository at this point in the history
  • Loading branch information
jRimbault committed Jan 31, 2025
1 parent 40f60e6 commit f370026
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 91 deletions.
12 changes: 2 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ human-panic = { version = "2.0.2", optional = true }
metrohash = { version = "1.0.6", optional = true }
seahash = { version = "4.1.0", optional = true }
serde_json = { version = "1.0.132", optional = true }
twox-hash = { version = "1.6.3", optional = true }
twox-hash = { version = "2.1.0", optional = true }

[dev-dependencies]
assert_cmd = "2"
Expand All @@ -77,6 +77,6 @@ highway = "1.2.0"
once_cell = "1.20.2"
serde_json = "1.0.132"
seahash = "4.1.0"
twox-hash = "1.6.3"
twox-hash = "2.1.0"
predicates = "3.1.2"
rand = "0.8"
23 changes: 11 additions & 12 deletions src/fs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ mod hash;
use crate::ext::{IteratorExt, WalkBuilderAddPaths, WalkParallelForEach};
use crate::TreeBag;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::hash::Hasher;
use std::path::{Path, PathBuf};

const CHANNEL_SIZE: usize = 8 * 1024;
Expand All @@ -19,9 +18,9 @@ pub fn find_dupes_partial<H, P>(
directories: &[P],
max_depth: Option<usize>,
filter: filter::FileFilter,
) -> TreeBag<u64, PathBuf>
) -> TreeBag<H::Hash, PathBuf>
where
H: Hasher + Default,
H: crate::hasher::Hasher,
P: AsRef<Path>,
{
let mut paths = directories
Expand Down Expand Up @@ -55,9 +54,9 @@ where
.0
}

fn hash_entry<H>(filter: &filter::FileFilter, entry: ignore::DirEntry) -> Option<(u64, PathBuf)>
fn hash_entry<H>(filter: &filter::FileFilter, entry: ignore::DirEntry) -> Option<(H::Hash, PathBuf)>
where
H: Hasher + Default,
H: crate::hasher::Hasher,
{
let path = entry.path();
let meta = entry
Expand All @@ -73,9 +72,9 @@ where
Some((hash, entry.into_path()))
}

pub fn dedupe<H>(tree: TreeBag<u64, PathBuf>) -> crate::FileCounter
pub fn dedupe<H>(tree: TreeBag<H::Hash, PathBuf>) -> crate::FileCounter<H::Hash>
where
H: Hasher + Default,
H: crate::hasher::Hasher,
{
let (sender, receiver) = crossbeam_channel::bounded(CHANNEL_SIZE);
rayon::join(
Expand All @@ -90,10 +89,10 @@ where
}

fn process_bucket<H>(
sender: &mut crossbeam_channel::Sender<(u64, crate::Path)>,
(old_hash, bucket): (u64, Vec<PathBuf>),
sender: &mut crossbeam_channel::Sender<(H::Hash, crate::Path)>,
(old_hash, bucket): (H::Hash, Vec<PathBuf>),
) where
H: Hasher + Default,
H: crate::hasher::Hasher,
{
if bucket.len() == 1 {
let file = bucket.into_iter().next().unwrap();
Expand All @@ -112,9 +111,9 @@ fn process_bucket<H>(
}
}

fn rehash_file<H>(file: &Path) -> Result<u64, ()>
fn rehash_file<H>(file: &Path) -> Result<H::Hash, ()>
where
H: Hasher + Default,
H: crate::hasher::Hasher,
{
if file.metadata().map(|f| f.len()).unwrap_or(0) < BLOCK_SIZE as _ {
return Err(());
Expand Down
13 changes: 6 additions & 7 deletions src/fs/hash.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
use super::BLOCK_SIZE;
use std::fs::File;
use std::hash::Hasher;
use std::io::{self, Read};
use std::path::Path;

/// Get a checksum of the first 4 KiB (at most) of a file.
pub fn partial<H>(path: &Path) -> io::Result<u64>
pub fn partial<H>(path: &Path) -> io::Result<H::Hash>
where
H: Hasher + Default,
H: crate::hasher::Hasher,
{
let mut file = File::open(path)?;
let mut buffer = [0u8; BLOCK_SIZE];
Expand All @@ -27,18 +26,18 @@ where
}

/// Get a complete checksum of a file.
pub fn full<H>(path: &Path) -> io::Result<u64>
pub fn full<H>(path: &Path) -> io::Result<H::Hash>
where
H: Hasher + Default,
H: crate::hasher::Hasher,
{
/// Compile time [`Write`](std::io::Write) wrapper for a [`Hasher`](core::hash::Hasher).
/// This should get erased at compile time.
#[repr(transparent)]
struct HashWriter<H>(H);

impl<H: Hasher> io::Write for HashWriter<H> {
impl<H: crate::hasher::Hasher> io::Write for HashWriter<H> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.0.write(buf);
crate::hasher::Hasher::write(&mut self.0, buf);
Ok(buf.len())
}

Expand Down
64 changes: 64 additions & 0 deletions src/hasher.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/// A streaming hasher whose digest type is chosen by the implementor.
///
/// Bytes are fed in with [`Hasher::write`]; [`Hasher::finish`] consumes the
/// hasher and yields the implementor-defined [`Hasher::Hash`] value.
/// A fresh hasher is obtained through `Default`.
pub trait Hasher: Default {
    /// Digest type returned by [`Hasher::finish`].
    type Hash: Hash;

    /// Feeds `buf` into the hasher state.
    fn write(&mut self, buf: &[u8]);

    /// Consumes the hasher and returns its digest.
    fn finish(self) -> Self::Hash;
}

/// Marker trait for digest values: totally ordered, `Copy`, `Send` and `Sync`.
///
/// `Ord` already requires `Eq` and `PartialOrd` (and therefore `PartialEq`),
/// so those bounds are implied rather than spelled out.
pub trait Hash: Ord + Copy + Send + Sync {}

/// Blanket implementation: any type satisfying the bounds is a [`Hash`].
impl<T: Ord + Copy + Send + Sync> Hash for T {}

impl Hasher for ahash::AHasher {

Check failure on line 11 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `ahash`

Check failure on line 11 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `ahash`

Check failure on line 11 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `ahash`

Check failure on line 11 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `ahash`

Check failure on line 11 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `ahash`

Check failure on line 11 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `ahash`
type Hash = u64;
fn write(&mut self, buf: &[u8]) {
std::hash::Hasher::write(self, buf);
}
fn finish(self) -> Self::Hash {
std::hash::Hasher::finish(&self)
}
}

impl Hasher for highway::HighwayHasher {

Check failure on line 21 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `highway`

Check failure on line 21 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `highway`

Check failure on line 21 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `highway`

Check failure on line 21 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `highway`

Check failure on line 21 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `highway`

Check failure on line 21 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `highway`
type Hash = [u64; 4];
fn write(&mut self, buf: &[u8]) {
use highway::HighwayHash;

Check failure on line 24 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, stable, --no-default-features)

unresolved import `highway`

Check failure on line 24 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, 1.74.1, --no-default-features)

unresolved import `highway`

Check failure on line 24 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, 1.74.1, --no-default-features)

unresolved import `highway`

Check failure on line 24 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, stable, --no-default-features)

unresolved import `highway`

Check failure on line 24 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, stable, --no-default-features)

unresolved import `highway`

Check failure on line 24 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, 1.74.1, --no-default-features)

unresolved import `highway`
self.append(buf);
}

fn finish(self) -> Self::Hash {
use highway::HighwayHash;

Check failure on line 29 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, stable, --no-default-features)

unresolved import `highway`

Check failure on line 29 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, 1.74.1, --no-default-features)

unresolved import `highway`

Check failure on line 29 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, 1.74.1, --no-default-features)

unresolved import `highway`

Check failure on line 29 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, stable, --no-default-features)

unresolved import `highway`

Check failure on line 29 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, stable, --no-default-features)

unresolved import `highway`

Check failure on line 29 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, 1.74.1, --no-default-features)

unresolved import `highway`
self.finalize256()
}
}

impl Hasher for metrohash::MetroHash128 {

Check failure on line 34 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `metrohash`

Check failure on line 34 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `metrohash`

Check failure on line 34 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `metrohash`

Check failure on line 34 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `metrohash`

Check failure on line 34 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `metrohash`

Check failure on line 34 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `metrohash`
type Hash = (u64, u64);
fn write(&mut self, buf: &[u8]) {
std::hash::Hasher::write(self, buf);
}

fn finish(self) -> Self::Hash {
self.finish128()
}
}

impl Hasher for seahash::SeaHasher {

Check failure on line 45 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `seahash`

Check failure on line 45 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `seahash`

Check failure on line 45 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `seahash`

Check failure on line 45 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `seahash`

Check failure on line 45 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `seahash`

Check failure on line 45 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `seahash`
type Hash = u64;
fn write(&mut self, buf: &[u8]) {
std::hash::Hasher::write(self, buf);
}
fn finish(self) -> Self::Hash {
std::hash::Hasher::finish(&self)
}
}

impl Hasher for twox_hash::xxhash3_128::Hasher {

Check failure on line 55 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `twox_hash`

Check failure on line 55 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (ubuntu-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `twox_hash`

Check failure on line 55 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `twox_hash`

Check failure on line 55 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (macos-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `twox_hash`

Check failure on line 55 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, stable, --no-default-features)

failed to resolve: use of undeclared crate or module `twox_hash`

Check failure on line 55 in src/hasher.rs

View workflow job for this annotation

GitHub Actions / Check build (windows-latest, 1.74.1, --no-default-features)

failed to resolve: use of undeclared crate or module `twox_hash`
type Hash = u128;
fn write(&mut self, buf: &[u8]) {
self.write(buf);
}

fn finish(self) -> Self::Hash {
self.finish_128()
}
}
12 changes: 7 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,18 @@
mod bag;
mod ext;
mod fs;
mod hasher;
mod path;

pub use bag::{Factor, Fdupes, Machine, TreeBag};
pub use globset;
pub use hasher::Hasher;
pub use path::Path;
pub use regex;
use std::hash::Hasher;
use std::rc::Rc;

pub type FileCounter = TreeBag<u64, Path>;
pub type FileReplicates<'a> = bag::Replicates<'a, u64, Path>;
pub type FileCounter<H> = TreeBag<H, Path>;
pub type FileReplicates<'a, H> = bag::Replicates<'a, H, Path>;

/// Search configuration.
///
Expand Down Expand Up @@ -83,9 +84,10 @@ where
P: AsRef<std::path::Path>,
{
/// This will attempt a complete scan according to its configuration.
pub fn scan<H>(self) -> FileCounter
pub fn scan<H>(self) -> FileCounter<H::Hash>
where
H: Hasher + Default,
H: hasher::Hasher,
H::Hash: std::fmt::Debug,
{
#[cfg(unix)]
let file_filter = fs::filter::FileFilter::new(
Expand Down
Loading

0 comments on commit f370026

Please sign in to comment.