From 8c287d810f800b3280e19fbccbb32ba51260fbbd Mon Sep 17 00:00:00 2001
From: K--Aethiax <36335359+eternal-io@users.noreply.github.com>
Date: Fri, 20 Dec 2024 23:12:04 +0800
Subject: [PATCH] v0.3 is ready.

---
 .gitattributes      |   4 +-
 CRATES.IO-README.md |  64 +++---
 Cargo.lock          |   6 +-
 Cargo.toml          |  18 +-
 README.md           | 186 +++++++++++------
 README.zh-Hans.md   | 186 +++++++++++++++++
 benches/hashes.rs   |  10 +-
 museair.cpp         |  83 ++++----
 src/lib.rs          | 483 +++++++++++++++++++++++++++++++-------------
 9 files changed, 756 insertions(+), 284 deletions(-)
 create mode 100644 README.zh-Hans.md
diff --git a/.gitattributes b/.gitattributes
index cd8cd4e..c55e7e7 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,3 @@
-museair.cpp -linguist-detectable
+museair.cpp     -linguist-detectable
+results/**      -linguist-detectable
+results/*.log   -diff
diff --git a/CRATES.IO-README.md b/CRATES.IO-README.md
index c503f9e..4260274 100644
--- a/CRATES.IO-README.md
+++ b/CRATES.IO-README.md
@@ -1,24 +1,30 @@
+# MuseAir
+
 [![](https://img.shields.io/crates/v/museair)](https://crates.io/crates/museair)
 [![](https://img.shields.io/crates/d/museair)](https://crates.io/crates/museair)
 [![](https://img.shields.io/crates/l/museair)](#)
 [![](https://img.shields.io/docsrs/museair)](https://docs.rs/museair)
 [![](https://img.shields.io/github/stars/eternal-io/museair?style=social)](https://github.com/eternal-io/museair)
 
-This is the new fastest portable hash: immune to blinding multiplication, even faster then wyhash, SMHasher3 passed.
 
-See [repository](https://github.com/eternal-io/museair) for details.
+A fast portable hash algorithm with the highest bulk throughput and the lowest small-key latency (1-32 bytes) among the portable hashes listed in SMHasher3,
+with improved quality and usability. See [repository](https://github.com/eternal-io/museair) for details.
+
+It provides two variants: `Standard` (items listed in crate root) and `BFast`.
+The former offers better quality and the latter offers better performance.
+Both variants offer 64-bit and 128-bit output with essentially the same overhead.
 
 
 ## Usage
 
-```
+```rust
 let seed: u64 = 42;
 
-let one_shot = museair::hash_128("MuseAir hash!".as_bytes(), seed);
+let one_shot = museair::hash_128("K--Aethiax".as_bytes(), seed);
 let streamed = {
     let mut hasher = museair::Hasher::with_seed(seed);
-    hasher.write("MuseAir".as_bytes());
-    hasher.write(" hash!".as_bytes());
+    hasher.write("K--Ae".as_bytes());
+    hasher.write("thiax".as_bytes());
     hasher.finish_128()
 };
 
@@ -26,37 +32,35 @@ assert_eq!(one_shot, streamed);
 ```
 
 
-## Benchmarks
-
-| Hash              | Digest length |                      Throughput |
-|:----------------- | -------------:| -------------------------------:|
-| MuseAir           |        64-bit |  29.1 GiB/s   <sub>(0.88)</sub> |
-| MuseAir-128       |       128-bit |  29.0 GiB/s   <sub>(0.88)</sub> |
-| MuseAir-BFast     |        64-bit |**33.0 GiB/s** <sub>(1.00)</sub> |
-| MuseAir-BFast-128 |       128-bit |**33.0 GiB/s** <sub>(1.00)</sub> |
-| [WyHash]          |        64-bit |  29.0 GiB/s   <sub>(0.88)</sub> |
-| [WyHash]-condom   |        64-bit |  24.3 GiB/s   <sub>(0.74)</sub> |
-| [KomiHash]        |        64-bit |  27.7 GiB/s   <sub>(0.84)</sub> |
-
-(These results are obtained by running `cargo bench` on AMD Ryzen 7 5700G 4.6GHz Desktop.)
+## Security
 
-[WyHash]: https://crates.io/crates/wyhash-final4
-[KomiHash]: https://crates.io/crates/komihash
+MuseAir is **NOT** designed for cryptographic security. You shouldn't use this for security purposes,
+such as ensuring that files have not been maliciously tampered with. For these use cases, consider SHA-3, Ascon or Blake3.
 
+Besides, MuseAir-`Standard` is planned to be stable after some time (1.0.0).
+Due to its improved quality, it will then be available for the following purposes:
 
-## Security
+- Persistent file format
+- Communication protocol
+- ...
 
-MuseAir is ***NOT*** intended for cryptographic security.
+_Until then, it should only be used for local sessions!_
 
-- To resist HashDoS, your hash must comes with a private seed.
-- To ensure the protection of your data, it is recommended to use a well-established algorithm, such as SHA-3.
 
+## Benchmarks
 
-## Versioning policy
+| Hash               | Digest length |      Throughput   |
+|:------------------ | -------------:| -----------------:|
+| MuseAir            |        64-bit |      30.5 GiB/s   |
+| MuseAir-128        |       128-bit |      30.4 GiB/s   |
+| MuseAir-BFast      |        64-bit |    **36.4 GiB/s** |
+| MuseAir-BFast-128  |       128-bit |    **36.3 GiB/s** |
+| [wyhash] 4.2       |        64-bit |      28.4 GiB/s   |
+|  wyhash.condom 4.2 |        64-bit |      22.8 GiB/s   |
+| [komihash] 5.10    |        64-bit |      26.8 GiB/s   |
 
-The `-Standard` variant (functions listed in the crate root) is not scheduled to be stable until version 1.0.0 is released.
-That is, the result of the hash may change from minor version to minor version. Don't use it for persistent storage yet.
+<img src="https://github.com/eternal-io/museair/blob/master/results/bench-smallkeys.png?raw=true" alt="Bench small keys" width="100%" />
 
-The `-BFast` variant will never be stable, you should only use this on local sessions.
-For persistent storage, you should always use the `-Standard` variant (after it is stable).
 
+[wyhash]: https://crates.io/crates/wyhash-final4
+[komihash]: https://crates.io/crates/komihash
diff --git a/Cargo.lock b/Cargo.lock
index 368e0da..ac7192c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -255,7 +255,7 @@ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
 [[package]]
 name = "museair"
-version = "0.3.0-rc6"
+version = "0.3.0"
 dependencies = [
  "criterion",
  "hashverify",
@@ -637,6 +637,6 @@ dependencies = [
 
 [[package]]
 name = "wyhash-final4"
-version = "0.2.1"
+version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e4f724fd8118d0922c0ec9269aa80365aa79542096d8557b02ccb569d0a913e"
+checksum = "e19276aeff1b4987182db862cc0e75a79023528cd1ec951ea3bd6cddba029c7d"
diff --git a/Cargo.toml b/Cargo.toml
index 46e15cf..14d11a9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,11 +1,11 @@
 [package]
 name = "museair"
-version = "0.3.0-rc6"
+version = "0.3.0"
 edition = "2021"
 authors = ["K--Aethiax"]
 publish = true
 
-description = "Blazingly™ fast portable hash function designed by K--Aethiax."
+description = "Fast portable hash algorithm with highest bulk throughput and lowest small key latency (1-32 bytes) among portable hashes listed in SMHasher3, and made improvements for quality and usability."
 
 documentation = "https://docs.rs/museair"
 repository = "https://github.com/eternal-io/museair"
@@ -15,17 +15,21 @@ readme = "CRATES.IO-README.md"
 keywords = ["hash", "hasher", "museair"]
 categories = ["algorithms", "no-std"]
 
-exclude = ["smhasher*", "results/*", "*.cpp", "*.md", "!CRATES.IO-README.md"]
-
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+exclude = [
+    "smhasher*",
+    "results/*",
+    "*.png",
+    "*.cpp",
+    "*.md",
+    "!CRATES.IO-README.md",
+]
 
 
 [dev-dependencies]
 hashverify = "0.1.0"
 criterion = "0.5.1"
 komihash = "0.4.1"
-wyhash-final4 = "0.2.1"
+wyhash-final4 = "0.2.2"
 wyhash = "0.5.0"
 
 
diff --git a/README.md b/README.md
index 2663188..8245500 100644
--- a/README.md
+++ b/README.md
@@ -6,102 +6,174 @@
 [![](https://img.shields.io/docsrs/museair)](https://docs.rs/museair)
 [![](https://img.shields.io/github/stars/eternal-io/museair?style=social)](https://github.com/eternal-io/museair)
 
-- **Portable**, does NOT rely on machine-specific instructions such as SSE, AVX, CLMUL, CRC32, etc.
+<p align="right"><sup><b><i>English</i></b> 丨 <a href="README.zh-Hans.md">简体中文</a></sup></p>
 
-- Completely **immune to blinding multiplication**, and accumulates the full 128-bit multiplication results instead of prematurely "compressing" them into 64-bit, this helps maintain the differentiation between consecutive states and *reduces entropy loss*.
 
-- **As fast as [WyHash] and its successor [RapidHash]** on bulks, but they suffer from blinding multiplication.
+MuseAir is a fast portable hash algorithm that:
 
-- Inputs are never simply mixed with constants, and the algorithm correctly implements seeding. This [**prevents seed-independent attacks**](https://github.com/Cyan4973/xxHash/issues/180) and does not require additional `secret[]` to be remedied.
+- has the highest throughput among the portable hashes listed in SMHasher3, and also performs well on small keys (see _Benchmarks_).
+- improves on quality and usability (see _Algorithm analysis_), and [passes](results) the full [SMHasher3] `--extra` tests.
 
-- Inputs are not divided into stripes while processing, this results in *better confusion and diffusion*.
+MuseAir provides two variants: `Standard` (default) and `BFast`. The former offers better quality and the latter offers better performance.
 
-- Produces either **64-bit or 128-bit** results with nearly the **same performance overhead**.
+- Both variants offer 64-bit and 128-bit output with essentially the same overhead.
 
+MuseAir is **NOT** designed for cryptographic security. You shouldn't use this for security purposes, such as ensuring that files have not been maliciously tampered with. For these use cases, consider SHA-3, Ascon or Blake3.
 
-Actually, MuseAir has two variants: **Standard** (this suffix is ​​omitted where it does not cause confusion) and **BFast**.
+Besides, MuseAir-`Standard` is planned to be stable after some time (1.0.0). Due to its improved quality, it will then be available for the following purposes:
 
-As its name suggests, the **BFast** variant is faster but *less* immune to blinding multiplication.
-("less" here means when it actually happens, it will only result in the most recent 8 bytes being lost, rather than all the past state of a stripe being catastrophically lost!)
+- Persistent file format
+- Communication protocol
+- ...
+
+_Until then, it should only be used for local sessions!_
 
 
 ## Benchmarks
 
+    AMD Ryzen 7 5700G 4.6GHz Desktop, Windows 10 22H2, rustc 1.82.0 (f6e511eec 2024-10-15)
+
+    - SMHasher3 (34093a3 2024-06-17) runs in WSL 2 with clang 14.0.0-1ubuntu1.1
+
+
 ### Bulk datas
 
-| Hash              | Digest length |   Throughput (C++ - [SMHasher3]) | Throughput (Rust - Criterion.rs) |
-|:----------------- | -------------:| --------------------------------:| --------------------------------:|
-| MuseAir           |        64-bit |   31.8 GiB/s   <sub>(0.96)</sub> |   29.1 GiB/s   <sub>(0.88)</sub> |
-| MuseAir-128       |       128-bit |   31.8 GiB/s   <sub>(0.96)</sub> |   29.0 GiB/s   <sub>(0.88)</sub> |
-| MuseAir-BFast     |        64-bit | **33.2 GiB/s** <sub>(1.00)</sub> | **33.0 GiB/s** <sub>(1.00)</sub> |
-| MuseAir-BFast-128 |       128-bit | **33.2 GiB/s** <sub>(1.00)</sub> | **33.0 GiB/s** <sub>(1.00)</sub> |
-| WyHash            |        64-bit |   31.9 GiB/s   <sub>(0.96)</sub> |   29.0 GiB/s   <sub>(0.88)</sub> |
-| WyHash-condom     |        64-bit |   25.3 GiB/s   <sub>(0.76)</sub> |   24.3 GiB/s   <sub>(0.74)</sub> |
-| KomiHash          |        64-bit |   25.5 GiB/s   <sub>(0.77)</sub> |   27.7 GiB/s   <sub>(0.84)</sub> |
+| Hash              | Digest length | Throughput (C++ - SMHasher3) | Throughput (Rust - Criterion.rs) |
+|:----------------- | -------------:| ----------------------------:| --------------------------------:|
+| MuseAir           |        64-bit |                   28.5 GiB/s |                       30.5 GiB/s |
+| MuseAir-128       |       128-bit |                   28.5 GiB/s |                       30.4 GiB/s |
+| MuseAir-BFast     |        64-bit |                   33.3 GiB/s |                       36.4 GiB/s |
+| MuseAir-BFast-128 |       128-bit |                   33.3 GiB/s |                       36.3 GiB/s |
+| wyhash 4.2        |        64-bit |                   31.9 GiB/s |                       28.4 GiB/s |
+| wyhash.condom 4.2 |        64-bit |                   25.3 GiB/s |                       22.8 GiB/s |
+| komihash 5.7      |        64-bit |                   25.5 GiB/s |                              N/A |
+| komihash 5.10     |        64-bit |                          N/A |                       26.8 GiB/s |
 
-(These results are obtained by running `./SMHasher3 --test=Speed <HASH>` and `cargo bench` on AMD Ryzen 7 5700G 4.6GHz Desktop.)
+<sup>*(These results are obtained by running `./SMHasher3 --test=Speed <HASH>` and `cargo bench`)*</sup>
 
-### Small datas
+Peak throughput is implementation-specific. Either way, MuseAir is the fastest portable hash for bulk data, reaching 1.14x the throughput of the previous fastest (wyhash).
 
-Currently there is no targeted design, it is simply modified from rapidhash.
 
-Therefore, for short inputs no more than 16 bytes, the performance is similar to rapidhash.
+### Small keys
 
-For short inputs larger than 16 bytes, the function call overhead makes them slower because there is a function that should not be inlined
-(otherwise the entire hash performance will be slower on all input sizes). This is the next step to focus on optimization.
+<img src="results/bench-smallkeys.png" alt="Bench small keys" width="100%" />
 
+<sup>*(These results are obtained by running `./SMHasher3 --test=Speed <HASH>`)*</sup>
 
-## Quality
+For the more common small keys of 1-32 bytes, MuseAir has significant performance advantages. In this range, MuseAir is still the fastest on average.
 
-They all passed [SMHasher3] with `--extra` option.
 
-- [MuseAir](results/SMHasher3_MuseAir_--extra.txt)
-- [MuseAir-128](results/SMHasher3_MuseAir-128_--extra.txt)
-- [MuseAir-BFast](results/SMHasher3_MuseAir-BFast_--extra.txt)
-  (While testing this variant, I was gaming, so the `[[[ Speed Tests ]]]` result were actually on the small side :P)
-- [MuseAir-BFast-128](results/SMHasher3_MuseAir-BFast-128_--extra.txt)
+## Implementations
 
-And no bad seeds were found (took too long, so only [MuseAir-BFast](./results/SMHasher3_MuseAir-BFast_--extra_--test=BadSeeds.txt) was searched).
+This repository provides the official Rust implementation of MuseAir. You can find this crate on [crates.io](https://crates.io/crates/museair).
 
-The `museair.cpp` in the repository root is for use with SMHasher3, so you can reproduce these results on your computer.
-Since it relies on the entire SMHasher3, it is not very usable in production.
+| Language | Link
+|:-------- |:----
+| **C**    | [eternal-io/museair-c](https://github.com/eternal-io/museair-c)
+| **C++**  | [Twilight-Dream-Of-Magic/museair-cpp](https://github.com/Twilight-Dream-Of-Magic/museair-cpp)
 
-#### Update: They also passed [SMHasher] with `--extra` option, with only one false positive.
 
-- [MuseAir](results/SMHasher_MuseAir_--extra.txt)
-- [MuseAir-128](results/SMHasher_MuseAir-128_--extra.txt)
-- [MuseAir-BFast](results/SMHasher_MuseAir-BFast_--extra.txt)
-- [MuseAir-BFast-128](results/SMHasher_MuseAir-BFast-128_--extra.txt)
+## Algorithm analysis
 
-(For keys shorter than 16 bytes, MuseAir and MuseAir-BFast have the same output.)
+First define `wide_mul` and `fold_mul`:
 
+```rust
+/// 64 x 64 -> 128 multiplication, returns lower 64-bit, then upper 64-bit.
+fn wide_mul(a: u64, b: u64) -> (u64, u64) {
+    let x = a as u128 * b as u128;
+    (x as u64, (x >> 64) as u64)
+}
 
-## Security
+/// XOR-fold the lower half and the upper half of the multiplication result.
+fn fold_mul(a: u64, b: u64) -> u64 {
+    let (lo, hi) = wide_mul(a, b);
+    lo ^ hi
+}
+```
 
-MuseAir is ***NOT*** intended for cryptographic security.
+**For small keys**, the significant speedup for 16-32 bytes comes mainly from eliminating a _data hazard_: the multiplications below no longer have to wait for earlier data, so they make effective use of the CPU pipeline:
 
-- To resist HashDoS, your hash must comes with a private seed.
-- To ensure the protection of your data, it is recommended to use a well-established algorithm, such as SHA-3.
+```rust
+/* not what they actually read, just to simplify the situation. */
 
+let mut acc_i = read_u64(&bytes[0..8]);
+let mut acc_j = read_u64(&bytes[8..16]);
 
-## Versioning policy
+if bytes.len() > 16 {
+    let (lo0, hi0) = wide_mul(CONSTANT[2], CONSTANT[3] ^ read_u64(&bytes[16..24]));
+    let (lo1, hi1) = wide_mul(CONSTANT[4], CONSTANT[5] ^ read_u64(&bytes[24..32]));
+    acc_i ^= lo0 ^ hi1;
+    acc_j ^= lo1 ^ hi0;
+}
+```
 
-The **Standard** variant (functions listed in the crate root) is not scheduled to be stable until version 1.0.0 is released.
-That is, the result of the hash may change from minor version to minor version. Don't use it for persistent storage *yet*.
+**For bulk datas**, consider wyhash's core loop:
 
-The **BFast** variant will never be stable, you should only use this on local sessions.
+```rust
+acc0 = fold_mul(acc0 ^ read_u64(&bytes[8 * 0..]), SECRET[0] ^ read_u64(&bytes[8 * 1..]));
+acc1 = fold_mul(acc1 ^ read_u64(&bytes[8 * 2..]), SECRET[1] ^ read_u64(&bytes[8 * 3..]));
+acc2 = fold_mul(acc2 ^ read_u64(&bytes[8 * 4..]), SECRET[2] ^ read_u64(&bytes[8 * 5..]));
+                                /* Left side */                        /* Right side */
+```
 
+actually has the following problems:
 
-## Implementations
+1. The input is divided into multiple strips that are processed separately, without diffusion between strips.
+2. The result is folded directly after wide multiplication. Although this is beneficial to confusion and further diffusion, it also causes a certain _entropy loss_, which may make it possible to design collisions accordingly.
+3. Once the _right side_ input happens to be the same as `SECRET[n]`, it causes one of the multipliers to be zero. Due to the nature of multiplication, the accumulator of the current strip will be destructively zeroized, and all past states will no longer exist. This situation is called "blinding multiplication" —— It is actually very easy to design collisions here, and its security comes entirely from the confidentiality of `SECRET[..]`, a set of constants that are independent of the seed. Therefore this type of attack is also called "seed-independent attack". This limits its application in communication protocols, persistent file formats, etc.
 
-This repository provides the official Rust implementation of MuseAir. You can find this crate on [crates.io](https://crates.io/crates/museair).
+In fact, to alleviate problem 3, wyhash also proposed the `condom` mode, which uses a modified folding multiplication:
+
+```rust
+fn fold_mul(a: u64, b: u64) -> u64 {
+    let (lo, hi) = wide_mul(a, b);
+    a ^ b ^ lo ^ hi
+}
+```
+
+This obviously avoids the problem. However, when the _right side_ continues to be zero, the _left side_ inputs will not diffuse at all and will be repeatedly overwritten by subsequent inputs. Worse, performance drops by more than 20%. At that point, other algorithms such as [komihash] are both faster and better.
+
+In order to solve all the above problems, MuseAir proposed the **Ring Accumulator Group** structure:
+
+```rust
+/* `wrapping_add` omitted. */
+
+state[0] ^= read_u64(&bytes[8 * 0..]);
+state[1] ^= read_u64(&bytes[8 * 1..]);
+let (lo0, hi0) = wide_mul(state[0], state[1]);
+state[0] += ring_prev ^ hi0;
+
+state[1] ^= read_u64(&bytes[8 * 2..]);
+state[2] ^= read_u64(&bytes[8 * 3..]);
+let (lo1, hi1) = wide_mul(state[1], state[2]);
+state[1] += lo0 ^ hi1;
+
+...
+
+state[5] ^= read_u64(&bytes[8 * 10..]);
+state[0] ^= read_u64(&bytes[8 * 11..]);
+let (lo5, hi5) = wide_mul(state[5], state[0]);
+state[5] += lo4 ^ hi5;
+
+ring_prev = lo5;
+```
+
+This is the accumulator group for the `Standard` variant. For the `BFast` variant, simply replace `+=` with `=`.
+
+For problem 1 and 2, all accumulator updates come from _this_ upper 64-bit and _previous_ lower 64-bit, which has good diffusion properties.
+
+For problem 3, since the multipliers are always dynamic and thanks to good diffusion, MuseAir is not affected by seed-independent attacks. As for blinding multiplication, the `Standard` variant does not overwrite the accumulators and is therefore not affected by it. The `BFast` variant does overwrite the accumulators, which needs a brief discussion:
+
+- After a certain read, if `state[0] == 0 && state[1] != 0`, then overwriting the accumulator will not cause any data loss. Also, `state[0]` will almost never fall into an all-zero state due to the lagging mix-in of the multiplication results.
+- After a certain read, if `state[0] != 0 && state[1] == 0`, then overwriting the accumulator will cause the data of `state[0]` (the most recent 8 bytes) to be lost. As for the previous data, it has already been diffused into the entire state and is not affected. Likewise, `state[0]` will almost never fall into the all-zero state, but for `state[1]`:
+  - If the next read unfortunately encounters an all-zero block, or the first byte is `0x01`, and the last seven bytes are all zeros (for universal inputs, there is only a $2^{-127}$ probability of getting here), then it will remain all-zero for this round.
+  - Of course it is more likely to hit a non-all-zero block. The precious upper 64-bit will bring it back from the low entropy state immediately after the next multiplication.
+
+In summary, for universal inputs, MuseAir-BFast only has a $2^{-64}$ probability that a certain 8 bytes of input will not affect the output. For comparison, wyhash has a $2^{-63}$ probability that the past one-third of the input does not affect the output.
 
-### 0.2
+As for performance, its improvement mainly comes from a deep understanding of instruction-level parallelism (ILP). Benchmarks show that the performance difference between MuseAir-Standard and wyhash is less than 6%.
 
-| Language | Author                  | Link                                                     |
-|:-------- |:----------------------- |:-------------------------------------------------------- |
-| **C**    | K--Aethiax              | <https://github.com/eternal-io/museair-c>                |
-| **C++**  | Twilight-Dream-Of-Magic | <https://github.com/Twilight-Dream-Of-Magic/museair-cpp> |
+MuseAir-Standard will be the fastest portable hash available for communication protocols and persistent file formats.
 
 
 ## License
@@ -111,7 +183,5 @@ MuseAir algorithm itself is released into the public domain under the CC0 licens
 These codes (implementation) in this repository are released under the MIT or Apache 2.0 dual license, at your option.
 
 
-[WyHash]: https://github.com/wangyi-fudan/wyhash
-[RapidHash]: https://github.com/Nicoshev/rapidhash
+[komihash]: https://github.com/avaneev/komihash
 [SMHasher3]: https://gitlab.com/fwojcik/smhasher3
-[SMHasher]: https://github.com/rurban/smhasher
diff --git a/README.zh-Hans.md b/README.zh-Hans.md
new file mode 100644
index 0000000..a5a28b9
--- /dev/null
+++ b/README.zh-Hans.md
@@ -0,0 +1,186 @@
+# <img src="MuseAir-icon-light.png" style="height:1em" /> MuseAir
+
+[![](https://img.shields.io/crates/v/museair)](https://crates.io/crates/museair)
+[![](https://img.shields.io/crates/d/museair)](https://crates.io/crates/museair)
+[![](https://img.shields.io/crates/l/museair)](#)
+[![](https://img.shields.io/docsrs/museair)](https://docs.rs/museair)
+[![](https://img.shields.io/github/stars/eternal-io/museair?style=social)](https://github.com/eternal-io/museair)
+
+<p align="right"><sup><b><i>简体中文</i></b> 丨 <a href="README.md">English</a></sup></p>
+
+
+MuseAir 是一个快速便携（portable）散列算法，拥有便携散列中最高的吞吐量，并且在短键上也能提供不俗的表现（详见“*基准测试*”）。她还针对质量和可用性进行了改进（详见“*算法分析*”），并且能够[*通过*](results)完整的 [SMHasher3] 扩展测试。
+
+MuseAir 提供了两种变体：`Standard`（默认）和 `BFast`。前者提供更好的质量，后者提供更好的性能。
+
+- 两种变体都提供 64 位和 128 位输出，并且开销基本一致。
+
+MuseAir **不是**为了密码学安全而设计的，你不应将其用于安全用途，例如，保证文件未经恶意篡改。对于这类用途，请考虑 SHA-3，Ascon 或 Blake3。
+
+此外，MuseAir-`Standard` 计划在一段时间（1.0.0）之后稳定。由于她已提升的质量，届时，她将可以用于以下用途：
+
+- 持久文件格式
+- 通信协议
+- ……
+
+在此之前，她应该只用于本地会话。
+
+
+## 基准测试 <sub>/ Benchmarks</sub>
+
+    AMD Ryzen 7 5700G 4.6GHz Desktop, Windows 10 22H2, rustc 1.82.0 (f6e511eec 2024-10-15)
+
+    - SMHasher3 (34093a3 2024-06-17) runs in WSL 2 with clang 14.0.0-1ubuntu1.1
+
+
+### 大块数据 <sub>/ Bulk datas</sub>
+
+| Hash              | Digest length | Throughput (C++ - SMHasher3) | Throughput (Rust - Criterion.rs) |
+|:----------------- | -------------:| ----------------------------:| --------------------------------:|
+| MuseAir           |        64-bit |                   28.5 GiB/s |                       30.5 GiB/s |
+| MuseAir-128       |       128-bit |                   28.5 GiB/s |                       30.4 GiB/s |
+| MuseAir-BFast     |        64-bit |                   33.3 GiB/s |                       36.4 GiB/s |
+| MuseAir-BFast-128 |       128-bit |                   33.3 GiB/s |                       36.3 GiB/s |
+| wyhash 4.2        |        64-bit |                   31.9 GiB/s |                       28.4 GiB/s |
+| wyhash.condom 4.2 |        64-bit |                   25.3 GiB/s |                       22.8 GiB/s |
+| komihash 5.7      |        64-bit |                   25.5 GiB/s |                              N/A |
+| komihash 5.10     |        64-bit |                          N/A |                       26.8 GiB/s |
+
+<sup>*(These results are obtained by running `./SMHasher3 --test=Speed <HASH>` and `cargo bench`)*</sup>
+
+峰值吞吐量与具体实现有关。但不论如何，对于大块数据，MuseAir 都是便携散列当中最快的，可以达到先前最快（wyhash）的 1.14 倍。
+
+
+### 短键 <sub>/ Small keys</sub>
+
+<img src="results/bench-smallkeys.png" alt="Bench small keys" width="100%" />
+
+<sup>*(These results are obtained by running `./SMHasher3 --test=Speed <HASH>`)*</sup>
+
+对于更加常见的 1-32 bytes 短键，MuseAir 系列算法拥有显著的性能优势。在这个范围内，平均而言，她仍是最快的。
+
+
+## 实现 <sub>/ Implementations</sub>
+
+这个存储库提供 MuseAir 的官方 Rust 实现，你可以在 [crates.io](https://crates.io/crates/museair) 上找到这个 crate。
+
+| Language | Link
+|:-------- |:----
+| **C**    | [eternal-io/museair-c](https://github.com/eternal-io/museair-c)
+| **C++**  | [Twilight-Dream-Of-Magic/museair-cpp](https://github.com/Twilight-Dream-Of-Magic/museair-cpp)
+
+
+## 算法分析 <sub>/ Algorithm analysis</sub>
+
+首先定义 `wide_mul` 和 `fold_mul`：
+
+```rust
+/// 64 x 64 -> 128 multiplication, returns lower 64-bit, then upper 64-bit.
+fn wide_mul(a: u64, b: u64) -> (u64, u64) {
+    let x = a as u128 * b as u128;
+    (x as u64, (x >> 64) as u64)
+}
+
+/// XOR-fold the lower half and the upper half of the multiplication result.
+fn fold_mul(a: u64, b: u64) -> u64 {
+    let (lo, hi) = wide_mul(a, b);
+    lo ^ hi
+}
+```
+
+**对于短键**，之所以对 16-32 字节长度有显著提速，主要是因为解决了数据依赖问题，使得那部分的乘法运算不需要等待先前数据，有效地利用了 CPU 流水线：
+
+```rust
+/* not what they actually read, just to simplify the situation. */
+
+let mut acc_i = read_u64(&bytes[0..8]);
+let mut acc_j = read_u64(&bytes[8..16]);
+
+if bytes.len() > 16 {
+    let (lo0, hi0) = wide_mul(CONSTANT[2], CONSTANT[3] ^ read_u64(&bytes[16..24]));
+    let (lo1, hi1) = wide_mul(CONSTANT[4], CONSTANT[5] ^ read_u64(&bytes[24..32]));
+    acc_i ^= lo0 ^ hi1;
+    acc_j ^= lo1 ^ hi0;
+}
+```
+
+**对于大块数据**，考虑 wyhash 的核心循环：
+
+```rust
+acc0 = fold_mul(acc0 ^ read_u64(&bytes[8 * 0..]), SECRET[0] ^ read_u64(&bytes[8 * 1..]));
+acc1 = fold_mul(acc1 ^ read_u64(&bytes[8 * 2..]), SECRET[1] ^ read_u64(&bytes[8 * 3..]));
+acc2 = fold_mul(acc2 ^ read_u64(&bytes[8 * 4..]), SECRET[2] ^ read_u64(&bytes[8 * 5..]));
+                                /* Left side */                        /* Right side */
+```
+
+实际上有以下问题：
+
+1. 将输入划分为多个条带分别处理，条带之间没有扩散（diffusion）。
+2. 宽乘法后直接折叠，尽管有利于混淆（confusion）和进一步扩散，但也会造成一定的*熵损失*，有可能依此设计出碰撞。
+3. 当*右侧*输入恰好与 `SECRET[n]` 相同时，会导致其中一个乘数为零。由于乘法的性质“零乘以任何数都等于零”，当前条带的累加器将被毁灭性清零，过去的所有状态都将不复存在。这一情形也被称作“致盲乘法”——在这里，设计碰撞实际上非常容易，它的安全性完全来自于 `SECRET[..]` 的保密，一组与种子（seed）无关的常数。因此这类攻击也被称作“种子无关攻击”。这限制了它在通信协议、持久文件格式等方面的应用。
+
+实际上，为了缓解问题 3，wyhash 还提出了 `condom` 模式，使用修改的折叠乘法：
+
+```rust
+fn fold_mul(a: u64, b: u64) -> u64 {
+    let (lo, hi) = wide_mul(a, b);
+    a ^ b ^ lo ^ hi
+}
+```
+
+显然能够避免致盲乘法问题。但当*右侧*持续为零时，*左侧*的输入将完全不会扩散，还将被后来的输入反复覆盖。此外，还有超过 20% 的性能下降。此时已有其它算法比它快且好了，比如 [komihash]。
+
+为了解决上述所有问题，MuseAir 提出了**环形累加器组**结构：
+
+```rust
+/* `wrapping_add` omitted. */
+
+state[0] ^= read_u64(&bytes[8 * 0..]);
+state[1] ^= read_u64(&bytes[8 * 1..]);
+let (lo0, hi0) = wide_mul(state[0], state[1]);
+state[0] += ring_prev ^ hi0;
+
+state[1] ^= read_u64(&bytes[8 * 2..]);
+state[2] ^= read_u64(&bytes[8 * 3..]);
+let (lo1, hi1) = wide_mul(state[1], state[2]);
+state[1] += lo0 ^ hi1;
+
+...
+
+state[5] ^= read_u64(&bytes[8 * 10..]);
+state[0] ^= read_u64(&bytes[8 * 11..]);
+let (lo5, hi5) = wide_mul(state[5], state[0]);
+state[5] += lo4 ^ hi5;
+
+ring_prev = lo5;
+```
+
+这是 `Standard` 变体的累加器组。对于 `BFast` 变体，直接将 `+=` 替换成 `=` 即是。
+
+对于问题 1 和 2：所有累加器的更新皆来自于本次乘法的高 64 位结果和上次乘法的低 64 位结果，拥有良好的扩散性质。
+
+对于问题 3：由于乘数总是动态的，且得益于良好的扩散，MuseAir 不会遭受种子无关攻击。至于致盲乘法，`Standard` 变体没有对累加器的覆写，因此不受此影响。`BFast` 变体有对累加器的覆写，需要进行简单讨论：
+
+- 若在某次读入之后，`state[0] == 0 && state[1] != 0`，则接下来覆写累加器时，不会导致任何数据丢失。同时，由于乘法结果的滞后混入，`state[0]` 几乎不会陷入全零状态。
+- 若在某次读入之后，`state[0] != 0 && state[1] == 0`，则接下来覆写累加器时，会导致读入`state[0]`的那部分数据（8 字节）丢失。至于更先前的数据，则早已被扩散至整个状态中，不受影响。同样，`state[0]` 几乎不会陷入全零状态，但对于`state[1]`：
+  - 如果接下来的读取不幸碰上了全零块，或是前七个字节都是零，最后一个字节是 `0x01`（对于普遍输入而言，只有 $2^{-127}$ 概率能走到这里），那么它在这一轮内都会保持全零。
+  - 当然，它更有可能碰上非全零块。在接下来的乘法完成之后，拥有宝贵混合的高 64 位乘法结果还会立刻让它从低熵状态中恢复。
+
+综上，对于普遍输入，MuseAir-BFast 只有 $2^{-64}$ 概率导致某 8 个字节的输入不影响输出。参考 wyhash，有 $2^{-63}$ 概率导致过去三分之一的输入不影响输出。
+
+至于性能，它的提升主要来自对指令级并行（ILP）的深入理解。基准测试表明 MuseAir-Standard 与 wyhash 的性能差异在 6% 以下。
+
+MuseAir-Standard 将是能够用于通信协议/持久文件格式的最快的便携散列。
+
+<sub>_扩展资料：MuseAir 0.2 算法介绍，[B站专栏](https://www.bilibili.com/read/cv37413023) 或 [知乎文章](https://zhuanlan.zhihu.com/p/715753300)。尽管介绍的是老版本，但其中有一些未在此处提及的设计动机，没有太大变动，仍具有一定参考性。_</sub>
+
+
+## 许可 <sub>/ License</sub>
+
+MuseAir 算法本身以 CC0 许可发布到公共领域。
+
+该存储库下的代码（算法实现）以 MIT 和 Apache 2.0 双许可发布。
+
+
+[komihash]: https://github.com/avaneev/komihash
+[SMHasher3]: https://gitlab.com/fwojcik/smhasher3
diff --git a/benches/hashes.rs b/benches/hashes.rs
index 315fa2e..5d3f294 100644
--- a/benches/hashes.rs
+++ b/benches/hashes.rs
@@ -1,6 +1,7 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 use wyhash_final4::{generics::WyHashVariant, WyHash64, WyHash64Condom};
 
+#[rustfmt::skip]
 fn criterion_benchmark(c: &mut Criterion) {
     const SIZE: u64 = 256 * 1024;
     let msg = vec![0xABu8; SIZE as usize];
@@ -26,7 +27,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             b.iter(|| {
                 black_box(museair::bfast::hash_128(&msg, 42));
             })
-        });
+        })
+        ;
 
     c.benchmark_group("WyHash")
         .throughput(Throughput::Bytes(SIZE))
@@ -39,7 +41,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             b.iter(|| {
                 black_box(WyHash64Condom::with_seed(42).hash(&msg));
             })
-        });
+        })
+        ;
 
     c.benchmark_group("KomiHash")
         .throughput(Throughput::Bytes(SIZE))
@@ -47,7 +50,8 @@ fn criterion_benchmark(c: &mut Criterion) {
             b.iter(|| {
                 black_box(komihash::komihash(&msg, 42));
             })
-        });
+        })
+        ;
 }
 
 criterion_group! {
diff --git a/museair.cpp b/museair.cpp
index 7263f12..7bb04b5 100644
--- a/museair.cpp
+++ b/museair.cpp
@@ -40,7 +40,7 @@
 
 #include "Mathmult.h"
 
-#define ALGORITHM_VERSION "0.3-rc6"
+#define ALGORITHM_VERSION "0.3"
 
 #define u64x(N) N * 8
 
@@ -80,13 +80,18 @@ static FORCE_INLINE void read_short(const uint8_t* bytes, const size_t len, uint
     }
 }
 
+template <bool bfast>
 static FORCE_INLINE void _mumix(uint64_t* state_p, uint64_t* state_q, uint64_t input_p, uint64_t input_q) {
-    uint64_t lo, hi;
-    *state_p ^= input_p;
-    *state_q ^= input_q;
-    MathMult::mult64_128(lo, hi, *state_p, *state_q);
-    *state_p ^= lo;
-    *state_q ^= hi;
+    if (!bfast) {
+        uint64_t lo, hi;
+        *state_p ^= input_p;
+        *state_q ^= input_q;
+        MathMult::mult64_128(lo, hi, *state_p, *state_q);
+        *state_p ^= lo;
+        *state_q ^= hi;
+    } else {
+        MathMult::mult64_128(*state_p, *state_q, *state_p ^ input_p, *state_q ^ input_q);
+    }
 }
 
 //------------------------------------------------------------------------------
@@ -133,11 +138,9 @@ static FORCE_INLINE void hash_short(const uint8_t* bytes,
     uint64_t lo0, lo1, lo2;
     uint64_t hi0, hi1, hi2;
 
-    uint64_t i, j;
-
     MathMult::mult64_128(lo2, hi2, seed ^ CONSTANT[0], len ^ CONSTANT[1]);
 
-    // Seems compilers are smart enough to make `min(len, 16)` branchless.
+    uint64_t i, j;
     read_short<bswap>(bytes, len <= 16 ? len : 16, &i, &j);
     i ^= len ^ lo2;
     j ^= seed ^ hi2;
@@ -158,29 +161,22 @@ static FORCE_INLINE void hash_short(const uint8_t* bytes,
         j = lo1 ^ hi0;
         MathMult::mult64_128(lo0, hi0, i, j);
         MathMult::mult64_128(lo1, hi1, i ^ CONSTANT[4], j ^ CONSTANT[5]);
-
         *out_lo = lo0 ^ hi1;
         *out_hi = lo1 ^ hi0;
     } else {
-        i ^= CONSTANT[2];
-        j ^= CONSTANT[3];
-
-        MathMult::mult64_128(lo0, hi0, i, j);
-
+        MathMult::mult64_128(lo2, hi2, i ^ CONSTANT[2], j ^ CONSTANT[3]);
         if (!bfast) {
-            i ^= lo0 ^ CONSTANT[4];
-            j ^= hi0 ^ CONSTANT[5];
+            i ^= lo2;
+            j ^= hi2;
         } else {
-            i = lo0 ^ CONSTANT[4];
-            j = hi0 ^ CONSTANT[5];
+            i = lo2;
+            j = hi2;
         }
-
-        MathMult::mult64_128(lo0, hi0, i, j);
-
+        MathMult::mult64_128(lo2, hi2, i ^ CONSTANT[4], j ^ CONSTANT[5]);
         if (!bfast) {
-            *out_lo = i ^ j ^ lo0 ^ hi0;
+            *out_lo = i ^ j ^ lo2 ^ hi2;
         } else {
-            *out_lo = lo0 ^ hi0;
+            *out_lo = lo2 ^ hi2;
         }
     }
 }
@@ -274,32 +270,29 @@ static NEVER_INLINE void hash_loong(const uint8_t* bytes,
         state[0] ^= lo5;
     }
 
-    /* 交换下方`state[]`的使用顺序会明显影响性能表现，现在这样似乎是最好的组合。
-       还没有检查过它们生成的汇编，有可能是编译器“太过聪明”以至于产生了一些负优化。就像之前的`state[N] += lo[N-1] ^ hi[N]`那样，加法不能用异或替代。 */
-
     if (unlikely(q >= u64x(6))) {
-        _mumix(&state[0], &state[1], read_u64<bswap>(p + u64x(0)), read_u64<bswap>(p + u64x(1)));
-        _mumix(&state[2], &state[3], read_u64<bswap>(p + u64x(2)), read_u64<bswap>(p + u64x(3)));
-        _mumix(&state[4], &state[5], read_u64<bswap>(p + u64x(4)), read_u64<bswap>(p + u64x(5)));
+        _mumix<bfast>(&state[0], &state[1], read_u64<bswap>(p + u64x(0)), read_u64<bswap>(p + u64x(1)));
+        _mumix<bfast>(&state[2], &state[3], read_u64<bswap>(p + u64x(2)), read_u64<bswap>(p + u64x(3)));
+        _mumix<bfast>(&state[4], &state[5], read_u64<bswap>(p + u64x(4)), read_u64<bswap>(p + u64x(5)));
 
         p += u64x(6);
         q -= u64x(6);
     }
 
     if (likely(q >= u64x(2))) {
-        _mumix(&state[0], &state[3], read_u64<bswap>(p + u64x(0)), read_u64<bswap>(p + u64x(1)));
+        _mumix<bfast>(&state[0], &state[3], read_u64<bswap>(p + u64x(0)), read_u64<bswap>(p + u64x(1)));
         if (likely(q >= u64x(4))) {
-            _mumix(&state[1], &state[4], read_u64<bswap>(p + u64x(2)), read_u64<bswap>(p + u64x(3)));
+            _mumix<bfast>(&state[1], &state[4], read_u64<bswap>(p + u64x(2)), read_u64<bswap>(p + u64x(3)));
         }
     }
 
-    _mumix(&state[2], &state[5], read_u64<bswap>(p + q - u64x(2)), read_u64<bswap>(p + q - u64x(1)));
+    _mumix<bfast>(&state[2], &state[5], read_u64<bswap>(p + q - u64x(2)), read_u64<bswap>(p + q - u64x(1)));
 
     /*-------- epilogue --------*/
 
-    i = state[0] + state[1];
-    j = state[2] + state[3];
-    k = state[4] + state[5];
+    i = state[0] - state[1];
+    j = state[2] - state[3];
+    k = state[4] - state[5];
 
     int rot = len & 63;
     i = ROTL64(i, rot);
@@ -342,8 +335,8 @@ REGISTER_HASH(
                  | FLAG_IMPL_ROTATE_VARIABLE
                  | FLAG_IMPL_LICENSE_PUBLIC_DOMAIN,
     $.bits = 64,
-    $.verification_LE = 0x4F7AF44C,
-    $.verification_BE = 0x7CB9CFCD,
+    $.verification_LE = 0xF89F1683,
+    $.verification_BE = 0xDFEF2570,
     $.hashfn_native   = hash<false, false, false>,
     $.hashfn_bswap    = hash<true, false, false>
 );
@@ -355,8 +348,8 @@ REGISTER_HASH(
                  | FLAG_IMPL_ROTATE_VARIABLE
                  | FLAG_IMPL_LICENSE_PUBLIC_DOMAIN,
     $.bits = 128,
-    $.verification_LE = 0xEFACD140,
-    $.verification_BE = 0xF7DE649D,
+    $.verification_LE = 0xD3DFE238,
+    $.verification_BE = 0x05EC3BE4,
     $.hashfn_native   = hash<false, false, true>,
     $.hashfn_bswap    = hash<true, false, true>
 );
@@ -369,8 +362,8 @@ REGISTER_HASH(
                  | FLAG_IMPL_ROTATE_VARIABLE
                  | FLAG_IMPL_LICENSE_PUBLIC_DOMAIN,
     $.bits = 64,
-    $.verification_LE = 0x4E8C0789,
-    $.verification_BE = 0xAAF61B77,
+    $.verification_LE = 0xC61BEE56,
+    $.verification_BE = 0x16186D00,
     $.hashfn_native   = hash<false, true, false>,
     $.hashfn_bswap    = hash<true, true, false>
 );
@@ -382,8 +375,8 @@ REGISTER_HASH(
                  | FLAG_IMPL_ROTATE_VARIABLE
                  | FLAG_IMPL_LICENSE_PUBLIC_DOMAIN,
     $.bits = 128,
-    $.verification_LE = 0x7CCE23A2,
-    $.verification_BE = 0x102D89CC,
+    $.verification_LE = 0x27939BF1,
+    $.verification_BE = 0xCB4AB283,
     $.hashfn_native   = hash<false, true, true>,
     $.hashfn_bswap    = hash<true, true, true>
 );
diff --git a/src/lib.rs b/src/lib.rs
index 4443267..b10f348 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -36,15 +36,23 @@
  */
 
 #![no_std]
-// #![doc = include_str!("../CRATES.IO-README.md")]
-#![doc = "🚧 ***Under development so no details here.*** 🚧"]
+#![doc = include_str!("../CRATES.IO-README.md")]
 #![doc(html_logo_url = "https://github.com/eternal-io/museair/blob/master/MuseAir-icon-light.png?raw=true")]
 #![warn(missing_docs)]
 
 /// Currently algorithm version.
 ///
-/// Note that this is NOT the implementation version.
-pub const ALGORITHM_VERSION: &str = "0.3-rc6";
+/// Note that this is **NOT** the implementation version.
+///
+/// If you want to see an older version of the algorithm, check out the historical commits
+/// for [`museair.cpp`](https://github.com/eternal-io/museair/blob/master/museair.cpp) in repository root.
+pub const ALGORITHM_VERSION: &str = "0.3";
+
+/// Incremental MuseAir hasher.
+///
+/// Due to its incremental nature, it's much slower than one-shot functions for small keys.
+/// Consider [musemap](https://crates.io/crates/musemap) for hashmap use case.
+pub type Hasher = BaseHasher<false>;
 
 /// One-shot MuseAir hash with 64-bit output.
 #[inline]
@@ -53,21 +61,21 @@ pub fn hash(bytes: &[u8], seed: u64) -> u64 {
 }
 
 /// One-shot MuseAir hash with 128-bit output.
-///
-/// Note that the 128-bit variant is designed to be **as fast as** the 64-bit variant,
-/// so you can use it without worrying about performance if necessary.
 #[inline]
 pub fn hash_128(bytes: &[u8], seed: u64) -> u128 {
     base_hash_128::<false>(bytes, seed)
 }
 
-/// The `-BFast` variant is faster but *less* immune to blinding multiplication.
-///
-/// ("less" here means when it actually happens, it will only result in the most recent state being lost,
-/// rather than all the past state of a stripe being catastrophically lost!)
+/// The `BFast` variant.
 pub mod bfast {
     use super::*;
 
+    /// Incremental MuseAir-BFast hasher.
+    ///
+    /// Due to its incremental nature, it's much slower than one-shot functions for small keys.
+    /// Consider [musemap](https://crates.io/crates/musemap) for hashmap use case.
+    pub type Hasher = BaseHasher<true>;
+
     /// One-shot MuseAir-BFast hash with 64-bit output.
     #[inline]
     pub fn hash(bytes: &[u8], seed: u64) -> u64 {
@@ -75,18 +83,15 @@ pub mod bfast {
     }
 
     /// One-shot MuseAir-BFast hash with 128-bit output.
-    ///
-    /// Note that the 128-bit variant is designed to be **as fast as** the 64-bit variant,
-    /// so you can use it without worrying about performance if necessary.
     #[inline]
     pub fn hash_128(bytes: &[u8], seed: u64) -> u128 {
         base_hash_128::<true>(bytes, seed)
     }
 }
 
-//------------------------------------------------------------------------------
+//--------------------------------------------------------------------------------------------------
 
-/// `AiryAi(0)` calculated by Y-Cruncher.
+/// `AiryAi(0)` mantissa calculated by Y-Cruncher.
 const CONSTANT: [u64; 7] = [
     0x5ae31e589c56e17a,
     0x96d7bb04e64f6da9,
@@ -139,9 +144,9 @@ macro_rules! u64x {
     };
 }
 
-/*--------------------- 🛰 Safe Rust only slows me down. ---------------------*/
+/*------------------------------- 🛰 Safe Rust only slows me down. -------------------------------*/
 
-use core::ops::Sub;
+use core::{fmt, ops::Sub};
 
 #[inline(always)]
 unsafe fn read_u32(bytes: *const u8) -> u64 {
@@ -176,36 +181,33 @@ unsafe fn read_short(bytes: *const u8, len: usize) -> (u64, u64) {
 }
 
 #[inline(always)]
-fn _mumix(mut state_p: u64, mut state_q: u64, input_p: u64, input_q: u64) -> (u64, u64) {
-    state_p ^= input_p;
-    state_q ^= input_q;
-    let (lo, hi) = wmul(state_p, state_q);
-    state_p ^= lo;
-    state_q ^= hi;
-    (state_p, state_q)
+fn _mumix<const BFAST: bool>(mut state_p: u64, mut state_q: u64, input_p: u64, input_q: u64) -> (u64, u64) {
+    if !BFAST {
+        state_p ^= input_p;
+        state_q ^= input_q;
+        let (lo, hi) = wmul(state_p, state_q);
+        (state_p ^ lo, state_q ^ hi)
+    } else {
+        wmul(state_p ^ input_p, state_q ^ input_q)
+    }
 }
 
-//------------------------------------------------------------------------------
+//--------------------------------------------------------------------------------------------------
 
 #[inline(always)]
 fn hash_short_64<const BFAST: bool>(bytes: &[u8], seed: u64) -> u64 {
-    let (mut i, mut j) = unsafe { hash_short_inner(bytes.as_ptr(), bytes.len(), seed) };
-
-    i ^= CONSTANT[2];
-    j ^= CONSTANT[3];
-
-    let (lo, hi) = wmul(i, j);
+    let (mut i, mut j) = unsafe { hash_short_common(bytes.as_ptr(), bytes.len(), seed) };
 
+    let (lo, hi) = wmul(i ^ CONSTANT[2], j ^ CONSTANT[3]);
     if !BFAST {
-        i ^= lo ^ CONSTANT[4];
-        j ^= hi ^ CONSTANT[5];
+        i ^= lo;
+        j ^= hi;
     } else {
-        i = lo ^ CONSTANT[4];
-        j = hi ^ CONSTANT[5];
+        i = lo;
+        j = hi;
     }
 
-    let (lo, hi) = wmul(i, j);
-
+    let (lo, hi) = wmul(i ^ CONSTANT[4], j ^ CONSTANT[5]);
     if !BFAST {
         i ^ j ^ lo ^ hi
     } else {
@@ -215,7 +217,7 @@ fn hash_short_64<const BFAST: bool>(bytes: &[u8], seed: u64) -> u64 {
 
 #[inline(always)]
 fn hash_short_128(bytes: &[u8], seed: u64) -> u128 {
-    let (mut i, mut j) = unsafe { hash_short_inner(bytes.as_ptr(), bytes.len(), seed) };
+    let (mut i, mut j) = unsafe { hash_short_common(bytes.as_ptr(), bytes.len(), seed) };
     let (lo0, hi0) = wmul(i, j);
     let (lo1, hi1) = wmul(i ^ CONSTANT[2], j ^ CONSTANT[3]);
     i = lo0 ^ hi1;
@@ -226,7 +228,7 @@ fn hash_short_128(bytes: &[u8], seed: u64) -> u128 {
 }
 
 #[inline(always)]
-unsafe fn hash_short_inner(bytes: *const u8, len: usize, seed: u64) -> (u64, u64) {
+unsafe fn hash_short_common(bytes: *const u8, len: usize, seed: u64) -> (u64, u64) {
     let len_ = len as u64;
     let (lo, hi) = wmul(seed ^ CONSTANT[0], len_ ^ CONSTANT[1]);
 
@@ -245,28 +247,20 @@ unsafe fn hash_short_inner(bytes: *const u8, len: usize, seed: u64) -> (u64, u64
     (i, j)
 }
 
-//------------------------------------------------------------------------------
+//--------------------------------------------------------------------------------------------------
 
 #[inline(never)]
 fn hash_loong_64<const BFAST: bool>(bytes: &[u8], seed: u64) -> u64 {
-    let (i, j, k) = unsafe { hash_loong_inner::<BFAST>(bytes.as_ptr(), bytes.len(), seed) };
-    let (lo0, hi0) = wmul(i, j);
-    let (lo1, hi1) = wmul(j, k);
-    let (lo2, hi2) = wmul(k, i);
-    (lo0 ^ hi2).wrapping_add(lo1 ^ hi0).wrapping_add(lo2 ^ hi1)
+    unsafe { epilogue_64(hash_loong_common::<BFAST>(bytes.as_ptr(), bytes.len(), seed)) }
 }
 
 #[inline(never)]
 fn hash_loong_128<const BFAST: bool>(bytes: &[u8], seed: u64) -> u128 {
-    let (i, j, k) = unsafe { hash_loong_inner::<BFAST>(bytes.as_ptr(), bytes.len(), seed) };
-    let (lo0, hi0) = wmul(i, j);
-    let (lo1, hi1) = wmul(j, k);
-    let (lo2, hi2) = wmul(k, i);
-    u64s_to_u128(lo0 ^ lo1 ^ hi2, hi0 ^ hi1 ^ lo2)
+    unsafe { epilogue_128(hash_loong_common::<BFAST>(bytes.as_ptr(), bytes.len(), seed)) }
 }
 
 #[inline(always)]
-unsafe fn hash_loong_inner<const BFAST: bool>(bytes: *const u8, len: usize, seed: u64) -> (u64, u64, u64) {
+unsafe fn hash_loong_common<const BFAST: bool>(bytes: *const u8, len: usize, seed: u64) -> (u64, u64, u64) {
     let mut p = bytes;
     let mut q = len;
 
@@ -283,72 +277,7 @@ unsafe fn hash_loong_inner<const BFAST: bool>(bytes: *const u8, len: usize, seed
         let mut ring_prev = CONSTANT[6];
 
         while likely(q >= u64x!(12)) {
-            if !BFAST {
-                state[0] ^= read_u64(p.add(u64x!(0)));
-                state[1] ^= read_u64(p.add(u64x!(1)));
-                let (lo0, hi0) = wmul(state[0], state[1]);
-                state[0] = state[0].wrapping_add(ring_prev ^ hi0);
-
-                state[1] ^= read_u64(p.add(u64x!(2)));
-                state[2] ^= read_u64(p.add(u64x!(3)));
-                let (lo1, hi1) = wmul(state[1], state[2]);
-                state[1] = state[1].wrapping_add(lo0 ^ hi1);
-
-                state[2] ^= read_u64(p.add(u64x!(4)));
-                state[3] ^= read_u64(p.add(u64x!(5)));
-                let (lo2, hi2) = wmul(state[2], state[3]);
-                state[2] = state[2].wrapping_add(lo1 ^ hi2);
-
-                state[3] ^= read_u64(p.add(u64x!(6)));
-                state[4] ^= read_u64(p.add(u64x!(7)));
-                let (lo3, hi3) = wmul(state[3], state[4]);
-                state[3] = state[3].wrapping_add(lo2 ^ hi3);
-
-                state[4] ^= read_u64(p.add(u64x!(8)));
-                state[5] ^= read_u64(p.add(u64x!(9)));
-                let (lo4, hi4) = wmul(state[4], state[5]);
-                state[4] = state[4].wrapping_add(lo3 ^ hi4);
-
-                state[5] ^= read_u64(p.add(u64x!(10)));
-                state[0] ^= read_u64(p.add(u64x!(11)));
-                let (lo5, hi5) = wmul(state[5], state[0]);
-                state[5] = state[5].wrapping_add(lo4 ^ hi5);
-
-                ring_prev = lo5;
-            } else {
-                state[0] ^= read_u64(p.add(u64x!(0)));
-                state[1] ^= read_u64(p.add(u64x!(1)));
-                let (lo0, hi0) = wmul(state[0], state[1]);
-                state[0] = ring_prev ^ hi0;
-
-                state[1] ^= read_u64(p.add(u64x!(2)));
-                state[2] ^= read_u64(p.add(u64x!(3)));
-                let (lo1, hi1) = wmul(state[1], state[2]);
-                state[1] = lo0 ^ hi1;
-
-                state[2] ^= read_u64(p.add(u64x!(4)));
-                state[3] ^= read_u64(p.add(u64x!(5)));
-                let (lo2, hi2) = wmul(state[2], state[3]);
-                state[2] = lo1 ^ hi2;
-
-                state[3] ^= read_u64(p.add(u64x!(6)));
-                state[4] ^= read_u64(p.add(u64x!(7)));
-                let (lo3, hi3) = wmul(state[3], state[4]);
-                state[3] = lo2 ^ hi3;
-
-                state[4] ^= read_u64(p.add(u64x!(8)));
-                state[5] ^= read_u64(p.add(u64x!(9)));
-                let (lo4, hi4) = wmul(state[4], state[5]);
-                state[4] = lo3 ^ hi4;
-
-                state[5] ^= read_u64(p.add(u64x!(10)));
-                state[0] ^= read_u64(p.add(u64x!(11)));
-                let (lo5, hi5) = wmul(state[5], state[0]);
-                state[5] = lo4 ^ hi5;
-
-                ring_prev = lo5;
-            }
-
+            ring_prev = frac_tower::<BFAST>(&mut state, ring_prev, p);
             p = p.add(u64x!(12));
             q = q.sub(u64x!(12));
         }
@@ -356,39 +285,124 @@ unsafe fn hash_loong_inner<const BFAST: bool>(bytes: *const u8, len: usize, seed
         state[0] ^= ring_prev;
     }
 
-    if unlikely(q >= u64x!(6)) {
-        (state[0], state[1]) = _mumix(state[0], state[1], read_u64(p.add(u64x!(0))), read_u64(p.add(u64x!(1))));
-        (state[2], state[3]) = _mumix(state[2], state[3], read_u64(p.add(u64x!(2))), read_u64(p.add(u64x!(3))));
-        (state[4], state[5]) = _mumix(state[4], state[5], read_u64(p.add(u64x!(4))), read_u64(p.add(u64x!(5))));
+    frac_final::<BFAST>(&mut state, p, q);
 
+    epilogue_common(state, len)
+}
+
+/// At least `u64x!(12)` bytes must be readable starting at `p`, otherwise UB.
+#[inline(always)]
+unsafe fn frac_tower<const BFAST: bool>(state: &mut [u64; 6], ring_prev: u64, p: *const u8) -> u64 {
+    if !BFAST {
+        state[0] ^= read_u64(p.add(u64x!(0)));
+        state[1] ^= read_u64(p.add(u64x!(1)));
+        let (lo0, hi0) = wmul(state[0], state[1]);
+        state[0] = state[0].wrapping_add(ring_prev ^ hi0);
+
+        state[1] ^= read_u64(p.add(u64x!(2)));
+        state[2] ^= read_u64(p.add(u64x!(3)));
+        let (lo1, hi1) = wmul(state[1], state[2]);
+        state[1] = state[1].wrapping_add(lo0 ^ hi1);
+
+        state[2] ^= read_u64(p.add(u64x!(4)));
+        state[3] ^= read_u64(p.add(u64x!(5)));
+        let (lo2, hi2) = wmul(state[2], state[3]);
+        state[2] = state[2].wrapping_add(lo1 ^ hi2);
+
+        state[3] ^= read_u64(p.add(u64x!(6)));
+        state[4] ^= read_u64(p.add(u64x!(7)));
+        let (lo3, hi3) = wmul(state[3], state[4]);
+        state[3] = state[3].wrapping_add(lo2 ^ hi3);
+
+        state[4] ^= read_u64(p.add(u64x!(8)));
+        state[5] ^= read_u64(p.add(u64x!(9)));
+        let (lo4, hi4) = wmul(state[4], state[5]);
+        state[4] = state[4].wrapping_add(lo3 ^ hi4);
+
+        state[5] ^= read_u64(p.add(u64x!(10)));
+        state[0] ^= read_u64(p.add(u64x!(11)));
+        let (lo5, hi5) = wmul(state[5], state[0]);
+        state[5] = state[5].wrapping_add(lo4 ^ hi5);
+
+        lo5
+    } else {
+        state[0] ^= read_u64(p.add(u64x!(0)));
+        state[1] ^= read_u64(p.add(u64x!(1)));
+        let (lo0, hi0) = wmul(state[0], state[1]);
+        state[0] = ring_prev ^ hi0;
+
+        state[1] ^= read_u64(p.add(u64x!(2)));
+        state[2] ^= read_u64(p.add(u64x!(3)));
+        let (lo1, hi1) = wmul(state[1], state[2]);
+        state[1] = lo0 ^ hi1;
+
+        state[2] ^= read_u64(p.add(u64x!(4)));
+        state[3] ^= read_u64(p.add(u64x!(5)));
+        let (lo2, hi2) = wmul(state[2], state[3]);
+        state[2] = lo1 ^ hi2;
+
+        state[3] ^= read_u64(p.add(u64x!(6)));
+        state[4] ^= read_u64(p.add(u64x!(7)));
+        let (lo3, hi3) = wmul(state[3], state[4]);
+        state[3] = lo2 ^ hi3;
+
+        state[4] ^= read_u64(p.add(u64x!(8)));
+        state[5] ^= read_u64(p.add(u64x!(9)));
+        let (lo4, hi4) = wmul(state[4], state[5]);
+        state[4] = lo3 ^ hi4;
+
+        state[5] ^= read_u64(p.add(u64x!(10)));
+        state[0] ^= read_u64(p.add(u64x!(11)));
+        let (lo5, hi5) = wmul(state[5], state[0]);
+        state[5] = lo4 ^ hi5;
+
+        lo5
+    }
+}
+
+/// `q` bytes at `p` and the `u64x!(2)` bytes ending at `p.add(q)` must be readable, otherwise UB.
+#[inline(always)]
+unsafe fn frac_final<const BFAST: bool>(state: &mut [u64; 6], mut p: *const u8, mut q: usize) {
+    if unlikely(q >= u64x!(6)) {
+        (state[0], state[1]) =
+            _mumix::<BFAST>(state[0], state[1], read_u64(p.add(u64x!(0))), read_u64(p.add(u64x!(1))));
+        (state[2], state[3]) =
+            _mumix::<BFAST>(state[2], state[3], read_u64(p.add(u64x!(2))), read_u64(p.add(u64x!(3))));
+        (state[4], state[5]) =
+            _mumix::<BFAST>(state[4], state[5], read_u64(p.add(u64x!(4))), read_u64(p.add(u64x!(5))));
         p = p.add(u64x!(6));
         q = q.sub(u64x!(6));
     }
 
     if likely(q >= u64x!(2)) {
-        (state[0], state[3]) = _mumix(state[0], state[3], read_u64(p.add(u64x!(0))), read_u64(p.add(u64x!(1))));
+        (state[0], state[3]) =
+            _mumix::<BFAST>(state[0], state[3], read_u64(p.add(u64x!(0))), read_u64(p.add(u64x!(1))));
         if likely(q >= u64x!(4)) {
-            (state[1], state[4]) = _mumix(state[1], state[4], read_u64(p.add(u64x!(2))), read_u64(p.add(u64x!(3))));
+            (state[1], state[4]) =
+                _mumix::<BFAST>(state[1], state[4], read_u64(p.add(u64x!(2))), read_u64(p.add(u64x!(3))));
         }
     }
 
-    (state[2], state[5]) = _mumix(
+    (state[2], state[5]) = _mumix::<BFAST>(
         state[2],
         state[5],
         read_u64(p.add(q).sub(u64x!(2))),
         read_u64(p.add(q).sub(u64x!(1))),
     );
+}
 
-    /*-------- pre-epilogue --------*/
+// Note that only the `_loong` path requires a separate `epilogue`.
 
-    let mut i = state[0].wrapping_add(state[1]);
-    let mut j = state[2].wrapping_add(state[3]);
-    let mut k = state[4].wrapping_add(state[5]);
+#[inline(always)]
+fn epilogue_common(state: [u64; 6], tot_len: usize) -> (u64, u64, u64) {
+    let mut i = state[0].wrapping_sub(state[1]);
+    let mut j = state[2].wrapping_sub(state[3]);
+    let mut k = state[4].wrapping_sub(state[5]);
 
-    let rot = len as u32 & 63;
+    let rot = tot_len as u32 & 63;
     i = i.rotate_left(rot);
     j = j.rotate_right(rot);
-    k ^= len as u64;
+    k ^= tot_len as u64;
 
     let (lo0, hi0) = wmul(i, j);
     let (lo1, hi1) = wmul(j, k);
@@ -400,7 +414,25 @@ unsafe fn hash_loong_inner<const BFAST: bool>(bytes: *const u8, len: usize, seed
     (i, j, k)
 }
 
-//------------------------------------------------------------------------------
+#[inline(always)]
+fn epilogue_64(triple: (u64, u64, u64)) -> u64 {
+    let (i, j, k) = triple;
+    let (lo0, hi0) = wmul(i, j);
+    let (lo1, hi1) = wmul(j, k);
+    let (lo2, hi2) = wmul(k, i);
+    (lo0 ^ hi2).wrapping_add(lo1 ^ hi0).wrapping_add(lo2 ^ hi1)
+}
+
+#[inline(always)]
+fn epilogue_128(triple: (u64, u64, u64)) -> u128 {
+    let (i, j, k) = triple;
+    let (lo0, hi0) = wmul(i, j);
+    let (lo1, hi1) = wmul(j, k);
+    let (lo2, hi2) = wmul(k, i);
+    u64s_to_u128(lo0 ^ lo1 ^ hi2, hi0 ^ hi1 ^ lo2)
+}
+
+//--------------------------------------------------------------------------------------------------
 
 #[inline(always)]
 fn base_hash_64<const BFAST: bool>(bytes: &[u8], seed: u64) -> u64 {
@@ -420,33 +452,210 @@ fn base_hash_128<const BFAST: bool>(bytes: &[u8], seed: u64) -> u128 {
     }
 }
 
-//------------------------------------------------------------------------------
+/// Common incremental hasher that implements both the `Standard` and `BFast` variants.
+/// Use the [`Hasher`] or [`bfast::Hasher`] type aliases.
+#[repr(C)]
+#[derive(Clone)]
+pub struct BaseHasher<const BFAST: bool> {
+    /// It is guaranteed that `buffered_len < 96` after any write,
+    /// that is to say `frac_tower` is executed eagerly.
+    buffer: [u8; 96],
+    buffered_len: usize,
+    processed_len: u64,
+
+    state: [u64; 6],
+    ring_prev: u64,
+}
+
+#[allow(missing_docs)]
+impl<const BFAST: bool> BaseHasher<BFAST> {
+    pub const fn new() -> Self {
+        Self::with_seed(0)
+    }
+
+    pub const fn with_seed(seed: u64) -> Self {
+        Self {
+            buffer: [0x00; 96],
+            buffered_len: 0,
+            processed_len: 0,
+
+            state: [
+                CONSTANT[0].wrapping_add(seed),
+                CONSTANT[1].wrapping_sub(seed),
+                CONSTANT[2] ^ seed,
+                CONSTANT[3].wrapping_add(seed),
+                CONSTANT[4].wrapping_sub(seed),
+                CONSTANT[5] ^ seed,
+            ],
+            ring_prev: CONSTANT[6],
+        }
+    }
+
+    #[inline(always)]
+    fn restore_seed(&self) -> u64 {
+        debug_assert!(self.processed_len == 0);
+        self.state[0].wrapping_sub(CONSTANT[0])
+    }
+
+    pub fn write(&mut self, mut bytes: &[u8]) {
+        let vacancy = u64x!(12) - self.buffered_len;
+        if bytes.len() < vacancy {
+            self.buffer[self.buffered_len..][..bytes.len()].copy_from_slice(bytes);
+            self.buffered_len += bytes.len();
+            return;
+        } else {
+            self.buffer[self.buffered_len..][..vacancy].copy_from_slice(&bytes[..vacancy]);
+            self.buffered_len = 0;
+            bytes = &bytes[vacancy..];
+        }
+
+        self.ring_prev = unsafe { frac_tower::<BFAST>(&mut self.state, self.ring_prev, self.buffer.as_ptr()) };
+        self.processed_len = self.processed_len.wrapping_add(u64x!(12));
+        while likely(bytes.len() >= u64x!(12)) {
+            self.ring_prev = unsafe { frac_tower::<BFAST>(&mut self.state, self.ring_prev, bytes.as_ptr()) };
+            self.processed_len = self.processed_len.wrapping_add(u64x!(12));
+            bytes = &bytes[u64x!(12)..];
+        }
+
+        self.buffer[..bytes.len()].copy_from_slice(bytes);
+        self.buffered_len = bytes.len();
+    }
+
+    #[inline(always)]
+    fn contiguous_remainder(&self) -> ([u8; 96], usize) {
+        let (younger, older) = self.buffer.split_at(self.buffered_len);
+        let mut buffer = [0x00; 96];
+        buffer[..older.len()].copy_from_slice(older);
+        buffer[older.len()..].copy_from_slice(younger);
+        (buffer, older.len())
+    }
+
+    pub fn finish(&self) -> u64 {
+        let tot_len = self.processed_len.wrapping_add(self.buffered_len as u64);
+        if unlikely(tot_len <= u64x!(4)) {
+            hash_short_64::<BFAST>(&self.buffer[..self.buffered_len], self.restore_seed())
+        } else {
+            let (remainder, delta) = self.contiguous_remainder();
+            let mut state = self.state;
+            if likely(tot_len >= u64x!(12)) {
+                state[0] ^= self.ring_prev;
+            }
+            unsafe { frac_final::<BFAST>(&mut state, remainder.as_ptr().add(delta), self.buffered_len) };
+            epilogue_64(epilogue_common(state, tot_len as usize))
+        }
+    }
+
+    pub fn finish_128(&self) -> u128 {
+        let tot_len = self.processed_len.wrapping_add(self.buffered_len as u64);
+        if unlikely(tot_len <= u64x!(4)) {
+            hash_short_128(&self.buffer[..self.buffered_len], self.restore_seed())
+        } else {
+            let (remainder, delta) = self.contiguous_remainder();
+            let mut state = self.state;
+            if likely(tot_len >= u64x!(12)) {
+                state[0] ^= self.ring_prev;
+            }
+            unsafe { frac_final::<BFAST>(&mut state, remainder.as_ptr().add(delta), self.buffered_len) };
+            epilogue_128(epilogue_common(state, tot_len as usize))
+        }
+    }
+}
+
+impl<const BFAST: bool> Default for BaseHasher<BFAST> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<const BFAST: bool> fmt::Debug for BaseHasher<BFAST> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match !BFAST {
+            true => f.write_str("museair::Hasher { ... }"),
+            false => f.write_str("museair::bfast::Hasher { ... }"),
+        }
+    }
+}
+
+impl<const BFAST: bool> core::hash::Hasher for BaseHasher<BFAST> {
+    fn finish(&self) -> u64 {
+        self.finish()
+    }
+
+    fn write(&mut self, bytes: &[u8]) {
+        self.write(bytes);
+    }
+}
+
+//--------------------------------------------------------------------------------------------------
 
 #[cfg(all(test, target_endian = "little"))]
 mod verify {
     use super::*;
+    extern crate std;
 
     #[test]
     fn verification_code() {
         assert_eq!(
-            0x4F7AF44C,
+            0xF89F1683,
             hashverify::compute(64, |bytes, seed, out| out
                 .copy_from_slice(&hash(bytes, seed).to_le_bytes()))
         );
         assert_eq!(
-            0xEFACD140,
+            0xD3DFE238,
             hashverify::compute(128, |bytes, seed, out| out
                 .copy_from_slice(&hash_128(bytes, seed).to_le_bytes()))
         );
         assert_eq!(
-            0x4E8C0789,
+            0xC61BEE56,
             hashverify::compute(64, |bytes, seed, out| out
                 .copy_from_slice(&bfast::hash(bytes, seed).to_le_bytes()))
         );
         assert_eq!(
-            0x7CCE23A2,
+            0x27939BF1,
             hashverify::compute(128, |bytes, seed, out| out
                 .copy_from_slice(&bfast::hash_128(bytes, seed).to_le_bytes()))
         );
     }
+
+    #[test]
+    fn one_shot_eq_streamed() {
+        macro_rules! one_shot_vs_streamed {
+            ($hash:path, $hasher:ty, $finish:ident) => {
+                for n in 0..1024 {
+                    let bytes = std::vec![0xAB; n];
+                    let one_shot = $hash(&bytes, n as u64);
+                    let streamed = {
+                        let mut hasher = <$hasher>::with_seed(n as u64);
+                        let (x, y, z) = random_split(&bytes);
+                        hasher.write(x);
+                        hasher.write(y);
+                        hasher.write(z);
+                        hasher.$finish()
+                    };
+                    assert_eq!(one_shot, streamed, "len == {}", n);
+                }
+            };
+        }
+
+        one_shot_vs_streamed!(hash, Hasher, finish);
+        one_shot_vs_streamed!(hash_128, Hasher, finish_128);
+        one_shot_vs_streamed!(bfast::hash, bfast::Hasher, finish);
+        one_shot_vs_streamed!(bfast::hash_128, bfast::Hasher, finish_128);
+    }
+
+    fn random_split(bytes: &[u8]) -> (&[u8], &[u8], &[u8]) {
+        match bytes.len() as u64 {
+            0 => (&[], &[], &[]),
+            1 => (&bytes[0..1], &[], &[]),
+            2 => (&bytes[0..1], &bytes[1..2], &[]),
+            3 => (&bytes[0..1], &bytes[1..2], &bytes[2..3]),
+            n => {
+                let p = wyhash::wyrng(&mut n.clone()) % (n - 2);
+                let q = wyhash::wyrng(&mut !n) % (n - p);
+                let (x, y) = bytes.split_at(p as usize);
+                let (y, z) = y.split_at(q as usize);
+                (x, y, z)
+            }
+        }
+    }
 }