Skip to content

Commit

Permalink
Add and test hashdataset
Browse files Browse the repository at this point in the history
  • Loading branch information
brenhinkeller committed Mar 19, 2024
1 parent 673eb96 commit 281d252
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "StatGeochem"
uuid = "df4de05a-b714-11e8-3c2a-c30fb13e804c"
authors = ["C. Brenhin Keller <cbkeller@dartmouth.edu>"]
version = "0.7.3"
version = "0.7.4"

[deps]
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
Expand Down
52 changes: 52 additions & 0 deletions src/utilities/Import.jl
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,58 @@
end
export concatenatedatasets

## --- Hashing of imported datasets

"""
```julia
rescale(x::Number, digits::Integer=1)
```
Scale `x` by a power of ten such that its first `digits` significant digits
lie to the left of the decimal point, then truncate the fractional part.

Zero and non-finite values (`±Inf`, `NaN`) are truncated without any scaling.
"""
function rescale(x::Number, digits::Integer=1)
    # Zero, Inf and NaN have no usable order of magnitude: leave them unscaled
    (isfinite(x) && !iszero(x)) || return trunc(x * 1.0)

    # Power of ten that moves the first `digits` significant digits left of the point
    n = -(floor(Int, log10(abs(x))) - digits + 1)

    # `10.0^n` overflows to Inf once n > 308 (subnormal / extremely small `x`),
    # which would map every such value to ±Inf. Apply the scaling in two finite
    # steps instead; results for n <= 308 are untouched.
    n > 308 && return trunc(x * 10.0^308 * 10.0^(n - 308))

    return trunc(x * 10.0^n)
end

# Reduce a single dataset entry to an unsigned integer before rows are hashed.

# Fallback for anything that is not a number: hash the value directly
# (`digits` is accepted for a uniform call signature but not used).
function prehash(x, digits::Integer)
    return hash(x)
end

# Numbers other than `Float64` are converted to `Float64` and re-dispatched.
function prehash(x::Number, digits::Integer)
    asfloat = Float64(x)
    return prehash(asfloat, digits)
end

# `Float64` values are passed through `rescale(x, digits)` and the bit pattern
# of the result is returned as a `UInt64`.
function prehash(x::Float64, digits::Integer)
    scaled = rescale(x, digits)
    return reinterpret(UInt64, scaled)
end


"""
```julia
hashdataset(ds::Union{Dict, NamedTuple}; digits::Number=3, elements=keys(ds))
```
Calculate a hash value for each row of a dataset.
By default, this considers only the first 3 `digits` of each number, regardless of scale.
### Examples
```julia
julia> ds = (La=rand(5), Yb=rand(5)/10)
NamedTuple with 2 elements:
La = Vector{Float64}(5,) [0.580683620945775 ... 0.23810020661332487]
Yb = Vector{Float64}(5,) [0.014069255862588826 ... 0.067367584177675]
julia> hashdataset(ds)
5-element Vector{UInt64}:
0x89a02fa88348e07c
0x181e78f0ad2af144
0xa3811bd05cca4743
0xfcfe1b6edf0c81cf
0x647868efa9352972
```
"""
function hashdataset(ds::Union{Dict, NamedTuple}; digits::Number=3, elements=keys(ds))
    # Materialize `elements` so that it can be indexed by position: the default
    # `keys(ds)` of a `Dict` is a `KeySet`, which does not support `elements[j]`.
    # For a `Dict` the default column order is whatever `keys(ds)` yields; pass
    # `elements` explicitly when a fixed column order is required.
    elems = collect(elements)
    isempty(elems) && throw(ArgumentError("`elements` must contain at least one key"))

    # `prehash` dispatches on `digits::Integer`, while this keyword accepts any
    # `Number`; normalize so that integer-valued inputs such as `3.0` work too.
    ndig = Integer(digits)

    # Look up every column once rather than once per row
    columns = [ds[e] for e in elems]

    # All columns must share the same indices so that "row i" is well defined
    I = eachindex(first(columns))
    for (e, col) in zip(elems, columns)
        @assert eachindex(col) == I "indices of column $(repr(e)) do not match those of $(repr(first(elems)))"
    end

    # One hash per row, computed from the tuple of per-entry prehashes
    hashes = similar(first(columns), UInt64)
    for i in eachindex(hashes)
        dt = ntuple(j -> prehash(columns[j][i], ndig), length(columns))
        hashes[i] = hash(dt)
    end
    return hashes
end
export hashdataset

## --- Renormalization of imported datasets

Expand Down
2 changes: 2 additions & 0 deletions test/testImport.jl
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@
@test isa(d, NamedTuple)
@test isequal(d.La, [a.La; b.La; a.La; b.La; a.La])

# Pin the per-row hashes of `d`; equal entries below mean those rows hash identically.
# NOTE(review): the expected values come from `Base.hash` (via `hashdataset`), so they
# presumably depend on the Julia version / word size -- confirm when upgrading Julia.
@test hashdataset(d) == [0x69f0025597bf6523, 0xe8341bcc0a64d447, 0x69f0025597bf6523, 0x6eb8871cf9477895, 0x4f3831d3feae830b, 0x69f0025597bf6523, 0xe8341bcc0a64d447, 0x69f0025597bf6523, 0x6eb8871cf9477895, 0x4f3831d3feae830b, 0x69f0025597bf6523, 0xe8341bcc0a64d447]

## --- Clean up

rm("dictdataset.csv")
Expand Down

2 comments on commit 281d252

@brenhinkeller
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register
Release notes:

  • Add hashdataset function

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/103139

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.7.4 -m "<description of version>" 281d2523aa47add7c3426d4939c0e705d3468ff9
git push origin v0.7.4

Please sign in to comment.