Skip to content

Commit

Permalink
Merge pull request #156 from dynatrace-oss/simhash
Browse files Browse the repository at this point in the history
Simhash
  • Loading branch information
oertl authored Sep 20, 2023
2 parents 274d2fb + ca7a110 commit 8f8064f
Show file tree
Hide file tree
Showing 13 changed files with 400 additions and 50 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ To add a dependency on hash4j using Maven, use the following:
<dependency>
<groupId>com.dynatrace.hash4j</groupId>
<artifactId>hash4j</artifactId>
<version>0.12.0</version>
<version>0.13.0</version>
</dependency>
```
To add a dependency using Gradle:
```gradle
implementation 'com.dynatrace.hash4j:hash4j:0.12.0'
implementation 'com.dynatrace.hash4j:hash4j:0.13.0'
```

## Hash algorithms
Expand Down Expand Up @@ -74,7 +74,8 @@ More examples can be found in [HashingDemo.java](src/test/java/com/dynatrace/has
Similarity hashing algorithms compute hash signatures of sets that allow the similarity of the sets to be estimated without access to the original sets. The following algorithms are currently available:
* [MinHash](https://en.wikipedia.org/wiki/MinHash)
* [SuperMinHash](https://arxiv.org/abs/1706.05698)
* FastSimHash: A fast implementation of [SimHash](https://en.wikipedia.org/wiki/SimHash)
* [SimHash](https://en.wikipedia.org/wiki/SimHash)
* FastSimHash: A fast implementation of SimHash using a bit hack

### Usage

Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ java {
}

group = 'com.dynatrace.hash4j'
version = '0.12.0'
version = '0.13.0'

spotless {
ratchetFrom 'origin/main'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright 2022-2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.similarity;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.infra.Blackhole;

/**
 * JMH benchmarks for the hasher created by {@link SimHashPolicy_v1}, measured in average-time mode
 * for signatures of 64, 256, 1024 and 4096 components.
 *
 * <p>The measured workload itself is the inherited {@code testSimilarityHashing} method; this class
 * only supplies the per-size benchmark states and entry points.
 */
public class SimHash_v1PerformanceTest extends SimilarityHashingPerformanceTest {

  /**
   * Creates the hasher under test for the given signature size.
   *
   * @param numberOfComponents the number of components passed to {@link SimHashPolicy_v1}
   * @return a new hasher obtained from a freshly created policy
   */
  private static SimilarityHasher newHasher(int numberOfComponents) {
    SimHashPolicy_v1 policy =
        new SimHashPolicy_v1(numberOfComponents, getPseudoRandomGeneratorProvider());
    return policy.createHasher();
  }

  /** Benchmark state holding a hasher with 64 components. */
  public static class State_64 extends StateBase {
    public State_64() {
      super(newHasher(64));
    }
  }

  /** Benchmark state holding a hasher with 256 components. */
  public static class State_256 extends StateBase {
    public State_256() {
      super(newHasher(256));
    }
  }

  /** Benchmark state holding a hasher with 1024 components. */
  public static class State_1024 extends StateBase {
    public State_1024() {
      super(newHasher(1024));
    }
  }

  /** Benchmark state holding a hasher with 4096 components. */
  public static class State_4096 extends StateBase {
    public State_4096() {
      super(newHasher(4096));
    }
  }

  /** Average time of one similarity hash computation with 64 components. */
  @Benchmark
  @BenchmarkMode(Mode.AverageTime)
  public void testSimilarityHashing_64_1(State_64 state, Blackhole blackhole) {
    testSimilarityHashing(state, blackhole);
  }

  /** Average time of one similarity hash computation with 256 components. */
  @Benchmark
  @BenchmarkMode(Mode.AverageTime)
  public void testSimilarityHashing_256_1(State_256 state, Blackhole blackhole) {
    testSimilarityHashing(state, blackhole);
  }

  /** Average time of one similarity hash computation with 1024 components. */
  @Benchmark
  @BenchmarkMode(Mode.AverageTime)
  public void testSimilarityHashing_1024_1(State_1024 state, Blackhole blackhole) {
    testSimilarityHashing(state, blackhole);
  }

  /** Average time of one similarity hash computation with 4096 components. */
  @Benchmark
  @BenchmarkMode(Mode.AverageTime)
  public void testSimilarityHashing_4096_1(State_4096 state, Blackhole blackhole) {
    testSimilarityHashing(state, blackhole);
  }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2022 Dynatrace LLC
* Copyright 2022-2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -27,14 +27,14 @@ public static class StateBase implements ElementHashProvider {
public final SplittableRandom random = new SplittableRandom();
public final SimilarityHasher similarityHasher;

@Param({"1", "10", "100", "1000", "10000"})
@Param({"1", "10", "100", "1000", "10000", "100000"})
public int numElements;

public StateBase(SimilarityHasher similarityHasher) {
this.similarityHasher = similarityHasher;
}

public final long[] elementHashes = new long[10000]; // maximum number of elements
public final long[] elementHashes = new long[100000]; // maximum number of elements

@Override
public long getElementHash(int elementIndex) {
Expand All @@ -52,9 +52,7 @@ protected void testSimilarityHashing(StateBase state, Blackhole blackhole) {
state.elementHashes[i] = state.random.nextLong();
}
byte[] signature = state.similarityHasher.compute(state);
for (byte b : signature) {
blackhole.consume(b);
}
blackhole.consume(signature);
}

protected static PseudoRandomGeneratorProvider getPseudoRandomGeneratorProvider() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,16 @@ public byte[] compute(ElementHashProvider elementHashProvider) {

for (int h = 0; h < numTmpCountChunks; ++h) {
long randomValue = pseudoRandomGenerator.nextLong();
int off = h << (6 - BULK_CONSTANT);
for (int j = 0; j < (1 << (6 - BULK_CONSTANT)); ++j) {
tmpCounts[(h << (6 - BULK_CONSTANT)) + j] += (randomValue >>> j) & BULK_MASK;
tmpCounts[off + j] += (randomValue >>> j) & BULK_MASK;
}
}
if (numTmpCountRemaining > 0) {
long randomValue = pseudoRandomGenerator.nextLong();
int off = numTmpCountChunks << (6 - BULK_CONSTANT);
for (int j = 0; j < numTmpCountRemaining; ++j) {
tmpCounts[(numTmpCountChunks << (6 - BULK_CONSTANT)) + j] +=
(randomValue >>> j) & BULK_MASK;
tmpCounts[off + j] += (randomValue >>> j) & BULK_MASK;
}
}
c += 1;
Expand All @@ -99,16 +100,18 @@ public byte[] compute(ElementHashProvider elementHashProvider) {
for (int h = 0; h < (counts.length >>> BULK_CONSTANT); ++h) {
long tmp = tmpCounts[h];
tmpCounts[h] = 0;
int off = h << BULK_CONSTANT;
for (int g = 0; g < (1 << BULK_CONSTANT); ++g) {
counts[g + (h << BULK_CONSTANT)] +=
counts[off + g] +=
(int) ((tmp >>> (g << (6 - BULK_CONSTANT))) & TEMPORARY_COUNTER_LIMIT);
}
}
for (int h = (counts.length >>> BULK_CONSTANT); h < tmpCounts.length; ++h) {
long tmp = tmpCounts[h];
tmpCounts[h] = 0;
int off = h << BULK_CONSTANT;
for (int g = 0; g < counts.length - (h << BULK_CONSTANT); ++g) {
counts[g + (h << BULK_CONSTANT)] +=
counts[off + g] +=
(int) ((tmp >>> (g << (6 - BULK_CONSTANT))) & TEMPORARY_COUNTER_LIMIT);
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
/*
* Copyright 2022-2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.similarity;

import static com.dynatrace.hash4j.util.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;

import com.dynatrace.hash4j.random.PseudoRandomGenerator;
import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
import java.util.Arrays;

/**
 * Version 1 of the SimHash similarity hash policy.
 *
 * <p>For every element, the pseudo-random generator is seeded with the element hash and one random
 * bit is drawn per component. A component of the resulting signature is set if its bit was set for
 * more than half of the elements; see {@link Hasher#compute(ElementHashProvider)} for the exact
 * tie-breaking rule.
 */
final class SimHashPolicy_v1 extends AbstractSimilarityHashPolicy {

  /**
   * Creates the policy.
   *
   * @param numberOfComponents the number of components of the similarity hash
   * @param pseudoRandomGeneratorProvider provider of the generator used to derive the random bits
   */
  public SimHashPolicy_v1(
      int numberOfComponents, PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
    // NOTE(review): the literal 1 is presumably the number of bits per component -- confirm
    // against AbstractSimilarityHashPolicy.
    super(numberOfComponents, 1, pseudoRandomGeneratorProvider);
  }

  @Override
  public SimilarityHasher createHasher() {
    return new Hasher();
  }

  /**
   * Hasher that reuses a per-instance counter array and a per-instance pseudo-random generator
   * across calls to {@link #compute(ElementHashProvider)}.
   */
  private class Hasher implements SimilarityHasher {

    // counts[i] = number of elements whose random bit for component i was 1
    private final int[] counts = new int[numberOfComponents];

    private final PseudoRandomGenerator pseudoRandomGenerator =
        pseudoRandomGeneratorProvider.create();

    /**
     * Computes the SimHash signature of the given elements.
     *
     * @param elementHashProvider supplies the number of elements and their 64-bit hashes
     * @return the packed signature created by the policy's packed array handler
     */
    public byte[] compute(ElementHashProvider elementHashProvider) {

      requireNonNull(elementHashProvider);
      final int numberOfElements = elementHashProvider.getNumberOfElements();
      checkArgument(numberOfElements > 0, "Number of elements must be positive!");

      Arrays.fill(counts, 0);

      // every random long yields 64 component bits; the last one may be used only partially
      final int fullWords = numberOfComponents >>> 6;
      final int tailBits = numberOfComponents & 0x3F;
      final int tailOffset = fullWords << 6;

      for (int elementIdx = 0; elementIdx < numberOfElements; ++elementIdx) {

        pseudoRandomGenerator.reset(elementHashProvider.getElementHash(elementIdx));

        for (int word = 0; word < fullWords; ++word) {
          accumulateBits(pseudoRandomGenerator.nextLong(), word << 6, 64);
        }

        if (tailBits > 0) {
          accumulateBits(pseudoRandomGenerator.nextLong(), tailOffset, tailBits);
        }
      }

      final long threshold = numberOfElements >>> 1;
      // 1 if the number of elements is even, 0 otherwise; for an even number of elements the
      // parity of the component index is added to the count, so that an exact tie yields 1 for
      // odd and 0 for even component indices
      final int tieBreakMask = ~numberOfElements & 1;
      return packedArrayHandler.create(
          i -> (counts[i] + (i & tieBreakMask) > threshold) ? 1L : 0L, numberOfComponents);
    }

    /**
     * Adds the lowest {@code bitCount} bits of {@code randomValue} to the counters starting at
     * index {@code offset}, bit {@code b} going to counter {@code offset + b}.
     */
    private void accumulateBits(long randomValue, int offset, int bitCount) {
      for (int bit = 0; bit < bitCount; ++bit) {
        counts[offset + bit] += (int) (randomValue >>> bit) & 1;
      }
    }
  }
}
45 changes: 45 additions & 0 deletions src/main/java/com/dynatrace/hash4j/similarity/SimHashVersion.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright 2022-2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.similarity;

import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;

/** Versions of SimHash implementations. */
public enum SimHashVersion {
  /**
   * Default version.
   *
   * <p>Not stable! Use a concrete version if compatibility is important, for example if hash
   * signatures are persisted.
   */
  DEFAULT {
    @Override
    SimilarityHashPolicy create(int numberOfComponents) {
      return newVersion1Policy(numberOfComponents);
    }
  },
  /** Version 1. */
  V1 {
    @Override
    SimilarityHashPolicy create(int numberOfComponents) {
      return newVersion1Policy(numberOfComponents);
    }
  };

  /**
   * Creates the policy implementing this version.
   *
   * @param numberOfComponents the number of components of the similarity hash
   * @return a new policy instance
   */
  abstract SimilarityHashPolicy create(int numberOfComponents);

  /** Creates a {@link SimHashPolicy_v1} backed by the SplitMix64 (V1) pseudo-random generator. */
  private static SimilarityHashPolicy newVersion1Policy(int numberOfComponents) {
    return new SimHashPolicy_v1(numberOfComponents, PseudoRandomGeneratorProvider.splitMix64_V1());
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -125,4 +125,39 @@ static SimilarityHashPolicy fastSimHash(
int numberOfComponents, FastSimHashVersion fastSimHashVersion) {
return fastSimHashVersion.create(numberOfComponents);
}

/**
* Returns a {@link SimilarityHashPolicy} for SimHash as introduced in <a
* href="https://dl.acm.org/doi/abs/10.1145/509907.509965">Moses S. Charikar, Similarity
* estimation techniques from rounding algorithms, 2002.</a>
*
* <p>This overload uses {@link SimHashVersion#DEFAULT}, which is documented as not stable. Use
* {@link #simHash(int, SimHashVersion)} with a concrete version if signatures are persisted.
*
* <p>As SimHash is significantly slower than FastSimHash, prefer using {@link #fastSimHash(int)}
* instead!
*
* @param numberOfComponents the number of components of the similarity hash
* @return a policy
*/
static SimilarityHashPolicy simHash(int numberOfComponents) {
return simHash(numberOfComponents, SimHashVersion.DEFAULT);
}

/**
* Returns a {@link SimilarityHashPolicy} for SimHash as introduced in <a
* href="https://dl.acm.org/doi/abs/10.1145/509907.509965">Moses S. Charikar, Similarity
* estimation techniques from rounding algorithms, 2002.</a>
*
* <p>As SimHash is significantly slower than FastSimHash, prefer using {@link #fastSimHash(int,
* FastSimHashVersion)} instead!
*
* <p>Specifying the version of the implementation ensures compatibility with later hash4j
* versions that may change the default implementation. This is especially important if the
* signatures are persisted.
*
* @param numberOfComponents the number of components of the similarity hash
* @param simHashVersion the version of the implementation; the policy is created by this
*     version
* @return a policy
*/
static SimilarityHashPolicy simHash(int numberOfComponents, SimHashVersion simHashVersion) {
return simHashVersion.create(numberOfComponents);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright 2022-2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.similarity;

/**
 * Base class for SimHash policy tests that supplies the expected per-component match probability
 * as a function of the cosine similarity of two sets.
 */
abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest {

  /**
   * Maps a cosine similarity to the probability that a single signature component matches.
   *
   * @param cosineSimilarity the cosine similarity of the two sets
   * @return {@code acos(-cosineSimilarity) / pi}, clamped to the range [0.5, 1]
   */
  protected static double calculateComponentCollisionProbability(double cosineSimilarity) {
    double angleFraction = Math.acos(-cosineSimilarity) / Math.PI;
    double atLeastOneHalf = Math.max(0.5, angleFraction);
    return Math.min(1., atLeastOneHalf);
  }

  /**
   * Computes the expected component match probability of two sets from the sizes of their
   * intersection and of the two set differences.
   *
   * <p>The cosine similarity is the intersection size divided by the square root of the product of
   * the two set sizes, where each set size is the intersection size plus the size of the
   * corresponding difference.
   */
  @Override
  protected double calculateExpectedMatchProbability(
      long intersectionSize, long difference1Size, long difference2Size) {

    long firstSetSize = intersectionSize + difference1Size;
    long secondSetSize = intersectionSize + difference2Size;
    // convert to double before multiplying so the product of two long sizes cannot overflow
    double normalization = Math.sqrt(firstSetSize * (double) secondSetSize);

    return calculateComponentCollisionProbability(intersectionSize / normalization);
  }

  // NOTE(review): presumably the upper bound on the set size used by the inherited checksum
  // test -- confirm against AbstractSimilarityHasherPolicyTest.
  @Override
  protected int getMaxSizeForCheckSumTest() {
    return 300;
  }
}
Loading

0 comments on commit 8f8064f

Please sign in to comment.