From 4cb67aa3c60cd6888cd621cbd42d6f7966d10c1c Mon Sep 17 00:00:00 2001 From: Mark Bennett Date: Wed, 12 Aug 2015 14:16:05 -0700 Subject: [PATCH] Add util to calculate document ID hash and show predicted shard routing --- README.md | 118 +++++++++++- .../lucidworks/dq/util/CmdLineLauncher.java | 1 + .../com/lucidworks/dq/util/HashAndShard.java | 169 ++++++++++++++++++ 3 files changed, 287 insertions(+), 1 deletion(-) create mode 100644 src/main/java/com/lucidworks/dq/util/HashAndShard.java diff --git a/README.md b/README.md index 259d56e..f22458e 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,10 @@ Preliminary DQ / Data Quality experiments and related utilities. * ```dump_ids``` - Dump all the IDs from a collection to standard out / stdout _(com.lucidworks.dq.data.DumpIds)_ * ```delete_by_ids``` - Delete documents by their ID, either passed on the command line, or from a file, or from standard in / stdin _(com.lucidworks.dq.data.DeleteByIds)_ * ```solr_to_solr``` - Copy records from one Solr collection or core to another, can control which fields and records _(com.lucidworks.dq.data.SolrToSolr)_ +* ```solr_to_csv``` - Export records from Solr collection or core to delimited file, such as CSV. _(com.lucidworks.dq.data.SolrToCsv)_ + +## Debugging Utilities: +* ```hash_and_shard``` - Calculate hash and shard for a document ID _(com.lucidworks.dq.util.HashAndShard)_ ## Sample Reports @@ -30,7 +34,7 @@ See ```src/main/resources/sample-reports/``` Fully runnable jar is available here: -* https://github.com/LucidWorks/data-quality/releases/tag/0.4 +* https://github.com/LucidWorks/data-quality/releases/tag/0.5 * Click the **green button** with ```data-quality-java-1.0-SNAPSHOT.jar``` and the download will start # Building From Source @@ -132,6 +136,8 @@ Pass a command name on the command line to see help for that class: dump_ids: Dump all the IDs from a collection to standard out / stdout. delete_by_ids: Delete documents by their ID, either passed on the command line, or from a file, or from standard in / stdin. solr_to_solr: Copy records from one Solr collection or core to another. + solr_to_csv: Export records from Solr collection or core to delimited file, such as CSV. + hash_and_shard: Calculate hash and shard for a document ID ``` Example: Show the syntax for a specific command, for example ```empty_fields```: @@ -361,6 +367,116 @@ Options: ", or similar errors. ``` +# Collection Debugging: Hash Values and Shard Routing + +Have you ever wondered which shard a document will wind up in? +This can be useful when testing if you suspect your shards are not equally fully. +This could happen, for example, if document keys happen to be generated with a hash algorithm similar to that used by Solr (MurmurHash3). + +As you may know, once indexed, you can always find out which shards documents were put in by including ```[shard]``` in the field list: + +`http://localhost:8983/solr/collection1/select?q=*&fl=*,[shard]` + +Or more compact output: + +`http://localhost:8983/solr/collection1/select?q=*&fl=id,[shard]&wt=csv` + +But you can also use a utility that's included in this toolkit, `hash_and_shard`, to view the hash of a document ID, and then which shard it would be routed to. +To figure out the 32-bit hash value, it only needs the document ID. +But to figure out which shard it would be routed to, it also needs to know the total number of shards. And this is only an approximation; if you've split shards this output won't be correct. + +Here's how to find out the hash for "doc1", and how it'll be routed in a 4-shard system: + +```java -jar data-quality.jar hash_and_shard doc1 4``` + +This gives the output: + +``` +docId: "doc1" +32-bit Hash (signed decimal int): -657533388 +32-bit Hash (unsigned dec int): 3637433908 +32-bit Hash (hex): 0xd8ced634 +32-bit Hash (binary): 11011000110011101101011000110100 +Number of Shards: 4 +Shard # 1 + Range: 0x80000000 to 0xbfffffff +Shard # 2 + Range: 0xc0000000 to 0xffffffff + contains 0xd8ced634 +Shard # 3 + Range: 0x00000000 to 0x3fffffff +Shard # 4 + Range: 0x40000000 to 0x7fffffff +``` + +Shard boundaries are **inclusive**. Running with 3 shards instead of 4 gives more interesting output for shard boundaries: + +```java -jar data-quality.jar hash_and_shard doc1 3``` + +``` +docId: "doc1" +32-bit Hash (signed decimal int): -657533388 +32-bit Hash (unsigned dec int): 3637433908 +32-bit Hash (hex): 0xd8ced634 +32-bit Hash (binary): 11011000110011101101011000110100 +Number of Shards: 3 +Shard # 1 + Range: 0x80000000 to 0xd554ffff +Shard # 2 + Range: 0xd5550000 to 0x2aa9ffff + contains 0xd8ced634 +Shard # 3 + Range: 0x2aaa0000 to 0x7fffffff +``` + +The ranges might look a bit confusing: + +* Remember that Java uses only signed integers, so that hex numbers starting with 8 or above are actually negative numbers, so the shards ARE sorted numerically from smallest to largest. +* Further, Solr likes to put shard boundaries at certain powers of 2, vs. random integer division. + +If you only give the utility a document ID, but not the number of shards, it'll just show the 32-bit hash. + +You can include the ```-q``` option along with the number of shards, you'll get a very compact output; +this is useful for scripting! (Unlike other DQ utilities, the -q must come last) + +Here's a script to check the simple doc ID's "1" through "10": + +**calc-shards.sh** +``` +#!/bin/bash + +JAR=data-quality.jar +SHARDS=3 # try values like 2, 3, 4, 7, 23! + +echo +echo Predicting final shard for $SHARDS shards +echo +for (( i = 1; i <= 10; i++ )) +do + java -jar "$JAR" hash_and_shard $i $SHARDS -q +done + +echo +echo Reminder: Those predictions were for $SHARDS shards +``` + +Summary of output: + +``` + id hash shard + 1 0x9416ac93 1 + 2 0x0129e217 2 + 3 0x0fc7a1b4 2 + 4 0xe131cc88 2 + 5 0x531a35e4 3 + 6 0x27fa7cc0 2 + 7 0x23ea8628 2 + 8 0xbd920017 1 + 9 0x248be6a1 2 + 10 0x86e4093f 1 +``` + + # Developers: Bonus Utilities, SolrJ wrappers, etc! All under ```src/main/java/com/lucidworks/dq/util/``` diff --git a/src/main/java/com/lucidworks/dq/util/CmdLineLauncher.java b/src/main/java/com/lucidworks/dq/util/CmdLineLauncher.java index 4a4db9f..85b8f69 100644 --- a/src/main/java/com/lucidworks/dq/util/CmdLineLauncher.java +++ b/src/main/java/com/lucidworks/dq/util/CmdLineLauncher.java @@ -26,6 +26,7 @@ public class CmdLineLauncher { put( "delete_by_ids", com.lucidworks.dq.data.DeleteByIds.class ); put( "solr_to_solr", com.lucidworks.dq.data.SolrToSolr.class ); put( "solr_to_csv", com.lucidworks.dq.data.SolrToCsv.class ); + put( "hash_and_shard", com.lucidworks.dq.util.HashAndShard.class ); }}; public static void main( String[] argv ) { if( argv.length < 1 ) { diff --git a/src/main/java/com/lucidworks/dq/util/HashAndShard.java b/src/main/java/com/lucidworks/dq/util/HashAndShard.java new file mode 100644 index 0000000..1c02598 --- /dev/null +++ b/src/main/java/com/lucidworks/dq/util/HashAndShard.java @@ -0,0 +1,169 @@ +package com.lucidworks.dq.util; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.solr.common.cloud.DocRouter.Range; +import org.apache.solr.common.util.Hash; + +public class HashAndShard { + + // Should correspond to: + // http://localhost:8983/solr/collection1/select?q=*&fl=*,[shard] + + static String HELP_WHAT_IS_IT = "Calculate hash and shard for a document ID"; + static String HELP_USAGE = "HashAndShard docId [numberOfShards [-q]] # shards can be decimal, hex, octal, etc"; + public static String getShortDescription() { + return HELP_WHAT_IS_IT; + } + + + /* From: + * solr-lucene-490-src/solr/solrj/src/java/org/apache/solr/common/cloud/CompositeIdRouter.java + */ + private static int bits = 16; + static List partitionRange( int partitions ) { + int min = Integer.MIN_VALUE; // -2^31 = -2147483648 = -2,147,483,648 + int max = Integer.MAX_VALUE; // 2^31-1 = 2147483647 = 2,147,483,647 + + // assert max >= min; + // if (partitions == 0) return Collections.EMPTY_LIST; + long rangeSize = (long) max - (long) min; + long rangeStep = Math.max(1, rangeSize / partitions); + + List ranges = new ArrayList<>(partitions); + + long start = min; + long end = start; + + // keep track of the idealized target to avoid accumulating rounding errors + long targetStart = min; + long targetEnd = targetStart; + + // Round to avoid splitting hash domains across ranges if such rounding is not significant. + // With default bits==16, one would need to create more than 4000 shards before this + // becomes false by default. + int mask = 0x0000ffff; + boolean round = rangeStep >= (1 << bits) * 16; + + while (end < max) { + targetEnd = targetStart + rangeStep; + end = targetEnd; + + if (round && ((end & mask) != mask)) { + // round up or down? + int increment = 1 << bits; // 0x00010000 + long roundDown = (end | mask) - increment; + long roundUp = (end | mask) + increment; + if (end - roundDown < roundUp - end && roundDown > start) { + end = roundDown; + } else { + end = roundUp; + } + } + + // make last range always end exactly on MAX_VALUE + if (ranges.size() == partitions - 1) { + end = max; + } + ranges.add(new Range((int) start, (int) end)); + start = end + 1L; + targetStart = targetEnd + 1L; + } + + return ranges; + } + + static void printRanges( List ranges, Integer hash ) { + int shardCounter = 0; + for ( Range r : ranges ) { + shardCounter++; + System.out.println( "Shard # " + shardCounter ); + System.out.println( "\tRange: " + + String.format("0x%8s", Integer.toHexString(r.min)).replace(' ', '0') + + " to " + + String.format("0x%8s", Integer.toHexString(r.max)).replace(' ', '0') + ); + if ( null!=hash ) { + if ( hash >= r.min && hash <= r.max ) { + System.out.println( "\tcontains " + + String.format("0x%8s", Integer.toHexString(hash)).replace(' ', '0') + ); + } + } + } + } + static int findShardForHash( List ranges, Integer hash ) { + int shardCounter = 0; + for ( Range r : ranges ) { + shardCounter++; + if ( hash >= r.min && hash <= r.max ) { + return shardCounter; + } + } + return -1; + } + + public static void main(String[] args) { + if ( args.length < 1 || args.length > 3 ) { + System.err.println( "Error: syntax: " + HELP_USAGE ); + System.exit(1); + } + String docId = args[0]; + if ( docId.length() < 1 ) { + System.err.println( "Error: empty docId" ); + System.exit(2); + } + String numShardsStr = args.length >= 2 ? args[1] : null; + String quietStr = args.length >= 3 ? args[2] : null; + boolean quiet = null!=quietStr && quietStr.equalsIgnoreCase("-q"); + + int signedHash = Hash.murmurhash3_x86_32( docId, 0, docId.length(), 0 ); + long unsignedHash = signedHash & 0x00000000ffffffffL; + if ( ! quiet ) { + System.out.println( "docId: \"" + docId + '"' ); + System.out.println( "32-bit Hash (signed decimal int): " + signedHash ); + System.out.println( "32-bit Hash (unsigned dec int): " + unsignedHash ); + System.out.println( "32-bit Hash (hex): " + String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') ); + System.out.println( "32-bit Hash (binary): " + String.format("%32s", Integer.toBinaryString(signedHash)).replace(' ', '0') ); + } + else { + System.out.print( String.format("0x%8s", Integer.toHexString(signedHash)).replace(' ', '0') ); + } + + if ( null != numShardsStr ) { + Integer numShards = null; + try { + numShards = Integer.decode( numShardsStr ); + } + catch( NumberFormatException e ) { + System.err.println( "Error parsing numberOfShards: " + e ); + System.exit(3); + } + if ( numShards <= 0 ) { + System.err.println( "Error: numberOfShards must be > 0; got " + numShards ); + System.exit(4); + } + // WRONG! + // long shardNumber = (unsignedHash % numShards) + 1; + // System.out.println( "Route to Shard (base-ONE): " + shardNumber ); + + List ranges = partitionRange( numShards ); + + if ( ! quiet ) { + System.out.println( "Number of Shards: " + numShards ); + + printRanges( ranges, signedHash ); + } + else { + int targetShard = findShardForHash( ranges, signedHash ); + System.out.print( " " + targetShard ); + } + } + if ( quiet ) { + System.out.println(); + } + + } + +}