From 8b4012a8e8329bf27be9adfcb50ec22987f4c2ba Mon Sep 17 00:00:00 2001
From: "Ivan R. Judson"
Date: Mon, 12 Jan 2015 19:04:09 -0800
Subject: [PATCH] Added files.

---
 ANONYMIZE.java   | 107 +++++++++++++++++++++++++++++++++++++++++++++++
 DEANONYMIZE.java |  64 ++++++++++++++++++++++++++++
 anonymize.pig    |  28 +++++++++++++
 deanonymize.pig  |  28 +++++++++++++
 test.pig         |  19 +++++++++
 testdata         |   4 ++
 6 files changed, 250 insertions(+)
 create mode 100644 ANONYMIZE.java
 create mode 100644 DEANONYMIZE.java
 create mode 100644 anonymize.pig
 create mode 100644 deanonymize.pig
 create mode 100644 test.pig
 create mode 100644 testdata

diff --git a/ANONYMIZE.java b/ANONYMIZE.java
new file mode 100644
index 0000000..8b76a33
--- /dev/null
+++ b/ANONYMIZE.java
@@ -0,0 +1,107 @@
+package jaglion;
+
+import java.io.IOException;
+import java.util.UUID;
+
+import org.apache.commons.codec.digest.DigestUtils;
+
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.HColumnDescriptor;
+import org.apache.hadoop.hbase.MasterNotRunningException;
+import org.apache.hadoop.hbase.ZooKeeperConnectionException;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.util.Bytes;
+
+public class ANONYMIZE extends EvalFunc<String>
+{
+    public Configuration config;
+    public String fmap = "ForwardMap";
+    public String rmap = "ReverseMap";
+
+    public String exec(Tuple input) throws IOException {
+
+        // As long as we have reasonable inputs
+        if (input.size() != 2 || input.get(0) == null || input.get(1) == null)
+            return null;
+
+        // Connect to HBase
+        config = HBaseConfiguration.create();
+        HBaseAdmin hba = new HBaseAdmin(config);
+        HTableDescriptor tableDescriptor = null;
+        HColumnDescriptor columnDescriptor = null;
+
+        // Check and optionally create the forward mapping table
+        if (hba.tableExists(fmap) == false) {
+            tableDescriptor = new HTableDescriptor(fmap);
+            columnDescriptor = new HColumnDescriptor("AnonymousValue");
+            tableDescriptor.addFamily(columnDescriptor);
+            hba.createTable(tableDescriptor);
+        }
+
+        // Check and optionally create the reverse map table
+        if (hba.tableExists(rmap) == false) {
+            tableDescriptor = new HTableDescriptor(rmap);
+            columnDescriptor = new HColumnDescriptor("ActualValue");
+            tableDescriptor.addFamily(columnDescriptor);
+            hba.createTable(tableDescriptor);
+        }
+
+        try {
+            // Read inputs
+            String id = (String)input.get(0);
+            int unique = (Integer)input.get(1);
+            System.out.println("Value: " + id + " Unique: " + unique);
+            Put p = null;
+
+            // Generate values for the input id
+            String uuid = UUID.randomUUID().toString();
+            String hash = DigestUtils.shaHex(id);
+
+            System.out.println("UUID: " + uuid + " Hash: " + hash);
+
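+            // Note: the SHA-1 hash is deterministic, so a given id always maps to
+            // the same anonymous value, while the uuid is a random one-off token;
+            // both are stored below so either form can be reversed later.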
System.out.println("Stored reverse map - hash."); + + // Reverse Map - UUID + p = new Put(Bytes.toBytes(uuid)); + p.add(Bytes.toBytes("ActualValue"), Bytes.toBytes(""), Bytes.toBytes(id)); + backward.put(p); + + System.out.println("Stored reverse map - uuid."); + + // If the request was for a unique value return the uuid + if (unique == 1) { + return uuid; + } else { + // otherwise return the hashed value + return hash; + } + } catch(Exception e) { + System.out.println(input); + throw new IOException("Caught exception processing input row ", e); + } + } +} diff --git a/DEANONYMIZE.java b/DEANONYMIZE.java new file mode 100644 index 0000000..3423d2f --- /dev/null +++ b/DEANONYMIZE.java @@ -0,0 +1,64 @@ +package jaglion; + +import java.io.IOException; +import java.util.UUID; + +import org.apache.commons.codec.digest.DigestUtils; + +import org.apache.pig.EvalFunc; +import org.apache.pig.data.Tuple; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.MasterNotRunningException; +import org.apache.hadoop.hbase.ZooKeeperConnectionException; +import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.util.Bytes; + +public class DEANONYMIZE extends EvalFunc +{ + public Configuration config; + public String rmap = "ReversMap"; + + public String exec(Tuple input) throws IOException { + + // Check inputs to ensure there's only one value coming in and that it's not null + if (input == null || input.size() != 1 || input.get(0) == null) + return null; + + // Connect to the HBase database + config = HBaseConfiguration.create(); + HBaseAdmin hba = new HBaseAdmin(config); + HTableDescriptor tableDescriptor = null; + HColumnDescriptor columnDescriptor = null; + + // Ensure the reverse table exists + if (hba.tableExists(rmap) == false) { + throw new IOException("Value not found to map."); + } + + try { + String code = (String)input.get(0); + String original = null; + + // The backward map table + HTable backward = new HTable(config, rmap); + + // Retrieve the value from the reverse map + Get g = new Get(Bytes.toBytes(code)); + Result r = backward.get(g); + byte[] value = r.getValue(Bytes.toBytes("ActualValue"), Bytes.toBytes("")); + original = Bytes.toString(value); + + return original; + } catch(Exception e) { + System.out.println(input); + throw new IOException("Caught exception processing input row ", e); + } + } +} diff --git a/anonymize.pig b/anonymize.pig new file mode 100644 index 0000000..b269262 --- /dev/null +++ b/anonymize.pig @@ -0,0 +1,28 @@ +-- anonymize.pig + +-- -- Hadoop / HBase / Zookeeper +REGISTER /usr/lib/zookeeper/zookeeper.jar; +REGISTER /usr/lib/hadoop/lib/commons-codec-1.4.jar; +REGISTER /usr/lib/hbase/hbase-client.jar; +REGISTER /usr/lib/hbase/hbase-common.jar; +REGISTER /usr/lib/hbase/hbase-protocol.jar; +REGISTER /usr/lib/hbase/hbase-hadoop-compat.jar; +REGISTER /usr/lib/hbase/lib/htrace-core.jar; + +-- -- Our UDF +REGISTER bin/jaglion.jar; + +-- Load the data with variable length records +-- pass $input to script as -param input='filename' +-- load each line as a character array +A = LOAD '$input' USING TextLoader AS (line:chararray); + +-- Split each line into first, rest +SPLT = FOREACH A GENERATE STRSPLIT($0, ',', 2) AS A1; + +-- For each row 
+A = LOAD '$input' USING TextLoader AS (line:chararray);
+
+-- Split each line into first, rest
+SPLT = FOREACH A GENERATE STRSPLIT($0, ',', 2) AS A1;
+
+-- For each row in the data, anonymize the first column (0 = repeatable hash), use the rest as is
+B = FOREACH SPLT GENERATE jaglion.ANONYMIZE(TRIM(A1.$0), 0), A1.$1;
+
+-- Store the anonymized results out to a file
+-- pass $output to script as -param output='filename'
+STORE B INTO '$output' USING PigStorage(',');
diff --git a/deanonymize.pig b/deanonymize.pig
new file mode 100644
index 0000000..8b5624b
--- /dev/null
+++ b/deanonymize.pig
@@ -0,0 +1,28 @@
+-- deanonymize.pig
+
+-- -- Hadoop / HBase / Zookeeper
+REGISTER /usr/lib/zookeeper/zookeeper.jar;
+REGISTER /usr/lib/hadoop/lib/commons-codec-1.4.jar;
+REGISTER /usr/lib/hbase/hbase-client.jar;
+REGISTER /usr/lib/hbase/hbase-common.jar;
+REGISTER /usr/lib/hbase/hbase-protocol.jar;
+REGISTER /usr/lib/hbase/hbase-hadoop-compat.jar;
+REGISTER /usr/lib/hbase/lib/htrace-core.jar;
+
+-- -- Our UDF
+REGISTER bin/jaglion.jar;
+
+-- Load the anonymized data, with variable length records
+-- pass $input to script as -param input='filename'
+-- load each line as a character array
+A = LOAD '$input' USING TextLoader AS (line:chararray);
+
+-- Split each line into first, rest
+SPLT = FOREACH A GENERATE STRSPLIT($0, ',', 2) AS A1;
+
+-- For each row in the data, deanonymize the first column, use the rest as is
+B = FOREACH SPLT GENERATE jaglion.DEANONYMIZE(TRIM(A1.$0)), A1.$1;
+
+-- Store the deanonymized results out to a file
+-- pass $output to script as -param output='filename'
+STORE B INTO '$output' USING PigStorage(',');
diff --git a/test.pig b/test.pig
new file mode 100644
index 0000000..d2988a9
--- /dev/null
+++ b/test.pig
@@ -0,0 +1,19 @@
+-- test.pig
+REGISTER /usr/lib/zookeeper/zookeeper.jar;
+REGISTER /usr/lib/hbase/hbase-client.jar;
+REGISTER /usr/lib/hbase/hbase-common.jar;
+REGISTER /usr/lib/hbase/hbase-protocol.jar;
+REGISTER /usr/lib/hbase/hbase-hadoop-compat.jar;
+REGISTER /usr/lib/hbase/lib/htrace-core.jar;
+
+REGISTER bin/jaglion.jar;
+A = LOAD 'testdata' AS (id:chararray);
+DUMP A;
+B = FOREACH A GENERATE jaglion.ANONYMIZE($0, 0);
+DUMP B;
+-- C = FOREACH A GENERATE jaglion.ANONYMIZE($0, 1);
+-- DUMP C;
+-- D = FOREACH B GENERATE jaglion.DEANONYMIZE($0);
+-- DUMP D;
+-- E = FOREACH C GENERATE jaglion.DEANONYMIZE($0);
+-- DUMP E;
diff --git a/testdata b/testdata
new file mode 100644
index 0000000..cfbe55a
--- /dev/null
+++ b/testdata
@@ -0,0 +1,4 @@
+Test-String
+Test2
+IRJ
+aaabbb
\ No newline at end of file