From b3b5bde5d10f33d16545704a116f1ec098f883a9 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 22 Aug 2018 18:16:37 +0200 Subject: [PATCH 001/114] Subset logic implemented --- config/cfg.properties | 3 +- pom.xml | 22 +++++ .../subset/DataSubsetGenerator.java | 98 +++++++++++++++++++ .../subset/DataSubsetProvider.java | 53 ++++++++++ .../tool/ModelToCSVWriter.java | 51 ++++++++++ .../word2vec/Word2VecModel.java | 35 +++++++ 6 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java create mode 100644 src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java create mode 100644 src/main/java/org/aksw/word2vecrestful/tool/ModelToCSVWriter.java diff --git a/config/cfg.properties b/config/cfg.properties index bd776ef..9d7bbe6 100644 --- a/config/cfg.properties +++ b/config/cfg.properties @@ -2,4 +2,5 @@ org.aksw.word2vecrestful.web.Word2VecController.apikey: 1234 org.aksw.word2vecrestful.web.Word2VecController.maxN: 10000 org.aksw.word2vecrestful.word2vec.Word2VecModelLoader.bin: true org.aksw.word2vecrestful.word2vec.Word2VecFactory.model: data/GoogleNews-vectors-negative300.bin -org.aksw.word2vecrestful.Application.inmemory: true \ No newline at end of file +org.aksw.word2vecrestful.Application.inmemory: true +org.aksw.word2vecrestful.Application.subsetfiledir: data/subset-files-1/ \ No newline at end of file diff --git a/pom.xml b/pom.xml index 6ce6cc3..c8d85f2 100644 --- a/pom.xml +++ b/pom.xml @@ -44,6 +44,27 @@ 4.11 test + + + javax.xml.bind + jaxb-api + 2.1 + + + + + com.fasterxml.jackson.core + jackson-core + 2.9.6 + + + + com.opencsv + opencsv + 4.1 + + + src/test/java @@ -70,6 +91,7 @@ methods 10 + -Xmx6056m diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java new file mode 100644 index 0000000..796e693 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -0,0 +1,98 @@ +package org.aksw.word2vecrestful.subset; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; + +import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; + +public class DataSubsetGenerator { + + public static final JsonNodeFactory JSON_FACTORY = JsonNodeFactory.instance; + public static final String DATA_LABEL = "data"; + public static final String KEY_LABEL = "key"; + public static final String CENTROID_LABEL = "centroid"; + public static final String SD_LABEL = "sd"; + public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); + public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); + public static final ObjectWriter OBJ_WRITER = OBJ_MAPPER.writer(new DefaultPrettyPrinter()); + public static void generateSubsetFiles(File subsetConfig, String outputFileDir, Map word2vec, int vectorSize) + throws JsonProcessingException, FileNotFoundException, IOException { + // Read file into a json + ObjectNode inpObj = (ObjectNode) OBJ_READER.readTree(new FileInputStream(subsetConfig)); + ArrayNode inpDt = (ArrayNode) inpObj.get(DATA_LABEL); + // Traverse the json for keys + Iterator inpIt = inpDt.iterator(); + float[] maxlim = new float[vectorSize]; + float[] minlim = new float[vectorSize]; + while (inpIt.hasNext()) { + + JsonNode curNode = inpIt.next(); + // fetch value of key + String key = curNode.get(KEY_LABEL).asText(); + // fetch value of centroid + ArrayNode centroid = (ArrayNode) curNode.get(CENTROID_LABEL); + // fetch value of standard deviation + ArrayNode stndrdDev = (ArrayNode) curNode.get(SD_LABEL); + // initiate a list of words + ArrayNode wordArr = OBJ_MAPPER.createArrayNode(); + // Initialize result json + ObjectNode resObj = new ObjectNode(JSON_FACTORY); + // insert a mapping of key to list in the result json + resObj.set(key, wordArr); + boolean limitNotSet = true; + // loop through the model + for (Entry wordEntry : word2vec.entrySet()) { + String word = wordEntry.getKey(); + float[] wordvec = wordEntry.getValue(); + boolean isValid = true; + for (int i = 0; i < centroid.size(); i++) { + if(limitNotSet) { + float centVal = centroid.get(i).floatValue(); + float sdVal = stndrdDev.get(i).floatValue(); + // maxlim = add sd to centroid + maxlim[i] = centVal + 3*sdVal; + // minlim = subtract sb from centroid + minlim[i] = centVal - 3*sdVal; + } + // check if values of all the dimensions are under maxlim and minlim + float curVal = wordvec[i]; + if (curVal > maxlim[i] || curVal < minlim[i]) { + isValid = false; + break; + } + } + limitNotSet = false; + if (isValid) { + // store the word in the list + wordArr.add(word); + } + } + // Persist each json file + File outputJsonFile = new File(outputFileDir + "/" + key + ".json"); + OBJ_WRITER.writeValue(outputJsonFile, resObj); + } + } + + public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { + File subsetConfig = new File(".\\word2vec-dump\\subsetconfig2.json"); + Word2VecModel model = Word2VecFactory.get(); + generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), model.word2vec, model.vectorSize); + } +} diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java new file mode 100644 index 0000000..a694f70 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java @@ -0,0 +1,53 @@ +package org.aksw.word2vecrestful.subset; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.aksw.word2vecrestful.utils.Cfg; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; + +public class DataSubsetProvider { + + public static String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"); + public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); + public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); + + public static List fetchSubsetWords(String subsetKey) + throws JsonProcessingException, FileNotFoundException, IOException { + List resList = new ArrayList<>(); + // logic to fetch the words from the stored subsets + ObjectNode inpObj = null; + File dir = new File(fileDir); + File[] directoryListing = dir.listFiles(); + if (directoryListing != null) { + for (File child : directoryListing) { + // Do something with child + if (child.getName().equalsIgnoreCase(getJsonExtension(subsetKey))) { + inpObj = (ObjectNode) OBJ_READER.readTree(new FileInputStream(child)); + break; + } + } + } + if(inpObj!=null) { + ArrayNode wordArr = (ArrayNode) inpObj.get(subsetKey); + for(int i=0;i word2vec, int vectorSize) { + // first create file object for file placed at location + // specified by filepath + File file = new File(filePath); + try { + // create FileWriter object with file as parameter + FileWriter outputfile = new FileWriter(file); + + // create CSVWriter object filewriter object as parameter + CSVWriter writer = new CSVWriter(outputfile); + String[] rowStr = new String[vectorSize + 1]; + for (Entry wordEntry : word2vec.entrySet()) { + rowStr[0] = wordEntry.getKey(); + float[] value = wordEntry.getValue(); + for (int i = 0; i < vectorSize; i++) { + rowStr[i + 1] = String.valueOf(value[i]); + } + writer.writeNext(rowStr); + } + + // closing writer connection + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { + Word2VecModel model = Word2VecFactory.get(); + writeDataLineByLine("D:\\Nikit\\DICE-Group\\word2vec-dump\\word2vec-model.csv", model.word2vec, + model.vectorSize); + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java index 2d70161..6768e56 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java @@ -1,13 +1,18 @@ package org.aksw.word2vecrestful.word2vec; +import java.io.FileNotFoundException; +import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; import java.util.TreeSet; +import org.aksw.word2vecrestful.subset.DataSubsetProvider; import org.aksw.word2vecrestful.utils.Word2VecMath; +import com.fasterxml.jackson.core.JsonProcessingException; + /** * * @author MichaelRoeder @@ -74,6 +79,36 @@ public Map getClosestEntry(float[] inpvec){ resMap.put(minWord, minVec); return resMap; } + + /** + * Method to fetch a single closest word entry to the passed input vector + * inside the subset of word2vec model + * @param inpvec - input vector to find closest word for + * @param subsetKey - key to identify the subset model + * @return Map of the single closest word and its vector + * @throws IOException + * @throws FileNotFoundException + * @throws JsonProcessingException + */ + public Map getClosestEntryInSub(float[] inpvec, String subsetKey) throws JsonProcessingException, FileNotFoundException, IOException{ + Map resMap = new HashMap<>(); + double minDist = -2; + String minWord = null; + float[] minVec = null; + double tempDist; + // Loop on the subset + for(String word : DataSubsetProvider.fetchSubsetWords(subsetKey)) { + float[] wordvec = word2vec.get(word); + tempDist = getSqEucDist(inpvec, wordvec, minDist); + if(tempDist != -1) { + minWord = word; + minVec = wordvec; + minDist = tempDist; + } + } + resMap.put(minWord, minVec); + return resMap; + } /** * Method to find the squared value of euclidean distance between two vectors if it is less * than the provided minimum distance value, otherwise return -1 From c1e4913c644229c71e5f6cbe8a3694cb8231daf4 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 22 Aug 2018 18:49:56 +0200 Subject: [PATCH 002/114] javadoc comments + makedir --- .../subset/DataSubsetGenerator.java | 40 +++++++++++++++---- .../subset/DataSubsetProvider.java | 25 ++++++++++-- .../tool/ModelToCSVWriter.java | 23 +++++++++-- 3 files changed, 72 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java index 796e693..d80488a 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -22,6 +22,12 @@ import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; +/** + * Class to help generate and persist the subsets of a word2vec model + * + * @author Nikit + * + */ public class DataSubsetGenerator { public static final JsonNodeFactory JSON_FACTORY = JsonNodeFactory.instance; @@ -31,9 +37,19 @@ public class DataSubsetGenerator { public static final String SD_LABEL = "sd"; public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); - public static final ObjectWriter OBJ_WRITER = OBJ_MAPPER.writer(new DefaultPrettyPrinter()); - public static void generateSubsetFiles(File subsetConfig, String outputFileDir, Map word2vec, int vectorSize) - throws JsonProcessingException, FileNotFoundException, IOException { + public static final ObjectWriter OBJ_WRITER = OBJ_MAPPER.writer(new DefaultPrettyPrinter()); + /** + * Method to generate subset json files for a given configuration and word2vec model + * @param subsetConfig - configuration json file + * @param outputFileDir - output directory for the subset files + * @param word2vec - word2vec model map + * @param vectorSize - size of the vectors in model + * @throws JsonProcessingException + * @throws FileNotFoundException + * @throws IOException + */ + public static void generateSubsetFiles(File subsetConfig, String outputFileDir, Map word2vec, + int vectorSize) throws JsonProcessingException, FileNotFoundException, IOException { // Read file into a json ObjectNode inpObj = (ObjectNode) OBJ_READER.readTree(new FileInputStream(subsetConfig)); ArrayNode inpDt = (ArrayNode) inpObj.get(DATA_LABEL); @@ -63,13 +79,13 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, float[] wordvec = wordEntry.getValue(); boolean isValid = true; for (int i = 0; i < centroid.size(); i++) { - if(limitNotSet) { + if (limitNotSet) { float centVal = centroid.get(i).floatValue(); float sdVal = stndrdDev.get(i).floatValue(); // maxlim = add sd to centroid - maxlim[i] = centVal + 3*sdVal; + maxlim[i] = centVal + 3 * sdVal; // minlim = subtract sb from centroid - minlim[i] = centVal - 3*sdVal; + minlim[i] = centVal - 3 * sdVal; } // check if values of all the dimensions are under maxlim and minlim float curVal = wordvec[i]; @@ -86,13 +102,21 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, } // Persist each json file File outputJsonFile = new File(outputFileDir + "/" + key + ".json"); + outputJsonFile.getParentFile().mkdirs(); OBJ_WRITER.writeValue(outputJsonFile, resObj); } } - + /** + * Method to demonstrate example usage + * @param args + * @throws JsonProcessingException + * @throws FileNotFoundException + * @throws IOException + */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { File subsetConfig = new File(".\\word2vec-dump\\subsetconfig2.json"); Word2VecModel model = Word2VecFactory.get(); - generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), model.word2vec, model.vectorSize); + generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), model.word2vec, + model.vectorSize); } } diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java index a694f70..95a2568 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java @@ -15,12 +15,25 @@ import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; +/** + * Class to help retrieve the list of words from a subset stored on the disk + * + * @author Nikit + * + */ public class DataSubsetProvider { public static String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"); public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); - + /** + * Method to fetch the list of words in a subset + * @param subsetKey - key to identify the subset + * @return - a list of words in the related subset + * @throws JsonProcessingException + * @throws FileNotFoundException + * @throws IOException + */ public static List fetchSubsetWords(String subsetKey) throws JsonProcessingException, FileNotFoundException, IOException { List resList = new ArrayList<>(); @@ -37,15 +50,19 @@ public static List fetchSubsetWords(String subsetKey) } } } - if(inpObj!=null) { + if (inpObj != null) { ArrayNode wordArr = (ArrayNode) inpObj.get(subsetKey); - for(int i=0;i word2vec, int vectorSize) { // first create file object for file placed at location // specified by filepath @@ -41,10 +50,16 @@ public static void writeDataLineByLine(String filePath, Map wor e.printStackTrace(); } } - + /** + * Method to demonstrate example usage + * @param args + * @throws JsonProcessingException + * @throws FileNotFoundException + * @throws IOException + */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { Word2VecModel model = Word2VecFactory.get(); - writeDataLineByLine("D:\\Nikit\\DICE-Group\\word2vec-dump\\word2vec-model.csv", model.word2vec, + writeDataLineByLine(".\\word2vec-dump\\word2vec-model.csv", model.word2vec, model.vectorSize); } From de2dea001cbc596c1d0671172e2b6ccba27ae66d Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 22 Aug 2018 18:53:33 +0200 Subject: [PATCH 003/114] cleaning pom of extra entries --- pom.xml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pom.xml b/pom.xml index c8d85f2..562d688 100644 --- a/pom.xml +++ b/pom.xml @@ -44,12 +44,6 @@ 4.11 test - - - javax.xml.bind - jaxb-api - 2.1 - @@ -91,7 +85,6 @@ methods 10 - -Xmx6056m From 9dfc8514a0b1a34665aef28277ce0c0117c15c4f Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 23 Aug 2018 13:41:14 +0200 Subject: [PATCH 004/114] Implementing requested changes 1. subset files stored as a list of words separated by endline 2. subset models saved in memory at time of query 3. removing unnecessary exception declarations --- .../subset/DataSubsetGenerator.java | 29 +++----- .../subset/DataSubsetProvider.java | 74 +++++++++++-------- .../word2vec/Word2VecModel.java | 2 +- 3 files changed, 57 insertions(+), 48 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java index d80488a..538a57f 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -1,8 +1,10 @@ package org.aksw.word2vecrestful.subset; +import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileWriter; import java.io.IOException; import java.util.Iterator; import java.util.Map; @@ -13,13 +15,10 @@ import org.aksw.word2vecrestful.word2vec.Word2VecModel; import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; -import com.fasterxml.jackson.databind.ObjectWriter; import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; /** @@ -30,14 +29,12 @@ */ public class DataSubsetGenerator { - public static final JsonNodeFactory JSON_FACTORY = JsonNodeFactory.instance; public static final String DATA_LABEL = "data"; public static final String KEY_LABEL = "key"; public static final String CENTROID_LABEL = "centroid"; public static final String SD_LABEL = "sd"; public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); - public static final ObjectWriter OBJ_WRITER = OBJ_MAPPER.writer(new DefaultPrettyPrinter()); /** * Method to generate subset json files for a given configuration and word2vec model * @param subsetConfig - configuration json file @@ -66,12 +63,11 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, ArrayNode centroid = (ArrayNode) curNode.get(CENTROID_LABEL); // fetch value of standard deviation ArrayNode stndrdDev = (ArrayNode) curNode.get(SD_LABEL); - // initiate a list of words - ArrayNode wordArr = OBJ_MAPPER.createArrayNode(); - // Initialize result json - ObjectNode resObj = new ObjectNode(JSON_FACTORY); - // insert a mapping of key to list in the result json - resObj.set(key, wordArr); + // create an output file + File outputFile = new File(outputFileDir + "/" + key + ".txt"); + outputFile.getParentFile().mkdirs(); + // open an output stream + BufferedWriter bWriter = new BufferedWriter(new FileWriter(outputFile)); boolean limitNotSet = true; // loop through the model for (Entry wordEntry : word2vec.entrySet()) { @@ -96,14 +92,13 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, } limitNotSet = false; if (isValid) { - // store the word in the list - wordArr.add(word); + // write the word in the file + bWriter.write(word); + bWriter.newLine(); } } - // Persist each json file - File outputJsonFile = new File(outputFileDir + "/" + key + ".json"); - outputJsonFile.getParentFile().mkdirs(); - OBJ_WRITER.writeValue(outputJsonFile, resObj); + //Close the stream + bWriter.close(); } } /** diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java index 95a2568..dd7d643 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java @@ -1,19 +1,18 @@ package org.aksw.word2vecrestful.subset; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.aksw.word2vecrestful.utils.Cfg; import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectReader; -import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.ObjectNode; /** * Class to help retrieve the list of words from a subset stored on the disk @@ -24,47 +23,62 @@ public class DataSubsetProvider { public static String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"); - public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); - public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); + public static final Map> SUBSET_MODELS = new HashMap<>(); + /** * Method to fetch the list of words in a subset - * @param subsetKey - key to identify the subset + * + * @param subsetKey + * - key to identify the subset * @return - a list of words in the related subset * @throws JsonProcessingException * @throws FileNotFoundException * @throws IOException */ public static List fetchSubsetWords(String subsetKey) - throws JsonProcessingException, FileNotFoundException, IOException { - List resList = new ArrayList<>(); - // logic to fetch the words from the stored subsets - ObjectNode inpObj = null; - File dir = new File(fileDir); - File[] directoryListing = dir.listFiles(); - if (directoryListing != null) { - for (File child : directoryListing) { - // Do something with child - if (child.getName().equalsIgnoreCase(getJsonExtension(subsetKey))) { - inpObj = (ObjectNode) OBJ_READER.readTree(new FileInputStream(child)); - break; + throws IOException { + // fetch from cache + List resList = SUBSET_MODELS.get(subsetKey); + // if not in cache then read from file and add to cache + if (resList == null) { + resList = new ArrayList<>(); + // logic to fetch the words from the stored subsets + File dir = new File(fileDir); + // declare inputstream + BufferedReader bReader = null; + File[] directoryListing = dir.listFiles(); + if (directoryListing != null) { + for (File child : directoryListing) { + // Do something with child + if (child.getName().equalsIgnoreCase(appendFileExtension(subsetKey))) { + bReader = new BufferedReader(new FileReader(child)); + break; + } } } - } - if (inpObj != null) { - ArrayNode wordArr = (ArrayNode) inpObj.get(subsetKey); - for (int i = 0; i < wordArr.size(); i++) { - resList.add(wordArr.get(i).asText()); + if (bReader != null) { + while (true) { + String word = bReader.readLine(); + if (word == null) { + break; + } + resList.add(word); + } } + SUBSET_MODELS.put(subsetKey, resList); } return resList; } + /** - * Method to append json extension at the end of a key - * @param name - key - * @return key append with json extension + * Method to append txt extension at the end of a key + * + * @param name + * - key + * @return key appended with txt extension */ - public static String getJsonExtension(String name) { - return name + ".json"; + public static String appendFileExtension(String name) { + return name + ".txt"; } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java index 6768e56..196e4a9 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java @@ -90,7 +90,7 @@ public Map getClosestEntry(float[] inpvec){ * @throws FileNotFoundException * @throws JsonProcessingException */ - public Map getClosestEntryInSub(float[] inpvec, String subsetKey) throws JsonProcessingException, FileNotFoundException, IOException{ + public Map getClosestEntryInSub(float[] inpvec, String subsetKey) throws IOException{ Map resMap = new HashMap<>(); double minDist = -2; String minWord = null; From 7a94730d5c68ba7e4697f01bda55487fb2d597bb Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 23 Aug 2018 13:45:58 +0200 Subject: [PATCH 005/114] documenting expected config format --- .../aksw/word2vecrestful/subset/DataSubsetGenerator.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java index 538a57f..ccd555e 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -23,6 +23,15 @@ /** * Class to help generate and persist the subsets of a word2vec model + * Expected json config format: + * + * { + * "data" : [ { + * "key" : "xyz", + * "centroid" : [...], + * "sd" : [...] + * }...] + * } * * @author Nikit * From fd5b906c4247bbec35feb3d1de7435dc6ac0d937 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 23 Aug 2018 16:46:59 +0200 Subject: [PATCH 006/114] Implementing requested changes 1. Removed static handling of DataSubsetProvider 2. Better file handling for reading subset file 3. UTF8 encoding enforced on the text 4. Formatted documentation of data json --- pom.xml | 8 ++- .../subset/DataSubsetGenerator.java | 54 +++++++++++++------ .../subset/DataSubsetProvider.java | 39 ++++---------- .../word2vec/Word2VecModel.java | 4 +- 4 files changed, 56 insertions(+), 49 deletions(-) diff --git a/pom.xml b/pom.xml index 562d688..b78df5b 100644 --- a/pom.xml +++ b/pom.xml @@ -57,7 +57,13 @@ opencsv 4.1 - + + + + commons-io + commons-io + 2.6 + diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java index ccd555e..793e874 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -4,8 +4,8 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; -import java.io.FileWriter; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; @@ -13,6 +13,7 @@ import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; +import org.apache.commons.io.output.FileWriterWithEncoding; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; @@ -22,16 +23,26 @@ import com.fasterxml.jackson.databind.node.ObjectNode; /** - * Class to help generate and persist the subsets of a word2vec model - * Expected json config format: + * Class to help generate and persist the subsets of a word2vec model Expected + * json config format: * - * { - * "data" : [ { - * "key" : "xyz", - * "centroid" : [...], - * "sd" : [...] - * }...] - * } + *
+ *{
+ * "data" : [ {
+ *  "key" : "xyz",
+ *  "centroid" : [...],
+ *  "sd" : [...]
+ * }...]
+ *}
+ * 
+ *
+ *
key
+ *
identification literal for the subset
+ *
centroid
+ *
centroid array for the subset
+ *
sd
+ *
standard deviation array for the subset
+ *
* * @author Nikit * @@ -44,12 +55,19 @@ public class DataSubsetGenerator { public static final String SD_LABEL = "sd"; public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); + /** - * Method to generate subset json files for a given configuration and word2vec model - * @param subsetConfig - configuration json file - * @param outputFileDir - output directory for the subset files - * @param word2vec - word2vec model map - * @param vectorSize - size of the vectors in model + * Method to generate subset json files for a given configuration and word2vec + * model + * + * @param subsetConfig + * - configuration json file + * @param outputFileDir + * - output directory for the subset files + * @param word2vec + * - word2vec model map + * @param vectorSize + * - size of the vectors in model * @throws JsonProcessingException * @throws FileNotFoundException * @throws IOException @@ -76,7 +94,7 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, File outputFile = new File(outputFileDir + "/" + key + ".txt"); outputFile.getParentFile().mkdirs(); // open an output stream - BufferedWriter bWriter = new BufferedWriter(new FileWriter(outputFile)); + BufferedWriter bWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile, StandardCharsets.UTF_8)); boolean limitNotSet = true; // loop through the model for (Entry wordEntry : word2vec.entrySet()) { @@ -106,12 +124,14 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, bWriter.newLine(); } } - //Close the stream + // Close the stream bWriter.close(); } } + /** * Method to demonstrate example usage + * * @param args * @throws JsonProcessingException * @throws FileNotFoundException diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java index dd7d643..ce533b4 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java @@ -1,16 +1,15 @@ package org.aksw.word2vecrestful.subset; -import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; -import java.io.FileReader; import java.io.IOException; -import java.util.ArrayList; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.List; import java.util.Map; import org.aksw.word2vecrestful.utils.Cfg; +import org.apache.commons.io.FileUtils; import com.fasterxml.jackson.core.JsonProcessingException; @@ -22,8 +21,8 @@ */ public class DataSubsetProvider { - public static String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"); - public static final Map> SUBSET_MODELS = new HashMap<>(); + private String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"); + private final Map> SUBSET_MODELS = new HashMap<>(); /** * Method to fetch the list of words in a subset @@ -35,37 +34,17 @@ public class DataSubsetProvider { * @throws FileNotFoundException * @throws IOException */ - public static List fetchSubsetWords(String subsetKey) - throws IOException { + public List fetchSubsetWords(String subsetKey) throws IOException { // fetch from cache List resList = SUBSET_MODELS.get(subsetKey); // if not in cache then read from file and add to cache if (resList == null) { - resList = new ArrayList<>(); // logic to fetch the words from the stored subsets - File dir = new File(fileDir); - // declare inputstream - BufferedReader bReader = null; - File[] directoryListing = dir.listFiles(); - if (directoryListing != null) { - for (File child : directoryListing) { - // Do something with child - if (child.getName().equalsIgnoreCase(appendFileExtension(subsetKey))) { - bReader = new BufferedReader(new FileReader(child)); - break; - } - } + File file1 = new File(fileDir + "/" + appendFileExtension(subsetKey)); + if (file1.exists()) { + resList = FileUtils.readLines(file1, StandardCharsets.UTF_8); + SUBSET_MODELS.put(subsetKey, resList); } - if (bReader != null) { - while (true) { - String word = bReader.readLine(); - if (word == null) { - break; - } - resList.add(word); - } - } - SUBSET_MODELS.put(subsetKey, resList); } return resList; } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java index 196e4a9..ce5b53d 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java @@ -24,10 +24,12 @@ public class Word2VecModel { public int vectorSize; public TreeSet sortedVecDimns; public int[] dimRngIndxDesc; + private DataSubsetProvider dataSubsetProvider; public Word2VecModel(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; this.sortedVecDimns = new TreeSet<>(); + this.dataSubsetProvider = new DataSubsetProvider(); //fetch vector dimension details fetchVectorDimensions(); } @@ -97,7 +99,7 @@ public Map getClosestEntryInSub(float[] inpvec, String subsetKe float[] minVec = null; double tempDist; // Loop on the subset - for(String word : DataSubsetProvider.fetchSubsetWords(subsetKey)) { + for(String word : dataSubsetProvider.fetchSubsetWords(subsetKey)) { float[] wordvec = word2vec.get(word); tempDist = getSqEucDist(inpvec, wordvec, minDist); if(tempDist != -1) { From 1707493dca17ab997ebf44cadcb1646dfe592336 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 30 Aug 2018 17:32:09 +0200 Subject: [PATCH 007/114] Intermediate commit --- src/main/java/nikit/test/Word2VecTester.java | 73 +++++ .../db/NormalizedDBModelGenerator.java | 89 ++++++ .../word2vecrestful/db/SQLiteDBHandler.java | 98 +++++++ .../subset/DataSubsetProvider.java | 14 +- .../word2vecrestful/tool/ModelNormalizer.java | 155 ++++++++++ .../tool/ModelStatsWriter.java | 93 ++++++ .../word2vecrestful/utils/Word2VecMath.java | 22 +- .../word2vec/GenWord2VecModel.java | 9 + .../word2vec/W2VNrmlDbModel.java | 39 +++ .../word2vec/W2VNrmlMemModel.java | 272 ++++++++++++++++++ .../word2vec/Word2VecModelLoader.java | 4 +- 11 files changed, 859 insertions(+), 9 deletions(-) create mode 100644 src/main/java/nikit/test/Word2VecTester.java create mode 100644 src/main/java/org/aksw/word2vecrestful/db/NormalizedDBModelGenerator.java create mode 100644 src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java create mode 100644 src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java create mode 100644 src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java diff --git a/src/main/java/nikit/test/Word2VecTester.java b/src/main/java/nikit/test/Word2VecTester.java new file mode 100644 index 0000000..e6fe8c3 --- /dev/null +++ b/src/main/java/nikit/test/Word2VecTester.java @@ -0,0 +1,73 @@ +package nikit.test; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Map; + +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; + +import com.fasterxml.jackson.core.JsonProcessingException; + +public class Word2VecTester { + public static void main(final String[] a) throws JsonProcessingException, FileNotFoundException, IOException { + + // loads model in memory + final Word2VecModel model = Word2VecFactory.get(); + /* final float[] vecCat = model.word2vec.get("cat"); + final float[] vecDog = model.word2vec.get("dog"); + final float[] vecItaly = model.word2vec.get("Italy"); + final float[] vecFrance = model.word2vec.get("France"); + final float[] vecInstead = model.word2vec.get("instead"); + final float[] vecWhether = model.word2vec.get("whether"); + final float[] vecHello = model.word2vec.get("hello"); + final float[] vecBand = model.word2vec.get("band"); + final float[] vecBeer = model.word2vec.get("beer"); + final float[] vecBear = model.word2vec.get("bear"); + final float[] vecBang = model.word2vec.get("bang"); + final float[] vecBang = model.word2vec.get("bang"); + vecCat[34] = 234; + vecDog[12] = 342; + vecItaly[80] = 12; + vecItaly[23] = 212; + vecFrance[76] = 12; + vecFrance[26] = 34; + vecFrance[35] = 1231; + vecInstead[37] = 35; + vecWhether[29] = 67; + vecHello[78] = 34; + vecBand[76] = -12; + vecBeer[23] = 32; + vecBear[56] = -8; + vecBang[13] = 23; + + System.out.println(model.getClosestEntry(vecCat)); + System.out.println(model.getClosestEntry(vecDog)); + System.out.println(model.getClosestEntry(vecItaly)); + System.out.println(model.getClosestEntry(vecFrance)); + System.out.println(model.getClosestEntry(vecInstead)); + System.out.println(model.getClosestEntry(vecWhether)); + System.out.println(model.getClosestEntry(vecHello)); + System.out.println(model.getClosestEntry(vecBand)); + System.out.println(model.getClosestEntry(vecBeer)); + System.out.println(model.getClosestEntry(vecBear)); + System.out.println(model.getClosestEntry(vecBang)); + + final double sim = Word2VecMath.cosineSimilarity(vecCat, vecDog); + System.out.println(sim);*/ + // test 1 + float[] vec1 = model.word2vec.get("WesternOne"); + Map closestWord = model.getClosestEntryInSub(vec1, "ns#country-name"); + System.out.println(closestWord.keySet()); + // test 2 + vec1 = model.word2vec.get("Donald_O._Schnuck"); + closestWord = model.getClosestEntryInSub(vec1, "ontology#ConferenceVenuePlacerdf-schema#label"); + System.out.println(closestWord.keySet()); + // test 3 + vec1 = model.word2vec.get("аn_ԁ_ѕοmе"); + closestWord = model.getClosestEntryInSub(vec1, "ontology#ConferenceVenuePlacerdf-schema#label"); + System.out.println(closestWord.keySet()); + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/db/NormalizedDBModelGenerator.java b/src/main/java/org/aksw/word2vecrestful/db/NormalizedDBModelGenerator.java new file mode 100644 index 0000000..9e62852 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/db/NormalizedDBModelGenerator.java @@ -0,0 +1,89 @@ +package org.aksw.word2vecrestful.db; + +import java.sql.PreparedStatement; +import java.sql.SQLException; + +public class NormalizedDBModelGenerator extends SQLiteDBHandler { + + private String mainTblName; + private int vectorSize; + private String insertQuery; + + public NormalizedDBModelGenerator(String dbName, String mainTblName, int vectorSize) { + super(dbName); + this.mainTblName = mainTblName; + this.vectorSize = vectorSize; + this.insertQuery = this.createInsertQuery(); + createMainTable(); + } + + public void createMainTable() { + StringBuilder sqlStr = new StringBuilder("CREATE TABLE IF NOT EXISTS "); + sqlStr.append(this.mainTblName); + sqlStr.append(" ( word text "); + for (int i = 0; i < vectorSize; i++) { + sqlStr.append(", val").append(i + 1).append(" float NOT NULL "); + } + sqlStr.append(");"); + executeStatement(sqlStr.toString()); + } + + private String createInsertQuery() { + StringBuilder insrtStr = new StringBuilder(); + insrtStr.append("insert into ").append(this.mainTblName).append(" values ( ?"); + for (int i = 0; i < vectorSize; i++) { + insrtStr.append(", ?"); + } + insrtStr.append(") ;"); + return insrtStr.toString(); + } + + public PreparedStatement generateMainTblInsrtStmnt() throws SQLException { + PreparedStatement prep = connection.prepareStatement(this.insertQuery); + connection.setAutoCommit(false); + return prep; + } + + /** + * Creates an index. + */ + public void makeIndex() { + final String sql = "CREATE INDEX Idx1 ON " + this.mainTblName + "(word)"; + if (connection != null) { + try { + final PreparedStatement prep = connection.prepareStatement(sql); + prep.execute(); + prep.close(); + commit(); + } catch (final SQLException e) { + LOG.error(e.getLocalizedMessage(), e); + } + } + } + + public void addMainTblInsrtBatch(String word, float[] vector, PreparedStatement ps) throws SQLException { + ps.setString(1, word); + for (int i = 0; i < this.vectorSize; i++) { + ps.setFloat(i + 2, vector[i]); + } + ps.addBatch(); + } + + public int[] executeBatchCommit(PreparedStatement ps) throws SQLException { + int[] res = ps.executeBatch(); + connection.commit(); + return res; + } + + public boolean insertMainTblRecord(String word, float[] vector) throws SQLException { + boolean recInserted = false; + PreparedStatement prep = connection.prepareStatement(this.insertQuery); + prep.setString(1, word); + for (int i = 0; i < this.vectorSize; i++) { + prep.setFloat(i + 2, vector[i]); + } + recInserted = prep.execute(); + prep.close(); + return recInserted; + } +} diff --git a/src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java b/src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java new file mode 100644 index 0000000..e37f0dd --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java @@ -0,0 +1,98 @@ +package org.aksw.word2vecrestful.db; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.sql.Statement; + +import org.apache.log4j.Logger; +import org.sqlite.SQLiteConfig; + +public class SQLiteDBHandler { + + public static Logger LOG = Logger.getLogger(AbstractSQLightDB.class); + + private static int queryTimeout = 30; + + private String db; + protected Connection connection = null; + protected Statement statement = null; + + public SQLiteDBHandler(String dbName) { + try { + this.db = dbName.concat(".db"); + Class.forName("org.sqlite.JDBC"); + } catch (final ClassNotFoundException e) { + LOG.error(e.getLocalizedMessage(), e); + } + + } + + public void commit() throws SQLException { + if (connection != null) { + connection.commit(); + } + } + + /** + * Executes the given query on database and returns the numbers of rows updated + * + * @param query + * - statement to be executed + * @return - numbers of rows updated + */ + protected boolean executeStatement(String query) { + boolean res = false; + if (connect()) { + try { + statement = connection.createStatement(); + statement.setQueryTimeout(queryTimeout); + res = statement.execute(query); + } catch (final SQLException e) { + LOG.error(e.getLocalizedMessage(), e); + } finally { + disconnect(); + } + } + return res; + } + + /** + * Disconnect DB. + */ + public void disconnect() { + try { + if (connection != null) { + connection.close(); + } + } catch (final SQLException e) { + LOG.error("\n", e); + } + } + + /** + * Connect DB. + */ + public boolean connect() { + final SQLiteConfig config = new SQLiteConfig(); + // config.setEncoding(SQLiteConfig.Encoding.UTF8); + return connect(config); + } + + /** + * Connect DB. + * + * @param config + * @return true if connected + */ + protected boolean connect(final SQLiteConfig config) { + try { + connection = DriverManager.getConnection("jdbc:sqlite:".concat(db), config.toProperties()); + } catch (final SQLException e) { + LOG.error("\n", e); + statement = null; + } + return connection == null ? false : true; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java index ce533b4..9af9ebb 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java @@ -5,8 +5,9 @@ import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.HashMap; -import java.util.List; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import org.aksw.word2vecrestful.utils.Cfg; import org.apache.commons.io.FileUtils; @@ -22,10 +23,10 @@ public class DataSubsetProvider { private String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"); - private final Map> SUBSET_MODELS = new HashMap<>(); + private final Map> SUBSET_MODELS = new HashMap<>(); /** - * Method to fetch the list of words in a subset + * Method to fetch the set of words in a subset * * @param subsetKey * - key to identify the subset @@ -34,15 +35,16 @@ public class DataSubsetProvider { * @throws FileNotFoundException * @throws IOException */ - public List fetchSubsetWords(String subsetKey) throws IOException { + public Set fetchSubsetWords(String subsetKey) throws IOException { // fetch from cache - List resList = SUBSET_MODELS.get(subsetKey); + Set resList = SUBSET_MODELS.get(subsetKey); // if not in cache then read from file and add to cache if (resList == null) { // logic to fetch the words from the stored subsets File file1 = new File(fileDir + "/" + appendFileExtension(subsetKey)); if (file1.exists()) { - resList = FileUtils.readLines(file1, StandardCharsets.UTF_8); + resList = new HashSet<>(); + resList.addAll(FileUtils.readLines(file1, StandardCharsets.UTF_8)); SUBSET_MODELS.put(subsetKey, resList); } } diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java new file mode 100644 index 0000000..def3e4b --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java @@ -0,0 +1,155 @@ +package org.aksw.word2vecrestful.tool; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.sql.PreparedStatement; +import java.sql.SQLException; + +import org.aksw.word2vecrestful.db.NormalizedDBModelGenerator; +import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModelLoader; +import org.apache.commons.io.output.FileWriterWithEncoding; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +public class ModelNormalizer { + public static Logger LOG = LogManager.getLogger(ModelNormalizer.class); + + /** + * Method to normalize a bin word2vec model line + * + * @param line + * - line from a bin model to be normalized + * @param vectorSize + * - size of the vector + * @return - normalized line + */ + public String getNormalizedVecLine(String word, float[] vector) { + StringBuffer resStr = new StringBuffer(); + resStr.append(word); + vector = Word2VecMath.normalize(vector); + for (int i = 0; i < vector.length; i++) { + resStr.append(" ").append(String.valueOf(vector[i])); + } + return resStr.toString(); + } + + /** + * Method to generate a normalized model for a word2vec bin model + * + * @param inputFile + * - word2vec file of the model to be normalized + * @param outputFile + * - output file for normalized model + * @throws IOException + */ + public void generateNormalizedModel(File inputFile, File outputFile) throws IOException { + // ensure directory creation + outputFile.getParentFile().mkdirs(); + // open an output stream + BufferedWriter bWriter = null; + FileInputStream fin = null; + try { + // reads file header + fin = new FileInputStream(inputFile); + String word = Word2VecModelLoader.readWord(fin); + int words = Integer.parseInt(word); + word = Word2VecModelLoader.readWord(fin); + int vectorSize = Integer.parseInt(word); + bWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile, StandardCharsets.UTF_8)); + bWriter.write(words + " " + vectorSize); + LOG.info("Expecting " + words + " words with " + vectorSize + " values per vector."); + for (int w = 0; w < words; ++w) { + word = Word2VecModelLoader.readWord(fin); + // LOG.info(word); + float[] vector = Word2VecModelLoader.readVector(fin, vectorSize); + bWriter.newLine(); + bWriter.write(getNormalizedVecLine(word, vector)); + if (w % 10000 == 0) { + bWriter.flush(); + } + } + } catch (final IOException e) { + LOG.error(e.getLocalizedMessage(), e); + } finally { + fin.close(); + bWriter.close(); + } + } + + /** + * Method to persist a normalized model for a word2vec bin model + * + * @param inputFile + * - word2vec file of the model to be normalized + * @param dbName + * - name of the database + * @param tablName + * - name of the table to store the data in + * @throws IOException + * @throws SQLException + */ + public void persistNormalizedModel(File inputFile, String dbName, String tblName) throws IOException, SQLException { + // intialize handler instance + NormalizedDBModelGenerator dbHandler = null; + + FileInputStream fin = null; + try { + // reads file header + fin = new FileInputStream(inputFile); + String word = Word2VecModelLoader.readWord(fin); + int words = Integer.parseInt(word); + word = Word2VecModelLoader.readWord(fin); + int vectorSize = Integer.parseInt(word); + dbHandler = new NormalizedDBModelGenerator(dbName, tblName, vectorSize); + // open connection + dbHandler.connect(); + LOG.info("Expecting " + words + " words with " + vectorSize + " values per vector."); + // create preparedstatement + PreparedStatement ps = dbHandler.generateMainTblInsrtStmnt(); + for (int w = 0; w < words; ++w) { + word = Word2VecModelLoader.readWord(fin); + // LOG.info(word); + float[] vector = Word2VecModelLoader.readVector(fin, vectorSize); + // dbHandler.insertMainTblRecord(word, vector); + dbHandler.addMainTblInsrtBatch(word, Word2VecMath.normalize(vector), ps); + if((w+1)%50000 == 0) { + dbHandler.executeBatchCommit(ps); + LOG.info((w+1)+" Records inserted."); + } + } + dbHandler.executeBatchCommit(ps); + // Generate Index on completion + dbHandler.makeIndex(); + } catch (final IOException e) { + LOG.error(e.getLocalizedMessage(), e); + } finally { + fin.close(); + dbHandler.disconnect(); + } + } + + /*public static void main(String[] args) throws IOException { + String cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); + String model = (Cfg.get(cfgKeyModel)); + ModelNormalizer modelNormalizer = new ModelNormalizer(); + File inputFile = new File(model); + File outputFile = new File("D:\\Nikit\\DICE-Group\\Jword2vec\\data\\normal\\GoogleNews-vectors-negative300-normalized.txt"); + modelNormalizer.generateNormalizedModel(inputFile, outputFile); + }*/ + + public static void main(String[] args) throws IOException, SQLException { + String cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); + String model = (Cfg.get(cfgKeyModel)); + ModelNormalizer modelNormalizer = new ModelNormalizer(); + File inputFile = new File(model); + // modelNormalizer.generateNormalizedModel(inputFile, outputFile); + modelNormalizer.persistNormalizedModel(inputFile, "data/nrmldb/word2vecmodel", "wordtovec"); + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java new file mode 100644 index 0000000..16b6bc7 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java @@ -0,0 +1,93 @@ +package org.aksw.word2vecrestful.tool; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.opencsv.CSVWriter; + +public class ModelStatsWriter { + public static Logger LOG = LogManager.getLogger(ModelStatsWriter.class); + + // Method to write the results to a csv + public static Map writeModelStats(Map word2vecMap, int vectorSize, File outputFile) + throws IOException { + Map resMap = new HashMap<>(); + outputFile.getParentFile().mkdirs(); + // create FileWriter object with file as parameter + FileWriter fileWriter = new FileWriter(outputFile); + // create CSVWriter object filewriter object as parameter + CSVWriter writer = new CSVWriter(fileWriter); + // Writer Header + String[] header = { "dimension_index", "min", "max", "mean", "variance", "sd" }; + writer.writeNext(header); + Set> entries = word2vecMap.entrySet(); + int totSize = word2vecMap.size(); + // loop all dimensions + for (int i = 0; i < vectorSize; i++) { + // loop through all the words + Float min = null; + Float max = null; + int j = 0; + float[] dimsnArr = new float[totSize]; + float sum = 0; + for (Entry entry : entries) { + float[] vecArr = entry.getValue(); + float val = vecArr[i]; + if (min == null || val < min) { + min = val; + } + if (max == null || val > max) { + max = val; + } + sum += val; + dimsnArr[j++] = val; + } + // mean + float mean = sum / dimsnArr.length; + sum = 0; + for (j = 0; j < dimsnArr.length; j++) { + sum += Math.pow(dimsnArr[j] - mean, 2); + } + float variance = sum / dimsnArr.length; + Double sd = Math.sqrt(variance); + resMap.put(i, sd.floatValue()); + // Write values to file + writeValues(i, min, max, mean, variance, sd, writer); + } + writer.close(); + return resMap; + } + + private static void writeValues(int index, float min, float max, float mean, float variance, double sd, + CSVWriter writer) { + String[] row = { String.valueOf(index + 1), String.valueOf(min), String.valueOf(max), String.valueOf(mean), + String.valueOf(variance), String.valueOf(sd) }; + writer.writeNext(row); + } + + /** + * Method to demonstrate example usage + * + * @param args + * @throws JsonProcessingException + * @throws FileNotFoundException + * @throws IOException + */ + public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { + // Get the normalized model + Word2VecModel model = Word2VecFactory.get(); + writeModelStats(model.word2vec, model.vectorSize, new File("data/normal/stat/modelstats.csv")); + } +} diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index 8652660..5fd9d1d 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -92,5 +92,25 @@ public static float getMin(float[] inputArray){ } } return minValue; - } + } + + // Method to calculate mean + public static float calcMean(float[] inpArr) { + float res = 0; + for(int i=0;ilen;i++) { + res += Math.pow(inpArr[i]- meanVal, 2); + } + return res; + } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java new file mode 100644 index 0000000..77fd3e6 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java @@ -0,0 +1,9 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.util.Map; + +public interface GenWord2VecModel { + public int getVectorSize(); + public Map getClosestEntry(float[] vector); + public Map getClosestSubEntry(float[] vector, String subKey); +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java new file mode 100644 index 0000000..46daf57 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java @@ -0,0 +1,39 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.util.Map; + +import org.aksw.word2vecrestful.db.SQLiteDBHandler; + +public class W2VNrmlDbModel implements GenWord2VecModel{ + + private String mainTblName= "wordtovec"; + private String wordColName = "word"; + private String vecColPrefix = "val"; + + private int vectorSize; + private SQLiteDBHandler dbHandler; + + private Map sdMap; + public W2VNrmlDbModel(String dbName, int vectorSize) { + this.vectorSize = vectorSize; + this.dbHandler = new SQLiteDBHandler(dbName); + // Calculate sd*3/10 and save in map + } + @Override + public int getVectorSize() { + return this.vectorSize; + } + + @Override + public Map getClosestEntry(float[] vector) { + // TODO Auto-generated method stub + return null; + } + + @Override + public Map getClosestSubEntry(float[] vector, String subKey) { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java new file mode 100644 index 0000000..deada6a --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -0,0 +1,272 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; + +import org.aksw.word2vecrestful.subset.DataSubsetProvider; +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +/** + * Class to encapsulate word2vec in-memory model and expose methods to perform + * search on the model + * + * @author Nikit + * + */ +public class W2VNrmlMemModel implements GenWord2VecModel { + public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); + + private Map word2vec; + private int vectorSize; + private Map sdMap; + /** + * Limit to the multiplier of area in which nearby vectors are to be looked + */ + private static final int EXHAUSTION_MULT = 10; + /** + * Multiplier for the standard deviation + */ + private static final int SIGMA_MULT = 3; + /** + * Divisor for the standard deviation's value + */ + private static final int AREA_DIVISOR = 10; + private DataSubsetProvider dataSubsetProvider; + + public W2VNrmlMemModel(final Map word2vec, final int vectorSize) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + this.dataSubsetProvider = new DataSubsetProvider(); + // Calculate sd*3/10 and save in map + setModelSd(word2vec, vectorSize); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private Map getClosestEntry(float[] vector, String subKey) { + Set wordSet = null; + Map closestVec = null; + try { + if (subKey == null) { + wordSet = word2vec.keySet(); + } else { + wordSet = dataSubsetProvider.fetchSubsetWords(subKey); + } + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + Map nearbyVecs = fetchNearbyVectors(vector, wordSet); + closestVec = findClosestVecInNearbyVecs(nearbyVecs, vector); + } catch (IOException e) { + LOG.error(e.getStackTrace()); + } + return closestVec; + } + + private Map findClosestVecInNearbyVecs(Map nearbyVecs, float[] vector) { + Map closestVec = new HashMap<>(); + TreeMap cosineSimMap = new TreeMap<>(); + for (String word : nearbyVecs.keySet()) { + cosineSimMap.put(Word2VecMath.cosineSimilarity(vector, nearbyVecs.get(word)), word); + } + String closestWord = cosineSimMap.lastEntry().getValue(); + closestVec.put(closestWord, nearbyVecs.get(closestWord)); + return closestVec; + } + + /** + * Method to find standard deviation for each dimension of word vector and store + * the operated value next to the dimension's index in sdMap + * + * @param word2vecMap + * - mapping of words alongwith their vectors + * @param vectorSize + * - size of each vector + */ + public void setModelSd(Map word2vecMap, int vectorSize) { + Map resMap = new HashMap<>(); + Set> entries = word2vecMap.entrySet(); + int totSize = word2vecMap.size(); + // loop all dimensions + for (int i = 0; i < vectorSize; i++) { + // loop through all the words + int j = 0; + float[] dimsnArr = new float[totSize]; + float sum = 0; + for (Entry entry : entries) { + float[] vecArr = entry.getValue(); + float val = vecArr[i]; + sum += val; + dimsnArr[j++] = val; + } + // mean + float mean = sum / dimsnArr.length; + sum = 0; + for (j = 0; j < dimsnArr.length; j++) { + sum += Math.pow(dimsnArr[j] - mean, 2); + } + float variance = sum / dimsnArr.length; + Double sd = Math.sqrt(variance); + resMap.put(i, sd.floatValue() * SIGMA_MULT / AREA_DIVISOR); + } + // Set as sdMap + this.sdMap = resMap; + } + + /** + * Method to fetch nearby vectors for a given vector in a particular word set + * + * @param vector + * - vector to look nearby vectors for + * @param wordSet + * - word set to look into for nearby vectors + * @return - mapping of nearby words alongwith with their vector values + */ + private Map fetchNearbyVectors(float[] vector, Set wordSet) { + Map nearbyVecMap = new HashMap<>(); + boolean mapEmpty = true; + boolean notExhausted = true; + float[][] minMaxVec = getMinMaxVec(vector); + int mult = 1; + while (mapEmpty && notExhausted) { + if (mult > 1) { + minMaxVec = multMinMaxVec(minMaxVec, mult); + } + for (String word : wordSet) { + float[] entryVec = word2vec.get(word); + if (isVectorInArea(entryVec, minMaxVec)) { + nearbyVecMap.put(word, entryVec); + } + } + if (nearbyVecMap.size() > 0) { + mapEmpty = false; + } else if (mult > EXHAUSTION_MULT) { + notExhausted = false; + } + + } + return nearbyVecMap; + } + + /** + * Method to check if vector falls in a particular area + * + * @param entryVec + * - vector to be verified + * @param minMaxVec + * - min vec and max vec as area's boundary + * @return - if the given vector is inside min and max vec's range + */ + private boolean isVectorInArea(float[] entryVec, float[][] minMaxVec) { + boolean isValid = true; + float[] minVec = minMaxVec[0]; + float[] maxVec = minMaxVec[1]; + for (int i = 0; i < entryVec.length; i++) { + if (entryVec[i] < minVec[i] || entryVec[i] > maxVec[i]) { + isValid = false; + break; + } + } + return isValid; + } + + /** + * Multiply each element of the given multi dimensional vector with a given + * multiplier + * + * @param minMaxVec + * - vector at which operation is to be performed + * @param mult + * - multiplier + * @return - Vector after multiplication with the multiplier + */ + private float[][] multMinMaxVec(float[][] minMaxVec, int mult) { + for (int i = 0; i < minMaxVec[0].length; i++) { + minMaxVec[0][i] = minMaxVec[0][i] * mult; + minMaxVec[1][i] = minMaxVec[1][i] * mult; + } + return minMaxVec; + } + + /** + * Method to generate two vectors from a given vector by adding and subtracting + * value in sdMap from the given vector + * + * @param vector + * - input vector to perform operation on + * @return - min vector at index 0 and max vector at index 1 + */ + private float[][] getMinMaxVec(float[] vector) { + float[][] resVec = new float[2][vector.length]; + for (int i = 0; i < vector.length; i++) { + float diff = sdMap.get(i); + // MinVec + resVec[0][i] = vector[i] - diff; + // MaxVec + resVec[1][i] = vector[i] + diff; + } + return resVec; + } + + /** + * Method to fetch vectorSize + * + * @return - vectorSize + */ + @Override + public int getVectorSize() { + return this.vectorSize; + } + + /** + * Method to fetch word2vec map + * + * @return - word2vec map + */ + public Map getWord2VecMap() { + return this.word2vec; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java index 910a2e9..1dbd540 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java @@ -113,7 +113,7 @@ public Word2VecModel loadModel(final File file) { return new Word2VecModel(word2Vector, vectorSize); } - private static float[] readVector(final FileInputStream fin, final int vectorSize) + public static float[] readVector(final FileInputStream fin, final int vectorSize) throws IOException { final byte bytes[] = new byte[vectorSize * 4]; fin.read(bytes); @@ -126,7 +126,7 @@ private static float[] readVector(final FileInputStream fin, final int vectorSiz return vector; } - private static String readWord(final FileInputStream fin) throws IOException { + public static String readWord(final FileInputStream fin) throws IOException { char c; final StringBuffer buffer = new StringBuffer(); c = (char) fin.read(); From 99a746db97f1ec39707a87960104c2d4f7c8ad78 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 6 Sep 2018 12:55:37 +0200 Subject: [PATCH 008/114] intermediate commit --- config/cfg.properties | 7 +- pom.xml | 6 + src/main/java/nikit/test/TestConst.java | 13 + src/main/java/nikit/test/Word2VecTester.java | 42 ---- .../db/mongo/MongoDbHandler.java | 82 ++++++ .../subset/DataSubsetGenerator.java | 10 +- .../word2vecrestful/tool/ModelNormalizer.java | 117 +++++++-- .../tool/ModelStatsWriter.java | 6 +- .../tool/MongoDBModelGenerator.java | 200 +++++++++++++++ .../NormalizedDBModelGenerator.java | 4 +- .../word2vecrestful/utils/Word2VecMath.java | 20 +- .../word2vec/W2VNrmlMemModel.java | 16 +- .../word2vec/W2VNrmlMongoDbModel.java | 238 ++++++++++++++++++ .../word2vec/Word2VecFactory.java | 30 ++- .../word2vec/Word2VecModelLoader.java | 6 +- .../NormalizedInMemModelTest.java | 46 ++++ .../NormalizedMongoModelTest.java | 50 ++++ 17 files changed, 800 insertions(+), 93 deletions(-) create mode 100644 src/main/java/nikit/test/TestConst.java create mode 100644 src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java create mode 100644 src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java rename src/main/java/org/aksw/word2vecrestful/{db => tool}/NormalizedDBModelGenerator.java (93%) create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java create mode 100644 src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java create mode 100644 src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java diff --git a/config/cfg.properties b/config/cfg.properties index 9d7bbe6..dc20612 100644 --- a/config/cfg.properties +++ b/config/cfg.properties @@ -3,4 +3,9 @@ org.aksw.word2vecrestful.web.Word2VecController.maxN: 10000 org.aksw.word2vecrestful.word2vec.Word2VecModelLoader.bin: true org.aksw.word2vecrestful.word2vec.Word2VecFactory.model: data/GoogleNews-vectors-negative300.bin org.aksw.word2vecrestful.Application.inmemory: true -org.aksw.word2vecrestful.Application.subsetfiledir: data/subset-files-1/ \ No newline at end of file +org.aksw.word2vecrestful.Application.subsetfiledir: data/subset-files-1/ +org.aksw.word2vecrestful.word2vec.normalizedmodel.bin: false +org.aksw.word2vecrestful.word2vec.normalizedmodel.model: data/normal/GoogleNews-vectors-negative300-normalized.txt +org.aksw.word2vecrestful.word2vec.normalizedbinmodel.bin: true +org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model: data/normalbinmodel/GoogleNews-vectors-negative300-normalized.bin +org.aksw.word2vecrestful.word2vec.stats.sdfile: data/normal/stat/normal-model-sd.csv \ No newline at end of file diff --git a/pom.xml b/pom.xml index b78df5b..5f5e247 100644 --- a/pom.xml +++ b/pom.xml @@ -64,6 +64,12 @@ commons-io 2.6 + + + org.mongodb + mongo-java-driver + 3.4.1 + diff --git a/src/main/java/nikit/test/TestConst.java b/src/main/java/nikit/test/TestConst.java new file mode 100644 index 0000000..b6984f6 --- /dev/null +++ b/src/main/java/nikit/test/TestConst.java @@ -0,0 +1,13 @@ +package nikit.test; + +import java.util.HashMap; +import java.util.Map; + +public interface TestConst { + + public static final float[] CAT = { .00405874f, .06719029f, -.09387352f, .0713696f, .03889965f, .0273262f, .01639572f, -.00313448f, .07265553f, -.04147152f, .02652248f, -.19289079f, -.00146677f, -.09773134f, -.00431995f, -.02748694f, .01663683f, .04983012f, -.14788294f, -.0044606f, .07072663f, -.04854418f, .07394147f, -.04115004f, -.03198772f, .08197859f, -.09515946f, .12023526f, .13566652f, -.02829065f, -.02587952f, -.0649399f, -.02989807f, -.04661528f, -.03375589f, .04307894f, -.00114027f, .02378986f, .01454718f, .11380557f, .02459358f, -.03697073f, .02218244f, .03697073f, .00651006f, -.04066781f, .0691192f, -.02378986f, -.00916231f, .01824425f, -.10994776f, .02812991f, .1131626f, .04597231f, .01623498f, -.04436488f, .05368794f, -.12280714f, .13052277f, .03520257f, .07297701f, .07008366f, -.02957659f, .06815474f, .02941585f, -.02716545f, .01961056f, .03359514f, -.06333248f, -.02989807f, .16202827f, .01302013f, -.02330764f, -.00065804f, -.07587038f, .08422898f, .02957659f, -.03504183f, .0003466f, -.01936945f, .01679757f, -.00960435f, .06365396f, -.04661528f, -.10994776f, .02684397f, -.09065867f, .03311292f, -.03439886f, -.04066781f, -.04629379f, .01149308f, -.0387389f, -.05915318f, -.07072663f, -.07844225f, .02780842f, -.02475432f, -.08551492f, -.07008366f, -.07361998f, -.03198772f, .03809593f, .05079458f, .02427209f, -.09065867f, .04693676f, -.00663062f, .03295218f, -.06268951f, -.03086253f, .04661528f, .05625981f, .10287509f, -.05497388f, -.02861213f, -.01647609f, -.08197859f, -.06847623f, -.03102327f, -.04034632f, -.03118401f, -.12859386f, -.02186096f, -.10416103f, .03616703f, -.00051488f, .01430607f, .05143755f, -.06236802f, -.01149308f, .01117159f, -.04757973f, .0053045f, -.04661528f, -.00787637f, .00494283f, .02491506f, .03536331f, .041793f, .03520257f, -.003898f, -.07908522f, .00960435f, .05336645f, .06558287f, -.09258758f, .05497388f, -.03825668f, -.08422898f, .12730792f, -.02186096f, -.15174076f, -.0203339f, -.04757973f, -.03825668f, .01872648f, .01181456f, -.03520257f, .06204654f, -.05497388f, -.00590728f, .03616703f, -.04066781f, -.05368794f, -.04757973f, .04243597f, .03793519f, .04500785f, .01872648f, -.02668323f, -.02025353f, -.0220217f, .09194461f, -.06461842f, .02378986f, .04050707f, -.0691192f, -.07265553f, .07040514f, -.04886567f, -.01953019f, .01719943f, .02121799f, -.00868009f, .04340043f, .06397545f, .08937273f, .06140357f, .04629379f, .02153947f, -.04757973f, .01655646f, .02909436f, .04034632f, .0880868f, .01760129f, -.10608993f, -.06815474f, .05979615f, .01454718f, -.07297701f, -.04532934f, -.01374347f, -.00094436f, .01342198f, .02395061f, -.0273262f, .02764768f, -.11059072f, -.13180871f, .00578672f, -.06140357f, -.01583312f, -.06301099f, .03327366f, .03054104f, -.10094618f, -.06558287f, -.00554561f, .04018558f, .0482227f, -.04243597f, -.07747781f, -.01768166f, -.02186096f, .02266467f, -.0119753f, .06944069f, -.01977131f, .06365396f, .01695831f, -.01752091f, .00952398f, -.09065867f, .02780842f, .10801885f, .00598765f, .00492273f, .01551163f, .12409308f, -.07201257f, -.01117159f, .00367698f, .1215212f, .00711285f, .01141271f, .02395061f, .05272348f, -.05433091f, -.09773134f, .04950864f, -.09580243f, .04597231f, .00147682f, .0565813f, -.07233405f, .03070178f, -.06268951f, .00526431f, -.0303803f, .05175903f, -.04661528f, -.01760129f, .01189493f, .07779929f, -.04983012f, -.00227049f, -.08937273f, -.02346838f, -.05433091f, .0607606f, -.02764768f, .06108208f, .03841742f, .00908194f, -.01567238f, .05883169f, .02153947f, -.01165382f, .07554889f, .00888101f, -.03214847f, .08872976f, .02748694f, -.04307894f, -.03327366f, -.00243123f, .02346838f, .04918715f, -.06783326f, .05947466f, -.05208052f, .01953019f, .09451649f, -.01535089f, .04983012f, .16202827f, -.09065867f, .01848537f }; + public static final float[] DOG = { .01719806f, -.00749344f, -.05798202f, .05405104f, -.02833585f, .01924545f, .01965492f, -.02768068f, -.00515942f, -.02129283f, .0602751f, -.14217061f, -.00757533f, -.05568895f, -.00843524f, .03603403f, -.06682674f, .05339587f, -.06289575f, -.04029259f, .05208554f, -.03324958f, .04782698f, -.05503378f, -.02997376f, .06715432f, -.05012005f, .10744691f, .11006757f, .00818955f, -.03259441f, -.02751689f, -.01220243f, -.02882722f, -.03308579f, .00261042f, -.00450425f, .01768943f, .04979247f, .11203305f, .00556889f, -.07141288f, -.00505705f, .01760753f, -.03603403f, -.02980996f, .08353342f, -.02358591f, -.00536416f, .02538761f, -.02358591f, .03996501f, .07698178f, .00474994f, .03865468f, .00251829f, .09237813f, -.08189551f, .09958494f, .01171106f, .08124035f, .0455339f, .04782698f, .00589648f, .00982746f, -.04078396f, .00765723f, -.01596962f, -.05208554f, .0010544f, .11596404f, .04111155f, -.0655164f, .02718931f, -.02293074f, -.0049342f, .07206805f, -.04062017f, .05274071f, -.06944739f, .04586149f, -.04356841f, .01777133f, -.09106781f, -.1002401f, -.06191301f, -.07698178f, .03996501f, .00513894f, -.08779199f, -.04127534f, -.00626501f, -.0217842f, -.0273531f, .02637035f, -.1185847f, .01760753f, -.0082305f, -.00182218f, -.07010256f, -.07075772f, -.0602751f, .08124035f, .08648166f, .04618907f, -.07075772f, -.0072887f, -.04651665f, .00618311f, -.0041562f, -.05339587f, .05405104f, .06977498f, .03455991f, .032922f, -.02293074f, -.00292776f, -.09696428f, -.07206805f, -.0383271f, -.07436112f, .01384034f, -.10482625f, -.01875407f, -.0327582f, .01949113f, -.01359465f, -.0583096f, .05503378f, -.08517133f, -.05175796f, -.00778007f, -.07993001f, .00696112f, -.09172297f, .00131033f, .03816331f, -.0583096f, .08648166f, .07894727f, .01752564f, .02293074f, -.05896477f, .05372345f, -.0002009f, .0200644f, -.0710853f, -.01859028f, -.02522382f, -.10286076f, .14348093f, .01785322f, -.07010256f, -.01916355f, -.07043014f, .01105589f, .03537886f, -.05044764f, -.03144788f, .03914605f, .02162042f, .00941798f, .08091276f, -.04225808f, -.03374095f, -.00411525f, -.00010941f, .00532321f, .04291325f, -.01113779f, .01367655f, -.04422358f, .032922f, .05863719f, -.07927486f, .01736185f, .06158542f, .0081486f, -.14479126f, .08255067f, -.01019599f, -.00831239f, -.03930984f, .05405104f, -.01916355f, .00391051f, .09434363f, .14348093f, .01531446f, .03390474f, -.01326707f, .00593742f, -.03013755f, .0455339f, .06977498f, .06322333f, -.0511028f, -.07960244f, -.0638785f, .02391349f, -.08255067f, -.08779199f, -.07861969f, -.04880973f, -.00393098f, -.05044764f, -.03799952f, .06125784f, .08844715f, -.04618907f, -.15396357f, -.01572394f, -.04258566f, -.01416792f, -.05601653f, .04258566f, .08713682f, -.08189551f, -.07370596f, -.0291548f, .05339587f, -.0126938f, .00300966f, -.0930333f, -.03521507f, -.05896477f, .0766542f, -.0090904f, .09565396f, -.09172297f, .0054051f, .01981871f, -.0802576f, .05961993f, -.04520632f, .04651665f, .1185847f, .04094775f, .04815456f, .0309565f, .07698178f, -.10089527f, -.0163791f, -.0602751f, .09958494f, .05896477f, .01613341f, -.0011363f, .02653415f, -.07993001f, -.07763694f, .05568895f, -.07174046f, -.02358591f, -.02538761f, .06584399f, -.04356841f, -.03554265f, -.1185847f, -.03914605f, -.01711616f, .01138348f, -.04815456f, -.00131033f, .0583096f, -.03341337f, -.05568895f, -.02866343f, -.12841216f, .01981871f, -.02088336f, .02964617f, -.02980996f, .11006757f, .02293074f, -.06420608f, -.00028024f, .03488749f, .0511028f, -.00051441f, .13954996f, -.01113779f, .05012005f, .08124035f, -.05929235f, -.01654289f, -.04176671f, .04225808f, .05863719f, .09434363f, -.06060268f, .03455991f, -.09237813f, .08779199f, .08255067f, -.01580583f, .02096525f, .13954996f, -.11923986f, .0746887f }; + public static final float[] AIRPLANE = { .05517051f, -.06537547f, -.05261928f, .03810911f, -.05230037f, -.11671913f, -.05708394f, -.05804066f, .14797178f, .00605919f, .08227742f, -.10332513f, .07876946f, -.09439579f, -.03763076f, .12118379f, .00422549f, .05485161f, .0204099f, .02519347f, -.01466962f, -.00178387f, -.01171975f, -.0749426f, .00181377f, .0085307f, -.04241433f, .02471511f, .04496557f, .01243728f, -.09120674f, -.09375799f, -.08482865f, .01570606f, .00721522f, .02806361f, -.00300966f, .04113871f, .01945319f, .09439579f, .02423676f, .02033018f, .09312017f, .0237584f, -.00693618f, -.09120674f, .03364445f, .01227783f, .1084276f, -.0408198f, -.08674208f, .04847351f, .03906583f, -.01427099f, -.0408198f, -.06633218f, .0605919f, -.02439621f, .0816396f, .02742581f, .0880177f, -.10332513f, -.08546646f, .01227783f, .03492007f, .00849084f, -.02184497f, .02168552f, -.01235756f, .01738031f, -.05453271f, -.07940727f, -.05899737f, .009607f, -.12564845f, .00446467f, .03189047f, -.10077389f, .03492007f, -.02822307f, -.0204099f, .02168552f, -.00785303f, -.04113871f, -.02017072f, -.09758484f, -.00332857f, .06346204f, .06250533f, .09056894f, -.02694745f, .00920837f, -.04943023f, -.00685645f, .0374713f, -.07845056f, .04273323f, -.05070585f, .04624118f, -.11161665f, .05102475f, -.11097884f, .02033018f, .02025045f, .10778979f, .05867847f, .0813207f, .01355345f, .06665108f, .00198319f, -.15116084f, -.03109321f, -.07653713f, -.0408198f, -.08737989f, -.01793839f, .00095671f, .09886046f, -.03794966f, .00574028f, -.03173102f, -.06888342f, .04815461f, -.12437284f, -.00360761f, .02583128f, -.04305214f, -.06920232f, .04113871f, .11225446f, .04464666f, -.01945319f, -.06856451f, .03220938f, .06633218f, -.05485161f, -.03555787f, -.08291522f, .03794966f, .00725508f, -.01738031f, -.03970364f, -.06696999f, .01044413f, .06473766f, -.00621864f, -.06314313f, -.0676078f, -.02232333f, -.05835956f, .05357599f, -.02997704f, .01514797f, -.00968673f, -.05134366f, -.06346204f, .01650332f, -.01658305f, -.02519347f, .04656009f, -.01132112f, .02822307f, -.0044846f, -.00605919f, .013394f, .02933923f, .05102475f, -.05804066f, .01323455f, -.04273323f, -.04592228f, -.04847351f, -.05006804f, -.05995408f, -.0152277f, -.05230037f, -.01044413f, -.05453271f, -.05644614f, -.0041059f, -.0542138f, -.09949827f, -.04496557f, -.08227742f, .09056894f, -.02567183f, -.01259674f, .03683349f, -.07972617f, .02886088f, .0542138f, .09120674f, -.01482907f, .11671913f, -.05325709f, .05198147f, -.01331427f, -.08355303f, -.05261928f, -.05676504f, -.03523897f, -.00920837f, -.13585341f, -.10778979f, -.00193336f, .02758526f, .03603623f, .01817757f, -.07589932f, .00534165f, -.03922528f, -.0026509f, -.00980632f, .013394f, -.04496557f, .04241433f, .01482907f, .1900672f, -.000583f, -.01419126f, -.03332554f, -.04337104f, .0538949f, .0237584f, .14860959f, -.00542138f, -.00510248f, .07717494f, -.10077389f, -.02949869f, .05580832f, -.02025045f, -.08482865f, -.13776684f, -.06569437f, .01108194f, -.05293818f, .01921401f, .04400885f, -.01299537f, .04400885f, -.01730058f, -.07876946f, -.08036399f, .00865029f, -.04496557f, -.0810018f, -.11799474f, .02646909f, -.06154861f, -.04592228f, -.01203865f, .01164002f, -.03029595f, -.01219811f, -.03587678f, .03460116f, .04241433f, .01730058f, -.08993113f, .01722085f, .0676078f, -.00508254f, .03157157f, -.04177652f, .04815461f, -.00713549f, -.08227742f, -.13266437f, .05740285f, -.01347372f, .01761949f, .08674208f, -.00617878f, -.07717494f, -.08674208f, .00158456f, -.02232333f, -.06537547f, -.07845056f, -.01435071f, -.04687899f, -.03252828f, -.03460116f, -.00035129f, -.0271069f, -.05357599f, .0234395f, -.04528447f, -.01825729f, -.10715199f, -.04560337f, .04177652f, .00143507f, .05644614f, -.00781317f, -.08355303f, .05453271f, -.00402617f, -.06665108f }; + public static final float[] ROAD = { .04295584f, .10129324f, .03301409f, -.03226377f, .05627402f, -.0121927f, -.07465687f, -.10054292f, -.00001145f, .06827915f, .07465687f, .03751602f, -.09341488f, -.03676569f, -.00135409f, -.10429452f, .00717494f, -.06265175f, .04483164f, .02869975f, .0667785f, .06715367f, -.0160381f, .04070488f, .10129324f, -.0922894f, -.06640334f, -.05027146f, .0121927f, -.01378714f, -.01275545f, .02832459f, .01941454f, .07803331f, -.01735116f, -.03657812f, .04764534f, -.02663637f, .03395199f, .02532331f, -.0405173f, -.00623704f, .05627402f, .06865431f, -.0997926f, -.01660084f, .04295584f, .01575673f, .10579516f, .06752883f, .00006485f, .10054292f, -.00872247f, .02494815f, -.01111412f, -.0160381f, -.06265175f, .05439822f, .09679132f, .02926249f, -.10804612f, -.04258068f, .08703715f, -.00975416f, .06340206f, .0941652f, .00764389f, .05139694f, -.0461447f, -.0001634f, -.02757427f, -.04164278f, -.03545263f, -.07503203f, .05027146f, -.09566583f, .05627402f, .07090527f, -.00515845f, .03657812f, -.00235647f, -.00294266f, -.07203075f, -.05139694f, -.00637772f, .04576954f, -.01950833f, .01027001f, .01041069f, .05702434f, -.06640334f, -.03188861f, -.00438468f, -.06865431f, -.01613189f, -.03395199f, .02401025f, .00565085f, -.0207276f, .02419783f, .03695327f, .02119655f, .05102178f, .09829196f, .08328556f, .00016047f, .04576954f, -.00147719f, .00614325f, -.00307162f, .05289758f, -.12530349f, -.01941454f, -.09566583f, .07765815f, -.00368126f, .03020039f, -.01950833f, -.12680413f, .02644879f, .00361092f, .03095071f, -.04501922f, .00947279f, -.07015495f, .03301409f, -.11404868f, -.06640334f, .05214726f, -.08103459f, -.03545263f, -.06152626f, -.02869975f, .01041069f, -.04839566f, -.03057555f, -.06227658f, -.04183036f, -.0103169f, .15081438f, -.04483164f, .01997728f, .05965046f, -.0461447f, -.02907491f, -.06340206f, -.09904228f, -.08328556f, -.10804612f, -.03545263f, -.00459571f, -.08366071f, -.01331819f, -.02213445f, -.08328556f, -.00361092f, -.01594431f, -.05064662f, -.04483164f, .0423931f, -.0922894f, .11404868f, .08966327f, -.01519399f, -.01566294f, -.00279025f, -.01073896f, -.096041f, -.06527787f, -.12380285f, .01631947f, .00247371f, .01378714f, .00886316f, .03207619f, -.10129324f, -.01556915f, -.03001281f, .00726873f, .0405173f, -.0997926f, .01406851f, .04501922f, .03113829f, .0023213f, .01027001f, -.0498963f, -.03582779f, .07503203f, .02982523f, .14706278f, -.00858179f, -.08966327f, .13580798f, -.08628684f, -.03076313f, -.01669463f, .11029708f, -.06977979f, -.02907491f, .0094259f, .02016486f, -.03151345f, .02185308f, .04745776f, .01875801f, -.0395794f, -.01195823f, .05777466f, -.01369335f, .05477338f, .04108004f, .00806594f, -.05439822f, -.07203075f, .08065943f, .02513573f, .04464406f, -.03582779f, -.02457299f, -.01012932f, -.07053011f, .05289758f, -.01650705f, -.03338925f, -.04952114f, -.04914598f, .03732843f, .03657812f, .07653267f, .04745776f, .00492398f, .01139549f, .02757427f, -.06977979f, -.0480205f, -.03582779f, -.03789118f, .08178491f, -.00349368f, .01050448f, -.00323576f, -.03601537f, -.08966327f, .01083275f, -.06115111f, .03095071f, .06415238f, -.03751602f, -.12605381f, .00998864f, .06640334f, .00703425f, .05139694f, .03826633f, -.0941652f, .00703425f, .05139694f, -.04089246f, -.05589886f, -.07165559f, -.06490271f, .04314342f, .10204356f, .09153908f, .09078876f, -.17407431f, -.03676569f, -.10504484f, .00361092f, .07653267f, .00947279f, -.02063381f, .03245135f, .00290749f, .02738669f, .03188861f, -.05402306f, .00185235f, -.01660084f, -.03470231f, -.14106022f, .0517721f, -.05214726f, .00454882f, -.08966327f, -.15831758f, .07240591f, .00214545f, .0498963f, -.01950833f, .01350577f, -.11254805f, .00035611f, .01238029f, -.01491262f, -.04258068f, -.02588605f, .0885378f, -.08891296f }; + public static final Map VEC_MAP = new HashMap<>(); +} diff --git a/src/main/java/nikit/test/Word2VecTester.java b/src/main/java/nikit/test/Word2VecTester.java index e6fe8c3..6a496f2 100644 --- a/src/main/java/nikit/test/Word2VecTester.java +++ b/src/main/java/nikit/test/Word2VecTester.java @@ -4,7 +4,6 @@ import java.io.IOException; import java.util.Map; -import org.aksw.word2vecrestful.utils.Word2VecMath; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; @@ -15,47 +14,6 @@ public static void main(final String[] a) throws JsonProcessingException, FileNo // loads model in memory final Word2VecModel model = Word2VecFactory.get(); - /* final float[] vecCat = model.word2vec.get("cat"); - final float[] vecDog = model.word2vec.get("dog"); - final float[] vecItaly = model.word2vec.get("Italy"); - final float[] vecFrance = model.word2vec.get("France"); - final float[] vecInstead = model.word2vec.get("instead"); - final float[] vecWhether = model.word2vec.get("whether"); - final float[] vecHello = model.word2vec.get("hello"); - final float[] vecBand = model.word2vec.get("band"); - final float[] vecBeer = model.word2vec.get("beer"); - final float[] vecBear = model.word2vec.get("bear"); - final float[] vecBang = model.word2vec.get("bang"); - final float[] vecBang = model.word2vec.get("bang"); - vecCat[34] = 234; - vecDog[12] = 342; - vecItaly[80] = 12; - vecItaly[23] = 212; - vecFrance[76] = 12; - vecFrance[26] = 34; - vecFrance[35] = 1231; - vecInstead[37] = 35; - vecWhether[29] = 67; - vecHello[78] = 34; - vecBand[76] = -12; - vecBeer[23] = 32; - vecBear[56] = -8; - vecBang[13] = 23; - - System.out.println(model.getClosestEntry(vecCat)); - System.out.println(model.getClosestEntry(vecDog)); - System.out.println(model.getClosestEntry(vecItaly)); - System.out.println(model.getClosestEntry(vecFrance)); - System.out.println(model.getClosestEntry(vecInstead)); - System.out.println(model.getClosestEntry(vecWhether)); - System.out.println(model.getClosestEntry(vecHello)); - System.out.println(model.getClosestEntry(vecBand)); - System.out.println(model.getClosestEntry(vecBeer)); - System.out.println(model.getClosestEntry(vecBear)); - System.out.println(model.getClosestEntry(vecBang)); - - final double sim = Word2VecMath.cosineSimilarity(vecCat, vecDog); - System.out.println(sim);*/ // test 1 float[] vec1 = model.word2vec.get("WesternOne"); Map closestWord = model.getClosestEntryInSub(vec1, "ns#country-name"); diff --git a/src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java b/src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java new file mode 100644 index 0000000..487448d --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java @@ -0,0 +1,82 @@ +package org.aksw.word2vecrestful.db.mongo; + +import org.bson.Document; + +import com.mongodb.MongoClient; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +public class MongoDbHandler { + private String dbName; + private String host = "localhost"; + private Integer port = 27017; + private MongoDatabase database; + private MongoClient mongoClient; + + public MongoDbHandler(String dbName, String host, Integer port) { + if (host != null) { + this.host = host; + } + if (port != null) { + this.port = port; + } + this.dbName = dbName; + } + + public void connect() { + this.mongoClient = new MongoClient(this.host, this.port); + this.database = mongoClient.getDatabase(this.dbName); + } + + public void close() { + this.mongoClient.close(); + } + + public MongoCollection createCollection(String collctnName) { + this.database.createCollection(collctnName); + return this.database.getCollection(collctnName); + } + + // Setter and Getters + + public String getDbName() { + return dbName; + } + + public void setDbName(String dbName) { + this.dbName = dbName; + } + + public String getHost() { + return host; + } + + public void setHost(String host) { + this.host = host; + } + + public Integer getPort() { + return port; + } + + public void setPort(Integer port) { + this.port = port; + } + + public MongoDatabase getDatabase() { + return database; + } + + public void setDatabase(MongoDatabase database) { + this.database = database; + } + + public MongoClient getMongoClient() { + return mongoClient; + } + + public void setMongoClient(MongoClient mongoClient) { + this.mongoClient = mongoClient; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java index 793e874..46ccf7e 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -11,8 +11,8 @@ import java.util.Map.Entry; import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModel; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; -import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.commons.io.output.FileWriterWithEncoding; import com.fasterxml.jackson.core.JsonProcessingException; @@ -138,9 +138,9 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, * @throws IOException */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { - File subsetConfig = new File(".\\word2vec-dump\\subsetconfig2.json"); - Word2VecModel model = Word2VecFactory.get(); - generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), model.word2vec, - model.vectorSize); + File subsetConfig = new File("D:\\Nikit\\DICE-Group\\word2vec-dump\\subsetconfig2.json"); + W2VNrmlMemModel model = Word2VecFactory.getNormalizedBinModel(); + generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), + model.getWord2VecMap(), model.getVectorSize()); } } diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java index def3e4b..92ab07a 100644 --- a/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java @@ -1,14 +1,17 @@ package org.aksw.word2vecrestful.tool; +import java.io.BufferedOutputStream; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; import java.sql.PreparedStatement; import java.sql.SQLException; -import org.aksw.word2vecrestful.db.NormalizedDBModelGenerator; import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.utils.Word2VecMath; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; @@ -19,6 +22,8 @@ public class ModelNormalizer { public static Logger LOG = LogManager.getLogger(ModelNormalizer.class); + public static final byte[] END_LINE_BA = "\n".getBytes(StandardCharsets.UTF_8); + public static final byte[] WHITESPACE_BA = " ".getBytes(StandardCharsets.UTF_8); /** * Method to normalize a bin word2vec model line @@ -39,6 +44,16 @@ public String getNormalizedVecLine(String word, float[] vector) { return resStr.toString(); } + public byte[] getNormalizedVecBA(float[] vector) { + vector = Word2VecMath.normalize(vector); + ByteBuffer buffer = ByteBuffer.allocate(vector.length * 4); + buffer.order(ByteOrder.LITTLE_ENDIAN); + for (int i = 0; i < vector.length; i++) { + buffer.putFloat(vector[i]); + } + return buffer.array(); + } + /** * Method to generate a normalized model for a word2vec bin model * @@ -81,7 +96,7 @@ public void generateNormalizedModel(File inputFile, File outputFile) throws IOEx bWriter.close(); } } - + /** * Method to persist a normalized model for a word2vec bin model * @@ -89,15 +104,15 @@ public void generateNormalizedModel(File inputFile, File outputFile) throws IOEx * - word2vec file of the model to be normalized * @param dbName * - name of the database - * @param tablName - * - name of the table to store the data in + * @param tablName + * - name of the table to store the data in * @throws IOException - * @throws SQLException + * @throws SQLException */ public void persistNormalizedModel(File inputFile, String dbName, String tblName) throws IOException, SQLException { // intialize handler instance NormalizedDBModelGenerator dbHandler = null; - + FileInputStream fin = null; try { // reads file header @@ -106,7 +121,7 @@ public void persistNormalizedModel(File inputFile, String dbName, String tblName int words = Integer.parseInt(word); word = Word2VecModelLoader.readWord(fin); int vectorSize = Integer.parseInt(word); - dbHandler = new NormalizedDBModelGenerator(dbName, tblName, vectorSize); + dbHandler = new NormalizedDBModelGenerator(dbName, tblName, vectorSize); // open connection dbHandler.connect(); LOG.info("Expecting " + words + " words with " + vectorSize + " values per vector."); @@ -118,9 +133,9 @@ public void persistNormalizedModel(File inputFile, String dbName, String tblName float[] vector = Word2VecModelLoader.readVector(fin, vectorSize); // dbHandler.insertMainTblRecord(word, vector); dbHandler.addMainTblInsrtBatch(word, Word2VecMath.normalize(vector), ps); - if((w+1)%50000 == 0) { + if ((w + 1) % 50000 == 0) { dbHandler.executeBatchCommit(ps); - LOG.info((w+1)+" Records inserted."); + LOG.info((w + 1) + " Records inserted."); } } dbHandler.executeBatchCommit(ps); @@ -134,22 +149,86 @@ public void persistNormalizedModel(File inputFile, String dbName, String tblName } } - /*public static void main(String[] args) throws IOException { - String cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); - String model = (Cfg.get(cfgKeyModel)); - ModelNormalizer modelNormalizer = new ModelNormalizer(); - File inputFile = new File(model); - File outputFile = new File("D:\\Nikit\\DICE-Group\\Jword2vec\\data\\normal\\GoogleNews-vectors-negative300-normalized.txt"); - modelNormalizer.generateNormalizedModel(inputFile, outputFile); - }*/ - + /** + * Method to generate a normalized model for a word2vec bin model + * + * @param inputFile + * - word2vec file of the model to be normalized + * @param outputFile + * - output file for normalized model + * @throws IOException + */ + public void generateNormalizedBinModel(File inputFile, File outputFile) throws IOException { + // ensure directory creation + outputFile.getParentFile().mkdirs(); + // open an output stream + BufferedOutputStream bOutStrm = null; + FileInputStream fin = null; + try { + bOutStrm = new BufferedOutputStream(new FileOutputStream(outputFile)); + // reads file header + fin = new FileInputStream(inputFile); + String word = Word2VecModelLoader.readWord(fin); + bOutStrm.write(word.getBytes(StandardCharsets.UTF_8)); + bOutStrm.write(WHITESPACE_BA); + Integer words = Integer.parseInt(word); + word = Word2VecModelLoader.readWord(fin); + bOutStrm.write(word.getBytes(StandardCharsets.UTF_8)); + Integer vectorSize = Integer.parseInt(word); + bOutStrm.write(END_LINE_BA); + LOG.info("Expecting " + words + " words with " + vectorSize + " values per vector."); + for (int w = 0; w < words; ++w) { + word = Word2VecModelLoader.readWord(fin); + // LOG.info(word); + float[] vector = Word2VecModelLoader.readVector(fin, vectorSize); + + bOutStrm.write(word.getBytes(StandardCharsets.UTF_8)); + bOutStrm.write(WHITESPACE_BA); + bOutStrm.write(getNormalizedVecBA(vector)); + + if ((w + 1) % 10000 == 0) { + bOutStrm.flush(); + LOG.info((w + 1) + " Records inserted."); + } + } + } catch (final IOException e) { + LOG.error(e.getLocalizedMessage(), e); + } finally { + fin.close(); + bOutStrm.close(); + } + } + + /* + * public static void main(String[] args) throws IOException { String + * cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); String model + * = (Cfg.get(cfgKeyModel)); ModelNormalizer modelNormalizer = new + * ModelNormalizer(); File inputFile = new File(model); File outputFile = new + * File( + * "D:\\Nikit\\DICE-Group\\Jword2vec\\data\\normal\\GoogleNews-vectors-negative300-normalized.txt" + * ); modelNormalizer.generateNormalizedModel(inputFile, outputFile); } + */ + + /* + * public static void main(String[] args) throws IOException, SQLException { + * String cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); String + * model = (Cfg.get(cfgKeyModel)); ModelNormalizer modelNormalizer = new + * ModelNormalizer(); File inputFile = new File(model); // + * modelNormalizer.generateNormalizedModel(inputFile, outputFile); + * modelNormalizer.persistNormalizedModel(inputFile, + * "data/nrmldb/word2vecmodel", "wordtovec"); } + */ + public static void main(String[] args) throws IOException, SQLException { String cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); String model = (Cfg.get(cfgKeyModel)); ModelNormalizer modelNormalizer = new ModelNormalizer(); File inputFile = new File(model); + // "org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model" + String outputModel = (Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model")); + File outputFile = new File(outputModel); // modelNormalizer.generateNormalizedModel(inputFile, outputFile); - modelNormalizer.persistNormalizedModel(inputFile, "data/nrmldb/word2vecmodel", "wordtovec"); + modelNormalizer.generateNormalizedBinModel(inputFile, outputFile); } } diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java index 16b6bc7..7d51db9 100644 --- a/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java @@ -9,8 +9,8 @@ import java.util.Map.Entry; import java.util.Set; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModel; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; -import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -87,7 +87,7 @@ private static void writeValues(int index, float min, float max, float mean, flo */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { // Get the normalized model - Word2VecModel model = Word2VecFactory.get(); - writeModelStats(model.word2vec, model.vectorSize, new File("data/normal/stat/modelstats.csv")); + W2VNrmlMemModel model = Word2VecFactory.getNormalizedBinModel(); + writeModelStats(model.getWord2VecMap(), model.getVectorSize(), new File("data/normal/stat/normal-model-stats.csv")); } } diff --git a/src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java b/src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java new file mode 100644 index 0000000..207e0b6 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java @@ -0,0 +1,200 @@ +package org.aksw.word2vecrestful.tool; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map.Entry; +import java.util.TreeMap; + +import org.aksw.word2vecrestful.db.mongo.MongoDbHandler; +import org.aksw.word2vecrestful.word2vec.Word2VecModelLoader; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.bson.BsonDocument; +import org.bson.BsonString; +import org.bson.Document; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.mongodb.client.AggregateIterable; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.model.Aggregates; +import com.mongodb.client.model.BsonField; +import com.mongodb.client.model.IndexModel; +import com.mongodb.client.model.Indexes; +import com.opencsv.CSVReader; + +public class MongoDBModelGenerator { + + public static Logger LOG = LogManager.getLogger(MongoDBModelGenerator.class); + + private String wordColName = "word"; + private String vecColPrefix = "val"; + private String[] vecColNames; + private MongoDbHandler mongoDbHandler; + private MongoCollection mongoCollection; + private String collctnName; + + public MongoDBModelGenerator(String collctnName, String dbName, String host, Integer port) { + this.mongoDbHandler = new MongoDbHandler(dbName, host, port); + this.collctnName = collctnName; + } + + private Document createWord2VecDoc(String word, float[] vector) { + Document doc = new Document(); + doc.put(wordColName, word); + for (int i = 0; i < vector.length; i++) { + doc.put(vecColNames[i], vector[i]); + } + return doc; + } + + private String[] createVecColNames(int vectorSize) { + String[] colnames = new String[vectorSize]; + for (int i = 0; i < vectorSize; i++) { + colnames[i] = vecColPrefix + (i + 1); + } + return colnames; + } + + private void createIndexes() { + List indexList = new ArrayList<>(); + IndexModel indexModel = new IndexModel(Indexes.text(wordColName)); + indexList.add(indexModel); + for (int i = 0; i < vecColNames.length; i++) { + indexModel = new IndexModel(Indexes.ascending(vecColNames[i])); + indexList.add(indexModel); + } + this.mongoCollection.createIndexes(indexList); + } + + private void createIndexes(List fieldNames) { + List indexList = new ArrayList<>(); + IndexModel indexModel; + for (String field : fieldNames) { + indexModel = new IndexModel(Indexes.ascending(field)); + indexList.add(indexModel); + } + this.mongoCollection.createIndexes(indexList); + } + + /** + * Use ModelStatsWriter Instead + * + * @param n + * @param vectorSize + * @return + */ + @Deprecated + private List getNImportantFieldNames(int n, int vectorSize) { + List fieldNames = new ArrayList<>(); + TreeMap stdDevMap = new TreeMap<>(Collections.reverseOrder()); + for (int i = 0; i < vectorSize; i++) { + String colName = vecColPrefix + (i + 1); + AggregateIterable aggregate = mongoCollection.aggregate(Arrays.asList(Aggregates + .group("_id", new BsonField("stdDev", new BsonDocument("$stdDevPop", new BsonString(colName)))))); + Document result = aggregate.first(); + Double stdDevVal = result.getDouble("stdDev"); + stdDevMap.put(stdDevVal, colName); + } + int count = 0; + for (Entry entry : stdDevMap.entrySet()) { + fieldNames.add(entry.getValue()); + if (++count == n) { + break; + } + } + return fieldNames; + } + + private List getNImportantFieldNames(int n) throws IOException { + List fieldNames = new ArrayList<>(); + File file = new File("D:\\Nikit\\DICE-Group\\Jword2vec\\data\\normal\\stat\\normal-model-sd.csv"); + CSVReader csvReader = new CSVReader(new FileReader(file)); + // Reading header + csvReader.readNext(); + for (int i = 0; i < n; i++) { + String fieldName = vecColPrefix + csvReader.readNext()[0]; + fieldNames.add(fieldName); + } + csvReader.close(); + return fieldNames; + } + + public void generateTopIndexes(int vectorSize) { + this.mongoDbHandler.connect(); + this.mongoCollection = this.mongoDbHandler.getDatabase().getCollection(this.collctnName); + try { + // Find top vectors + List fieldNames = getNImportantFieldNames(63); + // Generate indexes + createIndexes(fieldNames); + } catch (Exception e) { + e.printStackTrace(); + } finally { + this.mongoDbHandler.close(); + } + + } + + public void persistWord2VecModel(File inputFile) { + LOG.info("Starting insertion of records to MongoDB.."); + FileInputStream fin = null; + this.mongoDbHandler.connect(); + this.mongoCollection = mongoDbHandler.createCollection(collctnName); + // this.mongoCollection = + // this.mongoDbHandler.getDatabase().getCollection(this.collctnName); + try { + fin = new FileInputStream(inputFile); + String word = Word2VecModelLoader.readWord(fin); + int words = Integer.parseInt(word); + word = Word2VecModelLoader.readWord(fin); + int vectorSize = Integer.parseInt(word); + List docList = new ArrayList<>(); + int recCount = 0; + this.vecColNames = createVecColNames(vectorSize); + // Insert records + for (int w = 0; w < words; ++w) { + word = Word2VecModelLoader.readWord(fin); + // LOG.info(word); + float[] vector = Word2VecModelLoader.readVector(fin, vectorSize); + docList.add(createWord2VecDoc(word, vector)); + recCount++; + if (recCount % 10000 == 0) { + this.mongoCollection.insertMany(docList); + docList.clear(); + LOG.info((recCount) + " Records inserted."); + } + } + // Insert leftover records + if (docList.size() > 0) { + this.mongoCollection.insertMany(docList); + } + LOG.info("Records insertion successfully completed."); + LOG.info("Starting creation of indexes"); + // create indexes + createIndexes(); + LOG.info("Indexes creation successfully completed."); + } catch (Exception e) { + e.printStackTrace(); + } finally { + this.mongoDbHandler.close(); + } + } + + public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { + /* + * String inputModel = + * (Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model")); File + * inputFile = new File(inputModel); + */ + MongoDBModelGenerator modelGenerator = new MongoDBModelGenerator("mainCollection", "word2vec", null, null); + /* modelGenerator.persistWord2VecModel(inputFile); */ + modelGenerator.generateTopIndexes(300); + } +} diff --git a/src/main/java/org/aksw/word2vecrestful/db/NormalizedDBModelGenerator.java b/src/main/java/org/aksw/word2vecrestful/tool/NormalizedDBModelGenerator.java similarity index 93% rename from src/main/java/org/aksw/word2vecrestful/db/NormalizedDBModelGenerator.java rename to src/main/java/org/aksw/word2vecrestful/tool/NormalizedDBModelGenerator.java index 9e62852..662324b 100644 --- a/src/main/java/org/aksw/word2vecrestful/db/NormalizedDBModelGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/tool/NormalizedDBModelGenerator.java @@ -1,8 +1,10 @@ -package org.aksw.word2vecrestful.db; +package org.aksw.word2vecrestful.tool; import java.sql.PreparedStatement; import java.sql.SQLException; +import org.aksw.word2vecrestful.db.SQLiteDBHandler; + public class NormalizedDBModelGenerator extends SQLiteDBHandler { private String mainTblName; diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index 5fd9d1d..8d0e4a6 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -1,5 +1,9 @@ package org.aksw.word2vecrestful.utils; +import java.util.HashMap; +import java.util.Map; +import java.util.TreeMap; + public class Word2VecMath { public static void main(final String[] args) { @@ -112,5 +116,19 @@ public static float calcVariance(float[] inpArr) { res += Math.pow(inpArr[i]- meanVal, 2); } return res; - } + } + + public static Map findClosestVecInNearbyVecs(Map nearbyVecs, float[] vector) { + Map closestVec = null; + if(nearbyVecs !=null && vector != null) { + closestVec = new HashMap<>(); + TreeMap cosineSimMap = new TreeMap<>(); + for (String word : nearbyVecs.keySet()) { + cosineSimMap.put(Word2VecMath.cosineSimilarity(vector, nearbyVecs.get(word)), word); + } + String closestWord = cosineSimMap.lastEntry().getValue(); + closestVec.put(closestWord, nearbyVecs.get(closestWord)); + } + return closestVec; + } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index deada6a..e301946 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -5,7 +5,6 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; -import java.util.TreeMap; import org.aksw.word2vecrestful.subset.DataSubsetProvider; import org.aksw.word2vecrestful.utils.Word2VecMath; @@ -97,25 +96,16 @@ private Map getClosestEntry(float[] vector, String subKey) { } // Normalize incoming vector vector = Word2VecMath.normalize(vector); + //Find nearby vectors Map nearbyVecs = fetchNearbyVectors(vector, wordSet); - closestVec = findClosestVecInNearbyVecs(nearbyVecs, vector); + // Select the closest vector + closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); } catch (IOException e) { LOG.error(e.getStackTrace()); } return closestVec; } - private Map findClosestVecInNearbyVecs(Map nearbyVecs, float[] vector) { - Map closestVec = new HashMap<>(); - TreeMap cosineSimMap = new TreeMap<>(); - for (String word : nearbyVecs.keySet()) { - cosineSimMap.put(Word2VecMath.cosineSimilarity(vector, nearbyVecs.get(word)), word); - } - String closestWord = cosineSimMap.lastEntry().getValue(); - closestVec.put(closestWord, nearbyVecs.get(closestWord)); - return closestVec; - } - /** * Method to find standard deviation for each dimension of word vector and store * the operated value next to the dimension's index in sdMap diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java new file mode 100644 index 0000000..cc24fcf --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java @@ -0,0 +1,238 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.aksw.word2vecrestful.db.mongo.MongoDbHandler; +import org.aksw.word2vecrestful.subset.DataSubsetProvider; +import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.bson.Document; + +import com.mongodb.BasicDBObject; +import com.mongodb.client.FindIterable; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoCursor; +import com.opencsv.CSVReader; + +public class W2VNrmlMongoDbModel implements GenWord2VecModel { + + public static Logger LOG = LogManager.getLogger(W2VNrmlMongoDbModel.class); + + private String wordColName = "word"; + private String vecColPrefix = "val"; + + private int indexSize = 63; + + private int vectorSize; + private MongoDbHandler dbHandler; + private String dbName = "word2vec"; + private String collctnName = "mainCollection"; + private Map sdMap; + private static String STAT_FILE_PATH = Cfg.get("org.aksw.word2vecrestful.word2vec.stats.sdfile"); + /** + * Limit to the multiplier of area in which nearby vectors are to be looked + */ + private static final int EXHAUSTION_MULT = 10; + /** + * Multiplier for the standard deviation + */ + private static final int SIGMA_MULT = 3; + /** + * Divisor for the standard deviation's value + */ + private static final int AREA_DIVISOR = 10; + private DataSubsetProvider dataSubsetProvider; + + public W2VNrmlMongoDbModel(int vectorSize) throws IOException { + this.vectorSize = vectorSize; + this.dbHandler = new MongoDbHandler(this.dbName, null, null); + this.dataSubsetProvider = new DataSubsetProvider(); + // Calculate sd*3/10 and save in map + setModelSd(new File(STAT_FILE_PATH)); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private Map getClosestEntry(float[] vector, String subKey) { + Set wordSet = null; + Map closestVec = null; + try { + dbHandler.connect(); + if (subKey == null) { + wordSet = null; + } else { + wordSet = dataSubsetProvider.fetchSubsetWords(subKey); + } + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + // Find nearby vectors + Map nearbyVecs = fetchNearbyVectors(vector, wordSet); + // Select the closest vector + closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); + } catch (Exception e) { + LOG.error(e.getStackTrace()); + } finally { + dbHandler.close(); + } + return closestVec; + } + + /** + * Method to fetch nearby vectors for a given vector in a particular word set + * + * @param vector + * - vector to look nearby vectors for + * @param wordSet + * - word set to look into for nearby vectors + * @return - mapping of nearby words alongwith with their vector values + */ + private Map fetchNearbyVectors(float[] vector, Set wordSet) { + Map nearbyVecMap = null; + boolean mapEmpty = true; + boolean notExhausted = true; + Map minMaxVecMap = getMinMaxVecMap(vector); + int mult = 1; + while (mapEmpty && notExhausted) { + if (mult > 1) { + multMinMaxVecMap(minMaxVecMap, mult); + } + // Query Nearby + nearbyVecMap = queryNearbyVecs(vector, minMaxVecMap, wordSet); + if (nearbyVecMap.size() > 0) { + mapEmpty = false; + } else if (++mult > EXHAUSTION_MULT) { + notExhausted = false; + } + + } + return nearbyVecMap; + } + + private Map queryNearbyVecs(float[] vector, Map minMaxVecMap, + Set wordSet) { + Map resMap = new HashMap<>(); + + BasicDBObject queryObj = new BasicDBObject(); + MongoCollection mongoCollection = dbHandler.getDatabase().getCollection(this.collctnName); + if (wordSet != null && wordSet.size() > 0) { + queryObj.put(wordColName, wordSet); + } + for (Integer vecDim : minMaxVecMap.keySet()) { + float[] minMaxVal = minMaxVecMap.get(vecDim); + BasicDBObject filterObj = new BasicDBObject("$gt", minMaxVal[0]).append("$lt", minMaxVal[1]); + queryObj.put(vecColPrefix + vecDim, filterObj); + } + FindIterable findIt = mongoCollection.find(queryObj); + MongoCursor it = findIt.iterator(); + while (it.hasNext()) { + Document doc = it.next(); + String word = doc.getString(wordColName); + float[] vec = new float[vectorSize]; + for (int i = 0; i < vectorSize; i++) { + Double dVal = doc.getDouble(vecColPrefix + (i + 1)); + vec[i] = dVal.floatValue(); + } + resMap.put(word, vec); + } + + return resMap; + + } + + /** + * Multiply each element of the given multi dimensional vector with a given + * multiplier + * + * @param minMaxVecMap + * - vector at which operation is to be performed + * @param mult + * - multiplier + * @return - Vector after multiplication with the multiplier + */ + private void multMinMaxVecMap(Map minMaxVecMap, int mult) { + for (Integer vecDim : minMaxVecMap.keySet()) { + float[] minMaxVal = minMaxVecMap.get(vecDim); + minMaxVal[0] *= mult; + minMaxVal[1] *= mult; + minMaxVecMap.put(vecDim, minMaxVal); + } + } + + /** + * Method to generate two vectors from a given vector by adding and subtracting + * value in sdMap from the given vector + * + * @param vector + * - input vector to perform operation on + * @return - min vector at index 0 and max vector at index 1 + */ + private Map getMinMaxVecMap(float[] vector) { + Map resMap = new HashMap<>(); + float[] minMaxVal; + for (Integer vecDim : sdMap.keySet()) { + minMaxVal = new float[2]; + float diff = sdMap.get(vecDim); + minMaxVal[0] = vector[vecDim] - diff; + minMaxVal[1] = vector[vecDim] + diff; + resMap.put(vecDim, minMaxVal); + } + return resMap; + } + + @Override + public int getVectorSize() { + return this.vectorSize; + } + + @Override + public Map getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + @Override + public Map getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to find standard deviation for each dimension of word vector and store + * the operated value next to the dimension's index in sdMap + * + * @param file + * - file with standard deviation values of vectors in descending + * order + * @throws IOException + */ + public void setModelSd(File file) throws IOException { + Map resMap = new HashMap<>(); + CSVReader csvReader = new CSVReader(new FileReader(file)); + // Reading header + csvReader.readNext(); + // loop all dimensions + for (int i = 0; i < indexSize; i++) { + String[] entry = csvReader.readNext(); + + Double sd = Double.parseDouble(entry[1]); + resMap.put(Integer.parseInt(entry[0]), sd.floatValue() * SIGMA_MULT / AREA_DIVISOR); + } + csvReader.close(); + // Set as sdMap + this.sdMap = resMap; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java index 4693e87..4f8a3b2 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java @@ -6,10 +6,30 @@ public class Word2VecFactory { - public static final String CFG_KEY_MODEL = Word2VecFactory.class.getName().concat(".model"); - public static String model = (Cfg.get(CFG_KEY_MODEL)); + public static final String CFG_KEY_MODEL = Word2VecFactory.class.getName().concat(".model"); + public static String model = (Cfg.get(CFG_KEY_MODEL)); + public static final String CFG_KEY_BIN = Word2VecModelLoader.class.getName().concat(".bin"); + public static boolean binModel = Boolean.parseBoolean(Cfg.get(CFG_KEY_BIN)); - public static Word2VecModel get() { - return new Word2VecModelLoader().loadModel(new File(model)); - } + private static String nrmlMdlFilePath = (Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedmodel.model")); + private static boolean nrmlMdlBinFlg = Boolean + .parseBoolean(Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedmodel.bin")); + + private static String nrmlBinMdlFilePath = (Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model")); + private static boolean nrmlBinMdlBinFlg = Boolean + .parseBoolean(Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedbinmodel.bin")); + + public static Word2VecModel get() { + return new Word2VecModelLoader().loadModel(new File(model), binModel); + } + + public static W2VNrmlMemModel getNormalizedModel() { + Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlMdlFilePath), nrmlMdlBinFlg); + return new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); + } + + public static W2VNrmlMemModel getNormalizedBinModel() { + Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); + return new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); + } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java index 1dbd540..cb2b002 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModelLoader.java @@ -29,8 +29,8 @@ public class Word2VecModelLoader { } public static Logger LOG = LogManager.getLogger(Word2VecModelLoader.class); - public static final String CFG_KEY_BIN = Word2VecModelLoader.class.getName().concat(".bin"); - protected boolean binModel = Boolean.parseBoolean(Cfg.get(CFG_KEY_BIN)); + /*public static final String CFG_KEY_BIN = Word2VecModelLoader.class.getName().concat(".bin"); + protected boolean binModel = Boolean.parseBoolean(Cfg.get(CFG_KEY_BIN));*/ public static void main(final String[] a) { @@ -50,7 +50,7 @@ public static void main(final String[] a) { * @throws IOException * @throws FileNotFoundException */ - public Word2VecModel loadModel(final File file) { + public Word2VecModel loadModel(final File file, boolean binModel) { int vectorSize = -1; int words = -1; diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java new file mode 100644 index 0000000..71084df --- /dev/null +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -0,0 +1,46 @@ +package org.aksw.word2vecrestful; + +import java.util.HashMap; +import java.util.Map; + +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModel; +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.Assert; +import org.junit.Test; + +public class NormalizedInMemModelTest { + public static Logger LOG = LogManager.getLogger(NormalizedInMemModelTest.class); + + @Test + public void testNormalizedModel() { + + final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); + + Map wordKeyMap = new HashMap<>(); + wordKeyMap.put("WesternOne", "ns#country-name"); + wordKeyMap.put("Donald_O._Schnuck", "ontology#ConferenceVenuePlacerdf-schema#label"); + wordKeyMap.put("Skyytek", "icaltzd#summary"); + wordKeyMap.put("Sungai_Muar", "ontology#Presenterrdf-schema#label"); + wordKeyMap.put("cat", null); + wordKeyMap.put("dog", null); + wordKeyMap.put("airplane", null); + wordKeyMap.put("road", null); + long startTime, diff; + long totTime = 0; + for (String word : wordKeyMap.keySet()) { + startTime = System.currentTimeMillis(); + float[] vec = memModel.getWord2VecMap().get(word); + Map closestWord = memModel.getClosestSubEntry(vec, wordKeyMap.get(word)); + Assert.assertTrue(closestWord.containsKey(word)); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " + + diff + " milliseconds."); + } + + LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); + + } +} diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java new file mode 100644 index 0000000..51adb92 --- /dev/null +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java @@ -0,0 +1,50 @@ +package org.aksw.word2vecrestful; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.aksw.word2vecrestful.word2vec.W2VNrmlMongoDbModel; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.junit.Assert; +import org.junit.Test; + +import nikit.test.TestConst; + +public class NormalizedMongoModelTest { + public static Logger LOG = LogManager.getLogger(NormalizedMongoModelTest.class); + static { + TestConst.VEC_MAP.put("cat", TestConst.CAT); + TestConst.VEC_MAP.put("dog", TestConst.DOG); + TestConst.VEC_MAP.put("airplane", TestConst.AIRPLANE); + TestConst.VEC_MAP.put("road", TestConst.ROAD); + } + @Test + public void testNormalizedModel() throws IOException { + + //final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); + final W2VNrmlMongoDbModel mongoModel = new W2VNrmlMongoDbModel(300); + Map wordKeyMap = new HashMap<>(); + wordKeyMap.put("cat", null); + wordKeyMap.put("dog", null); + wordKeyMap.put("airplane", null); + wordKeyMap.put("road", null); + long startTime, diff; + long totTime = 0; + for (String word : wordKeyMap.keySet()) { + startTime = System.currentTimeMillis(); + float[] vec = TestConst.VEC_MAP.get(word); + Map closestWord = mongoModel.getClosestSubEntry(vec, wordKeyMap.get(word)); + Assert.assertNotNull(closestWord); + Assert.assertTrue(closestWord.containsKey(word)); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " + + diff + " milliseconds."); + } + + LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); + + } +} From 362ff9695f4422d0258a581de6859bb518617198 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 6 Sep 2018 17:18:28 +0200 Subject: [PATCH 009/114] intermediate commit --- pom.xml | 17 ++++ .../word2vec/W2VNrmlMemModel.java | 97 +++++++++++++++++-- .../NormalizedInMemModelTest.java | 11 ++- .../NormalizedMongoModelTest.java | 50 ---------- 4 files changed, 113 insertions(+), 62 deletions(-) delete mode 100644 src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java diff --git a/pom.xml b/pom.xml index 5f5e247..54613e7 100644 --- a/pom.xml +++ b/pom.xml @@ -10,6 +10,7 @@ 1.8 1.8 + org.springframework.boot @@ -70,6 +71,12 @@ mongo-java-driver 3.4.1 + + + org.dice-research + topicmodeling.commons + 0.0.3-SNAPSHOT + @@ -106,6 +113,16 @@ spring-releases https://repo.spring.io/libs-release + + maven.aksw.internal + University Leipzig, AKSW Maven2 Repository + http://maven.aksw.org/repository/internal + + + maven.aksw.snapshots + University Leipzig, AKSW Maven2 Repository + http://maven.aksw.org/repository/snapshots + diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index e301946..81fc4ee 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -1,7 +1,10 @@ package org.aksw.word2vecrestful.word2vec; import java.io.IOException; +import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; @@ -10,6 +13,7 @@ import org.aksw.word2vecrestful.utils.Word2VecMath; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.dice_research.topicmodeling.commons.sort.AssociativeSort; /** * Class to encapsulate word2vec in-memory model and expose methods to perform @@ -37,13 +41,21 @@ public class W2VNrmlMemModel implements GenWord2VecModel { */ private static final int AREA_DIVISOR = 10; private DataSubsetProvider dataSubsetProvider; + /** + * Contains the sorted dimensional values mapped to their words + */ + private Object[][] indexesArr; public W2VNrmlMemModel(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; this.dataSubsetProvider = new DataSubsetProvider(); + this.indexesArr = new Object[vectorSize][2]; // Calculate sd*3/10 and save in map - setModelSd(word2vec, vectorSize); + // Initialize indexesArr unsorted + this.setModelVals(word2vec, vectorSize); + // Sort the indexes + this.sortIndexes(); } /** @@ -96,8 +108,8 @@ private Map getClosestEntry(float[] vector, String subKey) { } // Normalize incoming vector vector = Word2VecMath.normalize(vector); - //Find nearby vectors - Map nearbyVecs = fetchNearbyVectors(vector, wordSet); + // Find nearby vectors + Map nearbyVecs = fetchNearbyVectors(vector, wordSet, true); // Select the closest vector closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); } catch (IOException e) { @@ -115,7 +127,7 @@ private Map getClosestEntry(float[] vector, String subKey) { * @param vectorSize * - size of each vector */ - public void setModelSd(Map word2vecMap, int vectorSize) { + public void setModelVals(Map word2vecMap, int vectorSize) { Map resMap = new HashMap<>(); Set> entries = word2vecMap.entrySet(); int totSize = word2vecMap.size(); @@ -124,13 +136,20 @@ public void setModelSd(Map word2vecMap, int vectorSize) { // loop through all the words int j = 0; float[] dimsnArr = new float[totSize]; + String[] wordArr = new String[totSize]; float sum = 0; for (Entry entry : entries) { float[] vecArr = entry.getValue(); float val = vecArr[i]; sum += val; + wordArr[j] = entry.getKey(); dimsnArr[j++] = val; } + // Setting value in indexArr + Object[] dimValWordMap = new Object[2]; + dimValWordMap[0] = wordArr; + dimValWordMap[1] = dimsnArr; + this.indexesArr[i] = dimValWordMap; // mean float mean = sum / dimsnArr.length; sum = 0; @@ -154,7 +173,7 @@ public void setModelSd(Map word2vecMap, int vectorSize) { * - word set to look into for nearby vectors * @return - mapping of nearby words alongwith with their vector values */ - private Map fetchNearbyVectors(float[] vector, Set wordSet) { + private Map fetchNearbyVectors(float[] vector, Set wordSet, boolean indxd) { Map nearbyVecMap = new HashMap<>(); boolean mapEmpty = true; boolean notExhausted = true; @@ -164,11 +183,10 @@ private Map fetchNearbyVectors(float[] vector, Set word if (mult > 1) { minMaxVec = multMinMaxVec(minMaxVec, mult); } - for (String word : wordSet) { - float[] entryVec = word2vec.get(word); - if (isVectorInArea(entryVec, minMaxVec)) { - nearbyVecMap.put(word, entryVec); - } + if (indxd) { + putNearbyVecsIndxd(minMaxVec, wordSet, nearbyVecMap); + } else { + putNearbyVecsNonIndxd(minMaxVec, wordSet, nearbyVecMap); } if (nearbyVecMap.size() > 0) { mapEmpty = false; @@ -180,6 +198,56 @@ private Map fetchNearbyVectors(float[] vector, Set word return nearbyVecMap; } + private void putNearbyVecsNonIndxd(float[][] minMaxVec, Set wordSet, Map nearbyVecMap) { + for (String word : wordSet) { + float[] entryVec = word2vec.get(word); + if (isVectorInArea(entryVec, minMaxVec)) { + nearbyVecMap.put(word, entryVec); + } + } + } + + private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map nearbyVecMap) { + // init a set to hold words + Set nearbyWords = new HashSet<>(); + float[] minVec = minMaxVec[0]; + float[] maxVec = minMaxVec[1]; + for (int i = 0; i < vectorSize; i++) { + float minVal = minVec[i]; + float maxVal = maxVec[i]; + Object[] entryArr = indexesArr[i]; + String[] wordArr = (String[]) entryArr[0]; + float[] dimsnValArr = (float[]) entryArr[1]; + int from = Arrays.binarySearch(dimsnValArr, minVal); + if (from < 0) { + // To select the index one behind the current element + from = Math.abs(from); + from = from - (from>1?2:1); + } + int to = Arrays.binarySearch(dimsnValArr, maxVal); + if (to < 0) { + // To select the index one after the current element + to = Math.abs(to); + to = to - (to > dimsnValArr.length ? 1 : 0); + } + String[] tWords = Arrays.copyOfRange(wordArr, from, to); + List tWordList = Arrays.asList(tWords); + if (i == 0) { + nearbyWords.addAll(tWordList); + } else { + nearbyWords.retainAll(tWordList); + } + if(nearbyWords.isEmpty()) { + break; + } + } + // Clear all the words not in wordset + nearbyWords.retainAll(wordSet); + for(String word: nearbyWords) { + nearbyVecMap.put(word, word2vec.get(word)); + } + } + /** * Method to check if vector falls in a particular area * @@ -240,6 +308,15 @@ private float[][] getMinMaxVec(float[] vector) { return resVec; } + private void sortIndexes() { + for (int i = 0; i < indexesArr.length; i++) { + Object[] entryArr = indexesArr[i]; + String[] wordArr = (String[]) entryArr[0]; + float[] dimsnValArr = (float[]) entryArr[1]; + AssociativeSort.quickSort(dimsnValArr, wordArr); + } + } + /** * Method to fetch vectorSize * diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index 71084df..a931a57 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -19,10 +19,10 @@ public void testNormalizedModel() { final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); Map wordKeyMap = new HashMap<>(); - wordKeyMap.put("WesternOne", "ns#country-name"); + /*wordKeyMap.put("WesternOne", "ns#country-name"); wordKeyMap.put("Donald_O._Schnuck", "ontology#ConferenceVenuePlacerdf-schema#label"); wordKeyMap.put("Skyytek", "icaltzd#summary"); - wordKeyMap.put("Sungai_Muar", "ontology#Presenterrdf-schema#label"); + wordKeyMap.put("Sungai_Muar", "ontology#Presenterrdf-schema#label");*/ wordKeyMap.put("cat", null); wordKeyMap.put("dog", null); wordKeyMap.put("airplane", null); @@ -43,4 +43,11 @@ public void testNormalizedModel() { LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); } + + public static void main(String[] args) { + LOG.info("Starting test!"); + NormalizedInMemModelTest inMemModelTest = new NormalizedInMemModelTest(); + inMemModelTest.testNormalizedModel(); + LOG.info("Test finished!"); + } } diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java deleted file mode 100644 index 51adb92..0000000 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedMongoModelTest.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.aksw.word2vecrestful; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.aksw.word2vecrestful.word2vec.W2VNrmlMongoDbModel; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.junit.Assert; -import org.junit.Test; - -import nikit.test.TestConst; - -public class NormalizedMongoModelTest { - public static Logger LOG = LogManager.getLogger(NormalizedMongoModelTest.class); - static { - TestConst.VEC_MAP.put("cat", TestConst.CAT); - TestConst.VEC_MAP.put("dog", TestConst.DOG); - TestConst.VEC_MAP.put("airplane", TestConst.AIRPLANE); - TestConst.VEC_MAP.put("road", TestConst.ROAD); - } - @Test - public void testNormalizedModel() throws IOException { - - //final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); - final W2VNrmlMongoDbModel mongoModel = new W2VNrmlMongoDbModel(300); - Map wordKeyMap = new HashMap<>(); - wordKeyMap.put("cat", null); - wordKeyMap.put("dog", null); - wordKeyMap.put("airplane", null); - wordKeyMap.put("road", null); - long startTime, diff; - long totTime = 0; - for (String word : wordKeyMap.keySet()) { - startTime = System.currentTimeMillis(); - float[] vec = TestConst.VEC_MAP.get(word); - Map closestWord = mongoModel.getClosestSubEntry(vec, wordKeyMap.get(word)); - Assert.assertNotNull(closestWord); - Assert.assertTrue(closestWord.containsKey(word)); - diff = System.currentTimeMillis() - startTime; - totTime += diff; - LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " - + diff + " milliseconds."); - } - - LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); - - } -} From 2666a79a92c13a2b2e2d1300ea5d4d1a2da4035d Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 6 Sep 2018 18:04:08 +0200 Subject: [PATCH 010/114] log4j changes --- pom.xml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 54613e7..d69c863 100644 --- a/pom.xml +++ b/pom.xml @@ -9,6 +9,7 @@ 1.8 1.8 1.8 + 1.7.10 @@ -16,6 +17,12 @@ org.springframework.boot spring-boot-starter-web 1.2.2.RELEASE + + + org.slf4j + log4j-over-slf4j + + org.json @@ -77,7 +84,20 @@ topicmodeling.commons 0.0.3-SNAPSHOT - + + + + org.slf4j + slf4j-api + ${slf4j.version} + + + + org.slf4j + slf4j-log4j12 + ${slf4j.version} + + src/test/java From 0ea2e6115038e4e7287cb921aa8d21a004a058b1 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 6 Sep 2018 18:18:40 +0200 Subject: [PATCH 011/114] increasing jvm size --- pom.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pom.xml b/pom.xml index d69c863..20f65b2 100644 --- a/pom.xml +++ b/pom.xml @@ -115,6 +115,7 @@ 1.8 1.8 UTF-8 + -Xmx15024m @@ -124,6 +125,7 @@ methods 10 + -Xmx15024m From 22de5d31880294b46142e887aa09d620c521a435 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 10 Sep 2018 14:18:30 +0200 Subject: [PATCH 012/114] Multiplier logic fix --- .../word2vec/W2VNrmlMemModel.java | 16 +++++++++------- .../NormalizedInMemModelTest.java | 1 + 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 81fc4ee..6a474c2 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -190,10 +190,12 @@ private Map fetchNearbyVectors(float[] vector, Set word } if (nearbyVecMap.size() > 0) { mapEmpty = false; - } else if (mult > EXHAUSTION_MULT) { - notExhausted = false; + } else { + ++mult; + if (mult > EXHAUSTION_MULT) { + notExhausted = false; + } } - } return nearbyVecMap; } @@ -220,9 +222,9 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map1?2:1); + from = from - (from > 1 ? 1 : 0); } int to = Arrays.binarySearch(dimsnValArr, maxVal); if (to < 0) { @@ -237,13 +239,13 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Mon, 10 Sep 2018 14:38:10 +0200 Subject: [PATCH 013/114] removing mongodb related features --- pom.xml | 6 - .../db/mongo/MongoDbHandler.java | 82 ------ .../tool/MongoDBModelGenerator.java | 200 --------------- .../word2vec/W2VNrmlMongoDbModel.java | 238 ------------------ 4 files changed, 526 deletions(-) delete mode 100644 src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java delete mode 100644 src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java delete mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java diff --git a/pom.xml b/pom.xml index 20f65b2..90e1d46 100644 --- a/pom.xml +++ b/pom.xml @@ -73,12 +73,6 @@ 2.6 - - org.mongodb - mongo-java-driver - 3.4.1 - - org.dice-research topicmodeling.commons diff --git a/src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java b/src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java deleted file mode 100644 index 487448d..0000000 --- a/src/main/java/org/aksw/word2vecrestful/db/mongo/MongoDbHandler.java +++ /dev/null @@ -1,82 +0,0 @@ -package org.aksw.word2vecrestful.db.mongo; - -import org.bson.Document; - -import com.mongodb.MongoClient; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; - -public class MongoDbHandler { - private String dbName; - private String host = "localhost"; - private Integer port = 27017; - private MongoDatabase database; - private MongoClient mongoClient; - - public MongoDbHandler(String dbName, String host, Integer port) { - if (host != null) { - this.host = host; - } - if (port != null) { - this.port = port; - } - this.dbName = dbName; - } - - public void connect() { - this.mongoClient = new MongoClient(this.host, this.port); - this.database = mongoClient.getDatabase(this.dbName); - } - - public void close() { - this.mongoClient.close(); - } - - public MongoCollection createCollection(String collctnName) { - this.database.createCollection(collctnName); - return this.database.getCollection(collctnName); - } - - // Setter and Getters - - public String getDbName() { - return dbName; - } - - public void setDbName(String dbName) { - this.dbName = dbName; - } - - public String getHost() { - return host; - } - - public void setHost(String host) { - this.host = host; - } - - public Integer getPort() { - return port; - } - - public void setPort(Integer port) { - this.port = port; - } - - public MongoDatabase getDatabase() { - return database; - } - - public void setDatabase(MongoDatabase database) { - this.database = database; - } - - public MongoClient getMongoClient() { - return mongoClient; - } - - public void setMongoClient(MongoClient mongoClient) { - this.mongoClient = mongoClient; - } - -} diff --git a/src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java b/src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java deleted file mode 100644 index 207e0b6..0000000 --- a/src/main/java/org/aksw/word2vecrestful/tool/MongoDBModelGenerator.java +++ /dev/null @@ -1,200 +0,0 @@ -package org.aksw.word2vecrestful.tool; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.Map.Entry; -import java.util.TreeMap; - -import org.aksw.word2vecrestful.db.mongo.MongoDbHandler; -import org.aksw.word2vecrestful.word2vec.Word2VecModelLoader; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.bson.BsonDocument; -import org.bson.BsonString; -import org.bson.Document; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.mongodb.client.AggregateIterable; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.model.Aggregates; -import com.mongodb.client.model.BsonField; -import com.mongodb.client.model.IndexModel; -import com.mongodb.client.model.Indexes; -import com.opencsv.CSVReader; - -public class MongoDBModelGenerator { - - public static Logger LOG = LogManager.getLogger(MongoDBModelGenerator.class); - - private String wordColName = "word"; - private String vecColPrefix = "val"; - private String[] vecColNames; - private MongoDbHandler mongoDbHandler; - private MongoCollection mongoCollection; - private String collctnName; - - public MongoDBModelGenerator(String collctnName, String dbName, String host, Integer port) { - this.mongoDbHandler = new MongoDbHandler(dbName, host, port); - this.collctnName = collctnName; - } - - private Document createWord2VecDoc(String word, float[] vector) { - Document doc = new Document(); - doc.put(wordColName, word); - for (int i = 0; i < vector.length; i++) { - doc.put(vecColNames[i], vector[i]); - } - return doc; - } - - private String[] createVecColNames(int vectorSize) { - String[] colnames = new String[vectorSize]; - for (int i = 0; i < vectorSize; i++) { - colnames[i] = vecColPrefix + (i + 1); - } - return colnames; - } - - private void createIndexes() { - List indexList = new ArrayList<>(); - IndexModel indexModel = new IndexModel(Indexes.text(wordColName)); - indexList.add(indexModel); - for (int i = 0; i < vecColNames.length; i++) { - indexModel = new IndexModel(Indexes.ascending(vecColNames[i])); - indexList.add(indexModel); - } - this.mongoCollection.createIndexes(indexList); - } - - private void createIndexes(List fieldNames) { - List indexList = new ArrayList<>(); - IndexModel indexModel; - for (String field : fieldNames) { - indexModel = new IndexModel(Indexes.ascending(field)); - indexList.add(indexModel); - } - this.mongoCollection.createIndexes(indexList); - } - - /** - * Use ModelStatsWriter Instead - * - * @param n - * @param vectorSize - * @return - */ - @Deprecated - private List getNImportantFieldNames(int n, int vectorSize) { - List fieldNames = new ArrayList<>(); - TreeMap stdDevMap = new TreeMap<>(Collections.reverseOrder()); - for (int i = 0; i < vectorSize; i++) { - String colName = vecColPrefix + (i + 1); - AggregateIterable aggregate = mongoCollection.aggregate(Arrays.asList(Aggregates - .group("_id", new BsonField("stdDev", new BsonDocument("$stdDevPop", new BsonString(colName)))))); - Document result = aggregate.first(); - Double stdDevVal = result.getDouble("stdDev"); - stdDevMap.put(stdDevVal, colName); - } - int count = 0; - for (Entry entry : stdDevMap.entrySet()) { - fieldNames.add(entry.getValue()); - if (++count == n) { - break; - } - } - return fieldNames; - } - - private List getNImportantFieldNames(int n) throws IOException { - List fieldNames = new ArrayList<>(); - File file = new File("D:\\Nikit\\DICE-Group\\Jword2vec\\data\\normal\\stat\\normal-model-sd.csv"); - CSVReader csvReader = new CSVReader(new FileReader(file)); - // Reading header - csvReader.readNext(); - for (int i = 0; i < n; i++) { - String fieldName = vecColPrefix + csvReader.readNext()[0]; - fieldNames.add(fieldName); - } - csvReader.close(); - return fieldNames; - } - - public void generateTopIndexes(int vectorSize) { - this.mongoDbHandler.connect(); - this.mongoCollection = this.mongoDbHandler.getDatabase().getCollection(this.collctnName); - try { - // Find top vectors - List fieldNames = getNImportantFieldNames(63); - // Generate indexes - createIndexes(fieldNames); - } catch (Exception e) { - e.printStackTrace(); - } finally { - this.mongoDbHandler.close(); - } - - } - - public void persistWord2VecModel(File inputFile) { - LOG.info("Starting insertion of records to MongoDB.."); - FileInputStream fin = null; - this.mongoDbHandler.connect(); - this.mongoCollection = mongoDbHandler.createCollection(collctnName); - // this.mongoCollection = - // this.mongoDbHandler.getDatabase().getCollection(this.collctnName); - try { - fin = new FileInputStream(inputFile); - String word = Word2VecModelLoader.readWord(fin); - int words = Integer.parseInt(word); - word = Word2VecModelLoader.readWord(fin); - int vectorSize = Integer.parseInt(word); - List docList = new ArrayList<>(); - int recCount = 0; - this.vecColNames = createVecColNames(vectorSize); - // Insert records - for (int w = 0; w < words; ++w) { - word = Word2VecModelLoader.readWord(fin); - // LOG.info(word); - float[] vector = Word2VecModelLoader.readVector(fin, vectorSize); - docList.add(createWord2VecDoc(word, vector)); - recCount++; - if (recCount % 10000 == 0) { - this.mongoCollection.insertMany(docList); - docList.clear(); - LOG.info((recCount) + " Records inserted."); - } - } - // Insert leftover records - if (docList.size() > 0) { - this.mongoCollection.insertMany(docList); - } - LOG.info("Records insertion successfully completed."); - LOG.info("Starting creation of indexes"); - // create indexes - createIndexes(); - LOG.info("Indexes creation successfully completed."); - } catch (Exception e) { - e.printStackTrace(); - } finally { - this.mongoDbHandler.close(); - } - } - - public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { - /* - * String inputModel = - * (Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model")); File - * inputFile = new File(inputModel); - */ - MongoDBModelGenerator modelGenerator = new MongoDBModelGenerator("mainCollection", "word2vec", null, null); - /* modelGenerator.persistWord2VecModel(inputFile); */ - modelGenerator.generateTopIndexes(300); - } -} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java deleted file mode 100644 index cc24fcf..0000000 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMongoDbModel.java +++ /dev/null @@ -1,238 +0,0 @@ -package org.aksw.word2vecrestful.word2vec; - -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -import org.aksw.word2vecrestful.db.mongo.MongoDbHandler; -import org.aksw.word2vecrestful.subset.DataSubsetProvider; -import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.utils.Word2VecMath; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.bson.Document; - -import com.mongodb.BasicDBObject; -import com.mongodb.client.FindIterable; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoCursor; -import com.opencsv.CSVReader; - -public class W2VNrmlMongoDbModel implements GenWord2VecModel { - - public static Logger LOG = LogManager.getLogger(W2VNrmlMongoDbModel.class); - - private String wordColName = "word"; - private String vecColPrefix = "val"; - - private int indexSize = 63; - - private int vectorSize; - private MongoDbHandler dbHandler; - private String dbName = "word2vec"; - private String collctnName = "mainCollection"; - private Map sdMap; - private static String STAT_FILE_PATH = Cfg.get("org.aksw.word2vecrestful.word2vec.stats.sdfile"); - /** - * Limit to the multiplier of area in which nearby vectors are to be looked - */ - private static final int EXHAUSTION_MULT = 10; - /** - * Multiplier for the standard deviation - */ - private static final int SIGMA_MULT = 3; - /** - * Divisor for the standard deviation's value - */ - private static final int AREA_DIVISOR = 10; - private DataSubsetProvider dataSubsetProvider; - - public W2VNrmlMongoDbModel(int vectorSize) throws IOException { - this.vectorSize = vectorSize; - this.dbHandler = new MongoDbHandler(this.dbName, null, null); - this.dataSubsetProvider = new DataSubsetProvider(); - // Calculate sd*3/10 and save in map - setModelSd(new File(STAT_FILE_PATH)); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - private Map getClosestEntry(float[] vector, String subKey) { - Set wordSet = null; - Map closestVec = null; - try { - dbHandler.connect(); - if (subKey == null) { - wordSet = null; - } else { - wordSet = dataSubsetProvider.fetchSubsetWords(subKey); - } - // Normalize incoming vector - vector = Word2VecMath.normalize(vector); - // Find nearby vectors - Map nearbyVecs = fetchNearbyVectors(vector, wordSet); - // Select the closest vector - closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); - } catch (Exception e) { - LOG.error(e.getStackTrace()); - } finally { - dbHandler.close(); - } - return closestVec; - } - - /** - * Method to fetch nearby vectors for a given vector in a particular word set - * - * @param vector - * - vector to look nearby vectors for - * @param wordSet - * - word set to look into for nearby vectors - * @return - mapping of nearby words alongwith with their vector values - */ - private Map fetchNearbyVectors(float[] vector, Set wordSet) { - Map nearbyVecMap = null; - boolean mapEmpty = true; - boolean notExhausted = true; - Map minMaxVecMap = getMinMaxVecMap(vector); - int mult = 1; - while (mapEmpty && notExhausted) { - if (mult > 1) { - multMinMaxVecMap(minMaxVecMap, mult); - } - // Query Nearby - nearbyVecMap = queryNearbyVecs(vector, minMaxVecMap, wordSet); - if (nearbyVecMap.size() > 0) { - mapEmpty = false; - } else if (++mult > EXHAUSTION_MULT) { - notExhausted = false; - } - - } - return nearbyVecMap; - } - - private Map queryNearbyVecs(float[] vector, Map minMaxVecMap, - Set wordSet) { - Map resMap = new HashMap<>(); - - BasicDBObject queryObj = new BasicDBObject(); - MongoCollection mongoCollection = dbHandler.getDatabase().getCollection(this.collctnName); - if (wordSet != null && wordSet.size() > 0) { - queryObj.put(wordColName, wordSet); - } - for (Integer vecDim : minMaxVecMap.keySet()) { - float[] minMaxVal = minMaxVecMap.get(vecDim); - BasicDBObject filterObj = new BasicDBObject("$gt", minMaxVal[0]).append("$lt", minMaxVal[1]); - queryObj.put(vecColPrefix + vecDim, filterObj); - } - FindIterable findIt = mongoCollection.find(queryObj); - MongoCursor it = findIt.iterator(); - while (it.hasNext()) { - Document doc = it.next(); - String word = doc.getString(wordColName); - float[] vec = new float[vectorSize]; - for (int i = 0; i < vectorSize; i++) { - Double dVal = doc.getDouble(vecColPrefix + (i + 1)); - vec[i] = dVal.floatValue(); - } - resMap.put(word, vec); - } - - return resMap; - - } - - /** - * Multiply each element of the given multi dimensional vector with a given - * multiplier - * - * @param minMaxVecMap - * - vector at which operation is to be performed - * @param mult - * - multiplier - * @return - Vector after multiplication with the multiplier - */ - private void multMinMaxVecMap(Map minMaxVecMap, int mult) { - for (Integer vecDim : minMaxVecMap.keySet()) { - float[] minMaxVal = minMaxVecMap.get(vecDim); - minMaxVal[0] *= mult; - minMaxVal[1] *= mult; - minMaxVecMap.put(vecDim, minMaxVal); - } - } - - /** - * Method to generate two vectors from a given vector by adding and subtracting - * value in sdMap from the given vector - * - * @param vector - * - input vector to perform operation on - * @return - min vector at index 0 and max vector at index 1 - */ - private Map getMinMaxVecMap(float[] vector) { - Map resMap = new HashMap<>(); - float[] minMaxVal; - for (Integer vecDim : sdMap.keySet()) { - minMaxVal = new float[2]; - float diff = sdMap.get(vecDim); - minMaxVal[0] = vector[vecDim] - diff; - minMaxVal[1] = vector[vecDim] + diff; - resMap.put(vecDim, minMaxVal); - } - return resMap; - } - - @Override - public int getVectorSize() { - return this.vectorSize; - } - - @Override - public Map getClosestEntry(float[] vector) { - return getClosestEntry(vector, null); - } - - @Override - public Map getClosestSubEntry(float[] vector, String subKey) { - return getClosestEntry(vector, subKey); - } - - /** - * Method to find standard deviation for each dimension of word vector and store - * the operated value next to the dimension's index in sdMap - * - * @param file - * - file with standard deviation values of vectors in descending - * order - * @throws IOException - */ - public void setModelSd(File file) throws IOException { - Map resMap = new HashMap<>(); - CSVReader csvReader = new CSVReader(new FileReader(file)); - // Reading header - csvReader.readNext(); - // loop all dimensions - for (int i = 0; i < indexSize; i++) { - String[] entry = csvReader.readNext(); - - Double sd = Double.parseDouble(entry[1]); - resMap.put(Integer.parseInt(entry[0]), sd.floatValue() * SIGMA_MULT / AREA_DIVISOR); - } - csvReader.close(); - // Set as sdMap - this.sdMap = resMap; - } - -} From 20f7b9d61fc8f971b52463c24fc249a2c70fea87 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 10 Sep 2018 15:23:08 +0200 Subject: [PATCH 014/114] Debugging messages added --- config/log4j.properties | 2 +- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 4 ++++ .../aksw/word2vecrestful/word2vec/Word2VecFactory.java | 5 ++++- .../aksw/word2vecrestful/NormalizedInMemModelTest.java | 8 ++++---- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/config/log4j.properties b/config/log4j.properties index 4ed3f76..4f6fa2b 100644 --- a/config/log4j.properties +++ b/config/log4j.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=INFO, stdout, file +log4j.rootLogger=DEBUG, stdout, file log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{HH:mm:ss} %5p [%t] (%F:%M:%L) - %m%n diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 6a474c2..027b160 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -53,7 +53,9 @@ public W2VNrmlMemModel(final Map word2vec, final int vectorSize this.indexesArr = new Object[vectorSize][2]; // Calculate sd*3/10 and save in map // Initialize indexesArr unsorted + LOG.debug("Initializing indexes and calculating standard deviation"); this.setModelVals(word2vec, vectorSize); + LOG.debug("Sorting indexes"); // Sort the indexes this.sortIndexes(); } @@ -115,6 +117,7 @@ private Map getClosestEntry(float[] vector, String subKey) { } catch (IOException e) { LOG.error(e.getStackTrace()); } + LOG.debug("Closest word found is "+closestVec.keySet()); return closestVec; } @@ -150,6 +153,7 @@ public void setModelVals(Map word2vecMap, int vectorSize) { dimValWordMap[0] = wordArr; dimValWordMap[1] = dimsnArr; this.indexesArr[i] = dimValWordMap; + LOG.debug("Dimension "+(i+1)+" index stored to memory"); // mean float mean = sum / dimsnArr.length; sum = 0; diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java index 4f8a3b2..e54b4c5 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java @@ -3,9 +3,11 @@ import java.io.File; import org.aksw.word2vecrestful.utils.Cfg; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; public class Word2VecFactory { - + public static Logger LOG = LogManager.getLogger(Word2VecFactory.class); public static final String CFG_KEY_MODEL = Word2VecFactory.class.getName().concat(".model"); public static String model = (Cfg.get(CFG_KEY_MODEL)); public static final String CFG_KEY_BIN = Word2VecModelLoader.class.getName().concat(".bin"); @@ -30,6 +32,7 @@ public static W2VNrmlMemModel getNormalizedModel() { public static W2VNrmlMemModel getNormalizedBinModel() { Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); + LOG.debug("Model loaded to memory"); return new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); } } diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index c4d63ab..67cad84 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -15,9 +15,9 @@ public class NormalizedInMemModelTest { @Test public void testNormalizedModel() { - + LOG.debug("Starting InMemory indexed model test!"); final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); - + LOG.debug("Indexed Model instance created"); Map wordKeyMap = new HashMap<>(); /*wordKeyMap.put("WesternOne", "ns#country-name"); wordKeyMap.put("Donald_O._Schnuck", "ontology#ConferenceVenuePlacerdf-schema#label"); @@ -37,11 +37,11 @@ public void testNormalizedModel() { Assert.assertTrue(closestWord.containsKey(word)); diff = System.currentTimeMillis() - startTime; totTime += diff; - LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " + LOG.debug("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " + diff + " milliseconds."); } - LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); + LOG.debug("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); } From c6c29e9af99d984be299fe25cc8526ee2e670be4 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 10 Sep 2018 15:27:48 +0200 Subject: [PATCH 015/114] changing debug to info --- config/log4j.properties | 2 +- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 8 ++++---- .../aksw/word2vecrestful/word2vec/Word2VecFactory.java | 2 +- .../aksw/word2vecrestful/NormalizedInMemModelTest.java | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config/log4j.properties b/config/log4j.properties index 4f6fa2b..4ed3f76 100644 --- a/config/log4j.properties +++ b/config/log4j.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=DEBUG, stdout, file +log4j.rootLogger=INFO, stdout, file log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{HH:mm:ss} %5p [%t] (%F:%M:%L) - %m%n diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 027b160..3fba2fe 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -53,9 +53,9 @@ public W2VNrmlMemModel(final Map word2vec, final int vectorSize this.indexesArr = new Object[vectorSize][2]; // Calculate sd*3/10 and save in map // Initialize indexesArr unsorted - LOG.debug("Initializing indexes and calculating standard deviation"); + LOG.info("Initializing indexes and calculating standard deviation"); this.setModelVals(word2vec, vectorSize); - LOG.debug("Sorting indexes"); + LOG.info("Sorting indexes"); // Sort the indexes this.sortIndexes(); } @@ -117,7 +117,7 @@ private Map getClosestEntry(float[] vector, String subKey) { } catch (IOException e) { LOG.error(e.getStackTrace()); } - LOG.debug("Closest word found is "+closestVec.keySet()); + LOG.info("Closest word found is "+closestVec.keySet()); return closestVec; } @@ -153,7 +153,7 @@ public void setModelVals(Map word2vecMap, int vectorSize) { dimValWordMap[0] = wordArr; dimValWordMap[1] = dimsnArr; this.indexesArr[i] = dimValWordMap; - LOG.debug("Dimension "+(i+1)+" index stored to memory"); + LOG.info("Dimension "+(i+1)+" index stored to memory"); // mean float mean = sum / dimsnArr.length; sum = 0; diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java index e54b4c5..5ebdee3 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java @@ -32,7 +32,7 @@ public static W2VNrmlMemModel getNormalizedModel() { public static W2VNrmlMemModel getNormalizedBinModel() { Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); - LOG.debug("Model loaded to memory"); + LOG.info("Model loaded to memory"); return new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); } } diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index 67cad84..dead443 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -15,9 +15,9 @@ public class NormalizedInMemModelTest { @Test public void testNormalizedModel() { - LOG.debug("Starting InMemory indexed model test!"); + LOG.info("Starting InMemory indexed model test!"); final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); - LOG.debug("Indexed Model instance created"); + LOG.info("Indexed Model instance created"); Map wordKeyMap = new HashMap<>(); /*wordKeyMap.put("WesternOne", "ns#country-name"); wordKeyMap.put("Donald_O._Schnuck", "ontology#ConferenceVenuePlacerdf-schema#label"); @@ -37,7 +37,7 @@ public void testNormalizedModel() { Assert.assertTrue(closestWord.containsKey(word)); diff = System.currentTimeMillis() - startTime; totTime += diff; - LOG.debug("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " + LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " + diff + " milliseconds."); } From 1a20603c9a7242e36d821e01ea44f61dd531c583 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 10 Sep 2018 15:54:59 +0200 Subject: [PATCH 016/114] log correction --- .../NormalizedInMemModelTest.java | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index dead443..94b898e 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -3,14 +3,19 @@ import java.util.HashMap; import java.util.Map; +import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModel; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.apache.log4j.PropertyConfigurator; import org.junit.Assert; import org.junit.Test; public class NormalizedInMemModelTest { + static { + PropertyConfigurator.configure(Cfg.LOG_FILE); + } public static Logger LOG = LogManager.getLogger(NormalizedInMemModelTest.class); @Test @@ -19,14 +24,17 @@ public void testNormalizedModel() { final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); LOG.info("Indexed Model instance created"); Map wordKeyMap = new HashMap<>(); - /*wordKeyMap.put("WesternOne", "ns#country-name"); - wordKeyMap.put("Donald_O._Schnuck", "ontology#ConferenceVenuePlacerdf-schema#label"); - wordKeyMap.put("Skyytek", "icaltzd#summary"); - wordKeyMap.put("Sungai_Muar", "ontology#Presenterrdf-schema#label");*/ + /* + * wordKeyMap.put("WesternOne", "ns#country-name"); + * wordKeyMap.put("Donald_O._Schnuck", + * "ontology#ConferenceVenuePlacerdf-schema#label"); wordKeyMap.put("Skyytek", + * "icaltzd#summary"); wordKeyMap.put("Sungai_Muar", + * "ontology#Presenterrdf-schema#label"); + */ wordKeyMap.put("cat", null); wordKeyMap.put("dog", null); wordKeyMap.put("airplane", null); - + wordKeyMap.put("road", null); long startTime, diff; long totTime = 0; @@ -44,7 +52,7 @@ public void testNormalizedModel() { LOG.debug("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); } - + public static void main(String[] args) { LOG.info("Starting test!"); NormalizedInMemModelTest inMemModelTest = new NormalizedInMemModelTest(); From e99a8232f5122a8d9ade440d7d9fd52944d047ca Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 10 Sep 2018 16:32:49 +0200 Subject: [PATCH 017/114] increasing logs --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 6 ++++++ .../org/aksw/word2vecrestful/word2vec/Word2VecFactory.java | 1 + .../org/aksw/word2vecrestful/NormalizedInMemModelTest.java | 1 + 3 files changed, 8 insertions(+) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 3fba2fe..d118c8b 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -58,6 +58,7 @@ public W2VNrmlMemModel(final Map word2vec, final int vectorSize LOG.info("Sorting indexes"); // Sort the indexes this.sortIndexes(); + LOG.info("Sorting completed"); } /** @@ -108,10 +109,13 @@ private Map getClosestEntry(float[] vector, String subKey) { } else { wordSet = dataSubsetProvider.fetchSubsetWords(subKey); } + LOG.info("Normalizing input vector"); // Normalize incoming vector vector = Word2VecMath.normalize(vector); + LOG.info("fetching nearby vectors"); // Find nearby vectors Map nearbyVecs = fetchNearbyVectors(vector, wordSet, true); + LOG.info("found the following nearby words: "+nearbyVecs.keySet()); // Select the closest vector closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); } catch (IOException e) { @@ -316,10 +320,12 @@ private float[][] getMinMaxVec(float[] vector) { private void sortIndexes() { for (int i = 0; i < indexesArr.length; i++) { + LOG.info("Sorting index "+i); Object[] entryArr = indexesArr[i]; String[] wordArr = (String[]) entryArr[0]; float[] dimsnValArr = (float[]) entryArr[1]; AssociativeSort.quickSort(dimsnValArr, wordArr); + LOG.info("Sorting completed for index "+i); } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java index 5ebdee3..e8cdc3e 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java @@ -31,6 +31,7 @@ public static W2VNrmlMemModel getNormalizedModel() { } public static W2VNrmlMemModel getNormalizedBinModel() { + LOG.info("Loading model to memory"); Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); LOG.info("Model loaded to memory"); return new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index 94b898e..57d0c96 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -39,6 +39,7 @@ public void testNormalizedModel() { long startTime, diff; long totTime = 0; for (String word : wordKeyMap.keySet()) { + LOG.info("Sending query for word :"+ word); startTime = System.currentTimeMillis(); float[] vec = memModel.getWord2VecMap().get(word); Map closestWord = memModel.getClosestSubEntry(vec, wordKeyMap.get(word)); From 2fd8763de9d9c8bb1873a9ea5aeb676e9b9ae9c3 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 10 Sep 2018 17:20:26 +0200 Subject: [PATCH 018/114] log for fetchNearbyVectors --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index d118c8b..afece73 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -157,7 +157,7 @@ public void setModelVals(Map word2vecMap, int vectorSize) { dimValWordMap[0] = wordArr; dimValWordMap[1] = dimsnArr; this.indexesArr[i] = dimValWordMap; - LOG.info("Dimension "+(i+1)+" index stored to memory"); + LOG.info("Dimension "+(i)+" index stored to memory"); // mean float mean = sum / dimsnArr.length; sum = 0; @@ -223,30 +223,37 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map 1 ? 1 : 0); } + LOG.info("Final From value of dimension array: "+from); + LOG.info("To value of dimension array: "+from); int to = Arrays.binarySearch(dimsnValArr, maxVal); if (to < 0) { // To select the index one after the current element to = Math.abs(to); to = to - (to > dimsnValArr.length ? 1 : 0); } + LOG.info("Final To value of dimension array: "+from); String[] tWords = Arrays.copyOfRange(wordArr, from, to); + LOG.info("Matching words list size for current dimension: "+tWords.length); List tWordList = Arrays.asList(tWords); if (i == 0) { nearbyWords.addAll(tWordList); } else { nearbyWords.retainAll(tWordList); } + LOG.info("Nearby words list size for current dimension: "+nearbyWords.size()); if (nearbyWords.isEmpty()) { break; } From b4931bbfc30209301621e7980886286eb6bdd343 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 10 Sep 2018 17:46:56 +0200 Subject: [PATCH 019/114] changing twordlist to set --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index afece73..5aba8d1 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -247,11 +247,11 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map tWordList = Arrays.asList(tWords); + Set tWordSet = new HashSet<>(Arrays.asList(tWords)); if (i == 0) { - nearbyWords.addAll(tWordList); + nearbyWords.addAll(tWordSet); } else { - nearbyWords.retainAll(tWordList); + nearbyWords.retainAll(tWordSet); } LOG.info("Nearby words list size for current dimension: "+nearbyWords.size()); if (nearbyWords.isEmpty()) { From 0619f5fefc47f0a5dcc5490b073c62150ca8bfc3 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 06:43:00 +0200 Subject: [PATCH 020/114] logic change to bitset and ids --- .../word2vec/W2VNrmlMemModel.java | 89 ++++++++++++------- 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 5aba8d1..adec45e 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -2,9 +2,9 @@ import java.io.IOException; import java.util.Arrays; +import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; @@ -46,11 +46,14 @@ public class W2VNrmlMemModel implements GenWord2VecModel { */ private Object[][] indexesArr; + private String[] gWordArr; + private float[][] gVecArr; + public W2VNrmlMemModel(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; this.dataSubsetProvider = new DataSubsetProvider(); - this.indexesArr = new Object[vectorSize][2]; + this.initArrays(); // Calculate sd*3/10 and save in map // Initialize indexesArr unsorted LOG.info("Initializing indexes and calculating standard deviation"); @@ -61,6 +64,18 @@ public W2VNrmlMemModel(final Map word2vec, final int vectorSize LOG.info("Sorting completed"); } + private void initArrays() { + this.indexesArr = new Object[vectorSize][2]; + this.gWordArr = new String[word2vec.size()]; + this.gVecArr = new float[word2vec.size()][vectorSize]; + int i = 0; + for (Entry entry : word2vec.entrySet()) { + gWordArr[i] = entry.getKey(); + gVecArr[i] = entry.getValue(); + i++; + } + } + /** * Method to fetch the closest word entry for a given vector using cosine * similarity @@ -115,13 +130,13 @@ private Map getClosestEntry(float[] vector, String subKey) { LOG.info("fetching nearby vectors"); // Find nearby vectors Map nearbyVecs = fetchNearbyVectors(vector, wordSet, true); - LOG.info("found the following nearby words: "+nearbyVecs.keySet()); + LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); } catch (IOException e) { LOG.error(e.getStackTrace()); } - LOG.info("Closest word found is "+closestVec.keySet()); + LOG.info("Closest word found is " + closestVec.keySet()); return closestVec; } @@ -136,32 +151,29 @@ private Map getClosestEntry(float[] vector, String subKey) { */ public void setModelVals(Map word2vecMap, int vectorSize) { Map resMap = new HashMap<>(); - Set> entries = word2vecMap.entrySet(); int totSize = word2vecMap.size(); // loop all dimensions for (int i = 0; i < vectorSize; i++) { // loop through all the words - int j = 0; float[] dimsnArr = new float[totSize]; - String[] wordArr = new String[totSize]; + int[] idArr = new int[totSize]; float sum = 0; - for (Entry entry : entries) { - float[] vecArr = entry.getValue(); - float val = vecArr[i]; + for (int j = 0; j < gWordArr.length; j++) { + float val = gVecArr[j][i]; sum += val; - wordArr[j] = entry.getKey(); + idArr[j] = j; dimsnArr[j++] = val; } // Setting value in indexArr Object[] dimValWordMap = new Object[2]; - dimValWordMap[0] = wordArr; + dimValWordMap[0] = idArr; dimValWordMap[1] = dimsnArr; this.indexesArr[i] = dimValWordMap; - LOG.info("Dimension "+(i)+" index stored to memory"); + LOG.info("Dimension " + (i) + " index stored to memory"); // mean float mean = sum / dimsnArr.length; sum = 0; - for (j = 0; j < dimsnArr.length; j++) { + for (int j = 0; j < dimsnArr.length; j++) { sum += Math.pow(dimsnArr[j] - mean, 2); } float variance = sum / dimsnArr.length; @@ -222,44 +234,59 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map nearbyWords = new HashSet<>(); float[] minVec = minMaxVec[0]; float[] maxVec = minMaxVec[1]; + BitSet finBitSet = new BitSet(word2vec.size()); + BitSet tempBitSet; for (int i = 0; i < vectorSize; i++) { - LOG.info("Searching inside dimension "+(i)+"'s index"); + tempBitSet = new BitSet(word2vec.size()); + LOG.info("Searching inside dimension " + (i) + "'s index"); float minVal = minVec[i]; float maxVal = maxVec[i]; Object[] entryArr = indexesArr[i]; - String[] wordArr = (String[]) entryArr[0]; + int[] idArr = (int[]) entryArr[0]; float[] dimsnValArr = (float[]) entryArr[1]; int from = Arrays.binarySearch(dimsnValArr, minVal); - LOG.info("From value of dimension array: "+from); + LOG.info("From value of dimension array: " + from); if (from < 0) { // To select the index one after the current element from = Math.abs(from); from = from - (from > 1 ? 1 : 0); } - LOG.info("Final From value of dimension array: "+from); - LOG.info("To value of dimension array: "+from); + LOG.info("Final From value of dimension array: " + from); + LOG.info("To value of dimension array: " + from); int to = Arrays.binarySearch(dimsnValArr, maxVal); if (to < 0) { // To select the index one after the current element to = Math.abs(to); to = to - (to > dimsnValArr.length ? 1 : 0); } - LOG.info("Final To value of dimension array: "+from); - String[] tWords = Arrays.copyOfRange(wordArr, from, to); - LOG.info("Matching words list size for current dimension: "+tWords.length); - Set tWordSet = new HashSet<>(Arrays.asList(tWords)); + LOG.info("Final To value of dimension array: " + from); + LOG.info("Setting bits for the words between 'from' and 'to' indexes"); + for (int j = from; j < from; j++) { + tempBitSet.set(idArr[j], true); + } if (i == 0) { - nearbyWords.addAll(tWordSet); + finBitSet = tempBitSet; } else { - nearbyWords.retainAll(tWordSet); + finBitSet.and(tempBitSet); + } + if (finBitSet.isEmpty()) { + break; } - LOG.info("Nearby words list size for current dimension: "+nearbyWords.size()); - if (nearbyWords.isEmpty()) { + } + LOG.info("Extracting words for set bits"); + int nextBit=0; + while (true) { + nextBit = finBitSet.nextSetBit(nextBit); + if (nextBit > -1) { + nearbyWords.add(gWordArr[nextBit]); + } else { break; } } + LOG.info("Nearby words size before retainAll from wordset: "+ nearbyWords.size()); // Clear all the words not in wordset nearbyWords.retainAll(wordSet); + LOG.info("Nearby words size after retainAll from wordset: "+ nearbyWords.size()); for (String word : nearbyWords) { nearbyVecMap.put(word, word2vec.get(word)); } @@ -327,12 +354,12 @@ private float[][] getMinMaxVec(float[] vector) { private void sortIndexes() { for (int i = 0; i < indexesArr.length; i++) { - LOG.info("Sorting index "+i); + LOG.info("Sorting index " + i); Object[] entryArr = indexesArr[i]; - String[] wordArr = (String[]) entryArr[0]; + int[] idArr = (int[]) entryArr[0]; float[] dimsnValArr = (float[]) entryArr[1]; - AssociativeSort.quickSort(dimsnValArr, wordArr); - LOG.info("Sorting completed for index "+i); + AssociativeSort.quickSort(dimsnValArr, idArr); + LOG.info("Sorting completed for index " + i); } } From 6f243ee7a6cd906af08c0470bff7f7c86a0fa66a Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 07:01:16 +0200 Subject: [PATCH 021/114] handling null values --- .../java/org/aksw/word2vecrestful/utils/Word2VecMath.java | 5 ++--- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index 8d0e4a6..ac8d293 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -119,9 +119,8 @@ public static float calcVariance(float[] inpArr) { } public static Map findClosestVecInNearbyVecs(Map nearbyVecs, float[] vector) { - Map closestVec = null; - if(nearbyVecs !=null && vector != null) { - closestVec = new HashMap<>(); + Map closestVec = new HashMap<>(); + if(nearbyVecs !=null && vector != null && nearbyVecs.size()>0) { TreeMap cosineSimMap = new TreeMap<>(); for (String word : nearbyVecs.keySet()) { cosineSimMap.put(Word2VecMath.cosineSimilarity(vector, nearbyVecs.get(word)), word); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index adec45e..79ba9b4 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -234,7 +234,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map nearbyWords = new HashSet<>(); float[] minVec = minMaxVec[0]; float[] maxVec = minMaxVec[1]; - BitSet finBitSet = new BitSet(word2vec.size()); + BitSet finBitSet = null; BitSet tempBitSet; for (int i = 0; i < vectorSize; i++) { tempBitSet = new BitSet(word2vec.size()); @@ -261,7 +261,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Tue, 11 Sep 2018 07:15:00 +0200 Subject: [PATCH 022/114] logic change to select subset --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 79ba9b4..c2575dd 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -249,7 +249,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map 1 ? 1 : 0); + from = from - (from > 0 ? 1 : 0); } LOG.info("Final From value of dimension array: " + from); LOG.info("To value of dimension array: " + from); @@ -257,11 +257,11 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map dimsnValArr.length ? 1 : 0); + to = to - (to > dimsnValArr.length ? 2 : 1); } LOG.info("Final To value of dimension array: " + from); LOG.info("Setting bits for the words between 'from' and 'to' indexes"); - for (int j = from; j <= from; j++) { + for (int j = from; j < to; j++) { tempBitSet.set(idArr[j], true); } if (i == 0) { From f1b2e00b7628144383c6302ba62909064d862443 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 07:17:34 +0200 Subject: [PATCH 023/114] adding logs for normal model loading --- .../org/aksw/word2vecrestful/word2vec/Word2VecFactory.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java index e8cdc3e..8fe5ba1 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java @@ -34,6 +34,9 @@ public static W2VNrmlMemModel getNormalizedBinModel() { LOG.info("Loading model to memory"); Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); LOG.info("Model loaded to memory"); - return new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); + LOG.info("Initializing "+W2VNrmlMemModel.class + " instance"); + W2VNrmlMemModel nrmlMemModel = new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); + LOG.info("Initialization of "+W2VNrmlMemModel.class + " instance finished"); + return nrmlMemModel; } } From 19bddd4469cb8dd5af1dbe61bbe02dc81ebde981 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 07:33:05 +0200 Subject: [PATCH 024/114] logging minVal and maxVal --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index c2575dd..72362a1 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -241,6 +241,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map 0 ? 1 : 0); + from = from - (from > 1 ? 2 : 1); } LOG.info("Final From value of dimension array: " + from); LOG.info("To value of dimension array: " + from); From 7ff770c94816c2208426ac54d112d457aa7c0dea Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 08:09:31 +0200 Subject: [PATCH 025/114] word add logic change --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 72362a1..57cffa1 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -236,6 +236,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map dimsnValArr.length ? 2 : 1); + to = to - (to > dimsnValArr.length ? 1 : 0); } LOG.info("Final To value of dimension array: " + from); LOG.info("Setting bits for the words between 'from' and 'to' indexes"); @@ -271,12 +272,13 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map -1) { nearbyWords.add(gWordArr[nextBit]); From d2ad89a7bf0238742d6dcd43cfde09c2ff103cee Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 08:37:00 +0200 Subject: [PATCH 026/114] changing bitset iteration logic --- .../word2vec/W2VNrmlMemModel.java | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 57cffa1..a8d8a38 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -252,8 +252,10 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map 1 ? 2 : 1); + } else { + from--; } - LOG.info("Final From value of dimension array: " + from); + LOG.info("Final From value of current dimension array: " + from); LOG.info("To value of dimension array: " + from); int to = Arrays.binarySearch(dimsnValArr, maxVal); if (to < 0) { @@ -261,7 +263,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map dimsnValArr.length ? 1 : 0); } - LOG.info("Final To value of dimension array: " + from); + LOG.info("Final To value of current dimension array: " + from); LOG.info("Setting bits for the words between 'from' and 'to' indexes"); for (int j = from; j < to; j++) { tempBitSet.set(idArr[j], true); @@ -278,14 +280,14 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map -1) { - nearbyWords.add(gWordArr[nextBit]); - } else { - break; - } - } + + for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i+1)) { + // operate on index i here + nearbyWords.add(gWordArr[nextBit]); + if (i == Integer.MAX_VALUE) { + break; // or (i+1) would overflow + } + } LOG.info("Nearby words size before retainAll from wordset: "+ nearbyWords.size()); // Clear all the words not in wordset nearbyWords.retainAll(wordSet); From 22fba33c3d0e12fa2ff04c5f7ff594a8140aa683 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 09:05:47 +0200 Subject: [PATCH 027/114] updating 'from' and 'to' logic --- .../word2vec/W2VNrmlMemModel.java | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index a8d8a38..c7ff72f 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -236,7 +236,6 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map 1 ? 2 : 1); - } else { - from--; - } + // To select the insertion point + from = -1 - from; + } LOG.info("Final From value of current dimension array: " + from); LOG.info("To value of dimension array: " + from); int to = Arrays.binarySearch(dimsnValArr, maxVal); if (to < 0) { - // To select the index one after the current element - to = Math.abs(to); - to = to - (to > dimsnValArr.length ? 1 : 0); + // To select the insertion point + to = -1-to; + } else { + to++; } LOG.info("Final To value of current dimension array: " + from); LOG.info("Setting bits for the words between 'from' and 'to' indexes"); @@ -274,7 +271,6 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Tue, 11 Sep 2018 09:14:40 +0200 Subject: [PATCH 028/114] logs correction --- .../java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index c7ff72f..048d58b 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -260,7 +260,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Tue, 11 Sep 2018 12:13:08 +0200 Subject: [PATCH 029/114] logs correction --- .../word2vec/W2VNrmlMemModel.java | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 048d58b..832e858 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -241,7 +241,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map wordSet, Map= 0; i = finBitSet.nextSetBit(i+1)) { - // operate on index i here - nearbyWords.add(gWordArr[nextBit]); - if (i == Integer.MAX_VALUE) { - break; // or (i+1) would overflow - } - } - LOG.info("Nearby words size before retainAll from wordset: "+ nearbyWords.size()); + int nextBit = 0; + + for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1)) { + // operate on index i here + nearbyWords.add(gWordArr[nextBit]); + if (i == Integer.MAX_VALUE) { + break; // or (i+1) would overflow + } + } + LOG.info("Nearby words size before retainAll from wordset: " + nearbyWords.size()); // Clear all the words not in wordset nearbyWords.retainAll(wordSet); - LOG.info("Nearby words size after retainAll from wordset: "+ nearbyWords.size()); + LOG.info("Nearby words size after retainAll from wordset: " + nearbyWords.size()); for (String word : nearbyWords) { nearbyVecMap.put(word, word2vec.get(word)); } From 26a7b72ae8d88b53631aa20a5fb80792442cad88 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 12:34:37 +0200 Subject: [PATCH 030/114] fixing bitset logic --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 832e858..5a7c50a 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -258,6 +258,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map= 0; i = finBitSet.nextSetBit(i + 1)) { // operate on index i here - nearbyWords.add(gWordArr[nextBit]); + nearbyWords.add(gWordArr[i]); if (i == Integer.MAX_VALUE) { break; // or (i+1) would overflow } From 83f17091d93c22e80b5966eaeeb56ef6de432900 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 13:03:53 +0200 Subject: [PATCH 031/114] multplier logic fix --- .../word2vec/W2VNrmlMemModel.java | 30 ++++--------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 5a7c50a..b8c51eb 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -197,12 +197,10 @@ private Map fetchNearbyVectors(float[] vector, Set word Map nearbyVecMap = new HashMap<>(); boolean mapEmpty = true; boolean notExhausted = true; - float[][] minMaxVec = getMinMaxVec(vector); + float[][] minMaxVec; int mult = 1; while (mapEmpty && notExhausted) { - if (mult > 1) { - minMaxVec = multMinMaxVec(minMaxVec, mult); - } + minMaxVec = getMinMaxVec(vector, mult); if (indxd) { putNearbyVecsIndxd(minMaxVec, wordSet, nearbyVecMap); } else { @@ -215,6 +213,7 @@ private Map fetchNearbyVectors(float[] vector, Set word if (mult > EXHAUSTION_MULT) { notExhausted = false; } + LOG.info("MinMax multiplier incremented to " + mult); } } return nearbyVecMap; @@ -272,6 +271,7 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Tue, 11 Sep 2018 13:59:58 +0200 Subject: [PATCH 032/114] dimnsArr value fix --- .../word2vecrestful/word2vec/W2VNrmlMemModel.java | 2 +- .../word2vecrestful/NormalizedInMemModelTest.java | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index b8c51eb..f483c45 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -162,7 +162,7 @@ public void setModelVals(Map word2vecMap, int vectorSize) { float val = gVecArr[j][i]; sum += val; idArr[j] = j; - dimsnArr[j++] = val; + dimsnArr[j] = val; } // Setting value in indexArr Object[] dimValWordMap = new Object[2]; diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index 57d0c96..3f4ba05 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -24,22 +24,15 @@ public void testNormalizedModel() { final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); LOG.info("Indexed Model instance created"); Map wordKeyMap = new HashMap<>(); - /* - * wordKeyMap.put("WesternOne", "ns#country-name"); - * wordKeyMap.put("Donald_O._Schnuck", - * "ontology#ConferenceVenuePlacerdf-schema#label"); wordKeyMap.put("Skyytek", - * "icaltzd#summary"); wordKeyMap.put("Sungai_Muar", - * "ontology#Presenterrdf-schema#label"); - */ wordKeyMap.put("cat", null); wordKeyMap.put("dog", null); wordKeyMap.put("airplane", null); - wordKeyMap.put("road", null); + long startTime, diff; long totTime = 0; for (String word : wordKeyMap.keySet()) { - LOG.info("Sending query for word :"+ word); + LOG.info("Sending query for word :" + word); startTime = System.currentTimeMillis(); float[] vec = memModel.getWord2VecMap().get(word); Map closestWord = memModel.getClosestSubEntry(vec, wordKeyMap.get(word)); From 7a2d194d39dc5074108046d7402d352ca555f086 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 14:10:41 +0200 Subject: [PATCH 033/114] removing logs --- .../word2vec/W2VNrmlMemModel.java | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index f483c45..54e8b04 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -56,12 +56,12 @@ public W2VNrmlMemModel(final Map word2vec, final int vectorSize this.initArrays(); // Calculate sd*3/10 and save in map // Initialize indexesArr unsorted - LOG.info("Initializing indexes and calculating standard deviation"); + // LOG.info("Initializing indexes and calculating standard deviation"); this.setModelVals(word2vec, vectorSize); - LOG.info("Sorting indexes"); + // LOG.info("Sorting indexes"); // Sort the indexes this.sortIndexes(); - LOG.info("Sorting completed"); + // LOG.info("Sorting completed"); } private void initArrays() { @@ -124,19 +124,19 @@ private Map getClosestEntry(float[] vector, String subKey) { } else { wordSet = dataSubsetProvider.fetchSubsetWords(subKey); } - LOG.info("Normalizing input vector"); + // LOG.info("Normalizing input vector"); // Normalize incoming vector vector = Word2VecMath.normalize(vector); - LOG.info("fetching nearby vectors"); + // LOG.info("fetching nearby vectors"); // Find nearby vectors Map nearbyVecs = fetchNearbyVectors(vector, wordSet, true); - LOG.info("found the following nearby words: " + nearbyVecs.keySet()); + // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); } catch (IOException e) { - LOG.error(e.getStackTrace()); + // LOG.error(e.getStackTrace()); } - LOG.info("Closest word found is " + closestVec.keySet()); + // LOG.info("Closest word found is " + closestVec.keySet()); return closestVec; } @@ -169,7 +169,7 @@ public void setModelVals(Map word2vecMap, int vectorSize) { dimValWordMap[0] = idArr; dimValWordMap[1] = dimsnArr; this.indexesArr[i] = dimValWordMap; - LOG.info("Dimension " + (i) + " index stored to memory"); + // LOG.info("Dimension " + (i) + " index stored to memory"); // mean float mean = sum / dimsnArr.length; sum = 0; @@ -213,7 +213,7 @@ private Map fetchNearbyVectors(float[] vector, Set word if (mult > EXHAUSTION_MULT) { notExhausted = false; } - LOG.info("MinMax multiplier incremented to " + mult); + // LOG.info("MinMax multiplier incremented to " + mult); } } return nearbyVecMap; @@ -237,22 +237,22 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map wordSet, Map= 0; i = finBitSet.nextSetBit(i + 1)) { // operate on index i here @@ -284,10 +284,10 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Tue, 11 Sep 2018 14:53:55 +0200 Subject: [PATCH 034/114] putting time logger --- .../word2vec/W2VNrmlMemModel.java | 30 ++++++++++++++++--- .../NormalizedInMemModelTest.java | 2 +- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 54e8b04..844d492 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -15,6 +15,8 @@ import org.apache.log4j.Logger; import org.dice_research.topicmodeling.commons.sort.AssociativeSort; +import nikit.test.TimeLogger; + /** * Class to encapsulate word2vec in-memory model and expose methods to perform * search on the model @@ -49,6 +51,9 @@ public class W2VNrmlMemModel implements GenWord2VecModel { private String[] gWordArr; private float[][] gVecArr; + // TODO : Remove this + private TimeLogger tl = new TimeLogger(); + public W2VNrmlMemModel(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; @@ -122,17 +127,23 @@ private Map getClosestEntry(float[] vector, String subKey) { if (subKey == null) { wordSet = word2vec.keySet(); } else { + tl.logTime(1); wordSet = dataSubsetProvider.fetchSubsetWords(subKey); + tl.printTime(1, "fetchSubsetWords"); } // LOG.info("Normalizing input vector"); // Normalize incoming vector vector = Word2VecMath.normalize(vector); // LOG.info("fetching nearby vectors"); // Find nearby vectors + tl.logTime(2); Map nearbyVecs = fetchNearbyVectors(vector, wordSet, true); + tl.printTime(2, "fetchNearbyVectors"); // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector + tl.logTime(3); closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); + tl.printTime(3, "findClosestVecInNearbyVecs"); } catch (IOException e) { // LOG.error(e.getStackTrace()); } @@ -201,8 +212,11 @@ private Map fetchNearbyVectors(float[] vector, Set word int mult = 1; while (mapEmpty && notExhausted) { minMaxVec = getMinMaxVec(vector, mult); + tl.printTime(2, "getMinMaxVec"); if (indxd) { + tl.logTime(4); putNearbyVecsIndxd(minMaxVec, wordSet, nearbyVecMap); + tl.printTime(4, "putNearbyVecsIndxd"); } else { putNearbyVecsNonIndxd(minMaxVec, wordSet, nearbyVecMap); } @@ -235,12 +249,14 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map= 0; i = finBitSet.nextSetBit(i + 1)) { // operate on index i here nearbyWords.add(gWordArr[i]); @@ -284,10 +301,15 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Tue, 11 Sep 2018 14:55:18 +0200 Subject: [PATCH 035/114] adding timeLogger --- src/main/java/nikit/test/TimeLogger.java | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/main/java/nikit/test/TimeLogger.java diff --git a/src/main/java/nikit/test/TimeLogger.java b/src/main/java/nikit/test/TimeLogger.java new file mode 100644 index 0000000..e0209d5 --- /dev/null +++ b/src/main/java/nikit/test/TimeLogger.java @@ -0,0 +1,23 @@ +package nikit.test; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +public class TimeLogger { + public static Logger LOG = LogManager.getLogger(TimeLogger.class); + private Map strtLog = new HashMap<>(); + + public void logTime(int id) { + strtLog.put(id, System.currentTimeMillis()); + } + + public void printTime(int id, String procName) { + long diff = System.currentTimeMillis() - strtLog.get(id); + LOG.info("Query time recorded for '" + procName + "' is " + + diff + " milliseconds."); + } + +} From 57fe7fe0e0b0e334659092391a2c4554c2753c48 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 15:11:45 +0200 Subject: [PATCH 036/114] adding logs to binary search --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 844d492..ff4a2f1 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -260,13 +260,16 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map Date: Tue, 11 Sep 2018 15:27:57 +0200 Subject: [PATCH 037/114] logging setting of bits --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index ff4a2f1..450f8fd 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -260,16 +260,13 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map Date: Tue, 11 Sep 2018 15:47:13 +0200 Subject: [PATCH 038/114] changing area values --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 450f8fd..f03ead8 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -33,7 +33,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { /** * Limit to the multiplier of area in which nearby vectors are to be looked */ - private static final int EXHAUSTION_MULT = 10; + private static final int EXHAUSTION_MULT = 100; /** * Multiplier for the standard deviation */ @@ -41,7 +41,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { /** * Divisor for the standard deviation's value */ - private static final int AREA_DIVISOR = 10; + private static final int AREA_DIVISOR = 100; private DataSubsetProvider dataSubsetProvider; /** * Contains the sorted dimensional values mapped to their words From d4205a225b2340c49b843f1d7b56eb655a41aa57 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 16:33:11 +0200 Subject: [PATCH 039/114] new test scenario --- src/main/java/nikit/test/TestConst.java | 6 ++++ .../word2vec/W2VNrmlMemModel.java | 2 -- .../NormalizedInMemModelTest.java | 32 +++++++++++++++---- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/main/java/nikit/test/TestConst.java b/src/main/java/nikit/test/TestConst.java index b6984f6..a2c26f4 100644 --- a/src/main/java/nikit/test/TestConst.java +++ b/src/main/java/nikit/test/TestConst.java @@ -10,4 +10,10 @@ public interface TestConst { public static final float[] AIRPLANE = { .05517051f, -.06537547f, -.05261928f, .03810911f, -.05230037f, -.11671913f, -.05708394f, -.05804066f, .14797178f, .00605919f, .08227742f, -.10332513f, .07876946f, -.09439579f, -.03763076f, .12118379f, .00422549f, .05485161f, .0204099f, .02519347f, -.01466962f, -.00178387f, -.01171975f, -.0749426f, .00181377f, .0085307f, -.04241433f, .02471511f, .04496557f, .01243728f, -.09120674f, -.09375799f, -.08482865f, .01570606f, .00721522f, .02806361f, -.00300966f, .04113871f, .01945319f, .09439579f, .02423676f, .02033018f, .09312017f, .0237584f, -.00693618f, -.09120674f, .03364445f, .01227783f, .1084276f, -.0408198f, -.08674208f, .04847351f, .03906583f, -.01427099f, -.0408198f, -.06633218f, .0605919f, -.02439621f, .0816396f, .02742581f, .0880177f, -.10332513f, -.08546646f, .01227783f, .03492007f, .00849084f, -.02184497f, .02168552f, -.01235756f, .01738031f, -.05453271f, -.07940727f, -.05899737f, .009607f, -.12564845f, .00446467f, .03189047f, -.10077389f, .03492007f, -.02822307f, -.0204099f, .02168552f, -.00785303f, -.04113871f, -.02017072f, -.09758484f, -.00332857f, .06346204f, .06250533f, .09056894f, -.02694745f, .00920837f, -.04943023f, -.00685645f, .0374713f, -.07845056f, .04273323f, -.05070585f, .04624118f, -.11161665f, .05102475f, -.11097884f, .02033018f, .02025045f, .10778979f, .05867847f, .0813207f, .01355345f, .06665108f, .00198319f, -.15116084f, -.03109321f, -.07653713f, -.0408198f, -.08737989f, -.01793839f, .00095671f, .09886046f, -.03794966f, .00574028f, -.03173102f, -.06888342f, .04815461f, -.12437284f, -.00360761f, .02583128f, -.04305214f, -.06920232f, .04113871f, .11225446f, .04464666f, -.01945319f, -.06856451f, .03220938f, .06633218f, -.05485161f, -.03555787f, -.08291522f, .03794966f, .00725508f, -.01738031f, -.03970364f, -.06696999f, .01044413f, .06473766f, -.00621864f, -.06314313f, -.0676078f, -.02232333f, -.05835956f, .05357599f, -.02997704f, .01514797f, -.00968673f, -.05134366f, -.06346204f, .01650332f, -.01658305f, -.02519347f, .04656009f, -.01132112f, .02822307f, -.0044846f, -.00605919f, .013394f, .02933923f, .05102475f, -.05804066f, .01323455f, -.04273323f, -.04592228f, -.04847351f, -.05006804f, -.05995408f, -.0152277f, -.05230037f, -.01044413f, -.05453271f, -.05644614f, -.0041059f, -.0542138f, -.09949827f, -.04496557f, -.08227742f, .09056894f, -.02567183f, -.01259674f, .03683349f, -.07972617f, .02886088f, .0542138f, .09120674f, -.01482907f, .11671913f, -.05325709f, .05198147f, -.01331427f, -.08355303f, -.05261928f, -.05676504f, -.03523897f, -.00920837f, -.13585341f, -.10778979f, -.00193336f, .02758526f, .03603623f, .01817757f, -.07589932f, .00534165f, -.03922528f, -.0026509f, -.00980632f, .013394f, -.04496557f, .04241433f, .01482907f, .1900672f, -.000583f, -.01419126f, -.03332554f, -.04337104f, .0538949f, .0237584f, .14860959f, -.00542138f, -.00510248f, .07717494f, -.10077389f, -.02949869f, .05580832f, -.02025045f, -.08482865f, -.13776684f, -.06569437f, .01108194f, -.05293818f, .01921401f, .04400885f, -.01299537f, .04400885f, -.01730058f, -.07876946f, -.08036399f, .00865029f, -.04496557f, -.0810018f, -.11799474f, .02646909f, -.06154861f, -.04592228f, -.01203865f, .01164002f, -.03029595f, -.01219811f, -.03587678f, .03460116f, .04241433f, .01730058f, -.08993113f, .01722085f, .0676078f, -.00508254f, .03157157f, -.04177652f, .04815461f, -.00713549f, -.08227742f, -.13266437f, .05740285f, -.01347372f, .01761949f, .08674208f, -.00617878f, -.07717494f, -.08674208f, .00158456f, -.02232333f, -.06537547f, -.07845056f, -.01435071f, -.04687899f, -.03252828f, -.03460116f, -.00035129f, -.0271069f, -.05357599f, .0234395f, -.04528447f, -.01825729f, -.10715199f, -.04560337f, .04177652f, .00143507f, .05644614f, -.00781317f, -.08355303f, .05453271f, -.00402617f, -.06665108f }; public static final float[] ROAD = { .04295584f, .10129324f, .03301409f, -.03226377f, .05627402f, -.0121927f, -.07465687f, -.10054292f, -.00001145f, .06827915f, .07465687f, .03751602f, -.09341488f, -.03676569f, -.00135409f, -.10429452f, .00717494f, -.06265175f, .04483164f, .02869975f, .0667785f, .06715367f, -.0160381f, .04070488f, .10129324f, -.0922894f, -.06640334f, -.05027146f, .0121927f, -.01378714f, -.01275545f, .02832459f, .01941454f, .07803331f, -.01735116f, -.03657812f, .04764534f, -.02663637f, .03395199f, .02532331f, -.0405173f, -.00623704f, .05627402f, .06865431f, -.0997926f, -.01660084f, .04295584f, .01575673f, .10579516f, .06752883f, .00006485f, .10054292f, -.00872247f, .02494815f, -.01111412f, -.0160381f, -.06265175f, .05439822f, .09679132f, .02926249f, -.10804612f, -.04258068f, .08703715f, -.00975416f, .06340206f, .0941652f, .00764389f, .05139694f, -.0461447f, -.0001634f, -.02757427f, -.04164278f, -.03545263f, -.07503203f, .05027146f, -.09566583f, .05627402f, .07090527f, -.00515845f, .03657812f, -.00235647f, -.00294266f, -.07203075f, -.05139694f, -.00637772f, .04576954f, -.01950833f, .01027001f, .01041069f, .05702434f, -.06640334f, -.03188861f, -.00438468f, -.06865431f, -.01613189f, -.03395199f, .02401025f, .00565085f, -.0207276f, .02419783f, .03695327f, .02119655f, .05102178f, .09829196f, .08328556f, .00016047f, .04576954f, -.00147719f, .00614325f, -.00307162f, .05289758f, -.12530349f, -.01941454f, -.09566583f, .07765815f, -.00368126f, .03020039f, -.01950833f, -.12680413f, .02644879f, .00361092f, .03095071f, -.04501922f, .00947279f, -.07015495f, .03301409f, -.11404868f, -.06640334f, .05214726f, -.08103459f, -.03545263f, -.06152626f, -.02869975f, .01041069f, -.04839566f, -.03057555f, -.06227658f, -.04183036f, -.0103169f, .15081438f, -.04483164f, .01997728f, .05965046f, -.0461447f, -.02907491f, -.06340206f, -.09904228f, -.08328556f, -.10804612f, -.03545263f, -.00459571f, -.08366071f, -.01331819f, -.02213445f, -.08328556f, -.00361092f, -.01594431f, -.05064662f, -.04483164f, .0423931f, -.0922894f, .11404868f, .08966327f, -.01519399f, -.01566294f, -.00279025f, -.01073896f, -.096041f, -.06527787f, -.12380285f, .01631947f, .00247371f, .01378714f, .00886316f, .03207619f, -.10129324f, -.01556915f, -.03001281f, .00726873f, .0405173f, -.0997926f, .01406851f, .04501922f, .03113829f, .0023213f, .01027001f, -.0498963f, -.03582779f, .07503203f, .02982523f, .14706278f, -.00858179f, -.08966327f, .13580798f, -.08628684f, -.03076313f, -.01669463f, .11029708f, -.06977979f, -.02907491f, .0094259f, .02016486f, -.03151345f, .02185308f, .04745776f, .01875801f, -.0395794f, -.01195823f, .05777466f, -.01369335f, .05477338f, .04108004f, .00806594f, -.05439822f, -.07203075f, .08065943f, .02513573f, .04464406f, -.03582779f, -.02457299f, -.01012932f, -.07053011f, .05289758f, -.01650705f, -.03338925f, -.04952114f, -.04914598f, .03732843f, .03657812f, .07653267f, .04745776f, .00492398f, .01139549f, .02757427f, -.06977979f, -.0480205f, -.03582779f, -.03789118f, .08178491f, -.00349368f, .01050448f, -.00323576f, -.03601537f, -.08966327f, .01083275f, -.06115111f, .03095071f, .06415238f, -.03751602f, -.12605381f, .00998864f, .06640334f, .00703425f, .05139694f, .03826633f, -.0941652f, .00703425f, .05139694f, -.04089246f, -.05589886f, -.07165559f, -.06490271f, .04314342f, .10204356f, .09153908f, .09078876f, -.17407431f, -.03676569f, -.10504484f, .00361092f, .07653267f, .00947279f, -.02063381f, .03245135f, .00290749f, .02738669f, .03188861f, -.05402306f, .00185235f, -.01660084f, -.03470231f, -.14106022f, .0517721f, -.05214726f, .00454882f, -.08966327f, -.15831758f, .07240591f, .00214545f, .0498963f, -.01950833f, .01350577f, -.11254805f, .00035611f, .01238029f, -.01491262f, -.04258068f, -.02588605f, .0885378f, -.08891296f }; public static final Map VEC_MAP = new HashMap<>(); + + public static final float[] CENT1 = {0.012048473f,-0.024212155f,-0.0157357f,0.02262468f,-0.024654279f,0.0457433f,0.0064664893f,0.00068743055f,0.0043786275f,0.043083176f,0.01947275f,0.031054974f,0.0050106747f,0.040616468f,-0.06849119f,-0.028724432f,0.06430485f,0.07792129f,-0.043440804f,-0.036740836f,-0.0050671464f,0.073524095f,0.02135851f,0.09556471f,-0.030524706f,-0.014748728f,-0.05861443f,0.06757884f,0.039201904f,-0.046120245f,-0.06681424f,-0.040789366f,-0.00768607f,-0.06021417f,0.0057549067f,-0.03610283f,-0.026283398f,-0.009566606f,-0.019777304f,0.008105119f,0.0007985072f,-0.0006997036f,-0.0203185f,0.05312286f,-0.027849432f,-0.13610683f,-0.08889125f,-0.024479646f,-0.022746436f,0.06843884f,-0.17998825f,0.026803045f,-0.017824588f,-0.037123863f,0.020866392f,0.10881086f,-0.04636463f,-0.06507374f,0.011943677f,-0.032842845f,-0.0379873f,0.030996734f,-0.05825527f,-0.013570298f,-0.053774796f,-0.028395433f,-0.031026928f,0.0978569f,0.016083691f,-0.022547344f,-0.024394602f,-0.07463887f,0.029379064f,-0.018185558f,-0.04939903f,-0.114169985f,0.054695312f,-0.034766104f,0.0031865097f,0.07521222f,0.0060346685f,-0.027695695f,0.024973672f,0.0049606054f,0.040194273f,-0.07115193f,0.018864758f,0.06304432f,0.046690594f,0.018227981f,-0.011774269f,-0.078641266f,-0.008184901f,0.008990256f,0.027709894f,0.025042433f,0.08738773f,-0.025084957f,0.05794878f,-0.0002867386f,0.04075641f,-0.06282663f,-0.042190086f,-0.055376306f,-0.0054030158f,0.06463771f,-0.041824155f,-0.015070578f,-0.022487497f,-0.00033046876f,-0.01209619f,-0.009023646f,0.007929764f,-0.041775007f,0.09193121f,0.024664393f,-0.013769424f,-0.014402794f,0.09445259f,-0.014983056f,-0.005287673f,0.023613283f,-0.07623077f,-0.0038938622f,-0.0034399356f,0.017054306f,-0.1441689f,0.08759932f,-0.007884077f,0.022168875f,0.010296094f,-0.046407104f,-0.009118033f,-0.028891914f,-0.031117633f,-0.02871687f,-0.0046469797f,-0.04992117f,-0.019451704f,0.069421805f,0.055781126f,-0.016452271f,-0.03818504f,0.000106584404f,-0.056315906f,-0.048518386f,0.0724545f,0.050923787f,0.031283163f,-0.01384957f,0.061167654f,-0.08324516f,0.035242077f,0.022417426f,0.05430028f,-0.046469364f,-0.04185331f,-0.042400487f,-0.04921055f,-0.091220774f,0.0027087114f,0.034520615f,0.023072524f,-0.0077218637f,-0.0011392183f,0.05464266f,0.035545517f,0.022161836f,-0.05659765f,0.07609063f,-0.02870641f,-0.04740215f,-0.011478219f,-0.09368904f,0.01599188f,0.039735265f,0.024059445f,-0.116237536f,-0.060398255f,-0.0711672f,-0.07546678f,-0.049090724f,0.009575134f,-0.010417352f,-0.023362938f,-0.0038178598f,0.0029536507f,0.006134175f,0.0178646f,-0.004190245f,-0.066817544f,-0.044505805f,-0.013706623f,-0.03433649f,-0.108880125f,-0.029211186f,0.023804974f,-0.10099187f,0.041438986f,-0.10345428f,0.035116937f,0.07459071f,-0.045708906f,0.011743813f,-0.042854473f,0.048905853f,-0.03171399f,-0.026989073f,0.024468027f,-0.021753585f,-0.05618831f,-0.0038483562f,-0.028999245f,0.07630604f,-0.071918495f,0.04300855f,0.07065186f,0.05587815f,0.010918163f,-0.03475483f,-0.06843472f,0.054367058f,-0.06401773f,-0.033731382f,-0.049490917f,-0.036138013f,0.016919916f,0.022843298f,-0.030318176f,-0.04193131f,-0.027653892f,0.063327186f,-0.0016599294f,0.012881171f,-0.0027521544f,-0.05379905f,0.021017954f,-0.006927093f,0.044466246f,0.009862491f,0.0031623652f,-0.021989003f,0.013225218f,-0.045097247f,0.014426014f,-0.06732055f,-0.0735208f,0.15174854f,-0.05046516f,-0.036308214f,-0.04791744f,0.07355565f,-0.031534396f,0.04362686f,0.043246135f,-0.00998336f,-0.06007801f,-0.052771896f,-0.0041799587f,-0.060795482f,0.079710945f,-0.0035539696f,0.0015720833f,-0.03142349f,-0.012874763f,0.05546811f,-0.04548275f,-0.048887257f,-0.071003325f,-0.036039308f,-0.008604928f,0.07330001f,-0.025046118f,0.013428866f,0.049462494f,0.019332785f,-0.01788359f,-0.0035423029f,0.0026713153f,-0.0043865293f,-0.03831189f,0.04390023f,0.018204223f,0.03912164f,0.038038414f,0.021346357f,-0.020249007f,0.033192717f,0.026151495f,0.1017491f,-0.021874143f,-0.0034825318f,-0.08193575f,0.07040302f,0.0150283305f,-0.04449221f,-0.0065290653f,-0.03790575f,0.016539441f,0.03268941f}; + public static final float[] CENT2 = {0.0011224479f,-0.001863233f,0.03263754f,0.066455856f,-0.07758351f,0.016661314f,0.05407381f,-0.03671384f,0.064221285f,0.03266836f,-0.031561505f,-0.04960479f,-0.015367692f,0.0031025135f,-0.08196445f,0.05449307f,0.012373786f,0.06530031f,-0.032638453f,-0.059603374f,-0.014447233f,0.046425983f,-0.038042918f,0.08232817f,0.012978931f,-0.03651421f,-0.061884727f,0.07259438f,0.0036989967f,-0.08266768f,-0.01822925f,-0.048035406f,-0.039166685f,-0.013688043f,0.01926087f,-0.030591851f,-0.009005229f,-0.033160154f,0.07743789f,0.0011018296f,0.05371014f,0.04185595f,0.053121638f,0.05598798f,-0.019086173f,-0.111519024f,-0.0868146f,-0.010070587f,-0.01108926f,0.05708204f,-0.02231529f,-0.008074704f,-0.052298963f,-0.059062336f,-0.022169085f,0.056413084f,-0.020609455f,-0.092539705f,0.04031809f,-0.044353258f,-0.009878872f,0.020261401f,-0.06894275f,-0.062330727f,-0.04498814f,0.003066458f,-0.052467234f,0.10177743f,0.0021486194f,0.0154087385f,0.004489649f,-0.0095335385f,0.09414281f,-0.03148624f,-0.0395236f,-0.092772774f,0.08917366f,0.025269797f,0.012749687f,0.060295627f,0.02299269f,-0.0033834197f,-0.001978396f,0.022860922f,0.044535223f,-0.0662878f,-0.05360099f,0.081459686f,-0.006393052f,0.06541517f,0.054051742f,-0.08151124f,-0.076532595f,-0.048224714f,0.03294888f,-0.035631858f,0.009274301f,-0.03200488f,0.031193743f,-0.0025883138f,0.022473684f,-0.059758842f,-0.027705202f,0.019945115f,-0.03305572f,-0.009211555f,-0.046354476f,0.016504511f,0.051720116f,-0.015701503f,-0.039897718f,-0.0034910303f,-0.043169603f,-0.0032664624f,0.0973586f,0.045616977f,0.0012771824f,-0.025580116f,0.14779237f,0.027200254f,-0.063536674f,-0.03377593f,-0.05288367f,0.03317395f,0.025131427f,0.015098988f,-0.09985905f,0.03307604f,0.00317302f,0.040220052f,-0.020050881f,-0.03631977f,-0.043799937f,-0.009416937f,0.010302052f,-0.030561732f,0.057100028f,-0.030730985f,-0.023588074f,0.029835591f,0.026301857f,-0.03256527f,-0.014552586f,0.04447647f,0.013681483f,-0.01690498f,0.0043507633f,-0.039237954f,-0.029638784f,-0.016659573f,0.061713964f,-0.009091977f,-0.04115817f,0.01399963f,0.015765075f,-0.06176697f,-0.06059548f,-0.01961178f,-0.0689507f,-0.05466f,0.008955869f,0.042786963f,-0.027474424f,0.01438025f,-0.0025842553f,-0.00898309f,0.031604897f,-0.024786215f,-0.023709876f,-0.022238966f,-0.06986748f,-0.0328827f,-0.022661535f,-0.079846226f,0.06432295f,-0.029166747f,0.069291405f,-0.11324385f,-0.06973677f,-0.016149884f,-0.09459782f,-0.05430912f,-0.0012159615f,-0.014952947f,0.011775484f,-0.004740334f,-0.039360583f,0.020428661f,0.048237324f,0.028093142f,-0.048959043f,-0.05492418f,0.013712874f,-0.04553646f,-0.09681428f,0.0022662391f,0.01593046f,-0.07242234f,-0.018899413f,-0.13065086f,0.031014368f,0.06929672f,-0.027318746f,0.03434523f,-0.051217493f,0.013183998f,-0.001281976f,-0.04754092f,0.019721871f,-0.01698455f,-0.026823288f,0.024809958f,-0.019565452f,0.07769384f,-0.051748566f,0.011201868f,0.10550809f,0.044799376f,-0.06713498f,0.005978165f,-0.024218574f,0.0074360454f,-0.01333945f,-0.02607493f,-0.029164964f,-0.023122739f,0.0037433826f,0.013671214f,0.02957095f,-0.01917583f,-0.010799792f,-0.018916888f,0.036433138f,-0.020096151f,0.014759338f,-0.033712663f,0.017735727f,0.01159687f,0.065676205f,0.005510972f,0.014117873f,-0.012877099f,-0.0024081317f,-0.051336925f,0.0050226017f,-0.038583867f,-0.029694732f,0.11698223f,-0.010932898f,-0.059856724f,-0.0046913885f,0.057615787f,-0.023230674f,0.023875365f,0.041627854f,-0.062831655f,-0.021146324f,-0.020344304f,-0.07484446f,-0.045819506f,0.03742025f,-0.030994968f,-0.0254251f,0.023674546f,0.021253662f,0.0816581f,-0.005607605f,-0.041836344f,-0.12665127f,-0.01622519f,0.029598342f,0.045117233f,0.004459488f,0.048652746f,0.048366934f,-0.031923924f,-0.042036504f,-0.05557346f,-0.032806776f,-0.0013186494f,-0.0385013f,0.030741462f,0.029661812f,0.06660657f,0.018905401f,-0.0056354366f,-0.030359453f,-0.004873472f,0.05220532f,0.05874947f,-0.09530542f,0.034675412f,-0.05020458f,0.024449749f,-0.02171062f,-0.002091583f,0.027687436f,-0.034217242f,-0.0088846795f,-0.011294499f}; + public static final float[] CENT3 = {0.0045599197f,-0.016655529f,0.03039212f,0.072564f,-0.012314448f,-0.017728124f,-0.014679423f,-0.03531948f,-0.021118527f,0.040627014f,-0.029296335f,0.06789777f,0.013045308f,0.00410344f,-0.0684175f,0.0019473588f,-0.012823016f,0.056388646f,0.039152887f,0.021038044f,-0.00095268164f,-0.015722638f,0.016130328f,0.06405417f,0.035486434f,0.0012872473f,-0.04126779f,0.042538967f,-0.057445608f,0.020221751f,0.057181627f,-0.0595517f,-0.044382412f,-0.06861117f,0.04883461f,-0.084817395f,-0.058351003f,0.024775567f,0.018097626f,0.018196356f,-0.034807358f,0.013536167f,-0.027318547f,0.0952005f,-0.05871476f,-0.019638764f,-0.03597215f,-0.049303804f,-0.0021026363f,0.094765685f,-0.088323735f,0.07100553f,-0.01156632f,-0.051242463f,-0.08165367f,0.078404605f,-0.13566828f,-0.07969416f,-0.061671786f,-0.13423134f,-0.08536931f,0.00008282155f,-0.049163125f,-0.024533534f,-0.043691263f,-0.08095603f,0.010672941f,0.019639365f,0.0013429034f,0.0567831f,-0.0967779f,-0.019170664f,0.05618282f,0.011713989f,-0.047731653f,-0.04699047f,0.07399622f,-0.047449544f,0.03590385f,0.01322831f,-0.0039109844f,-0.07518891f,0.025434965f,-0.05743741f,0.004045694f,-0.04617371f,0.033967968f,0.0711757f,-0.009795415f,0.019290686f,-0.037020046f,-0.08178564f,-0.097990505f,-0.0004118288f,-0.09405116f,0.024903592f,0.10294186f,-0.056358308f,0.008722311f,0.003291616f,-0.018818645f,-0.09692368f,-0.06916255f,0.04351524f,0.06276146f,0.015949301f,-0.011483621f,-0.006140696f,-0.018099293f,-0.00995063f,-0.02339901f,-0.024050366f,0.07546873f,-0.028504811f,0.04963921f,-0.0076172072f,-0.02817498f,-0.060157653f,0.17303619f,0.08201302f,-0.10454881f,-0.037780367f,-0.10166054f,-0.04218956f,0.036145452f,-0.05909255f,-0.1310377f,0.042928588f,-0.074216254f,0.06736096f,-0.015906654f,0.017667266f,-0.10083889f,-0.0981416f,-0.041044645f,-0.044278126f,-0.043435194f,0.03941682f,-0.06488468f,0.05151011f,0.06728203f,-0.03063651f,-0.008622926f,-0.031948224f,-0.010434174f,0.068239905f,0.060161065f,0.051712357f,-0.026403125f,-0.02375673f,0.087121055f,-0.064310744f,-0.018927738f,0.082358375f,0.055305336f,-0.049550034f,-0.0042584497f,0.041982714f,-0.021151712f,-0.02128548f,0.026912898f,0.0756465f,0.06861557f,-0.008753623f,0.09500656f,-0.016722199f,0.045179248f,0.06433635f,-0.050378326f,0.057967953f,-0.06901692f,-0.056074627f,0.020965451f,-0.046926398f,0.035873193f,-0.04375493f,0.029856201f,-0.090699345f,-0.03156256f,-0.07428955f,0.0077958796f,-0.04266842f,0.015002862f,-0.070534125f,0.053053115f,-0.037462384f,0.050507464f,-0.0070602717f,-0.0030573292f,-0.029228054f,-0.041755334f,0.00685498f,-0.004669522f,0.009998457f,-0.1278749f,0.034629613f,0.06283661f,-0.12521815f,0.013474653f,-0.0799926f,0.026823737f,0.061804105f,0.010250122f,-0.029549334f,-0.031165676f,-0.048067544f,0.09705487f,-0.011098428f,-0.020320965f,-0.04148633f,-0.04995002f,-0.07669439f,-0.013797552f,0.06545086f,-0.11271107f,0.020970607f,0.03662167f,0.013129356f,0.06338316f,-0.022021094f,-0.0111510875f,-0.001241367f,-0.02091455f,-0.02392863f,-0.07325383f,0.0030011064f,0.0035147914f,0.018672392f,-0.006221267f,-0.08329453f,-0.02850241f,0.058879375f,-0.060902014f,-0.031377528f,0.009679124f,-0.028970672f,-0.018530989f,0.010012114f,0.016068267f,-0.007894194f,0.081875786f,-0.11858768f,-0.017476896f,-0.020515822f,-0.03861367f,0.009478026f,-0.0664049f,0.073679835f,-0.048740324f,-0.07530314f,-0.027699525f,0.008710347f,-0.036728617f,0.05363878f,0.066634215f,0.03055375f,-0.015267707f,-0.0062362403f,-0.048134644f,-0.027497273f,0.106398955f,0.07820926f,0.022096202f,0.017608056f,0.058401976f,0.054751392f,-0.06744572f,-0.08412845f,-0.00500415f,0.018115649f,-0.005963796f,-0.054437798f,-0.042635784f,0.031453032f,0.06092953f,-0.019631077f,-0.008254761f,0.017513359f,-0.031257857f,0.02838266f,-0.03598625f,0.028826846f,0.024132045f,0.059701487f,-0.01007231f,0.024742646f,0.052117206f,0.030826205f,-0.013547746f,0.062779665f,-0.027365256f,0.0034637607f,-0.1755821f,0.10319762f,0.011423345f,0.0038134188f,-0.034051597f,0.0043514566f,0.02310374f,0.022277541f}; + public static final float[] CENT4 = {0.010704855f,-0.0670643f,0.021093868f,0.0065769083f,-0.012839604f,0.034078848f,0.032994095f,0.01122795f,0.009345008f,0.03385807f,0.051176053f,0.06273422f,0.03913983f,0.0322756f,-0.08576432f,-0.051884178f,0.08340187f,0.09360067f,-0.057922807f,-0.035607494f,-0.030732682f,0.11356546f,0.014922392f,0.12317613f,-0.032346856f,-0.025270259f,-0.036590725f,0.07295593f,0.10391724f,-0.089836046f,-0.07472955f,-0.03303165f,-0.02964042f,-0.07523887f,0.021307195f,-0.035367355f,-0.082960255f,-0.046930898f,-0.025115374f,0.012069998f,-0.057034306f,0.01572307f,-0.059006996f,0.077960856f,-0.030932972f,-0.15429792f,-0.12866907f,-0.021584965f,-0.029835228f,0.11236409f,-0.191379f,0.04282179f,-0.013448678f,-0.023948299f,0.0037557732f,0.16706619f,-0.05362709f,-0.07566937f,0.014789757f,-0.08677718f,-0.061279073f,0.020474998f,-0.09120756f,0.011759666f,-0.1306745f,-0.018631075f,-0.0186695f,0.11664367f,0.026543181f,-0.015455006f,-0.06260978f,-0.10498881f,0.036383174f,0.013893794f,-0.013775872f,-0.15882859f,0.09005678f,-0.018552678f,0.016706819f,0.1094073f,0.02589461f,-0.02969979f,0.029911079f,0.014708954f,0.027515892f,-0.07330767f,0.060595468f,0.04038546f,0.02411474f,0.026085714f,0.010706781f,-0.095420465f,-0.029445278f,-0.030828753f,0.023032123f,0.03859944f,0.09770636f,-0.017120324f,0.08896674f,0.011373428f,0.045112073f,-0.081473045f,-0.059673958f,-0.05634908f,-0.009259863f,0.09190117f,-0.05638226f,-0.02564247f,-0.037803873f,-0.0054110927f,0.0017864634f,-0.03682105f,-0.039525744f,-0.03141533f,0.11275712f,0.0004839851f,-0.002380149f,-0.026794951f,0.14561033f,0.02361305f,-0.013039672f,0.028168613f,-0.106352076f,-0.0012364064f,-0.047067884f,0.008281522f,-0.2103559f,0.08604091f,-0.0060125557f,0.04257702f,0.028519936f,-0.05306307f,-0.045899067f,-0.05791303f,-0.05610731f,-0.03951493f,-0.021780033f,-0.0448661f,-0.06279184f,0.07662312f,0.09883195f,-0.024417933f,-0.03946856f,-0.0005570199f,-0.09816524f,-0.041022364f,0.10536386f,0.062715694f,0.060231254f,0.0005412426f,0.06900588f,-0.08660474f,0.04899578f,0.068696626f,0.07466629f,-0.012135404f,-0.03598778f,-0.07057797f,-0.0666644f,-0.09265552f,0.0221408f,0.052452605f,-0.016891258f,-0.032897357f,0.036327623f,0.0872705f,0.06578879f,0.06010996f,-0.09918983f,0.094453976f,-0.015436117f,-0.08050265f,0.018843891f,-0.09102668f,0.059967782f,0.031252038f,0.02346507f,-0.12187432f,-0.07320767f,-0.108193256f,-0.07079308f,-0.06311027f,0.013159557f,-0.010688115f,-0.025490178f,-0.042861346f,0.02347437f,0.035059363f,0.03562216f,-0.05843507f,-0.053761233f,-0.041519757f,-0.026732028f,-0.026059309f,-0.19108123f,-0.041591886f,0.0372987f,-0.13595153f,0.05862723f,-0.16445152f,0.028238056f,0.09991044f,-0.025273887f,-0.0014634734f,-0.066377215f,0.04209974f,0.0018236845f,-0.026547626f,0.06924512f,-0.050397336f,-0.052222244f,-0.02491701f,-0.02636067f,0.06482354f,-0.059252396f,0.09828912f,0.081354156f,0.076036766f,0.042895827f,-0.01686822f,-0.08791581f,0.04457833f,-0.034038357f,-0.047805637f,-0.08799373f,-0.0638367f,0.016854111f,0.021876955f,-0.055394217f,-0.047526322f,-0.03587267f,0.06970263f,-0.04866665f,0.0011970974f,0.020647142f,-0.04771379f,-0.0010832258f,-0.020835506f,0.03243466f,0.038084455f,0.019542176f,-0.072884016f,-0.016455641f,-0.0345566f,0.0043037934f,-0.0975967f,-0.04744776f,0.14361002f,-0.06910765f,-0.021607704f,-0.092588946f,0.080095164f,-0.08491559f,0.041206434f,0.038057495f,0.023832599f,-0.06784454f,-0.08696313f,-0.028889777f,-0.067823686f,0.09943456f,0.009428746f,0.0066630538f,-0.0024553319f,-0.033138607f,0.0723644f,-0.045171943f,-0.085606396f,-0.10135047f,-0.037499584f,0.020281449f,0.055990275f,-0.01740628f,-0.019308072f,0.038056754f,0.0039541004f,-0.008331595f,0.048621465f,0.0032122715f,-0.030135294f,-0.039464857f,0.04360947f,0.048852123f,0.06722341f,0.020990908f,0.021438524f,-0.045839995f,0.009275418f,0.007442289f,0.12134915f,0.00068679365f,-0.07127784f,-0.13177507f,0.11126146f,0.039484635f,-0.036843125f,0.0511725f,-0.0598986f,0.045724258f,0.036662277f}; + public static final float[] CENT5 = {0.022839382f,-0.03359495f,-0.0066703307f,0.022163201f,-0.02408755f,0.06442031f,0.0045688753f,0.010488761f,-0.00044476232f,0.05044493f,0.033675246f,0.049774945f,0.008621748f,0.042746447f,-0.07700397f,-0.0451958f,0.07938519f,0.08385187f,-0.061964415f,-0.035912927f,-0.003467658f,0.08524248f,0.014303198f,0.11600093f,-0.04287127f,-0.022873333f,-0.06825271f,0.08194304f,0.04793892f,-0.0527855f,-0.07581738f,-0.055221103f,-0.012851007f,-0.07099417f,0.014147388f,-0.037063416f,-0.04454149f,-0.0095205465f,-0.023536751f,0.014266046f,-0.014146757f,0.0072805923f,-0.020179052f,0.059608992f,-0.03419748f,-0.15566263f,-0.1015401f,-0.035094157f,-0.029369399f,0.08147777f,-0.21588254f,0.029588284f,-0.019559735f,-0.047735233f,0.016593328f,0.12735985f,-0.05213097f,-0.06873328f,0.014142882f,-0.038571272f,-0.042302243f,0.03880131f,-0.06659886f,-0.0073683104f,-0.06918143f,-0.025084415f,-0.038701344f,0.11273736f,0.012830569f,-0.033017233f,-0.031226004f,-0.09289884f,0.032232117f,-0.017454978f,-0.039876193f,-0.13845666f,0.06345957f,-0.03838888f,-0.00023332387f,0.088637434f,0.015690532f,-0.024366027f,0.04522402f,0.009410225f,0.04446052f,-0.0868467f,0.033179335f,0.063996814f,0.051150206f,0.014990753f,-0.0046050698f,-0.08338682f,-0.0021660307f,0.016665481f,0.042485926f,0.041599285f,0.099288955f,-0.027209742f,0.06921274f,0.0077553103f,0.052449062f,-0.07543572f,-0.056188744f,-0.0680761f,0.00052976294f,0.08570987f,-0.057802778f,-0.015359663f,-0.037239462f,0.0069555137f,-0.013637077f,-0.012860835f,0.007908063f,-0.04955145f,0.10830962f,0.028258102f,-0.011194532f,-0.015489036f,0.11507248f,-0.009770631f,0.009489947f,0.03191821f,-0.09354445f,-0.0064501795f,-0.015091094f,0.025286112f,-0.16656414f,0.10895973f,-0.012308648f,0.019734804f,0.015983343f,-0.05980111f,-0.005718719f,-0.03080548f,-0.039776705f,-0.039912112f,-0.005679874f,-0.05862779f,-0.027916564f,0.074065804f,0.07143131f,-0.0075149457f,-0.041759312f,0.0010628969f,-0.07478614f,-0.059406433f,0.085929275f,0.061952654f,0.04479289f,-0.004131101f,0.06671996f,-0.097041205f,0.03594322f,0.022973606f,0.070404366f,-0.050781988f,-0.048104316f,-0.04680078f,-0.05829367f,-0.101707675f,0.011345205f,0.042796608f,0.02355706f,-0.020254193f,-0.002402296f,0.08201183f,0.040505495f,0.034578394f,-0.072379276f,0.09444956f,-0.02363955f,-0.052523956f,-0.014398527f,-0.100167096f,0.017307185f,0.048116777f,0.021709686f,-0.12685254f,-0.07343213f,-0.078357436f,-0.08621896f,-0.06159465f,0.008424792f,-0.0012125799f,-0.027283844f,-0.0044014268f,0.008536106f,0.009195816f,0.018210959f,-0.017518131f,-0.067163736f,-0.046488773f,-0.01052375f,-0.03658865f,-0.1409519f,-0.04512672f,0.024262238f,-0.12324837f,0.051286887f,-0.113918826f,0.049287096f,0.09406128f,-0.056964476f,0.014383443f,-0.05087614f,0.07044303f,-0.034645855f,-0.025743358f,0.036189727f,-0.03501847f,-0.063039966f,-0.012151702f,-0.04026078f,0.07979788f,-0.08106726f,0.057337496f,0.07416963f,0.07374935f,0.025263727f,-0.02613804f,-0.091119975f,0.05832003f,-0.07452149f,-0.04719072f,-0.059752468f,-0.048668176f,0.02071755f,0.03147056f,-0.04182357f,-0.043808248f,-0.031065056f,0.07393967f,-0.013676625f,0.013433168f,-0.0050358544f,-0.06556628f,0.015460819f,-0.008181974f,0.05046547f,0.012215743f,-0.001954195f,-0.032166645f,0.010198491f,-0.05324453f,0.016410602f,-0.07528097f,-0.08296055f,0.17857912f,-0.06782797f,-0.037963327f,-0.06631536f,0.0805591f,-0.04899296f,0.042215392f,0.043943767f,-0.0076278094f,-0.07121953f,-0.06301679f,-0.009259667f,-0.075178064f,0.10176182f,-0.0027576364f,-0.0014918603f,-0.03900084f,-0.013977311f,0.06704828f,-0.044342674f,-0.056591447f,-0.08333293f,-0.046077147f,-0.00720852f,0.07769522f,-0.033034276f,0.010620653f,0.051043287f,0.024553757f,-0.016105993f,0.0055337967f,0.0018331109f,-0.009095089f,-0.03252504f,0.053799663f,0.022690354f,0.04455544f,0.04527506f,0.025460564f,-0.019688746f,0.044049434f,0.02170557f,0.11448796f,-0.0138435885f,-0.011530503f,-0.093710974f,0.078288384f,0.024630895f,-0.044313632f,-0.008687107f,-0.05694564f,0.020266589f,0.04716828f}; } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index f03ead8..b0d04a6 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -278,11 +278,9 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map closestWord = memModel.getClosestSubEntry( centroids[i], null); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + LOG.info(closestWord.keySet()); + LOG.info("Query time recorded for Centroid " + (i+1) + " is " + + diff + " milliseconds."); + } + + LOG.info("Average query time: " + (totTime / centroids.length) + " milliseconds"); - public static void main(String[] args) { - LOG.info("Starting test!"); - NormalizedInMemModelTest inMemModelTest = new NormalizedInMemModelTest(); - inMemModelTest.testNormalizedModel(); - LOG.info("Test finished!"); } } From a47436ecbea1e08d763b534c8b3a6bf78d30a351 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 16:47:57 +0200 Subject: [PATCH 040/114] setting area to 1/30th --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index b0d04a6..9c12c75 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -33,7 +33,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { /** * Limit to the multiplier of area in which nearby vectors are to be looked */ - private static final int EXHAUSTION_MULT = 100; + private static final int EXHAUSTION_MULT = 30; /** * Multiplier for the standard deviation */ @@ -41,7 +41,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { /** * Divisor for the standard deviation's value */ - private static final int AREA_DIVISOR = 100; + private static final int AREA_DIVISOR = 30; private DataSubsetProvider dataSubsetProvider; /** * Contains the sorted dimensional values mapped to their words From 338bfadd486d1b17c6a304af010332be50266ebb Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 17:03:12 +0200 Subject: [PATCH 041/114] setting area to 1/20th --- .../org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 9c12c75..79a56dc 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -33,7 +33,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { /** * Limit to the multiplier of area in which nearby vectors are to be looked */ - private static final int EXHAUSTION_MULT = 30; + private static final int EXHAUSTION_MULT = 20; /** * Multiplier for the standard deviation */ @@ -41,7 +41,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { /** * Divisor for the standard deviation's value */ - private static final int AREA_DIVISOR = 30; + private static final int AREA_DIVISOR = 20; private DataSubsetProvider dataSubsetProvider; /** * Contains the sorted dimensional values mapped to their words From b1f753b782aff9b578df256aebd0b4c6e4caaacb Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 11 Sep 2018 18:17:20 +0200 Subject: [PATCH 042/114] sdmap logic change and new time logs --- .../word2vec/W2VNrmlMemModel.java | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java index 79a56dc..b1dc808 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java @@ -29,7 +29,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { private Map word2vec; private int vectorSize; - private Map sdMap; + private float[] sdArr; /** * Limit to the multiplier of area in which nearby vectors are to be looked */ @@ -161,7 +161,7 @@ private Map getClosestEntry(float[] vector, String subKey) { * - size of each vector */ public void setModelVals(Map word2vecMap, int vectorSize) { - Map resMap = new HashMap<>(); + float[] resMap = new float[vectorSize]; int totSize = word2vecMap.size(); // loop all dimensions for (int i = 0; i < vectorSize; i++) { @@ -189,10 +189,10 @@ public void setModelVals(Map word2vecMap, int vectorSize) { } float variance = sum / dimsnArr.length; Double sd = Math.sqrt(variance); - resMap.put(i, sd.floatValue() * SIGMA_MULT / AREA_DIVISOR); + resMap[i] = sd.floatValue() * SIGMA_MULT / AREA_DIVISOR; } // Set as sdMap - this.sdMap = resMap; + this.sdArr = resMap; } /** @@ -208,10 +208,14 @@ private Map fetchNearbyVectors(float[] vector, Set word Map nearbyVecMap = new HashMap<>(); boolean mapEmpty = true; boolean notExhausted = true; - float[][] minMaxVec; + float[][] minMaxVec = getMinMaxVec(vector); int mult = 1; while (mapEmpty && notExhausted) { - minMaxVec = getMinMaxVec(vector, mult); + if (mult > 1) { + tl.logTime(8); + incrementMinMaxVec(minMaxVec); + tl.printTime(8, "incrementMinMaxVec"); + } tl.printTime(2, "getMinMaxVec"); if (indxd) { tl.logTime(4); @@ -277,10 +281,12 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map Date: Wed, 12 Sep 2018 16:13:05 +0200 Subject: [PATCH 043/114] new less restrictive model added --- .../subset/DataSubsetGenerator.java | 4 +- .../tool/ModelStatsWriter.java | 4 +- ...emModel.java => W2VNrmlMemModelIndxd.java} | 56 +-- .../word2vec/W2VNrmlMemModelIndxdLR.java | 344 ++++++++++++++++++ .../word2vec/W2VNrmlMemModelNonIndxd.java | 308 ++++++++++++++++ .../word2vec/Word2VecFactory.java | 16 +- .../NormalizedInMemModelTest.java | 4 +- .../NrmlzdMdlPrfmncTester.java | 44 +++ 8 files changed, 723 insertions(+), 57 deletions(-) rename src/main/java/org/aksw/word2vecrestful/word2vec/{W2VNrmlMemModel.java => W2VNrmlMemModelIndxd.java} (85%) create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java create mode 100644 src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java index 46ccf7e..8595239 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -11,7 +11,7 @@ import java.util.Map.Entry; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModel; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelNonIndxd; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.apache.commons.io.output.FileWriterWithEncoding; @@ -139,7 +139,7 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { File subsetConfig = new File("D:\\Nikit\\DICE-Group\\word2vec-dump\\subsetconfig2.json"); - W2VNrmlMemModel model = Word2VecFactory.getNormalizedBinModel(); + W2VNrmlMemModelNonIndxd model = Word2VecFactory.getNormalizedBinNonIndxdModel(); generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), model.getWord2VecMap(), model.getVectorSize()); } diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java index 7d51db9..774942d 100644 --- a/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java @@ -9,7 +9,7 @@ import java.util.Map.Entry; import java.util.Set; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModel; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelNonIndxd; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -87,7 +87,7 @@ private static void writeValues(int index, float min, float max, float mean, flo */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { // Get the normalized model - W2VNrmlMemModel model = Word2VecFactory.getNormalizedBinModel(); + W2VNrmlMemModelNonIndxd model = Word2VecFactory.getNormalizedBinNonIndxdModel(); writeModelStats(model.getWord2VecMap(), model.getVectorSize(), new File("data/normal/stat/normal-model-stats.csv")); } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java similarity index 85% rename from src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java rename to src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java index b1dc808..be141a6 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java @@ -24,7 +24,7 @@ * @author Nikit * */ -public class W2VNrmlMemModel implements GenWord2VecModel { +public class W2VNrmlMemModelIndxd implements GenWord2VecModel { public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); private Map word2vec; @@ -54,7 +54,7 @@ public class W2VNrmlMemModel implements GenWord2VecModel { // TODO : Remove this private TimeLogger tl = new TimeLogger(); - public W2VNrmlMemModel(final Map word2vec, final int vectorSize) { + public W2VNrmlMemModelIndxd(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; this.dataSubsetProvider = new DataSubsetProvider(); @@ -137,7 +137,7 @@ private Map getClosestEntry(float[] vector, String subKey) { // LOG.info("fetching nearby vectors"); // Find nearby vectors tl.logTime(2); - Map nearbyVecs = fetchNearbyVectors(vector, wordSet, true); + Map nearbyVecs = fetchNearbyVectors(vector, wordSet); tl.printTime(2, "fetchNearbyVectors"); // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector @@ -204,11 +204,12 @@ public void setModelVals(Map word2vecMap, int vectorSize) { * - word set to look into for nearby vectors * @return - mapping of nearby words alongwith with their vector values */ - private Map fetchNearbyVectors(float[] vector, Set wordSet, boolean indxd) { + private Map fetchNearbyVectors(float[] vector, Set wordSet) { Map nearbyVecMap = new HashMap<>(); boolean mapEmpty = true; boolean notExhausted = true; float[][] minMaxVec = getMinMaxVec(vector); + tl.printTime(2, "getMinMaxVec"); int mult = 1; while (mapEmpty && notExhausted) { if (mult > 1) { @@ -216,14 +217,10 @@ private Map fetchNearbyVectors(float[] vector, Set word incrementMinMaxVec(minMaxVec); tl.printTime(8, "incrementMinMaxVec"); } - tl.printTime(2, "getMinMaxVec"); - if (indxd) { - tl.logTime(4); - putNearbyVecsIndxd(minMaxVec, wordSet, nearbyVecMap); - tl.printTime(4, "putNearbyVecsIndxd"); - } else { - putNearbyVecsNonIndxd(minMaxVec, wordSet, nearbyVecMap); - } + + tl.logTime(4); + putNearbyVecsIndxd(minMaxVec, wordSet, nearbyVecMap); + tl.printTime(4, "putNearbyVecsIndxd"); if (nearbyVecMap.size() > 0) { mapEmpty = false; } else { @@ -237,15 +234,6 @@ private Map fetchNearbyVectors(float[] vector, Set word return nearbyVecMap; } - private void putNearbyVecsNonIndxd(float[][] minMaxVec, Set wordSet, Map nearbyVecMap) { - for (String word : wordSet) { - float[] entryVec = word2vec.get(word); - if (isVectorInArea(entryVec, minMaxVec)) { - nearbyVecMap.put(word, entryVec); - } - } - } - private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map nearbyVecMap) { // init a set to hold words Set nearbyWords = new HashSet<>(); @@ -281,12 +269,12 @@ private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map wordSet, Map maxVec[i]) { - isValid = false; - break; - } - } - return isValid; - } - /** * Method to generate two vectors from a given vector by adding and subtracting * value in sdMap from the given vector diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java new file mode 100644 index 0000000..2f7526f --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -0,0 +1,344 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.aksw.word2vecrestful.subset.DataSubsetProvider; +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.dice_research.topicmodeling.commons.sort.AssociativeSort; + +import nikit.test.TimeLogger; + +/** + * Class to encapsulate word2vec in-memory model and expose methods to perform + * search on the model + * + * @author Nikit + * + */ +public class W2VNrmlMemModelIndxdLR implements GenWord2VecModel { + public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); + + private Map word2vec; + private int vectorSize; + private float[] sdArr; + /** + * Multiplier for the standard deviation + */ + private int sigmaMult = 3; + /** + * Divisor for the standard deviation's value + */ + private int areaDivisor = 10; + private DataSubsetProvider dataSubsetProvider; + /** + * Contains the sorted dimensional values mapped to their words + */ + private Object[][] indexesArr; + + private String[] gWordArr; + private float[][] gVecArr; + + // TODO : Remove this + private TimeLogger tl = new TimeLogger(); + + public W2VNrmlMemModelIndxdLR(final Map word2vec, final int vectorSize) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + this.dataSubsetProvider = new DataSubsetProvider(); + this.initArrays(); + // Calculate sd*3/10 and save in map + // Initialize indexesArr unsorted + // LOG.info("Initializing indexes and calculating standard deviation"); + this.setModelVals(word2vec, vectorSize); + // LOG.info("Sorting indexes"); + // Sort the indexes + this.sortIndexes(); + // LOG.info("Sorting completed"); + } + + public W2VNrmlMemModelIndxdLR(final Map word2vec, final int vectorSize, int sigmaMult, + int areaDivisor) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + this.sigmaMult = sigmaMult; + this.areaDivisor = areaDivisor; + this.dataSubsetProvider = new DataSubsetProvider(); + this.initArrays(); + // Calculate sd*3/10 and save in map + // Initialize indexesArr unsorted + // LOG.info("Initializing indexes and calculating standard deviation"); + this.setModelVals(word2vec, vectorSize); + // LOG.info("Sorting indexes"); + // Sort the indexes + this.sortIndexes(); + // LOG.info("Sorting completed"); + } + + public void updateSdArr(int newSigmaMult, int newAreaDivisor) { + // Updating SdArr values + for(int i=0;i< sdArr.length;i++) { + sdArr[i] *= (areaDivisor/sigmaMult)*(newSigmaMult/newAreaDivisor); + } + this.sigmaMult = newSigmaMult; + this.areaDivisor = newAreaDivisor; + } + + private void initArrays() { + this.indexesArr = new Object[vectorSize][2]; + this.gWordArr = new String[word2vec.size()]; + this.gVecArr = new float[word2vec.size()][vectorSize]; + int i = 0; + for (Entry entry : word2vec.entrySet()) { + gWordArr[i] = entry.getKey(); + gVecArr[i] = entry.getValue(); + i++; + } + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private Map getClosestEntry(float[] vector, String subKey) { + Set wordSet = null; + Map closestVec = null; + try { + if (subKey == null) { + wordSet = word2vec.keySet(); + } else { + tl.logTime(1); + wordSet = dataSubsetProvider.fetchSubsetWords(subKey); + tl.printTime(1, "fetchSubsetWords"); + } + // LOG.info("Normalizing input vector"); + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + // LOG.info("fetching nearby vectors"); + // Find nearby vectors + tl.logTime(2); + Map nearbyVecs = fetchNearbyVectors(vector, wordSet); + tl.printTime(2, "fetchNearbyVectors"); + // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); + // Select the closest vector + tl.logTime(3); + closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); + tl.printTime(3, "findClosestVecInNearbyVecs"); + } catch (IOException e) { + // LOG.error(e.getStackTrace()); + } + // LOG.info("Closest word found is " + closestVec.keySet()); + return closestVec; + } + + /** + * Method to find standard deviation for each dimension of word vector and store + * the operated value next to the dimension's index in sdMap + * + * @param word2vecMap + * - mapping of words alongwith their vectors + * @param vectorSize + * - size of each vector + */ + public void setModelVals(Map word2vecMap, int vectorSize) { + float[] resArr = new float[vectorSize]; + int totSize = word2vecMap.size(); + // loop all dimensions + for (int i = 0; i < vectorSize; i++) { + // loop through all the words + float[] dimsnArr = new float[totSize]; + int[] idArr = new int[totSize]; + float sum = 0; + for (int j = 0; j < gWordArr.length; j++) { + float val = gVecArr[j][i]; + sum += val; + idArr[j] = j; + dimsnArr[j] = val; + } + // Setting value in indexArr + Object[] dimValWordMap = new Object[2]; + dimValWordMap[0] = idArr; + dimValWordMap[1] = dimsnArr; + this.indexesArr[i] = dimValWordMap; + // LOG.info("Dimension " + (i) + " index stored to memory"); + // mean + float mean = sum / dimsnArr.length; + sum = 0; + for (int j = 0; j < dimsnArr.length; j++) { + sum += Math.pow(dimsnArr[j] - mean, 2); + } + float variance = sum / dimsnArr.length; + Double sd = Math.sqrt(variance); + resArr[i] = sd.floatValue() * sigmaMult / areaDivisor; + } + // Set as sdMap + this.sdArr = resArr; + } + + /** + * Method to fetch nearby vectors for a given vector in a particular word set + * + * @param vector + * - vector to look nearby vectors for + * @param wordSet + * - word set to look into for nearby vectors + * @return - mapping of nearby words alongwith with their vector values + */ + private Map fetchNearbyVectors(float[] vector, Set wordSet) { + Map nearbyVecMap = new HashMap<>(); + float[][] minMaxVec = getMinMaxVec(vector); + putNearbyVecs(minMaxVec, nearbyVecMap); + return nearbyVecMap; + } + + private void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMap) { + // init score array + short[] scoreArr = new short[gWordArr.length]; + float[] minVec = minMaxVec[0]; + float[] maxVec = minMaxVec[1]; + // loop through each dimension and increment the score of words in that area + for(int i=0;i getMaxIdList(short[] scoreArr){ + List resList= new ArrayList<>(); + short max = 0; + for(int i=0;imax) { + resList.clear(); + resList.add(i); + } else if(score == max) { + resList.add(i); + } + } + return resList; + } + + /** + * Method to generate two vectors from a given vector by adding and subtracting + * value in sdMap from the given vector + * + * @param vector + * - input vector to perform operation on + * @return - min vector at index 0 and max vector at index 1 + */ + private float[][] getMinMaxVec(float[] vector) { + float[][] resVec = new float[2][vector.length]; + for (int i = 0; i < vector.length; i++) { + // TODO: change sdmap to array + float diff = sdArr[i]; + // MinVec + resVec[0][i] = vector[i] - diff; + // MaxVec + resVec[1][i] = vector[i] + diff; + } + return resVec; + } + + private void sortIndexes() { + for (int i = 0; i < indexesArr.length; i++) { + // LOG.info("Sorting index " + i); + Object[] entryArr = indexesArr[i]; + int[] idArr = (int[]) entryArr[0]; + float[] dimsnValArr = (float[]) entryArr[1]; + AssociativeSort.quickSort(dimsnValArr, idArr); + // LOG.info("Sorting completed for index " + i); + } + } + + /** + * Method to fetch vectorSize + * + * @return - vectorSize + */ + @Override + public int getVectorSize() { + return this.vectorSize; + } + + /** + * Method to fetch word2vec map + * + * @return - word2vec map + */ + public Map getWord2VecMap() { + return this.word2vec; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java new file mode 100644 index 0000000..5f95449 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java @@ -0,0 +1,308 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.aksw.word2vecrestful.subset.DataSubsetProvider; +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import nikit.test.TimeLogger; + +/** + * Class to encapsulate word2vec in-memory model and expose methods to perform + * search on the model + * + * @author Nikit + * + */ +public class W2VNrmlMemModelNonIndxd implements GenWord2VecModel { + public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); + + private Map word2vec; + private int vectorSize; + private float[] sdArr; + /** + * Limit to the multiplier of area in which nearby vectors are to be looked + */ + private static final int EXHAUSTION_MULT = 20; + /** + * Multiplier for the standard deviation + */ + private static final int SIGMA_MULT = 3; + /** + * Divisor for the standard deviation's value + */ + private static final int AREA_DIVISOR = 20; + private DataSubsetProvider dataSubsetProvider; + /** + * Contains the sorted dimensional values mapped to their words + */ + + private String[] gWordArr; + private float[][] gVecArr; + + // TODO : Remove this + private TimeLogger tl = new TimeLogger(); + + public W2VNrmlMemModelNonIndxd(final Map word2vec, final int vectorSize) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + this.dataSubsetProvider = new DataSubsetProvider(); + // Calculate sd*3/10 and save in map + // Initialize indexesArr unsorted + // LOG.info("Initializing indexes and calculating standard deviation"); + initArrays(); + this.setModelVals(word2vec, vectorSize); + // LOG.info("Sorting indexes"); + // Sort the indexes + // LOG.info("Sorting completed"); + } + + private void initArrays() { + this.gWordArr = new String[word2vec.size()]; + this.gVecArr = new float[word2vec.size()][vectorSize]; + int i = 0; + for (Entry entry : word2vec.entrySet()) { + gWordArr[i] = entry.getKey(); + gVecArr[i] = entry.getValue(); + i++; + } + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private Map getClosestEntry(float[] vector, String subKey) { + Set wordSet = null; + Map closestVec = null; + try { + if (subKey == null) { + wordSet = word2vec.keySet(); + } else { + tl.logTime(1); + wordSet = dataSubsetProvider.fetchSubsetWords(subKey); + tl.printTime(1, "fetchSubsetWords"); + } + // LOG.info("Normalizing input vector"); + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + // LOG.info("fetching nearby vectors"); + // Find nearby vectors + tl.logTime(2); + Map nearbyVecs = fetchNearbyVectors(vector, wordSet); + tl.printTime(2, "fetchNearbyVectors"); + // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); + // Select the closest vector + tl.logTime(3); + closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); + tl.printTime(3, "findClosestVecInNearbyVecs"); + } catch (IOException e) { + // LOG.error(e.getStackTrace()); + } + // LOG.info("Closest word found is " + closestVec.keySet()); + return closestVec; + } + + /** + * Method to find standard deviation for each dimension of word vector and store + * the operated value next to the dimension's index in sdMap + * + * @param word2vecMap + * - mapping of words alongwith their vectors + * @param vectorSize + * - size of each vector + */ + public void setModelVals(Map word2vecMap, int vectorSize) { + float[] resMap = new float[vectorSize]; + int totSize = word2vecMap.size(); + // loop all dimensions + for (int i = 0; i < vectorSize; i++) { + // loop through all the words + float[] dimsnArr = new float[totSize]; + int[] idArr = new int[totSize]; + float sum = 0; + for (int j = 0; j < gWordArr.length; j++) { + float val = gVecArr[j][i]; + sum += val; + idArr[j] = j; + dimsnArr[j] = val; + } + // Setting value in indexArr + // LOG.info("Dimension " + (i) + " index stored to memory"); + // mean + float mean = sum / dimsnArr.length; + sum = 0; + for (int j = 0; j < dimsnArr.length; j++) { + sum += Math.pow(dimsnArr[j] - mean, 2); + } + float variance = sum / dimsnArr.length; + Double sd = Math.sqrt(variance); + resMap[i] = sd.floatValue() * SIGMA_MULT / AREA_DIVISOR; + } + // Set as sdMap + this.sdArr = resMap; + } + + /** + * Method to fetch nearby vectors for a given vector in a particular word set + * + * @param vector + * - vector to look nearby vectors for + * @param wordSet + * - word set to look into for nearby vectors + * @return - mapping of nearby words alongwith with their vector values + */ + private Map fetchNearbyVectors(float[] vector, Set wordSet) { + Map nearbyVecMap = new HashMap<>(); + boolean mapEmpty = true; + boolean notExhausted = true; + float[][] minMaxVec = getMinMaxVec(vector); + tl.printTime(2, "getMinMaxVec"); + int mult = 1; + while (mapEmpty && notExhausted) { + if (mult > 1) { + tl.logTime(8); + incrementMinMaxVec(minMaxVec); + tl.printTime(8, "incrementMinMaxVec"); + } + tl.logTime(4); + putNearbyVecsNonIndxd(minMaxVec, wordSet, nearbyVecMap); + tl.printTime(4, "putNearbyVecsNonIndxd"); + if (nearbyVecMap.size() > 0) { + mapEmpty = false; + } else { + ++mult; + if (mult > EXHAUSTION_MULT) { + notExhausted = false; + } + // LOG.info("MinMax multiplier incremented to " + mult); + } + } + return nearbyVecMap; + } + + private void putNearbyVecsNonIndxd(float[][] minMaxVec, Set wordSet, Map nearbyVecMap) { + for (String word : wordSet) { + float[] entryVec = word2vec.get(word); + if (isVectorInArea(entryVec, minMaxVec)) { + nearbyVecMap.put(word, entryVec); + } + } + } + + /** + * Method to check if vector falls in a particular area + * + * @param entryVec + * - vector to be verified + * @param minMaxVec + * - min vec and max vec as area's boundary + * @return - if the given vector is inside min and max vec's range + */ + private boolean isVectorInArea(float[] entryVec, float[][] minMaxVec) { + boolean isValid = true; + float[] minVec = minMaxVec[0]; + float[] maxVec = minMaxVec[1]; + for (int i = 0; i < entryVec.length; i++) { + if (entryVec[i] < minVec[i] || entryVec[i] > maxVec[i]) { + isValid = false; + break; + } + } + return isValid; + } + + /** + * Method to generate two vectors from a given vector by adding and subtracting + * value in sdMap from the given vector + * + * @param vector + * - input vector to perform operation on + * @return - min vector at index 0 and max vector at index 1 + */ + private float[][] getMinMaxVec(float[] vector) { + float[][] resVec = new float[2][vector.length]; + for (int i = 0; i < vector.length; i++) { + // TODO: change sdmap to array + float diff = sdArr[i]; + // MinVec + resVec[0][i] = vector[i] - diff; + // MaxVec + resVec[1][i] = vector[i] + diff; + } + return resVec; + } + + private void incrementMinMaxVec(float[][] minMaxVec) { + float[] minVec = minMaxVec[0]; + float[] maxVec = minMaxVec[1]; + for (int i = 0; i < vectorSize; i++) { + float diff = sdArr[i]; + // MinVec + minVec[i] -= diff; + // MaxVec + maxVec[i] += diff; + } + } + + /** + * Method to fetch vectorSize + * + * @return - vectorSize + */ + @Override + public int getVectorSize() { + return this.vectorSize; + } + + /** + * Method to fetch word2vec map + * + * @return - word2vec map + */ + public Map getWord2VecMap() { + return this.word2vec; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java index 8fe5ba1..c3e2487 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java @@ -25,18 +25,22 @@ public static Word2VecModel get() { return new Word2VecModelLoader().loadModel(new File(model), binModel); } - public static W2VNrmlMemModel getNormalizedModel() { + public static W2VNrmlMemModelNonIndxd getNormalizedModel() { Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlMdlFilePath), nrmlMdlBinFlg); - return new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); + return new W2VNrmlMemModelNonIndxd(w2vmodel.word2vec, w2vmodel.vectorSize); } - public static W2VNrmlMemModel getNormalizedBinModel() { + public static W2VNrmlMemModelNonIndxd getNormalizedBinNonIndxdModel() { LOG.info("Loading model to memory"); Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); LOG.info("Model loaded to memory"); - LOG.info("Initializing "+W2VNrmlMemModel.class + " instance"); - W2VNrmlMemModel nrmlMemModel = new W2VNrmlMemModel(w2vmodel.word2vec, w2vmodel.vectorSize); - LOG.info("Initialization of "+W2VNrmlMemModel.class + " instance finished"); + LOG.info("Initializing "+W2VNrmlMemModelNonIndxd.class + " instance"); + W2VNrmlMemModelNonIndxd nrmlMemModel = new W2VNrmlMemModelNonIndxd(w2vmodel.word2vec, w2vmodel.vectorSize); + LOG.info("Initialization of "+W2VNrmlMemModelNonIndxd.class + " instance finished"); return nrmlMemModel; } + + public static Word2VecModel getNormalBinModel() { + return new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); + } } diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index 21a3998..5235658 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -4,7 +4,7 @@ import java.util.Map; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModel; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelNonIndxd; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -52,7 +52,7 @@ public void testNormalizedModel() { @Test public void testNbmTime() { LOG.info("Starting InMemory indexed model test!"); - final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); + final W2VNrmlMemModelNonIndxd memModel = Word2VecFactory.getNormalizedBinNonIndxdModel(); float[][] centroids = {TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5}; long startTime, diff; diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java new file mode 100644 index 0000000..dc7e4f5 --- /dev/null +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -0,0 +1,44 @@ +package org.aksw.word2vecrestful; + +import java.util.Map; + +import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLR; +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.log4j.PropertyConfigurator; +import org.junit.Test; + +import nikit.test.TestConst; + +public class NrmlzdMdlPrfmncTester { + static { + PropertyConfigurator.configure(Cfg.LOG_FILE); + } + public static Logger LOG = LogManager.getLogger(NrmlzdMdlPrfmncTester.class); + + @Test + public void testNbmTime() { + LOG.info("Starting InMemory indexed model test!"); + Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); + final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); + float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; + + long startTime, diff; + long totTime = 0; + for (int i = 0; i < centroids.length; i++) { + LOG.info("Sending query for Centroid " + (i + 1)); + startTime = System.currentTimeMillis(); + Map closestWord = memModel.getClosestSubEntry(centroids[i], null); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + LOG.info(closestWord.keySet()); + LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); + } + + LOG.info("Average query time: " + (totTime / centroids.length) + " milliseconds"); + + } +} From c04810e1a8a3d2ee2bada57b8627543666634764 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 12 Sep 2018 18:14:50 +0200 Subject: [PATCH 044/114] New test created --- .../word2vec/W2VNrmlMemModelBruteForce.java | 134 ++++++++++++++++++ .../word2vec/W2VNrmlMemModelIndxdLR.java | 60 ++++---- .../NrmlzdMdlPrfmncTester.java | 66 ++++++++- 3 files changed, 231 insertions(+), 29 deletions(-) create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java new file mode 100644 index 0000000..8361d10 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java @@ -0,0 +1,134 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.aksw.word2vecrestful.subset.DataSubsetProvider; +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.dice_research.topicmodeling.commons.sort.AssociativeSort; + +import nikit.test.TimeLogger; + +/** + * Class to encapsulate word2vec in-memory model and expose methods to perform + * search on the model + * + * @author Nikit + * + */ +public class W2VNrmlMemModelBruteForce implements GenWord2VecModel { + public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); + + private Map word2vec; + private int vectorSize; + private DataSubsetProvider dataSubsetProvider; + // TODO : Remove this + private TimeLogger tl = new TimeLogger(); + + public W2VNrmlMemModelBruteForce(final Map word2vec, final int vectorSize) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + this.dataSubsetProvider = new DataSubsetProvider(); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private Map getClosestEntry(float[] vector, String subKey) { + Set wordSet = null; + Map closestVec = new HashMap<>(); + try { + if (subKey == null) { + wordSet = word2vec.keySet(); + } else { + tl.logTime(1); + wordSet = dataSubsetProvider.fetchSubsetWords(subKey); + tl.printTime(1, "fetchSubsetWords"); + } + // LOG.info("Normalizing input vector"); + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + // LOG.info("fetching nearby vectors"); + // calculate cosine similarity of all distances + String[] wordArr = new String[wordSet.size()]; + int[] idArr = new int[wordSet.size()]; + double[] cosineArr = new double[wordSet.size()]; + int i = 0; + for (String word : wordSet) { + wordArr[i] = word; + idArr[i] = i; + float[] wordVec = word2vec.get(word); + cosineArr[i] = Word2VecMath.cosineSimilarity(wordVec, vector); + i++; + } + cosineArr = AssociativeSort.quickSort(cosineArr, idArr); + int closestWordId = idArr[idArr.length - 1]; + String closestWord = wordArr[closestWordId]; + closestVec.put(closestWord, word2vec.get(closestWord)); + } catch (IOException e) { + LOG.error(e.getStackTrace()); + } + // LOG.info("Closest word found is " + closestVec.keySet()); + return closestVec; + } + + /** + * Method to fetch vectorSize + * + * @return - vectorSize + */ + @Override + public int getVectorSize() { + return this.vectorSize; + } + + /** + * Method to fetch word2vec map + * + * @return - word2vec map + */ + public Map getWord2VecMap() { + return this.word2vec; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index 2f7526f..52cff95 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -1,10 +1,8 @@ package org.aksw.word2vecrestful.word2vec; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; @@ -13,6 +11,7 @@ import org.aksw.word2vecrestful.utils.Word2VecMath; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import org.dice_research.topicmodeling.commons.collections.TopShortIntCollection; import org.dice_research.topicmodeling.commons.sort.AssociativeSort; import nikit.test.TimeLogger; @@ -38,6 +37,10 @@ public class W2VNrmlMemModelIndxdLR implements GenWord2VecModel { * Divisor for the standard deviation's value */ private int areaDivisor = 10; + /** + * Number of top elements to give priority + */ + private int k = 10; private DataSubsetProvider dataSubsetProvider; /** * Contains the sorted dimensional values mapped to their words @@ -84,9 +87,12 @@ public W2VNrmlMemModelIndxdLR(final Map word2vec, final int vec } public void updateSdArr(int newSigmaMult, int newAreaDivisor) { + if (newSigmaMult == sigmaMult && newAreaDivisor == areaDivisor) { + return; + } // Updating SdArr values - for(int i=0;i< sdArr.length;i++) { - sdArr[i] *= (areaDivisor/sigmaMult)*(newSigmaMult/newAreaDivisor); + for (int i = 0; i < sdArr.length; i++) { + sdArr[i] *= (areaDivisor / sigmaMult) * (newSigmaMult / newAreaDivisor); } this.sigmaMult = newSigmaMult; this.areaDivisor = newAreaDivisor; @@ -150,23 +156,23 @@ private Map getClosestEntry(float[] vector, String subKey) { if (subKey == null) { wordSet = word2vec.keySet(); } else { - tl.logTime(1); + // tl.logTime(1); wordSet = dataSubsetProvider.fetchSubsetWords(subKey); - tl.printTime(1, "fetchSubsetWords"); + // tl.printTime(1, "fetchSubsetWords"); } // LOG.info("Normalizing input vector"); // Normalize incoming vector vector = Word2VecMath.normalize(vector); // LOG.info("fetching nearby vectors"); // Find nearby vectors - tl.logTime(2); + // tl.logTime(2); Map nearbyVecs = fetchNearbyVectors(vector, wordSet); - tl.printTime(2, "fetchNearbyVectors"); + // tl.printTime(2, "fetchNearbyVectors"); // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector - tl.logTime(3); + // tl.logTime(3); closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); - tl.printTime(3, "findClosestVecInNearbyVecs"); + // tl.printTime(3, "findClosestVecInNearbyVecs"); } catch (IOException e) { // LOG.error(e.getStackTrace()); } @@ -240,7 +246,7 @@ private void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMa float[] minVec = minMaxVec[0]; float[] maxVec = minMaxVec[1]; // loop through each dimension and increment the score of words in that area - for(int i=0;i nearbyVecMa to++; } LOG.info("Setting scores for the words between 'from' and 'to' indexes:\t" + from + " " + to); - tl.logTime(9); + // tl.logTime(9); for (int j = from; j < to; j++) { scoreArr[idArr[j]]++; } - tl.printTime(9, "Score set for index " + i); + // tl.printTime(9, "Score set for index " + i); } // find the index of the words with highest score and add them to nearbyVecMap - for(Integer wordId : getMaxIdList(scoreArr)) { + for (int wordId : getMaxIdList(scoreArr)) { nearbyVecMap.put(gWordArr[wordId], gVecArr[wordId]); } } - - private List getMaxIdList(short[] scoreArr){ - List resList= new ArrayList<>(); - short max = 0; - for(int i=0;imax) { - resList.clear(); - resList.add(i); - } else if(score == max) { - resList.add(i); - } + collection.add(score, i); } - return resList; + return collection.getObjects(); } /** @@ -341,4 +341,12 @@ public Map getWord2VecMap() { return this.word2vec; } + public int getK() { + return k; + } + + public void setK(int k) { + this.k = k; + } + } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index dc7e4f5..8a61266 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -1,8 +1,11 @@ package org.aksw.word2vecrestful; +import java.util.ArrayList; +import java.util.List; import java.util.Map; import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLR; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; @@ -23,9 +26,47 @@ public class NrmlzdMdlPrfmncTester { public void testNbmTime() { LOG.info("Starting InMemory indexed model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); - final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; + List correctWords = getCorrectWords(centroids, nbm); + int kStrt = 1; + int kEnd = 100; + int sigStrt = 1; + int sigEnd = 5; + int arDivStrt = 1; + int arDivEnd = 200; + int count = 1; + final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); + for (int a = kStrt; a <= kEnd; a++) { + for (int b = arDivStrt; b <= arDivEnd; b++) { + for (int c = sigStrt; c <= sigEnd; c++) { + LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); + List lrModelWords = runLRMemModel(centroids, memModel, a, b, c); + LOG.info("Percentage Score for Test " + (count++) + " is" + + calcPercScore(correctWords, lrModelWords)); + } + } + } + + } + private float calcPercScore(List correctWords, List lrModelWords) { + float percScore = 0; + int len = correctWords.size(); + float lenInv = 100f / len; + for (int i = 0; i < len; i++) { + if (correctWords.get(i).equals(lrModelWords.get(i))) { + percScore += lenInv; + } + } + return percScore; + + } + + private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR memModel, int k, int arDiv, + int sigMult) { + memModel.setK(k); + memModel.updateSdArr(sigMult, arDiv); + List wordSet = new ArrayList<>(); long startTime, diff; long totTime = 0; for (int i = 0; i < centroids.length; i++) { @@ -34,11 +75,30 @@ public void testNbmTime() { Map closestWord = memModel.getClosestSubEntry(centroids[i], null); diff = System.currentTimeMillis() - startTime; totTime += diff; - LOG.info(closestWord.keySet()); + wordSet.addAll(closestWord.keySet()); LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info("Average query time: " + (totTime / centroids.length) + " milliseconds"); + LOG.info( + "Average query time for W2VNrmlMemModelIndxdLR is : " + (totTime / centroids.length) + " milliseconds"); + return wordSet; + } + public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { + List wordSet = new ArrayList<>(); + W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); + long startTime, diff; + long totTime = 0; + for (int i = 0; i < centroids.length; i++) { + LOG.info("Sending query for Centroid " + (i + 1)); + startTime = System.currentTimeMillis(); + Map closestWord = bruteForce.getClosestSubEntry(centroids[i], null); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + wordSet.addAll(closestWord.keySet()); + LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); + } + LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); + return wordSet; } } From fade5c8c1774b787981f25da0aed8ff669afb9ac Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 12 Sep 2018 18:40:20 +0200 Subject: [PATCH 045/114] removing unnecessary logs and putting performance identification log --- .../word2vec/W2VNrmlMemModelIndxdLR.java | 2 +- .../word2vecrestful/NrmlzdMdlPrfmncTester.java | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index 52cff95..e08dec5 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -268,7 +268,7 @@ private void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMa // Because binarySearch returns the exact index if element exists to++; } - LOG.info("Setting scores for the words between 'from' and 'to' indexes:\t" + from + " " + to); + // LOG.info("Setting scores for the words between 'from' and 'to' indexes:\t" + from + " " + to); // tl.logTime(9); for (int j = from; j < to; j++) { scoreArr[idArr[j]]++; diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 8a61266..6f2c29a 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -12,6 +12,7 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; +import org.dice_research.topicmodeling.commons.sort.AssociativeSort; import org.junit.Test; import nikit.test.TestConst; @@ -28,25 +29,31 @@ public void testNbmTime() { Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List correctWords = getCorrectWords(centroids, nbm); + LOG.info("Correct Words are :" + correctWords); int kStrt = 1; int kEnd = 100; int sigStrt = 1; int sigEnd = 5; int arDivStrt = 1; int arDivEnd = 200; - int count = 1; + int indx = 0; + float[] percScore = new float[(kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)]; + int[] idArr = new int[percScore.length]; final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); for (int a = kStrt; a <= kEnd; a++) { for (int b = arDivStrt; b <= arDivEnd; b++) { for (int c = sigStrt; c <= sigEnd; c++) { LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); List lrModelWords = runLRMemModel(centroids, memModel, a, b, c); - LOG.info("Percentage Score for Test " + (count++) + " is" - + calcPercScore(correctWords, lrModelWords)); + float percVal = calcPercScore(correctWords, lrModelWords); + idArr[indx] = indx + 1; + percScore[indx] = percVal; + LOG.info("Percentage Score for Test id: " + (++indx) + " is" + percVal); } } } - + AssociativeSort.quickSort(percScore, idArr); + LOG.info("Highest Score is achieved by the test id: " + idArr[idArr.length - 1]); } private float calcPercScore(List correctWords, List lrModelWords) { From dc4415cbc9013b2e94f6e8c8987a3034900d5859 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 14:46:32 +0200 Subject: [PATCH 046/114] updated test config and log4j properties --- config/log4j.properties | 2 +- .../aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/config/log4j.properties b/config/log4j.properties index 4ed3f76..1b4af65 100644 --- a/config/log4j.properties +++ b/config/log4j.properties @@ -4,7 +4,7 @@ log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{HH:mm:ss} %5p [%t] (%F:%M:%L) - %m%n log4j.appender.file=org.apache.log4j.RollingFileAppender log4j.appender.file.File=log/root.log -log4j.appender.file.MaxFileSize=1MB +log4j.appender.file.MaxFileSize=150MB log4j.appender.file.MaxBackupIndex=100 log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{HH:mm:ss} %5p [%t] (%F:%M:%L) - %m%n diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 6f2c29a..ab33847 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -30,12 +30,12 @@ public void testNbmTime() { float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 1; - int kEnd = 100; - int sigStrt = 1; + int kStrt = 100; + int kEnd = 250; + int sigStrt = 3; int sigEnd = 5; int arDivStrt = 1; - int arDivEnd = 200; + int arDivEnd = 10; int indx = 0; float[] percScore = new float[(kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)]; int[] idArr = new int[percScore.length]; @@ -45,6 +45,7 @@ public void testNbmTime() { for (int c = sigStrt; c <= sigEnd; c++) { LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); List lrModelWords = runLRMemModel(centroids, memModel, a, b, c); + LOG.info("Predicted Words are :" + correctWords); float percVal = calcPercScore(correctWords, lrModelWords); idArr[indx] = indx + 1; percScore[indx] = percVal; @@ -53,7 +54,7 @@ public void testNbmTime() { } } AssociativeSort.quickSort(percScore, idArr); - LOG.info("Highest Score is achieved by the test id: " + idArr[idArr.length - 1]); + LOG.info("Highest Score ("+percScore[percScore.length-1]+"%) is achieved by the test id: " + idArr[idArr.length - 1]); } private float calcPercScore(List correctWords, List lrModelWords) { From 255bcf0a84d4bd5d83225a76e06448315663841d Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 14:57:21 +0200 Subject: [PATCH 047/114] correcting predicted words log --- .../java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index ab33847..c32bc05 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -45,7 +45,7 @@ public void testNbmTime() { for (int c = sigStrt; c <= sigEnd; c++) { LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); List lrModelWords = runLRMemModel(centroids, memModel, a, b, c); - LOG.info("Predicted Words are :" + correctWords); + LOG.info("Predicted Words are :" + lrModelWords); float percVal = calcPercScore(correctWords, lrModelWords); idArr[indx] = indx + 1; percScore[indx] = percVal; From f7a1941c173953efc69369d67952ed95068bee5e Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 15:52:53 +0200 Subject: [PATCH 048/114] adjusting config --- .../word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java | 3 ++- .../org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index e08dec5..de1f385 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -90,9 +90,10 @@ public void updateSdArr(int newSigmaMult, int newAreaDivisor) { if (newSigmaMult == sigmaMult && newAreaDivisor == areaDivisor) { return; } + float mult = (areaDivisor / sigmaMult) * (newSigmaMult / newAreaDivisor); // Updating SdArr values for (int i = 0; i < sdArr.length; i++) { - sdArr[i] *= (areaDivisor / sigmaMult) * (newSigmaMult / newAreaDivisor); + sdArr[i] *= mult; } this.sigmaMult = newSigmaMult; this.areaDivisor = newAreaDivisor; diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index c32bc05..6c9ef53 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -30,10 +30,10 @@ public void testNbmTime() { float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 100; - int kEnd = 250; + int kStrt = 1000; + int kEnd = 1050; int sigStrt = 3; - int sigEnd = 5; + int sigEnd = 8; int arDivStrt = 1; int arDivEnd = 10; int indx = 0; From 6c93dc721a2e5c4acf5bff90d04649cc74d95c28 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 16:35:56 +0200 Subject: [PATCH 049/114] changing multiplier type to float --- .../word2vec/W2VNrmlMemModelIndxdLR.java | 6 ++-- .../NrmlzdMdlPrfmncTester.java | 36 +++++++++++++------ 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index de1f385..d45cd04 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -32,11 +32,11 @@ public class W2VNrmlMemModelIndxdLR implements GenWord2VecModel { /** * Multiplier for the standard deviation */ - private int sigmaMult = 3; + private float sigmaMult = 3; /** * Divisor for the standard deviation's value */ - private int areaDivisor = 10; + private float areaDivisor = 10; /** * Number of top elements to give priority */ @@ -86,7 +86,7 @@ public W2VNrmlMemModelIndxdLR(final Map word2vec, final int vec // LOG.info("Sorting completed"); } - public void updateSdArr(int newSigmaMult, int newAreaDivisor) { + public void updateSdArr(float newSigmaMult, float newAreaDivisor) { if (newSigmaMult == sigmaMult && newAreaDivisor == areaDivisor) { return; } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 6c9ef53..65ed9bb 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -1,10 +1,12 @@ package org.aksw.word2vecrestful; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.utils.Word2VecMath; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLR; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; @@ -32,29 +34,31 @@ public void testNbmTime() { LOG.info("Correct Words are :" + correctWords); int kStrt = 1000; int kEnd = 1050; - int sigStrt = 3; - int sigEnd = 8; - int arDivStrt = 1; - int arDivEnd = 10; + float sigStrt = 3; + float sigEnd = 8; + float arDivStrt = 1; + float arDivEnd = 10; int indx = 0; - float[] percScore = new float[(kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)]; + int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); + float[] percScore = new float[scrSize]; int[] idArr = new int[percScore.length]; final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); for (int a = kStrt; a <= kEnd; a++) { - for (int b = arDivStrt; b <= arDivEnd; b++) { - for (int c = sigStrt; c <= sigEnd; c++) { + for (float b = arDivStrt; b <= arDivEnd; b++) { + for (float c = sigStrt; c <= sigEnd; c++) { LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); List lrModelWords = runLRMemModel(centroids, memModel, a, b, c); LOG.info("Predicted Words are :" + lrModelWords); float percVal = calcPercScore(correctWords, lrModelWords); idArr[indx] = indx + 1; percScore[indx] = percVal; - LOG.info("Percentage Score for Test id: " + (++indx) + " is" + percVal); + LOG.info("Score for Test id: " + (++indx) + " is " + percVal + "%"); } } } AssociativeSort.quickSort(percScore, idArr); - LOG.info("Highest Score ("+percScore[percScore.length-1]+"%) is achieved by the test id: " + idArr[idArr.length - 1]); + LOG.info("Highest Score (" + percScore[percScore.length - 1] + "%) is achieved by the test id: " + + idArr[idArr.length - 1]); } private float calcPercScore(List correctWords, List lrModelWords) { @@ -70,8 +74,8 @@ private float calcPercScore(List correctWords, List lrModelWords } - private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR memModel, int k, int arDiv, - int sigMult) { + private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR memModel, int k, float arDiv, + float sigMult) { memModel.setK(k); memModel.updateSdArr(sigMult, arDiv); List wordSet = new ArrayList<>(); @@ -109,4 +113,14 @@ public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); return wordSet; } + + public static void main(String[] args) { + // Normalization test + float[] vecA = { 0.012048473f, -0.024212155f, -0.0157357f, 0.02262468f, -0.024654279f }; + for (int i = 0; i < 100; i++) { + Word2VecMath.normalize(vecA); + System.out.println(Arrays.toString(vecA)); + } + } + } From 08229a19454e6a125f7549e4c88f61cfdd4fe0f1 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 17:02:27 +0200 Subject: [PATCH 050/114] configuring test vals --- .../word2vec/W2VNrmlMemModelIndxdLR.java | 3 ++- .../aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index d45cd04..9283b8f 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -192,6 +192,7 @@ private Map getClosestEntry(float[] vector, String subKey) { */ public void setModelVals(Map word2vecMap, int vectorSize) { float[] resArr = new float[vectorSize]; + float mult = sigmaMult / areaDivisor; int totSize = word2vecMap.size(); // loop all dimensions for (int i = 0; i < vectorSize; i++) { @@ -219,7 +220,7 @@ public void setModelVals(Map word2vecMap, int vectorSize) { } float variance = sum / dimsnArr.length; Double sd = Math.sqrt(variance); - resArr[i] = sd.floatValue() * sigmaMult / areaDivisor; + resArr[i] = sd.floatValue() * mult; } // Set as sdMap this.sdArr = resArr; diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 65ed9bb..7cb44bc 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -32,12 +32,12 @@ public void testNbmTime() { float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 1000; - int kEnd = 1050; - float sigStrt = 3; - float sigEnd = 8; + int kStrt = 1; + int kEnd = 10; + float sigStrt = 1; + float sigEnd = 5; float arDivStrt = 1; - float arDivEnd = 10; + float arDivEnd = 100; int indx = 0; int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); float[] percScore = new float[scrSize]; From 9e6a3225e778eb0226a647580f1aeccbc1350e3e Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 17:21:53 +0200 Subject: [PATCH 051/114] adjusting config vals --- .../org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 7cb44bc..7f43553 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -32,11 +32,11 @@ public void testNbmTime() { float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 1; - int kEnd = 10; + int kStrt = 1000; + int kEnd = 1050; float sigStrt = 1; float sigEnd = 5; - float arDivStrt = 1; + float arDivStrt = 90; float arDivEnd = 100; int indx = 0; int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); From 6915c4db605716a4701f4956f04e3283493d3fe7 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 17:43:40 +0200 Subject: [PATCH 052/114] selecting all closest words --- .../word2vec/W2VNrmlMemModelBruteForce.java | 12 +++++++++--- .../word2vecrestful/NrmlzdMdlPrfmncTester.java | 15 ++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java index 8361d10..9a63d45 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java @@ -102,9 +102,15 @@ private Map getClosestEntry(float[] vector, String subKey) { i++; } cosineArr = AssociativeSort.quickSort(cosineArr, idArr); - int closestWordId = idArr[idArr.length - 1]; - String closestWord = wordArr[closestWordId]; - closestVec.put(closestWord, word2vec.get(closestWord)); + double maxVal = cosineArr[cosineArr.length - 1]; + for (int j = cosineArr.length - 1; j >= 0; j--) { + if (cosineArr[j] == maxVal) { + int closestWordId = idArr[j]; + String closestWord = wordArr[closestWordId]; + closestVec.put(closestWord, word2vec.get(closestWord)); + } + } + } catch (IOException e) { LOG.error(e.getStackTrace()); } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 7f43553..962281a 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -4,6 +4,7 @@ import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.Set; import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.utils.Word2VecMath; @@ -30,7 +31,7 @@ public void testNbmTime() { LOG.info("Starting InMemory indexed model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; - List correctWords = getCorrectWords(centroids, nbm); + List> correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); int kStrt = 1000; int kEnd = 1050; @@ -61,12 +62,12 @@ public void testNbmTime() { + idArr[idArr.length - 1]); } - private float calcPercScore(List correctWords, List lrModelWords) { + private float calcPercScore(List> correctWordSet, List lrModelWords) { float percScore = 0; - int len = correctWords.size(); + int len = correctWordSet.size(); float lenInv = 100f / len; for (int i = 0; i < len; i++) { - if (correctWords.get(i).equals(lrModelWords.get(i))) { + if (correctWordSet.get(i).contains(lrModelWords.get(i))) { percScore += lenInv; } } @@ -96,8 +97,8 @@ private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR m return wordSet; } - public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { - List wordSet = new ArrayList<>(); + public List> getCorrectWords(float[][] centroids, Word2VecModel nbm) { + List> wordSet = new ArrayList<>(); W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); long startTime, diff; long totTime = 0; @@ -107,7 +108,7 @@ public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { Map closestWord = bruteForce.getClosestSubEntry(centroids[i], null); diff = System.currentTimeMillis() - startTime; totTime += diff; - wordSet.addAll(closestWord.keySet()); + wordSet.add(closestWord.keySet()); LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); From 24c2fa7febb8a9fa338862366d4c87de20144e1e Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 14 Sep 2018 17:49:23 +0200 Subject: [PATCH 053/114] adjusting values --- .../org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 962281a..50de65c 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -33,12 +33,12 @@ public void testNbmTime() { float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List> correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 1000; - int kEnd = 1050; + int kStrt = 10; + int kEnd = 20; float sigStrt = 1; float sigEnd = 5; - float arDivStrt = 90; - float arDivEnd = 100; + float arDivStrt = 1; + float arDivEnd = 10; int indx = 0; int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); float[] percScore = new float[scrSize]; From e07df7c55ea7b572cd26cf85cf117793dd7aa26d Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Sat, 15 Sep 2018 11:10:14 +0200 Subject: [PATCH 054/114] Theta model introduced --- .../word2vecrestful/utils/Word2VecMath.java | 7 + .../word2vec/W2VNrmlMemModelBruteForce.java | 2 + .../word2vec/W2VNrmlMemModelTheta.java | 154 ++++++++++++++++++ .../NrmlzdMdlPrfmncTester.java | 20 ++- .../NrmlzdThetaMdlPrfmncTester.java | 76 +++++++++ 5 files changed, 254 insertions(+), 5 deletions(-) create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java create mode 100644 src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index ac8d293..347f5e9 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -130,4 +130,11 @@ public static Map findClosestVecInNearbyVecs(Map getClosestEntry(float[] vector, String subKey) { int closestWordId = idArr[j]; String closestWord = wordArr[closestWordId]; closestVec.put(closestWord, word2vec.get(closestWord)); + }else { + break; } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java new file mode 100644 index 0000000..d011cea --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java @@ -0,0 +1,154 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +/** + * Class to encapsulate word2vec in-memory model and expose methods to perform + * search on the model + * + * @author Nikit + * + */ +public class W2VNrmlMemModelTheta implements GenWord2VecModel { + public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); + + private Map word2vec; + private int vectorSize; + private Map> cosineIndxMap; + private float[] comparisonVec = null; + private float gMultiplier = 10000; + + public W2VNrmlMemModelTheta(final Map word2vec, final int vectorSize) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + // Generating index bucket for degrees + generateCosineIndxMap(); + } + + private void generateCosineIndxMap() { + cosineIndxMap = new HashMap<>(); + float[] curVec; + for (String word : word2vec.keySet()) { + curVec = word2vec.get(word); + if (comparisonVec == null) { + comparisonVec = curVec; + } + + Long cosineIndx = Math + .round(Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVec, curVec) * gMultiplier); + int intIndxVal = cosineIndx.intValue(); + List wordsBucket = cosineIndxMap.get(intIndxVal); + if (wordsBucket == null) { + wordsBucket = new ArrayList<>(); + cosineIndxMap.put(intIndxVal, wordsBucket); + } + wordsBucket.add(word); + } + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + @Override + public Map getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private Map getClosestEntry(float[] vector, String subKey) { + Map closestVec= null; + try { + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + // calculate cosine similarity of all distances + double cosSimMultVal = Word2VecMath.cosineSimilarity(comparisonVec, vector) * gMultiplier; + Double dMinIndx = Math.floor(cosSimMultVal); + Double dMaxIndx = Math.ceil(cosSimMultVal); + int minIndx = dMinIndx.intValue(); + int maxIndx = dMaxIndx.intValue(); + Set nearbyWords = new HashSet<>(); + List minWordList = cosineIndxMap.get(minIndx); + if (minWordList != null) { + nearbyWords.addAll(minWordList); + } + List maxWordList = cosineIndxMap.get(maxIndx); + if (maxWordList != null) { + nearbyWords.addAll(maxWordList); + } + Map nearbyVecMap = createNearbyVecMap(nearbyWords); + closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecMap, vector); + } catch (Exception e) { + LOG.error(e.getStackTrace()); + } + // LOG.info("Closest word found is " + closestVec.keySet()); + return closestVec; + } + + private Map createNearbyVecMap(Collection wordCol) { + Map vecMap = new HashMap<>(); + for (String word : wordCol) { + vecMap.put(word, word2vec.get(word)); + } + return vecMap; + } + + /** + * Method to fetch vectorSize + * + * @return - vectorSize + */ + @Override + public int getVectorSize() { + return this.vectorSize; + } + + /** + * Method to fetch word2vec map + * + * @return - word2vec map + */ + public Map getWord2VecMap() { + return this.word2vec; + } + +} diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 50de65c..48787da 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -1,10 +1,12 @@ package org.aksw.word2vecrestful; +import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.ThreadLocalRandom; import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.utils.Word2VecMath; @@ -116,12 +118,20 @@ public List> getCorrectWords(float[][] centroids, Word2VecModel nbm) } public static void main(String[] args) { - // Normalization test - float[] vecA = { 0.012048473f, -0.024212155f, -0.0157357f, 0.02262468f, -0.024654279f }; - for (int i = 0; i < 100; i++) { - Word2VecMath.normalize(vecA); - System.out.println(Arrays.toString(vecA)); + // Creating random coordinates + float[][] coordArr = new float[10][2]; + ThreadLocalRandom rand = ThreadLocalRandom.current(); + for (int i = 0; i < coordArr.length; i++) { + for (int j = 0; j < coordArr[i].length; j++) { + coordArr[i][j] = rand.nextFloat(); + } + } + // Random coord created + System.out.println("Input Coords: "+Arrays.deepToString(coordArr)); + for (int i = 0; i < coordArr.length; i++) { + Word2VecMath.normalize(coordArr[i]); } + System.out.println("Normalized Coords: "+Arrays.deepToString(coordArr)); } } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java new file mode 100644 index 0000000..f825246 --- /dev/null +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -0,0 +1,76 @@ +package org.aksw.word2vecrestful; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelTheta; +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.log4j.PropertyConfigurator; +import org.junit.Test; + +import nikit.test.TestConst; + +public class NrmlzdThetaMdlPrfmncTester { + static { + PropertyConfigurator.configure(Cfg.LOG_FILE); + } + public static Logger LOG = LogManager.getLogger(NrmlzdThetaMdlPrfmncTester.class); + + @Test + public void testNbmTime() { + LOG.info("Starting InMemory Theta Model test!"); + Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); + float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; + LOG.info("Starting BruteForce-Model Test"); + List> correctWords = getCorrectWords(centroids, nbm); + LOG.info("Correct Words are :" + correctWords); + LOG.info("Initializing Theta Model"); + final W2VNrmlMemModelTheta memModel = new W2VNrmlMemModelTheta(nbm.word2vec, nbm.vectorSize); + List lrModelWords = new ArrayList<>(); + LOG.info("Starting Theta-Model Test"); + for (int i = 0; i < centroids.length; i++) { + lrModelWords.addAll(memModel.getClosestEntry(centroids[i]).keySet()); + } + LOG.info("Predicted Words are :" + lrModelWords); + float percVal = calcPercScore(correctWords, lrModelWords); + LOG.info("Score for Test is : " + percVal + "%"); + } + + private float calcPercScore(List> correctWordSet, List lrModelWords) { + float percScore = 0; + int len = correctWordSet.size(); + float lenInv = 100f / len; + for (int i = 0; i < len; i++) { + if (correctWordSet.get(i).contains(lrModelWords.get(i))) { + percScore += lenInv; + } + } + return percScore; + + } + + public List> getCorrectWords(float[][] centroids, Word2VecModel nbm) { + List> wordSet = new ArrayList<>(); + W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); + long startTime, diff; + long totTime = 0; + for (int i = 0; i < centroids.length; i++) { + LOG.info("Sending query for Centroid " + (i + 1)); + startTime = System.currentTimeMillis(); + Map closestWord = bruteForce.getClosestSubEntry(centroids[i], null); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + wordSet.add(closestWord.keySet()); + LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); + } + LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); + return wordSet; + } +} From 6b0042becf0fe13775e4e0afde5377b80135f480 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Sat, 15 Sep 2018 11:47:36 +0200 Subject: [PATCH 055/114] calling cosineSim for normalized vectors --- .../word2vecrestful/utils/Word2VecMath.java | 17 ++++++++++++----- .../word2vec/W2VNrmlMemModelBruteForce.java | 2 +- .../word2vec/W2VNrmlMemModelIndxd.java | 2 +- .../word2vec/W2VNrmlMemModelIndxdLR.java | 2 +- .../word2vec/W2VNrmlMemModelNonIndxd.java | 2 +- .../word2vec/W2VNrmlMemModelTheta.java | 5 ++--- 6 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index 347f5e9..840c5da 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -2,7 +2,8 @@ import java.util.HashMap; import java.util.Map; -import java.util.TreeMap; + +import org.dice_research.topicmodeling.commons.sort.AssociativeSort; public class Word2VecMath { @@ -118,14 +119,20 @@ public static float calcVariance(float[] inpArr) { return res; } - public static Map findClosestVecInNearbyVecs(Map nearbyVecs, float[] vector) { + public static Map findClosestNormalizedVec(Map nearbyVecs, float[] vector) { Map closestVec = new HashMap<>(); + String[] wordArr = new String[nearbyVecs.size()]; + double[] cosineValArr = new double[nearbyVecs.size()]; if(nearbyVecs !=null && vector != null && nearbyVecs.size()>0) { - TreeMap cosineSimMap = new TreeMap<>(); + int i=0; for (String word : nearbyVecs.keySet()) { - cosineSimMap.put(Word2VecMath.cosineSimilarity(vector, nearbyVecs.get(word)), word); + cosineValArr[i] = Word2VecMath.cosineSimilarityNormalizedVecs(vector, nearbyVecs.get(word)); + wordArr[i] = word; + i++; } - String closestWord = cosineSimMap.lastEntry().getValue(); + AssociativeSort.quickSort(cosineValArr, wordArr); + int maxIndx = cosineValArr.length -1; + String closestWord = wordArr[maxIndx]; closestVec.put(closestWord, nearbyVecs.get(closestWord)); } return closestVec; diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java index ddbdd74..918ef9e 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java @@ -98,7 +98,7 @@ private Map getClosestEntry(float[] vector, String subKey) { wordArr[i] = word; idArr[i] = i; float[] wordVec = word2vec.get(word); - cosineArr[i] = Word2VecMath.cosineSimilarity(wordVec, vector); + cosineArr[i] = Word2VecMath.cosineSimilarityNormalizedVecs(wordVec, vector); i++; } cosineArr = AssociativeSort.quickSort(cosineArr, idArr); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java index be141a6..20a8662 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java @@ -142,7 +142,7 @@ private Map getClosestEntry(float[] vector, String subKey) { // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector tl.logTime(3); - closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); + closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); tl.printTime(3, "findClosestVecInNearbyVecs"); } catch (IOException e) { // LOG.error(e.getStackTrace()); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index 9283b8f..1987bb7 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -172,7 +172,7 @@ private Map getClosestEntry(float[] vector, String subKey) { // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector // tl.logTime(3); - closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); + closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); // tl.printTime(3, "findClosestVecInNearbyVecs"); } catch (IOException e) { // LOG.error(e.getStackTrace()); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java index 5f95449..c2c6190 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java @@ -135,7 +135,7 @@ private Map getClosestEntry(float[] vector, String subKey) { // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); // Select the closest vector tl.logTime(3); - closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecs, vector); + closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); tl.printTime(3, "findClosestVecInNearbyVecs"); } catch (IOException e) { // LOG.error(e.getStackTrace()); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java index d011cea..79a6904 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java @@ -101,7 +101,7 @@ private Map getClosestEntry(float[] vector, String subKey) { // Normalize incoming vector vector = Word2VecMath.normalize(vector); // calculate cosine similarity of all distances - double cosSimMultVal = Word2VecMath.cosineSimilarity(comparisonVec, vector) * gMultiplier; + double cosSimMultVal = Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVec, vector) * gMultiplier; Double dMinIndx = Math.floor(cosSimMultVal); Double dMaxIndx = Math.ceil(cosSimMultVal); int minIndx = dMinIndx.intValue(); @@ -116,11 +116,10 @@ private Map getClosestEntry(float[] vector, String subKey) { nearbyWords.addAll(maxWordList); } Map nearbyVecMap = createNearbyVecMap(nearbyWords); - closestVec = Word2VecMath.findClosestVecInNearbyVecs(nearbyVecMap, vector); + closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecMap, vector); } catch (Exception e) { LOG.error(e.getStackTrace()); } - // LOG.info("Closest word found is " + closestVec.keySet()); return closestVec; } From 40b9074d81fb9b4f960ef130aef8e4ecd72cbc09 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 14:40:38 +0200 Subject: [PATCH 056/114] Model with mean comparison vector --- .../word2vec/W2VNrmlMemModelTheta.java | 26 ++++++++++++++++--- .../NrmlzdThetaMdlPrfmncTester.java | 9 +++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java index 79a6904..26d8a88 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java @@ -31,18 +31,36 @@ public class W2VNrmlMemModelTheta implements GenWord2VecModel { public W2VNrmlMemModelTheta(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; + // Setting mean as comparison vec + setComparisonVec(word2vec, vectorSize); // Generating index bucket for degrees generateCosineIndxMap(); } + public void setComparisonVec(Map word2vecMap, int vectorSize) { + float[] meanArr = new float[vectorSize]; + int totSize = word2vecMap.size(); + // loop all dimensions + for (int i = 0; i < vectorSize; i++) { + // loop through all the words + float[] dimsnArr = new float[totSize]; + float sum = 0; + for (float[] vecEntry : word2vecMap.values()) { + float val = vecEntry[i]; + sum += val; + } + // mean + float mean = sum / dimsnArr.length; + meanArr[i] = mean; + } + this.comparisonVec = meanArr; + } + private void generateCosineIndxMap() { cosineIndxMap = new HashMap<>(); float[] curVec; for (String word : word2vec.keySet()) { curVec = word2vec.get(word); - if (comparisonVec == null) { - comparisonVec = curVec; - } Long cosineIndx = Math .round(Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVec, curVec) * gMultiplier); @@ -96,7 +114,7 @@ public Map getClosestSubEntry(float[] vector, String subKey) { * @return closest word to the given vector alongwith it's vector */ private Map getClosestEntry(float[] vector, String subKey) { - Map closestVec= null; + Map closestVec = null; try { // Normalize incoming vector vector = Word2VecMath.normalize(vector); diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index f825246..ed068d1 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -25,6 +25,8 @@ public class NrmlzdThetaMdlPrfmncTester { @Test public void testNbmTime() { + long startTime, diff; + long totTime = 0; LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; @@ -34,10 +36,17 @@ public void testNbmTime() { LOG.info("Initializing Theta Model"); final W2VNrmlMemModelTheta memModel = new W2VNrmlMemModelTheta(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); + LOG.info("Starting Theta-Model Test"); for (int i = 0; i < centroids.length; i++) { + LOG.info("Sending query for Centroid " + (i + 1)); + startTime = System.currentTimeMillis(); lrModelWords.addAll(memModel.getClosestEntry(centroids[i]).keySet()); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } + LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From 0e70dec4c723dd97d83b186113dbb5b6cc7079dd Mon Sep 17 00:00:00 2001 From: Micha Date: Tue, 18 Sep 2018 14:28:39 +0200 Subject: [PATCH 057/114] Added a small class for reducing the original model to a model of a given size. Some smaller changes to reduce problems. --- .../word2vecrestful/tool/ModelNormalizer.java | 4 +- .../word2vecrestful/tool/ModelReducer.java | 101 ++++++++++++++++++ .../word2vec/W2VNrmlMemModelIndxdLR.java | 22 ++-- .../NrmlzdMdlPrfmncTester.java | 6 +- 4 files changed, 119 insertions(+), 14 deletions(-) create mode 100644 src/main/java/org/aksw/word2vecrestful/tool/ModelReducer.java diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java index 92ab07a..e1d5d16 100644 --- a/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java @@ -34,7 +34,7 @@ public class ModelNormalizer { * - size of the vector * @return - normalized line */ - public String getNormalizedVecLine(String word, float[] vector) { + public static String getNormalizedVecLine(String word, float[] vector) { StringBuffer resStr = new StringBuffer(); resStr.append(word); vector = Word2VecMath.normalize(vector); @@ -44,7 +44,7 @@ public String getNormalizedVecLine(String word, float[] vector) { return resStr.toString(); } - public byte[] getNormalizedVecBA(float[] vector) { + public static byte[] getNormalizedVecBA(float[] vector) { vector = Word2VecMath.normalize(vector); ByteBuffer buffer = ByteBuffer.allocate(vector.length * 4); buffer.order(ByteOrder.LITTLE_ENDIAN); diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelReducer.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelReducer.java new file mode 100644 index 0000000..98d159d --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelReducer.java @@ -0,0 +1,101 @@ +package org.aksw.word2vecrestful.tool; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ModelReducer { + + private static final Logger LOGGER = LoggerFactory.getLogger(ModelReducer.class); + + public static final int REDUCED_WORDS = 100000; + public static final String OUPUT_FILE = "reducedModel-" + REDUCED_WORDS + "-normalized.bin"; + + public static void main(String[] args) throws IOException { + LOGGER.info("Reading model..."); + Word2VecModel w2vModel = Word2VecFactory.getNormalBinModel(); + Map model = w2vModel.word2vec; + int vectorSize = w2vModel.vectorSize; + // we don't need the model anymore + w2vModel = null; + + LOGGER.info("Preparing random ids..."); + Random random = new Random(); + int wordCount = model.size(); + // Use a set to make sure that no ID is chosen twice + Set idSet = new HashSet(); + while(idSet.size() < REDUCED_WORDS) { + idSet.add(random.nextInt(wordCount)); + } + int[] wordsKept = new int[REDUCED_WORDS]; + Iterator idIter = idSet.iterator(); + for (int i = 0; i < wordsKept.length; i++) { + wordsKept[i] = idIter.next(); + } + idSet = null; + idIter = null; + // sort the array ascending + Arrays.sort(wordsKept); + // Iterate over all words and remove them if they are not listed in the + // wordsKept array + LOGGER.info("Starting to remove words..."); + int idPos = 0; + int count = 0; + Iterator iterator = model.keySet().iterator(); + while ((idPos < REDUCED_WORDS) && (iterator.hasNext())) { + iterator.next(); + if (count == wordsKept[idPos]) { + ++idPos; + } else { + // remove the word + iterator.remove(); + } + ++count; + } + LOGGER.info("Kept " + idPos + " words. Writing output file..."); + writeModel(model, vectorSize, new File(OUPUT_FILE)); + } + + public static void writeModel(Map model, int vectorSize, File outputFile) throws IOException { + // ensure directory creation + if(outputFile.getParentFile() != null) { + outputFile.getParentFile().mkdirs(); + } + // open an output stream + BufferedOutputStream bOutStrm = null; + try { + bOutStrm = new BufferedOutputStream(new FileOutputStream(outputFile)); + String words = Integer.toString(model.size()); + bOutStrm.write(words.getBytes(StandardCharsets.UTF_8)); + bOutStrm.write(ModelNormalizer.WHITESPACE_BA); + bOutStrm.write(Integer.toString(vectorSize).getBytes(StandardCharsets.UTF_8)); + bOutStrm.write(ModelNormalizer.END_LINE_BA); + LOGGER.info("Expecting " + words + " words with " + vectorSize + " values per vector."); + for (String word : model.keySet()) { + bOutStrm.write(word.getBytes(StandardCharsets.UTF_8)); + bOutStrm.write(ModelNormalizer.WHITESPACE_BA); + float[] vector = model.get(word); + bOutStrm.write(ModelNormalizer.getNormalizedVecBA(vector)); + } + } catch (final IOException e) { + LOGGER.error(e.getLocalizedMessage(), e); + } finally { + if (bOutStrm != null) { + bOutStrm.close(); + } + } + } +} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index 1987bb7..f31260a 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -29,6 +29,7 @@ public class W2VNrmlMemModelIndxdLR implements GenWord2VecModel { private Map word2vec; private int vectorSize; private float[] sdArr; + private float[] multSdArr; /** * Multiplier for the standard deviation */ @@ -87,13 +88,14 @@ public W2VNrmlMemModelIndxdLR(final Map word2vec, final int vec } public void updateSdArr(float newSigmaMult, float newAreaDivisor) { - if (newSigmaMult == sigmaMult && newAreaDivisor == areaDivisor) { - return; - } - float mult = (areaDivisor / sigmaMult) * (newSigmaMult / newAreaDivisor); +// if (newSigmaMult == sigmaMult && newAreaDivisor == areaDivisor) { +// return; +// } +// float mult = (areaDivisor / sigmaMult) * (newSigmaMult / newAreaDivisor); + float mult = (newSigmaMult / newAreaDivisor); // Updating SdArr values - for (int i = 0; i < sdArr.length; i++) { - sdArr[i] *= mult; + for (int i = 0; i < multSdArr.length; i++) { + multSdArr[i] = mult * sdArr[i]; } this.sigmaMult = newSigmaMult; this.areaDivisor = newAreaDivisor; @@ -192,7 +194,6 @@ private Map getClosestEntry(float[] vector, String subKey) { */ public void setModelVals(Map word2vecMap, int vectorSize) { float[] resArr = new float[vectorSize]; - float mult = sigmaMult / areaDivisor; int totSize = word2vecMap.size(); // loop all dimensions for (int i = 0; i < vectorSize; i++) { @@ -220,10 +221,13 @@ public void setModelVals(Map word2vecMap, int vectorSize) { } float variance = sum / dimsnArr.length; Double sd = Math.sqrt(variance); - resArr[i] = sd.floatValue() * mult; + resArr[i] = sd.floatValue(); } // Set as sdMap this.sdArr = resArr; + this.multSdArr = new float[sdArr.length]; + // Apply the multiplicator + updateSdArr(sigmaMult, areaDivisor); } /** @@ -304,7 +308,7 @@ private float[][] getMinMaxVec(float[] vector) { float[][] resVec = new float[2][vector.length]; for (int i = 0; i < vector.length; i++) { // TODO: change sdmap to array - float diff = sdArr[i]; + float diff = multSdArr[i]; // MinVec resVec[0][i] = vector[i] - diff; // MaxVec diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 48787da..33c0ed6 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -35,12 +35,12 @@ public void testNbmTime() { float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List> correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 10; - int kEnd = 20; + int kStrt = 1000; + int kEnd = 1000;//20; float sigStrt = 1; float sigEnd = 5; float arDivStrt = 1; - float arDivEnd = 10; + float arDivEnd = 1;//10; int indx = 0; int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); float[] percScore = new float[scrSize]; From 876a040330ece5b0c510f6ff79fa72d069b49746 Mon Sep 17 00:00:00 2001 From: Micha Date: Tue, 18 Sep 2018 15:36:08 +0200 Subject: [PATCH 058/114] Changed sum to be double value instead of float. --- .../word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index f31260a..354f4fe 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -200,7 +200,7 @@ public void setModelVals(Map word2vecMap, int vectorSize) { // loop through all the words float[] dimsnArr = new float[totSize]; int[] idArr = new int[totSize]; - float sum = 0; + double sum = 0; for (int j = 0; j < gWordArr.length; j++) { float val = gVecArr[j][i]; sum += val; @@ -214,12 +214,12 @@ public void setModelVals(Map word2vecMap, int vectorSize) { this.indexesArr[i] = dimValWordMap; // LOG.info("Dimension " + (i) + " index stored to memory"); // mean - float mean = sum / dimsnArr.length; + double mean = sum / dimsnArr.length; sum = 0; for (int j = 0; j < dimsnArr.length; j++) { sum += Math.pow(dimsnArr[j] - mean, 2); } - float variance = sum / dimsnArr.length; + double variance = sum / dimsnArr.length; Double sd = Math.sqrt(variance); resArr[i] = sd.floatValue(); } From c3392ab2729fad3f957d276450fb100abc254836 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 15:40:03 +0200 Subject: [PATCH 059/114] pushing test cfg changes --- .../org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 33c0ed6..2f5c116 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -35,18 +35,18 @@ public void testNbmTime() { float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; List> correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 1000; - int kEnd = 1000;//20; + int kStrt = 10000; + int kEnd = 12000;//20; float sigStrt = 1; float sigEnd = 5; float arDivStrt = 1; - float arDivEnd = 1;//10; + float arDivEnd = 10;//10; int indx = 0; int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); float[] percScore = new float[scrSize]; int[] idArr = new int[percScore.length]; final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); - for (int a = kStrt; a <= kEnd; a++) { + for (int a = kStrt; a <= kEnd; a+=10) { for (float b = arDivStrt; b <= arDivEnd; b++) { for (float c = sigStrt; c <= sigEnd; c++) { LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); From 0ad858590b8d6b9644f5901b89bf9f9ee380b170 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 15:48:10 +0200 Subject: [PATCH 060/114] reducing bucket multiplier to 100 and logging bucket size --- .../word2vecrestful/word2vec/W2VNrmlMemModelTheta.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java index 26d8a88..9bb7584 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java @@ -26,7 +26,7 @@ public class W2VNrmlMemModelTheta implements GenWord2VecModel { private int vectorSize; private Map> cosineIndxMap; private float[] comparisonVec = null; - private float gMultiplier = 10000; + private float gMultiplier = 100; public W2VNrmlMemModelTheta(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; @@ -35,6 +35,14 @@ public W2VNrmlMemModelTheta(final Map word2vec, final int vecto setComparisonVec(word2vec, vectorSize); // Generating index bucket for degrees generateCosineIndxMap(); + // TODO: Remove this + printBucketSize(); + } + + private void printBucketSize() { + for(int i : cosineIndxMap.keySet()) { + LOG.info("Bucket "+i+" has the size: "+cosineIndxMap.get(i).size()); + } } public void setComparisonVec(Map word2vecMap, int vectorSize) { From 3d3b8c26b669730cfc92f8fd8cb13b443d0ed0c4 Mon Sep 17 00:00:00 2001 From: Micha Date: Tue, 18 Sep 2018 15:55:09 +0200 Subject: [PATCH 061/114] Changed the API of the getCloses methods to reduce complexity. --- .../word2vecrestful/utils/Word2VecMath.java | 269 ++++++++++-------- .../word2vec/GenWord2VecModel.java | 4 +- .../word2vec/W2VNrmlDbModel.java | 4 +- .../word2vec/W2VNrmlMemModelBruteForce.java | 10 +- .../word2vec/W2VNrmlMemModelIndxd.java | 8 +- .../word2vec/W2VNrmlMemModelIndxdLR.java | 8 +- .../word2vec/W2VNrmlMemModelNonIndxd.java | 8 +- .../word2vec/W2VNrmlMemModelTheta.java | 8 +- .../NormalizedInMemModelTest.java | 4 +- .../NrmlzdMdlPrfmncTester.java | 33 ++- .../NrmlzdThetaMdlPrfmncTester.java | 27 +- 11 files changed, 194 insertions(+), 189 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index 840c5da..92b40b0 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -1,147 +1,166 @@ package org.aksw.word2vecrestful.utils; +import java.util.Comparator; import java.util.HashMap; import java.util.Map; +import java.util.Map.Entry; import org.dice_research.topicmodeling.commons.sort.AssociativeSort; public class Word2VecMath { - public static void main(final String[] args) { + public static void main(final String[] args) { - final float[] vecA = new float[] {11.2f, 1.4f, 1.9f, 0.1f}; - final float[] vecB = new float[] {11f, 1.3f, 1.2f, 1.1f}; + final float[] vecA = new float[] { 11.2f, 1.4f, 1.9f, 0.1f }; + final float[] vecB = new float[] { 11f, 1.3f, 1.2f, 1.1f }; - System.out.println(Word2VecMath.cosineSimilarity(vecA, vecB)); - System.out.println(Word2VecMath.cosineSimilarityNormalizedVecs(Word2VecMath.normalize(vecA), - Word2VecMath.normalize(vecB))); + System.out.println(Word2VecMath.cosineSimilarity(vecA, vecB)); + System.out.println(Word2VecMath.cosineSimilarityNormalizedVecs(Word2VecMath.normalize(vecA), + Word2VecMath.normalize(vecB))); - } + } + + public synchronized static float[] sub(final float[] vectorA, final float[] vectorB) { + if (vectorA.length == vectorB.length) { + final float[] add = new float[vectorA.length]; + for (int i = 0; i < vectorA.length; i++) { + add[i] = vectorA[i] - vectorB[i]; + } + return add; + } + return null; + } + + public synchronized static float[] add(final float[] vectorA, final float[] vectorB) { + if (vectorA.length == vectorB.length) { + final float[] add = new float[vectorA.length]; + for (int i = 0; i < vectorA.length; i++) { + add[i] = vectorA[i] + vectorB[i]; + } + return add; + } + return null; + } + + public synchronized static double cosineSimilarity(final float[] vectorA, final float[] vectorB) { + double dotProduct = 0.0; + double normA = 0.0; + double normB = 0.0; + for (int i = 0; i < vectorA.length; i++) { + dotProduct += vectorA[i] * vectorB[i]; + normA += vectorA[i] * vectorA[i]; + normB += vectorB[i] * vectorB[i]; + } + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); + } + + public synchronized static double cosineSimilarityNormalizedVecs(final float[] vectorA, final float[] vectorB) { + double c = 0.0; + for (int i = 0; i < vectorA.length; i++) { + c += vectorA[i] * vectorB[i]; + } + return c; + } - public synchronized static float[] sub(final float[] vectorA, final float[] vectorB) { - if (vectorA.length == vectorB.length) { - final float[] add = new float[vectorA.length]; - for (int i = 0; i < vectorA.length; i++) { - add[i] = vectorA[i] - vectorB[i]; - } - return add; + public synchronized static double norm(final float[] vectorA) { + double normA = 0.0; + for (int i = 0; i < vectorA.length; i++) { + normA += vectorA[i] * vectorA[i]; + } + return Math.sqrt(normA); } - return null; - } - - public synchronized static float[] add(final float[] vectorA, final float[] vectorB) { - if (vectorA.length == vectorB.length) { - final float[] add = new float[vectorA.length]; - for (int i = 0; i < vectorA.length; i++) { - add[i] = vectorA[i] + vectorB[i]; - } - return add; + + public synchronized static float[] normalize(final float[] vectorA) { + final Double normA = norm(vectorA); + for (int i = 0; i < vectorA.length; i++) { + vectorA[i] /= normA.floatValue(); + } + return vectorA; } - return null; - } - - public synchronized static double cosineSimilarity(final float[] vectorA, final float[] vectorB) { - double dotProduct = 0.0; - double normA = 0.0; - double normB = 0.0; - for (int i = 0; i < vectorA.length; i++) { - dotProduct += vectorA[i] * vectorB[i]; - normA += vectorA[i] * vectorA[i]; - normB += vectorB[i] * vectorB[i]; + + // Adding maxval and minval methods + // Method for getting the maximum value + public static float getMax(float[] inputArray) { + float maxValue = inputArray[0]; + for (int i = 1; i < inputArray.length; i++) { + if (inputArray[i] > maxValue) { + maxValue = inputArray[i]; + } + } + return maxValue; } - return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); - } - - public synchronized static double cosineSimilarityNormalizedVecs(final float[] vectorA, - final float[] vectorB) { - double c = 0.0; - for (int i = 0; i < vectorA.length; i++) { - c += vectorA[i] * vectorB[i]; + + // Method for getting the minimum value + public static float getMin(float[] inputArray) { + float minValue = inputArray[0]; + for (int i = 1; i < inputArray.length; i++) { + if (inputArray[i] < minValue) { + minValue = inputArray[i]; + } + } + return minValue; + } + + // Method to calculate mean + public static float calcMean(float[] inpArr) { + float res = 0; + for (int i = 0; i < inpArr.length; i++) { + res += inpArr[i]; + } + res = res / inpArr.length; + return res; + } + + // Method to calculate variance + public static float calcVariance(float[] inpArr) { + float res = 0; + float meanVal = calcMean(inpArr); + float len = inpArr.length; + for (int i = 0; i > len; i++) { + res += Math.pow(inpArr[i] - meanVal, 2); + } + return res; + } + + public static String findClosestNormalizedVec(Map nearbyVecs, float[] vector) { + String closestWord = null; + double maxValue = Double.NEGATIVE_INFINITY; + double temp; + if (nearbyVecs != null && vector != null && nearbyVecs.size() > 0) { + for (String word : nearbyVecs.keySet()) { + temp = Word2VecMath.cosineSimilarityNormalizedVecs(vector, nearbyVecs.get(word)); + if(temp > maxValue) { + maxValue = temp; + closestWord = word; + } + } + } + return closestWord; } - return c; - } - public synchronized static double norm(final float[] vectorA) { - double normA = 0.0; - for (int i = 0; i < vectorA.length; i++) { - normA += vectorA[i] * vectorA[i]; + /** + * Multi-threaded version of {@link #findClosestNormalizedVec(Map, float[])}. + * + * @param nearbyVecs + * @param vector + * @return + */ + public static String findClosestNormalizedVecMT(Map nearbyVecs, float[] vector) { + if(nearbyVecs !=null && vector != null && nearbyVecs.size()>0) { + Object[] result = nearbyVecs.entrySet().parallelStream().map(e -> new Object[] {e, new Double(Word2VecMath.cosineSimilarityNormalizedVecs(vector,e.getValue()))}).max(new Comparator() {@Override + public int compare(Object[] o1, Object[] o2) { + return Double.compare((Double) o1[1], (Double) o2[1]); + }}).get(); + return ((Entry) result[0]).getKey(); + } else { + return null; + } } - return Math.sqrt(normA); - } - public synchronized static float[] normalize(final float[] vectorA) { - final Double normA = norm(vectorA); - for (int i = 0; i < vectorA.length; i++) { - vectorA[i] /= normA.floatValue(); + public static double getAngDegrees(float[] vecA, float[] vecB) { + double cosSim = cosineSimilarity(vecA, vecB); + double radians = Math.acos(cosSim); + double degrees = Math.toDegrees(radians); + return degrees; } - return vectorA; - } - //Adding maxval and minval methods - //Method for getting the maximum value - public static float getMax(float[] inputArray){ - float maxValue = inputArray[0]; - for(int i=1;i < inputArray.length;i++){ - if(inputArray[i] > maxValue){ - maxValue = inputArray[i]; - } - } - return maxValue; - } - - // Method for getting the minimum value - public static float getMin(float[] inputArray){ - float minValue = inputArray[0]; - for(int i=1;ilen;i++) { - res += Math.pow(inpArr[i]- meanVal, 2); - } - return res; - } - - public static Map findClosestNormalizedVec(Map nearbyVecs, float[] vector) { - Map closestVec = new HashMap<>(); - String[] wordArr = new String[nearbyVecs.size()]; - double[] cosineValArr = new double[nearbyVecs.size()]; - if(nearbyVecs !=null && vector != null && nearbyVecs.size()>0) { - int i=0; - for (String word : nearbyVecs.keySet()) { - cosineValArr[i] = Word2VecMath.cosineSimilarityNormalizedVecs(vector, nearbyVecs.get(word)); - wordArr[i] = word; - i++; - } - AssociativeSort.quickSort(cosineValArr, wordArr); - int maxIndx = cosineValArr.length -1; - String closestWord = wordArr[maxIndx]; - closestVec.put(closestWord, nearbyVecs.get(closestWord)); - } - return closestVec; - } - - public static double getAngDegrees(float[] vecA, float[] vecB) { - double cosSim = cosineSimilarity(vecA, vecB); - double radians = Math.acos(cosSim); - double degrees = Math.toDegrees(radians); - return degrees; - } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java index 77fd3e6..4301c10 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java @@ -4,6 +4,6 @@ public interface GenWord2VecModel { public int getVectorSize(); - public Map getClosestEntry(float[] vector); - public Map getClosestSubEntry(float[] vector, String subKey); + public String getClosestEntry(float[] vector); + public String getClosestSubEntry(float[] vector, String subKey); } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java index 46daf57..f90ae1c 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java @@ -25,13 +25,13 @@ public int getVectorSize() { } @Override - public Map getClosestEntry(float[] vector) { + public String getClosestEntry(float[] vector) { // TODO Auto-generated method stub return null; } @Override - public Map getClosestSubEntry(float[] vector, String subKey) { + public String getClosestSubEntry(float[] vector, String subKey) { // TODO Auto-generated method stub return null; } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java index 918ef9e..388737c 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java @@ -45,7 +45,7 @@ public W2VNrmlMemModelBruteForce(final Map word2vec, final int * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestEntry(float[] vector) { + public String getClosestEntry(float[] vector) { return getClosestEntry(vector, null); } @@ -60,7 +60,7 @@ public Map getClosestEntry(float[] vector) { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestSubEntry(float[] vector, String subKey) { + public String getClosestSubEntry(float[] vector, String subKey) { return getClosestEntry(vector, subKey); } @@ -74,9 +74,9 @@ public Map getClosestSubEntry(float[] vector, String subKey) { * - key to subset if any * @return closest word to the given vector alongwith it's vector */ - private Map getClosestEntry(float[] vector, String subKey) { + private String getClosestEntry(float[] vector, String subKey) { Set wordSet = null; - Map closestVec = new HashMap<>(); + String closestVec = null; try { if (subKey == null) { wordSet = word2vec.keySet(); @@ -107,7 +107,7 @@ private Map getClosestEntry(float[] vector, String subKey) { if (cosineArr[j] == maxVal) { int closestWordId = idArr[j]; String closestWord = wordArr[closestWordId]; - closestVec.put(closestWord, word2vec.get(closestWord)); + closestVec = closestWord; }else { break; } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java index 20a8662..590dd00 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java @@ -91,7 +91,7 @@ private void initArrays() { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestEntry(float[] vector) { + public String getClosestEntry(float[] vector) { return getClosestEntry(vector, null); } @@ -106,7 +106,7 @@ public Map getClosestEntry(float[] vector) { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestSubEntry(float[] vector, String subKey) { + public String getClosestSubEntry(float[] vector, String subKey) { return getClosestEntry(vector, subKey); } @@ -120,9 +120,9 @@ public Map getClosestSubEntry(float[] vector, String subKey) { * - key to subset if any * @return closest word to the given vector alongwith it's vector */ - private Map getClosestEntry(float[] vector, String subKey) { + private String getClosestEntry(float[] vector, String subKey) { Set wordSet = null; - Map closestVec = null; + String closestVec = null; try { if (subKey == null) { wordSet = word2vec.keySet(); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index 354f4fe..b1b4462 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -123,7 +123,7 @@ private void initArrays() { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestEntry(float[] vector) { + public String getClosestEntry(float[] vector) { return getClosestEntry(vector, null); } @@ -138,7 +138,7 @@ public Map getClosestEntry(float[] vector) { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestSubEntry(float[] vector, String subKey) { + public String getClosestSubEntry(float[] vector, String subKey) { return getClosestEntry(vector, subKey); } @@ -152,9 +152,9 @@ public Map getClosestSubEntry(float[] vector, String subKey) { * - key to subset if any * @return closest word to the given vector alongwith it's vector */ - private Map getClosestEntry(float[] vector, String subKey) { + private String getClosestEntry(float[] vector, String subKey) { Set wordSet = null; - Map closestVec = null; + String closestVec = null; try { if (subKey == null) { wordSet = word2vec.keySet(); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java index c2c6190..4bd687b 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java @@ -84,7 +84,7 @@ private void initArrays() { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestEntry(float[] vector) { + public String getClosestEntry(float[] vector) { return getClosestEntry(vector, null); } @@ -99,7 +99,7 @@ public Map getClosestEntry(float[] vector) { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestSubEntry(float[] vector, String subKey) { + public String getClosestSubEntry(float[] vector, String subKey) { return getClosestEntry(vector, subKey); } @@ -113,9 +113,9 @@ public Map getClosestSubEntry(float[] vector, String subKey) { * - key to subset if any * @return closest word to the given vector alongwith it's vector */ - private Map getClosestEntry(float[] vector, String subKey) { + private String getClosestEntry(float[] vector, String subKey) { Set wordSet = null; - Map closestVec = null; + String closestVec = null; try { if (subKey == null) { wordSet = word2vec.keySet(); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java index 26d8a88..54bc537 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java @@ -84,7 +84,7 @@ private void generateCosineIndxMap() { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestEntry(float[] vector) { + public String getClosestEntry(float[] vector) { return getClosestEntry(vector, null); } @@ -99,7 +99,7 @@ public Map getClosestEntry(float[] vector) { * @return closest word to the given vector alongwith it's vector */ @Override - public Map getClosestSubEntry(float[] vector, String subKey) { + public String getClosestSubEntry(float[] vector, String subKey) { return getClosestEntry(vector, subKey); } @@ -113,8 +113,8 @@ public Map getClosestSubEntry(float[] vector, String subKey) { * - key to subset if any * @return closest word to the given vector alongwith it's vector */ - private Map getClosestEntry(float[] vector, String subKey) { - Map closestVec = null; + private String getClosestEntry(float[] vector, String subKey) { + String closestVec = null; try { // Normalize incoming vector vector = Word2VecMath.normalize(vector); diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index 5235658..dd5b934 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -60,10 +60,10 @@ public void testNbmTime() { for (int i=0;i closestWord = memModel.getClosestSubEntry( centroids[i], null); + String closestWord = memModel.getClosestSubEntry( centroids[i], null); diff = System.currentTimeMillis() - startTime; totTime += diff; - LOG.info(closestWord.keySet()); + LOG.info(closestWord); LOG.info("Query time recorded for Centroid " + (i+1) + " is " + diff + " milliseconds."); } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 33c0ed6..962ecc0 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -1,6 +1,5 @@ package org.aksw.word2vecrestful; -import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -33,20 +32,20 @@ public void testNbmTime() { LOG.info("Starting InMemory indexed model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; - List> correctWords = getCorrectWords(centroids, nbm); + List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 1000; - int kEnd = 1000;//20; - float sigStrt = 1; - float sigEnd = 5; - float arDivStrt = 1; - float arDivEnd = 1;//10; + int kStrt = 100; + int kEnd = 100;//20; + float sigStrt = 2; + float sigEnd = 3; + float arDivStrt = 10; + float arDivEnd = 10;//10; int indx = 0; int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); float[] percScore = new float[scrSize]; int[] idArr = new int[percScore.length]; final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); - for (int a = kStrt; a <= kEnd; a++) { + for (int a = kStrt; a <= kEnd; a+=10) { for (float b = arDivStrt; b <= arDivEnd; b++) { for (float c = sigStrt; c <= sigEnd; c++) { LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); @@ -64,12 +63,12 @@ public void testNbmTime() { + idArr[idArr.length - 1]); } - private float calcPercScore(List> correctWordSet, List lrModelWords) { + public static float calcPercScore(List correctWordSet, List lrModelWords) { float percScore = 0; int len = correctWordSet.size(); float lenInv = 100f / len; for (int i = 0; i < len; i++) { - if (correctWordSet.get(i).contains(lrModelWords.get(i))) { + if (correctWordSet.get(i).equals(lrModelWords.get(i))) { percScore += lenInv; } } @@ -87,10 +86,10 @@ private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR m for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); startTime = System.currentTimeMillis(); - Map closestWord = memModel.getClosestSubEntry(centroids[i], null); + String closestWord = memModel.getClosestSubEntry(centroids[i], null); diff = System.currentTimeMillis() - startTime; totTime += diff; - wordSet.addAll(closestWord.keySet()); + wordSet.add(closestWord); LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } @@ -99,18 +98,18 @@ private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR m return wordSet; } - public List> getCorrectWords(float[][] centroids, Word2VecModel nbm) { - List> wordSet = new ArrayList<>(); + public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { + List wordSet = new ArrayList<>(); W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); long startTime, diff; long totTime = 0; for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); startTime = System.currentTimeMillis(); - Map closestWord = bruteForce.getClosestSubEntry(centroids[i], null); + String closestWord = bruteForce.getClosestSubEntry(centroids[i], null); diff = System.currentTimeMillis() - startTime; totTime += diff; - wordSet.add(closestWord.keySet()); + wordSet.add(closestWord); LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index ed068d1..116ffbf 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -31,7 +31,7 @@ public void testNbmTime() { Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; LOG.info("Starting BruteForce-Model Test"); - List> correctWords = getCorrectWords(centroids, nbm); + List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); LOG.info("Initializing Theta Model"); final W2VNrmlMemModelTheta memModel = new W2VNrmlMemModelTheta(nbm.word2vec, nbm.vectorSize); @@ -41,42 +41,29 @@ public void testNbmTime() { for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); startTime = System.currentTimeMillis(); - lrModelWords.addAll(memModel.getClosestEntry(centroids[i]).keySet()); + lrModelWords.add(memModel.getClosestEntry(centroids[i])); diff = System.currentTimeMillis() - startTime; totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); - float percVal = calcPercScore(correctWords, lrModelWords); + float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); } - private float calcPercScore(List> correctWordSet, List lrModelWords) { - float percScore = 0; - int len = correctWordSet.size(); - float lenInv = 100f / len; - for (int i = 0; i < len; i++) { - if (correctWordSet.get(i).contains(lrModelWords.get(i))) { - percScore += lenInv; - } - } - return percScore; - - } - - public List> getCorrectWords(float[][] centroids, Word2VecModel nbm) { - List> wordSet = new ArrayList<>(); + public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { + List wordSet = new ArrayList<>(); W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); long startTime, diff; long totTime = 0; for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); startTime = System.currentTimeMillis(); - Map closestWord = bruteForce.getClosestSubEntry(centroids[i], null); + String closestWord = bruteForce.getClosestSubEntry(centroids[i], null); diff = System.currentTimeMillis() - startTime; totTime += diff; - wordSet.add(closestWord.keySet()); + wordSet.add(closestWord); LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); From 2dd29c480b62f94352c6642dae43c256b4fbdc11 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 16:19:20 +0200 Subject: [PATCH 062/114] updating multiplier logic in theta model --- .../word2vec/W2VNrmlMemModelTheta.java | 21 +++++++++++---- .../NrmlzdThetaMdlPrfmncTester.java | 27 +++++++++++-------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java index 9bb7584..f3b9ae7 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java @@ -26,7 +26,7 @@ public class W2VNrmlMemModelTheta implements GenWord2VecModel { private int vectorSize; private Map> cosineIndxMap; private float[] comparisonVec = null; - private float gMultiplier = 100; + private float gMultiplier = 10; public W2VNrmlMemModelTheta(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; @@ -35,13 +35,24 @@ public W2VNrmlMemModelTheta(final Map word2vec, final int vecto setComparisonVec(word2vec, vectorSize); // Generating index bucket for degrees generateCosineIndxMap(); - // TODO: Remove this + // TODO: Remove this printBucketSize(); } - + + public void updateGMultiplier(float gMult) { + if (this.gMultiplier == gMult) { + return; + } + this.gMultiplier = gMult; + // Generating index bucket for degrees + generateCosineIndxMap(); + // TODO: Remove this + printBucketSize(); + } + private void printBucketSize() { - for(int i : cosineIndxMap.keySet()) { - LOG.info("Bucket "+i+" has the size: "+cosineIndxMap.get(i).size()); + for (int i : cosineIndxMap.keySet()) { + LOG.info("Bucket " + i + " has the size: " + cosineIndxMap.get(i).size()); } } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index ed068d1..a76d465 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -38,18 +38,23 @@ public void testNbmTime() { List lrModelWords = new ArrayList<>(); LOG.info("Starting Theta-Model Test"); - for (int i = 0; i < centroids.length; i++) { - LOG.info("Sending query for Centroid " + (i + 1)); - startTime = System.currentTimeMillis(); - lrModelWords.addAll(memModel.getClosestEntry(centroids[i]).keySet()); - diff = System.currentTimeMillis() - startTime; - totTime += diff; - LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); + for (int mult = 10; mult < 1000; mult += 10) { + memModel.updateGMultiplier(mult); + LOG.info("Testing with multplier: " + mult); + for (int i = 0; i < centroids.length; i++) { + LOG.info("Sending query for Centroid " + (i + 1)); + startTime = System.currentTimeMillis(); + lrModelWords.addAll(memModel.getClosestEntry(centroids[i]).keySet()); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); + } + LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / centroids.length) + + " milliseconds"); + LOG.info("Predicted Words are :" + lrModelWords); + float percVal = calcPercScore(correctWords, lrModelWords); + LOG.info("Score for Test is : " + percVal + "%"); } - LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / centroids.length) + " milliseconds"); - LOG.info("Predicted Words are :" + lrModelWords); - float percVal = calcPercScore(correctWords, lrModelWords); - LOG.info("Score for Test is : " + percVal + "%"); } private float calcPercScore(List> correctWordSet, List lrModelWords) { From e567802e0e218ff57ceed5dc358d6268eb96f224 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 16:28:18 +0200 Subject: [PATCH 063/114] fixing test logic --- .../org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index a76d465..f093bb4 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -39,8 +39,9 @@ public void testNbmTime() { LOG.info("Starting Theta-Model Test"); for (int mult = 10; mult < 1000; mult += 10) { - memModel.updateGMultiplier(mult); LOG.info("Testing with multplier: " + mult); + memModel.updateGMultiplier(mult); + for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); startTime = System.currentTimeMillis(); @@ -54,6 +55,7 @@ public void testNbmTime() { LOG.info("Predicted Words are :" + lrModelWords); float percVal = calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); + lrModelWords.clear(); } } From 0e4bb4c8dc34c61a63edabfeee9555922a4cdd7a Mon Sep 17 00:00:00 2001 From: Micha Date: Tue, 18 Sep 2018 16:32:37 +0200 Subject: [PATCH 064/114] Added a multi-threading implementation of the IndxLR model. --- .../word2vecrestful/utils/Word2VecMath.java | 1 + .../word2vec/W2VNrmlMemModelBruteForce.java | 46 +++++++------ .../word2vec/W2VNrmlMemModelIndxdLR.java | 12 ++-- .../word2vec/W2VNrmlMemModelIndxdLRMulti.java | 69 +++++++++++++++++++ .../NrmlzdMdlPrfmncTester.java | 1 + 5 files changed, 101 insertions(+), 28 deletions(-) create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index 92b40b0..e4dabfc 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -5,6 +5,7 @@ import java.util.Map; import java.util.Map.Entry; +import org.dice_research.topicmodeling.commons.collections.TopDoubleObjectCollection; import org.dice_research.topicmodeling.commons.sort.AssociativeSort; public class Word2VecMath { diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java index 388737c..a1956c9 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java @@ -88,30 +88,32 @@ private String getClosestEntry(float[] vector, String subKey) { // LOG.info("Normalizing input vector"); // Normalize incoming vector vector = Word2VecMath.normalize(vector); + + return Word2VecMath.findClosestNormalizedVec(word2vec, vector); // LOG.info("fetching nearby vectors"); // calculate cosine similarity of all distances - String[] wordArr = new String[wordSet.size()]; - int[] idArr = new int[wordSet.size()]; - double[] cosineArr = new double[wordSet.size()]; - int i = 0; - for (String word : wordSet) { - wordArr[i] = word; - idArr[i] = i; - float[] wordVec = word2vec.get(word); - cosineArr[i] = Word2VecMath.cosineSimilarityNormalizedVecs(wordVec, vector); - i++; - } - cosineArr = AssociativeSort.quickSort(cosineArr, idArr); - double maxVal = cosineArr[cosineArr.length - 1]; - for (int j = cosineArr.length - 1; j >= 0; j--) { - if (cosineArr[j] == maxVal) { - int closestWordId = idArr[j]; - String closestWord = wordArr[closestWordId]; - closestVec = closestWord; - }else { - break; - } - } +// String[] wordArr = new String[wordSet.size()]; +// int[] idArr = new int[wordSet.size()]; +// double[] cosineArr = new double[wordSet.size()]; +// int i = 0; +// for (String word : wordSet) { +// wordArr[i] = word; +// idArr[i] = i; +// float[] wordVec = word2vec.get(word); +// cosineArr[i] = Word2VecMath.cosineSimilarityNormalizedVecs(wordVec, vector); +// i++; +// } +// cosineArr = AssociativeSort.quickSort(cosineArr, idArr); +// double maxVal = cosineArr[cosineArr.length - 1]; +// for (int j = cosineArr.length - 1; j >= 0; j--) { +// if (cosineArr[j] == maxVal) { +// int closestWordId = idArr[j]; +// String closestWord = wordArr[closestWordId]; +// closestVec = closestWord; +// }else { +// break; +// } +// } } catch (IOException e) { LOG.error(e.getStackTrace()); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java index b1b4462..d8ce0b7 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java @@ -27,7 +27,7 @@ public class W2VNrmlMemModelIndxdLR implements GenWord2VecModel { public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); private Map word2vec; - private int vectorSize; + protected int vectorSize; private float[] sdArr; private float[] multSdArr; /** @@ -41,15 +41,15 @@ public class W2VNrmlMemModelIndxdLR implements GenWord2VecModel { /** * Number of top elements to give priority */ - private int k = 10; + protected int k = 10; private DataSubsetProvider dataSubsetProvider; /** * Contains the sorted dimensional values mapped to their words */ - private Object[][] indexesArr; + protected Object[][] indexesArr; - private String[] gWordArr; - private float[][] gVecArr; + protected String[] gWordArr; + protected float[][] gVecArr; // TODO : Remove this private TimeLogger tl = new TimeLogger(); @@ -246,7 +246,7 @@ private Map fetchNearbyVectors(float[] vector, Set word return nearbyVecMap; } - private void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMap) { + protected void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMap) { // init score array short[] scoreArr = new short[gWordArr.length]; float[] minVec = minMaxVec[0]; diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java new file mode 100644 index 0000000..b2ab49a --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java @@ -0,0 +1,69 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.util.Arrays; +import java.util.Map; +import java.util.concurrent.atomic.AtomicIntegerArray; +import java.util.stream.IntStream; + +import org.dice_research.topicmodeling.commons.collections.TopIntIntCollection; + +public class W2VNrmlMemModelIndxdLRMulti extends W2VNrmlMemModelIndxdLR { + + public W2VNrmlMemModelIndxdLRMulti(final Map word2vec, final int vectorSize) { + super(word2vec, vectorSize); + } + + public W2VNrmlMemModelIndxdLRMulti(Map word2vec, int vectorSize, int sigmaMult, int areaDivisor) { + super(word2vec, vectorSize, sigmaMult, areaDivisor); + } + + protected void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMap) { + // init score array + AtomicIntegerArray scoreArr = new AtomicIntegerArray(gWordArr.length); + float[] minVec = minMaxVec[0]; + float[] maxVec = minMaxVec[1]; + // loop through each dimension and increment the score of words in that area + IntStream.range(0, vectorSize).parallel().forEach(i -> { + float minVal = minVec[i]; + float maxVal = maxVec[i]; + Object[] entryArr = indexesArr[i]; + int[] idArr = (int[]) entryArr[0]; + float[] dimsnValArr = (float[]) entryArr[1]; + int from = Arrays.binarySearch(dimsnValArr, minVal); + // LOG.info("From value of dimension array: " + from); + if (from < 0) { + // To select the insertion point + from = -1 - from; + } + // LOG.info("Final From value of current dimension array: " + from); + int to = Arrays.binarySearch(dimsnValArr, maxVal); + // LOG.info("To value of dimension array: " + to); + if (to < 0) { + // To select the insertion point + to = -1 - to; + } else { + // Because binarySearch returns the exact index if element exists + to++; + } + // LOG.info("Setting scores for the words between 'from' and 'to' indexes:\t" + + // from + " " + to); + // tl.logTime(9); + for (int j = from; j < to; j++) { + scoreArr.incrementAndGet(idArr[j]); + } + // tl.printTime(9, "Score set for index " + i); + }); + // find the index of the words with highest score and add them to nearbyVecMap + for (int wordId : getMaxIdList(scoreArr)) { + nearbyVecMap.put(gWordArr[wordId], gVecArr[wordId]); + } + } + + private int[] getMaxIdList(AtomicIntegerArray scoreArr) { + TopIntIntCollection collection = new TopIntIntCollection(k, false); + for (int i = 0; i < scoreArr.length(); i++) { + collection.add(scoreArr.get(i), i); + } + return collection.getObjects(); + } +} diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 962ecc0..336ecd7 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -11,6 +11,7 @@ import org.aksw.word2vecrestful.utils.Word2VecMath; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLR; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLRMulti; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; From 080c6387222f0fed2e8a53125c9582d044bcfee9 Mon Sep 17 00:00:00 2001 From: Micha Date: Tue, 18 Sep 2018 16:53:38 +0200 Subject: [PATCH 065/114] Updated multithreaded solution for indexed model. --- .../word2vec/W2VNrmlMemModelIndxdLRMulti.java | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java index b2ab49a..e743c8d 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java @@ -2,6 +2,7 @@ import java.util.Arrays; import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicIntegerArray; import java.util.stream.IntStream; @@ -19,7 +20,11 @@ public W2VNrmlMemModelIndxdLRMulti(Map word2vec, int vectorSize protected void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMap) { // init score array - AtomicIntegerArray scoreArr = new AtomicIntegerArray(gWordArr.length); +// AtomicIntegerArray scoreArr = new AtomicIntegerArray(gWordArr.length); + AtomicInteger scoreArr[] = new AtomicInteger[gWordArr.length]; + for (int j = 0; j < scoreArr.length; j++) { + scoreArr[j] = new AtomicInteger(); + } float[] minVec = minMaxVec[0]; float[] maxVec = minMaxVec[1]; // loop through each dimension and increment the score of words in that area @@ -49,7 +54,7 @@ protected void putNearbyVecs(float[][] minMaxVec, Map nearbyVec // from + " " + to); // tl.logTime(9); for (int j = from; j < to; j++) { - scoreArr.incrementAndGet(idArr[j]); + scoreArr[idArr[j]].incrementAndGet(); } // tl.printTime(9, "Score set for index " + i); }); @@ -59,6 +64,14 @@ protected void putNearbyVecs(float[][] minMaxVec, Map nearbyVec } } + private int[] getMaxIdList(AtomicInteger[] scoreArr) { + TopIntIntCollection collection = new TopIntIntCollection(k, false); + for (int i = 0; i < scoreArr.length; i++) { + collection.add(scoreArr[i].get(), i); + } + return collection.getObjects(); + } + private int[] getMaxIdList(AtomicIntegerArray scoreArr) { TopIntIntCollection collection = new TopIntIntCollection(k, false); for (int i = 0; i < scoreArr.length(); i++) { From 6cc5bff75b5a0fdc25a196eaa41cb3363bb93564 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 17:02:16 +0200 Subject: [PATCH 066/114] fixing compilation error --- .../org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index b9e43ee..0351454 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -53,7 +53,7 @@ public void testNbmTime() { LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); - float percVal = calcPercScore(correctWords, lrModelWords); + float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); lrModelWords.clear(); } From 6b742075d74c9b5c8c15bbe870d3cc4ab159828a Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 17:05:04 +0200 Subject: [PATCH 067/114] configuring mutli threaded test --- .../org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 162415f..094ca2f 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -3,8 +3,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.Map; -import java.util.Set; import java.util.concurrent.ThreadLocalRandom; import org.aksw.word2vecrestful.utils.Cfg; @@ -46,7 +44,7 @@ public void testNbmTime() { int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); float[] percScore = new float[scrSize]; int[] idArr = new int[percScore.length]; - final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); + final W2VNrmlMemModelIndxdLRMulti memModel = new W2VNrmlMemModelIndxdLRMulti(nbm.word2vec, nbm.vectorSize); for (int a = kStrt; a <= kEnd; a+=10) { for (float b = arDivStrt; b <= arDivEnd; b++) { for (float c = sigStrt; c <= sigEnd; c++) { @@ -96,7 +94,7 @@ private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR m } LOG.info( - "Average query time for W2VNrmlMemModelIndxdLR is : " + (totTime / centroids.length) + " milliseconds"); + "Average query time for W2VNrmlMemModelIndxdLRMulti is : " + (totTime / centroids.length) + " milliseconds"); return wordSet; } From 1cc51e0506bc0d7ebf53d7cc58318625c00f3299 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 18 Sep 2018 17:05:52 +0200 Subject: [PATCH 068/114] increasing k size --- .../org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index 094ca2f..b491927 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -34,8 +34,8 @@ public void testNbmTime() { List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - int kStrt = 100; - int kEnd = 100;//20; + int kStrt = 3000; + int kEnd = 4000;//20; float sigStrt = 2; float sigEnd = 3; float arDivStrt = 10; @@ -45,7 +45,7 @@ public void testNbmTime() { float[] percScore = new float[scrSize]; int[] idArr = new int[percScore.length]; final W2VNrmlMemModelIndxdLRMulti memModel = new W2VNrmlMemModelIndxdLRMulti(nbm.word2vec, nbm.vectorSize); - for (int a = kStrt; a <= kEnd; a+=10) { + for (int a = kStrt; a <= kEnd; a+=100) { for (float b = arDivStrt; b <= arDivEnd; b++) { for (float c = sigStrt; c <= sigEnd; c++) { LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); From a0208fe16de2d4b627031eca6d80eeaefe877ee8 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 15:19:26 +0200 Subject: [PATCH 069/114] euclidean distance test --- .../NormalizedInMemModelTest.java | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index dd5b934..ba628ce 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -1,11 +1,14 @@ package org.aksw.word2vecrestful; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelNonIndxd; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.log4j.PropertyConfigurator; @@ -49,7 +52,7 @@ public void testNormalizedModel() { }*/ - @Test + /*@Test public void testNbmTime() { LOG.info("Starting InMemory indexed model test!"); final W2VNrmlMemModelNonIndxd memModel = Word2VecFactory.getNormalizedBinNonIndxdModel(); @@ -70,5 +73,32 @@ public void testNbmTime() { LOG.info("Average query time: " + (totTime / centroids.length) + " milliseconds"); + }*/ + + @Test + public void testNbmTime() { + LOG.info("Starting InMemory indexed model test!"); + Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); + float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; + List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); + LOG.info("Correct Words are :" + correctWords); + + long startTime, diff; + long totTime = 0; + List wordSet = new ArrayList<>(); + for (int i=0;i closestWordMap = nbm.getClosestEntry( centroids[i]); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + wordSet.addAll(closestWordMap.keySet()); + LOG.info("Query time recorded for Centroid " + (i+1) + " is " + + diff + " milliseconds."); + } + + LOG.info("Average query time: " + (totTime / centroids.length) + " milliseconds"); + float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, wordSet); + LOG.info("Score for the Test is " + percVal + "%"); } } From 8937fea857d4cd1ec30168393c078281fbccfc71 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 15:21:48 +0200 Subject: [PATCH 070/114] static method --- .../java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index b491927..ea3586b 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -98,7 +98,7 @@ private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR m return wordSet; } - public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { + public static List getCorrectWords(float[][] centroids, Word2VecModel nbm) { List wordSet = new ArrayList<>(); W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); long startTime, diff; From 316e51b8bb9c187a17bbe4d8bde232bb3b9457fb Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 18:38:38 +0200 Subject: [PATCH 071/114] implementing multiple vector model --- .../word2vec/W2VNrmlMemModelBinSrch.java | 224 ++++++++++++++++++ .../NrmlzdMdlPrfmncTester.java | 3 +- .../NrmlzdThetaMdlPrfmncTester.java | 77 +++--- 3 files changed, 266 insertions(+), 38 deletions(-) create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java new file mode 100644 index 0000000..6066e92 --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -0,0 +1,224 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.util.BitSet; +import java.util.HashMap; +import java.util.Map; + +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.dice_research.topicmodeling.commons.sort.AssociativeSort; + +/** + * Class to encapsulate word2vec in-memory model and expose methods to perform + * search on the model + * + * @author Nikit + * + */ +public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { + public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); + + private Map word2vec; + private int vectorSize; + private float[][] comparisonVecs = null; + private String[] wordArr; + private float[][] vecArr; + private int[] indxArr; + private double[] simValArr; + private int compareVecCount = 4; + private int bucketCount = 10; + private BitSet[][] csBucketContainer; + + public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + comparisonVecs = new float[compareVecCount][vectorSize]; + csBucketContainer = new BitSet[compareVecCount][bucketCount]; + // Setting mean as comparison vec + setMeanComparisonVec(word2vec, vectorSize); + // Initialize Arrays + processCosineSim(); + // Set other comparison vecs + setAllComparisonVecs(); + + } + + private void setBucketVals(int compVecIndex, float[] comparisonVec) { + BitSet[] comparisonVecBuckets = csBucketContainer[compVecIndex]; + double cosSimVal; + int i = 0; + for (String word : word2vec.keySet()) { + cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVec, word2vec.get(word)); + // Setting bitset for the comparison vec + setValToBucket(i, cosSimVal, comparisonVecBuckets); + i++; + } + } + + private void setAllComparisonVecs() { + int diff = (word2vec.size() / compareVecCount) - 1; + int curIndx = diff; + for (int i = 1; i < compareVecCount; i++) { + comparisonVecs[i] = vecArr[indxArr[curIndx]]; + setBucketVals(i, comparisonVecs[i]); + curIndx += diff; + } + } + + private int getBucketIndex(double cosineSimVal) { + Double dIndx = (bucketCount / 2) * cosineSimVal; + return Math.round(dIndx.floatValue()); + } + + private void processCosineSim() { + this.wordArr = new String[word2vec.size()]; + this.vecArr = new float[word2vec.size()][vectorSize]; + this.indxArr = new int[word2vec.size()]; + this.simValArr = new double[word2vec.size()]; + int i = 0; + BitSet[] meanComparisonVecBuckets = csBucketContainer[0]; + for (String word : word2vec.keySet()) { + wordArr[i] = word; + float[] vec = word2vec.get(word); + vecArr[i] = vec; + indxArr[i] = i; + simValArr[i] = Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVecs[0], vec); + // Setting bitset for the first comparison vec + setValToBucket(i, simValArr[i], meanComparisonVecBuckets); + i++; + } + AssociativeSort.quickSort(simValArr, indxArr); + } + + private void setValToBucket(int wordIndex, double cosSimVal, BitSet[] meanComparisonVecBuckets) { + int bucketIndex = getBucketIndex(cosSimVal); + BitSet bitset = meanComparisonVecBuckets[bucketIndex]; + if (bitset == null) { + bitset = new BitSet(word2vec.size()); + meanComparisonVecBuckets[bucketIndex] = bitset; + } + bitset.set(wordIndex); + } + + private void setMeanComparisonVec(Map word2vecMap, int vectorSize) { + float[] meanArr = new float[vectorSize]; + int totSize = word2vecMap.size(); + // loop all dimensions + for (int i = 0; i < vectorSize; i++) { + // loop through all the words + float[] dimsnArr = new float[totSize]; + float sum = 0; + for (float[] vecEntry : word2vecMap.values()) { + float val = vecEntry[i]; + sum += val; + } + // mean + float mean = sum / dimsnArr.length; + meanArr[i] = mean; + } + this.comparisonVecs[0] = meanArr; + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * + * @return closest word to the given vector alongwith it's vector + */ + @Override + public String getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + @Override + public String getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private String getClosestEntry(float[] vector, String subKey) { + String closestVec = null; + try { + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + // calculate cosine similarity of all distances + float[] curCompVec; + BitSet finBitSet = null; + for (int i = 0; i < compareVecCount; i++) { + curCompVec = comparisonVecs[i]; + double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); + int indx = getBucketIndex(cosSimVal); + BitSet curBs = new BitSet(word2vec.size()); + curBs.or(csBucketContainer[i][indx]); + int temIndx = indx + 1; + if (temIndx < csBucketContainer[i].length) { + curBs.or(csBucketContainer[i][temIndx]); + } + temIndx = indx - 1; + if (temIndx > -1) { + curBs.or(csBucketContainer[i][temIndx]); + } + if (i == 0) { + finBitSet = curBs; + } else { + finBitSet.and(curBs); + } + } + Map nearbyVecs = new HashMap<>(); + for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1)) { + // operate on index i here + nearbyVecs.put(wordArr[i], vecArr[i]); + if (i == Integer.MAX_VALUE) { + break; // or (i+1) would overflow + } + } + closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); + } catch (Exception e) { + LOG.error(e.getStackTrace()); + } + return closestVec; + } + + /** + * Method to fetch vectorSize + * + * @return - vectorSize + */ + @Override + public int getVectorSize() { + return this.vectorSize; + } + + /** + * Method to fetch word2vec map + * + * @return - word2vec map + */ + public Map getWord2VecMap() { + return this.word2vec; + } + +} diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java index ea3586b..f8d1b41 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java @@ -9,7 +9,6 @@ import org.aksw.word2vecrestful.utils.Word2VecMath; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLR; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLRMulti; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -44,7 +43,7 @@ public void testNbmTime() { int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); float[] percScore = new float[scrSize]; int[] idArr = new int[percScore.length]; - final W2VNrmlMemModelIndxdLRMulti memModel = new W2VNrmlMemModelIndxdLRMulti(nbm.word2vec, nbm.vectorSize); + final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); for (int a = kStrt; a <= kEnd; a+=100) { for (float b = arDivStrt; b <= arDivEnd; b++) { for (float c = sigStrt; c <= sigEnd; c++) { diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 0351454..730a5e4 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -2,12 +2,10 @@ import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.Set; import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelTheta; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -23,6 +21,35 @@ public class NrmlzdThetaMdlPrfmncTester { } public static Logger LOG = LogManager.getLogger(NrmlzdThetaMdlPrfmncTester.class); + /* + * @Test public void testNbmTime() { long startTime, diff; long totTime = 0; + * LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = + * Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, + * TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; + * LOG.info("Starting BruteForce-Model Test"); List correctWords = + * getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + + * correctWords); LOG.info("Initializing Theta Model"); final + * W2VNrmlMemModelTheta memModel = new W2VNrmlMemModelTheta(nbm.word2vec, + * nbm.vectorSize); List lrModelWords = new ArrayList<>(); + * + * LOG.info("Starting Theta-Model Test"); for (int mult = 10; mult < 1000; mult + * += 10) { LOG.info("Testing with multplier: " + mult); + * memModel.updateGMultiplier(mult); + * + * for (int i = 0; i < centroids.length; i++) { + * LOG.info("Sending query for Centroid " + (i + 1)); startTime = + * System.currentTimeMillis(); + * lrModelWords.add(memModel.getClosestEntry(centroids[i])); diff = + * System.currentTimeMillis() - startTime; totTime += diff; + * LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + + * " milliseconds."); } + * LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / + * centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + + * lrModelWords); float percVal = + * NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); + * LOG.info("Score for Test is : " + percVal + "%"); lrModelWords.clear(); } } + */ + @Test public void testNbmTime() { long startTime, diff; @@ -31,49 +58,27 @@ public void testNbmTime() { Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; LOG.info("Starting BruteForce-Model Test"); - List correctWords = getCorrectWords(centroids, nbm); + List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing Theta Model"); - final W2VNrmlMemModelTheta memModel = new W2VNrmlMemModelTheta(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); + final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting Theta-Model Test"); - for (int mult = 10; mult < 1000; mult += 10) { - LOG.info("Testing with multplier: " + mult); - memModel.updateGMultiplier(mult); - - for (int i = 0; i < centroids.length; i++) { - LOG.info("Sending query for Centroid " + (i + 1)); - startTime = System.currentTimeMillis(); - lrModelWords.add(memModel.getClosestEntry(centroids[i])); - diff = System.currentTimeMillis() - startTime; - totTime += diff; - LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); - } - LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / centroids.length) - + " milliseconds"); - LOG.info("Predicted Words are :" + lrModelWords); - float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); - LOG.info("Score for Test is : " + percVal + "%"); - lrModelWords.clear(); - } - } + LOG.info("Starting W2VNrmlMemModelBinSrch Test"); - public List getCorrectWords(float[][] centroids, Word2VecModel nbm) { - List wordSet = new ArrayList<>(); - W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); - long startTime, diff; - long totTime = 0; for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); startTime = System.currentTimeMillis(); - String closestWord = bruteForce.getClosestSubEntry(centroids[i], null); + lrModelWords.add(memModel.getClosestEntry(centroids[i])); diff = System.currentTimeMillis() - startTime; totTime += diff; - wordSet.add(closestWord); LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); - return wordSet; + LOG.info( + "Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info("Predicted Words are :" + lrModelWords); + float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); + LOG.info("Score for Test is : " + percVal + "%"); + lrModelWords.clear(); } } From 3c06b49c63bf41226594229030e6d984d4510641 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 21:12:41 +0200 Subject: [PATCH 072/114] fixing index logic --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index 6066e92..d1427d8 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -67,7 +67,7 @@ private void setAllComparisonVecs() { } private int getBucketIndex(double cosineSimVal) { - Double dIndx = (bucketCount / 2) * cosineSimVal; + Double dIndx = ((bucketCount-1d) / 2d) * (cosineSimVal + 1d); return Math.round(dIndx.floatValue()); } From aab6df74d2cab70eb1b548406fe12210417088fe Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 21:46:22 +0200 Subject: [PATCH 073/114] putting logs --- .../word2vec/W2VNrmlMemModelBinSrch.java | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index d1427d8..7689410 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -9,6 +9,8 @@ import org.apache.log4j.Logger; import org.dice_research.topicmodeling.commons.sort.AssociativeSort; +import nikit.test.TimeLogger; + /** * Class to encapsulate word2vec in-memory model and expose methods to perform * search on the model @@ -29,7 +31,8 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private int compareVecCount = 4; private int bucketCount = 10; private BitSet[][] csBucketContainer; - + // TODO : Remove this + private TimeLogger tl = new TimeLogger(); public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; @@ -167,6 +170,7 @@ private String getClosestEntry(float[] vector, String subKey) { // calculate cosine similarity of all distances float[] curCompVec; BitSet finBitSet = null; + tl.logTime(1); for (int i = 0; i < compareVecCount; i++) { curCompVec = comparisonVecs[i]; double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); @@ -187,6 +191,8 @@ private String getClosestEntry(float[] vector, String subKey) { finBitSet.and(curBs); } } + tl.printTime(1, "Setting Bits"); + tl.logTime(1); Map nearbyVecs = new HashMap<>(); for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1)) { // operate on index i here @@ -195,9 +201,12 @@ private String getClosestEntry(float[] vector, String subKey) { break; // or (i+1) would overflow } } + tl.printTime(1, "Extracting words"); + tl.logTime(1); closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); + tl.printTime(1, "finding closest word"); } catch (Exception e) { - LOG.error(e.getStackTrace()); + e.printStackTrace(); } return closestVec; } From 83b6b2f7d80c2cda5cca6de558eb2ed1b69ff7e8 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 21:58:40 +0200 Subject: [PATCH 074/114] null check --- .../word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index 7689410..9634b53 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -33,6 +33,7 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private BitSet[][] csBucketContainer; // TODO : Remove this private TimeLogger tl = new TimeLogger(); + public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; @@ -70,7 +71,7 @@ private void setAllComparisonVecs() { } private int getBucketIndex(double cosineSimVal) { - Double dIndx = ((bucketCount-1d) / 2d) * (cosineSimVal + 1d); + Double dIndx = ((bucketCount - 1d) / 2d) * (cosineSimVal + 1d); return Math.round(dIndx.floatValue()); } @@ -178,11 +179,11 @@ private String getClosestEntry(float[] vector, String subKey) { BitSet curBs = new BitSet(word2vec.size()); curBs.or(csBucketContainer[i][indx]); int temIndx = indx + 1; - if (temIndx < csBucketContainer[i].length) { + if (temIndx < csBucketContainer[i].length && csBucketContainer[i][temIndx] != null) { curBs.or(csBucketContainer[i][temIndx]); } temIndx = indx - 1; - if (temIndx > -1) { + if (temIndx > -1 && csBucketContainer[i][temIndx] != null) { curBs.or(csBucketContainer[i][temIndx]); } if (i == 0) { From 4518fc309888dbed474a905caf228f0c52f956a2 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 22:05:58 +0200 Subject: [PATCH 075/114] changing config --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index 9634b53..ec60f55 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -28,8 +28,8 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private float[][] vecArr; private int[] indxArr; private double[] simValArr; - private int compareVecCount = 4; - private int bucketCount = 10; + private int compareVecCount = 2; + private int bucketCount = 30; private BitSet[][] csBucketContainer; // TODO : Remove this private TimeLogger tl = new TimeLogger(); From 3a0d5443de20f4e85697072684acaf6870aa97f4 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 22:14:12 +0200 Subject: [PATCH 076/114] config change --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index ec60f55..fd6c56a 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -28,8 +28,8 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private float[][] vecArr; private int[] indxArr; private double[] simValArr; - private int compareVecCount = 2; - private int bucketCount = 30; + private int compareVecCount = 10; + private int bucketCount = 15; private BitSet[][] csBucketContainer; // TODO : Remove this private TimeLogger tl = new TimeLogger(); From ddf3c52e6c9c8d0d1e52a5fd8d33fd7da4591ced Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 22:21:02 +0200 Subject: [PATCH 077/114] changing config --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index fd6c56a..bafb86c 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -29,7 +29,7 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private int[] indxArr; private double[] simValArr; private int compareVecCount = 10; - private int bucketCount = 15; + private int bucketCount = 10; private BitSet[][] csBucketContainer; // TODO : Remove this private TimeLogger tl = new TimeLogger(); From d1fb2fb8f0675df7ab7a4ebbc48371e7393b8f87 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 22:28:06 +0200 Subject: [PATCH 078/114] config change --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index bafb86c..f2d2247 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -28,7 +28,7 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private float[][] vecArr; private int[] indxArr; private double[] simValArr; - private int compareVecCount = 10; + private int compareVecCount = 100; private int bucketCount = 10; private BitSet[][] csBucketContainer; // TODO : Remove this From 54a6fd42b7647b63aa99003e6b7059458431a2bc Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 22:52:41 +0200 Subject: [PATCH 079/114] config change --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index f2d2247..d94354d 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -28,7 +28,7 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private float[][] vecArr; private int[] indxArr; private double[] simValArr; - private int compareVecCount = 100; + private int compareVecCount = 200; private int bucketCount = 10; private BitSet[][] csBucketContainer; // TODO : Remove this From 284b8293844ee1b23ba1385c5dd09966cea46483 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 19 Sep 2018 23:05:15 +0200 Subject: [PATCH 080/114] config change --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index d94354d..b18c3bd 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -28,7 +28,7 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private float[][] vecArr; private int[] indxArr; private double[] simValArr; - private int compareVecCount = 200; + private int compareVecCount = 150; private int bucketCount = 10; private BitSet[][] csBucketContainer; // TODO : Remove this From 9c740bb872343a914aadd2e41095f2ee2e21822e Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 21 Sep 2018 14:53:35 +0200 Subject: [PATCH 081/114] putting euclidean distance logic --- .../word2vec/W2VNrmlMemModelBinSrch.java | 46 +++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index b18c3bd..9515978 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -28,7 +28,7 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private float[][] vecArr; private int[] indxArr; private double[] simValArr; - private int compareVecCount = 150; + private int compareVecCount = 100; private int bucketCount = 10; private BitSet[][] csBucketContainer; // TODO : Remove this @@ -164,7 +164,7 @@ public String getClosestSubEntry(float[] vector, String subKey) { * @return closest word to the given vector alongwith it's vector */ private String getClosestEntry(float[] vector, String subKey) { - String closestVec = null; + String closestWord = null; try { // Normalize incoming vector vector = Word2VecMath.normalize(vector); @@ -204,12 +204,50 @@ private String getClosestEntry(float[] vector, String subKey) { } tl.printTime(1, "Extracting words"); tl.logTime(1); - closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); + closestWord = findClosestWord(nearbyVecs, vector); tl.printTime(1, "finding closest word"); } catch (Exception e) { e.printStackTrace(); } - return closestVec; + return closestWord; + } + + private String findClosestWord(Map nearbyVecs, float[] vector) { + double minDist = -2; + String minWord = null; + double tempDist; + // Loop on the subset + for (String word : nearbyVecs.keySet()) { + float[] wordvec = word2vec.get(word); + tempDist = getSqEucDist(vector, wordvec, minDist); + if (tempDist != -1) { + minWord = word; + minDist = tempDist; + } + } + return minWord; + } + + /** + * Method to find the squared value of euclidean distance between two vectors if + * it is less than the provided minimum distance value, otherwise return -1 + * + * @param arr1 + * - first vector + * @param arr2 + * - second vector + * @param minDist + * - minimum distance constraint + * @return squared euclidean distance between two vector or -1 + */ + private double getSqEucDist(float[] arr1, float[] arr2, double minDist) { + double dist = 0; + for (int i = 0; i < vectorSize; i++) { + dist += Math.pow(arr1[i] - arr2[i], 2); + if (minDist != -2 && dist > minDist) + return -1; + } + return dist; } /** From d8e1576fa63894061054d8ee67cc273ce5695679 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 21 Sep 2018 15:20:03 +0200 Subject: [PATCH 082/114] changing nearby vec logic --- .../word2vec/W2VNrmlMemModelBinSrch.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index 9515978..5a0d243 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -1,7 +1,6 @@ package org.aksw.word2vecrestful.word2vec; import java.util.BitSet; -import java.util.HashMap; import java.util.Map; import org.aksw.word2vecrestful.utils.Word2VecMath; @@ -194,17 +193,18 @@ private String getClosestEntry(float[] vector, String subKey) { } tl.printTime(1, "Setting Bits"); tl.logTime(1); - Map nearbyVecs = new HashMap<>(); - for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1)) { + int[] nearbyIndexes = new int[finBitSet.cardinality()]; + int j = 0; + for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { // operate on index i here - nearbyVecs.put(wordArr[i], vecArr[i]); + nearbyIndexes[j] = i; if (i == Integer.MAX_VALUE) { break; // or (i+1) would overflow } } tl.printTime(1, "Extracting words"); tl.logTime(1); - closestWord = findClosestWord(nearbyVecs, vector); + closestWord = findClosestWord(nearbyIndexes, vector); tl.printTime(1, "finding closest word"); } catch (Exception e) { e.printStackTrace(); @@ -212,16 +212,15 @@ private String getClosestEntry(float[] vector, String subKey) { return closestWord; } - private String findClosestWord(Map nearbyVecs, float[] vector) { + private String findClosestWord(int[] nearbyIndexes, float[] vector) { double minDist = -2; String minWord = null; double tempDist; - // Loop on the subset - for (String word : nearbyVecs.keySet()) { - float[] wordvec = word2vec.get(word); + for (int indx : nearbyIndexes) { + float[] wordvec = vecArr[indx]; tempDist = getSqEucDist(vector, wordvec, minDist); if (tempDist != -1) { - minWord = word; + minWord = wordArr[indx]; minDist = tempDist; } } From d491eb664da99590edaaf4646519f3e5278d88cd Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 21 Sep 2018 17:56:04 +0200 Subject: [PATCH 083/114] making KMeans model --- pom.xml | 7 +- .../word2vecrestful/utils/ClusterableVec.java | 21 ++ .../word2vecrestful/utils/Word2VecMath.java | 28 ++ .../word2vec/W2VNrmlMemModelKMeans.java | 250 ++++++++++++++++++ .../NrmlzdThetaMdlPrfmncTester.java | 11 +- 5 files changed, 310 insertions(+), 7 deletions(-) create mode 100644 src/main/java/org/aksw/word2vecrestful/utils/ClusterableVec.java create mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java diff --git a/pom.xml b/pom.xml index 90e1d46..8cf8d60 100644 --- a/pom.xml +++ b/pom.xml @@ -91,7 +91,12 @@ slf4j-log4j12 ${slf4j.version} - + + + org.apache.commons + commons-math3 + 3.5 + src/test/java diff --git a/src/main/java/org/aksw/word2vecrestful/utils/ClusterableVec.java b/src/main/java/org/aksw/word2vecrestful/utils/ClusterableVec.java new file mode 100644 index 0000000..d858f2a --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/utils/ClusterableVec.java @@ -0,0 +1,21 @@ +package org.aksw.word2vecrestful.utils; + +import org.apache.commons.math3.ml.clustering.Clusterable; + +public class ClusterableVec implements Clusterable { + private float[] vec; + + public ClusterableVec(float[] vec) { + this.vec = vec; + } + + @Override + public double[] getPoint() { + return Word2VecMath.convertFloatsToDoubles(vec); + } + + public float[] getVec() { + return this.vec; + } + +} diff --git a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java index e4dabfc..9370678 100644 --- a/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/Word2VecMath.java @@ -164,4 +164,32 @@ public static double getAngDegrees(float[] vecA, float[] vecB) { double degrees = Math.toDegrees(radians); return degrees; } + + public static double[] convertFloatsToDoubles(float[] input) + { + if (input == null) + { + return null; // Or throw an exception - your choice + } + double[] output = new double[input.length]; + for (int i = 0; i < input.length; i++) + { + output[i] = input[i]; + } + return output; + } + + public static float[] convertDoublesToFloats(double[] input) + { + if (input == null) + { + return null; // Or throw an exception - your choice + } + float[] output = new float[input.length]; + for (int i = 0; i < input.length; i++) + { + output[i] = (float) input[i]; + } + return output; + } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java new file mode 100644 index 0000000..fe3ea5e --- /dev/null +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -0,0 +1,250 @@ +package org.aksw.word2vecrestful.word2vec; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.List; +import java.util.Map; + +import org.aksw.word2vecrestful.utils.ClusterableVec; +import org.aksw.word2vecrestful.utils.Word2VecMath; +import org.apache.commons.math3.ml.clustering.CentroidCluster; +import org.apache.commons.math3.ml.clustering.Clusterable; +import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import nikit.test.TimeLogger; + +/** + * Class to encapsulate word2vec in-memory model and expose methods to perform + * search on the model + * + * @author Nikit + * + */ +public class W2VNrmlMemModelKMeans implements GenWord2VecModel { + public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); + + private Map word2vec; + private int vectorSize; + private float[][] comparisonVecs = null; + private String[] wordArr; + private float[][] vecArr; + private int compareVecCount = 100; + private int bucketCount = 10; + private BitSet[][] csBucketContainer; + // TODO : Remove this + private TimeLogger tl = new TimeLogger(); + + public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) { + this.word2vec = word2vec; + this.vectorSize = vectorSize; + comparisonVecs = new float[compareVecCount][vectorSize]; + csBucketContainer = new BitSet[compareVecCount][bucketCount]; + // Fetch comparison vectors + generateComparisonVectors(); + // Initialize Arrays + processCosineSim(); + + } + + private void generateComparisonVectors() { + KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount); + List vecList = new ArrayList<>(); + for (float[] vec : word2vec.values()) { + vecList.add(getClusterablePoint(vec)); + } + List> compVecList = clusterer.cluster(vecList); + int i = 0; + for (CentroidCluster entry : compVecList) { + Clusterable centroid = entry.getCenter(); + float[] fCentroid = Word2VecMath.convertDoublesToFloats(centroid.getPoint()); + comparisonVecs[i] = fCentroid; + i++; + } + } + + public static ClusterableVec getClusterablePoint(float[] vec) { + return new ClusterableVec(vec); + } + + private void processCosineSim() { + double cosSimVal; + this.wordArr = new String[word2vec.size()]; + this.vecArr = new float[word2vec.size()][vectorSize]; + int i = 0; + for (String word : word2vec.keySet()) { + wordArr[i] = word; + float[] vec = word2vec.get(word); + vecArr[i] = vec; + for (int j = 0; j < compareVecCount; j++) { + BitSet[] comparisonVecBuckets = csBucketContainer[j]; + cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVecs[j], vec); + // Setting bitset for the comparison vec + setValToBucket(i, cosSimVal, comparisonVecBuckets); + } + i++; + } + } + + private int getBucketIndex(double cosineSimVal) { + Double dIndx = ((bucketCount - 1d) / 2d) * (cosineSimVal + 1d); + return Math.round(dIndx.floatValue()); + } + + private void setValToBucket(int wordIndex, double cosSimVal, BitSet[] meanComparisonVecBuckets) { + int bucketIndex = getBucketIndex(cosSimVal); + BitSet bitset = meanComparisonVecBuckets[bucketIndex]; + if (bitset == null) { + bitset = new BitSet(word2vec.size()); + meanComparisonVecBuckets[bucketIndex] = bitset; + } + bitset.set(wordIndex); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * + * @return closest word to the given vector alongwith it's vector + */ + @Override + public String getClosestEntry(float[] vector) { + return getClosestEntry(vector, null); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + @Override + public String getClosestSubEntry(float[] vector, String subKey) { + return getClosestEntry(vector, subKey); + } + + /** + * Method to fetch the closest word entry for a given vector using cosine + * similarity + * + * @param vector + * - vector to find closest word to + * @param subKey + * - key to subset if any + * @return closest word to the given vector alongwith it's vector + */ + private String getClosestEntry(float[] vector, String subKey) { + String closestWord = null; + try { + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + // calculate cosine similarity of all distances + float[] curCompVec; + BitSet finBitSet = null; + tl.logTime(1); + for (int i = 0; i < compareVecCount; i++) { + curCompVec = comparisonVecs[i]; + double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); + int indx = getBucketIndex(cosSimVal); + BitSet curBs = new BitSet(word2vec.size()); + curBs.or(csBucketContainer[i][indx]); + int temIndx = indx + 1; + if (temIndx < csBucketContainer[i].length && csBucketContainer[i][temIndx] != null) { + curBs.or(csBucketContainer[i][temIndx]); + } + temIndx = indx - 1; + if (temIndx > -1 && csBucketContainer[i][temIndx] != null) { + curBs.or(csBucketContainer[i][temIndx]); + } + if (i == 0) { + finBitSet = curBs; + } else { + finBitSet.and(curBs); + } + } + tl.printTime(1, "Setting Bits"); + tl.logTime(1); + int[] nearbyIndexes = new int[finBitSet.cardinality()]; + int j = 0; + for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { + // operate on index i here + nearbyIndexes[j] = i; + if (i == Integer.MAX_VALUE) { + break; // or (i+1) would overflow + } + } + tl.printTime(1, "Extracting words"); + tl.logTime(1); + closestWord = findClosestWord(nearbyIndexes, vector); + tl.printTime(1, "finding closest word"); + } catch (Exception e) { + e.printStackTrace(); + } + return closestWord; + } + + private String findClosestWord(int[] nearbyIndexes, float[] vector) { + double minDist = -2; + String minWord = null; + double tempDist; + for (int indx : nearbyIndexes) { + float[] wordvec = vecArr[indx]; + tempDist = getSqEucDist(vector, wordvec, minDist); + if (tempDist != -1) { + minWord = wordArr[indx]; + minDist = tempDist; + } + } + return minWord; + } + + /** + * Method to find the squared value of euclidean distance between two vectors if + * it is less than the provided minimum distance value, otherwise return -1 + * + * @param arr1 + * - first vector + * @param arr2 + * - second vector + * @param minDist + * - minimum distance constraint + * @return squared euclidean distance between two vector or -1 + */ + private double getSqEucDist(float[] arr1, float[] arr2, double minDist) { + double dist = 0; + for (int i = 0; i < vectorSize; i++) { + dist += Math.pow(arr1[i] - arr2[i], 2); + if (minDist != -2 && dist > minDist) + return -1; + } + return dist; + } + + /** + * Method to fetch vectorSize + * + * @return - vectorSize + */ + @Override + public int getVectorSize() { + return this.vectorSize; + } + + /** + * Method to fetch word2vec map + * + * @return - word2vec map + */ + public Map getWord2VecMap() { + return this.word2vec; + } + +} diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 730a5e4..6408e38 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -4,8 +4,7 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -60,11 +59,11 @@ public void testNbmTime() { LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); - final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelKMeans Model"); + final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelBinSrch Test"); + LOG.info("Starting W2VNrmlMemModelKMeans Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -75,7 +74,7 @@ public void testNbmTime() { LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info( - "Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); + "Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From 6b621e8510450031152b0e5ba5b449c25878b46a Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 4 Oct 2018 16:02:41 +0200 Subject: [PATCH 084/114] setting maxiterations to 10 --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index fe3ea5e..72ac267 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -49,7 +49,7 @@ public W2VNrmlMemModelKMeans(final Map word2vec, final int vect } private void generateComparisonVectors() { - KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount); + KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount, 10); List vecList = new ArrayList<>(); for (float[] vec : word2vec.values()) { vecList.add(getClusterablePoint(vec)); From 6080c3afabe0a7490e601a97bbd97fec70bce70c Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 4 Oct 2018 17:04:20 +0200 Subject: [PATCH 085/114] logs included --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 72ac267..944ebf0 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -41,8 +41,10 @@ public W2VNrmlMemModelKMeans(final Map word2vec, final int vect this.vectorSize = vectorSize; comparisonVecs = new float[compareVecCount][vectorSize]; csBucketContainer = new BitSet[compareVecCount][bucketCount]; + LOG.info("Starting Generation of comparison vectors!"); // Fetch comparison vectors generateComparisonVectors(); + LOG.info("Comparison vectors generated. Building buckets."); // Initialize Arrays processCosineSim(); From 9fed0fd269f45626a97f55318fb5cd79ac15cf6f Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 4 Oct 2018 18:34:33 +0200 Subject: [PATCH 086/114] previous model --- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 6408e38..943941c 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -4,7 +4,7 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -59,11 +59,11 @@ public void testNbmTime() { LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelKMeans Model"); - final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); + final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelKMeans Test"); + LOG.info("Starting W2VNrmlMemModelBinSrch Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -74,7 +74,7 @@ public void testNbmTime() { LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info( - "Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); + "Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From ef6aa6e4b1384331818e12c5e58ad82f8c8ea047 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 8 Oct 2018 12:03:49 +0200 Subject: [PATCH 087/114] changing config --- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 943941c..6408e38 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -4,7 +4,7 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -59,11 +59,11 @@ public void testNbmTime() { LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); - final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelKMeans Model"); + final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelBinSrch Test"); + LOG.info("Starting W2VNrmlMemModelKMeans Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -74,7 +74,7 @@ public void testNbmTime() { LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info( - "Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); + "Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From fcf91e4cfe3533ba1addaca83209e7cd6004ae44 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 8 Oct 2018 12:04:16 +0200 Subject: [PATCH 088/114] config change --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 944ebf0..2326182 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -30,8 +30,9 @@ public class W2VNrmlMemModelKMeans implements GenWord2VecModel { private float[][] comparisonVecs = null; private String[] wordArr; private float[][] vecArr; - private int compareVecCount = 100; + private int compareVecCount = 10; private int bucketCount = 10; + private int kMeansMaxItr = 10; private BitSet[][] csBucketContainer; // TODO : Remove this private TimeLogger tl = new TimeLogger(); @@ -51,7 +52,7 @@ public W2VNrmlMemModelKMeans(final Map word2vec, final int vect } private void generateComparisonVectors() { - KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount, 10); + KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount, kMeansMaxItr); List vecList = new ArrayList<>(); for (float[] vec : word2vec.values()) { vecList.add(getClusterablePoint(vec)); From f50d36b89c79fae7335a4d1d7a0f7cce62fba2e1 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Mon, 8 Oct 2018 12:20:12 +0200 Subject: [PATCH 089/114] config change --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 2326182..fc4086c 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -30,9 +30,9 @@ public class W2VNrmlMemModelKMeans implements GenWord2VecModel { private float[][] comparisonVecs = null; private String[] wordArr; private float[][] vecArr; - private int compareVecCount = 10; + private int compareVecCount = 150; private int bucketCount = 10; - private int kMeansMaxItr = 10; + private int kMeansMaxItr = 5; private BitSet[][] csBucketContainer; // TODO : Remove this private TimeLogger tl = new TimeLogger(); From 0fdfaeefe55b8b12776810c8b32d6bb4de97c0ef Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 9 Oct 2018 15:03:22 +0200 Subject: [PATCH 090/114] comparison vecs persistence logic added --- .../word2vec/W2VNrmlMemModelKMeans.java | 104 ++++++++++++++++-- .../NrmlzdThetaMdlPrfmncTester.java | 3 +- 2 files changed, 98 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index fc4086c..9a2b4d4 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -1,5 +1,9 @@ package org.aksw.word2vecrestful.word2vec; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; import java.util.ArrayList; import java.util.BitSet; import java.util.List; @@ -13,6 +17,9 @@ import org.apache.log4j.LogManager; import org.apache.log4j.Logger; +import com.opencsv.CSVReader; +import com.opencsv.CSVWriter; + import nikit.test.TimeLogger; /** @@ -30,29 +37,44 @@ public class W2VNrmlMemModelKMeans implements GenWord2VecModel { private float[][] comparisonVecs = null; private String[] wordArr; private float[][] vecArr; - private int compareVecCount = 150; + private int compareVecCount = 100; private int bucketCount = 10; - private int kMeansMaxItr = 5; + private int kMeansMaxItr = 10; private BitSet[][] csBucketContainer; + private String vecFilePath = "data/kmeans/comparison-vecs.csv"; // TODO : Remove this private TimeLogger tl = new TimeLogger(); - public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) { + public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) throws IOException { this.word2vec = word2vec; this.vectorSize = vectorSize; comparisonVecs = new float[compareVecCount][vectorSize]; csBucketContainer = new BitSet[compareVecCount][bucketCount]; - LOG.info("Starting Generation of comparison vectors!"); - // Fetch comparison vectors - generateComparisonVectors(); - LOG.info("Comparison vectors generated. Building buckets."); + fetchComparisonVectors(); // Initialize Arrays processCosineSim(); + } + private void fetchComparisonVectors() throws IOException { + File vecFile = new File(vecFilePath); + if (vecFile.exists()) { + LOG.info("Reading Comparsion vectors from the file."); + // read the persisted vectors + comparisonVecs = readVecsFromFile(vecFile); + } else { + LOG.info("Starting Generation of comparison vectors!"); + // Fetch comparison vectors + generateComparisonVectors(); + // persist the generated vectors + + writeVecsToFile(comparisonVecs, vecFile); + } + LOG.info("Comparison vectors generated/fetched. Building buckets."); } private void generateComparisonVectors() { - KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount, kMeansMaxItr); + KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount, + kMeansMaxItr); List vecList = new ArrayList<>(); for (float[] vec : word2vec.values()) { vecList.add(getClusterablePoint(vec)); @@ -61,6 +83,7 @@ private void generateComparisonVectors() { int i = 0; for (CentroidCluster entry : compVecList) { Clusterable centroid = entry.getCenter(); + LOG.info("Number of points in the cluster " + (i + 1) + " are: " + entry.getPoints().size()); float[] fCentroid = Word2VecMath.convertDoublesToFloats(centroid.getPoint()); comparisonVecs[i] = fCentroid; i++; @@ -250,4 +273,69 @@ public Map getWord2VecMap() { return this.word2vec; } + public static float[][] readVecsFromFile(File inputFile) throws IOException { + float[][] vecArr = null; + FileReader fileReader; + CSVReader reader = null; + try { + fileReader = new FileReader(inputFile); + reader = new CSVReader(fileReader); + List vecList = reader.readAll(); + vecArr = new float[vecList.size()][vecList.get(0).length]; + for (int i = 0; i < vecList.size(); i++) { + vecArr[i] = convertToFloatArr(vecList.get(i)); + } + + } catch (IOException e) { + e.printStackTrace(); + } finally { + if (reader != null) { + // closing writer connection + reader.close(); + } + } + return vecArr; + } + + public static void writeVecsToFile(float[][] vecArr, File outputFile) throws IOException { + outputFile.getParentFile().mkdirs(); + CSVWriter writer = null; + try { + // create FileWriter object with file as parameter + FileWriter fileWriter = new FileWriter(outputFile); + + // create CSVWriter object filewriter object as parameter + writer = new CSVWriter(fileWriter); + for (int i = 0; i < vecArr.length; i++) { + float[] vec = vecArr[i]; + String[] line = convertToStrArr(vec); + writer.writeNext(line); + } + + } catch (IOException e) { + e.printStackTrace(); + } finally { + if (writer != null) { + // closing writer connection + writer.close(); + } + } + } + + public static String[] convertToStrArr(float[] vec) { + String[] resArr = new String[vec.length]; + for (int i = 0; i < resArr.length; i++) { + resArr[i] = String.valueOf(vec[i]); + } + return resArr; + } + + public static float[] convertToFloatArr(String[] vec) { + float[] resArr = new float[vec.length]; + for (int i = 0; i < resArr.length; i++) { + resArr[i] = Float.parseFloat(vec[i]); + } + return resArr; + } + } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 6408e38..ef4211d 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -1,5 +1,6 @@ package org.aksw.word2vecrestful; +import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -50,7 +51,7 @@ public class NrmlzdThetaMdlPrfmncTester { */ @Test - public void testNbmTime() { + public void testNbmTime() throws IOException { long startTime, diff; long totTime = 0; LOG.info("Starting InMemory Theta Model test!"); From f41c869cb1deab8b2d4e177f126a88b5fdcc94b5 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Tue, 9 Oct 2018 17:42:40 +0200 Subject: [PATCH 091/114] config change --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 9a2b4d4..08e2a5b 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -39,7 +39,7 @@ public class W2VNrmlMemModelKMeans implements GenWord2VecModel { private float[][] vecArr; private int compareVecCount = 100; private int bucketCount = 10; - private int kMeansMaxItr = 10; + private int kMeansMaxItr = 5; private BitSet[][] csBucketContainer; private String vecFilePath = "data/kmeans/comparison-vecs.csv"; // TODO : Remove this From 827dde6dc3b398e2d4abaa74533f223ef880340c Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 10 Oct 2018 11:38:00 +0200 Subject: [PATCH 092/114] adding more test cases --- src/main/java/nikit/test/TestConst.java | 16 +++ .../NormalizedInMemModelTest.java | 109 +++++++++--------- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/src/main/java/nikit/test/TestConst.java b/src/main/java/nikit/test/TestConst.java index a2c26f4..3e77319 100644 --- a/src/main/java/nikit/test/TestConst.java +++ b/src/main/java/nikit/test/TestConst.java @@ -16,4 +16,20 @@ public interface TestConst { public static final float[] CENT3 = {0.0045599197f,-0.016655529f,0.03039212f,0.072564f,-0.012314448f,-0.017728124f,-0.014679423f,-0.03531948f,-0.021118527f,0.040627014f,-0.029296335f,0.06789777f,0.013045308f,0.00410344f,-0.0684175f,0.0019473588f,-0.012823016f,0.056388646f,0.039152887f,0.021038044f,-0.00095268164f,-0.015722638f,0.016130328f,0.06405417f,0.035486434f,0.0012872473f,-0.04126779f,0.042538967f,-0.057445608f,0.020221751f,0.057181627f,-0.0595517f,-0.044382412f,-0.06861117f,0.04883461f,-0.084817395f,-0.058351003f,0.024775567f,0.018097626f,0.018196356f,-0.034807358f,0.013536167f,-0.027318547f,0.0952005f,-0.05871476f,-0.019638764f,-0.03597215f,-0.049303804f,-0.0021026363f,0.094765685f,-0.088323735f,0.07100553f,-0.01156632f,-0.051242463f,-0.08165367f,0.078404605f,-0.13566828f,-0.07969416f,-0.061671786f,-0.13423134f,-0.08536931f,0.00008282155f,-0.049163125f,-0.024533534f,-0.043691263f,-0.08095603f,0.010672941f,0.019639365f,0.0013429034f,0.0567831f,-0.0967779f,-0.019170664f,0.05618282f,0.011713989f,-0.047731653f,-0.04699047f,0.07399622f,-0.047449544f,0.03590385f,0.01322831f,-0.0039109844f,-0.07518891f,0.025434965f,-0.05743741f,0.004045694f,-0.04617371f,0.033967968f,0.0711757f,-0.009795415f,0.019290686f,-0.037020046f,-0.08178564f,-0.097990505f,-0.0004118288f,-0.09405116f,0.024903592f,0.10294186f,-0.056358308f,0.008722311f,0.003291616f,-0.018818645f,-0.09692368f,-0.06916255f,0.04351524f,0.06276146f,0.015949301f,-0.011483621f,-0.006140696f,-0.018099293f,-0.00995063f,-0.02339901f,-0.024050366f,0.07546873f,-0.028504811f,0.04963921f,-0.0076172072f,-0.02817498f,-0.060157653f,0.17303619f,0.08201302f,-0.10454881f,-0.037780367f,-0.10166054f,-0.04218956f,0.036145452f,-0.05909255f,-0.1310377f,0.042928588f,-0.074216254f,0.06736096f,-0.015906654f,0.017667266f,-0.10083889f,-0.0981416f,-0.041044645f,-0.044278126f,-0.043435194f,0.03941682f,-0.06488468f,0.05151011f,0.06728203f,-0.03063651f,-0.008622926f,-0.031948224f,-0.010434174f,0.068239905f,0.060161065f,0.051712357f,-0.026403125f,-0.02375673f,0.087121055f,-0.064310744f,-0.018927738f,0.082358375f,0.055305336f,-0.049550034f,-0.0042584497f,0.041982714f,-0.021151712f,-0.02128548f,0.026912898f,0.0756465f,0.06861557f,-0.008753623f,0.09500656f,-0.016722199f,0.045179248f,0.06433635f,-0.050378326f,0.057967953f,-0.06901692f,-0.056074627f,0.020965451f,-0.046926398f,0.035873193f,-0.04375493f,0.029856201f,-0.090699345f,-0.03156256f,-0.07428955f,0.0077958796f,-0.04266842f,0.015002862f,-0.070534125f,0.053053115f,-0.037462384f,0.050507464f,-0.0070602717f,-0.0030573292f,-0.029228054f,-0.041755334f,0.00685498f,-0.004669522f,0.009998457f,-0.1278749f,0.034629613f,0.06283661f,-0.12521815f,0.013474653f,-0.0799926f,0.026823737f,0.061804105f,0.010250122f,-0.029549334f,-0.031165676f,-0.048067544f,0.09705487f,-0.011098428f,-0.020320965f,-0.04148633f,-0.04995002f,-0.07669439f,-0.013797552f,0.06545086f,-0.11271107f,0.020970607f,0.03662167f,0.013129356f,0.06338316f,-0.022021094f,-0.0111510875f,-0.001241367f,-0.02091455f,-0.02392863f,-0.07325383f,0.0030011064f,0.0035147914f,0.018672392f,-0.006221267f,-0.08329453f,-0.02850241f,0.058879375f,-0.060902014f,-0.031377528f,0.009679124f,-0.028970672f,-0.018530989f,0.010012114f,0.016068267f,-0.007894194f,0.081875786f,-0.11858768f,-0.017476896f,-0.020515822f,-0.03861367f,0.009478026f,-0.0664049f,0.073679835f,-0.048740324f,-0.07530314f,-0.027699525f,0.008710347f,-0.036728617f,0.05363878f,0.066634215f,0.03055375f,-0.015267707f,-0.0062362403f,-0.048134644f,-0.027497273f,0.106398955f,0.07820926f,0.022096202f,0.017608056f,0.058401976f,0.054751392f,-0.06744572f,-0.08412845f,-0.00500415f,0.018115649f,-0.005963796f,-0.054437798f,-0.042635784f,0.031453032f,0.06092953f,-0.019631077f,-0.008254761f,0.017513359f,-0.031257857f,0.02838266f,-0.03598625f,0.028826846f,0.024132045f,0.059701487f,-0.01007231f,0.024742646f,0.052117206f,0.030826205f,-0.013547746f,0.062779665f,-0.027365256f,0.0034637607f,-0.1755821f,0.10319762f,0.011423345f,0.0038134188f,-0.034051597f,0.0043514566f,0.02310374f,0.022277541f}; public static final float[] CENT4 = {0.010704855f,-0.0670643f,0.021093868f,0.0065769083f,-0.012839604f,0.034078848f,0.032994095f,0.01122795f,0.009345008f,0.03385807f,0.051176053f,0.06273422f,0.03913983f,0.0322756f,-0.08576432f,-0.051884178f,0.08340187f,0.09360067f,-0.057922807f,-0.035607494f,-0.030732682f,0.11356546f,0.014922392f,0.12317613f,-0.032346856f,-0.025270259f,-0.036590725f,0.07295593f,0.10391724f,-0.089836046f,-0.07472955f,-0.03303165f,-0.02964042f,-0.07523887f,0.021307195f,-0.035367355f,-0.082960255f,-0.046930898f,-0.025115374f,0.012069998f,-0.057034306f,0.01572307f,-0.059006996f,0.077960856f,-0.030932972f,-0.15429792f,-0.12866907f,-0.021584965f,-0.029835228f,0.11236409f,-0.191379f,0.04282179f,-0.013448678f,-0.023948299f,0.0037557732f,0.16706619f,-0.05362709f,-0.07566937f,0.014789757f,-0.08677718f,-0.061279073f,0.020474998f,-0.09120756f,0.011759666f,-0.1306745f,-0.018631075f,-0.0186695f,0.11664367f,0.026543181f,-0.015455006f,-0.06260978f,-0.10498881f,0.036383174f,0.013893794f,-0.013775872f,-0.15882859f,0.09005678f,-0.018552678f,0.016706819f,0.1094073f,0.02589461f,-0.02969979f,0.029911079f,0.014708954f,0.027515892f,-0.07330767f,0.060595468f,0.04038546f,0.02411474f,0.026085714f,0.010706781f,-0.095420465f,-0.029445278f,-0.030828753f,0.023032123f,0.03859944f,0.09770636f,-0.017120324f,0.08896674f,0.011373428f,0.045112073f,-0.081473045f,-0.059673958f,-0.05634908f,-0.009259863f,0.09190117f,-0.05638226f,-0.02564247f,-0.037803873f,-0.0054110927f,0.0017864634f,-0.03682105f,-0.039525744f,-0.03141533f,0.11275712f,0.0004839851f,-0.002380149f,-0.026794951f,0.14561033f,0.02361305f,-0.013039672f,0.028168613f,-0.106352076f,-0.0012364064f,-0.047067884f,0.008281522f,-0.2103559f,0.08604091f,-0.0060125557f,0.04257702f,0.028519936f,-0.05306307f,-0.045899067f,-0.05791303f,-0.05610731f,-0.03951493f,-0.021780033f,-0.0448661f,-0.06279184f,0.07662312f,0.09883195f,-0.024417933f,-0.03946856f,-0.0005570199f,-0.09816524f,-0.041022364f,0.10536386f,0.062715694f,0.060231254f,0.0005412426f,0.06900588f,-0.08660474f,0.04899578f,0.068696626f,0.07466629f,-0.012135404f,-0.03598778f,-0.07057797f,-0.0666644f,-0.09265552f,0.0221408f,0.052452605f,-0.016891258f,-0.032897357f,0.036327623f,0.0872705f,0.06578879f,0.06010996f,-0.09918983f,0.094453976f,-0.015436117f,-0.08050265f,0.018843891f,-0.09102668f,0.059967782f,0.031252038f,0.02346507f,-0.12187432f,-0.07320767f,-0.108193256f,-0.07079308f,-0.06311027f,0.013159557f,-0.010688115f,-0.025490178f,-0.042861346f,0.02347437f,0.035059363f,0.03562216f,-0.05843507f,-0.053761233f,-0.041519757f,-0.026732028f,-0.026059309f,-0.19108123f,-0.041591886f,0.0372987f,-0.13595153f,0.05862723f,-0.16445152f,0.028238056f,0.09991044f,-0.025273887f,-0.0014634734f,-0.066377215f,0.04209974f,0.0018236845f,-0.026547626f,0.06924512f,-0.050397336f,-0.052222244f,-0.02491701f,-0.02636067f,0.06482354f,-0.059252396f,0.09828912f,0.081354156f,0.076036766f,0.042895827f,-0.01686822f,-0.08791581f,0.04457833f,-0.034038357f,-0.047805637f,-0.08799373f,-0.0638367f,0.016854111f,0.021876955f,-0.055394217f,-0.047526322f,-0.03587267f,0.06970263f,-0.04866665f,0.0011970974f,0.020647142f,-0.04771379f,-0.0010832258f,-0.020835506f,0.03243466f,0.038084455f,0.019542176f,-0.072884016f,-0.016455641f,-0.0345566f,0.0043037934f,-0.0975967f,-0.04744776f,0.14361002f,-0.06910765f,-0.021607704f,-0.092588946f,0.080095164f,-0.08491559f,0.041206434f,0.038057495f,0.023832599f,-0.06784454f,-0.08696313f,-0.028889777f,-0.067823686f,0.09943456f,0.009428746f,0.0066630538f,-0.0024553319f,-0.033138607f,0.0723644f,-0.045171943f,-0.085606396f,-0.10135047f,-0.037499584f,0.020281449f,0.055990275f,-0.01740628f,-0.019308072f,0.038056754f,0.0039541004f,-0.008331595f,0.048621465f,0.0032122715f,-0.030135294f,-0.039464857f,0.04360947f,0.048852123f,0.06722341f,0.020990908f,0.021438524f,-0.045839995f,0.009275418f,0.007442289f,0.12134915f,0.00068679365f,-0.07127784f,-0.13177507f,0.11126146f,0.039484635f,-0.036843125f,0.0511725f,-0.0598986f,0.045724258f,0.036662277f}; public static final float[] CENT5 = {0.022839382f,-0.03359495f,-0.0066703307f,0.022163201f,-0.02408755f,0.06442031f,0.0045688753f,0.010488761f,-0.00044476232f,0.05044493f,0.033675246f,0.049774945f,0.008621748f,0.042746447f,-0.07700397f,-0.0451958f,0.07938519f,0.08385187f,-0.061964415f,-0.035912927f,-0.003467658f,0.08524248f,0.014303198f,0.11600093f,-0.04287127f,-0.022873333f,-0.06825271f,0.08194304f,0.04793892f,-0.0527855f,-0.07581738f,-0.055221103f,-0.012851007f,-0.07099417f,0.014147388f,-0.037063416f,-0.04454149f,-0.0095205465f,-0.023536751f,0.014266046f,-0.014146757f,0.0072805923f,-0.020179052f,0.059608992f,-0.03419748f,-0.15566263f,-0.1015401f,-0.035094157f,-0.029369399f,0.08147777f,-0.21588254f,0.029588284f,-0.019559735f,-0.047735233f,0.016593328f,0.12735985f,-0.05213097f,-0.06873328f,0.014142882f,-0.038571272f,-0.042302243f,0.03880131f,-0.06659886f,-0.0073683104f,-0.06918143f,-0.025084415f,-0.038701344f,0.11273736f,0.012830569f,-0.033017233f,-0.031226004f,-0.09289884f,0.032232117f,-0.017454978f,-0.039876193f,-0.13845666f,0.06345957f,-0.03838888f,-0.00023332387f,0.088637434f,0.015690532f,-0.024366027f,0.04522402f,0.009410225f,0.04446052f,-0.0868467f,0.033179335f,0.063996814f,0.051150206f,0.014990753f,-0.0046050698f,-0.08338682f,-0.0021660307f,0.016665481f,0.042485926f,0.041599285f,0.099288955f,-0.027209742f,0.06921274f,0.0077553103f,0.052449062f,-0.07543572f,-0.056188744f,-0.0680761f,0.00052976294f,0.08570987f,-0.057802778f,-0.015359663f,-0.037239462f,0.0069555137f,-0.013637077f,-0.012860835f,0.007908063f,-0.04955145f,0.10830962f,0.028258102f,-0.011194532f,-0.015489036f,0.11507248f,-0.009770631f,0.009489947f,0.03191821f,-0.09354445f,-0.0064501795f,-0.015091094f,0.025286112f,-0.16656414f,0.10895973f,-0.012308648f,0.019734804f,0.015983343f,-0.05980111f,-0.005718719f,-0.03080548f,-0.039776705f,-0.039912112f,-0.005679874f,-0.05862779f,-0.027916564f,0.074065804f,0.07143131f,-0.0075149457f,-0.041759312f,0.0010628969f,-0.07478614f,-0.059406433f,0.085929275f,0.061952654f,0.04479289f,-0.004131101f,0.06671996f,-0.097041205f,0.03594322f,0.022973606f,0.070404366f,-0.050781988f,-0.048104316f,-0.04680078f,-0.05829367f,-0.101707675f,0.011345205f,0.042796608f,0.02355706f,-0.020254193f,-0.002402296f,0.08201183f,0.040505495f,0.034578394f,-0.072379276f,0.09444956f,-0.02363955f,-0.052523956f,-0.014398527f,-0.100167096f,0.017307185f,0.048116777f,0.021709686f,-0.12685254f,-0.07343213f,-0.078357436f,-0.08621896f,-0.06159465f,0.008424792f,-0.0012125799f,-0.027283844f,-0.0044014268f,0.008536106f,0.009195816f,0.018210959f,-0.017518131f,-0.067163736f,-0.046488773f,-0.01052375f,-0.03658865f,-0.1409519f,-0.04512672f,0.024262238f,-0.12324837f,0.051286887f,-0.113918826f,0.049287096f,0.09406128f,-0.056964476f,0.014383443f,-0.05087614f,0.07044303f,-0.034645855f,-0.025743358f,0.036189727f,-0.03501847f,-0.063039966f,-0.012151702f,-0.04026078f,0.07979788f,-0.08106726f,0.057337496f,0.07416963f,0.07374935f,0.025263727f,-0.02613804f,-0.091119975f,0.05832003f,-0.07452149f,-0.04719072f,-0.059752468f,-0.048668176f,0.02071755f,0.03147056f,-0.04182357f,-0.043808248f,-0.031065056f,0.07393967f,-0.013676625f,0.013433168f,-0.0050358544f,-0.06556628f,0.015460819f,-0.008181974f,0.05046547f,0.012215743f,-0.001954195f,-0.032166645f,0.010198491f,-0.05324453f,0.016410602f,-0.07528097f,-0.08296055f,0.17857912f,-0.06782797f,-0.037963327f,-0.06631536f,0.0805591f,-0.04899296f,0.042215392f,0.043943767f,-0.0076278094f,-0.07121953f,-0.06301679f,-0.009259667f,-0.075178064f,0.10176182f,-0.0027576364f,-0.0014918603f,-0.03900084f,-0.013977311f,0.06704828f,-0.044342674f,-0.056591447f,-0.08333293f,-0.046077147f,-0.00720852f,0.07769522f,-0.033034276f,0.010620653f,0.051043287f,0.024553757f,-0.016105993f,0.0055337967f,0.0018331109f,-0.009095089f,-0.03252504f,0.053799663f,0.022690354f,0.04455544f,0.04527506f,0.025460564f,-0.019688746f,0.044049434f,0.02170557f,0.11448796f,-0.0138435885f,-0.011530503f,-0.093710974f,0.078288384f,0.024630895f,-0.044313632f,-0.008687107f,-0.05694564f,0.020266589f,0.04716828f}; + public static final float[] CENT6 = { 0.024620242f, 0.004581881f, 0.018166043f, 0.06227891f, -0.061162405f, 0.013741626f, 0.044876844f, -0.036030397f, 0.053827956f, 0.027380435f, -0.0471392f, -0.04204845f, -0.009506275f, 0.020029308f, -0.08312132f, 0.029896686f, 0.04067525f, 0.09245238f, -0.013074374f, -0.04913029f, -0.017801015f, 0.037382003f, -0.0060805012f, 0.05185311f, 0.017305424f, -0.024362259f, -0.061483163f, 0.06753725f, 0.015710967f, -0.056004137f, -0.01779394f, -0.017728835f, -0.02974689f, -0.030583208f, 0.026169317f, -0.02976448f, -0.009048237f, -0.015299143f, 0.047370635f, 0.023807826f, 0.0387492f, 0.020267045f, 0.04886563f, 0.03723455f, -0.03244521f, -0.0764756f, -0.07415507f, 0.0011962979f, 0.0055027567f, 0.048585437f, -0.038443398f, 0.019955175f, -0.047729407f, -0.052725423f, -0.006969106f, 0.07254954f, -0.047007643f, -0.08467738f, 0.04061473f, -0.062533975f, -0.021762446f, 0.025954228f, -0.08872862f, -0.043742858f, -0.057866864f, 0.0041118483f, -0.043268688f, 0.11072746f, -0.01805699f, 0.030518847f, -0.023909934f, -0.026104925f, 0.076754354f, 0.004053712f, -0.06995296f, -0.09154115f, 0.075504944f, 0.040173028f, 0.026455829f, 0.082629554f, 0.016627932f, -0.0051428163f, 0.024459042f, 0.015129218f, 0.018602457f, -0.08419634f, -0.057839863f, 0.08033778f, -0.0013325086f, 0.055078622f, 0.042367503f, -0.07068303f, -0.06828275f, -0.058070116f, 0.0053460547f, -0.019511431f, 0.045705497f, -0.000801574f, 0.03786749f, 0.005702353f, -0.0050054016f, -0.063415f, -0.023768667f, 0.018773384f, -0.014672446f, -0.010386008f, -0.045537412f, -0.010747025f, 0.036203172f, -0.04547878f, -0.0311748f, -0.004422967f, -0.016213521f, -0.007252287f, 0.1038726f, 0.032724522f, -0.001620814f, -0.042247213f, 0.11742329f, 0.023391817f, -0.06930579f, 0.014805584f, -0.07088485f, 0.04336114f, -0.0023193853f, -0.0022214814f, -0.092611134f, 0.032726016f, -0.012243834f, 0.041499358f, -0.014656232f, -0.05408376f, -0.054427855f, -0.014049016f, 0.00060254347f, -0.039658014f, 0.051309865f, -0.012923217f, -0.03065576f, 0.044731896f, 0.035845127f, -0.051087443f, -0.017503005f, 0.039425496f, 0.027843334f, -0.014077761f, 0.03436604f, -0.03947595f, -0.01995382f, -0.016716857f, 0.062736176f, -0.0005059514f, -0.043363214f, 0.037263878f, -0.005644477f, -0.043776855f, -0.056233525f, -0.04498145f, -0.06936946f, -0.050500743f, -0.005256612f, 0.045315918f, -0.015799314f, 0.018809715f, 0.018617317f, -0.011042218f, 0.022425804f, -0.018904038f, -0.022462932f, -0.020504456f, -0.058547776f, -0.020229293f, -0.015738696f, -0.07861845f, 0.03022755f, -0.02165184f, 0.059349928f, -0.10356587f, -0.06959571f, -0.020027049f, -0.09769989f, -0.052323867f, 0.0047713546f, -0.02914543f, 0.0017568229f, -0.019724976f, -0.034156065f, 0.026697673f, 0.057145983f, 0.020453366f, -0.041135155f, -0.03343804f, 0.012924303f, -0.038549993f, -0.09130385f, 0.009884244f, -0.0091959005f, -0.0637349f, -0.017700942f, -0.120334014f, 0.0077728014f, 0.06459247f, -0.0014964747f, 0.0006911421f, -0.045633074f, 0.020923134f, -0.006582069f, -0.038680766f, -0.007783623f, -0.026034914f, -0.038614545f, 0.01880521f, -0.038663223f, 0.071993805f, -0.06256164f, 0.020326348f, 0.0779981f, 0.053440906f, -0.06678991f, 0.00855628f, -0.033688184f, 0.018025815f, -0.02576774f, -0.018506877f, -0.020364707f, -0.028946634f, 0.0011659188f, 0.0054863994f, 0.0074107675f, -0.027979849f, -0.02635988f, -0.015683394f, 0.025166072f, -0.017781012f, 0.032416053f, -0.036917035f, -0.0060269516f, -0.017210985f, 0.053411476f, 0.022020396f, 0.016008614f, -0.014266844f, -0.0011225356f, -0.060189664f, 0.013540677f, -0.02270245f, -0.013724378f, 0.099358544f, -0.0068594962f, -0.06162434f, -0.0152784735f, 0.045405116f, -0.011245036f, 0.049158163f, 0.025907485f, -0.042243924f, 0.0025546027f, -0.012954748f, -0.07886514f, -0.034392856f, 0.03659385f, -0.020002384f, -0.017308095f, 0.03527795f, 0.01813971f, 0.100328945f, -0.026122957f, -0.05829907f, -0.11123179f, -0.007846556f, 0.027480204f, 0.061267905f, 0.015509253f, 0.023971874f, 0.04261052f, -0.031556986f, -0.034943216f, -0.054021634f, -0.024998529f, -0.014506589f, -0.026578754f, 0.024031604f, 0.018464245f, 0.06997085f, 0.004949784f, 0.017799584f, -0.030472776f, 0.0007584721f, 0.04867968f, 0.077957116f, -0.06591868f, 0.011426647f, -0.08557331f, 0.035616335f, -0.016662348f, -0.032495733f, 0.035773657f, -0.031418074f, 0.021861078f, 0.004955305f }; + public static final float[] CENT7 = {0.019262243f, -0.047450032f, -0.033709772f, 0.026448427f, 0.022781372f, -0.026656963f, -0.03764739f, 0.0016911825f, 0.07212604f, 0.07736319f, -0.050524678f, 0.010541563f, 0.07272678f, -0.015600134f, 0.005611137f, -0.015378599f, 0.03459902f, 0.03301585f, 0.07012148f, 0.015784934f, -0.005498886f, -0.06966725f, -0.00945395f, -0.047787137f, 0.047511913f, 0.012095698f, -0.048950125f, 0.08338462f, 0.060457584f, 0.007559317f, -0.06302671f, -0.08693271f, -0.03595875f, -0.10539288f, -0.0054129143f, -0.057737846f, 0.044270832f, 0.023376465f, -0.06834221f, -0.07169794f, -0.04968064f, -0.07866527f, -0.03292458f, 0.040082578f, -0.003292225f, 0.014434814f, -0.06912676f, -0.053517375f, 0.029180173f, 0.08435454f, -0.09744828f, 0.1092874f, 0.0785952f, -0.067124546f, -0.004529882f, 0.06547914f, -0.0840991f, -0.10905725f, -0.09637677f, -0.063065425f, -0.072178595f, -0.004865152f, -0.0917997f, -0.08916219f, -0.014386212f, -0.07682885f, -0.031699285f, 0.04970296f, 0.0968594f, 0.066641346f, 0.07686841f, -0.13810222f, 0.08980363f, -0.018776223f, 0.005949656f, 0.011389979f, 0.032351527f, -0.07210569f, 0.04713864f, 0.018966958f, -0.05511001f, -0.100171976f, -0.02048888f, -0.04821947f, -0.00063394615f, -0.08647834f, -0.048097964f, 0.084778965f, 0.08190664f, -0.0024091932f, -0.042036835f, -0.09182174f, -0.0136754075f, -0.059593555f, -0.07548693f, 0.08486713f, 0.026695251f, -0.0038216203f, 0.027196249f, -0.01344211f, -0.004624967f, -0.08925487f, -0.004985668f, -0.049914327f, 0.025742142f, -0.06252232f, 0.049877875f, -0.10391801f, -0.041932847f, -0.008888527f, -0.041182905f, 0.038068984f, 0.0140443025f, -0.027253406f, 0.06668995f, -0.0436371f, 0.0071729026f, -0.058220193f, 0.090868205f, 0.027124476f, -0.046288386f, 0.004169888f, 0.014910486f, -0.044218842f, 0.03950105f, -0.087479346f, -0.14780794f, 0.063308276f, 0.040863037f, 0.10589656f, -0.017971464f, -0.034241572f, -0.0760255f, -0.044791326f, -0.061286077f, -0.056376703f, -0.05403335f, 0.03857309f, -0.06698919f, 0.18217298f, 0.002434342f, 0.03374001f, 0.037234835f, -0.09519266f, 0.018446552f, 0.033890337f, 0.020267062f, 0.055544455f, -0.019579288f, 0.061070196f, 0.06954335f, -0.054193497f, 0.032756664f, 0.09429536f, -0.026101572f, -0.009754596f, 0.021697186f, -0.03163486f, -0.0853588f, -0.033341583f, -0.015221207f, -0.0020288539f, 0.09309783f, -0.037338823f, -0.029504564f, 0.024548283f, 0.020117583f, 0.06914202f, -0.03743631f, 0.029088903f, -0.1340332f, -0.065506265f, 0.053863525f, -0.09228968f, -0.055058822f, -0.01167693f, 0.056795474f, -0.0500128f, -0.055351432f, 0.026549729f, -0.08948263f, -0.027862549f, 0.036409732f, 0.040634155f, -0.052502666f, -0.07473755f, 0.16722785f, 0.029117372f, 0.10732015f, -0.11223418f, -0.033195212f, -0.02668536f, 0.01914526f, -0.011237251f, -0.09547749f, 0.00007346825f, 0.030704074f, -0.0073185675f, 0.06939584f, -0.092531554f, 0.02482266f, 0.040196452f, -0.0022337171f, -0.037009805f, 0.016749064f, 0.009459107f, 0.018783852f, 0.07128387f, 0.018052561f, -0.033848517f, -0.06172378f, -0.020982742f, 0.006961469f, -0.0028618707f, -0.06279726f, 0.04613975f, 0.026834982f, -0.033189844f, 0.032854717f, -0.08542322f, -0.019851508f, 0.09112719f, -0.044110898f, -0.030613158f, -0.051780876f, -0.010538737f, -0.0738774f, 0.065354526f, -0.0077989367f, -0.062425967f, -0.038196705f, 0.011241771f, -0.07971855f, 0.05487357f, -0.038697418f, 0.023068322f, 0.03980029f, -0.05109095f, 0.069692545f, 0.04610104f, 0.043640137f, 0.045593332f, 0.04156127f, -0.056463808f, -0.007460135f, -0.036827937f, 0.0043883147f, 0.0857633f, -0.036361977f, -0.066014044f, -0.012620149f, 0.039482117f, -0.008902762f, 0.0013648139f, 0.08955101f, 0.045312244f, 0.0170168f, -0.039230064f, -0.015186451f, -0.031099673f, 0.043060303f, 0.0024148447f, -0.015193232f, -0.001280608f, 0.036471333f, 0.032366153f, -0.056205183f, -0.04759075f, 0.0045138467f, 0.0026965672f, -0.01756343f, 0.027643275f, -0.033978496f, 0.0064075612f, 0.102878146f, -0.0074215997f, -0.04704454f, -0.08324856f, -0.03443061f, 0.0031845658f, -0.024608895f, 0.00010059498f, 0.0049452605f, 0.035672504f, 0.07282305f, -0.026274787f, -0.024568629f, 0.04938168f, -0.0103037795f, 0.114112005f, 0.003952309f, -0.06333832f, -0.13478372f, 0.123098865f, 0.014156765f, -0.10642949f, -0.04868458f, -0.042651143f, -0.012438739f, 0.061533045f}; + public static final float[] CENT8 = {0.027360043f, -0.025965704f, -0.025352705f, 0.012733885f, -0.027053047f, 0.061531954f, -0.011780665f, 0.0127583835f, 0.0009362112f, 0.057690624f, 0.025988521f, 0.06306871f, 0.007991002f, 0.04337099f, -0.06917777f, -0.064493686f, 0.093104266f, 0.087003574f, -0.053339563f, -0.033822943f, 0.0023489126f, 0.09644651f, 0.04148517f, 0.11359376f, -0.05368193f, -0.0071985535f, -0.06778475f, 0.08371435f, 0.057090394f, -0.042397533f, -0.09618619f, -0.049929246f, -0.019565824f, -0.084293835f, 0.02212012f, -0.036760654f, -0.038540788f, -0.019786943f, -0.054367147f, 0.02092793f, -0.016923893f, -0.01011285f, -0.03233062f, 0.055574656f, -0.01573994f, -0.14518075f, -0.09920954f, -0.03591306f, -0.018302502f, 0.0826956f, -0.24027744f, 0.048662644f, 0.004573726f, -0.032091014f, 0.023725713f, 0.12245056f, -0.06867782f, -0.063255265f, 0.010222337f, -0.02822147f, -0.03569107f, 0.042172413f, -0.07549714f, -0.007572351f, -0.06605758f, -0.039712235f, -0.030573737f, 0.11540968f, 0.025995197f, -0.02159619f, -0.03037651f, -0.10221809f, 0.02639844f, -0.00020311975f, -0.03825955f, -0.1370221f, 0.063111946f, -0.043894224f, -0.0045068674f, 0.09466753f, 0.015182107f, -0.04098556f, 0.04705408f, 0.0076449453f, 0.028924167f, -0.09507061f, 0.030360699f, 0.058965035f, 0.058024704f, 0.0030948538f, -0.014743276f, -0.0752356f, 0.0026429319f, 0.022314927f, 0.043495156f, 0.050213635f, 0.111053474f, -0.018567799f, 0.0696716f, 0.014296255f, 0.06000532f, -0.080688216f, -0.049196992f, -0.08411225f, -0.0088486485f, 0.09350153f, -0.06459696f, -0.03472063f, -0.042360056f, 0.0042306827f, -0.017478535f, -0.0023657861f, 0.01936531f, -0.072368465f, 0.10383953f, 0.01272139f, -0.00576753f, -0.006625162f, 0.09363834f, -0.023628486f, 0.008259529f, 0.032499235f, -0.10720832f, -0.016256543f, -0.007052035f, 0.022646869f, -0.16391185f, 0.10893212f, -0.010604683f, 0.018671777f, 0.007998912f, -0.07285046f, 0.0028506846f, -0.027823832f, -0.040509705f, -0.040785853f, -0.005613743f, -0.06082498f, -0.01188811f, 0.09090232f, 0.06924448f, -0.009943456f, -0.033468496f, -0.009826051f, -0.053927626f, -0.07307058f, 0.10978922f, 0.05489834f, 0.057211448f, -0.018824786f, 0.06313307f, -0.092232145f, 0.061492153f, 0.016006058f, 0.06995724f, -0.053459994f, -0.059194297f, -0.06799303f, -0.051310703f, -0.106857225f, 0.005883538f, 0.032994427f, 0.039161604f, -0.019086517f, 0.0009888109f, 0.06846376f, 0.03432977f, 0.022550598f, -0.049754396f, 0.10987148f, -0.03394277f, -0.059249617f, 0.0014984206f, -0.12347709f, -0.009259841f, 0.06661322f, 0.017795173f, -0.13278453f, -0.07638226f, -0.09236983f, -0.074975885f, -0.06257708f, -0.003421441f, -0.008867511f, -0.03219554f, -0.00662612f, 0.009359607f, 0.018035803f, 0.014240515f, -0.017690897f, -0.065586425f, -0.033609137f, -0.0016639141f, -0.03481626f, -0.13954565f, -0.03457412f, 0.025281567f, -0.12963872f, 0.060053088f, -0.11543707f, 0.052813236f, 0.08809334f, -0.06395992f, 0.011370646f, -0.041527756f, 0.07895797f, -0.060237076f, -0.015135019f, 0.014526161f, -0.020744007f, -0.07332272f, 0.004984537f, -0.043703508f, 0.09075253f, -0.083114065f, 0.055535305f, 0.07899801f, 0.06152374f, 0.02024601f, -0.043273665f, -0.09655056f, 0.07368096f, -0.10885059f, -0.05464901f, -0.055298105f, -0.036376476f, 0.028940732f, 0.025830904f, -0.049135238f, -0.060407348f, -0.027741728f, 0.078037895f, -0.00067610614f, 0.02440834f, 0.004880942f, -0.0697656f, 0.031432074f, -0.021118985f, 0.048426885f, 0.020539016f, 0.0003811315f, -0.016142707f, 0.03173171f, -0.051850002f, 0.019654687f, -0.06882275f, -0.09353903f, 0.1784982f, -0.063227795f, -0.033999432f, -0.057996336f, 0.083436504f, -0.04332025f, 0.040805798f, 0.04102325f, 0.0011754916f, -0.08060662f, -0.06684013f, 0.010560134f, -0.099073574f, 0.08933483f, 0.003416465f, -0.00024122295f, -0.024116768f, -0.029420648f, 0.06258559f, -0.07625294f, -0.047339268f, -0.05775082f, -0.03365024f, -0.004045004f, 0.092349656f, -0.033730973f, 0.014067938f, 0.06454564f, 0.024935625f, -0.022538269f, 0.001468328f, 0.002396063f, -0.0128753735f, -0.031925377f, 0.051773984f, 0.018841544f, 0.04191543f, 0.054949857f, 0.03003431f, -0.032000683f, 0.054949045f, 0.014582391f, 0.12086886f, 0.0038320986f, -0.0071083675f, -0.09553304f, 0.074392214f, 0.029396396f, -0.06745949f, -0.01507579f, -0.05421302f, 0.016768038f, 0.033838905f}; + public static final float[] CENT9 = {0.013885498f, -0.08883667f, 0.02969551f, 0.0047683716f, -0.013946533f, -0.09719849f, 0.028320312f, 0.054382324f, 0.02986145f, -0.0076293945f, -0.060668945f, 0.15856934f, 0.08668518f, -0.14053345f, -0.020935059f, -0.06097412f, 0.038082123f, 0.08602905f, 0.0075531006f, -0.014930725f, 0.03173828f, -0.0826416f, 0.04399681f, 0.018472672f, 0.0501709f, -0.08050537f, 0.098602295f, 0.05491638f, 0.1619873f, -0.01663971f, 0.010910034f, -0.11820984f, -0.15222168f, 0.10585022f, 0.03942871f, 0.04358673f, -0.05142212f, 0.039775252f, 0.050476074f, -0.10160065f, -0.037715912f, -0.064971924f, -0.027404785f, 0.06896973f, -0.053970337f, -0.12936401f, -0.0776062f, -0.09890747f, 0.086135864f, 0.25219727f, -0.0068740845f, 0.15872955f, 0.034118652f, 0.0072631836f, -0.0057373047f, 0.043121338f, -0.14389038f, -0.19690704f, 0.040252686f, -0.08874512f, -0.083984375f, 0.03700447f, -0.003479004f, -0.015625f, 0.07363129f, -0.04437256f, -0.016647339f, 0.018798828f, 0.07019043f, -0.007080078f, -0.03100586f, -0.005432129f, 0.1179657f, 0.052261353f, 0.04396057f, -0.07382202f, -0.06530762f, -0.031352997f, 0.12116051f, 0.056035995f, 0.034957886f, -0.14463043f, -0.09924698f, -0.000869751f, -0.16131592f, -0.041397095f, -0.0021972656f, 0.007873535f, 0.063201904f, 0.072265625f, 0.013793945f, -0.028038025f, -0.117012024f, -0.06665039f, 0.0058288574f, -0.04724121f, 0.1444397f, -0.009857178f, 0.10101318f, 0.04462433f, -0.04498291f, 0.007507324f, 0.11695099f, -0.031181335f, 0.07330322f, 0.02494812f, 0.15075684f, -0.23913574f, 0.004211426f, -0.071777344f, 0.007293701f, 0.07975006f, 0.08457184f, 0.012817383f, 0.09764099f, 0.017425537f, 0.13725281f, 0.054748535f, 0.13565063f, 0.009414673f, -0.12731934f, -0.023269653f, -0.058288574f, 0.008468628f, 0.10852051f, -0.041625977f, -0.061401367f, -0.0313797f, 0.050857544f, 0.016296387f, 0.06796265f, 0.09118652f, 0.042144775f, -0.017700195f, -0.07501984f, -0.1899414f, -0.09338379f, 0.04272461f, -0.17037964f, 0.12792969f, -0.0020751953f, -0.047454834f, 0.07700539f, -0.07696533f, 0.06103134f, -0.001373291f, 0.10974121f, 0.13071442f, 0.02961731f, 0.061431885f, 0.0745163f, -0.055862427f, 0.078048706f, 0.0098724365f, 0.09902954f, -0.048858643f, 0.012329102f, -0.08453369f, 0.07024193f, -0.05886841f, 0.05519104f, 0.06726074f, -0.07366943f, 0.0014038086f, -0.09075928f, -0.048309326f, 0.032409668f, 0.036598206f, 0.0033111572f, 0.0982666f, -0.042663574f, -0.026245117f, 0.040618896f, -0.10574341f, -0.10557556f, -0.01687622f, 0.0028839111f, -0.0014038086f, -0.019266129f, -0.016235352f, -0.123313904f, 0.006652832f, 0.1237793f, -0.052505493f, -0.003036499f, -0.17700195f, 0.054504395f, -0.034301758f, -0.024353027f, -0.07888794f, -0.053649902f, 0.068237305f, -0.0046367645f, 0.0059814453f, -0.09329224f, 0.1590271f, 0.029922485f, -0.15948486f, 0.032714844f, -0.19274902f, 0.08618927f, 0.11917114f, 0.07891846f, -0.07272339f, 0.016784668f, -0.08206177f, 0.13716888f, 0.052139282f, 0.028396606f, 0.10827637f, 0.0135650635f, -0.031539917f, -0.07319641f, 0.11047363f, 0.026321411f, 0.08778381f, -0.110061646f, 0.039154053f, 0.011871338f, 0.012844086f, 0.021209717f, 0.15563965f, -0.07659912f, -0.11977768f, -0.07562256f, -0.02078247f, -0.007965088f, 0.04714966f, -0.09592438f, -0.030273438f, 0.027023315f, -0.030029297f, -0.13244629f, 0.03944397f, 0.021240234f, 0.003753662f, -0.03579712f, 0.008575439f, 0.09628296f, -0.029388428f, 0.0026483536f, -0.017189026f, -0.00024414062f, -0.040779114f, -0.016098022f, -0.07110596f, 0.030883789f, 0.14639282f, -0.014694214f, 0.030456543f, -0.013870239f, -0.025238037f, -0.047859192f, -0.18838501f, -0.030675888f, -0.0119018555f, 0.047798157f, -0.05104065f, -0.09328461f, -0.039978027f, 0.019805908f, 0.02037239f, 0.11383057f, 0.014328003f, 0.09942627f, 0.11779785f, 0.010307312f, -0.102005005f, 0.053588867f, 0.15567017f, 0.009269714f, -0.011489868f, -0.12113953f, -0.005821228f, 0.039216995f, -0.019973755f, -0.03152466f, 0.049804688f, -0.14051819f, 0.01965332f, 0.0101623535f, 0.0657959f, 0.025378227f, 0.02911377f, 0.07123566f, 0.021392822f, -0.055236816f, 0.0076856613f, -0.025644302f, 0.15881348f, 0.05105591f, -0.0748291f, -0.10303497f, 0.12768555f, 0.045618057f, -0.05141735f, -0.06373024f, 0.059906006f, -0.0077819824f, -0.0068359375f}; + public static final float[] CENT10 = {0.021668974f, -0.05004165f, 0.019741794f, 0.00779765f, -0.0045129503f, 0.048191205f, 0.014692515f, 0.006606069f, -0.0033101244f, 0.041532584f, 0.02470044f, 0.07252844f, 0.015758036f, 0.04205027f, -0.079197414f, -0.042632714f, 0.08540655f, 0.07445798f, -0.049537383f, -0.028350769f, -0.013660554f, 0.08695323f, 0.023848312f, 0.10665727f, -0.02174582f, -0.018023994f, -0.044742227f, 0.06374246f, 0.070899405f, -0.046449225f, -0.08162557f, -0.034712907f, -0.018790565f, -0.09707169f, 0.022725675f, -0.037166525f, -0.052456122f, -0.015289678f, -0.04096576f, 0.01109224f, -0.050141178f, -0.002744066f, -0.05399635f, 0.062814966f, -0.02097499f, -0.13801916f, -0.11119716f, -0.02758061f, -0.0399705f, 0.10142966f, -0.19322528f, 0.05058232f, -0.0037690538f, -0.01886666f, 0.013786521f, 0.13671595f, -0.047292467f, -0.06073177f, 0.004180926f, -0.07223272f, -0.05182167f, 0.015976341f, -0.07038088f, -0.013065565f, -0.095240474f, -0.027715039f, -0.034623716f, 0.12960277f, 0.028853552f, -0.0056765354f, -0.03948371f, -0.1134972f, 0.04368578f, 0.009686529f, -0.029474068f, -0.13285163f, 0.04997057f, -0.02444474f, 0.0041827695f, 0.10219008f, 0.01684378f, -0.038368747f, 0.04208769f, 0.009445968f, 0.020245835f, -0.0916629f, 0.040262155f, 0.06143533f, 0.03863632f, 0.03248864f, -0.005987152f, -0.1027879f, -0.0081914645f, -0.033888083f, 0.020213421f, 0.038411945f, 0.09727192f, -0.015616164f, 0.06734591f, 0.0010645613f, 0.038189553f, -0.08257143f, -0.057132956f, -0.05624801f, 0.000102296275f, 0.07539177f, -0.040842216f, -0.039026175f, -0.029958935f, 0.005116133f, 0.01074006f, -0.017258294f, -0.016733738f, -0.036688793f, 0.09737252f, 0.0030457648f, -0.00039191157f, -0.039071396f, 0.12752554f, -0.0040631946f, -0.00063009234f, 0.007159849f, -0.09007748f, 0.014584881f, -0.044108965f, 0.008260139f, -0.17035489f, 0.09607732f, 0.007439814f, 0.03018414f, 0.017692259f, -0.0615429f, -0.010109736f, -0.059450626f, -0.055562712f, -0.028130028f, -0.0026889294f, -0.04103278f, -0.05063029f, 0.07658045f, 0.06639816f, 0.00861173f, -0.03205479f, -0.024214717f, -0.08616781f, -0.03585523f, 0.112857334f, 0.066518694f, 0.057702452f, 0.001476881f, 0.06312412f, -0.08385868f, 0.032948654f, 0.037338298f, 0.07373857f, -0.030105365f, -0.037533145f, -0.0649149f, -0.04637142f, -0.118010126f, 0.011419258f, 0.039121598f, 0.0065562706f, -0.028015444f, 0.009521546f, 0.09075341f, 0.056968864f, 0.0628409f, -0.075602114f, 0.101636216f, -0.01655033f, -0.05742561f, 0.024127297f, -0.10444707f, 0.02890621f, 0.049818035f, 0.022248536f, -0.11327497f, -0.06400471f, -0.07810988f, -0.067137435f, -0.07273801f, 0.013768322f, -0.003033656f, -0.035832126f, -0.028019575f, 0.03131606f, 0.03464033f, 0.038618255f, -0.03970862f, -0.04245806f, -0.048089456f, -0.018777419f, -0.017472101f, -0.14950939f, -0.03964057f, 0.0451959f, -0.11354075f, 0.06794856f, -0.134985f, 0.04291864f, 0.10673419f, -0.028469443f, 0.00047126596f, -0.07188624f, 0.038152635f, -0.020412404f, -0.010131386f, 0.044825327f, -0.046960436f, -0.06504542f, -0.022133589f, -0.029044451f, 0.057349183f, -0.06399978f, 0.059432473f, 0.07093326f, 0.05611221f, 0.024246046f, -0.02335101f, -0.07997312f, 0.056047436f, -0.06054177f, -0.032051027f, -0.08614712f, -0.055415645f, 0.017505094f, 0.032665998f, -0.06492337f, -0.073431075f, -0.04337855f, 0.096066914f, -0.044003483f, 0.0074485047f, 0.017381722f, -0.04958468f, 0.0007566396f, -0.013538834f, 0.04159147f, 0.022069499f, 0.005504567f, -0.049329437f, 0.0069431355f, -0.039654616f, 0.011385777f, -0.078975014f, -0.060576964f, 0.14154674f, -0.060085636f, -0.03949624f, -0.06907309f, 0.06912115f, -0.06961753f, 0.03659974f, 0.041451983f, 0.031915702f, -0.075638935f, -0.0715621f, -0.027965695f, -0.07204257f, 0.07929139f, 0.016691683f, 0.013302954f, -0.022770423f, -0.018689726f, 0.046218984f, -0.052770782f, -0.0572773f, -0.09035173f, -0.027980303f, 0.01377566f, 0.046130143f, -0.029309189f, -0.015685974f, 0.057679184f, 0.009008239f, -0.022164475f, 0.024052024f, -0.018444706f, -0.022262001f, -0.03715153f, 0.038390506f, 0.023577359f, 0.06481538f, 0.03209804f, 0.03235033f, -0.01995189f, 0.040795587f, 0.021792782f, 0.12365544f, -0.012186818f, -0.050684974f, -0.12966827f, 0.102917016f, 0.040965144f, -0.02405007f, -0.0008032137f, -0.041035242f, 0.029524898f, 0.05550137f}; + public static final float[] CENT11 = {-0.060195923f, -0.014125824f, 0.09173584f, 0.061401367f, -0.011138916f, -0.018676758f, 0.021766663f, -0.027526855f, -0.033447266f, 0.07458496f, -0.09448242f, 0.18829346f, 0.06607056f, 0.049072266f, -0.09396362f, 0.05908203f, 0.072509766f, -0.06945801f, -0.06225586f, -0.021938324f, 0.029907227f, 0.1217041f, -0.02722168f, 0.115112305f, 0.0020141602f, -0.0692749f, -0.03552246f, 0.11920166f, 0.077560425f, 0.06604004f, -0.0727005f, -0.1533966f, -0.023925781f, -0.0036621094f, 0.15039062f, -0.0006713867f, -0.07771301f, -0.10595703f, 0.024124146f, 0.050689697f, -0.03173828f, 0.011047363f, -0.05783081f, -0.021957397f, 0.068481445f, -0.18798828f, -0.023986816f, -0.07788086f, 0.023376465f, 0.18408203f, -0.049194336f, 0.05444336f, 0.04522705f, 0.004058838f, -0.11352539f, -0.088970184f, -0.04815674f, -0.0018005371f, -0.028160095f, -0.10650635f, -0.19213867f, -0.004043579f, -0.12585449f, -0.06628418f, -0.048461914f, -0.022705078f, 0.083099365f, 0.09851074f, 0.06555176f, 0.12062073f, -0.15393066f, -0.0357666f, 0.08239746f, 0.08111572f, 0.07891846f, -0.111816406f, 0.064819336f, -0.015991211f, 0.032791138f, 0.17797852f, -0.027862549f, -0.014404297f, -0.09640503f, -0.101135254f, -0.03261566f, -0.022644043f, 0.09375f, -0.13171387f, 0.08117676f, -0.013824463f, -0.010040283f, -0.031799316f, -0.020080566f, -0.21875f, 0.020996094f, -0.12799072f, 0.20452881f, -0.018371582f, 0.11932373f, 0.03781128f, 0.062469482f, -0.0357666f, -0.048034668f, -0.018798828f, -0.008056641f, 0.09265137f, 0.027801514f, -0.13928223f, 0.041748047f, 0.0048828125f, -0.038330078f, 0.14208984f, -0.065460205f, 0.09637451f, 0.17028809f, 0.05569458f, 0.047729492f, 0.0012207031f, 0.119018555f, -0.011581421f, -0.013000488f, 0.017944336f, -0.14697266f, 0.15740967f, -0.15435791f, -0.043151855f, -0.13140869f, -0.05444336f, -0.1159668f, 0.07577515f, 0.13439941f, -0.020446777f, 0.05908203f, -0.06359863f, -0.018188477f, -0.048849106f, 0.09802246f, 0.043884277f, -0.1199646f, 0.15737915f, -0.029907227f, 0.13391113f, 0.06637573f, -0.099121094f, -0.1237793f, -0.048095703f, 0.075927734f, -0.08364868f, 0.004638672f, 0.075683594f, -0.01953125f, -0.08807373f, 0.0025634766f, 0.025878906f, 0.19824219f, -0.10424805f, -0.10772705f, -0.042404175f, 0.037597656f, -0.11669922f, 0.06225586f, 0.13726807f, -0.061187744f, -0.011352539f, -0.11206055f, 0.08355713f, -0.026367188f, 0.05923462f, 0.017791748f, 0.096710205f, -0.08648682f, 0.08544922f, -0.023376465f, -0.08300781f, 0.037109375f, -0.087537766f, 0.09222412f, -0.08605957f, 0.012329102f, 0.069366455f, -0.119140625f, -0.08274841f, 0.08850098f, 0.08482361f, -0.12313843f, 0.08607483f, 0.0027770996f, -0.02835083f, 0.030273438f, 0.035476685f, 0.018737793f, -0.14605713f, -0.015380859f, 0.014953613f, -0.12475586f, 0.04586792f, 0.020996094f, -0.068847656f, -0.020446777f, -0.087402344f, 0.06291199f, 0.08972168f, 0.03857422f, 0.053955078f, 0.0501709f, -0.010070801f, 0.082927704f, 0.15551758f, 0.15072632f, -0.01586914f, -0.07751465f, -0.06591797f, -0.072052f, 0.008178711f, -0.0043945312f, 0.032104492f, 0.005279541f, -0.07019043f, 0.03869629f, -0.0008239746f, 0.016906738f, 0.18945312f, -0.017150879f, 0.040771484f, -0.037017822f, -0.05947876f, 0.016113281f, -0.032196045f, -0.024383545f, -0.1028595f, 0.008361816f, 0.14367676f, 0.118774414f, -0.0859617f, -0.054229736f, -0.04147339f, 0.026367188f, 0.05239868f, 0.0617218f, -0.05822754f, -0.019500732f, -0.031394005f, -0.0732193f, -0.06237793f, 0.009895325f, -0.04348755f, 0.017456055f, -0.02709961f, -0.112335205f, -0.024505615f, -0.016479492f, 0.042175293f, -0.021278381f, -0.05444336f, 0.02747345f, 0.07368088f, 0.004699707f, -0.071380615f, -0.033691406f, -0.111328125f, 0.11135864f, -0.0071411133f, -0.061920166f, 0.07702637f, 0.0146484375f, 0.07809448f, -0.053771973f, 0.064575195f, -0.030273438f, 0.058410645f, -0.0368042f, -0.086868286f, -0.004272461f, 0.08566284f, 0.024902344f, -0.045217514f, 0.097976685f, 0.040283203f, -0.22698975f, 0.026611328f, -0.052124023f, -0.07800293f, -0.026947021f, 0.046253204f, 0.09573364f, 0.12817383f, -0.07620239f, 0.05822754f, 0.013305664f, 0.07409668f, -0.058380127f, -0.07273865f, -0.08312988f, 0.005126953f, -0.07797241f, -0.0029935837f, -0.016998291f, 0.028442383f, -0.020629883f, 0.0753479f}; + public static final float[] CENT12 = { 0.06805674f, 0.12919447f, -0.038472492f, 0.14589098f, -0.10427136f, -0.10670301f, 0.040964764f, -0.0031636555f, -0.058451336f, 0.002638075f, -0.11113146f, 0.0329539f, 0.025200738f, 0.017100016f, -0.17188178f, 0.056267634f, 0.07155355f, -0.106526695f, -0.026631674f, 0.04659589f, -0.006713867f, -0.0326572f, 0.04901801f, -0.06733958f, -0.0032891168f, -0.035386827f, -0.113716975f, -0.093566895f, 0.041679487f, -0.02307468f, -0.17012532f, -0.08881293f, -0.013836331f, -0.06714206f, -0.026601156f, -0.11480035f, -0.010026719f, 0.030171713f, 0.14204237f, -0.02214898f, 0.12425402f, 0.1252594f, 0.015455458f, 0.14497884f, -0.091668025f, -0.12074789f, -0.15755548f, -0.019774966f, 0.21474542f, -0.023035685f, -0.017799802f, 0.07431927f, 0.055338543f, -0.009465535f, 0.009209103f, 0.025580512f, -0.11038886f, -0.13448079f, 0.06487698f, -0.21812609f, 0.06153361f, -0.00110711f, -0.04666837f, -0.0205129f, -0.17906359f, -0.0692766f, 0.069125704f, 0.041497126f, 0.015970865f, -0.029756334f, -0.16274685f, 0.034501817f, 0.034501392f, -0.045237225f, -0.073611364f, -0.11956109f, 0.080934316f, -0.0984247f, 0.006015354f, 0.020329794f, 0.057803683f, 0.029324003f, -0.018907335f, 0.00917032f, 0.026489258f, 0.06908502f, 0.055770874f, 0.13696289f, 0.025482178f, -0.050137416f, 0.021221586f, -0.07700263f, -0.124937266f, -0.07269965f, -0.19362132f, 0.035625882f, 0.11542087f, 0.13591512f, 0.13841417f, 0.077046715f, -0.059956867f, -0.13900757f, -0.0084974505f, -0.08254666f, 0.037951812f, 0.052052815f, 0.0040249294f, 0.07557508f, 0.07439677f, 0.033354018f, -0.03094652f, -0.12217882f, 0.10142687f, -0.15155707f, 0.13718669f, 0.13123067f, -0.054760404f, -0.26071507f, 0.1525472f, 0.0047200522f, -0.15452068f, 0.2642585f, -0.29032388f, -0.020836724f, -0.07495117f, 0.10506524f, 0.0008050866f, 0.14537218f, 0.01689148f, -0.00072224933f, -0.120524086f, 0.059821233f, -0.0855306f, -0.19926623f, -0.018608518f, -0.10668977f, 0.009535048f, 0.038886175f, -0.08980306f, 0.043497723f, 0.104899086f, -0.09753418f, -0.06396336f, -0.018012153f, -0.236301f, -0.057210285f, 0.12693956f, -0.022835625f, -0.026767306f, 0.06186252f, 0.043619793f, -0.100986056f, -0.009162055f, 0.018795438f, -0.1062283f, -0.1180759f, -0.1449178f, -0.008541531f, -0.04486762f, 0.04437934f, 0.04341634f, -0.1254069f, -0.016215853f, -0.0129869245f, 0.101999916f, -0.23410374f, 0.03477478f, -0.00022379558f, -0.029252794f, 0.016430326f, -0.067104764f, -0.16140154f, -0.054602727f, -0.17793782f, 0.11973742f, -0.22290039f, 0.11149088f, -0.0669047f, -0.04801093f, -0.15920681f, 0.076280385f, 0.13127984f, -0.06636323f, -0.053025987f, 0.15916634f, 0.045598347f, 0.15272352f, 0.08546109f, 0.09436374f, -0.17894152f, -0.26646593f, -0.05309889f, -0.2061632f, 0.003969828f, -0.008043077f, -0.04152425f, -0.07333374f, -0.10318332f, 0.03955926f, -0.113119975f, 0.028394911f, 0.059149846f, 0.0126953125f, -0.08539496f, -0.03816647f, -0.0004679362f, 0.11260308f, -0.12664117f, -0.030244615f, -0.2008684f, -0.010398441f, -0.10445828f, -0.045935735f, -0.046459623f, -0.018873427f, 0.03949992f, -0.01626926f, 0.2116428f, -0.18247138f, -0.12221951f, -0.07839627f, 0.047254775f, -0.07347955f, 0.06665039f, -0.12548828f, 0.033925373f, -0.021511503f, -0.1254408f, 0.063802086f, 0.0017691718f, -0.20600043f, -0.039001465f, 0.024276733f, -0.15723334f, 0.12879096f, -0.022237143f, 0.08901808f, -0.066819936f, -0.16296387f, 0.1229994f, 0.051133897f, -0.108181424f, 0.0155164935f, 0.006395128f, -0.07823881f, -0.049099393f, -0.08496772f, 0.08513387f, -0.0019497342f, -0.069010414f, 0.03282335f, 0.08861457f, -0.052439373f, 0.23362902f, 0.03990343f, -0.03655328f, -0.061519198f, -0.0024685329f, -0.12774187f, -0.1493259f, 0.07357788f, 0.1507704f, 0.09578281f, -0.114461266f, 0.024664138f, 0.07507324f, -0.16441175f, -0.024095323f, 0.14252387f, -0.062808566f, 0.008338081f, -0.13509114f, -0.028503418f, 0.06719293f, 0.102903575f, -0.09318712f, -0.0728497f, 0.05083974f, -0.013576931f, 0.073766075f, -0.035902236f, -0.071787514f, -0.119032115f, 0.11783007f, -0.015913222f, -0.065707736f, 0.015178257f, -0.010457356f, -0.047120623f, 0.14442952f, 0.007721795f, 0.023444282f, 0.0053066676f, -0.029012045f, 0.032514784f, -0.07321845f, -0.09815809f, 0.02674018f, -0.030453153f, 0.11341858f}; + public static final float[] CENT13 = { 0.023887442f, 0.04328225f, -0.010615997f, -0.011113987f, 0.014236253f, -0.0430918f, 0.008559776f, -0.05946817f, 0.009133181f, -0.008995168f, 0.0022999437f, -0.008928592f, -0.026001275f, -0.01629829f, -0.041215546f, 0.015128399f, 0.042413536f, 0.02927332f, 0.007339804f, 0.034876205f, 0.021125833f, -0.012802619f, 0.036658563f, 0.007168808f, 0.039262254f, -0.010185934f, -0.023545453f, -0.0058815894f, 0.00042984384f, 0.0037821864f, -0.014377446f, -0.03403629f, 0.034196142f, -0.042221498f, 0.00941096f, -0.050391674f, 0.010207397f, 0.03963484f, 0.018589392f, -0.05907872f, 0.018258838f, 0.020952703f, 0.002253729f, 0.035919283f, -0.025100412f, -0.014855717f, 0.006790463f, -0.015182211f, 0.051088758f, 0.015250038f, -0.021682534f, 0.031982403f, -0.0129240975f, -0.046768267f, -0.017760443f, 0.03469587f, -0.027194778f, -0.016759176f, -0.043302435f, -0.05858666f, 0.004428014f, -0.00082386937f, -0.046825666f, -0.00025119373f, 0.008616349f, -0.07143855f, 0.0178493f, -0.03828467f, 0.018577926f, 0.04177994f, 0.004007671f, 0.0023730318f, 0.013645714f, -0.0016453027f, -0.04754308f, 0.018785695f, 0.03326965f, -0.001431091f, 0.04268769f, 0.012719795f, 0.0027666446f, -0.02921704f, 0.002785984f, -0.030465748f, -0.0073540653f, -0.00997982f, -0.031989984f, 0.07743722f, -0.0053592115f, -0.031771746f, 0.0027084653f, -0.075332254f, -0.0689272f, 0.022522721f, -0.093969144f, 0.00058223645f, 0.029364979f, 0.0038636622f, -0.013165569f, -0.010146545f, -0.060851023f, -0.041749667f, -0.009518807f, 0.0077533103f, 0.0466175f, 0.022864725f, 0.010516802f, -0.026686719f, -0.033597346f, 0.04361372f, 0.011671294f, -0.029025406f, 0.07234912f, -0.023016825f, -0.006820415f, 0.028353285f, -0.04196862f, -0.05788026f, 0.04924883f, -0.00901596f, -0.06477588f, 0.029714005f, -0.046167497f, 0.0034296536f, 0.0074805617f, -0.022608865f, -0.062459897f, 0.03606077f, -0.011069377f, 0.06481125f, -0.05644823f, -0.011992229f, -0.05089573f, 0.0073049227f, -0.010819596f, -0.014837794f, -0.013407598f, 0.015023272f, 0.010966384f, 0.024588298f, 0.028657649f, -0.019512752f, -0.024601744f, -0.0049135666f, -0.024682939f, 0.05225478f, 0.018332653f, -0.02326645f, -0.05480905f, 0.008553493f, 0.054482512f, -0.009253355f, -0.0042644828f, 0.049424272f, 0.0015176536f, 0.015923323f, -0.004973777f, 0.015784763f, -0.0050824485f, -0.028809816f, -0.0009051547f, 0.018248342f, 0.043301556f, 0.028620262f, 0.008295258f, -0.034418274f, 0.010144079f, 0.026495667f, 0.038976453f, 0.023473503f, -0.023442164f, -0.026858326f, -0.019612588f, -0.05169357f, 0.02481493f, -0.005562753f, 0.053349026f, -0.023030093f, 0.0049979254f, -0.010388954f, -0.006126483f, -0.030545894f, 0.018712599f, -0.0053512254f, 0.0031095357f, -0.02916358f, 0.03264064f, -0.023995161f, -0.015107573f, -0.017306454f, -0.03518537f, 0.030891867f, 0.0063416073f, 0.026934618f, -0.014153116f, -0.009344996f, 0.018916693f, -0.043965884f, -0.017963389f, -0.033212747f, -0.009345929f, 0.026899848f, -0.021013571f, 0.01096673f, -0.002146362f, -0.03863068f, 0.039443936f, 0.021393118f, -0.06742022f, -0.035460085f, 0.016222289f, -0.051651895f, -0.007712015f, 0.01750424f, -0.031537797f, -0.0061077406f, -0.0017888595f, -0.01443812f, 0.0012131595f, -0.03750441f, 0.006473097f, 0.02762059f, -0.0048683495f, -0.029081395f, 0.006447518f, -0.02242449f, -0.0026779545f, 0.014178307f, 0.010848135f, -0.011873919f, -0.018755317f, 0.0093434835f, -0.026704984f, -0.02565794f, -0.007460218f, 0.002535971f, 0.0069933077f, -0.0050370563f, -0.013613357f, 0.047536865f, 0.01674569f, -0.054112237f, -0.01349684f, -0.012108631f, -0.0015995634f, 0.0342833f, -0.04052497f, 0.02453487f, 0.0066620824f, -0.054981984f, -0.01108716f, 0.014572709f, -0.021390194f, 0.06686618f, 0.018571803f, 0.007833026f, -0.024126414f, -0.014005857f, -0.025563033f, -0.017231325f, 0.013814021f, 0.046524584f, -0.011900312f, -0.0060069542f, 0.03971834f, 0.036596384f, -0.024244066f, 0.046426922f, 0.048027005f, -0.012167879f, -0.006367762f, -0.056063052f, -0.03802658f, 0.08518752f, 0.0243557f, -0.007630976f, -0.029469613f, -0.01110791f, -0.0024744626f, 0.029586218f, 0.0074180793f, -0.016655238f, 0.0020666902f, 0.008282271f, -0.00071432843f, 0.019646727f, 0.001036279f, -0.007937904f, 0.0066516716f, 0.019833894f, -0.028454758f, 0.01589665f, -0.019746253f, 0.03534889f, -0.015503304f, -0.01816427f, -0.035352796f, -0.018944282f, 0.03613916f, 0.041357715f}; + public static final float[] CENT14 = {0.020693757f, -0.015522801f, -0.0020326127f, 0.015565384f, -0.0069977515f, 0.010830192f, 0.0034924885f, -0.026269248f, -0.028510515f, 0.021535475f, -0.004216039f, 0.016523493f, 0.013942985f, 0.005075854f, -0.03142317f, 0.0037486942f, 0.033810638f, 0.00828091f, -0.021524118f, -0.026710687f, -0.0048487466f, 0.010546307f, -0.016348906f, 0.030230856f, -0.021805165f, -0.012019668f, 0.008218455f, 0.041973736f, 0.009712397f, -0.019815134f, -0.034194592f, -0.005480389f, -0.009295796f, -0.019426301f, 0.0042682025f, -0.007933328f, -0.018543333f, 0.000024130179f, 0.00022710756f, -0.009135402f, 0.01356258f, 0.00036053325f, 0.025149323f, 0.030296149f, -0.011417832f, -0.04349252f, -0.037054017f, 0.0039743823f, 0.0032153463f, 0.036035582f, -0.02921171f, -0.020754082f, -0.015542674f, -0.005456259f, -0.0031839416f, 0.01216019f, -0.000028388446f, -0.017988337f, 0.014768379f, -0.008570294f, -0.006652832f, 0.0059644124f, -0.018712243f, 0.00109331f, -0.037915606f, -0.011358216f, -0.010906841f, 0.008953716f, 0.0024626977f, -0.008121224f, -0.012043797f, -0.023022054f, 0.0016096248f, -0.00240734f, -0.005618073f, -0.031329487f, 0.033062603f, -0.007608103f, -0.0027238713f, 0.020581622f, -0.005043207f, 0.004087936f, 0.006134743f, 0.0117812045f, 0.032220174f, -0.021237396f, 0.011139626f, 0.018321902f, 0.0092376f, -0.0035315226f, -0.007011946f, -0.02139921f, -0.011747139f, -0.015539834f, 0.023519827f, 0.008345493f, 0.02693815f, -0.008757835f, 0.017818008f, 0.016202705f, -0.0004911201f, -0.042855196f, -0.025830645f, -0.004753645f, -0.012942292f, 0.022313317f, -0.027604215f, -0.007309315f, -0.023317559f, 0.0027678735f, -0.0053299307f, 0.005691883f, -0.007433514f, -0.01586088f, 0.043275345f, 0.023644736f, 0.025041448f, -0.013082814f, 0.022540426f, -0.01402673f, 0.021072743f, -0.013439267f, -0.038723256f, 0.0003576944f, 0.005989962f, 0.011426349f, -0.037819088f, 0.039982285f, -0.0017373728f, 0.008053447f, 0.000383244f, -0.034991596f, -0.00852647f, 0.0023959847f, -0.0039617848f, -0.007481775f, 0.0010894065f, -0.0187676f, -0.002232751f, -0.0037103698f, 0.03944858f, -0.0056422036f, -0.023596475f, -0.01980378f, 0.0071084667f, -0.015091297f, 0.024711432f, -0.010818836f, 0.021549668f, -0.013720135f, 0.01018542f, -0.032205272f, -0.011925986f, 0.009175145f, 0.022006722f, -0.014334745f, 0.0018849927f, -0.011133948f, -0.012741444f, -0.025839163f, -0.0063476562f, 0.041481197f, -0.008877067f, -0.0050474657f, 0.011985601f, 0.016050827f, 0.007011946f, 0.0041844565f, -0.010984909f, 0.024599297f, -0.0010844386f, 0.009587487f, -0.025367914f, -0.021830713f, 0.0026032203f, -0.00568727f, 0.0350867f, -0.042520214f, -0.028291924f, -0.020648336f, -0.0034548738f, 0.00036621094f, -0.023206133f, -0.011294343f, 0.007396254f, -0.0030134334f, -0.0059047965f, -0.007122306f, 0.03535497f, -0.00075087434f, -0.023999237f, -0.005535747f, 0.027888099f, -0.02162206f, -0.04020389f, -0.002043968f, 0.014790379f, -0.053506542f, -0.0037408874f, -0.057554733f, 0.010581792f, 0.018980514f, -0.022801599f, -0.00014761991f, -0.027928552f, 0.023948492f, -0.009724816f, -0.0039715436f, 0.013544127f, -0.004070903f, -0.0014066474f, -0.035292514f, -0.006316429f, 0.019105423f, -0.03041538f, 0.015989792f, 0.024587233f, 0.0013331923f, 0.004286655f, 0.00018044405f, -0.02855026f, 0.017591609f, -0.012248195f, -0.017539801f, -0.0057401434f, 0.003855151f, 0.025555987f, -0.015150913f, 0.00675645f, 0.006588958f, 0.002043968f, 0.01606786f, -0.011051621f, -0.0022846821f, 0.009684009f, 0.0011099882f, 0.0056620752f, 0.010288682f, 0.021907363f, 0.011523668f, 0.010727993f, 0.014309994f, -0.0017437602f, -0.00071254995f, 0.004357626f, -0.038385436f, -0.0029431721f, 0.053620096f, -0.008983524f, -0.004975075f, -0.0048771347f, 0.018934382f, -0.011659844f, 0.026597135f, 0.019808747f, 0.013210208f, -0.018571721f, -0.00031369232f, -0.012962874f, -0.036749907f, 0.011417832f, 0.019128134f, -0.00033782248f, 0.010115513f, -0.010864258f, 0.01812407f, -0.008151742f, -0.015290016f, -0.030057685f, -0.0064393864f, 0.006279524f, 0.0248115f, 0.0041191634f, 0.008039608f, 0.022125954f, 0.005837019f, -0.019245947f, 0.002591865f, -0.010336232f, -0.0036621094f, -0.026395576f, 0.005168471f, 0.007031818f, 0.0275155f, 0.020506393f, 0.0036024936f, -0.011723274f, 0.008970749f, 0.00788702f, 0.032345794f, 0.00079771527f, -0.013184303f, -0.027126579f, 0.032757428f, 0.0057798875f, -0.010372695f, 0.007726625f, -0.01717288f, -0.028200017f, 0.005961573f}; + public static final float[] CENT15 = {-0.14274089f, -0.19466145f, -0.027974447f, -0.033854168f, -0.01739502f, -0.13108318f, 0.113464355f, -0.049479168f, 0.0546875f, -0.025716146f, 0.05033366f, 0.16748047f, -0.08561198f, 0.021647135f, -0.15690105f, 0.024719238f, -0.0120442705f, 0.048177082f, -0.008585612f, -0.024088541f, 0.08105469f, 0.07359823f, 0.03157552f, 0.115763344f, 0.052083332f, -0.12646484f, 0.033081055f, -0.17041016f, 0.08919271f, -0.11311849f, 0.11751302f, -0.16780598f, -0.056966145f, -0.036621094f, -0.018066406f, -0.080729164f, -0.11726888f, 0.07055664f, -0.08882141f, -0.05303955f, -0.15527344f, 0.02351888f, -0.027669271f, 0.042317707f, -0.083740234f, -0.20117188f, -0.14453125f, 0.12223307f, -0.23502605f, 0.10644531f, -0.20955403f, 0.20377605f, -0.0017089844f, 0.037027996f, 0.019429525f, 0.17610677f, -0.080851234f, 0.106770836f, -0.034505207f, -0.028645834f, -0.1659139f, 0.059368134f, -0.10432943f, -0.110188805f, -0.018717447f, -0.05170695f, -0.11897787f, 0.05078125f, 0.17602539f, 0.0094401045f, -0.3154297f, 0.21614583f, 0.17594402f, -0.03873698f, -0.10392252f, -0.053222656f, -0.03540039f, 0.025146484f, 0.033447266f, 0.13183594f, -0.041463215f, -0.041341145f, 0.010172526f, -0.13590495f, -0.2591146f, -0.124430336f, 0.12890625f, 0.10799154f, -0.055704754f, 0.13435872f, 0.09407552f, -0.036865234f, 0.020381927f, -0.22591145f, -0.06315104f, 0.1398112f, 0.16796875f, 0.025878906f, 0.1171875f, -0.067993164f, -0.06473795f, 0.08772787f, -0.17856216f, -0.10253906f, 0.088002525f, 0.063924156f, -0.11983236f, -0.018636068f, 0.32226562f, -0.23990886f, -0.048583984f, -0.05867513f, -0.045878094f, 0.13151042f, 0.26961264f, -0.041229248f, 0.00028483072f, -0.069600426f, 0.25423178f, -0.009114583f, 0.14111328f, 0.07307943f, -0.18945312f, -0.12906902f, -0.18473308f, -0.052408855f, -0.25976562f, -0.09037272f, -0.18131511f, -0.08850098f, 0.08557129f, 0.18164062f, 0.08211263f, -0.34342447f, -0.24609375f, 0.040649414f, -0.016947428f, -0.07674154f, 0.064941406f, 0.052571613f, -0.030761719f, -0.057739258f, -0.032145184f, 0.04414876f, -0.1944987f, -0.01419576f, 0.15722656f, 0.23307292f, 0.063802086f, 0.12687175f, 0.041829426f, -0.1282552f, -0.07336426f, 0.060384113f, -0.0032475789f, -0.032470703f, 0.0013020834f, -0.29003906f, -0.05891927f, -0.36132812f, 0.19628906f, 0.043863934f, 0.068359375f, -0.27571616f, 0.11669922f, 0.13655598f, 0.055664062f, 0.08390299f, 0.07454427f, 0.23331706f, -0.0013834635f, -0.09163412f, 0.031911213f, 0.13480632f, -0.028645834f, -0.10103353f, 0.11063639f, -0.18741862f, 0.01944987f, -0.044270832f, -0.095602036f, -0.10546875f, 0.11702474f, -0.10205078f, -0.00777181f, -0.1358846f, 0.14477539f, -0.065104164f, -0.07820638f, -0.1200765f, -0.06591797f, 0.05411784f, -0.044637043f, -0.039713543f, -0.09082031f, -0.083984375f, 0.08601888f, -0.20092773f, 0.20279948f, 0.10538737f, 0.044596355f, -0.09680939f, -0.013427734f, -0.019856771f, -0.10384115f, 0.06738281f, 0.0034993489f, -0.13509114f, -0.111979164f, -0.106404625f, 0.0843099f, -0.095458984f, 0.004597982f, 0.11743164f, -0.069885254f, 0.11230469f, 0.13330078f, 0.26041666f, 0.024332682f, -0.00032552084f, -0.2141927f, 0.025553385f, -0.13671875f, 0.10758463f, -0.037679035f, -0.19401042f, 0.3046875f, 0.11816406f, 0.2167155f, 0.015675863f, -0.021565756f, 0.05078125f, -0.16699219f, -0.112101234f, 0.10889181f, -0.05908203f, 0.06156413f, 0.102376305f, 0.19173177f, 0.024963379f, -0.08646647f, -0.13549805f, 0.06477865f, 0.0293986f, 0.16145833f, -0.17816162f, -0.024576822f, 0.057942707f, -0.12605794f, 0.08951823f, 0.07126363f, 0.0057779946f, 0.0013020834f, 0.05029297f, 0.08544922f, 0.0043945312f, 0.006693522f, 0.007863362f, -0.033528645f, 0.01570638f, 0.25297037f, 0.044596355f, 0.07712809f, 0.04593913f, 0.1303711f, 0.040364582f, -0.1985677f, -0.033284504f, -0.037719727f, 0.009521484f, -0.01171875f, 0.041341145f, -0.0041503906f, -0.08528646f, -0.048950195f, 0.14257812f, 0.13777669f, 0.10579427f, 0.123046875f, 0.07368978f, -0.23502605f, -0.13688152f, 0.044921875f, 0.053019207f, 0.25472006f, -0.028035482f, -0.045979816f, 0.020345053f, -0.006479899f, 0.2055664f, -0.20442708f, -0.032755535f, -0.18359375f, 0.084757484f, 0.057617188f, -0.15258789f, -0.023763021f, 0.08561198f, -0.09106445f, 0.17478435f}; + public static final float[] CENT16 = {0.02656795f, -0.027104424f, -0.023701625f, 0.011462082f, -0.025512533f, 0.060389522f, -0.011019379f, 0.012573456f, 0.00006660745f, 0.05440633f, 0.024419567f, 0.06254476f, 0.00945738f, 0.042720754f, -0.06743342f, -0.062002268f, 0.093855f, 0.08466438f, -0.053948864f, -0.03418326f, 0.0035179572f, 0.09657305f, 0.041789953f, 0.1114102f, -0.052297253f, -0.0072286013f, -0.06576554f, 0.0804786f, 0.056698054f, -0.041722827f, -0.0942104f, -0.049688164f, -0.020004777f, -0.08360924f, 0.023640066f, -0.03898163f, -0.03802612f, -0.016973581f, -0.05485116f, 0.020987524f, -0.017657237f, -0.00940428f, -0.03100636f, 0.055085022f, -0.016289782f, -0.14338008f, -0.0978748f, -0.035691503f, -0.018476747f, 0.08199449f, -0.23716761f, 0.048678715f, 0.0053085214f, -0.029854456f, 0.022752196f, 0.12057195f, -0.066667266f, -0.06031439f, 0.009872442f, -0.029122185f, -0.036573783f, 0.04026085f, -0.074521564f, -0.007982817f, -0.06545487f, -0.03930597f, -0.02995332f, 0.114491895f, 0.026842134f, -0.018609947f, -0.028653303f, -0.09892133f, 0.02646653f, -0.00049219467f, -0.03766971f, -0.13507031f, 0.06027189f, -0.042175874f, -0.0038599765f, 0.09306916f, 0.015026981f, -0.039265744f, 0.04640636f, 0.007602435f, 0.027302397f, -0.094714984f, 0.03138195f, 0.05761209f, 0.05664994f, 0.0036571308f, -0.013957359f, -0.07498821f, 0.0015954609f, 0.022175364f, 0.042785197f, 0.04936498f, 0.108851835f, -0.017658047f, 0.06870973f, 0.013605961f, 0.056776505f, -0.079834156f, -0.047623035f, -0.082656786f, -0.009184613f, 0.09143531f, -0.06428867f, -0.034113944f, -0.041463576f, 0.0033071435f, -0.01736521f, -0.0045458563f, 0.018470401f, -0.07156477f, 0.10132849f, 0.011189323f, -0.0042031673f, -0.008678358f, 0.09353669f, -0.023374408f, 0.008974004f, 0.03304258f, -0.10472009f, -0.016886134f, -0.0064085093f, 0.020377234f, -0.16142723f, 0.10706219f, -0.009268342f, 0.019277439f, 0.0074545406f, -0.070507176f, 0.0030508067f, -0.028105346f, -0.04113353f, -0.039555006f, -0.0053452025f, -0.059124712f, -0.011376239f, 0.08968982f, 0.06654408f, -0.0059532183f, -0.032439616f, -0.01190393f, -0.053534985f, -0.0715962f, 0.10922797f, 0.053967472f, 0.056878578f, -0.016893651f, 0.0608341f, -0.08945016f, 0.059513096f, 0.01656179f, 0.06696617f, -0.052033085f, -0.058528956f, -0.06854219f, -0.051555194f, -0.10613376f, 0.0056227036f, 0.03371274f, 0.039275832f, -0.018200492f, 0.00045457514f, 0.069378175f, 0.03423791f, 0.025084767f, -0.049370363f, 0.1101937f, -0.03191325f, -0.054423098f, 0.00052203424f, -0.12179094f, -0.008161711f, 0.0649785f, 0.017374473f, -0.13108492f, -0.07402524f, -0.090106815f, -0.07360143f, -0.06090822f, -0.0017429356f, -0.009459539f, -0.029883437f, -0.0058513056f, 0.009313043f, 0.017150085f, 0.014236924f, -0.01934018f, -0.06435077f, -0.031590696f, -0.0020541623f, -0.03461369f, -0.13596731f, -0.03453788f, 0.024404682f, -0.12659791f, 0.059909653f, -0.11344963f, 0.050642867f, 0.08782322f, -0.06213432f, 0.011047026f, -0.04064189f, 0.076447465f, -0.058785774f, -0.014466915f, 0.013738874f, -0.02089832f, -0.07065406f, 0.0048904438f, -0.043137375f, 0.08724391f, -0.080711655f, 0.055526365f, 0.07678428f, 0.058287468f, 0.020341216f, -0.04235529f, -0.094595075f, 0.07285795f, -0.10413374f, -0.05260495f, -0.054285236f, -0.035824493f, 0.029665837f, 0.025472462f, -0.04730713f, -0.060660373f, -0.027840154f, 0.07788931f, -0.002355852f, 0.021923304f, 0.0044257278f, -0.06905067f, 0.030987283f, -0.02046826f, 0.04784864f, 0.020299718f, -0.001193709f, -0.0172642f, 0.030180305f, -0.050354168f, 0.020537008f, -0.06904758f, -0.09149674f, 0.17464893f, -0.0600352f, -0.034378465f, -0.057807904f, 0.082749516f, -0.04385963f, 0.0392105f, 0.0417532f, 0.0036880733f, -0.077963606f, -0.0661607f, 0.011501812f, -0.09791016f, 0.08874466f, 0.0037862137f, 0.00021828938f, -0.024938494f, -0.02858041f, 0.062033206f, -0.07427365f, -0.045442957f, -0.057593673f, -0.03219853f, -0.0046714637f, 0.08933561f, -0.033553652f, 0.014436796f, 0.061099246f, 0.023641383f, -0.022744555f, 0.0013368449f, 0.0012309265f, -0.01283516f, -0.029628046f, 0.05035921f, 0.018101454f, 0.04118f, 0.055451624f, 0.030545818f, -0.030370355f, 0.052289147f, 0.014821105f, 0.11853957f, 0.0046891416f, -0.008374241f, -0.09473821f, 0.07351084f, 0.02768603f, -0.065745786f, -0.0152512025f, -0.052786786f, 0.016413951f, 0.034802563f}; + public static final float[] CENT17 = {0.021161398f, -0.03711319f, -0.039037388f, 0.020795027f, 0.03656451f, -0.012995879f, -0.035069782f, 0.008835156f, 0.09168053f, 0.068865456f, -0.049584705f, -0.012984593f, 0.054929096f, -0.013298035f, 0.015228748f, -0.020044962f, 0.02946218f, 0.031232595f, 0.075478874f, 0.016453108f, -0.007818937f, -0.080206715f, -0.0035835903f, -0.047954243f, 0.044600803f, 0.021486282f, -0.04513033f, 0.07988151f, 0.07167435f, 0.005168915f, -0.082972206f, -0.089253105f, -0.029696146f, -0.12600565f, -0.018627167f, -0.061745644f, 0.06435776f, 0.03155772f, -0.08172202f, -0.06942209f, -0.04021899f, -0.08317312f, -0.023119053f, 0.04123497f, -0.00998656f, 0.021935781f, -0.073498964f, -0.060555458f, 0.026864052f, 0.08098348f, -0.09366608f, 0.11097336f, 0.080596924f, -0.07087898f, 0.009587288f, 0.0775458f, -0.070334755f, -0.107387386f, -0.10397848f, -0.06756624f, -0.055469513f, -0.00032218298f, -0.09452502f, -0.098303474f, -0.0010375977f, -0.09127966f, -0.036734898f, 0.047098797f, 0.086210884f, 0.06067276f, 0.09621207f, -0.14546712f, 0.07867495f, -0.021165213f, 0.01514101f, 0.03850953f, 0.038534243f, -0.06870842f, 0.047059696f, -0.0018860499f, -0.05555447f, -0.11402607f, -0.024697622f, -0.043441135f, 0.0076384544f, -0.09158643f, -0.058855694f, 0.10580317f, 0.09152508f, -0.0044530234f, -0.048835117f, -0.09376017f, -0.010502021f, -0.044063013f, -0.074998856f, 0.09051641f, 0.009883881f, 0.0033046405f, 0.017051062f, -0.01797831f, -0.009545326f, -0.0966835f, -0.016155243f, -0.058233898f, 0.021101633f, -0.07574177f, 0.05640046f, -0.10064443f, -0.04831187f, -0.00554657f, -0.04132716f, 0.051321667f, 0.014981587f, -0.039093107f, 0.05645752f, -0.052484274f, 0.016121069f, -0.05233447f, 0.08966875f, 0.03136317f, -0.04396184f, 0.003165245f, 0.02381877f, -0.04537201f, 0.0696462f, -0.07842525f, -0.15355809f, 0.06634917f, 0.052481335f, 0.11067263f, -0.030817032f, -0.04990991f, -0.08596102f, -0.040086746f, -0.064427696f, -0.056094486f, -0.0687364f, 0.033021927f, -0.045628548f, 0.16889317f, -0.0045636497f, 0.031239828f, 0.014987946f, -0.09111325f, 0.01706483f, 0.03876241f, 0.011080424f, 0.04849012f, -0.025896072f, 0.04990514f, 0.06507301f, -0.050795157f, 0.033115387f, 0.09469732f, -0.0419782f, 0.011853228f, 0.039128978f, -0.016515732f, -0.094940186f, -0.026271185f, -0.017778715f, -0.011549632f, 0.098477684f, -0.030597687f, -0.022562345f, 0.039137203f, 0.021655718f, 0.07752792f, -0.044170696f, 0.03296407f, -0.14244588f, -0.06563791f, 0.052132923f, -0.0981191f, -0.07626155f, 0.005255381f, 0.035762787f, -0.036763668f, -0.06259588f, 0.024457296f, -0.107051216f, -0.029278437f, 0.042430878f, 0.04950778f, -0.050373077f, -0.086811066f, 0.18306732f, 0.042115767f, 0.116737366f, -0.119203724f, -0.037003834f, -0.010967891f, 0.032130558f, -0.01507314f, -0.076375805f, -0.0018297831f, 0.024028778f, 0.0068117776f, 0.07078171f, -0.08162689f, 0.017718634f, 0.030575117f, 0.007540067f, -0.028115472f, 0.0113658905f, 0.0046142736f, 0.01797835f, 0.08329189f, -0.006123861f, -0.02098465f, -0.061418217f, -0.008479039f, -0.00022498767f, 0.004346212f, -0.05483373f, 0.053641636f, 0.035109043f, -0.04006354f, 0.038001698f, -0.095181786f, -0.019408664f, 0.07150205f, -0.028364182f, -0.022036234f, -0.033699553f, -0.012293498f, -0.07649485f, 0.067111336f, -0.0070610046f, -0.06791592f, -0.039871216f, 0.00022125244f, -0.0846378f, 0.07417011f, -0.046431225f, 0.031129679f, 0.038763363f, -0.067466736f, 0.06755956f, 0.052381832f, 0.033643086f, 0.045773905f, 0.052595455f, -0.060716707f, -0.016037941f, -0.025623322f, 0.020775476f, 0.08478975f, -0.031114897f, -0.06060918f, -0.020280838f, 0.044234276f, -0.019839088f, 0.020787558f, 0.08852768f, 0.03827079f, 0.012430032f, -0.035530407f, -0.00952657f, -0.029183706f, 0.026107788f, 0.0082200365f, -0.018796286f, 0.0012804667f, 0.027149836f, 0.040764492f, -0.06421757f, -0.05242316f, -0.0060608387f, 0.0050783157f, -0.015781403f, 0.039179485f, -0.037661236f, 0.009583791f, 0.0986735f, -0.001112024f, -0.060036976f, -0.10703659f, -0.021316528f, -0.006624222f, -0.021764597f, 0.009518941f, -0.0013073286f, 0.040200233f, 0.062150538f, -0.047406834f, -0.042789143f, 0.05031554f, -0.0144120855f, 0.09724808f, 0.003803571f, -0.07053757f, -0.14302063f, 0.11892446f, 0.030239105f, -0.109812416f, -0.04820633f, -0.052171707f, -0.0014661154f, 0.0672156f}; + public static final float[] CENT18 = {0.042880792f, -0.07624927f, 0.010182256f, 0.015681723f, -0.027691385f, 0.053923316f, 0.017657958f, 0.032611128f, -0.008707683f, 0.055612758f, 0.03194173f, 0.07776874f, 0.008326434f, 0.03230706f, -0.13591567f, -0.036654487f, 0.13522029f, 0.1078133f, -0.047212906f, -0.044258006f, -0.011434361f, 0.12867129f, 0.06278796f, 0.13622992f, -0.053455297f, 0.0012224723f, -0.05748069f, 0.06669816f, 0.020377228f, -0.0818121f, -0.079135396f, -0.013410928f, 0.0050420347f, -0.13299538f, 0.00009630728f, -0.021058014f, -0.03192824f, -0.04384558f, -0.007006714f, 0.041602094f, 0.0011755902f, -0.019230496f, -0.020459382f, 0.06491676f, -0.017970873f, -0.1515222f, -0.1125025f, -0.06371474f, -0.041567154f, 0.1098739f, -0.2236934f, 0.052128945f, -0.024620028f, -0.044621564f, 0.043949626f, 0.1502596f, -0.062788814f, -0.05787371f, 0.019979062f, -0.07652559f, -0.0484712f, 0.036265496f, -0.07016124f, -0.0050609424f, -0.1068636f, -0.038411457f, -0.021202197f, 0.1463999f, -0.000028582586f, -0.022077698f, -0.04696589f, -0.12729502f, 0.044452835f, 0.008150515f, -0.022019759f, -0.16713637f, 0.07915082f, -0.034888007f, 0.030670607f, 0.10071504f, 0.037835605f, -0.048776984f, 0.044848457f, 0.0042622886f, 0.06409443f, -0.105552785f, 0.0705531f, 0.08535236f, 0.06528727f, 0.0317573f, -0.015307883f, -0.08798439f, -0.013357577f, -0.020328578f, 0.0325196f, 0.053226303f, 0.14792788f, -0.036282305f, 0.08275748f, -0.0021959387f, 0.04069472f, -0.078546055f, -0.08451163f, -0.06021472f, -0.03225484f, 0.07123687f, -0.066662885f, -0.025800014f, -0.033648062f, -0.002005978f, -0.0064068115f, 0.01236935f, -0.056479633f, -0.026692266f, 0.10910078f, 0.038337596f, 0.027983403f, -0.02224831f, 0.15500686f, -0.024340423f, -0.0002827437f, 0.030985404f, -0.0710842f, -0.006062715f, 0.0030623726f, 0.011736828f, -0.19717982f, 0.13527381f, -0.024397034f, 0.05075825f, 0.0048947544f, -0.07720284f, 0.006918368f, -0.032598965f, -0.077962354f, -0.028187186f, -0.007288608f, -0.04382059f, -0.029207645f, 0.085595615f, 0.074851654f, 0.0052127563f, -0.0053923232f, -0.045166235f, -0.07223953f, -0.07234701f, 0.12061476f, 0.07421875f, 0.08377164f, -0.00041187674f, 0.0824172f, -0.07487333f, 0.011843916f, 0.034113124f, 0.05061383f, -0.031537928f, -0.06263799f, -0.05961454f, -0.07395675f, -0.09714408f, 0.012731469f, 0.053072944f, -0.004439202f, -0.024217246f, 0.03865781f, 0.12039671f, 0.08435877f, 0.06962298f, -0.03526776f, 0.08654408f, -0.04358065f, -0.05603779f, -0.027131233f, -0.1227271f, 0.04802737f, 0.038418535f, 0.056839306f, -0.14018802f, -0.07296289f, -0.11705216f, -0.08657594f, -0.014468981f, 0.005891883f, -0.024486376f, -0.011529674f, 0.0051992666f, -0.0074802344f, 0.048726115f, 0.078553736f, -0.072771154f, -0.059586454f, -0.059504855f, -0.03231679f, -0.056307033f, -0.18120508f, -0.035537694f, 0.02283522f, -0.18167557f, 0.0014453833f, -0.16978565f, 0.012419438f, 0.10203331f, -0.026666641f, 0.031865742f, -0.06984069f, 0.035012066f, -0.01414241f, -0.03246053f, 0.059981193f, -0.013936803f, -0.06305573f, -0.017489446f, 0.0043777796f, 0.0902794f, -0.0946412f, 0.09555728f, 0.10858641f, 0.07100025f, -0.0025093504f, -0.056139518f, -0.07959946f, 0.051205844f, -0.05090332f, -0.06018199f, -0.0807342f, -0.09392957f, 0.040633935f, 0.0012425962f, -0.043289185f, -0.07107301f, -0.056377854f, 0.0778897f, -0.018377885f, 0.011250296f, -0.0046343594f, -0.05250041f, 0.01586914f, 0.01092557f, 0.04396809f, 0.022183405f, -0.03854547f, -0.018154366f, 0.03357265f, -0.05467246f, 0.01611129f, -0.14633046f, -0.053700324f, 0.1980071f, -0.07619908f, -0.029496953f, -0.058595907f, 0.07783132f, -0.10730777f, 0.07928732f, 0.04816824f, -0.01760798f, -0.1162304f, -0.05104048f, -0.020932626f, -0.069779105f, 0.13167892f, 0.019604227f, 0.0012649315f, -0.004268923f, -0.043876205f, 0.065887116f, -0.028502533f, -0.07389035f, -0.11210406f, -0.072872214f, 0.026853036f, 0.0729432f, -0.033727895f, 0.01655866f, 0.0022759922f, -0.022448333f, -0.031809047f, -0.014041624f, -0.0052208276f, -0.025296861f, -0.063355155f, 0.04960588f, 0.0017649332f, 0.057768144f, 0.04413555f, 0.0260127f, -0.030807717f, 0.056555044f, 0.03171141f, 0.17551234f, 0.003340569f, -0.0527043f, -0.1301196f, 0.07568818f, 0.08032608f, -0.02983707f, 0.005327971f, -0.07378929f, 0.006402942f, 0.045272607f}; + public static final float[] CENT19 = {0.014469522f, 0.007015611f, 0.009775747f, 0.051843736f, -0.06763714f, 0.014846305f, 0.05348311f, -0.04037206f, 0.07859488f, 0.04211869f, -0.02911627f, -0.056373544f, -0.009263961f, 0.04133615f, -0.07920257f, 0.04486323f, 0.025132203f, 0.08279483f, -0.020302182f, -0.06634655f, -0.018604577f, 0.026657216f, -0.035183996f, 0.06600752f, 0.009316866f, -0.04117117f, -0.061241064f, 0.054558113f, -0.003860721f, -0.051788345f, -0.020714628f, -0.037051298f, -0.017864957f, 0.0029328552f, 0.013650226f, -0.014823543f, 0.0045243544f, -0.012286617f, 0.07035388f, 0.003179769f, 0.06105771f, 0.029574163f, 0.03626453f, 0.044347983f, -0.030051664f, -0.104975924f, -0.07733047f, -0.013021423f, -0.013876168f, 0.037876096f, -0.030736012f, -0.016259551f, -0.06782997f, -0.059030514f, -0.008496574f, 0.0795777f, -0.02459979f, -0.09447214f, 0.05081923f, -0.057278715f, -0.017224971f, 0.024443382f, -0.06887052f, -0.05334697f, -0.042968642f, 0.007682094f, -0.058475602f, 0.10782898f, -0.009839149f, 0.004663631f, -0.007967484f, -0.026545865f, 0.08656577f, -0.019366553f, -0.06254438f, -0.08543674f, 0.08049359f, 0.040818986f, 0.02808782f, 0.088669404f, 0.015348791f, -0.00009424821f, 0.016397439f, 0.020236554f, 0.036644284f, -0.07525949f, -0.059739783f, 0.08526319f, 0.007714261f, 0.04663417f, 0.05720294f, -0.082084075f, -0.056377232f, -0.025218045f, 0.023348255f, -0.027365346f, 0.023605539f, -0.026022196f, 0.02796488f, 0.00034082663f, 0.005172748f, -0.0487523f, -0.018881602f, 0.010925305f, -0.019683385f, -0.022594221f, -0.04601932f, 0.00071226066f, 0.044715747f, -0.040805146f, -0.026737595f, -0.0074141813f, -0.018410137f, 0.0010086732f, 0.102629215f, 0.041188996f, 0.013370935f, -0.02240014f, 0.13358201f, 0.028496385f, -0.06500087f, 0.010557879f, -0.045168694f, 0.036849726f, 0.0062458795f, -0.0013879184f, -0.08188879f, 0.035283066f, -0.010419017f, 0.042407095f, -0.015434123f, -0.046854645f, -0.0460678f, -0.0045770854f, -0.0058696712f, -0.029734055f, 0.0463853f, -0.022027383f, -0.013552894f, 0.046999346f, 0.031936616f, -0.04144418f, -0.02924246f, 0.05012376f, 0.012666053f, -0.014641948f, 0.002757645f, -0.026138058f, -0.016868042f, -0.007943161f, 0.053335972f, 0.0019673929f, -0.05178222f, 0.030428601f, 0.007278082f, -0.044844527f, -0.061761554f, -0.029801061f, -0.06961413f, -0.04644612f, -0.002336318f, 0.04719083f, -0.016303785f, 0.0063981684f, -0.00557174f, 0.0009789455f, 0.027542928f, -0.018505165f, -0.038069624f, -0.01099081f, -0.06045079f, -0.019210672f, -0.021258084f, -0.07202606f, 0.048621856f, -0.016709287f, 0.053134546f, -0.109519444f, -0.065135516f, -0.019579073f, -0.09087152f, -0.04672797f, 0.0035288753f, -0.015243885f, 0.0068710055f, -0.0038201995f, -0.045886993f, 0.030791607f, 0.038667604f, 0.038273357f, -0.046059903f, -0.051111087f, 0.00585442f, -0.05210695f, -0.07326964f, 0.007541768f, -0.004215812f, -0.07085281f, -0.009573899f, -0.1240113f, 0.022205861f, 0.06607601f, -0.02358842f, 0.0013084867f, -0.040483736f, 0.004826578f, -0.004097631f, -0.040248662f, 0.016556425f, -0.012897729f, -0.027326308f, 0.012633681f, -0.029736673f, 0.071966164f, -0.06801389f, -0.00029209862f, 0.09730145f, 0.044594403f, -0.044615038f, -0.0026024152f, -0.020995889f, 0.022903198f, -0.03524775f, -0.017793693f, -0.010155545f, -0.03672407f, 0.013274971f, 0.0065897955f, 0.020038428f, -0.012736747f, -0.015420506f, -0.008375173f, 0.02470419f, 0.0018216759f, 0.021237614f, -0.034781154f, 0.016365243f, -0.024360757f, 0.06266979f, 0.008240807f, 0.0014309451f, -0.0118283825f, -0.0093105985f, -0.048253387f, 0.0056144567f, -0.029306864f, -0.018755464f, 0.110473745f, -0.009412424f, -0.0507726f, -0.007054656f, 0.059606258f, -0.023641342f, 0.041080322f, 0.028264841f, -0.059953805f, -0.009282413f, -0.012558762f, -0.053067904f, -0.035598066f, 0.042303417f, -0.028767992f, -0.022342514f, 0.022944901f, 0.029917285f, 0.08727772f, 0.0013490656f, -0.049155265f, -0.11554434f, -0.036680676f, 0.003305518f, 0.05895783f, 0.011847611f, 0.020743247f, 0.050384954f, -0.022261655f, -0.044348102f, -0.04860441f, -0.031728275f, 0.0048045465f, -0.030465025f, 0.026207563f, 0.019973284f, 0.041356426f, 0.012660176f, -0.0072446023f, -0.03820997f, 0.013607034f, 0.046828747f, 0.058420844f, -0.07433618f, 0.017476523f, -0.06213807f, 0.034771435f, -0.011899195f, -0.005945154f, 0.03675761f, -0.042279795f, 0.011430844f, 0.0077952463f}; + public static final float[] CENT20 = {-0.051882796f, -0.13422309f, 0.009901259f, 0.096947566f, 0.0047200522f, 0.06968859f, 0.057295058f, 0.017306857f, -0.028835721f, 0.0068291556f, -0.12868923f, 0.043789335f, -0.009453667f, 0.14176433f, 0.0999349f, -0.04783122f, 0.29618326f, 0.10454644f, -0.043252733f, -0.055033367f, 0.017734315f, 0.0636088f, 0.16931152f, -0.014373779f, 0.10953776f, -0.051486544f, -0.11858453f, 0.013956706f, 0.017818876f, 0.0007595486f, -0.038465712f, -0.13798015f, -0.19240315f, -0.045898438f, 0.103479594f, -0.018296983f, -0.08420139f, 0.073038734f, -0.04957411f, 0.06591797f, -0.07264879f, -0.0728285f, -0.023941888f, -0.09992133f, 0.009867351f, -0.18092176f, -0.17051867f, -0.0010715061f, 0.086235896f, 0.2633735f, -0.34358725f, 0.09311252f, 0.08540174f, -0.07889811f, 0.044799805f, 0.17175293f, -0.0152282715f, -0.008626302f, 0.034627277f, -0.1384413f, -0.022352431f, 0.0138441725f, -0.12098524f, -0.12426758f, -0.0032484266f, -0.12875706f, 0.06452433f, 0.22924805f, 0.114583336f, 0.21775988f, 0.052137587f, -0.08943685f, -0.05594211f, 0.10080295f, -0.08506945f, 0.0119222f, 0.019117568f, -0.048231337f, 0.026316324f, 0.0824924f, 0.030164931f, -0.028700087f, -0.0038384332f, 0.0656467f, -0.12858073f, -0.2414822f, -0.0057237414f, 0.034932453f, 0.114759654f, -0.011705186f, 0.0372857f, -0.06499566f, 0.07779948f, 0.025533041f, 0.2195638f, 0.11490885f, 0.12630208f, 0.0029296875f, -0.0041775173f, 0.07914225f, 0.051757812f, -0.054667156f, 0.039445665f, -0.13366699f, -0.11219618f, 0.09288195f, -0.0018310547f, -0.069620766f, -0.10058594f, 0.023105197f, -0.0593533f, 0.037275527f, 0.10396322f, -0.09391276f, 0.008422852f, -0.09509277f, 0.08767361f, -0.00069512264f, 0.13530816f, 0.0016818576f, 0.05472819f, 0.020955404f, -0.0855306f, -0.036193848f, 0.024291992f, -0.110812716f, -0.026991103f, 0.05460612f, 0.08935547f, 0.027108086f, -0.07748752f, -0.13921441f, 0.088127986f, -0.18562147f, -0.13923137f, -0.087239586f, 0.10636393f, -0.09134928f, 0.0797526f, 0.10715061f, 0.037326388f, 0.120271474f, -0.07055664f, -0.13448757f, 0.13986206f, -0.033596464f, 0.14124891f, -0.040744357f, 0.062445745f, -0.049045138f, 0.13623047f, 0.010904948f, 0.08359104f, -0.025146484f, 0.0934923f, -0.15299818f, 0.020621406f, -0.17502171f, -0.08352322f, -0.15157063f, 0.088127986f, -0.08015272f, 0.17944336f, 0.017062716f, 0.000027126736f, 0.22890896f, 0.09605238f, 0.006659614f, -0.036709256f, 0.23653157f, -0.07231818f, 0.09174262f, 0.08110894f, -0.28331164f, -0.122795954f, 0.010620117f, 0.022840712f, 0.07885742f, 0.0008002387f, -0.14819336f, -0.23318142f, -0.062479656f, 0.05501302f, -0.081739634f, -0.05211046f, -0.03962538f, 0.0064426f, -0.021501753f, -0.041566636f, -0.0069986978f, -0.10387251f, 0.0056084525f, -0.16438802f, 0.021606445f, -0.08930121f, -0.04014079f, 0.016036987f, -0.02460395f, 0.2204861f, -0.18132359f, 0.021891275f, 0.10394965f, -0.08281793f, 0.052571613f, -0.058539495f, 0.038465712f, -0.039635554f, 0.02548896f, 0.0124782985f, 0.039659288f, -0.01570638f, 0.04195828f, -0.016348945f, 0.0933431f, 0.08948178f, 0.05811225f, 0.13835992f, 0.0744222f, 0.12510173f, -0.0018174914f, -0.089138456f, 0.13000488f, 0.0026448567f, -0.031684026f, -0.08665636f, -0.05922275f, 0.11859809f, -0.020806206f, -0.0040961374f, -0.16315375f, 0.047987197f, 0.119615346f, -0.028700087f, 0.037136503f, -0.07187229f, -0.15334743f, 0.040700275f, 0.024820963f, 0.023573134f, 0.0967882f, -0.12277561f, -0.01167806f, 0.074910484f, 0.029201932f, 0.23464626f, -0.0105523f, 0.082085505f, 0.23063152f, -0.0400255f, -0.106662326f, -0.06415134f, 0.14609104f, -0.12375217f, -0.032765705f, 0.07941691f, -0.01961263f, 0.06188965f, -0.04089398f, 0.037760418f, -0.049811468f, 0.050547283f, 0.01744249f, 0.12310113f, -0.02416314f, 0.040130615f, 0.13659668f, -0.08180067f, 0.0663859f, 0.049316406f, 0.07910156f, -0.17890082f, 0.17515734f, -0.0257704f, 0.16341145f, 0.028240204f, -0.0841607f, -0.1633572f, -0.08631727f, -0.043999568f, 0.03382704f, -0.02484809f, 0.018229166f, -0.1433648f, 0.01811727f, 0.2909478f, 0.17488606f, -0.11437988f, -0.046169706f, 0.15422906f, 0.01969401f, -0.08602566f, -0.09290907f, -0.20019531f, 0.10454644f, -0.10055881f, -0.07435438f, -0.10297309f, -0.090535484f, -0.0053982204f, 0.02138265f}; + } diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java index ba628ce..de1a862 100644 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java @@ -23,78 +23,73 @@ public class NormalizedInMemModelTest { } public static Logger LOG = LogManager.getLogger(NormalizedInMemModelTest.class); - /*@Test - public void testNormalizedModel() { - LOG.info("Starting InMemory indexed model test!"); - final W2VNrmlMemModel memModel = Word2VecFactory.getNormalizedBinModel(); - LOG.info("Indexed Model instance created"); - Map wordKeyMap = new HashMap<>(); - wordKeyMap.put("cat", null); - wordKeyMap.put("dog", null); - wordKeyMap.put("airplane", null); - wordKeyMap.put("road", null); - - long startTime, diff; - long totTime = 0; - for (String word : wordKeyMap.keySet()) { - LOG.info("Sending query for word :" + word); - startTime = System.currentTimeMillis(); - float[] vec = memModel.getWord2VecMap().get(word); - Map closestWord = memModel.getClosestSubEntry(vec, wordKeyMap.get(word)); - Assert.assertTrue(closestWord.containsKey(word)); - diff = System.currentTimeMillis() - startTime; - totTime += diff; - LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + wordKeyMap.get(word) + "' is " - + diff + " milliseconds."); - } + /* + * @Test public void testNormalizedModel() { + * LOG.info("Starting InMemory indexed model test!"); final W2VNrmlMemModel + * memModel = Word2VecFactory.getNormalizedBinModel(); + * LOG.info("Indexed Model instance created"); Map wordKeyMap = + * new HashMap<>(); wordKeyMap.put("cat", null); wordKeyMap.put("dog", null); + * wordKeyMap.put("airplane", null); wordKeyMap.put("road", null); + * + * long startTime, diff; long totTime = 0; for (String word : + * wordKeyMap.keySet()) { LOG.info("Sending query for word :" + word); startTime + * = System.currentTimeMillis(); float[] vec = + * memModel.getWord2VecMap().get(word); Map closestWord = + * memModel.getClosestSubEntry(vec, wordKeyMap.get(word)); + * Assert.assertTrue(closestWord.containsKey(word)); diff = + * System.currentTimeMillis() - startTime; totTime += diff; + * LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + + * wordKeyMap.get(word) + "' is " + diff + " milliseconds."); } + * + * LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + + * " milliseconds"); + * + * } + */ - LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + " milliseconds"); + /* + * @Test public void testNbmTime() { + * LOG.info("Starting InMemory indexed model test!"); final + * W2VNrmlMemModelNonIndxd memModel = + * Word2VecFactory.getNormalizedBinNonIndxdModel(); float[][] centroids = + * {TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, + * TestConst.CENT5}; + * + * long startTime, diff; long totTime = 0; for (int i=0;i correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - + long startTime, diff; long totTime = 0; List wordSet = new ArrayList<>(); - for (int i=0;i closestWordMap = nbm.getClosestEntry( centroids[i]); + Map closestWordMap = nbm.getClosestEntry(centroids[i]); diff = System.currentTimeMillis() - startTime; totTime += diff; wordSet.addAll(closestWordMap.keySet()); - LOG.info("Query time recorded for Centroid " + (i+1) + " is " - + diff + " milliseconds."); + LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info("Average query time: " + (totTime / centroids.length) + " milliseconds"); From c35541ad08d007fc1b972c1650221c5360a71f17 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 10 Oct 2018 11:43:14 +0200 Subject: [PATCH 093/114] more centroids added to test --- .../aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index ef4211d..9f4487c 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -56,7 +56,10 @@ public void testNbmTime() throws IOException { long totTime = 0; LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); - float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; + float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5, + TestConst.CENT6, TestConst.CENT7, TestConst.CENT8, TestConst.CENT9, TestConst.CENT10, TestConst.CENT11, + TestConst.CENT12, TestConst.CENT13, TestConst.CENT14, TestConst.CENT15, TestConst.CENT16, + TestConst.CENT17, TestConst.CENT18, TestConst.CENT19, TestConst.CENT20 }; LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); @@ -74,8 +77,7 @@ public void testNbmTime() throws IOException { totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info( - "Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info("Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From e64feea8fea61b70382dd7bbc4d35cf25bb66db0 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 10 Oct 2018 11:51:25 +0200 Subject: [PATCH 094/114] model change --- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 9f4487c..f6d4e32 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -5,7 +5,7 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -63,11 +63,11 @@ public void testNbmTime() throws IOException { LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelKMeans Model"); - final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); + final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelKMeans Test"); + LOG.info("Starting W2VNrmlMemModelBinSrch Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -77,7 +77,7 @@ public void testNbmTime() throws IOException { totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info("Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info("Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From e823fa9ca768f8b9b9d08fdc77213e4a48b7eff7 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 10 Oct 2018 12:55:55 +0200 Subject: [PATCH 095/114] testing with words --- .../NrmlzdThetaMdlPrfmncTester.java | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index f6d4e32..270aa69 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -20,7 +20,13 @@ public class NrmlzdThetaMdlPrfmncTester { PropertyConfigurator.configure(Cfg.LOG_FILE); } public static Logger LOG = LogManager.getLogger(NrmlzdThetaMdlPrfmncTester.class); - + public static final float[][] TEST_CENTROIDS = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, + TestConst.CENT5, TestConst.CENT6, TestConst.CENT7, TestConst.CENT8, TestConst.CENT9, TestConst.CENT10, + TestConst.CENT11, TestConst.CENT12, TestConst.CENT13, TestConst.CENT14, TestConst.CENT15, TestConst.CENT16, + TestConst.CENT17, TestConst.CENT18, TestConst.CENT19, TestConst.CENT20 }; + public static final String[] TEST_WORDS = { "cat", "dog", "airplane", "road", "kennedy", "rome", "human", "disney", + "machine", "intelligence", "palaeontology", "surgeon", "amazon", "jesus", "gold", "atlantis", "ronaldo", + "pele", "scissors", "lizard" }; /* * @Test public void testNbmTime() { long startTime, diff; long totTime = 0; * LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = @@ -56,10 +62,8 @@ public void testNbmTime() throws IOException { long totTime = 0; LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); - float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5, - TestConst.CENT6, TestConst.CENT7, TestConst.CENT8, TestConst.CENT9, TestConst.CENT10, TestConst.CENT11, - TestConst.CENT12, TestConst.CENT13, TestConst.CENT14, TestConst.CENT15, TestConst.CENT16, - TestConst.CENT17, TestConst.CENT18, TestConst.CENT19, TestConst.CENT20 }; + // float[][] centroids = TEST_CENTROIDS; + float[][] centroids = fetchWordsVec(TEST_WORDS, nbm); LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); @@ -77,10 +81,19 @@ public void testNbmTime() throws IOException { totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info("Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info( + "Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); lrModelWords.clear(); } + + private static float[][] fetchWordsVec(String[] words, Word2VecModel nbm) { + float[][] resVec = new float[words.length][300]; + for (int i = 0; i < words.length; i++) { + resVec[i] = nbm.word2vec.get(words[i]); + } + return resVec; + } } From 5e04aacc19057a76520389b3469fe46961501b9a Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 10 Oct 2018 13:06:40 +0200 Subject: [PATCH 096/114] switching model in test --- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 270aa69..318022a 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -5,7 +5,7 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -67,11 +67,11 @@ public void testNbmTime() throws IOException { LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); - final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelKMeans Model"); + final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelBinSrch Test"); + LOG.info("Starting W2VNrmlMemModelKMeans Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -82,7 +82,7 @@ public void testNbmTime() throws IOException { LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } LOG.info( - "Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); + "Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From 30fe9a0f71c3a8d88f88fe388e394c897f7dc021 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 10 Oct 2018 14:04:16 +0200 Subject: [PATCH 097/114] logging comparison vec size --- .../word2vec/W2VNrmlMemModelBinSrch.java | 4 +++- .../word2vec/W2VNrmlMemModelKMeans.java | 4 +++- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 11 +++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index 5a0d243..32aec58 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -193,7 +193,9 @@ private String getClosestEntry(float[] vector, String subKey) { } tl.printTime(1, "Setting Bits"); tl.logTime(1); - int[] nearbyIndexes = new int[finBitSet.cardinality()]; + int nearbyWordsCount = finBitSet.cardinality(); + LOG.info("Number of nearby words: "+nearbyWordsCount); + int[] nearbyIndexes = new int[nearbyWordsCount]; int j = 0; for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { // operate on index i here diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 08e2a5b..e0c025d 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -198,7 +198,9 @@ private String getClosestEntry(float[] vector, String subKey) { } tl.printTime(1, "Setting Bits"); tl.logTime(1); - int[] nearbyIndexes = new int[finBitSet.cardinality()]; + int nearbyWordsCount = finBitSet.cardinality(); + LOG.info("Number of nearby words: "+nearbyWordsCount); + int[] nearbyIndexes = new int[nearbyWordsCount]; int j = 0; for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { // operate on index i here diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 318022a..a661d7a 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -5,6 +5,7 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; +import org.aksw.word2vecrestful.utils.Word2VecMath; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; @@ -81,12 +82,18 @@ public void testNbmTime() throws IOException { totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info( - "Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info("Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); lrModelWords.clear(); + + String word1 = "By_Jonas_Elmerraji"; + String word2 = "%_#F########_3v.jsn"; + float[] word1Vec = nbm.word2vec.get("By_Jonas_Elmerraji"); + float[] word2Vec = nbm.word2vec.get("%_#F########_3v.jsn"); + LOG.info("Cosine Similarity between " + word1 + " & " + word2 + " is : " + + Word2VecMath.cosineSimilarity(word1Vec, word2Vec)); } private static float[][] fetchWordsVec(String[] words, Word2VecModel nbm) { From a1ca6556efdd1b832d96c5c41bae6357c0adc4c4 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Wed, 10 Oct 2018 14:27:14 +0200 Subject: [PATCH 098/114] logs added --- .../org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index a661d7a..ecc5572 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -94,6 +94,10 @@ public void testNbmTime() throws IOException { float[] word2Vec = nbm.word2vec.get("%_#F########_3v.jsn"); LOG.info("Cosine Similarity between " + word1 + " & " + word2 + " is : " + Word2VecMath.cosineSimilarity(word1Vec, word2Vec)); + LOG.info("Cosine Similarity between " + word1 + " & Centroid19 is : " + + Word2VecMath.cosineSimilarity(TestConst.CENT19, word1Vec)); + LOG.info("Cosine Similarity between " + word2 + " & Centroid19 is : " + + Word2VecMath.cosineSimilarity(TestConst.CENT19, word2Vec)); } private static float[][] fetchWordsVec(String[] words, Word2VecModel nbm) { From b56475fac586085242eadbd4aa209c95b4c22ec6 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 12:54:43 +0200 Subject: [PATCH 099/114] changing config --- .../org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index ecc5572..87e55cb 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -63,8 +63,8 @@ public void testNbmTime() throws IOException { long totTime = 0; LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); - // float[][] centroids = TEST_CENTROIDS; - float[][] centroids = fetchWordsVec(TEST_WORDS, nbm); + float[][] centroids = TEST_CENTROIDS; + //float[][] centroids = fetchWordsVec(TEST_WORDS, nbm); LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); From c381f057abcc4c05689a7dd7e778c6e0256f3dc5 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 13:12:49 +0200 Subject: [PATCH 100/114] adding logs --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index e0c025d..82a2b23 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -214,8 +214,10 @@ private String getClosestEntry(float[] vector, String subKey) { closestWord = findClosestWord(nearbyIndexes, vector); tl.printTime(1, "finding closest word"); } catch (Exception e) { + LOG.error("Exception has occured while finding closest word."); e.printStackTrace(); } + LOG.info("Closest word found is: "+closestWord); return closestWord; } From c5dbeb28fa8f772cebf5cf655766aff029d62f92 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 13:35:43 +0200 Subject: [PATCH 101/114] null check added --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 82a2b23..30bcd8b 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -181,7 +181,10 @@ private String getClosestEntry(float[] vector, String subKey) { double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); int indx = getBucketIndex(cosSimVal); BitSet curBs = new BitSet(word2vec.size()); - curBs.or(csBucketContainer[i][indx]); + BitSet tempBs = csBucketContainer[i][indx]; + if(tempBs!=null) { + curBs.or(tempBs); + } int temIndx = indx + 1; if (temIndx < csBucketContainer[i].length && csBucketContainer[i][temIndx] != null) { curBs.or(csBucketContainer[i][temIndx]); From b9bffd88331f4e82bfff219e090f4b621e36a156 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 13:57:40 +0200 Subject: [PATCH 102/114] config change --- .../word2vec/W2VNrmlMemModelKMeans.java | 2 +- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 30bcd8b..b7c6a69 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -38,7 +38,7 @@ public class W2VNrmlMemModelKMeans implements GenWord2VecModel { private String[] wordArr; private float[][] vecArr; private int compareVecCount = 100; - private int bucketCount = 10; + private int bucketCount = 20; private int kMeansMaxItr = 5; private BitSet[][] csBucketContainer; private String vecFilePath = "data/kmeans/comparison-vecs.csv"; diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 87e55cb..447f6b3 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -87,17 +87,6 @@ public void testNbmTime() throws IOException { float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); lrModelWords.clear(); - - String word1 = "By_Jonas_Elmerraji"; - String word2 = "%_#F########_3v.jsn"; - float[] word1Vec = nbm.word2vec.get("By_Jonas_Elmerraji"); - float[] word2Vec = nbm.word2vec.get("%_#F########_3v.jsn"); - LOG.info("Cosine Similarity between " + word1 + " & " + word2 + " is : " - + Word2VecMath.cosineSimilarity(word1Vec, word2Vec)); - LOG.info("Cosine Similarity between " + word1 + " & Centroid19 is : " - + Word2VecMath.cosineSimilarity(TestConst.CENT19, word1Vec)); - LOG.info("Cosine Similarity between " + word2 + " & Centroid19 is : " - + Word2VecMath.cosineSimilarity(TestConst.CENT19, word2Vec)); } private static float[][] fetchWordsVec(String[] words, Word2VecModel nbm) { From 7dcfff3c1f20a4644f9ae6c9579267fd784bad13 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 14:07:19 +0200 Subject: [PATCH 103/114] config change --- .../word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 8 ++++++-- .../word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index 32aec58..c3344d6 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -27,7 +27,7 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private float[][] vecArr; private int[] indxArr; private double[] simValArr; - private int compareVecCount = 100; + private int compareVecCount = 150; private int bucketCount = 10; private BitSet[][] csBucketContainer; // TODO : Remove this @@ -176,7 +176,9 @@ private String getClosestEntry(float[] vector, String subKey) { double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); int indx = getBucketIndex(cosSimVal); BitSet curBs = new BitSet(word2vec.size()); - curBs.or(csBucketContainer[i][indx]); + if(csBucketContainer[i][indx]!=null) { + curBs.or(csBucketContainer[i][indx]); + } int temIndx = indx + 1; if (temIndx < csBucketContainer[i].length && csBucketContainer[i][temIndx] != null) { curBs.or(csBucketContainer[i][temIndx]); @@ -209,8 +211,10 @@ private String getClosestEntry(float[] vector, String subKey) { closestWord = findClosestWord(nearbyIndexes, vector); tl.printTime(1, "finding closest word"); } catch (Exception e) { + LOG.error("Exception has occured while finding closest word."); e.printStackTrace(); } + LOG.info("Closest word found is: "+closestWord); return closestWord; } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index b7c6a69..30bcd8b 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -38,7 +38,7 @@ public class W2VNrmlMemModelKMeans implements GenWord2VecModel { private String[] wordArr; private float[][] vecArr; private int compareVecCount = 100; - private int bucketCount = 20; + private int bucketCount = 10; private int kMeansMaxItr = 5; private BitSet[][] csBucketContainer; private String vecFilePath = "data/kmeans/comparison-vecs.csv"; From 716efca81faef8a3b7d35e2a3482a4f902b04486 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 14:08:06 +0200 Subject: [PATCH 104/114] test change --- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 447f6b3..1b672d6 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -6,7 +6,7 @@ import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.utils.Word2VecMath; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -68,11 +68,11 @@ public void testNbmTime() throws IOException { LOG.info("Starting BruteForce-Model Test"); List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelKMeans Model"); - final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); + final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelKMeans Test"); + LOG.info("Starting W2VNrmlMemModelBinSrch Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -82,7 +82,7 @@ public void testNbmTime() throws IOException { totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info("Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info("Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From 7521898eff086eb062a86f71c18010caa8fdc07e Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 14:43:14 +0200 Subject: [PATCH 105/114] removing obsolete models --- config/cfg.properties | 2 - .../subset/DataSubsetGenerator.java | 10 +- .../tool/ModelStatsWriter.java | 6 +- .../word2vec/W2VNrmlDbModel.java | 39 -- .../word2vec/W2VNrmlMemModelIndxd.java | 375 ------------------ .../word2vec/W2VNrmlMemModelIndxdLR.java | 358 ----------------- .../word2vec/W2VNrmlMemModelIndxdLRMulti.java | 82 ---- .../word2vec/W2VNrmlMemModelNonIndxd.java | 308 -------------- .../word2vec/W2VNrmlMemModelTheta.java | 190 --------- .../word2vec/Word2VecFactory.java | 19 - .../NormalizedInMemModelTest.java | 99 ----- .../NrmlzdMdlPrfmncTester.java | 135 ------- .../NrmlzdThetaMdlPrfmncTester.java | 65 +-- 13 files changed, 42 insertions(+), 1646 deletions(-) delete mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java delete mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java delete mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java delete mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java delete mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java delete mode 100644 src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java delete mode 100644 src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java delete mode 100644 src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java diff --git a/config/cfg.properties b/config/cfg.properties index dc20612..1af9b6f 100644 --- a/config/cfg.properties +++ b/config/cfg.properties @@ -4,8 +4,6 @@ org.aksw.word2vecrestful.word2vec.Word2VecModelLoader.bin: true org.aksw.word2vecrestful.word2vec.Word2VecFactory.model: data/GoogleNews-vectors-negative300.bin org.aksw.word2vecrestful.Application.inmemory: true org.aksw.word2vecrestful.Application.subsetfiledir: data/subset-files-1/ -org.aksw.word2vecrestful.word2vec.normalizedmodel.bin: false -org.aksw.word2vecrestful.word2vec.normalizedmodel.model: data/normal/GoogleNews-vectors-negative300-normalized.txt org.aksw.word2vecrestful.word2vec.normalizedbinmodel.bin: true org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model: data/normalbinmodel/GoogleNews-vectors-negative300-normalized.bin org.aksw.word2vecrestful.word2vec.stats.sdfile: data/normal/stat/normal-model-sd.csv \ No newline at end of file diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java index 8595239..4294d14 100644 --- a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java +++ b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java @@ -11,8 +11,8 @@ import java.util.Map.Entry; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelNonIndxd; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.commons.io.output.FileWriterWithEncoding; import com.fasterxml.jackson.core.JsonProcessingException; @@ -138,9 +138,9 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir, * @throws IOException */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { - File subsetConfig = new File("D:\\Nikit\\DICE-Group\\word2vec-dump\\subsetconfig2.json"); - W2VNrmlMemModelNonIndxd model = Word2VecFactory.getNormalizedBinNonIndxdModel(); - generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), - model.getWord2VecMap(), model.getVectorSize()); + File subsetConfig = new File("word2vec-dump\\subsetconfig2.json"); + Word2VecModel model = Word2VecFactory.getNormalBinModel(); + generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), model.word2vec, + model.vectorSize); } } diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java index 774942d..e19d498 100644 --- a/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelStatsWriter.java @@ -9,8 +9,8 @@ import java.util.Map.Entry; import java.util.Set; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelNonIndxd; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; +import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -87,7 +87,7 @@ private static void writeValues(int index, float min, float max, float mean, flo */ public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { // Get the normalized model - W2VNrmlMemModelNonIndxd model = Word2VecFactory.getNormalizedBinNonIndxdModel(); - writeModelStats(model.getWord2VecMap(), model.getVectorSize(), new File("data/normal/stat/normal-model-stats.csv")); + Word2VecModel model = Word2VecFactory.getNormalBinModel(); + writeModelStats(model.word2vec, model.vectorSize, new File("data/normal/stat/normal-model-stats.csv")); } } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java deleted file mode 100644 index f90ae1c..0000000 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlDbModel.java +++ /dev/null @@ -1,39 +0,0 @@ -package org.aksw.word2vecrestful.word2vec; - -import java.util.Map; - -import org.aksw.word2vecrestful.db.SQLiteDBHandler; - -public class W2VNrmlDbModel implements GenWord2VecModel{ - - private String mainTblName= "wordtovec"; - private String wordColName = "word"; - private String vecColPrefix = "val"; - - private int vectorSize; - private SQLiteDBHandler dbHandler; - - private Map sdMap; - public W2VNrmlDbModel(String dbName, int vectorSize) { - this.vectorSize = vectorSize; - this.dbHandler = new SQLiteDBHandler(dbName); - // Calculate sd*3/10 and save in map - } - @Override - public int getVectorSize() { - return this.vectorSize; - } - - @Override - public String getClosestEntry(float[] vector) { - // TODO Auto-generated method stub - return null; - } - - @Override - public String getClosestSubEntry(float[] vector, String subKey) { - // TODO Auto-generated method stub - return null; - } - -} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java deleted file mode 100644 index 590dd00..0000000 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxd.java +++ /dev/null @@ -1,375 +0,0 @@ -package org.aksw.word2vecrestful.word2vec; - -import java.io.IOException; -import java.util.Arrays; -import java.util.BitSet; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.aksw.word2vecrestful.subset.DataSubsetProvider; -import org.aksw.word2vecrestful.utils.Word2VecMath; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.dice_research.topicmodeling.commons.sort.AssociativeSort; - -import nikit.test.TimeLogger; - -/** - * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model - * - * @author Nikit - * - */ -public class W2VNrmlMemModelIndxd implements GenWord2VecModel { - public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - - private Map word2vec; - private int vectorSize; - private float[] sdArr; - /** - * Limit to the multiplier of area in which nearby vectors are to be looked - */ - private static final int EXHAUSTION_MULT = 20; - /** - * Multiplier for the standard deviation - */ - private static final int SIGMA_MULT = 3; - /** - * Divisor for the standard deviation's value - */ - private static final int AREA_DIVISOR = 20; - private DataSubsetProvider dataSubsetProvider; - /** - * Contains the sorted dimensional values mapped to their words - */ - private Object[][] indexesArr; - - private String[] gWordArr; - private float[][] gVecArr; - - // TODO : Remove this - private TimeLogger tl = new TimeLogger(); - - public W2VNrmlMemModelIndxd(final Map word2vec, final int vectorSize) { - this.word2vec = word2vec; - this.vectorSize = vectorSize; - this.dataSubsetProvider = new DataSubsetProvider(); - this.initArrays(); - // Calculate sd*3/10 and save in map - // Initialize indexesArr unsorted - // LOG.info("Initializing indexes and calculating standard deviation"); - this.setModelVals(word2vec, vectorSize); - // LOG.info("Sorting indexes"); - // Sort the indexes - this.sortIndexes(); - // LOG.info("Sorting completed"); - } - - private void initArrays() { - this.indexesArr = new Object[vectorSize][2]; - this.gWordArr = new String[word2vec.size()]; - this.gVecArr = new float[word2vec.size()][vectorSize]; - int i = 0; - for (Entry entry : word2vec.entrySet()) { - gWordArr[i] = entry.getKey(); - gVecArr[i] = entry.getValue(); - i++; - } - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestEntry(float[] vector) { - return getClosestEntry(vector, null); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestSubEntry(float[] vector, String subKey) { - return getClosestEntry(vector, subKey); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - private String getClosestEntry(float[] vector, String subKey) { - Set wordSet = null; - String closestVec = null; - try { - if (subKey == null) { - wordSet = word2vec.keySet(); - } else { - tl.logTime(1); - wordSet = dataSubsetProvider.fetchSubsetWords(subKey); - tl.printTime(1, "fetchSubsetWords"); - } - // LOG.info("Normalizing input vector"); - // Normalize incoming vector - vector = Word2VecMath.normalize(vector); - // LOG.info("fetching nearby vectors"); - // Find nearby vectors - tl.logTime(2); - Map nearbyVecs = fetchNearbyVectors(vector, wordSet); - tl.printTime(2, "fetchNearbyVectors"); - // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); - // Select the closest vector - tl.logTime(3); - closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); - tl.printTime(3, "findClosestVecInNearbyVecs"); - } catch (IOException e) { - // LOG.error(e.getStackTrace()); - } - // LOG.info("Closest word found is " + closestVec.keySet()); - return closestVec; - } - - /** - * Method to find standard deviation for each dimension of word vector and store - * the operated value next to the dimension's index in sdMap - * - * @param word2vecMap - * - mapping of words alongwith their vectors - * @param vectorSize - * - size of each vector - */ - public void setModelVals(Map word2vecMap, int vectorSize) { - float[] resMap = new float[vectorSize]; - int totSize = word2vecMap.size(); - // loop all dimensions - for (int i = 0; i < vectorSize; i++) { - // loop through all the words - float[] dimsnArr = new float[totSize]; - int[] idArr = new int[totSize]; - float sum = 0; - for (int j = 0; j < gWordArr.length; j++) { - float val = gVecArr[j][i]; - sum += val; - idArr[j] = j; - dimsnArr[j] = val; - } - // Setting value in indexArr - Object[] dimValWordMap = new Object[2]; - dimValWordMap[0] = idArr; - dimValWordMap[1] = dimsnArr; - this.indexesArr[i] = dimValWordMap; - // LOG.info("Dimension " + (i) + " index stored to memory"); - // mean - float mean = sum / dimsnArr.length; - sum = 0; - for (int j = 0; j < dimsnArr.length; j++) { - sum += Math.pow(dimsnArr[j] - mean, 2); - } - float variance = sum / dimsnArr.length; - Double sd = Math.sqrt(variance); - resMap[i] = sd.floatValue() * SIGMA_MULT / AREA_DIVISOR; - } - // Set as sdMap - this.sdArr = resMap; - } - - /** - * Method to fetch nearby vectors for a given vector in a particular word set - * - * @param vector - * - vector to look nearby vectors for - * @param wordSet - * - word set to look into for nearby vectors - * @return - mapping of nearby words alongwith with their vector values - */ - private Map fetchNearbyVectors(float[] vector, Set wordSet) { - Map nearbyVecMap = new HashMap<>(); - boolean mapEmpty = true; - boolean notExhausted = true; - float[][] minMaxVec = getMinMaxVec(vector); - tl.printTime(2, "getMinMaxVec"); - int mult = 1; - while (mapEmpty && notExhausted) { - if (mult > 1) { - tl.logTime(8); - incrementMinMaxVec(minMaxVec); - tl.printTime(8, "incrementMinMaxVec"); - } - - tl.logTime(4); - putNearbyVecsIndxd(minMaxVec, wordSet, nearbyVecMap); - tl.printTime(4, "putNearbyVecsIndxd"); - if (nearbyVecMap.size() > 0) { - mapEmpty = false; - } else { - ++mult; - if (mult > EXHAUSTION_MULT) { - notExhausted = false; - } - // LOG.info("MinMax multiplier incremented to " + mult); - } - } - return nearbyVecMap; - } - - private void putNearbyVecsIndxd(float[][] minMaxVec, Set wordSet, Map nearbyVecMap) { - // init a set to hold words - Set nearbyWords = new HashSet<>(); - float[] minVec = minMaxVec[0]; - float[] maxVec = minMaxVec[1]; - BitSet finBitSet = null; - BitSet tempBitSet; - tl.logTime(5); - for (int i = 0; i < vectorSize; i++) { - tempBitSet = new BitSet(word2vec.size()); - // LOG.info("Searching inside dimension " + (i) + "'s index"); - float minVal = minVec[i]; - float maxVal = maxVec[i]; - // LOG.info("MinVal and MaxVal for the current dimension: " + minVal + " " + - // maxVal); - Object[] entryArr = indexesArr[i]; - int[] idArr = (int[]) entryArr[0]; - float[] dimsnValArr = (float[]) entryArr[1]; - int from = Arrays.binarySearch(dimsnValArr, minVal); - // LOG.info("From value of dimension array: " + from); - if (from < 0) { - // To select the insertion point - from = -1 - from; - } - // LOG.info("Final From value of current dimension array: " + from); - int to = Arrays.binarySearch(dimsnValArr, maxVal); - // LOG.info("To value of dimension array: " + to); - if (to < 0) { - // To select the insertion point - to = -1 - to; - } else { - // Because binarySearch returns the exact index if element exists - to++; - } - // LOG.info("Final To value of current dimension array: " + to); - LOG.info("Setting bits for the words between 'from' and 'to' indexes:\t" + from + " " + to); - tl.logTime(9); - for (int j = from; j < to; j++) { - tempBitSet.set(idArr[j], true); - } - tl.printTime(9, "Setting bits for index" + i); - if (i == 0) { - finBitSet = tempBitSet; - } else { - finBitSet.and(tempBitSet); - } - if (finBitSet.isEmpty()) { - // LOG.info("Word not found in the current min-max area."); - break; - } - } - // LOG.info("Extracting words for set bits"); - tl.printTime(5, "Setting Bitset"); - tl.logTime(6); - for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1)) { - // operate on index i here - nearbyWords.add(gWordArr[i]); - if (i == Integer.MAX_VALUE) { - break; // or (i+1) would overflow - } - } - tl.printTime(6, "extracting words"); - // LOG.info("Nearby words size before retainAll from wordset: " + - // nearbyWords.size()); - // Clear all the words not in wordset - tl.logTime(7); - nearbyWords.retainAll(wordSet); - tl.printTime(7, "retaining words"); - // LOG.info("Nearby words size after retainAll from wordset: " + - // nearbyWords.size()); - for (String word : nearbyWords) { - nearbyVecMap.put(word, word2vec.get(word)); - } - } - - /** - * Method to generate two vectors from a given vector by adding and subtracting - * value in sdMap from the given vector - * - * @param vector - * - input vector to perform operation on - * @return - min vector at index 0 and max vector at index 1 - */ - private float[][] getMinMaxVec(float[] vector) { - float[][] resVec = new float[2][vector.length]; - for (int i = 0; i < vector.length; i++) { - // TODO: change sdmap to array - float diff = sdArr[i]; - // MinVec - resVec[0][i] = vector[i] - diff; - // MaxVec - resVec[1][i] = vector[i] + diff; - } - return resVec; - } - - private void incrementMinMaxVec(float[][] minMaxVec) { - float[] minVec = minMaxVec[0]; - float[] maxVec = minMaxVec[1]; - for (int i = 0; i < vectorSize; i++) { - float diff = sdArr[i]; - // MinVec - minVec[i] -= diff; - // MaxVec - maxVec[i] += diff; - } - } - - private void sortIndexes() { - for (int i = 0; i < indexesArr.length; i++) { - // LOG.info("Sorting index " + i); - Object[] entryArr = indexesArr[i]; - int[] idArr = (int[]) entryArr[0]; - float[] dimsnValArr = (float[]) entryArr[1]; - AssociativeSort.quickSort(dimsnValArr, idArr); - // LOG.info("Sorting completed for index " + i); - } - } - - /** - * Method to fetch vectorSize - * - * @return - vectorSize - */ - @Override - public int getVectorSize() { - return this.vectorSize; - } - - /** - * Method to fetch word2vec map - * - * @return - word2vec map - */ - public Map getWord2VecMap() { - return this.word2vec; - } - -} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java deleted file mode 100644 index d8ce0b7..0000000 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLR.java +++ /dev/null @@ -1,358 +0,0 @@ -package org.aksw.word2vecrestful.word2vec; - -import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.aksw.word2vecrestful.subset.DataSubsetProvider; -import org.aksw.word2vecrestful.utils.Word2VecMath; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.dice_research.topicmodeling.commons.collections.TopShortIntCollection; -import org.dice_research.topicmodeling.commons.sort.AssociativeSort; - -import nikit.test.TimeLogger; - -/** - * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model - * - * @author Nikit - * - */ -public class W2VNrmlMemModelIndxdLR implements GenWord2VecModel { - public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - - private Map word2vec; - protected int vectorSize; - private float[] sdArr; - private float[] multSdArr; - /** - * Multiplier for the standard deviation - */ - private float sigmaMult = 3; - /** - * Divisor for the standard deviation's value - */ - private float areaDivisor = 10; - /** - * Number of top elements to give priority - */ - protected int k = 10; - private DataSubsetProvider dataSubsetProvider; - /** - * Contains the sorted dimensional values mapped to their words - */ - protected Object[][] indexesArr; - - protected String[] gWordArr; - protected float[][] gVecArr; - - // TODO : Remove this - private TimeLogger tl = new TimeLogger(); - - public W2VNrmlMemModelIndxdLR(final Map word2vec, final int vectorSize) { - this.word2vec = word2vec; - this.vectorSize = vectorSize; - this.dataSubsetProvider = new DataSubsetProvider(); - this.initArrays(); - // Calculate sd*3/10 and save in map - // Initialize indexesArr unsorted - // LOG.info("Initializing indexes and calculating standard deviation"); - this.setModelVals(word2vec, vectorSize); - // LOG.info("Sorting indexes"); - // Sort the indexes - this.sortIndexes(); - // LOG.info("Sorting completed"); - } - - public W2VNrmlMemModelIndxdLR(final Map word2vec, final int vectorSize, int sigmaMult, - int areaDivisor) { - this.word2vec = word2vec; - this.vectorSize = vectorSize; - this.sigmaMult = sigmaMult; - this.areaDivisor = areaDivisor; - this.dataSubsetProvider = new DataSubsetProvider(); - this.initArrays(); - // Calculate sd*3/10 and save in map - // Initialize indexesArr unsorted - // LOG.info("Initializing indexes and calculating standard deviation"); - this.setModelVals(word2vec, vectorSize); - // LOG.info("Sorting indexes"); - // Sort the indexes - this.sortIndexes(); - // LOG.info("Sorting completed"); - } - - public void updateSdArr(float newSigmaMult, float newAreaDivisor) { -// if (newSigmaMult == sigmaMult && newAreaDivisor == areaDivisor) { -// return; -// } -// float mult = (areaDivisor / sigmaMult) * (newSigmaMult / newAreaDivisor); - float mult = (newSigmaMult / newAreaDivisor); - // Updating SdArr values - for (int i = 0; i < multSdArr.length; i++) { - multSdArr[i] = mult * sdArr[i]; - } - this.sigmaMult = newSigmaMult; - this.areaDivisor = newAreaDivisor; - } - - private void initArrays() { - this.indexesArr = new Object[vectorSize][2]; - this.gWordArr = new String[word2vec.size()]; - this.gVecArr = new float[word2vec.size()][vectorSize]; - int i = 0; - for (Entry entry : word2vec.entrySet()) { - gWordArr[i] = entry.getKey(); - gVecArr[i] = entry.getValue(); - i++; - } - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestEntry(float[] vector) { - return getClosestEntry(vector, null); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestSubEntry(float[] vector, String subKey) { - return getClosestEntry(vector, subKey); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - private String getClosestEntry(float[] vector, String subKey) { - Set wordSet = null; - String closestVec = null; - try { - if (subKey == null) { - wordSet = word2vec.keySet(); - } else { - // tl.logTime(1); - wordSet = dataSubsetProvider.fetchSubsetWords(subKey); - // tl.printTime(1, "fetchSubsetWords"); - } - // LOG.info("Normalizing input vector"); - // Normalize incoming vector - vector = Word2VecMath.normalize(vector); - // LOG.info("fetching nearby vectors"); - // Find nearby vectors - // tl.logTime(2); - Map nearbyVecs = fetchNearbyVectors(vector, wordSet); - // tl.printTime(2, "fetchNearbyVectors"); - // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); - // Select the closest vector - // tl.logTime(3); - closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); - // tl.printTime(3, "findClosestVecInNearbyVecs"); - } catch (IOException e) { - // LOG.error(e.getStackTrace()); - } - // LOG.info("Closest word found is " + closestVec.keySet()); - return closestVec; - } - - /** - * Method to find standard deviation for each dimension of word vector and store - * the operated value next to the dimension's index in sdMap - * - * @param word2vecMap - * - mapping of words alongwith their vectors - * @param vectorSize - * - size of each vector - */ - public void setModelVals(Map word2vecMap, int vectorSize) { - float[] resArr = new float[vectorSize]; - int totSize = word2vecMap.size(); - // loop all dimensions - for (int i = 0; i < vectorSize; i++) { - // loop through all the words - float[] dimsnArr = new float[totSize]; - int[] idArr = new int[totSize]; - double sum = 0; - for (int j = 0; j < gWordArr.length; j++) { - float val = gVecArr[j][i]; - sum += val; - idArr[j] = j; - dimsnArr[j] = val; - } - // Setting value in indexArr - Object[] dimValWordMap = new Object[2]; - dimValWordMap[0] = idArr; - dimValWordMap[1] = dimsnArr; - this.indexesArr[i] = dimValWordMap; - // LOG.info("Dimension " + (i) + " index stored to memory"); - // mean - double mean = sum / dimsnArr.length; - sum = 0; - for (int j = 0; j < dimsnArr.length; j++) { - sum += Math.pow(dimsnArr[j] - mean, 2); - } - double variance = sum / dimsnArr.length; - Double sd = Math.sqrt(variance); - resArr[i] = sd.floatValue(); - } - // Set as sdMap - this.sdArr = resArr; - this.multSdArr = new float[sdArr.length]; - // Apply the multiplicator - updateSdArr(sigmaMult, areaDivisor); - } - - /** - * Method to fetch nearby vectors for a given vector in a particular word set - * - * @param vector - * - vector to look nearby vectors for - * @param wordSet - * - word set to look into for nearby vectors - * @return - mapping of nearby words alongwith with their vector values - */ - private Map fetchNearbyVectors(float[] vector, Set wordSet) { - Map nearbyVecMap = new HashMap<>(); - float[][] minMaxVec = getMinMaxVec(vector); - putNearbyVecs(minMaxVec, nearbyVecMap); - return nearbyVecMap; - } - - protected void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMap) { - // init score array - short[] scoreArr = new short[gWordArr.length]; - float[] minVec = minMaxVec[0]; - float[] maxVec = minMaxVec[1]; - // loop through each dimension and increment the score of words in that area - for (int i = 0; i < vectorSize; i++) { - float minVal = minVec[i]; - float maxVal = maxVec[i]; - Object[] entryArr = indexesArr[i]; - int[] idArr = (int[]) entryArr[0]; - float[] dimsnValArr = (float[]) entryArr[1]; - int from = Arrays.binarySearch(dimsnValArr, minVal); - // LOG.info("From value of dimension array: " + from); - if (from < 0) { - // To select the insertion point - from = -1 - from; - } - // LOG.info("Final From value of current dimension array: " + from); - int to = Arrays.binarySearch(dimsnValArr, maxVal); - // LOG.info("To value of dimension array: " + to); - if (to < 0) { - // To select the insertion point - to = -1 - to; - } else { - // Because binarySearch returns the exact index if element exists - to++; - } - // LOG.info("Setting scores for the words between 'from' and 'to' indexes:\t" + from + " " + to); - // tl.logTime(9); - for (int j = from; j < to; j++) { - scoreArr[idArr[j]]++; - } - // tl.printTime(9, "Score set for index " + i); - } - // find the index of the words with highest score and add them to nearbyVecMap - for (int wordId : getMaxIdList(scoreArr)) { - nearbyVecMap.put(gWordArr[wordId], gVecArr[wordId]); - } - } - - private int[] getMaxIdList(short[] scoreArr) { - TopShortIntCollection collection = new TopShortIntCollection(k, false); - for (int i = 0; i < scoreArr.length; i++) { - short score = scoreArr[i]; - collection.add(score, i); - } - return collection.getObjects(); - } - - /** - * Method to generate two vectors from a given vector by adding and subtracting - * value in sdMap from the given vector - * - * @param vector - * - input vector to perform operation on - * @return - min vector at index 0 and max vector at index 1 - */ - private float[][] getMinMaxVec(float[] vector) { - float[][] resVec = new float[2][vector.length]; - for (int i = 0; i < vector.length; i++) { - // TODO: change sdmap to array - float diff = multSdArr[i]; - // MinVec - resVec[0][i] = vector[i] - diff; - // MaxVec - resVec[1][i] = vector[i] + diff; - } - return resVec; - } - - private void sortIndexes() { - for (int i = 0; i < indexesArr.length; i++) { - // LOG.info("Sorting index " + i); - Object[] entryArr = indexesArr[i]; - int[] idArr = (int[]) entryArr[0]; - float[] dimsnValArr = (float[]) entryArr[1]; - AssociativeSort.quickSort(dimsnValArr, idArr); - // LOG.info("Sorting completed for index " + i); - } - } - - /** - * Method to fetch vectorSize - * - * @return - vectorSize - */ - @Override - public int getVectorSize() { - return this.vectorSize; - } - - /** - * Method to fetch word2vec map - * - * @return - word2vec map - */ - public Map getWord2VecMap() { - return this.word2vec; - } - - public int getK() { - return k; - } - - public void setK(int k) { - this.k = k; - } - -} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java deleted file mode 100644 index e743c8d..0000000 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelIndxdLRMulti.java +++ /dev/null @@ -1,82 +0,0 @@ -package org.aksw.word2vecrestful.word2vec; - -import java.util.Arrays; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicIntegerArray; -import java.util.stream.IntStream; - -import org.dice_research.topicmodeling.commons.collections.TopIntIntCollection; - -public class W2VNrmlMemModelIndxdLRMulti extends W2VNrmlMemModelIndxdLR { - - public W2VNrmlMemModelIndxdLRMulti(final Map word2vec, final int vectorSize) { - super(word2vec, vectorSize); - } - - public W2VNrmlMemModelIndxdLRMulti(Map word2vec, int vectorSize, int sigmaMult, int areaDivisor) { - super(word2vec, vectorSize, sigmaMult, areaDivisor); - } - - protected void putNearbyVecs(float[][] minMaxVec, Map nearbyVecMap) { - // init score array -// AtomicIntegerArray scoreArr = new AtomicIntegerArray(gWordArr.length); - AtomicInteger scoreArr[] = new AtomicInteger[gWordArr.length]; - for (int j = 0; j < scoreArr.length; j++) { - scoreArr[j] = new AtomicInteger(); - } - float[] minVec = minMaxVec[0]; - float[] maxVec = minMaxVec[1]; - // loop through each dimension and increment the score of words in that area - IntStream.range(0, vectorSize).parallel().forEach(i -> { - float minVal = minVec[i]; - float maxVal = maxVec[i]; - Object[] entryArr = indexesArr[i]; - int[] idArr = (int[]) entryArr[0]; - float[] dimsnValArr = (float[]) entryArr[1]; - int from = Arrays.binarySearch(dimsnValArr, minVal); - // LOG.info("From value of dimension array: " + from); - if (from < 0) { - // To select the insertion point - from = -1 - from; - } - // LOG.info("Final From value of current dimension array: " + from); - int to = Arrays.binarySearch(dimsnValArr, maxVal); - // LOG.info("To value of dimension array: " + to); - if (to < 0) { - // To select the insertion point - to = -1 - to; - } else { - // Because binarySearch returns the exact index if element exists - to++; - } - // LOG.info("Setting scores for the words between 'from' and 'to' indexes:\t" + - // from + " " + to); - // tl.logTime(9); - for (int j = from; j < to; j++) { - scoreArr[idArr[j]].incrementAndGet(); - } - // tl.printTime(9, "Score set for index " + i); - }); - // find the index of the words with highest score and add them to nearbyVecMap - for (int wordId : getMaxIdList(scoreArr)) { - nearbyVecMap.put(gWordArr[wordId], gVecArr[wordId]); - } - } - - private int[] getMaxIdList(AtomicInteger[] scoreArr) { - TopIntIntCollection collection = new TopIntIntCollection(k, false); - for (int i = 0; i < scoreArr.length; i++) { - collection.add(scoreArr[i].get(), i); - } - return collection.getObjects(); - } - - private int[] getMaxIdList(AtomicIntegerArray scoreArr) { - TopIntIntCollection collection = new TopIntIntCollection(k, false); - for (int i = 0; i < scoreArr.length(); i++) { - collection.add(scoreArr.get(i), i); - } - return collection.getObjects(); - } -} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java deleted file mode 100644 index 4bd687b..0000000 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelNonIndxd.java +++ /dev/null @@ -1,308 +0,0 @@ -package org.aksw.word2vecrestful.word2vec; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.aksw.word2vecrestful.subset.DataSubsetProvider; -import org.aksw.word2vecrestful.utils.Word2VecMath; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import nikit.test.TimeLogger; - -/** - * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model - * - * @author Nikit - * - */ -public class W2VNrmlMemModelNonIndxd implements GenWord2VecModel { - public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - - private Map word2vec; - private int vectorSize; - private float[] sdArr; - /** - * Limit to the multiplier of area in which nearby vectors are to be looked - */ - private static final int EXHAUSTION_MULT = 20; - /** - * Multiplier for the standard deviation - */ - private static final int SIGMA_MULT = 3; - /** - * Divisor for the standard deviation's value - */ - private static final int AREA_DIVISOR = 20; - private DataSubsetProvider dataSubsetProvider; - /** - * Contains the sorted dimensional values mapped to their words - */ - - private String[] gWordArr; - private float[][] gVecArr; - - // TODO : Remove this - private TimeLogger tl = new TimeLogger(); - - public W2VNrmlMemModelNonIndxd(final Map word2vec, final int vectorSize) { - this.word2vec = word2vec; - this.vectorSize = vectorSize; - this.dataSubsetProvider = new DataSubsetProvider(); - // Calculate sd*3/10 and save in map - // Initialize indexesArr unsorted - // LOG.info("Initializing indexes and calculating standard deviation"); - initArrays(); - this.setModelVals(word2vec, vectorSize); - // LOG.info("Sorting indexes"); - // Sort the indexes - // LOG.info("Sorting completed"); - } - - private void initArrays() { - this.gWordArr = new String[word2vec.size()]; - this.gVecArr = new float[word2vec.size()][vectorSize]; - int i = 0; - for (Entry entry : word2vec.entrySet()) { - gWordArr[i] = entry.getKey(); - gVecArr[i] = entry.getValue(); - i++; - } - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestEntry(float[] vector) { - return getClosestEntry(vector, null); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestSubEntry(float[] vector, String subKey) { - return getClosestEntry(vector, subKey); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - private String getClosestEntry(float[] vector, String subKey) { - Set wordSet = null; - String closestVec = null; - try { - if (subKey == null) { - wordSet = word2vec.keySet(); - } else { - tl.logTime(1); - wordSet = dataSubsetProvider.fetchSubsetWords(subKey); - tl.printTime(1, "fetchSubsetWords"); - } - // LOG.info("Normalizing input vector"); - // Normalize incoming vector - vector = Word2VecMath.normalize(vector); - // LOG.info("fetching nearby vectors"); - // Find nearby vectors - tl.logTime(2); - Map nearbyVecs = fetchNearbyVectors(vector, wordSet); - tl.printTime(2, "fetchNearbyVectors"); - // LOG.info("found the following nearby words: " + nearbyVecs.keySet()); - // Select the closest vector - tl.logTime(3); - closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecs, vector); - tl.printTime(3, "findClosestVecInNearbyVecs"); - } catch (IOException e) { - // LOG.error(e.getStackTrace()); - } - // LOG.info("Closest word found is " + closestVec.keySet()); - return closestVec; - } - - /** - * Method to find standard deviation for each dimension of word vector and store - * the operated value next to the dimension's index in sdMap - * - * @param word2vecMap - * - mapping of words alongwith their vectors - * @param vectorSize - * - size of each vector - */ - public void setModelVals(Map word2vecMap, int vectorSize) { - float[] resMap = new float[vectorSize]; - int totSize = word2vecMap.size(); - // loop all dimensions - for (int i = 0; i < vectorSize; i++) { - // loop through all the words - float[] dimsnArr = new float[totSize]; - int[] idArr = new int[totSize]; - float sum = 0; - for (int j = 0; j < gWordArr.length; j++) { - float val = gVecArr[j][i]; - sum += val; - idArr[j] = j; - dimsnArr[j] = val; - } - // Setting value in indexArr - // LOG.info("Dimension " + (i) + " index stored to memory"); - // mean - float mean = sum / dimsnArr.length; - sum = 0; - for (int j = 0; j < dimsnArr.length; j++) { - sum += Math.pow(dimsnArr[j] - mean, 2); - } - float variance = sum / dimsnArr.length; - Double sd = Math.sqrt(variance); - resMap[i] = sd.floatValue() * SIGMA_MULT / AREA_DIVISOR; - } - // Set as sdMap - this.sdArr = resMap; - } - - /** - * Method to fetch nearby vectors for a given vector in a particular word set - * - * @param vector - * - vector to look nearby vectors for - * @param wordSet - * - word set to look into for nearby vectors - * @return - mapping of nearby words alongwith with their vector values - */ - private Map fetchNearbyVectors(float[] vector, Set wordSet) { - Map nearbyVecMap = new HashMap<>(); - boolean mapEmpty = true; - boolean notExhausted = true; - float[][] minMaxVec = getMinMaxVec(vector); - tl.printTime(2, "getMinMaxVec"); - int mult = 1; - while (mapEmpty && notExhausted) { - if (mult > 1) { - tl.logTime(8); - incrementMinMaxVec(minMaxVec); - tl.printTime(8, "incrementMinMaxVec"); - } - tl.logTime(4); - putNearbyVecsNonIndxd(minMaxVec, wordSet, nearbyVecMap); - tl.printTime(4, "putNearbyVecsNonIndxd"); - if (nearbyVecMap.size() > 0) { - mapEmpty = false; - } else { - ++mult; - if (mult > EXHAUSTION_MULT) { - notExhausted = false; - } - // LOG.info("MinMax multiplier incremented to " + mult); - } - } - return nearbyVecMap; - } - - private void putNearbyVecsNonIndxd(float[][] minMaxVec, Set wordSet, Map nearbyVecMap) { - for (String word : wordSet) { - float[] entryVec = word2vec.get(word); - if (isVectorInArea(entryVec, minMaxVec)) { - nearbyVecMap.put(word, entryVec); - } - } - } - - /** - * Method to check if vector falls in a particular area - * - * @param entryVec - * - vector to be verified - * @param minMaxVec - * - min vec and max vec as area's boundary - * @return - if the given vector is inside min and max vec's range - */ - private boolean isVectorInArea(float[] entryVec, float[][] minMaxVec) { - boolean isValid = true; - float[] minVec = minMaxVec[0]; - float[] maxVec = minMaxVec[1]; - for (int i = 0; i < entryVec.length; i++) { - if (entryVec[i] < minVec[i] || entryVec[i] > maxVec[i]) { - isValid = false; - break; - } - } - return isValid; - } - - /** - * Method to generate two vectors from a given vector by adding and subtracting - * value in sdMap from the given vector - * - * @param vector - * - input vector to perform operation on - * @return - min vector at index 0 and max vector at index 1 - */ - private float[][] getMinMaxVec(float[] vector) { - float[][] resVec = new float[2][vector.length]; - for (int i = 0; i < vector.length; i++) { - // TODO: change sdmap to array - float diff = sdArr[i]; - // MinVec - resVec[0][i] = vector[i] - diff; - // MaxVec - resVec[1][i] = vector[i] + diff; - } - return resVec; - } - - private void incrementMinMaxVec(float[][] minMaxVec) { - float[] minVec = minMaxVec[0]; - float[] maxVec = minMaxVec[1]; - for (int i = 0; i < vectorSize; i++) { - float diff = sdArr[i]; - // MinVec - minVec[i] -= diff; - // MaxVec - maxVec[i] += diff; - } - } - - /** - * Method to fetch vectorSize - * - * @return - vectorSize - */ - @Override - public int getVectorSize() { - return this.vectorSize; - } - - /** - * Method to fetch word2vec map - * - * @return - word2vec map - */ - public Map getWord2VecMap() { - return this.word2vec; - } - -} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java deleted file mode 100644 index 8fdec89..0000000 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelTheta.java +++ /dev/null @@ -1,190 +0,0 @@ -package org.aksw.word2vecrestful.word2vec; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.aksw.word2vecrestful.utils.Word2VecMath; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -/** - * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model - * - * @author Nikit - * - */ -public class W2VNrmlMemModelTheta implements GenWord2VecModel { - public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - - private Map word2vec; - private int vectorSize; - private Map> cosineIndxMap; - private float[] comparisonVec = null; - private float gMultiplier = 10; - - public W2VNrmlMemModelTheta(final Map word2vec, final int vectorSize) { - this.word2vec = word2vec; - this.vectorSize = vectorSize; - // Setting mean as comparison vec - setComparisonVec(word2vec, vectorSize); - // Generating index bucket for degrees - generateCosineIndxMap(); - // TODO: Remove this - printBucketSize(); - } - - public void updateGMultiplier(float gMult) { - if (this.gMultiplier == gMult) { - return; - } - this.gMultiplier = gMult; - // Generating index bucket for degrees - generateCosineIndxMap(); - // TODO: Remove this - printBucketSize(); - } - - private void printBucketSize() { - for (int i : cosineIndxMap.keySet()) { - LOG.info("Bucket " + i + " has the size: " + cosineIndxMap.get(i).size()); - } - } - - public void setComparisonVec(Map word2vecMap, int vectorSize) { - float[] meanArr = new float[vectorSize]; - int totSize = word2vecMap.size(); - // loop all dimensions - for (int i = 0; i < vectorSize; i++) { - // loop through all the words - float[] dimsnArr = new float[totSize]; - float sum = 0; - for (float[] vecEntry : word2vecMap.values()) { - float val = vecEntry[i]; - sum += val; - } - // mean - float mean = sum / dimsnArr.length; - meanArr[i] = mean; - } - this.comparisonVec = meanArr; - } - - private void generateCosineIndxMap() { - cosineIndxMap = new HashMap<>(); - float[] curVec; - for (String word : word2vec.keySet()) { - curVec = word2vec.get(word); - - Long cosineIndx = Math - .round(Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVec, curVec) * gMultiplier); - int intIndxVal = cosineIndx.intValue(); - List wordsBucket = cosineIndxMap.get(intIndxVal); - if (wordsBucket == null) { - wordsBucket = new ArrayList<>(); - cosineIndxMap.put(intIndxVal, wordsBucket); - } - wordsBucket.add(word); - } - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestEntry(float[] vector) { - return getClosestEntry(vector, null); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestSubEntry(float[] vector, String subKey) { - return getClosestEntry(vector, subKey); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - private String getClosestEntry(float[] vector, String subKey) { - String closestVec = null; - try { - // Normalize incoming vector - vector = Word2VecMath.normalize(vector); - // calculate cosine similarity of all distances - double cosSimMultVal = Word2VecMath.cosineSimilarityNormalizedVecs(comparisonVec, vector) * gMultiplier; - Double dMinIndx = Math.floor(cosSimMultVal); - Double dMaxIndx = Math.ceil(cosSimMultVal); - int minIndx = dMinIndx.intValue(); - int maxIndx = dMaxIndx.intValue(); - Set nearbyWords = new HashSet<>(); - List minWordList = cosineIndxMap.get(minIndx); - if (minWordList != null) { - nearbyWords.addAll(minWordList); - } - List maxWordList = cosineIndxMap.get(maxIndx); - if (maxWordList != null) { - nearbyWords.addAll(maxWordList); - } - Map nearbyVecMap = createNearbyVecMap(nearbyWords); - closestVec = Word2VecMath.findClosestNormalizedVec(nearbyVecMap, vector); - } catch (Exception e) { - LOG.error(e.getStackTrace()); - } - return closestVec; - } - - private Map createNearbyVecMap(Collection wordCol) { - Map vecMap = new HashMap<>(); - for (String word : wordCol) { - vecMap.put(word, word2vec.get(word)); - } - return vecMap; - } - - /** - * Method to fetch vectorSize - * - * @return - vectorSize - */ - @Override - public int getVectorSize() { - return this.vectorSize; - } - - /** - * Method to fetch word2vec map - * - * @return - word2vec map - */ - public Map getWord2VecMap() { - return this.word2vec; - } - -} diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java index c3e2487..0346748 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecFactory.java @@ -13,10 +13,6 @@ public class Word2VecFactory { public static final String CFG_KEY_BIN = Word2VecModelLoader.class.getName().concat(".bin"); public static boolean binModel = Boolean.parseBoolean(Cfg.get(CFG_KEY_BIN)); - private static String nrmlMdlFilePath = (Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedmodel.model")); - private static boolean nrmlMdlBinFlg = Boolean - .parseBoolean(Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedmodel.bin")); - private static String nrmlBinMdlFilePath = (Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model")); private static boolean nrmlBinMdlBinFlg = Boolean .parseBoolean(Cfg.get("org.aksw.word2vecrestful.word2vec.normalizedbinmodel.bin")); @@ -24,21 +20,6 @@ public class Word2VecFactory { public static Word2VecModel get() { return new Word2VecModelLoader().loadModel(new File(model), binModel); } - - public static W2VNrmlMemModelNonIndxd getNormalizedModel() { - Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlMdlFilePath), nrmlMdlBinFlg); - return new W2VNrmlMemModelNonIndxd(w2vmodel.word2vec, w2vmodel.vectorSize); - } - - public static W2VNrmlMemModelNonIndxd getNormalizedBinNonIndxdModel() { - LOG.info("Loading model to memory"); - Word2VecModel w2vmodel = new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); - LOG.info("Model loaded to memory"); - LOG.info("Initializing "+W2VNrmlMemModelNonIndxd.class + " instance"); - W2VNrmlMemModelNonIndxd nrmlMemModel = new W2VNrmlMemModelNonIndxd(w2vmodel.word2vec, w2vmodel.vectorSize); - LOG.info("Initialization of "+W2VNrmlMemModelNonIndxd.class + " instance finished"); - return nrmlMemModel; - } public static Word2VecModel getNormalBinModel() { return new Word2VecModelLoader().loadModel(new File(nrmlBinMdlFilePath), nrmlBinMdlBinFlg); diff --git a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java b/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java deleted file mode 100644 index de1a862..0000000 --- a/src/test/java/org/aksw/word2vecrestful/NormalizedInMemModelTest.java +++ /dev/null @@ -1,99 +0,0 @@ -package org.aksw.word2vecrestful; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelNonIndxd; -import org.aksw.word2vecrestful.word2vec.Word2VecFactory; -import org.aksw.word2vecrestful.word2vec.Word2VecModel; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.log4j.PropertyConfigurator; -import org.junit.Assert; -import org.junit.Test; - -import nikit.test.TestConst; - -public class NormalizedInMemModelTest { - static { - PropertyConfigurator.configure(Cfg.LOG_FILE); - } - public static Logger LOG = LogManager.getLogger(NormalizedInMemModelTest.class); - - /* - * @Test public void testNormalizedModel() { - * LOG.info("Starting InMemory indexed model test!"); final W2VNrmlMemModel - * memModel = Word2VecFactory.getNormalizedBinModel(); - * LOG.info("Indexed Model instance created"); Map wordKeyMap = - * new HashMap<>(); wordKeyMap.put("cat", null); wordKeyMap.put("dog", null); - * wordKeyMap.put("airplane", null); wordKeyMap.put("road", null); - * - * long startTime, diff; long totTime = 0; for (String word : - * wordKeyMap.keySet()) { LOG.info("Sending query for word :" + word); startTime - * = System.currentTimeMillis(); float[] vec = - * memModel.getWord2VecMap().get(word); Map closestWord = - * memModel.getClosestSubEntry(vec, wordKeyMap.get(word)); - * Assert.assertTrue(closestWord.containsKey(word)); diff = - * System.currentTimeMillis() - startTime; totTime += diff; - * LOG.info("Query time recorded for the word: '" + word + "' and subset: '" + - * wordKeyMap.get(word) + "' is " + diff + " milliseconds."); } - * - * LOG.info("Average query time: " + (totTime / wordKeyMap.size()) + - * " milliseconds"); - * - * } - */ - - /* - * @Test public void testNbmTime() { - * LOG.info("Starting InMemory indexed model test!"); final - * W2VNrmlMemModelNonIndxd memModel = - * Word2VecFactory.getNormalizedBinNonIndxdModel(); float[][] centroids = - * {TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, - * TestConst.CENT5}; - * - * long startTime, diff; long totTime = 0; for (int i=0;i correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); - LOG.info("Correct Words are :" + correctWords); - - long startTime, diff; - long totTime = 0; - List wordSet = new ArrayList<>(); - for (int i = 0; i < centroids.length; i++) { - LOG.info("Sending query for Centroid " + (i + 1)); - startTime = System.currentTimeMillis(); - Map closestWordMap = nbm.getClosestEntry(centroids[i]); - diff = System.currentTimeMillis() - startTime; - totTime += diff; - wordSet.addAll(closestWordMap.keySet()); - LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); - } - - LOG.info("Average query time: " + (totTime / centroids.length) + " milliseconds"); - float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, wordSet); - LOG.info("Score for the Test is " + percVal + "%"); - } -} diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java deleted file mode 100644 index f8d1b41..0000000 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdMdlPrfmncTester.java +++ /dev/null @@ -1,135 +0,0 @@ -package org.aksw.word2vecrestful; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.concurrent.ThreadLocalRandom; - -import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.utils.Word2VecMath; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelIndxdLR; -import org.aksw.word2vecrestful.word2vec.Word2VecFactory; -import org.aksw.word2vecrestful.word2vec.Word2VecModel; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.log4j.PropertyConfigurator; -import org.dice_research.topicmodeling.commons.sort.AssociativeSort; -import org.junit.Test; - -import nikit.test.TestConst; - -public class NrmlzdMdlPrfmncTester { - static { - PropertyConfigurator.configure(Cfg.LOG_FILE); - } - public static Logger LOG = LogManager.getLogger(NrmlzdMdlPrfmncTester.class); - - @Test - public void testNbmTime() { - LOG.info("Starting InMemory indexed model test!"); - Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); - float[][] centroids = { TestConst.CENT1, TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; - List correctWords = getCorrectWords(centroids, nbm); - LOG.info("Correct Words are :" + correctWords); - - int kStrt = 3000; - int kEnd = 4000;//20; - float sigStrt = 2; - float sigEnd = 3; - float arDivStrt = 10; - float arDivEnd = 10;//10; - int indx = 0; - int scrSize = Math.round((kEnd - kStrt + 1) * (sigEnd - sigStrt + 1) * (arDivEnd - arDivStrt + 1)); - float[] percScore = new float[scrSize]; - int[] idArr = new int[percScore.length]; - final W2VNrmlMemModelIndxdLR memModel = new W2VNrmlMemModelIndxdLR(nbm.word2vec, nbm.vectorSize); - for (int a = kStrt; a <= kEnd; a+=100) { - for (float b = arDivStrt; b <= arDivEnd; b++) { - for (float c = sigStrt; c <= sigEnd; c++) { - LOG.info("Starting LR-Model Test with config: kVal=" + a + " and sigMult=" + c + " and arDiv=" + b); - List lrModelWords = runLRMemModel(centroids, memModel, a, b, c); - LOG.info("Predicted Words are :" + lrModelWords); - float percVal = calcPercScore(correctWords, lrModelWords); - idArr[indx] = indx + 1; - percScore[indx] = percVal; - LOG.info("Score for Test id: " + (++indx) + " is " + percVal + "%"); - } - } - } - AssociativeSort.quickSort(percScore, idArr); - LOG.info("Highest Score (" + percScore[percScore.length - 1] + "%) is achieved by the test id: " - + idArr[idArr.length - 1]); - } - - public static float calcPercScore(List correctWordSet, List lrModelWords) { - float percScore = 0; - int len = correctWordSet.size(); - float lenInv = 100f / len; - for (int i = 0; i < len; i++) { - if (correctWordSet.get(i).equals(lrModelWords.get(i))) { - percScore += lenInv; - } - } - return percScore; - - } - - private List runLRMemModel(float[][] centroids, W2VNrmlMemModelIndxdLR memModel, int k, float arDiv, - float sigMult) { - memModel.setK(k); - memModel.updateSdArr(sigMult, arDiv); - List wordSet = new ArrayList<>(); - long startTime, diff; - long totTime = 0; - for (int i = 0; i < centroids.length; i++) { - LOG.info("Sending query for Centroid " + (i + 1)); - startTime = System.currentTimeMillis(); - String closestWord = memModel.getClosestSubEntry(centroids[i], null); - diff = System.currentTimeMillis() - startTime; - totTime += diff; - wordSet.add(closestWord); - LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); - } - - LOG.info( - "Average query time for W2VNrmlMemModelIndxdLRMulti is : " + (totTime / centroids.length) + " milliseconds"); - return wordSet; - } - - public static List getCorrectWords(float[][] centroids, Word2VecModel nbm) { - List wordSet = new ArrayList<>(); - W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); - long startTime, diff; - long totTime = 0; - for (int i = 0; i < centroids.length; i++) { - LOG.info("Sending query for Centroid " + (i + 1)); - startTime = System.currentTimeMillis(); - String closestWord = bruteForce.getClosestSubEntry(centroids[i], null); - diff = System.currentTimeMillis() - startTime; - totTime += diff; - wordSet.add(closestWord); - LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); - } - LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); - return wordSet; - } - - public static void main(String[] args) { - // Creating random coordinates - float[][] coordArr = new float[10][2]; - ThreadLocalRandom rand = ThreadLocalRandom.current(); - for (int i = 0; i < coordArr.length; i++) { - for (int j = 0; j < coordArr[i].length; j++) { - coordArr[i][j] = rand.nextFloat(); - } - } - // Random coord created - System.out.println("Input Coords: "+Arrays.deepToString(coordArr)); - for (int i = 0; i < coordArr.length; i++) { - Word2VecMath.normalize(coordArr[i]); - } - System.out.println("Normalized Coords: "+Arrays.deepToString(coordArr)); - } - -} diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 1b672d6..ea5755b 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -5,8 +5,8 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.utils.Word2VecMath; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -28,34 +28,6 @@ public class NrmlzdThetaMdlPrfmncTester { public static final String[] TEST_WORDS = { "cat", "dog", "airplane", "road", "kennedy", "rome", "human", "disney", "machine", "intelligence", "palaeontology", "surgeon", "amazon", "jesus", "gold", "atlantis", "ronaldo", "pele", "scissors", "lizard" }; - /* - * @Test public void testNbmTime() { long startTime, diff; long totTime = 0; - * LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = - * Word2VecFactory.getNormalBinModel(); float[][] centroids = { TestConst.CENT1, - * TestConst.CENT2, TestConst.CENT3, TestConst.CENT4, TestConst.CENT5 }; - * LOG.info("Starting BruteForce-Model Test"); List correctWords = - * getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + - * correctWords); LOG.info("Initializing Theta Model"); final - * W2VNrmlMemModelTheta memModel = new W2VNrmlMemModelTheta(nbm.word2vec, - * nbm.vectorSize); List lrModelWords = new ArrayList<>(); - * - * LOG.info("Starting Theta-Model Test"); for (int mult = 10; mult < 1000; mult - * += 10) { LOG.info("Testing with multplier: " + mult); - * memModel.updateGMultiplier(mult); - * - * for (int i = 0; i < centroids.length; i++) { - * LOG.info("Sending query for Centroid " + (i + 1)); startTime = - * System.currentTimeMillis(); - * lrModelWords.add(memModel.getClosestEntry(centroids[i])); diff = - * System.currentTimeMillis() - startTime; totTime += diff; - * LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + - * " milliseconds."); } - * LOG.info("Average query time for W2VNrmlMemModelTheta is : " + (totTime / - * centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + - * lrModelWords); float percVal = - * NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); - * LOG.info("Score for Test is : " + percVal + "%"); lrModelWords.clear(); } } - */ @Test public void testNbmTime() throws IOException { @@ -66,7 +38,7 @@ public void testNbmTime() throws IOException { float[][] centroids = TEST_CENTROIDS; //float[][] centroids = fetchWordsVec(TEST_WORDS, nbm); LOG.info("Starting BruteForce-Model Test"); - List correctWords = NrmlzdMdlPrfmncTester.getCorrectWords(centroids, nbm); + List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); @@ -84,7 +56,7 @@ public void testNbmTime() throws IOException { } LOG.info("Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); - float percVal = NrmlzdMdlPrfmncTester.calcPercScore(correctWords, lrModelWords); + float percVal = calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); lrModelWords.clear(); } @@ -96,4 +68,35 @@ private static float[][] fetchWordsVec(String[] words, Word2VecModel nbm) { } return resVec; } + + public static float calcPercScore(List correctWordSet, List lrModelWords) { + float percScore = 0; + int len = correctWordSet.size(); + float lenInv = 100f / len; + for (int i = 0; i < len; i++) { + if (correctWordSet.get(i).equals(lrModelWords.get(i))) { + percScore += lenInv; + } + } + return percScore; + + } + + public static List getCorrectWords(float[][] centroids, Word2VecModel nbm) { + List wordSet = new ArrayList<>(); + W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); + long startTime, diff; + long totTime = 0; + for (int i = 0; i < centroids.length; i++) { + LOG.info("Sending query for Centroid " + (i + 1)); + startTime = System.currentTimeMillis(); + String closestWord = bruteForce.getClosestSubEntry(centroids[i], null); + diff = System.currentTimeMillis() - startTime; + totTime += diff; + wordSet.add(closestWord); + LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); + } + LOG.info("Average query time for BruteForce is : " + (totTime / centroids.length) + " milliseconds"); + return wordSet; + } } From 1cd238d2d7ef59d95ae67162506d4cbc125da0e8 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 15:26:46 +0200 Subject: [PATCH 106/114] refactoring the code --- src/main/java/nikit/test/Word2VecTester.java | 31 ------------------- .../word2vecrestful/utils}/TimeLogger.java | 2 +- .../word2vec/W2VNrmlMemModelBinSrch.java | 26 +++++++--------- .../word2vec/W2VNrmlMemModelBruteForce.java | 3 +- .../word2vec/W2VNrmlMemModelKMeans.java | 26 +++++++--------- .../NrmlzdThetaMdlPrfmncTester.java | 2 -- .../org/aksw/word2vecrestful}/TestConst.java | 2 +- 7 files changed, 27 insertions(+), 65 deletions(-) delete mode 100644 src/main/java/nikit/test/Word2VecTester.java rename src/main/java/{nikit/test => org/aksw/word2vecrestful/utils}/TimeLogger.java (89%) rename src/{main/java/nikit/test => test/java/org/aksw/word2vecrestful}/TestConst.java (99%) diff --git a/src/main/java/nikit/test/Word2VecTester.java b/src/main/java/nikit/test/Word2VecTester.java deleted file mode 100644 index 6a496f2..0000000 --- a/src/main/java/nikit/test/Word2VecTester.java +++ /dev/null @@ -1,31 +0,0 @@ -package nikit.test; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Map; - -import org.aksw.word2vecrestful.word2vec.Word2VecFactory; -import org.aksw.word2vecrestful.word2vec.Word2VecModel; - -import com.fasterxml.jackson.core.JsonProcessingException; - -public class Word2VecTester { - public static void main(final String[] a) throws JsonProcessingException, FileNotFoundException, IOException { - - // loads model in memory - final Word2VecModel model = Word2VecFactory.get(); - // test 1 - float[] vec1 = model.word2vec.get("WesternOne"); - Map closestWord = model.getClosestEntryInSub(vec1, "ns#country-name"); - System.out.println(closestWord.keySet()); - // test 2 - vec1 = model.word2vec.get("Donald_O._Schnuck"); - closestWord = model.getClosestEntryInSub(vec1, "ontology#ConferenceVenuePlacerdf-schema#label"); - System.out.println(closestWord.keySet()); - // test 3 - vec1 = model.word2vec.get("аn_ԁ_ѕοmе"); - closestWord = model.getClosestEntryInSub(vec1, "ontology#ConferenceVenuePlacerdf-schema#label"); - System.out.println(closestWord.keySet()); - } - -} diff --git a/src/main/java/nikit/test/TimeLogger.java b/src/main/java/org/aksw/word2vecrestful/utils/TimeLogger.java similarity index 89% rename from src/main/java/nikit/test/TimeLogger.java rename to src/main/java/org/aksw/word2vecrestful/utils/TimeLogger.java index e0209d5..98a444f 100644 --- a/src/main/java/nikit/test/TimeLogger.java +++ b/src/main/java/org/aksw/word2vecrestful/utils/TimeLogger.java @@ -1,4 +1,4 @@ -package nikit.test; +package org.aksw.word2vecrestful.utils; import java.util.HashMap; import java.util.Map; diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index c3344d6..d772d48 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -8,11 +8,17 @@ import org.apache.log4j.Logger; import org.dice_research.topicmodeling.commons.sort.AssociativeSort; -import nikit.test.TimeLogger; - /** * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model + * search on the model. + * + * This class selects {@link W2VNrmlMemModelBinSrch#compareVecCount} vectors (1 + * mean vector and others on basis Map iterator) and then calculates the cosine + * similarity of all words in model to those vectors. + * + * It uses the knowledge about pre-processed similarities with + * {@link W2VNrmlMemModelBinSrch#comparisonVecs} to narrow down the search of + * closest word for the user specified vector. * * @author Nikit * @@ -30,8 +36,6 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { private int compareVecCount = 150; private int bucketCount = 10; private BitSet[][] csBucketContainer; - // TODO : Remove this - private TimeLogger tl = new TimeLogger(); public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; @@ -170,13 +174,12 @@ private String getClosestEntry(float[] vector, String subKey) { // calculate cosine similarity of all distances float[] curCompVec; BitSet finBitSet = null; - tl.logTime(1); for (int i = 0; i < compareVecCount; i++) { curCompVec = comparisonVecs[i]; double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); int indx = getBucketIndex(cosSimVal); BitSet curBs = new BitSet(word2vec.size()); - if(csBucketContainer[i][indx]!=null) { + if (csBucketContainer[i][indx] != null) { curBs.or(csBucketContainer[i][indx]); } int temIndx = indx + 1; @@ -193,10 +196,8 @@ private String getClosestEntry(float[] vector, String subKey) { finBitSet.and(curBs); } } - tl.printTime(1, "Setting Bits"); - tl.logTime(1); int nearbyWordsCount = finBitSet.cardinality(); - LOG.info("Number of nearby words: "+nearbyWordsCount); + //LOG.info("Number of nearby words: " + nearbyWordsCount); int[] nearbyIndexes = new int[nearbyWordsCount]; int j = 0; for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { @@ -206,15 +207,12 @@ private String getClosestEntry(float[] vector, String subKey) { break; // or (i+1) would overflow } } - tl.printTime(1, "Extracting words"); - tl.logTime(1); closestWord = findClosestWord(nearbyIndexes, vector); - tl.printTime(1, "finding closest word"); } catch (Exception e) { LOG.error("Exception has occured while finding closest word."); e.printStackTrace(); } - LOG.info("Closest word found is: "+closestWord); + //LOG.info("Closest word found is: " + closestWord); return closestWord; } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java index a1956c9..556980a 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java @@ -6,13 +6,12 @@ import java.util.Set; import org.aksw.word2vecrestful.subset.DataSubsetProvider; +import org.aksw.word2vecrestful.utils.TimeLogger; import org.aksw.word2vecrestful.utils.Word2VecMath; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.dice_research.topicmodeling.commons.sort.AssociativeSort; -import nikit.test.TimeLogger; - /** * Class to encapsulate word2vec in-memory model and expose methods to perform * search on the model diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 30bcd8b..d15322d 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -20,11 +20,17 @@ import com.opencsv.CSVReader; import com.opencsv.CSVWriter; -import nikit.test.TimeLogger; - /** * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model + * search on the model. + * + * This class selects {@link W2VNrmlMemModelKMeans#compareVecCount} vectors + * (centroids of the KMeans result on the model vectors) and then calculates the + * cosine similarity of all words in model to those vectors. + * + * It uses the knowledge about pre-processed similarities with + * {@link W2VNrmlMemModelKMeans#comparisonVecs} to narrow down the search of + * closest word for the user specified vector. * * @author Nikit * @@ -42,8 +48,6 @@ public class W2VNrmlMemModelKMeans implements GenWord2VecModel { private int kMeansMaxItr = 5; private BitSet[][] csBucketContainer; private String vecFilePath = "data/kmeans/comparison-vecs.csv"; - // TODO : Remove this - private TimeLogger tl = new TimeLogger(); public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) throws IOException { this.word2vec = word2vec; @@ -175,14 +179,13 @@ private String getClosestEntry(float[] vector, String subKey) { // calculate cosine similarity of all distances float[] curCompVec; BitSet finBitSet = null; - tl.logTime(1); for (int i = 0; i < compareVecCount; i++) { curCompVec = comparisonVecs[i]; double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); int indx = getBucketIndex(cosSimVal); BitSet curBs = new BitSet(word2vec.size()); BitSet tempBs = csBucketContainer[i][indx]; - if(tempBs!=null) { + if (tempBs != null) { curBs.or(tempBs); } int temIndx = indx + 1; @@ -199,10 +202,8 @@ private String getClosestEntry(float[] vector, String subKey) { finBitSet.and(curBs); } } - tl.printTime(1, "Setting Bits"); - tl.logTime(1); int nearbyWordsCount = finBitSet.cardinality(); - LOG.info("Number of nearby words: "+nearbyWordsCount); + //LOG.info("Number of nearby words: " + nearbyWordsCount); int[] nearbyIndexes = new int[nearbyWordsCount]; int j = 0; for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { @@ -212,15 +213,12 @@ private String getClosestEntry(float[] vector, String subKey) { break; // or (i+1) would overflow } } - tl.printTime(1, "Extracting words"); - tl.logTime(1); closestWord = findClosestWord(nearbyIndexes, vector); - tl.printTime(1, "finding closest word"); } catch (Exception e) { LOG.error("Exception has occured while finding closest word."); e.printStackTrace(); } - LOG.info("Closest word found is: "+closestWord); + //LOG.info("Closest word found is: " + closestWord); return closestWord; } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index ea5755b..928a4ed 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -14,8 +14,6 @@ import org.apache.log4j.PropertyConfigurator; import org.junit.Test; -import nikit.test.TestConst; - public class NrmlzdThetaMdlPrfmncTester { static { PropertyConfigurator.configure(Cfg.LOG_FILE); diff --git a/src/main/java/nikit/test/TestConst.java b/src/test/java/org/aksw/word2vecrestful/TestConst.java similarity index 99% rename from src/main/java/nikit/test/TestConst.java rename to src/test/java/org/aksw/word2vecrestful/TestConst.java index 3e77319..6581230 100644 --- a/src/main/java/nikit/test/TestConst.java +++ b/src/test/java/org/aksw/word2vecrestful/TestConst.java @@ -1,4 +1,4 @@ -package nikit.test; +package org.aksw.word2vecrestful; import java.util.HashMap; import java.util.Map; From 4e2b02e42ca8970550c07f110fba9f11db46a2f2 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 16:01:02 +0200 Subject: [PATCH 107/114] removing duplicate methods --- .../word2vec/W2VNrmlMemModelBinSrch.java | 45 +++-- .../word2vec/W2VNrmlMemModelKMeans.java | 183 +----------------- .../NrmlzdThetaMdlPrfmncTester.java | 10 +- 3 files changed, 38 insertions(+), 200 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index d772d48..4634997 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -1,5 +1,6 @@ package org.aksw.word2vecrestful.word2vec; +import java.io.IOException; import java.util.BitSet; import java.util.Map; @@ -10,7 +11,7 @@ /** * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model. + * search on the model. (Only works with Normalized Model) * * This class selects {@link W2VNrmlMemModelBinSrch#compareVecCount} vectors (1 * mean vector and others on basis Map iterator) and then calculates the cosine @@ -26,29 +27,33 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - private Map word2vec; - private int vectorSize; - private float[][] comparisonVecs = null; - private String[] wordArr; - private float[][] vecArr; - private int[] indxArr; - private double[] simValArr; - private int compareVecCount = 150; - private int bucketCount = 10; - private BitSet[][] csBucketContainer; + protected Map word2vec; + protected int vectorSize; + protected float[][] comparisonVecs = null; + protected String[] wordArr; + protected float[][] vecArr; + protected int[] indxArr; + protected double[] simValArr; + protected int compareVecCount = 150; + protected int bucketCount = 10; + protected BitSet[][] csBucketContainer; - public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) { + public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) throws IOException { this.word2vec = word2vec; this.vectorSize = vectorSize; comparisonVecs = new float[compareVecCount][vectorSize]; csBucketContainer = new BitSet[compareVecCount][bucketCount]; + process(); + } + + protected void process() throws IOException { + LOG.info("Process from BinSrch called"); // Setting mean as comparison vec setMeanComparisonVec(word2vec, vectorSize); // Initialize Arrays processCosineSim(); // Set other comparison vecs setAllComparisonVecs(); - } private void setBucketVals(int compVecIndex, float[] comparisonVec) { @@ -73,7 +78,7 @@ private void setAllComparisonVecs() { } } - private int getBucketIndex(double cosineSimVal) { + protected int getBucketIndex(double cosineSimVal) { Double dIndx = ((bucketCount - 1d) / 2d) * (cosineSimVal + 1d); return Math.round(dIndx.floatValue()); } @@ -98,7 +103,7 @@ private void processCosineSim() { AssociativeSort.quickSort(simValArr, indxArr); } - private void setValToBucket(int wordIndex, double cosSimVal, BitSet[] meanComparisonVecBuckets) { + protected void setValToBucket(int wordIndex, double cosSimVal, BitSet[] meanComparisonVecBuckets) { int bucketIndex = getBucketIndex(cosSimVal); BitSet bitset = meanComparisonVecBuckets[bucketIndex]; if (bitset == null) { @@ -166,7 +171,7 @@ public String getClosestSubEntry(float[] vector, String subKey) { * - key to subset if any * @return closest word to the given vector alongwith it's vector */ - private String getClosestEntry(float[] vector, String subKey) { + protected String getClosestEntry(float[] vector, String subKey) { String closestWord = null; try { // Normalize incoming vector @@ -197,7 +202,7 @@ private String getClosestEntry(float[] vector, String subKey) { } } int nearbyWordsCount = finBitSet.cardinality(); - //LOG.info("Number of nearby words: " + nearbyWordsCount); + // LOG.info("Number of nearby words: " + nearbyWordsCount); int[] nearbyIndexes = new int[nearbyWordsCount]; int j = 0; for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { @@ -212,11 +217,11 @@ private String getClosestEntry(float[] vector, String subKey) { LOG.error("Exception has occured while finding closest word."); e.printStackTrace(); } - //LOG.info("Closest word found is: " + closestWord); + // LOG.info("Closest word found is: " + closestWord); return closestWord; } - private String findClosestWord(int[] nearbyIndexes, float[] vector) { + protected String findClosestWord(int[] nearbyIndexes, float[] vector) { double minDist = -2; String minWord = null; double tempDist; @@ -243,7 +248,7 @@ private String findClosestWord(int[] nearbyIndexes, float[] vector) { * - minimum distance constraint * @return squared euclidean distance between two vector or -1 */ - private double getSqEucDist(float[] arr1, float[] arr2, double minDist) { + protected double getSqEucDist(float[] arr1, float[] arr2, double minDist) { double dist = 0; for (int i = 0; i < vectorSize; i++) { dist += Math.pow(arr1[i] - arr2[i], 2); diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index d15322d..4eee7de 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -22,7 +22,7 @@ /** * Class to encapsulate word2vec in-memory model and expose methods to perform - * search on the model. + * search on the model. (Only works with Normalized Model) * * This class selects {@link W2VNrmlMemModelKMeans#compareVecCount} vectors * (centroids of the KMeans result on the model vectors) and then calculates the @@ -35,25 +35,19 @@ * @author Nikit * */ -public class W2VNrmlMemModelKMeans implements GenWord2VecModel { +public class W2VNrmlMemModelKMeans extends W2VNrmlMemModelBinSrch { public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - private Map word2vec; - private int vectorSize; - private float[][] comparisonVecs = null; - private String[] wordArr; - private float[][] vecArr; - private int compareVecCount = 100; - private int bucketCount = 10; private int kMeansMaxItr = 5; - private BitSet[][] csBucketContainer; private String vecFilePath = "data/kmeans/comparison-vecs.csv"; public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) throws IOException { - this.word2vec = word2vec; - this.vectorSize = vectorSize; - comparisonVecs = new float[compareVecCount][vectorSize]; - csBucketContainer = new BitSet[compareVecCount][bucketCount]; + super(word2vec, vectorSize); + } + + @Override + protected void process() throws IOException { + LOG.info("Process from KMeans called"); fetchComparisonVectors(); // Initialize Arrays processCosineSim(); @@ -117,167 +111,6 @@ private void processCosineSim() { } } - private int getBucketIndex(double cosineSimVal) { - Double dIndx = ((bucketCount - 1d) / 2d) * (cosineSimVal + 1d); - return Math.round(dIndx.floatValue()); - } - - private void setValToBucket(int wordIndex, double cosSimVal, BitSet[] meanComparisonVecBuckets) { - int bucketIndex = getBucketIndex(cosSimVal); - BitSet bitset = meanComparisonVecBuckets[bucketIndex]; - if (bitset == null) { - bitset = new BitSet(word2vec.size()); - meanComparisonVecBuckets[bucketIndex] = bitset; - } - bitset.set(wordIndex); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestEntry(float[] vector) { - return getClosestEntry(vector, null); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - @Override - public String getClosestSubEntry(float[] vector, String subKey) { - return getClosestEntry(vector, subKey); - } - - /** - * Method to fetch the closest word entry for a given vector using cosine - * similarity - * - * @param vector - * - vector to find closest word to - * @param subKey - * - key to subset if any - * @return closest word to the given vector alongwith it's vector - */ - private String getClosestEntry(float[] vector, String subKey) { - String closestWord = null; - try { - // Normalize incoming vector - vector = Word2VecMath.normalize(vector); - // calculate cosine similarity of all distances - float[] curCompVec; - BitSet finBitSet = null; - for (int i = 0; i < compareVecCount; i++) { - curCompVec = comparisonVecs[i]; - double cosSimVal = Word2VecMath.cosineSimilarityNormalizedVecs(curCompVec, vector); - int indx = getBucketIndex(cosSimVal); - BitSet curBs = new BitSet(word2vec.size()); - BitSet tempBs = csBucketContainer[i][indx]; - if (tempBs != null) { - curBs.or(tempBs); - } - int temIndx = indx + 1; - if (temIndx < csBucketContainer[i].length && csBucketContainer[i][temIndx] != null) { - curBs.or(csBucketContainer[i][temIndx]); - } - temIndx = indx - 1; - if (temIndx > -1 && csBucketContainer[i][temIndx] != null) { - curBs.or(csBucketContainer[i][temIndx]); - } - if (i == 0) { - finBitSet = curBs; - } else { - finBitSet.and(curBs); - } - } - int nearbyWordsCount = finBitSet.cardinality(); - //LOG.info("Number of nearby words: " + nearbyWordsCount); - int[] nearbyIndexes = new int[nearbyWordsCount]; - int j = 0; - for (int i = finBitSet.nextSetBit(0); i >= 0; i = finBitSet.nextSetBit(i + 1), j++) { - // operate on index i here - nearbyIndexes[j] = i; - if (i == Integer.MAX_VALUE) { - break; // or (i+1) would overflow - } - } - closestWord = findClosestWord(nearbyIndexes, vector); - } catch (Exception e) { - LOG.error("Exception has occured while finding closest word."); - e.printStackTrace(); - } - //LOG.info("Closest word found is: " + closestWord); - return closestWord; - } - - private String findClosestWord(int[] nearbyIndexes, float[] vector) { - double minDist = -2; - String minWord = null; - double tempDist; - for (int indx : nearbyIndexes) { - float[] wordvec = vecArr[indx]; - tempDist = getSqEucDist(vector, wordvec, minDist); - if (tempDist != -1) { - minWord = wordArr[indx]; - minDist = tempDist; - } - } - return minWord; - } - - /** - * Method to find the squared value of euclidean distance between two vectors if - * it is less than the provided minimum distance value, otherwise return -1 - * - * @param arr1 - * - first vector - * @param arr2 - * - second vector - * @param minDist - * - minimum distance constraint - * @return squared euclidean distance between two vector or -1 - */ - private double getSqEucDist(float[] arr1, float[] arr2, double minDist) { - double dist = 0; - for (int i = 0; i < vectorSize; i++) { - dist += Math.pow(arr1[i] - arr2[i], 2); - if (minDist != -2 && dist > minDist) - return -1; - } - return dist; - } - - /** - * Method to fetch vectorSize - * - * @return - vectorSize - */ - @Override - public int getVectorSize() { - return this.vectorSize; - } - - /** - * Method to fetch word2vec map - * - * @return - word2vec map - */ - public Map getWord2VecMap() { - return this.word2vec; - } - public static float[][] readVecsFromFile(File inputFile) throws IOException { float[][] vecArr = null; FileReader fileReader; diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index 928a4ed..ba797eb 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -5,8 +5,8 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -38,11 +38,11 @@ public void testNbmTime() throws IOException { LOG.info("Starting BruteForce-Model Test"); List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); - final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelKMeans Model"); + final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelBinSrch Test"); + LOG.info("Starting W2VNrmlMemModelKMeans Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -52,7 +52,7 @@ public void testNbmTime() throws IOException { totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info("Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info("Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From 112ef8eb4184807551f9013631d3023df1d53275 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 16:11:59 +0200 Subject: [PATCH 108/114] removing database normalizing feature --- .../word2vecrestful/tool/ModelNormalizer.java | 73 --------------- .../tool/NormalizedDBModelGenerator.java | 91 ------------------- 2 files changed, 164 deletions(-) delete mode 100644 src/main/java/org/aksw/word2vecrestful/tool/NormalizedDBModelGenerator.java diff --git a/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java index e1d5d16..cccc21f 100644 --- a/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java +++ b/src/main/java/org/aksw/word2vecrestful/tool/ModelNormalizer.java @@ -9,7 +9,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.charset.StandardCharsets; -import java.sql.PreparedStatement; import java.sql.SQLException; import org.aksw.word2vecrestful.utils.Cfg; @@ -97,58 +96,6 @@ public void generateNormalizedModel(File inputFile, File outputFile) throws IOEx } } - /** - * Method to persist a normalized model for a word2vec bin model - * - * @param inputFile - * - word2vec file of the model to be normalized - * @param dbName - * - name of the database - * @param tablName - * - name of the table to store the data in - * @throws IOException - * @throws SQLException - */ - public void persistNormalizedModel(File inputFile, String dbName, String tblName) throws IOException, SQLException { - // intialize handler instance - NormalizedDBModelGenerator dbHandler = null; - - FileInputStream fin = null; - try { - // reads file header - fin = new FileInputStream(inputFile); - String word = Word2VecModelLoader.readWord(fin); - int words = Integer.parseInt(word); - word = Word2VecModelLoader.readWord(fin); - int vectorSize = Integer.parseInt(word); - dbHandler = new NormalizedDBModelGenerator(dbName, tblName, vectorSize); - // open connection - dbHandler.connect(); - LOG.info("Expecting " + words + " words with " + vectorSize + " values per vector."); - // create preparedstatement - PreparedStatement ps = dbHandler.generateMainTblInsrtStmnt(); - for (int w = 0; w < words; ++w) { - word = Word2VecModelLoader.readWord(fin); - // LOG.info(word); - float[] vector = Word2VecModelLoader.readVector(fin, vectorSize); - // dbHandler.insertMainTblRecord(word, vector); - dbHandler.addMainTblInsrtBatch(word, Word2VecMath.normalize(vector), ps); - if ((w + 1) % 50000 == 0) { - dbHandler.executeBatchCommit(ps); - LOG.info((w + 1) + " Records inserted."); - } - } - dbHandler.executeBatchCommit(ps); - // Generate Index on completion - dbHandler.makeIndex(); - } catch (final IOException e) { - LOG.error(e.getLocalizedMessage(), e); - } finally { - fin.close(); - dbHandler.disconnect(); - } - } - /** * Method to generate a normalized model for a word2vec bin model * @@ -199,26 +146,6 @@ public void generateNormalizedBinModel(File inputFile, File outputFile) throws I } } - /* - * public static void main(String[] args) throws IOException { String - * cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); String model - * = (Cfg.get(cfgKeyModel)); ModelNormalizer modelNormalizer = new - * ModelNormalizer(); File inputFile = new File(model); File outputFile = new - * File( - * "D:\\Nikit\\DICE-Group\\Jword2vec\\data\\normal\\GoogleNews-vectors-negative300-normalized.txt" - * ); modelNormalizer.generateNormalizedModel(inputFile, outputFile); } - */ - - /* - * public static void main(String[] args) throws IOException, SQLException { - * String cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); String - * model = (Cfg.get(cfgKeyModel)); ModelNormalizer modelNormalizer = new - * ModelNormalizer(); File inputFile = new File(model); // - * modelNormalizer.generateNormalizedModel(inputFile, outputFile); - * modelNormalizer.persistNormalizedModel(inputFile, - * "data/nrmldb/word2vecmodel", "wordtovec"); } - */ - public static void main(String[] args) throws IOException, SQLException { String cfgKeyModel = Word2VecFactory.class.getName().concat(".model"); String model = (Cfg.get(cfgKeyModel)); diff --git a/src/main/java/org/aksw/word2vecrestful/tool/NormalizedDBModelGenerator.java b/src/main/java/org/aksw/word2vecrestful/tool/NormalizedDBModelGenerator.java deleted file mode 100644 index 662324b..0000000 --- a/src/main/java/org/aksw/word2vecrestful/tool/NormalizedDBModelGenerator.java +++ /dev/null @@ -1,91 +0,0 @@ -package org.aksw.word2vecrestful.tool; - -import java.sql.PreparedStatement; -import java.sql.SQLException; - -import org.aksw.word2vecrestful.db.SQLiteDBHandler; - -public class NormalizedDBModelGenerator extends SQLiteDBHandler { - - private String mainTblName; - private int vectorSize; - private String insertQuery; - - public NormalizedDBModelGenerator(String dbName, String mainTblName, int vectorSize) { - super(dbName); - this.mainTblName = mainTblName; - this.vectorSize = vectorSize; - this.insertQuery = this.createInsertQuery(); - createMainTable(); - } - - public void createMainTable() { - StringBuilder sqlStr = new StringBuilder("CREATE TABLE IF NOT EXISTS "); - sqlStr.append(this.mainTblName); - sqlStr.append(" ( word text "); - for (int i = 0; i < vectorSize; i++) { - sqlStr.append(", val").append(i + 1).append(" float NOT NULL "); - } - sqlStr.append(");"); - executeStatement(sqlStr.toString()); - } - - private String createInsertQuery() { - StringBuilder insrtStr = new StringBuilder(); - insrtStr.append("insert into ").append(this.mainTblName).append(" values ( ?"); - for (int i = 0; i < vectorSize; i++) { - insrtStr.append(", ?"); - } - insrtStr.append(") ;"); - return insrtStr.toString(); - } - - public PreparedStatement generateMainTblInsrtStmnt() throws SQLException { - PreparedStatement prep = connection.prepareStatement(this.insertQuery); - connection.setAutoCommit(false); - return prep; - } - - /** - * Creates an index. - */ - public void makeIndex() { - final String sql = "CREATE INDEX Idx1 ON " + this.mainTblName + "(word)"; - if (connection != null) { - try { - final PreparedStatement prep = connection.prepareStatement(sql); - prep.execute(); - prep.close(); - commit(); - } catch (final SQLException e) { - LOG.error(e.getLocalizedMessage(), e); - } - } - } - - public void addMainTblInsrtBatch(String word, float[] vector, PreparedStatement ps) throws SQLException { - ps.setString(1, word); - for (int i = 0; i < this.vectorSize; i++) { - ps.setFloat(i + 2, vector[i]); - } - ps.addBatch(); - } - - public int[] executeBatchCommit(PreparedStatement ps) throws SQLException { - int[] res = ps.executeBatch(); - connection.commit(); - return res; - } - - public boolean insertMainTblRecord(String word, float[] vector) throws SQLException { - boolean recInserted = false; - PreparedStatement prep = connection.prepareStatement(this.insertQuery); - prep.setString(1, word); - for (int i = 0; i < this.vectorSize; i++) { - prep.setFloat(i + 2, vector[i]); - } - recInserted = prep.execute(); - prep.close(); - return recInserted; - } -} From cfe2e0a4ea5dcb5b250f53cb6c738762b115b3b9 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 16:14:42 +0200 Subject: [PATCH 109/114] changing access specifier --- .../aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index 4eee7de..d3dc9f8 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -38,8 +38,8 @@ public class W2VNrmlMemModelKMeans extends W2VNrmlMemModelBinSrch { public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - private int kMeansMaxItr = 5; - private String vecFilePath = "data/kmeans/comparison-vecs.csv"; + protected int kMeansMaxItr = 5; + protected String vecFilePath = "data/kmeans/comparison-vecs.csv"; public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) throws IOException { super(word2vec, vectorSize); From d8d478a9628d7fca1a68532ab1f865d15d7bb617 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Thu, 18 Oct 2018 17:52:32 +0200 Subject: [PATCH 110/114] modifying access specifiers --- config/cfg.properties | 3 ++- .../word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java | 3 +++ .../word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java | 9 +++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/config/cfg.properties b/config/cfg.properties index 1af9b6f..b5eb8bc 100644 --- a/config/cfg.properties +++ b/config/cfg.properties @@ -6,4 +6,5 @@ org.aksw.word2vecrestful.Application.inmemory: true org.aksw.word2vecrestful.Application.subsetfiledir: data/subset-files-1/ org.aksw.word2vecrestful.word2vec.normalizedbinmodel.bin: true org.aksw.word2vecrestful.word2vec.normalizedbinmodel.model: data/normalbinmodel/GoogleNews-vectors-negative300-normalized.bin -org.aksw.word2vecrestful.word2vec.stats.sdfile: data/normal/stat/normal-model-sd.csv \ No newline at end of file +org.aksw.word2vecrestful.word2vec.stats.sdfile: data/normal/stat/normal-model-sd.csv +org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans.filepath: data/kmeans/comparison-vecs.csv \ No newline at end of file diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index 4634997..f50c8a8 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -38,6 +38,9 @@ public class W2VNrmlMemModelBinSrch implements GenWord2VecModel { protected int bucketCount = 10; protected BitSet[][] csBucketContainer; + protected W2VNrmlMemModelBinSrch() { + } + public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) throws IOException { this.word2vec = word2vec; this.vectorSize = vectorSize; diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index d3dc9f8..d8ec809 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -9,6 +9,7 @@ import java.util.List; import java.util.Map; +import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.utils.ClusterableVec; import org.aksw.word2vecrestful.utils.Word2VecMath; import org.apache.commons.math3.ml.clustering.CentroidCluster; @@ -38,8 +39,8 @@ public class W2VNrmlMemModelKMeans extends W2VNrmlMemModelBinSrch { public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - protected int kMeansMaxItr = 5; - protected String vecFilePath = "data/kmeans/comparison-vecs.csv"; + private static final int KMEANS_MAX_ITR = 5; + private static final String VEC_FILEPATH = Cfg.get(W2VNrmlMemModelKMeans.class.getName().concat(".filepath")); public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) throws IOException { super(word2vec, vectorSize); @@ -54,7 +55,7 @@ protected void process() throws IOException { } private void fetchComparisonVectors() throws IOException { - File vecFile = new File(vecFilePath); + File vecFile = new File(VEC_FILEPATH); if (vecFile.exists()) { LOG.info("Reading Comparsion vectors from the file."); // read the persisted vectors @@ -72,7 +73,7 @@ private void fetchComparisonVectors() throws IOException { private void generateComparisonVectors() { KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount, - kMeansMaxItr); + KMEANS_MAX_ITR); List vecList = new ArrayList<>(); for (float[] vec : word2vec.values()) { vecList.add(getClusterablePoint(vec)); From 2d4ba0b408ed50d1e92609f831178e0c0035f033 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 19 Oct 2018 07:51:29 +0200 Subject: [PATCH 111/114] Changing method calls --- .../word2vec/W2VNrmlMemModelBinSrch.java | 24 +++++++++++++++-- .../word2vec/W2VNrmlMemModelKMeans.java | 27 +++++++++++++++---- .../NrmlzdThetaMdlPrfmncTester.java | 7 ++--- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java index f50c8a8..6f90405 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBinSrch.java @@ -44,12 +44,16 @@ protected W2VNrmlMemModelBinSrch() { public W2VNrmlMemModelBinSrch(final Map word2vec, final int vectorSize) throws IOException { this.word2vec = word2vec; this.vectorSize = vectorSize; + initVars(); + // process(); + } + + public void initVars() { comparisonVecs = new float[compareVecCount][vectorSize]; csBucketContainer = new BitSet[compareVecCount][bucketCount]; - process(); } - protected void process() throws IOException { + public void process() throws IOException { LOG.info("Process from BinSrch called"); // Setting mean as comparison vec setMeanComparisonVec(word2vec, vectorSize); @@ -280,4 +284,20 @@ public Map getWord2VecMap() { return this.word2vec; } + public int getCompareVecCount() { + return compareVecCount; + } + + public void setCompareVecCount(int compareVecCount) { + this.compareVecCount = compareVecCount; + } + + public int getBucketCount() { + return bucketCount; + } + + public void setBucketCount(int bucketCount) { + this.bucketCount = bucketCount; + } + } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java index d8ec809..1fc2316 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelKMeans.java @@ -39,15 +39,15 @@ public class W2VNrmlMemModelKMeans extends W2VNrmlMemModelBinSrch { public static Logger LOG = LogManager.getLogger(GenWord2VecModel.class); - private static final int KMEANS_MAX_ITR = 5; - private static final String VEC_FILEPATH = Cfg.get(W2VNrmlMemModelKMeans.class.getName().concat(".filepath")); + private int kMeansMaxItr = 5; + private String vecFilePath = Cfg.get(W2VNrmlMemModelKMeans.class.getName().concat(".filepath")); public W2VNrmlMemModelKMeans(final Map word2vec, final int vectorSize) throws IOException { super(word2vec, vectorSize); } @Override - protected void process() throws IOException { + public void process() throws IOException { LOG.info("Process from KMeans called"); fetchComparisonVectors(); // Initialize Arrays @@ -55,7 +55,7 @@ protected void process() throws IOException { } private void fetchComparisonVectors() throws IOException { - File vecFile = new File(VEC_FILEPATH); + File vecFile = new File(vecFilePath); if (vecFile.exists()) { LOG.info("Reading Comparsion vectors from the file."); // read the persisted vectors @@ -73,7 +73,7 @@ private void fetchComparisonVectors() throws IOException { private void generateComparisonVectors() { KMeansPlusPlusClusterer clusterer = new KMeansPlusPlusClusterer<>(compareVecCount, - KMEANS_MAX_ITR); + kMeansMaxItr); List vecList = new ArrayList<>(); for (float[] vec : word2vec.values()) { vecList.add(getClusterablePoint(vec)); @@ -177,4 +177,21 @@ public static float[] convertToFloatArr(String[] vec) { return resArr; } + // Getter and Setters + public int getkMeansMaxItr() { + return kMeansMaxItr; + } + + public void setkMeansMaxItr(int kMeansMaxItr) { + this.kMeansMaxItr = kMeansMaxItr; + } + + public String getVecFilePath() { + return vecFilePath; + } + + public void setVecFilePath(String vecFilePath) { + this.vecFilePath = vecFilePath; + } + } diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index ba797eb..afb52b3 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -34,12 +34,13 @@ public void testNbmTime() throws IOException { LOG.info("Starting InMemory Theta Model test!"); Word2VecModel nbm = Word2VecFactory.getNormalBinModel(); float[][] centroids = TEST_CENTROIDS; - //float[][] centroids = fetchWordsVec(TEST_WORDS, nbm); + // float[][] centroids = fetchWordsVec(TEST_WORDS, nbm); LOG.info("Starting BruteForce-Model Test"); List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); LOG.info("Initializing W2VNrmlMemModelKMeans Model"); final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); + memModel.process(); List lrModelWords = new ArrayList<>(); LOG.info("Starting W2VNrmlMemModelKMeans Test"); @@ -66,7 +67,7 @@ private static float[][] fetchWordsVec(String[] words, Word2VecModel nbm) { } return resVec; } - + public static float calcPercScore(List correctWordSet, List lrModelWords) { float percScore = 0; int len = correctWordSet.size(); @@ -79,7 +80,7 @@ public static float calcPercScore(List correctWordSet, List lrMo return percScore; } - + public static List getCorrectWords(float[][] centroids, Word2VecModel nbm) { List wordSet = new ArrayList<>(); W2VNrmlMemModelBruteForce bruteForce = new W2VNrmlMemModelBruteForce(nbm.word2vec, nbm.vectorSize); From 62137dde35fdc9539f35ff5a5ee138a813d4fa34 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 19 Oct 2018 08:03:34 +0200 Subject: [PATCH 112/114] changing test config --- .../word2vecrestful/NrmlzdThetaMdlPrfmncTester.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index afb52b3..e1245d3 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -6,7 +6,7 @@ import org.aksw.word2vecrestful.utils.Cfg; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelKMeans; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -38,12 +38,12 @@ public void testNbmTime() throws IOException { LOG.info("Starting BruteForce-Model Test"); List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); - LOG.info("Initializing W2VNrmlMemModelKMeans Model"); - final W2VNrmlMemModelKMeans memModel = new W2VNrmlMemModelKMeans(nbm.word2vec, nbm.vectorSize); + LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); + final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); memModel.process(); List lrModelWords = new ArrayList<>(); - LOG.info("Starting W2VNrmlMemModelKMeans Test"); + LOG.info("Starting W2VNrmlMemModelBinSrch Test"); for (int i = 0; i < centroids.length; i++) { LOG.info("Sending query for Centroid " + (i + 1)); @@ -53,7 +53,7 @@ public void testNbmTime() throws IOException { totTime += diff; LOG.info("Query time recorded for Centroid " + (i + 1) + " is " + diff + " milliseconds."); } - LOG.info("Average query time for W2VNrmlMemModelKMeans is : " + (totTime / centroids.length) + " milliseconds"); + LOG.info("Average query time for W2VNrmlMemModelBinSrch is : " + (totTime / centroids.length) + " milliseconds"); LOG.info("Predicted Words are :" + lrModelWords); float percVal = calcPercScore(correctWords, lrModelWords); LOG.info("Score for Test is : " + percVal + "%"); From 6b19bc5bbda6492273c4d9964ca11714b5785f8a Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 19 Oct 2018 08:27:30 +0200 Subject: [PATCH 113/114] removing unused code --- .../word2vec/GenWord2VecModel.java | 3 +- .../word2vec/W2VNrmlMemModelBruteForce.java | 61 +++---------------- .../NrmlzdThetaMdlPrfmncTester.java | 5 +- 3 files changed, 15 insertions(+), 54 deletions(-) diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java index 4301c10..1d9251b 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/GenWord2VecModel.java @@ -1,9 +1,10 @@ package org.aksw.word2vecrestful.word2vec; -import java.util.Map; +import java.io.IOException; public interface GenWord2VecModel { public int getVectorSize(); public String getClosestEntry(float[] vector); public String getClosestSubEntry(float[] vector, String subKey); + public void process() throws IOException; } diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java index 556980a..fa73efd 100644 --- a/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java +++ b/src/main/java/org/aksw/word2vecrestful/word2vec/W2VNrmlMemModelBruteForce.java @@ -1,16 +1,12 @@ package org.aksw.word2vecrestful.word2vec; import java.io.IOException; -import java.util.HashMap; import java.util.Map; -import java.util.Set; import org.aksw.word2vecrestful.subset.DataSubsetProvider; -import org.aksw.word2vecrestful.utils.TimeLogger; import org.aksw.word2vecrestful.utils.Word2VecMath; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; -import org.dice_research.topicmodeling.commons.sort.AssociativeSort; /** * Class to encapsulate word2vec in-memory model and expose methods to perform @@ -24,13 +20,18 @@ public class W2VNrmlMemModelBruteForce implements GenWord2VecModel { private Map word2vec; private int vectorSize; + // for future use + @SuppressWarnings("unused") private DataSubsetProvider dataSubsetProvider; - // TODO : Remove this - private TimeLogger tl = new TimeLogger(); public W2VNrmlMemModelBruteForce(final Map word2vec, final int vectorSize) { this.word2vec = word2vec; this.vectorSize = vectorSize; + + } + + @Override + public void process() throws IOException { this.dataSubsetProvider = new DataSubsetProvider(); } @@ -74,51 +75,9 @@ public String getClosestSubEntry(float[] vector, String subKey) { * @return closest word to the given vector alongwith it's vector */ private String getClosestEntry(float[] vector, String subKey) { - Set wordSet = null; - String closestVec = null; - try { - if (subKey == null) { - wordSet = word2vec.keySet(); - } else { - tl.logTime(1); - wordSet = dataSubsetProvider.fetchSubsetWords(subKey); - tl.printTime(1, "fetchSubsetWords"); - } - // LOG.info("Normalizing input vector"); - // Normalize incoming vector - vector = Word2VecMath.normalize(vector); - - return Word2VecMath.findClosestNormalizedVec(word2vec, vector); - // LOG.info("fetching nearby vectors"); - // calculate cosine similarity of all distances -// String[] wordArr = new String[wordSet.size()]; -// int[] idArr = new int[wordSet.size()]; -// double[] cosineArr = new double[wordSet.size()]; -// int i = 0; -// for (String word : wordSet) { -// wordArr[i] = word; -// idArr[i] = i; -// float[] wordVec = word2vec.get(word); -// cosineArr[i] = Word2VecMath.cosineSimilarityNormalizedVecs(wordVec, vector); -// i++; -// } -// cosineArr = AssociativeSort.quickSort(cosineArr, idArr); -// double maxVal = cosineArr[cosineArr.length - 1]; -// for (int j = cosineArr.length - 1; j >= 0; j--) { -// if (cosineArr[j] == maxVal) { -// int closestWordId = idArr[j]; -// String closestWord = wordArr[closestWordId]; -// closestVec = closestWord; -// }else { -// break; -// } -// } - - } catch (IOException e) { - LOG.error(e.getStackTrace()); - } - // LOG.info("Closest word found is " + closestVec.keySet()); - return closestVec; + // Normalize incoming vector + vector = Word2VecMath.normalize(vector); + return Word2VecMath.findClosestNormalizedVec(word2vec, vector); } /** diff --git a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java index e1245d3..f7609cf 100644 --- a/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java +++ b/src/test/java/org/aksw/word2vecrestful/NrmlzdThetaMdlPrfmncTester.java @@ -5,8 +5,9 @@ import java.util.List; import org.aksw.word2vecrestful.utils.Cfg; -import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; +import org.aksw.word2vecrestful.word2vec.GenWord2VecModel; import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBinSrch; +import org.aksw.word2vecrestful.word2vec.W2VNrmlMemModelBruteForce; import org.aksw.word2vecrestful.word2vec.Word2VecFactory; import org.aksw.word2vecrestful.word2vec.Word2VecModel; import org.apache.log4j.LogManager; @@ -39,7 +40,7 @@ public void testNbmTime() throws IOException { List correctWords = getCorrectWords(centroids, nbm); LOG.info("Correct Words are :" + correctWords); LOG.info("Initializing W2VNrmlMemModelBinSrch Model"); - final W2VNrmlMemModelBinSrch memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); + final GenWord2VecModel memModel = new W2VNrmlMemModelBinSrch(nbm.word2vec, nbm.vectorSize); memModel.process(); List lrModelWords = new ArrayList<>(); From 2df253987210c805782aaeea5cea4e21b2038670 Mon Sep 17 00:00:00 2001 From: "nikit91@gmail.com" Date: Fri, 19 Oct 2018 09:10:54 +0200 Subject: [PATCH 114/114] deleting unused class --- .../word2vecrestful/db/SQLiteDBHandler.java | 98 ------------------- 1 file changed, 98 deletions(-) delete mode 100644 src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java diff --git a/src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java b/src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java deleted file mode 100644 index e37f0dd..0000000 --- a/src/main/java/org/aksw/word2vecrestful/db/SQLiteDBHandler.java +++ /dev/null @@ -1,98 +0,0 @@ -package org.aksw.word2vecrestful.db; - -import java.sql.Connection; -import java.sql.DriverManager; -import java.sql.SQLException; -import java.sql.Statement; - -import org.apache.log4j.Logger; -import org.sqlite.SQLiteConfig; - -public class SQLiteDBHandler { - - public static Logger LOG = Logger.getLogger(AbstractSQLightDB.class); - - private static int queryTimeout = 30; - - private String db; - protected Connection connection = null; - protected Statement statement = null; - - public SQLiteDBHandler(String dbName) { - try { - this.db = dbName.concat(".db"); - Class.forName("org.sqlite.JDBC"); - } catch (final ClassNotFoundException e) { - LOG.error(e.getLocalizedMessage(), e); - } - - } - - public void commit() throws SQLException { - if (connection != null) { - connection.commit(); - } - } - - /** - * Executes the given query on database and returns the numbers of rows updated - * - * @param query - * - statement to be executed - * @return - numbers of rows updated - */ - protected boolean executeStatement(String query) { - boolean res = false; - if (connect()) { - try { - statement = connection.createStatement(); - statement.setQueryTimeout(queryTimeout); - res = statement.execute(query); - } catch (final SQLException e) { - LOG.error(e.getLocalizedMessage(), e); - } finally { - disconnect(); - } - } - return res; - } - - /** - * Disconnect DB. - */ - public void disconnect() { - try { - if (connection != null) { - connection.close(); - } - } catch (final SQLException e) { - LOG.error("\n", e); - } - } - - /** - * Connect DB. - */ - public boolean connect() { - final SQLiteConfig config = new SQLiteConfig(); - // config.setEncoding(SQLiteConfig.Encoding.UTF8); - return connect(config); - } - - /** - * Connect DB. - * - * @param config - * @return true if connected - */ - protected boolean connect(final SQLiteConfig config) { - try { - connection = DriverManager.getConnection("jdbc:sqlite:".concat(db), config.toProperties()); - } catch (final SQLException e) { - LOG.error("\n", e); - statement = null; - } - return connection == null ? false : true; - } - -}