Implementing requested changes

1. Removed static handling of DataSubsetProvider. 2. Better file handling for reading the subset file. 3. UTF-8 encoding enforced on the text. 4. Formatted documentation of the data JSON.
dice-group · Aug 23, 2018 · fd5b906 · fd5b906
1 parent 7a94730
commit fd5b906
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 49 deletions.
diff --git a/pom.xml b/pom.xml
@@ -57,7 +57,13 @@
 		    <artifactId>opencsv</artifactId>
 		    <version>4.1</version>
 		</dependency>
-
+
+		<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
+		<dependency>
+		    <groupId>commons-io</groupId>
+		    <artifactId>commons-io</artifactId>
+		    <version>2.6</version>
+		</dependency>
 
     </dependencies>
     <build>

diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java
@@ -4,15 +4,16 @@
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
-import java.io.FileWriter;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Map.Entry;
 
 import org.aksw.word2vecrestful.utils.Cfg;
 import org.aksw.word2vecrestful.word2vec.Word2VecFactory;
 import org.aksw.word2vecrestful.word2vec.Word2VecModel;
+import org.apache.commons.io.output.FileWriterWithEncoding;
 
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.JsonNode;
@@ -22,16 +23,26 @@
 import com.fasterxml.jackson.databind.node.ObjectNode;
 
 /**
- * Class to help generate and persist the subsets of a word2vec model
- * Expected json config format:
+ * Class to help generate and persist the subsets of a word2vec model.
+ * Expected json config format:
  * 
- * {
- *  "data" : [ {
- *		"key" : "xyz",
- *		"centroid" : [...],
- *	    "sd" : [...]
- *	 }...]
- * }
+ * <pre>
+ *{
+ * "data" : [ {
+ *  "key" : "xyz",
+ *  "centroid" : [...],
+ *  "sd" : [...]
+ * }...]
+ *}
+ * </pre>
+ * <dl>
+ * <dt>key</dt>
+ * <dd>identification literal for the subset</dd>
+ * <dt>centroid</dt>
+ * <dd>centroid array for the subset</dd>
+ * <dt>sd</dt>
+ * <dd>standard deviation array for the subset</dd>
+ * </dl>
  * 
  * @author Nikit
  *
@@ -44,12 +55,19 @@ public class DataSubsetGenerator {
 	public static final String SD_LABEL = "sd";
 	public static final ObjectMapper OBJ_MAPPER = new ObjectMapper();
 	public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader();
+
 	/**
-	 * Method to generate subset json files for a given configuration and word2vec model
-	 * @param subsetConfig - configuration json file
-	 * @param outputFileDir - output directory for the subset files
-	 * @param word2vec - word2vec model map
-	 * @param vectorSize - size of the vectors in model
+	 * Method to generate subset json files for a given configuration and word2vec
+	 * model
+	 * 
+	 * @param subsetConfig
+	 *            - configuration json file
+	 * @param outputFileDir
+	 *            - output directory for the subset files
+	 * @param word2vec
+	 *            - word2vec model map
+	 * @param vectorSize
+	 *            - size of the vectors in model
 	 * @throws JsonProcessingException
 	 * @throws FileNotFoundException
 	 * @throws IOException
@@ -76,7 +94,7 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir,
 			File outputFile = new File(outputFileDir + "/" + key + ".txt");
 			outputFile.getParentFile().mkdirs();
 			// open an output stream
-			BufferedWriter bWriter = new BufferedWriter(new FileWriter(outputFile));
+			BufferedWriter bWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile, StandardCharsets.UTF_8));
 			boolean limitNotSet = true;
 			// loop through the model
 			for (Entry<String, float[]> wordEntry : word2vec.entrySet()) {
@@ -106,12 +124,14 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir,
 					bWriter.newLine();
 				}
 			}
-			//Close the stream
+			// Close the stream
 			bWriter.close();
 		}
 	}
+
 	/**
 	 * Method to demonstrate example usage
+	 * 
 	 * @param args
 	 * @throws JsonProcessingException
 	 * @throws FileNotFoundException

diff --git a/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java b/src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java
@@ -1,16 +1,15 @@
 package org.aksw.word2vecrestful.subset;
 
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileNotFoundException;
-import java.io.FileReader;
 import java.io.IOException;
-import java.util.ArrayList;
+import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 import org.aksw.word2vecrestful.utils.Cfg;
+import org.apache.commons.io.FileUtils;
 
 import com.fasterxml.jackson.core.JsonProcessingException;
 
@@ -22,8 +21,8 @@
  */
 public class DataSubsetProvider {
 
-	public static String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir");
-	public static final Map<String, List<String>> SUBSET_MODELS = new HashMap<>();
+	private String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir");
+	private final Map<String, List<String>> SUBSET_MODELS = new HashMap<>();
 
 	/**
 	 * Method to fetch the list of words in a subset
@@ -35,37 +34,17 @@ public class DataSubsetProvider {
 	 * @throws FileNotFoundException
 	 * @throws IOException
 	 */
-	public static List<String> fetchSubsetWords(String subsetKey)
-			throws IOException {
+	public List<String> fetchSubsetWords(String subsetKey) throws IOException {
 		// fetch from cache
 		List<String> resList = SUBSET_MODELS.get(subsetKey);
 		// if not in cache then read from file and add to cache
 		if (resList == null) {
-			resList = new ArrayList<>();
 			// logic to fetch the words from the stored subsets
-			File dir = new File(fileDir);
-			// declare inputstream
-			BufferedReader bReader = null;
-			File[] directoryListing = dir.listFiles();
-			if (directoryListing != null) {
-				for (File child : directoryListing) {
-					// Do something with child
-					if (child.getName().equalsIgnoreCase(appendFileExtension(subsetKey))) {
-						bReader = new BufferedReader(new FileReader(child));
-						break;
-					}
-				}
+			File file1 = new File(fileDir + "/" + appendFileExtension(subsetKey));
+			if (file1.exists()) {
+				resList = FileUtils.readLines(file1, StandardCharsets.UTF_8);
+				SUBSET_MODELS.put(subsetKey, resList);
 			}
-			if (bReader != null) {
-				while (true) {
-					String word = bReader.readLine();
-					if (word == null) {
-						break;
-					}
-					resList.add(word);
-				}
-			}
-			SUBSET_MODELS.put(subsetKey, resList);
 		}
 		return resList;
 	}

diff --git a/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java b/src/main/java/org/aksw/word2vecrestful/word2vec/Word2VecModel.java
@@ -24,10 +24,12 @@ public class Word2VecModel {
   public int vectorSize;
   public TreeSet<VectorDimension> sortedVecDimns;
   public int[] dimRngIndxDesc;
+  private DataSubsetProvider dataSubsetProvider;
   public Word2VecModel(final Map<String, float[]> word2vec, final int vectorSize) {
     this.word2vec = word2vec;
     this.vectorSize = vectorSize;
     this.sortedVecDimns = new TreeSet<>();
+    this.dataSubsetProvider = new DataSubsetProvider();
     //fetch vector dimension details
     fetchVectorDimensions();
   }
@@ -97,7 +99,7 @@ public Map<String, float[]> getClosestEntryInSub(float[] inpvec, String subsetKe
 	  float[] minVec = null;
 	  double tempDist;
 	  // Loop on the subset
-	  for(String word : DataSubsetProvider.fetchSubsetWords(subsetKey)) {
+	  for(String word : dataSubsetProvider.fetchSubsetWords(subsetKey)) {
 		  float[] wordvec = word2vec.get(word);
 		  tempDist = getSqEucDist(inpvec, wordvec, minDist);
 		  if(tempDist != -1) {