Skip to content

Commit

Permalink
Implementing requested changes
Browse files Browse the repository at this point in the history
1. Removed static handling of DataSubsetProvider
2. Better file handling for reading subset file
3. UTF-8 encoding enforced on the text
4. Formatted documentation of data JSON
  • Loading branch information
nikit-srivastava committed Aug 23, 2018
1 parent 7a94730 commit fd5b906
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 49 deletions.
8 changes: 7 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,13 @@
<artifactId>opencsv</artifactId>
<version>4.1</version>
</dependency>


<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>

</dependencies>
<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.aksw.word2vecrestful.utils.Cfg;
import org.aksw.word2vecrestful.word2vec.Word2VecFactory;
import org.aksw.word2vecrestful.word2vec.Word2VecModel;
import org.apache.commons.io.output.FileWriterWithEncoding;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
Expand All @@ -22,16 +23,26 @@
import com.fasterxml.jackson.databind.node.ObjectNode;

/**
* Class to help generate and persist the subsets of a word2vec model
* Expected json config format:
* Class to help generate and persist the subsets of a word2vec model. Expected
* JSON config format:
*
* {
* "data" : [ {
* "key" : "xyz",
* "centroid" : [...],
* "sd" : [...]
* }...]
* }
* <pre>
*{
* "data" : [ {
* "key" : "xyz",
* "centroid" : [...],
* "sd" : [...]
* }...]
*}
* </pre>
* <dl>
* <dt>key</dt>
* <dd>identification literal for the subset</dd>
* <dt>centroid</dt>
* <dd>centroid array for the subset</dd>
* <dt>sd</dt>
* <dd>standard deviation array for the subset</dd>
* </dl>
*
* @author Nikit
*
Expand All @@ -44,12 +55,19 @@ public class DataSubsetGenerator {
public static final String SD_LABEL = "sd";
public static final ObjectMapper OBJ_MAPPER = new ObjectMapper();
public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader();

/**
* Method to generate subset json files for a given configuration and word2vec model
* @param subsetConfig - configuration json file
* @param outputFileDir - output directory for the subset files
* @param word2vec - word2vec model map
* @param vectorSize - size of the vectors in model
* Method to generate subset json files for a given configuration and word2vec
* model
*
* @param subsetConfig
* - configuration json file
* @param outputFileDir
* - output directory for the subset files
* @param word2vec
* - word2vec model map
* @param vectorSize
* - size of the vectors in model
* @throws JsonProcessingException
* @throws FileNotFoundException
* @throws IOException
Expand All @@ -76,7 +94,7 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir,
File outputFile = new File(outputFileDir + "/" + key + ".txt");
outputFile.getParentFile().mkdirs();
// open an output stream
BufferedWriter bWriter = new BufferedWriter(new FileWriter(outputFile));
BufferedWriter bWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile, StandardCharsets.UTF_8));
boolean limitNotSet = true;
// loop through the model
for (Entry<String, float[]> wordEntry : word2vec.entrySet()) {
Expand Down Expand Up @@ -106,12 +124,14 @@ public static void generateSubsetFiles(File subsetConfig, String outputFileDir,
bWriter.newLine();
}
}
//Close the stream
// Close the stream
bWriter.close();
}
}

/**
* Method to demonstrate example usage
*
* @param args
* @throws JsonProcessingException
* @throws FileNotFoundException
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
package org.aksw.word2vecrestful.subset;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.aksw.word2vecrestful.utils.Cfg;
import org.apache.commons.io.FileUtils;

import com.fasterxml.jackson.core.JsonProcessingException;

Expand All @@ -22,8 +21,8 @@
*/
public class DataSubsetProvider {

public static String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir");
public static final Map<String, List<String>> SUBSET_MODELS = new HashMap<>();
private String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir");
private final Map<String, List<String>> SUBSET_MODELS = new HashMap<>();

/**
* Method to fetch the list of words in a subset
Expand All @@ -35,37 +34,17 @@ public class DataSubsetProvider {
* @throws FileNotFoundException
* @throws IOException
*/
public static List<String> fetchSubsetWords(String subsetKey)
throws IOException {
public List<String> fetchSubsetWords(String subsetKey) throws IOException {
// fetch from cache
List<String> resList = SUBSET_MODELS.get(subsetKey);
// if not in cache then read from file and add to cache
if (resList == null) {
resList = new ArrayList<>();
// logic to fetch the words from the stored subsets
File dir = new File(fileDir);
// declare inputstream
BufferedReader bReader = null;
File[] directoryListing = dir.listFiles();
if (directoryListing != null) {
for (File child : directoryListing) {
// Do something with child
if (child.getName().equalsIgnoreCase(appendFileExtension(subsetKey))) {
bReader = new BufferedReader(new FileReader(child));
break;
}
}
File file1 = new File(fileDir + "/" + appendFileExtension(subsetKey));
if (file1.exists()) {
resList = FileUtils.readLines(file1, StandardCharsets.UTF_8);
SUBSET_MODELS.put(subsetKey, resList);
}
if (bReader != null) {
while (true) {
String word = bReader.readLine();
if (word == null) {
break;
}
resList.add(word);
}
}
SUBSET_MODELS.put(subsetKey, resList);
}
return resList;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ public class Word2VecModel {
public int vectorSize;
public TreeSet<VectorDimension> sortedVecDimns;
public int[] dimRngIndxDesc;
private DataSubsetProvider dataSubsetProvider;
public Word2VecModel(final Map<String, float[]> word2vec, final int vectorSize) {
this.word2vec = word2vec;
this.vectorSize = vectorSize;
this.sortedVecDimns = new TreeSet<>();
this.dataSubsetProvider = new DataSubsetProvider();
//fetch vector dimension details
fetchVectorDimensions();
}
Expand Down Expand Up @@ -97,7 +99,7 @@ public Map<String, float[]> getClosestEntryInSub(float[] inpvec, String subsetKe
float[] minVec = null;
double tempDist;
// Loop on the subset
for(String word : DataSubsetProvider.fetchSubsetWords(subsetKey)) {
for(String word : dataSubsetProvider.fetchSubsetWords(subsetKey)) {
float[] wordvec = word2vec.get(word);
tempDist = getSqEucDist(inpvec, wordvec, minDist);
if(tempDist != -1) {
Expand Down

0 comments on commit fd5b906

Please sign in to comment.