-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from nikit91/master
New Models Implementation
- Loading branch information
Showing
21 changed files
with
1,727 additions
and
105 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
146 changes: 146 additions & 0 deletions
146
src/main/java/org/aksw/word2vecrestful/subset/DataSubsetGenerator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
package org.aksw.word2vecrestful.subset; | ||
|
||
import java.io.BufferedWriter; | ||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.FileNotFoundException; | ||
import java.io.IOException; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.Iterator; | ||
import java.util.Map; | ||
import java.util.Map.Entry; | ||
|
||
import org.aksw.word2vecrestful.utils.Cfg; | ||
import org.aksw.word2vecrestful.word2vec.Word2VecFactory; | ||
import org.aksw.word2vecrestful.word2vec.Word2VecModel; | ||
import org.apache.commons.io.output.FileWriterWithEncoding; | ||
|
||
import com.fasterxml.jackson.core.JsonProcessingException; | ||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import com.fasterxml.jackson.databind.ObjectReader; | ||
import com.fasterxml.jackson.databind.node.ArrayNode; | ||
import com.fasterxml.jackson.databind.node.ObjectNode; | ||
|
||
/** | ||
* Class to help generate and persist the subsets of a word2vec model Expected | ||
* json config format: | ||
* | ||
* <pre> | ||
*{ | ||
* "data" : [ { | ||
* "key" : "xyz", | ||
* "centroid" : [...], | ||
* "sd" : [...] | ||
* }...] | ||
*} | ||
* </pre> | ||
* <dl> | ||
* <dt>key</dt> | ||
* <dd>identification literal for the subset</dd> | ||
* <dt>centroid</dt> | ||
* <dd>centroid array for the subset</dd> | ||
* <dt>sd</dt> | ||
* <dd>standard deviation array for the subset</dd> | ||
* </dl> | ||
* | ||
* @author Nikit | ||
* | ||
*/ | ||
public class DataSubsetGenerator { | ||
|
||
public static final String DATA_LABEL = "data"; | ||
public static final String KEY_LABEL = "key"; | ||
public static final String CENTROID_LABEL = "centroid"; | ||
public static final String SD_LABEL = "sd"; | ||
public static final ObjectMapper OBJ_MAPPER = new ObjectMapper(); | ||
public static final ObjectReader OBJ_READER = OBJ_MAPPER.reader(); | ||
|
||
/** | ||
* Method to generate subset json files for a given configuration and word2vec | ||
* model | ||
* | ||
* @param subsetConfig | ||
* - configuration json file | ||
* @param outputFileDir | ||
* - output directory for the subset files | ||
* @param word2vec | ||
* - word2vec model map | ||
* @param vectorSize | ||
* - size of the vectors in model | ||
* @throws JsonProcessingException | ||
* @throws FileNotFoundException | ||
* @throws IOException | ||
*/ | ||
public static void generateSubsetFiles(File subsetConfig, String outputFileDir, Map<String, float[]> word2vec, | ||
int vectorSize) throws JsonProcessingException, FileNotFoundException, IOException { | ||
// Read file into a json | ||
ObjectNode inpObj = (ObjectNode) OBJ_READER.readTree(new FileInputStream(subsetConfig)); | ||
ArrayNode inpDt = (ArrayNode) inpObj.get(DATA_LABEL); | ||
// Traverse the json for keys | ||
Iterator<JsonNode> inpIt = inpDt.iterator(); | ||
float[] maxlim = new float[vectorSize]; | ||
float[] minlim = new float[vectorSize]; | ||
while (inpIt.hasNext()) { | ||
|
||
JsonNode curNode = inpIt.next(); | ||
// fetch value of key | ||
String key = curNode.get(KEY_LABEL).asText(); | ||
// fetch value of centroid | ||
ArrayNode centroid = (ArrayNode) curNode.get(CENTROID_LABEL); | ||
// fetch value of standard deviation | ||
ArrayNode stndrdDev = (ArrayNode) curNode.get(SD_LABEL); | ||
// create an output file | ||
File outputFile = new File(outputFileDir + "/" + key + ".txt"); | ||
outputFile.getParentFile().mkdirs(); | ||
// open an output stream | ||
BufferedWriter bWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile, StandardCharsets.UTF_8)); | ||
boolean limitNotSet = true; | ||
// loop through the model | ||
for (Entry<String, float[]> wordEntry : word2vec.entrySet()) { | ||
String word = wordEntry.getKey(); | ||
float[] wordvec = wordEntry.getValue(); | ||
boolean isValid = true; | ||
for (int i = 0; i < centroid.size(); i++) { | ||
if (limitNotSet) { | ||
float centVal = centroid.get(i).floatValue(); | ||
float sdVal = stndrdDev.get(i).floatValue(); | ||
// maxlim = add sd to centroid | ||
maxlim[i] = centVal + 3 * sdVal; | ||
// minlim = subtract sb from centroid | ||
minlim[i] = centVal - 3 * sdVal; | ||
} | ||
// check if values of all the dimensions are under maxlim and minlim | ||
float curVal = wordvec[i]; | ||
if (curVal > maxlim[i] || curVal < minlim[i]) { | ||
isValid = false; | ||
break; | ||
} | ||
} | ||
limitNotSet = false; | ||
if (isValid) { | ||
// write the word in the file | ||
bWriter.write(word); | ||
bWriter.newLine(); | ||
} | ||
} | ||
// Close the stream | ||
bWriter.close(); | ||
} | ||
} | ||
|
||
/** | ||
* Method to demonstrate example usage | ||
* | ||
* @param args | ||
* @throws JsonProcessingException | ||
* @throws FileNotFoundException | ||
* @throws IOException | ||
*/ | ||
public static void main(String[] args) throws JsonProcessingException, FileNotFoundException, IOException { | ||
File subsetConfig = new File("word2vec-dump\\subsetconfig2.json"); | ||
Word2VecModel model = Word2VecFactory.getNormalBinModel(); | ||
generateSubsetFiles(subsetConfig, Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"), model.word2vec, | ||
model.vectorSize); | ||
} | ||
} |
65 changes: 65 additions & 0 deletions
65
src/main/java/org/aksw/word2vecrestful/subset/DataSubsetProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package org.aksw.word2vecrestful.subset; | ||
|
||
import java.io.File; | ||
import java.io.FileNotFoundException; | ||
import java.io.IOException; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Map; | ||
import java.util.Set; | ||
|
||
import org.aksw.word2vecrestful.utils.Cfg; | ||
import org.apache.commons.io.FileUtils; | ||
|
||
import com.fasterxml.jackson.core.JsonProcessingException; | ||
|
||
/** | ||
* Class to help retrieve the list of words from a subset stored on the disk | ||
* | ||
* @author Nikit | ||
* | ||
*/ | ||
public class DataSubsetProvider { | ||
|
||
private String fileDir = Cfg.get("org.aksw.word2vecrestful.Application.subsetfiledir"); | ||
private final Map<String, Set<String>> SUBSET_MODELS = new HashMap<>(); | ||
|
||
/** | ||
* Method to fetch the set of words in a subset | ||
* | ||
* @param subsetKey | ||
* - key to identify the subset | ||
* @return - a list of words in the related subset | ||
* @throws JsonProcessingException | ||
* @throws FileNotFoundException | ||
* @throws IOException | ||
*/ | ||
public Set<String> fetchSubsetWords(String subsetKey) throws IOException { | ||
// fetch from cache | ||
Set<String> resList = SUBSET_MODELS.get(subsetKey); | ||
// if not in cache then read from file and add to cache | ||
if (resList == null) { | ||
// logic to fetch the words from the stored subsets | ||
File file1 = new File(fileDir + "/" + appendFileExtension(subsetKey)); | ||
if (file1.exists()) { | ||
resList = new HashSet<>(); | ||
resList.addAll(FileUtils.readLines(file1, StandardCharsets.UTF_8)); | ||
SUBSET_MODELS.put(subsetKey, resList); | ||
} | ||
} | ||
return resList; | ||
} | ||
|
||
/** | ||
* Method to append txt extension at the end of a key | ||
* | ||
* @param name | ||
* - key | ||
* @return key appended with txt extension | ||
*/ | ||
public static String appendFileExtension(String name) { | ||
return name + ".txt"; | ||
} | ||
|
||
} |
Oops, something went wrong.