diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 16d900740..2b72648c8 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -12,6 +12,10 @@
* artifactId: salt-saltSample *
***********************************************************************************
+Version 2.1.1
+=============
+* this is just a bugfix release
+
Version 2.1.0
=============
* updated EMF version from 2.8 to 2.9.1-v20130827-0309
diff --git a/pom.xml b/pom.xml
index 699a6755e..d12b90bee 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
de.hu_berlin.german.korpling.saltnpeppersaltpom
- 2.1.0
+ 2.1.1-SNAPSHOTsalt-graphsalt-saltCore
@@ -31,7 +31,7 @@
scm:git:git://github.com/korpling/salt.gitscm:git:git@github.com:korpling/salt.githttps://github.com/korpling/salt
- salt-2.1.0
+ HEAD
diff --git a/salt-doc/src/main/docbkx/salt_modelGuide.xml b/salt-doc/src/main/docbkx/salt_modelGuide.xml
index 670a44745..479b04c91 100644
--- a/salt-doc/src/main/docbkx/salt_modelGuide.xml
+++ b/salt-doc/src/main/docbkx/salt_modelGuide.xml
@@ -165,9 +165,9 @@
a layer or another label as shown in . An
attribute-value-pair is a triple which consists of a namespace, a name and a value
(namespace:name=value). The combination of name and namespace is used to identify a
- label and therefore must be unique. The namespace is an optional value, to ditinguish in
- case of there are two labels having the same name. For instance a node etc. can have a
- label stts:pos=VVFIN as well as the label stts:pos=VVFIN as well as the label pos:VV to annotate it with two part-of-speech annotations
from different tagsets. Label mechanism for graph, node, edge and layer (class diagram)
diff --git a/salt-graph/pom.xml b/salt-graph/pom.xml
index df49c4252..7b302b066 100644
--- a/salt-graph/pom.xml
+++ b/salt-graph/pom.xml
@@ -3,7 +3,7 @@
de.hu_berlin.german.korpling.saltnpeppersalt
- 2.1.0
+ 2.1.1-SNAPSHOT../pom.xml4.0.0
diff --git a/salt-saltCommon/pom.xml b/salt-saltCommon/pom.xml
index 8606731b5..81f94b181 100644
--- a/salt-saltCommon/pom.xml
+++ b/salt-saltCommon/pom.xml
@@ -6,7 +6,7 @@
de.hu_berlin.german.korpling.saltnpeppersalt
- 2.1.0
+ 2.1.1-SNAPSHOT../pom.xml
diff --git a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java
index ab57791ff..212bd10f3 100644
--- a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java
+++ b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java
@@ -68,6 +68,7 @@
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sCorpusStructure.SDocument;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDocumentGraph;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCore.SElementId;
+import java.util.LinkedList;
/**
*
@@ -371,12 +372,12 @@ public synchronized void saveSaltProject(URI saltProjectURI)
(this.getSCorpusGraphs().size()> 0))
{//store all documents if exist
URI sDocumentFileURI= null;
- for (SCorpusGraph sCorpusGraph: Collections.synchronizedList(this.getSCorpusGraphs()))
+ for (SCorpusGraph sCorpusGraph: new LinkedList<>(this.getSCorpusGraphs()))
{
if ( (sCorpusGraph.getSDocuments()!= null) &&
(sCorpusGraph.getSDocuments().size()> 0))
{
- for (SDocument sDocument: Collections.synchronizedList(sCorpusGraph.getSDocuments()))
+ for (SDocument sDocument: new LinkedList<>(sCorpusGraph.getSDocuments()))
{
if (sDocument.getSDocumentGraph()!= null)
{//only store sDocument, when there is some content in sDocumentGraph corresponding to sDocument
diff --git a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java
index 41e11b508..087981881 100644
--- a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java
+++ b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java
@@ -782,8 +782,9 @@ public EList tokenize()
{
if (sTextualDS!= null)
{
- if (retVal== null)
+ if (retVal== null){
retVal= new BasicEList();
+ }
retVal.addAll(tokenizer.tokenize(sTextualDS));
}
}
diff --git a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java
index b49d8cd46..5bac7dfa4 100644
--- a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java
+++ b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java
@@ -36,12 +36,24 @@
import org.eclipse.emf.common.util.EList;
import org.knallgrau.utils.textcat.TextCategorizer;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Range;
+import com.google.common.collect.RangeMap;
+import com.google.common.collect.TreeRangeMap;
import com.neovisionaries.i18n.LanguageCode;
+import de.hu_berlin.german.korpling.saltnpepper.salt.SaltFactory;
+import de.hu_berlin.german.korpling.saltnpepper.salt.graph.Edge;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.exceptions.SaltTokenizerException;
+import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDataSourceSequence;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDocumentGraph;
+import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SSpan;
+import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SSpanningRelation;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.STextualDS;
+import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.STextualRelation;
import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SToken;
+import de.hu_berlin.german.korpling.saltnpepper.salt.saltCore.SAnnotation;
/**
* The general task of this class is to tokenize a given text in the same order
@@ -327,6 +339,11 @@ else if (LanguageCode.de.equals(language))
* The general task of this class is to tokenize a given text in the same
* order as the tool TreeTagger will do. A list of tokenized text is
* returned with the text anchor (start and end position) in original text.
+ * If the {@link SDocumentGraph} already contains tokens, the tokens will be preserved
+ * if they overlap the same textual range as the new ones. Otherwise a {@link SSpan} is
+ * created corresponding to the existing token. The span then overlaps all new tokens
+ * and contains all annotations the old token did. In case the span would overlap the same textual
+ * range as the old token did, no span is created.
*
* @param strInput
* original text
@@ -342,36 +359,134 @@ public EList tokenizeToToken(STextualDS sTextualDS, LanguageCode languag
char[] chrText = strInput.toCharArray();
int tokenCntr = 0;
+ // check if tokens exist for passed span
+ List tokens = null;
+ if ((startPos != 0) || (endPos != sTextualDS.getSText().length())
+ || (getsDocumentGraph().getSTextualDSs().size() > 1)) {
+ SDataSourceSequence sequence = SaltFactory.eINSTANCE.createSDataSourceSequence();
+ sequence.setSSequentialDS(sTextualDS);
+ sequence.setSStart(startPos);
+ sequence.setSEnd(endPos);
+ tokens = getsDocumentGraph().getSTokensBySequence(sequence);
+ } else {
+ tokens = getsDocumentGraph().getSTokens();
+ }
+
+ RangeMap oldTokens = null;
+ // build a lookup structure that maps the text interval of each
+ // existing textual relation to the token it belongs to
+ if ((tokens != null) && (tokens.size() != 0)) {
+ if ((getsDocumentGraph().getSTextualRelations() != null)
+ && (getsDocumentGraph().getSTextualRelations().size() > 0)) {
+ oldTokens = TreeRangeMap.create();
+ for (STextualRelation rel : getsDocumentGraph().getSTextualRelations()) {
+ oldTokens.put(Range.closed(rel.getSStart(), rel.getSEnd()), rel.getSToken());
+ }
+ }
+ }
+ // a multimap from each old, already existing token to the newly
+ // created tokens that start inside it. The old tokens should be removed
+ // later on and spans should be created instead
+ Multimap old2newToken = ArrayListMultimap.create();
+
for (int i = 0; i < chrText.length; i++) {
- if ((strTokens.get(tokenCntr).length() < 1) || (strTokens.get(tokenCntr).substring(0, 1).equals(String.valueOf(chrText[i])))) {// first
- // letter
- // matches
+ if ((strTokens.get(tokenCntr).length() < 1)
+ || (strTokens.get(tokenCntr).substring(0, 1).equals(String.valueOf(chrText[i])))) {
+ // first letter matches
StringBuffer pattern = new StringBuffer();
- for (int y = 0; y < strTokens.get(tokenCntr).length(); y++) {// compute
- // pattern
- // in
- // text
+ for (int y = 0; y < strTokens.get(tokenCntr).length(); y++) {
+ // compute pattern in text
pattern.append(chrText[i + y]);
}// compute pattern in text
- if (strTokens.get(tokenCntr).hashCode() == pattern.toString().hashCode()) {// pattern
- // found
+ if (strTokens.get(tokenCntr).hashCode() == pattern.toString().hashCode()) {
+ // pattern found
int start = i + startPos;
int end = i + startPos + strTokens.get(tokenCntr).length();
- if (this.getsDocumentGraph() == null)
+ if (this.getsDocumentGraph() == null) {
throw new SaltTokenizerException("Cannot add tokens to an empty SDocumentGraph object.");
+ }
SToken sTok = this.getsDocumentGraph().createSToken(sTextualDS, start, end);
- if (retVal == null)
+ if (retVal == null) {
retVal = new BasicEList();
+ }
retVal.add(sTok);
i = i + strTokens.get(tokenCntr).length() - 1;
tokenCntr++;
- if (tokenCntr >= strTokens.size())
+ if (tokenCntr >= strTokens.size()) {
break;
+ }
+
+ /**
+ * check if there is an old token whose text range contains the
+ * start of the currently created one. If yes, record the pair;
+ * the recorded pairs are resolved after the loop.
+ * NOTE(review): the break above skips this for the last token.
+ **/
+ if (oldTokens != null) {
+ SToken oldToken = oldTokens.get(start);
+ if (oldToken != null) {
+ old2newToken.put(oldToken, sTok);
+ }
+ }
+
}// pattern found
}// first letter matches
}
+
+ if (old2newToken != null) {
+ for (SToken oldToken : old2newToken.keySet()) {
+ // create span for oldToken
+ EList overlappedTokens = new BasicEList(old2newToken.get(oldToken));
+ if (overlappedTokens.size() == 1) {
+ getsDocumentGraph().removeNode(overlappedTokens.get(0));
+ } else {
+
+ SSpan span = getsDocumentGraph().createSSpan(overlappedTokens);
+
+ // move all annotations from old token to span
+ for (SAnnotation sAnno : oldToken.getSAnnotations()) {
+ span.addSAnnotation(sAnno);
+ }
+
+ // redirect all relations to span
+ List inEdges = new ArrayList();
+ for (Edge edge : getsDocumentGraph().getInEdges(oldToken.getSId())) {
+ inEdges.add(edge);
+ }
+ for (Edge edge : inEdges) {
+ if (edge instanceof SSpanningRelation) {
+ // if the edge is an SSpanningRelation, remove
+ // it and create a new one for each token under
+ // the span
+ if (edge.getSource() instanceof SSpan) {
+ SSpan parentSpan = (SSpan) edge.getSource();
+ getsDocumentGraph().removeEdge(edge);
+ for (SToken overlappedToken : overlappedTokens) {
+ SSpanningRelation rel = SaltFactory.eINSTANCE.createSSpanningRelation();
+ rel.setSSource(parentSpan);
+ rel.setSTarget(overlappedToken);
+ getsDocumentGraph().addSRelation(rel);
+ }
+ }
+ } else {
+ edge.setTarget(span);
+ }
+ }
+ List outEdges = new ArrayList();
+ for (Edge edge : getsDocumentGraph().getOutEdges(oldToken.getSId())) {
+ if (!(edge instanceof STextualRelation)) {
+ outEdges.add(edge);
+ }
+ }
+ for (Edge edge : outEdges) {
+ edge.setSource(span);
+ }
+ getsDocumentGraph().removeNode(oldToken);
+ }
+ }
+ }
}
return (retVal);
}
diff --git a/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java b/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java
index 62129e79d..dc71d819e 100644
--- a/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java
+++ b/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java
@@ -1872,6 +1872,49 @@ else if (textRel.getSTarget().equals(sText1))
assertEquals(6, relTosText2);
}
+ /**
+ * Tests the tokenization of a pretokenized text and checks if the old tokens are removed correctly.
+ */
+ public void testTokenize2()
+ {
+ String text="This is a sample.";
+
+ STextualDS sText1= this.getFixture().createSTextualDS(text);
+ getFixture().createSToken(sText1, 0, 7);
+ getFixture().createSToken(sText1, 8, 16);
+
+ this.getFixture().tokenize();
+
+ assertEquals(5, this.getFixture().getSTokens().size());
+ assertEquals(5, this.getFixture().getSTextualRelations().size());
+ assertEquals(2, this.getFixture().getSSpans().size());
+ }
+
+ /**
+ * Tests the tokenization of a pretokenized text and checks if the old tokens are removed correctly. Further checks that
+ * all annotations are copied to the newly created span node.
+ */
+ public void testTokenize3()
+ {
+ String text="This is a sample.";
+
+ STextualDS sText1= this.getFixture().createSTextualDS(text);
+ SToken tok1= getFixture().createSToken(sText1, 0, 7);
+ tok1.createSAnnotation(null, "a", "b");
+ tok1.createSAnnotation(null, "c", "d");
+ SToken tok2= getFixture().createSToken(sText1, 8, 16);
+ tok2.createSAnnotation(null, "1", "2");
+ tok2.createSAnnotation(null, "3", "4");
+
+ this.getFixture().tokenize();
+
+ assertEquals(5, this.getFixture().getSTokens().size());
+ assertEquals(5, this.getFixture().getSTextualRelations().size());
+ assertEquals(2, this.getFixture().getSSpans().size());
+ assertEquals(2, this.getFixture().getSSpans().get(0).getSAnnotations().size());
+ assertEquals(2, this.getFixture().getSSpans().get(1).getSAnnotations().size());
+ }
+
/**
* Tests the '{@link de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDocumentGraph#createTokenizer() Create Tokenizer}' operation.
*
diff --git a/salt-saltCore/pom.xml b/salt-saltCore/pom.xml
index 358717c57..546ed3f87 100644
--- a/salt-saltCore/pom.xml
+++ b/salt-saltCore/pom.xml
@@ -6,7 +6,7 @@
de.hu_berlin.german.korpling.saltnpeppersalt
- 2.1.0
+ 2.1.1-SNAPSHOT../pom.xml
diff --git a/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java b/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java
index 79a61852e..7abc62f0b 100644
--- a/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java
+++ b/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java
@@ -387,7 +387,7 @@ public void testGetSValueType_SURI()
SDATATYPE dataType= null;
dataType= SDATATYPE.SURI;
- this.getFixture().setSValue(URI.createFileURI(""));
+ this.getFixture().setSValue(URI.createFileURI("."));
assertEquals(dataType, this.getFixture().getSValueType());
}