From 2c05a4325d0016846e450afdf41860a4b2b25a67 Mon Sep 17 00:00:00 2001 From: korpling-Server Date: Wed, 6 May 2015 13:08:19 +0200 Subject: [PATCH 1/7] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- salt-graph/pom.xml | 2 +- salt-saltCommon/pom.xml | 2 +- salt-saltCore/pom.xml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 699a6755e..d12b90bee 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ de.hu_berlin.german.korpling.saltnpepper salt pom - 2.1.0 + 2.1.1-SNAPSHOT salt-graph salt-saltCore @@ -31,7 +31,7 @@ scm:git:git://github.com/korpling/salt.git scm:git:git@github.com:korpling/salt.git https://github.com/korpling/salt - salt-2.1.0 + HEAD diff --git a/salt-graph/pom.xml b/salt-graph/pom.xml index df49c4252..7b302b066 100644 --- a/salt-graph/pom.xml +++ b/salt-graph/pom.xml @@ -3,7 +3,7 @@ de.hu_berlin.german.korpling.saltnpepper salt - 2.1.0 + 2.1.1-SNAPSHOT ../pom.xml 4.0.0 diff --git a/salt-saltCommon/pom.xml b/salt-saltCommon/pom.xml index 8606731b5..81f94b181 100644 --- a/salt-saltCommon/pom.xml +++ b/salt-saltCommon/pom.xml @@ -6,7 +6,7 @@ de.hu_berlin.german.korpling.saltnpepper salt - 2.1.0 + 2.1.1-SNAPSHOT ../pom.xml diff --git a/salt-saltCore/pom.xml b/salt-saltCore/pom.xml index 358717c57..546ed3f87 100644 --- a/salt-saltCore/pom.xml +++ b/salt-saltCore/pom.xml @@ -6,7 +6,7 @@ de.hu_berlin.german.korpling.saltnpepper salt - 2.1.0 + 2.1.1-SNAPSHOT ../pom.xml From 0b2bc8a0036c7be41ab5a74ab49ccb94dd37ecae Mon Sep 17 00:00:00 2001 From: "f.zipser@gmx.de" Date: Thu, 28 May 2015 21:41:43 +0200 Subject: [PATCH 2/7] Fixed #72 --- .../impl/SDocumentGraphImpl.java | 3 +- .../tokenizer/Tokenizer.java | 131 ++++++++++++++++-- .../tests/SDocumentGraphTest.java | 43 ++++++ 3 files changed, 166 insertions(+), 11 deletions(-) diff --git a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java 
b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java index 41e11b508..087981881 100644 --- a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java +++ b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/impl/SDocumentGraphImpl.java @@ -782,8 +782,9 @@ public EList tokenize() { if (sTextualDS!= null) { - if (retVal== null) + if (retVal== null){ retVal= new BasicEList(); + } retVal.addAll(tokenizer.tokenize(sTextualDS)); } } diff --git a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java index b49d8cd46..78fd8609d 100644 --- a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java +++ b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java @@ -36,12 +36,24 @@ import org.eclipse.emf.common.util.EList; import org.knallgrau.utils.textcat.TextCategorizer; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.Multimap; +import com.google.common.collect.Range; +import com.google.common.collect.RangeMap; +import com.google.common.collect.TreeRangeMap; import com.neovisionaries.i18n.LanguageCode; +import de.hu_berlin.german.korpling.saltnpepper.salt.SaltFactory; +import de.hu_berlin.german.korpling.saltnpepper.salt.graph.Edge; import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.exceptions.SaltTokenizerException; +import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDataSourceSequence; import 
de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDocumentGraph; +import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SSpan; +import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SSpanningRelation; import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.STextualDS; +import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.STextualRelation; import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SToken; +import de.hu_berlin.german.korpling.saltnpepper.salt.saltCore.SAnnotation; /** * The general task of this class is to tokenize a given text in the same order @@ -342,36 +354,135 @@ public EList tokenizeToToken(STextualDS sTextualDS, LanguageCode languag char[] chrText = strInput.toCharArray(); int tokenCntr = 0; + // check if tokens exist for passed span + List tokens = null; + if ((startPos != 0) || (endPos != sTextualDS.getSText().length()) + || (getsDocumentGraph().getSTextualDSs().size() > 1)) { + SDataSourceSequence sequence = SaltFactory.eINSTANCE.createSDataSourceSequence(); + sequence.setSSequentialDS(sTextualDS); + sequence.setSStart(startPos); + sequence.setSEnd(endPos); + tokens = getsDocumentGraph().getSTokensBySequence(sequence); + } else { + tokens = getsDocumentGraph().getSTokens(); + } + + RangeMap oldTokens = null; + // create an organization structure for a tokens interval which + // corresponds to a token + if ((tokens != null) && (tokens.size() != 0)) { + if ((getsDocumentGraph().getSTextualRelations() != null) + && (getsDocumentGraph().getSTextualRelations().size() > 0)) { + oldTokens = TreeRangeMap.create(); + for (STextualRelation rel : getsDocumentGraph().getSTextualRelations()) { + oldTokens.put(Range.closed(rel.getSStart(), rel.getSEnd()), rel.getSToken()); + } + } + } + // a map mapping new created tokens, to old already existing tokens. 
+ // The old tokens should be removed later on and spans should be + // created instead + Multimap old2newToken = ArrayListMultimap.create(); + for (int i = 0; i < chrText.length; i++) { - if ((strTokens.get(tokenCntr).length() < 1) || (strTokens.get(tokenCntr).substring(0, 1).equals(String.valueOf(chrText[i])))) {// first + if ((strTokens.get(tokenCntr).length() < 1) + || (strTokens.get(tokenCntr).substring(0, 1).equals(String.valueOf(chrText[i])))) {// first // letter // matches StringBuffer pattern = new StringBuffer(); - for (int y = 0; y < strTokens.get(tokenCntr).length(); y++) {// compute - // pattern - // in - // text + for (int y = 0; y < strTokens.get(tokenCntr).length(); y++) { + // compute pattern in text pattern.append(chrText[i + y]); }// compute pattern in text - if (strTokens.get(tokenCntr).hashCode() == pattern.toString().hashCode()) {// pattern - // found + if (strTokens.get(tokenCntr).hashCode() == pattern.toString().hashCode()) { + // pattern found int start = i + startPos; int end = i + startPos + strTokens.get(tokenCntr).length(); - if (this.getsDocumentGraph() == null) + if (this.getsDocumentGraph() == null) { throw new SaltTokenizerException("Cannot add tokens to an empty SDocumentGraph object."); + } SToken sTok = this.getsDocumentGraph().createSToken(sTextualDS, start, end); - if (retVal == null) + if (retVal == null) { retVal = new BasicEList(); + } retVal.add(sTok); i = i + strTokens.get(tokenCntr).length() - 1; tokenCntr++; - if (tokenCntr >= strTokens.size()) + if (tokenCntr >= strTokens.size()) { break; + } + + /** + * check, if there is an old token, overlapping the same + * or a bigger span as the currently created one. If + * yes, remove the old one and create a span overlapping + * the new one. 
+ **/ + if (oldTokens != null) { + SToken oldToken = oldTokens.get(start); + if (oldToken != null) { + old2newToken.put(oldToken, sTok); + } + } + }// pattern found }// first letter matches } + + if (old2newToken != null) { + for (SToken oldToken : old2newToken.keySet()) { + // create span for oldToken + EList overlappedTokens = new BasicEList(old2newToken.get(oldToken)); + if (overlappedTokens.size() == 1) { + getsDocumentGraph().removeNode(overlappedTokens.get(0)); + } else { + + SSpan span = getsDocumentGraph().createSSpan(overlappedTokens); + + // move all annotations from old token to span + for (SAnnotation sAnno : oldToken.getSAnnotations()) { + span.addSAnnotation(sAnno); + } + + // redirect all relations to span + List inEdges = new ArrayList(); + for (Edge edge : getsDocumentGraph().getInEdges(oldToken.getSId())) { + inEdges.add(edge); + } + for (Edge edge : inEdges) { + if (edge instanceof SSpanningRelation) { + // in case of edge is a SSpanningRelation remove + // it and create new ones for each token under + // the span + if (edge.getSource() instanceof SSpan) { + SSpan parentSpan = (SSpan) edge.getSource(); + getsDocumentGraph().removeEdge(edge); + for (SToken overlappedToken : overlappedTokens) { + SSpanningRelation rel = SaltFactory.eINSTANCE.createSSpanningRelation(); + rel.setSSource(parentSpan); + rel.setSTarget(overlappedToken); + getsDocumentGraph().addSRelation(rel); + } + } + } else { + edge.setTarget(span); + } + } + List outEdges = new ArrayList(); + for (Edge edge : getsDocumentGraph().getOutEdges(oldToken.getSId())) { + if (!(edge instanceof STextualRelation)) { + outEdges.add(edge); + } + } + for (Edge edge : outEdges) { + edge.setSource(span); + } + getsDocumentGraph().removeNode(oldToken); + } + } + } } return (retVal); } diff --git a/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java 
b/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java index 62129e79d..dc71d819e 100644 --- a/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java +++ b/salt-saltCommon/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tests/SDocumentGraphTest.java @@ -1872,6 +1872,49 @@ else if (textRel.getSTarget().equals(sText1)) assertEquals(6, relTosText2); } + /** + * Tests the tokenization of a pretokenized text and checks if the old tokens are removed correctly. + */ + public void testTokenize2() + { + String text="This is a sample."; + + STextualDS sText1= this.getFixture().createSTextualDS(text); + getFixture().createSToken(sText1, 0, 7); + getFixture().createSToken(sText1, 8, 16); + + this.getFixture().tokenize(); + + assertEquals(5, this.getFixture().getSTokens().size()); + assertEquals(5, this.getFixture().getSTextualRelations().size()); + assertEquals(2, this.getFixture().getSSpans().size()); + } + + /** + * Tests the tokenization of a pretokenized text and checks if the old tokens are removed correctly. Further checks, that + * all annotations are copied to the newly created span node. 
+ */ + public void testTokenize3() + { + String text="This is a sample."; + + STextualDS sText1= this.getFixture().createSTextualDS(text); + SToken tok1= getFixture().createSToken(sText1, 0, 7); + tok1.createSAnnotation(null, "a", "b"); + tok1.createSAnnotation(null, "c", "d"); + SToken tok2= getFixture().createSToken(sText1, 8, 16); + tok2.createSAnnotation(null, "1", "2"); + tok2.createSAnnotation(null, "3", "4"); + + this.getFixture().tokenize(); + + assertEquals(5, this.getFixture().getSTokens().size()); + assertEquals(5, this.getFixture().getSTextualRelations().size()); + assertEquals(2, this.getFixture().getSSpans().size()); + assertEquals(2, this.getFixture().getSSpans().get(0).getSAnnotations().size()); + assertEquals(2, this.getFixture().getSSpans().get(1).getSAnnotations().size()); + } + /** * Tests the '{@link de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDocumentGraph#createTokenizer() Create Tokenizer}' operation. * From 1428a986b80f639e4a38df57063f68c8eebd3e68 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 3 Jun 2015 14:45:51 +0200 Subject: [PATCH 3/7] don't use empty file path for testing the URI, since it causes problems with EMF 2.9 (at least under some configurations) --- .../salt/saltCore/tests/SAbstractAnnotationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java b/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java index 79a61852e..7abc62f0b 100644 --- a/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java +++ b/salt-saltCore/src/test/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCore/tests/SAbstractAnnotationTest.java @@ -387,7 +387,7 @@ public void testGetSValueType_SURI() SDATATYPE dataType= null; dataType= SDATATYPE.SURI; - 
this.getFixture().setSValue(URI.createFileURI("")); + this.getFixture().setSValue(URI.createFileURI(".")); assertEquals(dataType, this.getFixture().getSValueType()); } From dcf08156f0ef8113d5c6249430dc91716a5ae462 Mon Sep 17 00:00:00 2001 From: "f.zipser@gmx.de" Date: Wed, 3 Jun 2015 17:23:05 +0200 Subject: [PATCH 4/7] added more java doc to method tokenize() --- .../sDocumentStructure/tokenizer/Tokenizer.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java index 78fd8609d..5bac7dfa4 100644 --- a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java +++ b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/sDocumentStructure/tokenizer/Tokenizer.java @@ -339,6 +339,11 @@ else if (LanguageCode.de.equals(language)) * The general task of this class is to tokenize a given text in the same * order as the tool TreeTagger will do. A list of tokenized text is * returned with the text anchor (start and end position) in original text. + * If the {@link SDocumentGraph} already contains tokens, the tokens will be preserved, + * if they overlap the same textual range as the new one. Otherwise a {@link SSpan} is + * created corresponding to the existing token. The span then overlaps all new tokens + * and contains all annotations the old token did. In case the span would overlap the same textual + * range as the old token did, no span is created. 
* * @param strInput * original text @@ -386,9 +391,8 @@ public EList tokenizeToToken(STextualDS sTextualDS, LanguageCode languag for (int i = 0; i < chrText.length; i++) { if ((strTokens.get(tokenCntr).length() < 1) - || (strTokens.get(tokenCntr).substring(0, 1).equals(String.valueOf(chrText[i])))) {// first - // letter - // matches + || (strTokens.get(tokenCntr).substring(0, 1).equals(String.valueOf(chrText[i])))) { + // first letter matches StringBuffer pattern = new StringBuffer(); for (int y = 0; y < strTokens.get(tokenCntr).length(); y++) { // compute pattern in text From 7976416153ae25297e4bd93ba54379d0b1fd1e5a Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 22 Jun 2015 11:20:51 +0200 Subject: [PATCH 5/7] copy list instead of using Collections.synchronizedList before iterating over it --- .../saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java index ab57791ff..212bd10f3 100644 --- a/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java +++ b/salt-saltCommon/src/main/java/de/hu_berlin/german/korpling/saltnpepper/salt/saltCommon/impl/SaltProjectImpl.java @@ -68,6 +68,7 @@ import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sCorpusStructure.SDocument; import de.hu_berlin.german.korpling.saltnpepper.salt.saltCommon.sDocumentStructure.SDocumentGraph; import de.hu_berlin.german.korpling.saltnpepper.salt.saltCore.SElementId; +import java.util.LinkedList; /** * @@ -371,12 +372,12 @@ public synchronized void saveSaltProject(URI saltProjectURI) (this.getSCorpusGraphs().size()> 0)) {//store all documents if exist URI sDocumentFileURI= null; - for (SCorpusGraph sCorpusGraph: 
Collections.synchronizedList(this.getSCorpusGraphs())) + for (SCorpusGraph sCorpusGraph: new LinkedList<>(this.getSCorpusGraphs())) { if ( (sCorpusGraph.getSDocuments()!= null) && (sCorpusGraph.getSDocuments().size()> 0)) { - for (SDocument sDocument: Collections.synchronizedList(sCorpusGraph.getSDocuments())) + for (SDocument sDocument: new LinkedList<>(sCorpusGraph.getSDocuments())) { if (sDocument.getSDocumentGraph()!= null) {//only store sDocument, when there is some content in sDocumentGraph corresponding to sDocument From a77bd1caa437ee72be0613eb867d8517e32c0c2b Mon Sep 17 00:00:00 2001 From: "f.zipser@gmx.de" Date: Mon, 22 Jun 2015 11:22:55 +0200 Subject: [PATCH 6/7] fixed a typo in documentation --- salt-doc/src/main/docbkx/salt_modelGuide.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/salt-doc/src/main/docbkx/salt_modelGuide.xml b/salt-doc/src/main/docbkx/salt_modelGuide.xml index 670a44745..479b04c91 100644 --- a/salt-doc/src/main/docbkx/salt_modelGuide.xml +++ b/salt-doc/src/main/docbkx/salt_modelGuide.xml @@ -165,9 +165,9 @@ a layer or another label as shown in . An attribute-value-pair is a triple which consists of a namespace, a name and a value (namespace:name=value). The combination of name and namespace is used to identify a - label and therefore must be unique. The namespace is an optional value, to ditinguish in - case of there are two labels having the same name. For instance a node etc. can have a - label stts:pos=VVFIN as well as the label stts:pos=VVFIN as well as the label pos:VV to annotate it with two part-of-speech annotations from different tagsets.
Label mechanism for graph, node, edge and layer (class diagram) From 8d1c62513a4881648bbbf73934aad0666106b477 Mon Sep 17 00:00:00 2001 From: "f.zipser@gmx.de" Date: Wed, 24 Jun 2015 17:44:53 +0200 Subject: [PATCH 7/7] extended change log --- CHANGELOG.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 16d900740..2b72648c8 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -12,6 +12,10 @@ * artifactId: salt-saltSample * *********************************************************************************** +Version 2.1.1 +============= +* this is just a bugfix release + Version 2.1.0 ============= * updated EMF version from 2.8 to 2.9.1-v20130827-0309