-
Notifications
You must be signed in to change notification settings - Fork 99
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
1,096 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the HanLP Lucene/Solr analysis plugin. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hankcs.nlp</groupId>
    <artifactId>hanlp-solr-plugin</artifactId>
    <version>1.0</version>

    <name>hanlp-solr-plugin</name>
    <url>https://github.com/hankcs/HanLP</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

        <!-- Versions of the libraries this plugin is compiled against. -->
        <hanlp.version>portable-1.2.4</hanlp.version>
        <lucene.version>5.1.0</lucene.version>

        <!-- Java language level accepted (src) and bytecode level emitted (target). -->
        <javac.src.version>1.7</javac.src.version>
        <javac.target.version>1.7</javac.target.version>
        <maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <!-- source/target were previously wired to each other's property;
                         each element now reads the property named after it. -->
                    <source>${javac.src.version}</source>
                    <target>${javac.target.version}</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
package com.hankcs.lucene; | ||
|
||
import com.hankcs.hanlp.HanLP; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
|
||
import java.util.Set; | ||
|
||
public class HanLPAnalyzer extends Analyzer | ||
{ | ||
|
||
boolean enablePorterStemming; | ||
public Set<String> filter; | ||
|
||
/** | ||
* @param filter 停用词 | ||
* @param enablePorterStemming 是否分析词干(仅限英文) | ||
*/ | ||
public HanLPAnalyzer(Set<String> filter, boolean enablePorterStemming) | ||
{ | ||
this.filter = filter; | ||
} | ||
|
||
/** | ||
* @param enablePorterStemming 是否分析词干.进行单复数,时态的转换 | ||
*/ | ||
public HanLPAnalyzer(boolean enablePorterStemming) | ||
{ | ||
this.enablePorterStemming = enablePorterStemming; | ||
} | ||
|
||
public HanLPAnalyzer() | ||
{ | ||
super(); | ||
} | ||
|
||
/** | ||
* 重载Analyzer接口,构造分词组件 | ||
*/ | ||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) | ||
{ | ||
Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableOffset(true), filter, enablePorterStemming); | ||
return new TokenStreamComponents(tokenizer); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
package com.hankcs.lucene; | ||
|
||
import com.hankcs.hanlp.HanLP; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
|
||
import java.util.Set; | ||
|
||
public class HanLPIndexAnalyzer extends Analyzer | ||
{ | ||
|
||
private boolean pstemming; | ||
private Set<String> filter; | ||
|
||
/** | ||
* @param filter 停用词 | ||
* @param pstemming 是否分析词干 | ||
*/ | ||
public HanLPIndexAnalyzer(Set<String> filter, boolean pstemming) | ||
{ | ||
this.filter = filter; | ||
} | ||
|
||
/** | ||
* @param pstemming 是否分析词干.进行单复数,时态的转换 | ||
*/ | ||
public HanLPIndexAnalyzer(boolean pstemming) | ||
{ | ||
this.pstemming = pstemming; | ||
} | ||
|
||
public HanLPIndexAnalyzer() | ||
{ | ||
super(); | ||
} | ||
|
||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) | ||
{ | ||
Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableIndexMode(true), filter, pstemming); | ||
return new TokenStreamComponents(tokenizer); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
package com.hankcs.lucene; | ||
|
||
|
||
import com.hankcs.hanlp.corpus.tag.Nature; | ||
import com.hankcs.hanlp.seg.Segment; | ||
import com.hankcs.hanlp.seg.common.Term; | ||
import com.hankcs.hanlp.seg.common.wrapper.SegmentWrapper; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.IOException; | ||
import java.util.Set; | ||
|
||
/** | ||
* Tokenizer,抄袭ansj的 | ||
*/ | ||
public class HanLPTokenizer extends Tokenizer | ||
{ | ||
// 当前词 | ||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
// 偏移量 | ||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); | ||
// 距离 | ||
private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class); | ||
// 词性 | ||
private TypeAttribute typeAtt = addAttribute(TypeAttribute.class); | ||
|
||
protected SegmentWrapper segment; | ||
private Set<String> filter; | ||
private boolean enablePorterStemming; | ||
private final PorterStemmer stemmer = new PorterStemmer(); | ||
|
||
public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming) | ||
{ | ||
super(); | ||
this.segment = new SegmentWrapper(new BufferedReader(input), segment); | ||
this.filter = filter; | ||
this.enablePorterStemming = enablePorterStemming; | ||
} | ||
|
||
@Override | ||
final public boolean incrementToken() throws IOException | ||
{ | ||
clearAttributes(); | ||
int position = 0; | ||
Term term; | ||
boolean un_increased = true; | ||
do | ||
{ | ||
term = segment.next(); | ||
if (term == null) | ||
{ | ||
break; | ||
} | ||
if (enablePorterStemming && term.nature == Nature.nx) | ||
{ | ||
term.word = stemmer.stem(term.word); | ||
} | ||
|
||
if (filter != null && filter.contains(term.word)) | ||
{ | ||
continue; | ||
} | ||
else | ||
{ | ||
++position; | ||
un_increased = false; | ||
} | ||
} | ||
while (un_increased); | ||
|
||
if (term != null) | ||
{ | ||
positionAttr.setPositionIncrement(position); | ||
termAtt.setEmpty().append(term.word); | ||
offsetAtt.setOffset(term.offset, term.offset + term.word.length()); | ||
typeAtt.setType(term.nature == null ? "null" : term.nature.toString()); | ||
return true; | ||
} | ||
else | ||
{ | ||
return false; | ||
} | ||
} | ||
|
||
/** | ||
* 必须重载的方法,否则在批量索引文件时将会导致文件索引失败 | ||
*/ | ||
@Override | ||
public void reset() throws IOException | ||
{ | ||
super.reset(); | ||
segment.reset(new BufferedReader(this.input)); | ||
} | ||
|
||
} |
27 changes: 27 additions & 0 deletions
27
src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package com.hankcs.lucene; | ||
|
||
import com.hankcs.hanlp.HanLP; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.util.TokenizerFactory; | ||
import org.apache.lucene.util.AttributeFactory; | ||
|
||
import java.util.Map; | ||
|
||
public class HanLPTokenizerFactory extends TokenizerFactory | ||
{ | ||
private boolean enableIndexMode; | ||
private boolean enablePorterStemming; | ||
|
||
public HanLPTokenizerFactory(Map<String, String> args) | ||
{ | ||
super(args); | ||
enableIndexMode = getBoolean(args, "enableIndexMode", true); | ||
enablePorterStemming = getBoolean(args, "enablePorterStemming", false); | ||
} | ||
|
||
@Override | ||
public Tokenizer create(AttributeFactory factory) | ||
{ | ||
return new HanLPTokenizer(HanLP.newSegment().enableOffset(true).enableIndexMode(enableIndexMode), null, enablePorterStemming); | ||
} | ||
} |
Oops, something went wrong.