Skip to content

Commit

Permalink
初始提交
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed Aug 22, 2015
1 parent 86c8480 commit 1ca7f13
Show file tree
Hide file tree
Showing 11 changed files with 1,096 additions and 2 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# hanlp-solr-plugin
hanlp-solr-plugin
========

HanLP中文分词solr插件
----------------------

- 整合方法
0. 将```hanlp-portable-${version}.jar```和```hanlp-solr-plugin-${version}.jar```共两个jar放入```${webapp}/WEB-INF/lib```下
0. 修改solr core的配置文件```${core}/conf/schema.xml```
Expand All @@ -15,5 +16,6 @@ HanLP中文分词solr插件
- 配置方法
HanLP分词器主要通过class path下的```hanlp.properties```进行配置,请阅读HanLP自然语言处理包文档以了解更多。
- 版权
Apache License Version 2.0
Apache License Version 2.0
72 changes: 72 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the HanLP Lucene/Solr analysis plugin. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.hankcs.nlp</groupId>
    <artifactId>hanlp-solr-plugin</artifactId>
    <version>1.0</version>

    <name>hanlp-solr-plugin</name>
    <url>https://github.com/hankcs/HanLP</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

        <!-- Versions of the libraries this plugin is compiled against. -->
        <hanlp.version>portable-1.2.4</hanlp.version>
        <lucene.version>5.1.0</lucene.version>

        <!-- Java language level accepted (src) and bytecode level emitted (target). -->
        <javac.src.version>1.7</javac.src.version>
        <javac.target.version>1.7</javac.target.version>
        <maven.compiler.plugin.version>3.3</maven.compiler.plugin.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <!-- source/target were previously wired to each other's property;
                         each now reads the property named after it. -->
                    <source>${javac.src.version}</source>
                    <target>${javac.target.version}</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>
46 changes: 46 additions & 0 deletions src/main/java/com/hankcs/lucene/HanLPAnalyzer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

import java.util.Set;

/**
 * Lucene {@link Analyzer} whose token stream is produced by a {@link HanLPTokenizer}
 * running a HanLP segmenter with offsets enabled.
 */
public class HanLPAnalyzer extends Analyzer
{

    /** Whether the tokenizer should apply Porter stemming; defaults to {@code false}. */
    boolean enablePorterStemming;
    /** Stop words passed to the tokenizer; {@code null} when no filtering is requested. */
    public Set<String> filter;

    /**
     * @param filter               stop words
     * @param enablePorterStemming whether to reduce words to their stems (English only)
     */
    public HanLPAnalyzer(Set<String> filter, boolean enablePorterStemming)
    {
        this.filter = filter;
        // Previously this argument was accepted but never stored, so stemming
        // stayed disabled whenever a stop-word set was supplied.
        this.enablePorterStemming = enablePorterStemming;
    }

    /**
     * @param enablePorterStemming whether to reduce words to their stems
     *                             (normalizes singular/plural forms and tenses)
     */
    public HanLPAnalyzer(boolean enablePorterStemming)
    {
        this.enablePorterStemming = enablePorterStemming;
    }

    /**
     * Creates an analyzer with no stop words and stemming disabled.
     */
    public HanLPAnalyzer()
    {
        super();
    }

    /**
     * Overrides the Analyzer hook to build the tokenizing component.
     *
     * @param fieldName name of the field being analyzed; not used here
     * @return components wrapping a new {@link HanLPTokenizer}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName)
    {
        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableOffset(true), filter, enablePorterStemming);
        return new TokenStreamComponents(tokenizer);
    }
}
43 changes: 43 additions & 0 deletions src/main/java/com/hankcs/lucene/HanLPIndexAnalyzer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

import java.util.Set;

/**
 * Lucene {@link Analyzer} whose token stream is produced by a {@link HanLPTokenizer}
 * running a HanLP segmenter with index mode enabled.
 */
public class HanLPIndexAnalyzer extends Analyzer
{

    /** Whether the tokenizer should apply Porter stemming; defaults to {@code false}. */
    private boolean pstemming;
    /** Stop words passed to the tokenizer; {@code null} when no filtering is requested. */
    private Set<String> filter;

    /**
     * @param filter    stop words
     * @param pstemming whether to reduce words to their stems
     */
    public HanLPIndexAnalyzer(Set<String> filter, boolean pstemming)
    {
        this.filter = filter;
        // Previously this argument was accepted but never stored, so stemming
        // stayed disabled whenever a stop-word set was supplied.
        this.pstemming = pstemming;
    }

    /**
     * @param pstemming whether to reduce words to their stems
     *                  (normalizes singular/plural forms and tenses)
     */
    public HanLPIndexAnalyzer(boolean pstemming)
    {
        this.pstemming = pstemming;
    }

    /**
     * Creates an analyzer with no stop words and stemming disabled.
     */
    public HanLPIndexAnalyzer()
    {
        super();
    }

    /**
     * Builds the tokenizing component for a field.
     *
     * @param fieldName name of the field being analyzed; not used here
     * @return components wrapping a new {@link HanLPTokenizer}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName)
    {
        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableIndexMode(true), filter, pstemming);
        return new TokenStreamComponents(tokenizer);
    }
}
100 changes: 100 additions & 0 deletions src/main/java/com/hankcs/lucene/HanLPTokenizer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package com.hankcs.lucene;


import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.seg.common.wrapper.SegmentWrapper;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Set;

/**
 * Lucene {@link Tokenizer} that emits the terms produced by a HanLP {@link Segment}.
 * Borrowed from the ansj tokenizer.
 */
public class HanLPTokenizer extends Tokenizer
{
    // Text of the current term
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // Start/end offsets of the current term
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    // Position increment relative to the previously emitted term
    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
    // Part of speech of the current term
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    protected SegmentWrapper segment;
    private Set<String> filter;
    private boolean enablePorterStemming;
    private final PorterStemmer stemmer = new PorterStemmer();

    /**
     * @param segment              HanLP segmenter used to split the input into terms
     * @param filter               stop words to drop from the stream; may be {@code null}
     * @param enablePorterStemming whether terms tagged {@link Nature#nx} are replaced by their stem
     */
    public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming)
    {
        super();
        this.segment = new SegmentWrapper(new BufferedReader(input), segment);
        this.filter = filter;
        this.enablePorterStemming = enablePorterStemming;
    }

    /**
     * Advances to the next term that is not a stop word and publishes it through the attributes.
     *
     * @return {@code true} if a term was emitted, {@code false} once the segmenter is exhausted
     * @throws IOException if reading from the underlying input fails
     */
    @Override
    final public boolean incrementToken() throws IOException
    {
        clearAttributes();
        Term term;
        while ((term = segment.next()) != null)
        {
            // Remember the length of the word as it appears in the input: stemming may
            // shorten term.word, and the end offset must still cover the original text.
            final int surfaceLength = term.word.length();
            if (enablePorterStemming && term.nature == Nature.nx)
            {
                term.word = stemmer.stem(term.word);
            }

            if (filter != null && filter.contains(term.word))
            {
                // Stop word: skip it and look at the next term.
                continue;
            }

            // Skipped stop words do not widen the gap; every emitted term advances by one position.
            positionAttr.setPositionIncrement(1);
            termAtt.setEmpty().append(term.word);
            // correctOffset maps the offsets back through any CharFilter wrapped around the input.
            offsetAtt.setOffset(correctOffset(term.offset), correctOffset(term.offset + surfaceLength));
            typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
            return true;
        }
        return false;
    }

    /**
     * Must be overridden; otherwise indexing fails when files are indexed in batch.
     * Re-binds the segmenter to the tokenizer's current input.
     */
    @Override
    public void reset() throws IOException
    {
        super.reset();
        segment.reset(new BufferedReader(this.input));
    }

}
27 changes: 27 additions & 0 deletions src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.util.Map;

/**
 * {@link TokenizerFactory} that creates {@link HanLPTokenizer} instances, configured
 * through the {@code enableIndexMode} and {@code enablePorterStemming} arguments.
 */
public class HanLPTokenizerFactory extends TokenizerFactory
{
    /** Argument switching the segmenter's index mode; defaults to {@code true}. */
    private static final String ARG_INDEX_MODE = "enableIndexMode";
    /** Argument switching Porter stemming in the tokenizer; defaults to {@code false}. */
    private static final String ARG_PORTER_STEMMING = "enablePorterStemming";

    private boolean enableIndexMode;
    private boolean enablePorterStemming;

    /**
     * @param args factory arguments; the two recognized keys are read via {@code getBoolean}
     */
    public HanLPTokenizerFactory(Map<String, String> args)
    {
        super(args);
        this.enableIndexMode = getBoolean(args, ARG_INDEX_MODE, true);
        this.enablePorterStemming = getBoolean(args, ARG_PORTER_STEMMING, false);
    }

    /**
     * Builds a tokenizer over a fresh HanLP segmenter with offsets enabled and no stop words.
     *
     * @param factory attribute factory supplied by the caller; not used by this implementation
     * @return a new {@link HanLPTokenizer}
     */
    @Override
    public Tokenizer create(AttributeFactory factory)
    {
        return new HanLPTokenizer(
                HanLP.newSegment()
                        .enableOffset(true)
                        .enableIndexMode(this.enableIndexMode),
                null,
                this.enablePorterStemming);
    }
}
Loading

0 comments on commit 1ca7f13

Please sign in to comment.