初始提交

hankcs · Aug 22, 2015 · 1ca7f13 · 1ca7f13
1 parent 86c8480
commit 1ca7f13
Show file tree

Hide file tree

Showing 11 changed files with 1,096 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,9 @@
-# hanlp-solr-plugin
+hanlp-solr-plugin
 ========
 
 HanLP中文分词solr插件
 ----------------------
+
  - 整合方法
  0. 将```hanlp-portable-${version}.jar```和```hanlp-solr-plugin-${version}.jar```共两个jar放入```${webapp}/WEB-INF/lib```下
  0. 修改solr core的配置文件```${core}/conf/schema.xml```：
@@ -15,5 +16,6 @@ HanLP中文分词solr插件
 
  - 配置方法
  HanLP分词器主要通过class path下的```hanlp.properties```进行配置，请阅读HanLP自然语言处理包文档以了解更多。
+ 
  - 版权
-Apache License Version 2.0
+ Apache License Version 2.0
diff --git a/pom.xml b/pom.xml
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.hankcs.nlp</groupId>
+    <artifactId>hanlp-solr-plugin</artifactId>
+    <version>1.0</version>
+
+    <name>hanlp-solr-plugin</name>
+    <url>https://github.com/hankcs/HanLP</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <!-- encoding used when reading source files -->
+
+        <hanlp.version>portable-1.2.4</hanlp.version> <!-- version of the com.hankcs:hanlp dependency -->
+        <lucene.version>5.1.0</lucene.version> <!-- shared by all org.apache.lucene dependencies below -->
+
+        <javac.src.version>1.7</javac.src.version> <!-- Java level handed to maven-compiler-plugin -->
+        <javac.target.version>1.7</javac.target.version> <!-- Java level handed to maven-compiler-plugin -->
+        <maven.compiler.plugin.version>3.3</maven.compiler.plugin.version> <!-- version of maven-compiler-plugin itself -->
+    </properties>
+
+    <dependencies>
+
+        <dependency> <!-- HanLP library; version comes from the hanlp.version property -->
+            <groupId>com.hankcs</groupId>
+            <artifactId>hanlp</artifactId>
+            <version>${hanlp.version}</version>
+        </dependency>
+
+        <dependency> <!-- JUnit, test scope only -->
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.11</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency> <!-- Lucene core; version comes from the lucene.version property -->
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+
+        <dependency> <!-- Lucene query parser; same lucene.version -->
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-queryparser</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+
+        <dependency> <!-- Lucene common analyzers; same lucene.version -->
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <!-- Compiles with the Java source/target levels declared in <properties>. -->
+            <plugin>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>${maven.compiler.plugin.version}</version>
+                <configuration>
+                    <!-- Each option reads its own property; the two were cross-wired before,
+                         which only worked because both properties held the same value. -->
+                    <source>${javac.src.version}</source>
+                    <target>${javac.target.version}</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
diff --git a/src/main/java/com/hankcs/lucene/HanLPAnalyzer.java b/src/main/java/com/hankcs/lucene/HanLPAnalyzer.java
@@ -0,0 +1,46 @@
+package com.hankcs.lucene;
+
+import com.hankcs.hanlp.HanLP;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.util.Set;
+
+/**
+ * Lucene {@link Analyzer} whose token stream comes from a {@link HanLPTokenizer} wrapping a
+ * new HanLP segment created with {@code enableOffset(true)}.
+ */
+public class HanLPAnalyzer extends Analyzer
+{
+
+    /** Whether the tokenizer should stem words (English only). */
+    boolean enablePorterStemming;
+    /** Stop words handed to the tokenizer; may be {@code null}. */
+    public Set<String> filter;
+
+    /**
+     * @param filter               stop words; may be {@code null}
+     * @param enablePorterStemming whether to stem words (English only)
+     */
+    public HanLPAnalyzer(Set<String> filter, boolean enablePorterStemming)
+    {
+        this.filter = filter;
+        // This argument used to be silently discarded, so stemming could never be switched
+        // on together with a stop-word set.
+        this.enablePorterStemming = enablePorterStemming;
+    }
+
+    /**
+     * @param enablePorterStemming whether to stem words, i.e. normalize plural forms and tenses
+     */
+    public HanLPAnalyzer(boolean enablePorterStemming)
+    {
+        this(null, enablePorterStemming);
+    }
+
+    /**
+     * Creates an analyzer with no stop words and stemming disabled.
+     */
+    public HanLPAnalyzer()
+    {
+        this(null, false);
+    }
+
+    /**
+     * Builds the analysis chain for a field: a single {@link HanLPTokenizer} with no further
+     * token filters. The same configuration is used for every field name.
+     *
+     * @param fieldName name of the field being analyzed (not used)
+     * @return components consisting only of the tokenizer
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName)
+    {
+        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableOffset(true), filter, enablePorterStemming);
+        return new TokenStreamComponents(tokenizer);
+    }
+}
diff --git a/src/main/java/com/hankcs/lucene/HanLPIndexAnalyzer.java b/src/main/java/com/hankcs/lucene/HanLPIndexAnalyzer.java
@@ -0,0 +1,43 @@
+package com.hankcs.lucene;
+
+import com.hankcs.hanlp.HanLP;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.util.Set;
+
+/**
+ * Lucene {@link Analyzer} whose token stream comes from a {@link HanLPTokenizer} wrapping a
+ * new HanLP segment created with {@code enableIndexMode(true)}.
+ */
+public class HanLPIndexAnalyzer extends Analyzer
+{
+
+    /** Whether the tokenizer should stem words. */
+    private boolean pstemming;
+    /** Stop words handed to the tokenizer; may be {@code null}. */
+    private Set<String> filter;
+
+    /**
+     * @param filter    stop words; may be {@code null}
+     * @param pstemming whether to stem words
+     */
+    public HanLPIndexAnalyzer(Set<String> filter, boolean pstemming)
+    {
+        this.filter = filter;
+        // This argument used to be silently discarded, so stemming could never be switched
+        // on together with a stop-word set.
+        this.pstemming = pstemming;
+    }
+
+    /**
+     * @param pstemming whether to stem words, i.e. normalize plural forms and tenses
+     */
+    public HanLPIndexAnalyzer(boolean pstemming)
+    {
+        this(null, pstemming);
+    }
+
+    /**
+     * Creates an analyzer with no stop words and stemming disabled.
+     */
+    public HanLPIndexAnalyzer()
+    {
+        this(null, false);
+    }
+
+    /**
+     * Builds the analysis chain for a field: a single {@link HanLPTokenizer} with no further
+     * token filters. The same configuration is used for every field name.
+     *
+     * @param fieldName name of the field being analyzed (not used)
+     * @return components consisting only of the tokenizer
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName)
+    {
+        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableIndexMode(true), filter, pstemming);
+        return new TokenStreamComponents(tokenizer);
+    }
+}
diff --git a/src/main/java/com/hankcs/lucene/HanLPTokenizer.java b/src/main/java/com/hankcs/lucene/HanLPTokenizer.java
@@ -0,0 +1,100 @@
+package com.hankcs.lucene;
+
+
+import com.hankcs.hanlp.corpus.tag.Nature;
+import com.hankcs.hanlp.seg.Segment;
+import com.hankcs.hanlp.seg.common.Term;
+import com.hankcs.hanlp.seg.common.wrapper.SegmentWrapper;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Set;
+
+/**
+ * Lucene {@link Tokenizer} that emits the terms produced by a HanLP {@link Segment}.
+ * The overall structure is adapted from the ansj tokenizer.
+ */
+public class HanLPTokenizer extends Tokenizer
+{
+    // Text of the current token
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    // Start/end offsets of the current token
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    // Position increment of the current token
+    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
+    // Part of speech of the current token
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+    protected SegmentWrapper segment;
+    private final Set<String> filter;
+    private final boolean enablePorterStemming;
+    private final PorterStemmer stemmer = new PorterStemmer();
+
+    /**
+     * @param segment              HanLP segment used to split the input into terms
+     * @param filter               stop words to drop; {@code null} disables filtering
+     * @param enablePorterStemming whether terms tagged {@code Nature.nx} are stemmed
+     */
+    public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming)
+    {
+        super();
+        this.segment = new SegmentWrapper(new BufferedReader(input), segment);
+        this.filter = filter;
+        this.enablePorterStemming = enablePorterStemming;
+    }
+
+    /**
+     * Advances to the next term that is not a stop word and publishes it through the
+     * token attributes.
+     *
+     * @return {@code true} if a token was produced, {@code false} once the segment is exhausted
+     * @throws IOException if reading from the underlying input fails
+     */
+    @Override
+    public final boolean incrementToken() throws IOException
+    {
+        clearAttributes();
+        Term term;
+        int surfaceLength = 0;
+        while ((term = segment.next()) != null)
+        {
+            // Record the length before stemming: stemming replaces term.word, and the end
+            // offset must describe the span in the input text, not the stemmed form.
+            surfaceLength = term.word.length();
+            if (enablePorterStemming && term.nature == Nature.nx)
+            {
+                term.word = stemmer.stem(term.word);
+            }
+            if (filter == null || !filter.contains(term.word))
+            {
+                break;
+            }
+            // Stop word: skip it and fetch the next term.
+        }
+
+        if (term == null)
+        {
+            return false;
+        }
+
+        // Skipped stop words do not widen the increment; every emitted token advances by one.
+        positionAttr.setPositionIncrement(1);
+        termAtt.setEmpty().append(term.word);
+        // correctOffset maps offsets back through any CharFilter wrapped around the input.
+        offsetAtt.setOffset(correctOffset(term.offset), correctOffset(term.offset + surfaceLength));
+        typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
+        return true;
+    }
+
+    /**
+     * Must be overridden; otherwise indexing fails when files are indexed in batch.
+     * Re-binds the segment wrapper to the tokenizer's current input reader.
+     */
+    @Override
+    public void reset() throws IOException
+    {
+        super.reset();
+        segment.reset(new BufferedReader(this.input));
+    }
+
+}
diff --git a/src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java b/src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java
@@ -0,0 +1,27 @@
+package com.hankcs.lucene;
+
+import com.hankcs.hanlp.HanLP;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+
+import java.util.Map;
+
+/**
+ * {@link TokenizerFactory} that creates {@link HanLPTokenizer} instances configured from the
+ * factory arguments {@code enableIndexMode} and {@code enablePorterStemming}.
+ */
+public class HanLPTokenizerFactory extends TokenizerFactory
+{
+    /** Value of the {@code enableIndexMode} argument; {@code true} when absent. */
+    private final boolean enableIndexMode;
+    /** Value of the {@code enablePorterStemming} argument; {@code false} when absent. */
+    private final boolean enablePorterStemming;
+
+    /**
+     * @param args factory arguments, as passed to the {@link TokenizerFactory} constructor
+     */
+    public HanLPTokenizerFactory(Map<String, String> args)
+    {
+        super(args);
+        this.enableIndexMode = getBoolean(args, "enableIndexMode", true);
+        this.enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
+    }
+
+    /**
+     * Creates a tokenizer over a new HanLP segment, with no stop-word set.
+     *
+     * @param factory attribute factory supplied by the caller (not used)
+     * @return a new {@link HanLPTokenizer}
+     */
+    @Override
+    public Tokenizer create(AttributeFactory factory)
+    {
+        return new HanLPTokenizer(
+                HanLP.newSegment().enableOffset(true).enableIndexMode(this.enableIndexMode),
+                null,
+                this.enablePorterStemming);
+    }
+}