// KrovetzStemmer.hpp
/*==========================================================================
* Copyright (c) 2005 University of Massachusetts. All Rights Reserved.
*
* Use of the Lemur Toolkit for Language Modeling and Information Retrieval
* is subject to the terms of the software license set forth in the LICENSE
* file included with this software, and also available at
* http://www.lemurproject.org/license.html
*
*==========================================================================
*/
// dmf
// C++ thread safe implementation of the Krovetz stemmer.
// requires no external data files.
// 07/29/2005
#ifndef _KROVETZ_STEMMER_H_
#define _KROVETZ_STEMMER_H_
#include <iostream>
#include <cstring>
#if defined(_WIN32) || defined(_WIN64)
#define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS
#include <hash_map>
#elif defined(__clang__)
#include <unordered_map>
#else
// TODO: move this GCC version-check helper macro into a shared configuration header.
#ifndef HAVE_GCC_VERSION
#define HAVE_GCC_VERSION(MAJOR, MINOR) \
(__GNUC__ > (MAJOR) || (__GNUC__ == (MAJOR) && __GNUC_MINOR__ >= (MINOR)))
#endif /* ! HAVE_GCC_VERSION */
#if HAVE_GCC_VERSION(4,3)
// if GCC 4.3+
#include <tr1/unordered_map>
#else
#include <ext/hash_map>
#endif
// GCC 3.3 does not use namespace __gnu_cxx for these containers; GCC 3.4 and later do.
using namespace __gnu_cxx;
#endif
namespace stem {
/*!
  \brief Krovetz stemmer.

  Declares the public stemming entry points, the dictionary table that maps
  a variant spelling to its dictEntry, and the per-instance working state
  (word buffer pointer, indices into it, and a stem cache).

  Instances cannot be copied: the class declares a destructor and holds raw
  pointers (stemCache, word), so the copy operations are declared private
  and left undefined.
*/
class KrovetzStemmer {
public:
  KrovetzStemmer();
  ~KrovetzStemmer();

  /// maximum number of characters in a word to be stemmed.
  static const int MAX_WORD_LENGTH=25;

  /*!
    \brief stem a term using the Krovetz algorithm.
    The stem returned may be longer than the input term.
    May return a pointer to the private attribute stem, so the returned
    value should be copied before calling the method again.
    Performs case normalization on its input argument.
    @param term the term to stem
    @return the stemmed term or the original term if no stemming was
    performed.
  */
  char * kstem_stemmer(char *term);

  /*!
    \brief stem a term using the Krovetz algorithm into the specified
    buffer.
    The stem returned may be longer than the input term.
    Performs case normalization on its input argument.
    @param term the term to stem
    @param buffer the buffer to hold the stemmed term. The buffer should
    hold at least MAX_WORD_LENGTH characters.
    @return the number of characters written to the buffer, including
    the terminating '\\0'. If 0, the caller should use the value in term.
  */
  int kstem_stem_tobuffer(char *term, char *buffer);

  /*!
    \brief Add an entry to the stemmer's dictionary table.
    @param variant the spelling for the entry.
    @param word the stem to use for the variant. If "", the variant
    stems to itself.
    @param exc Is the word an exception to the spelling rules.
  */
  void kstem_add_table_entry(const char* variant, const char* word,
                             bool exc=false);

private:
  // Copying is disabled. The class declares a destructor and holds raw
  // pointers (stemCache, word); a compiler-generated member-wise copy would
  // duplicate the pointers rather than what they refer to.
  // NOTE(review): presumably the destructor releases stemCache, in which
  // case copying an instance would end in a double free -- confirm against
  // the implementation file.
  // Declared private and intentionally left undefined (rather than using
  // "= delete") because this header still carries branches for pre-C++11
  // compilers (ext/hash_map, tr1/unordered_map, stdext::hash_map).
  KrovetzStemmer(const KrovetzStemmer &);
  KrovetzStemmer &operator=(const KrovetzStemmer &);

  /// Dictionary table entry
  typedef struct dictEntry {
    /// is the word an exception to stemming rules?
    bool exception;
    /// stem to use for this entry.
    const char *root;
  } dictEntry;

  /// Two term hashtable entry for caching across calls
  typedef struct cacheEntry {
    /// flag for first or second entry most recently used.
    char flag;
    /// first entry variant
    char word1[MAX_WORD_LENGTH];
    /// first entry stem
    char stem1[MAX_WORD_LENGTH];
    /// second entry variant
    char word2[MAX_WORD_LENGTH];
    /// second entry stem
    char stem2[MAX_WORD_LENGTH];
  } cacheEntry;

  // The helpers below operate on the attribute word.
  bool ends(const char *s, int sufflen);
  void setsuff(const char *str, int length);
  dictEntry *getdep(char *word);
  bool lookup(char *word);
  bool cons(int i);
  bool vowelinstem();
  bool vowel(int i);
  bool doublec(int i);
  void plural();
  void past_tense();
  void aspect();
  void ion_endings();
  void er_and_or_endings();
  void ly_endings();
  void al_endings();
  void ive_endings();
  void ize_endings();
  void ment_endings();
  void ity_endings();
  void ble_endings();
  void ness_endings();
  void ism_endings();
  void ic_endings();
  void ncy_endings();
  void nce_endings();

  // maint.
  void loadTables();

  // The dictionary is keyed by C strings, so every variant of the table
  // type is given a predicate that compares the characters (strcmp) instead
  // of the pointer values.
#if defined(_WIN32)
  struct ltstr {
    bool operator()(const char* s1, const char* s2) const {
      return strcmp(s1, s2) < 0;
    }
  };
  // Visual Studio 7's hash_map provides hash_compare, rather than hash,
  // needing a < predicate, rather than an == predicate.
  typedef stdext::hash_map<const char *, dictEntry, stdext::hash_compare<const char *, ltstr> > dictTable;
#elif defined(__clang__)
  struct eqstr {
    bool operator()(const char* s1, const char* s2) const {
      return strcmp(s1, s2) == 0;
    }
  };
  typedef std::unordered_map<const char *, dictEntry, std::hash<std::string>, eqstr> dictTable;
#else
  struct eqstr {
    bool operator()(const char* s1, const char* s2) const {
      return strcmp(s1, s2) == 0;
    }
  };
#if HAVE_GCC_VERSION(4,3)
  typedef std::tr1::unordered_map<const char *, dictEntry, std::tr1::hash<std::string>, eqstr> dictTable;
#else
  typedef hash_map<const char *, dictEntry, hash<const char *>, eqstr> dictTable;
#endif
#endif

  /// dictionary table: variant spelling -> dictEntry
  dictTable dictEntries;

  // this needs to be a bounded size cache.
  // kstem.cpp uses size 30013 entries.
  cacheEntry *stemCache;
  // size
  int stemhtsize;

  // state
  // k = wordlength - 1
  int k;
  // j is stemlength - 1
  int j;
  // pointer to the output buffer
  char *word;
  // used by kstem_stemmer to return a safe value.
  char stem[MAX_WORD_LENGTH];
};
}  // namespace stem
#endif /* _KROVETZ_STEMMER_H_*/