
Commit

added normalizer

jimmycallin committed Jan 3, 2015
1 parent 2eb679c commit b56fab1
Showing 3 changed files with 22 additions and 4 deletions.
5 changes: 5 additions & 0 deletions bin/plainstream
@@ -16,6 +16,10 @@ parser.add_argument('--tokenize',
                     help="Tokenize the text and output one word per\
                           line, sentences split with newline.",
                     default=False)
+parser.add_argument('--normalize',
+                    action="store_true",
+                    help="Convert to lower case",
+                    default=False)
 parser.add_argument('--train_sentence_tokenizer',
                     action="store_true",
                     help="For non-English text it might raise the quality to \
@@ -30,6 +34,7 @@ try:
             max_bytes=args.bytes,
             max_words=args.words,
             tokenize=args.tokenize,
+            normalize=args.normalize,
             train_sentence_tokenizer=args.train_sentence_tokenizer):
     if args.tokenize:
         for word in sentence:
16 changes: 13 additions & 3 deletions plainstream/plainstream.py
@@ -14,7 +14,13 @@

 tokenizer = {}

-def get_text(language, max_bytes=None, max_words=None, tokenize=False, train_sentence_tokenizer=False, output='raw'):
+def get_text(language,
+             max_bytes=None,
+             max_words=None,
+             tokenize=False,
+             normalize=False,
+             train_sentence_tokenizer=False,
+             output='raw'):
     """
     Returns a generator iteratively downloading text from Wikipedia dumps.
     Parameters:
@@ -23,6 +29,7 @@ def get_text(language, max_bytes=None, max_words=None, tokenize=False, train_sentence_tokenizer=False, output='raw'):
         max_words: Maximum number of words to download. Currently only works for languages with space-separated words.
         tokenize: Tokenize the text into word units. Currently uses the Penn Trebank tokenizer and Punkt sentence segmenter,
                   and requires you to have NLTK installed. Outputted as one word per line, with two line breaks
+        normalize: Convert text to lower case.
     """
     if language not in available_languages:
         raise RuntimeError("Language not supported.")
@@ -32,7 +39,7 @@ def get_text(language, max_bytes=None, max_words=None, tokenize=False, train_sentence_tokenizer=False, output='raw'):
     if train_sentence_tokenizer:
         train_text = get_text(language, max_words=10000, tokenize=False, train_sentence_tokenizer=False, output='plaintext')
         if language not in tokenizer:
-            tokenizer[language] = Tokenizer(language, train_text_gen=train_text)
+            tokenizer[language] = Tokenizer(language, normalize=normalize, train_text_gen=train_text)

     dump_url = "http://dumps.wikimedia.org/{}wiki/latest/{}wiki-latest-pages-articles.xml.bz2".format(language, language)
     req = requests.get(dump_url, stream=True)
@@ -59,4 +66,7 @@ def get_text(language, max_bytes=None, max_words=None, tokenize=False, train_sentence_tokenizer=False, output='raw'):
                 return
             nbytes += sys.getsizeof(line)
             nwords += len(line.split(" "))
-            yield line
+            if normalize:
+                yield line.lower()
+            else:
+                yield line
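
For reference, a minimal sketch of how the new keyword might be called from Python. The import path and the 'en' language code are assumptions (the diff shows only get_text's signature, not how the package exposes it or which languages are in available_languages), and the call streams from a live Wikipedia dump:

    from plainstream.plainstream import get_text

    # With normalize=True and tokenize=False, each yielded line is lower-cased
    # before it is returned (see the new if/else around yield above).
    for line in get_text('en', max_words=100, normalize=True):
        print(line)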
5 changes: 4 additions & 1 deletion plainstream/tokenizer.py
@@ -4,7 +4,7 @@

 class Tokenizer(object):

-    def __init__(self, language, train_text_gen=None):
+    def __init__(self, language, normalize=False, train_text_gen=None):
         """
         A tokenizer using NLTK Penn Treebank tokenizer, and the Punkt sentence tokenizer.
         Params:
@@ -13,6 +13,7 @@ def __init__(self, language, train_text_gen=None):
         """
         self.language = language
         self.train_text_gen = train_text_gen
+        self.normalize = normalize

         if train_text_gen:
             self.sent_tokenizer = self._train_sentence_tokenizer()
@@ -27,6 +28,8 @@ def tokenize(self, text):
         for sentence in self.sent_tokenizer.tokenize(text):
             tokenized_sentence = []
             for word in word_tokenize(sentence):
+                if self.normalize:
+                    word = word.lower()
                 tokenized_sentence.append(word)
             tokenized.append(tokenized_sentence)

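
To illustrate the tokenizer-side normalization in isolation, a small sketch using NLTK directly (word_tokenize is what the diff calls; the example sentence is invented here, and the 'punkt' data must be downloaded):

    from nltk.tokenize import word_tokenize  # requires the NLTK 'punkt' resource

    normalize = True
    sentence = "The Quick Brown Fox jumps over the Lazy Dog."
    # Mirrors the new branch in Tokenizer.tokenize: lower-case each word when normalize is set.
    tokens = [w.lower() if normalize else w for w in word_tokenize(sentence)]
    print(tokens)  # ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']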
