Skip to content

Commit

Permalink
Use word_tokenize in combo with TreebankWordDetokenizer. Other small …
Browse files Browse the repository at this point in the history
…changes too (#21)

* Use str.capitalize() in first_token_case and add a "capitalize" out-of-vocabulary option

* refactor nominator to numerator (more descriptive)

* Use word_tokenize instead of TweetTokenizer to split tokens, and TreebankWordDetokenizer to join them

* Add test cases

* Update __init__.py

Co-authored-by: daltonfury42 <daltonfury42@users.noreply.github.com>
  • Loading branch information
keshprad and daltonfury42 authored Jul 2, 2021
1 parent 2dfa90b commit 825a6db
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 22 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,7 @@ fabric.properties
.pytest_cache

build/
.eggs/
.eggs/

# virtual env
.env/
20 changes: 18 additions & 2 deletions tests/test_truecase.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,30 @@ def setUp(self):
def test_get_true_case(self):
sentence = "I live in barcelona."
expected = "I live in Barcelona."

result = self.tc.get_true_case(sentence)

assert result == expected

sentence = "My name is irvine wels"
expected = "My name is Irvine Wels"
result = self.tc.get_true_case(sentence)
assert result == expected

sentence = "i paid $50 FOR My shoes."
expected = "I paid $50 for my shoes."
result = self.tc.get_true_case(sentence)
assert result == expected

sentence = "Ron'S show Is a big Hit."
expected = "Ron's show is a big hit."
result = self.tc.get_true_case(sentence)
assert result == expected

sentence = "What Is Your name?"
expected = "What is your name?"
result = self.tc.get_true_case(sentence)
assert result == expected

sentence = "at The moment, I AM getting ready for work!"
expected = "At the moment, I am getting ready for work!"
result = self.tc.get_true_case(sentence)
assert result == expected
36 changes: 18 additions & 18 deletions truecase/TrueCaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import string

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer


class TrueCaser(object):
Expand All @@ -22,24 +23,24 @@ def __init__(self, dist_file_path=None):
self.forward_bi_dist = pickle_dict["forward_bi_dist"]
self.trigram_dist = pickle_dict["trigram_dist"]
self.word_casing_lookup = pickle_dict["word_casing_lookup"]
self.tknzr = TweetTokenizer()
self.detknzr = TreebankWordDetokenizer()

def get_score(self, prev_token, possible_token, next_token):
pseudo_count = 5.0

# Get Unigram Score
nominator = self.uni_dist[possible_token] + pseudo_count
numerator = self.uni_dist[possible_token] + pseudo_count
denominator = 0
for alternativeToken in self.word_casing_lookup[
possible_token.lower()]:
denominator += self.uni_dist[alternativeToken] + pseudo_count

unigram_score = nominator / denominator
unigram_score = numerator / denominator

# Get Backward Score
bigram_backward_score = 1
if prev_token is not None:
nominator = (
numerator = (
self.backward_bi_dist[prev_token + "_" + possible_token] +
pseudo_count)
denominator = 0
Expand All @@ -49,13 +50,13 @@ def get_score(self, prev_token, possible_token, next_token):
alternativeToken] +
pseudo_count)

bigram_backward_score = nominator / denominator
bigram_backward_score = numerator / denominator

# Get Forward Score
bigram_forward_score = 1
if next_token is not None:
next_token = next_token.lower() # Ensure it is lower case
nominator = (
numerator = (
self.forward_bi_dist[possible_token + "_" + next_token] +
pseudo_count)
denominator = 0
Expand All @@ -65,13 +66,13 @@ def get_score(self, prev_token, possible_token, next_token):
self.forward_bi_dist[alternativeToken + "_" + next_token] +
pseudo_count)

bigram_forward_score = nominator / denominator
bigram_forward_score = numerator / denominator

# Get Trigram Score
trigram_score = 1
if prev_token is not None and next_token is not None:
next_token = next_token.lower() # Ensure it is lower case
nominator = (self.trigram_dist[prev_token + "_" + possible_token +
numerator = (self.trigram_dist[prev_token + "_" + possible_token +
"_" + next_token] + pseudo_count)
denominator = 0
for alternativeToken in self.word_casing_lookup[
Expand All @@ -80,15 +81,15 @@ def get_score(self, prev_token, possible_token, next_token):
self.trigram_dist[prev_token + "_" + alternativeToken +
"_" + next_token] + pseudo_count)

trigram_score = nominator / denominator
trigram_score = numerator / denominator

result = (math.log(unigram_score) + math.log(bigram_backward_score) +
math.log(bigram_forward_score) + math.log(trigram_score))

return result

def first_token_case(self, raw):
return f'{raw[0].upper()}{raw[1:]}'
return raw.capitalize()

def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
""" Returns the true case for the passed tokens.
Expand All @@ -99,7 +100,7 @@ def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
lower: Returns OOV tokens in lower case
as-is: Returns OOV tokens as is
"""
tokens = self.tknzr.tokenize(sentence)
tokens = word_tokenize(sentence)

tokens_true_case = []
for token_idx, token in enumerate(tokens):
Expand Down Expand Up @@ -132,21 +133,20 @@ def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
tokens_true_case.append(best_token)

if token_idx == 0:
tokens_true_case[0] = self.first_token_case(tokens_true_case[0])
tokens_true_case[0] = self.first_token_case(
tokens_true_case[0])

else: # Token out of vocabulary
if out_of_vocabulary_token_option == "title":
tokens_true_case.append(token.title())
elif out_of_vocabulary_token_option == "capitalize":
tokens_true_case.append(token.capitalize())
elif out_of_vocabulary_token_option == "lower":
tokens_true_case.append(token.lower())
else:
tokens_true_case.append(token)

return "".join([
" " +
i if not i.startswith("'") and i not in string.punctuation else i
for i in tokens_true_case
]).strip()
return self.detknzr.detokenize(tokens_true_case)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion truecase/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .TrueCaser import TrueCaser

__version__ = "0.0.12"
__version__ = "0.0.13"


@lru_cache(maxsize=1)
Expand Down

0 comments on commit 825a6db

Please sign in to comment.