Skip to content

Commit

Permalink
Change dicts .txt to .marisa format
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexKly committed Oct 29, 2022
1 parent deabe05 commit bc3a24a
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 29 deletions.
Binary file added russian_uncensor/data/ngrams/bi_grams.marisa
Binary file not shown.
Binary file added russian_uncensor/data/ngrams/freq_letters.marisa
Binary file not shown.
Binary file added russian_uncensor/data/ngrams/tri_grams.marisa
Binary file not shown.
Binary file added russian_uncensor/data/obscene_words.marisa
Binary file not shown.
39 changes: 29 additions & 10 deletions russian_uncensor/n_grams.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,30 @@


class WordStats:
def __init__(self, dict_path=None, neg_words_fn=None, freq_letters_fn=None, bigrams_fn=None, trigrams_fn=None, debug=False):
def __init__(self, dict_path=None, neg_words_fn=None, freq_letters_fn=None, bigrams_fn=None, trigrams_fn=None,
ext='.marisa', debug=False):
""" Init WordStats class.
:param dict_path: common path to data files dir (str).
:param neg_words_fn: obscene components file name (str).
:param freq_letters_fn: frequent letters used in obscene words file name (str).
:param bigrams_fn: frequent bi-grams used in obscene words file name (str).
:param trigrams_fn: frequent tri-grams used in obscene words file name (str).
:param ext: extension of output files, either '.txt' or '.marisa' (str).
:param debug: turn on instruments for debug (bool).
:return:
"""
# File paths:
self.dict_path = Path.joinpath(path_current_file, Path('data')) if dict_path is None else dict_path
print(self.dict_path)
self.neg_words_filename = self.dict_path/'obscene_words.txt' if neg_words_fn is None else self.dict_path/neg_words_fn
self.frequent_letters_filename = self.dict_path/'ngrams/freq_letters.txt' if freq_letters_fn is None else self.dict_path/freq_letters_fn
self.bi_grams_filename = self.dict_path/'ngrams/bi_grams.txt' if bigrams_fn is None else self.dict_path/bigrams_fn
self.tri_grams_filename = self.dict_path/'ngrams/tri_grams.txt' if trigrams_fn is None else self.dict_path/trigrams_fn
self.ext = ext
self.neg_words_filename = self.dict_path/f'obscene_words.marisa' \
if neg_words_fn is None else self.dict_path/neg_words_fn
self.frequent_letters_filename = self.dict_path/f'ngrams/freq_letters{self.ext}' \
if freq_letters_fn is None else self.dict_path/freq_letters_fn
self.bi_grams_filename = self.dict_path/f'ngrams/bi_grams{self.ext}' \
if bigrams_fn is None else self.dict_path/bigrams_fn
self.tri_grams_filename = self.dict_path/f'ngrams/tri_grams{self.ext}' \
if trigrams_fn is None else self.dict_path/trigrams_fn
# Create dir (if it doesn't exist)
if not os.path.exists(path=self.dict_path/'ngrams'):
os.mkdir(path=self.dict_path/'ngrams')
Expand All @@ -39,7 +45,10 @@ def frequent_letters_stat(self):
:return: Counter of the frequent letters in obscene words (dict).
"""
frequent_letters_cnt = Counter()
neg_words = marisa_trie.Trie(rd_wr_module(self.neg_words_filename))
if str(self.neg_words_filename)[-4:] == '.txt':
neg_words = marisa_trie.Trie(rd_wr_module(self.neg_words_filename))
elif str(self.neg_words_filename)[-7:] == '.marisa':
neg_words = marisa_trie.Trie().load(self.neg_words_filename)
for word in neg_words:
for letter in word:
if letter in self.ru_alphabet:
Expand All @@ -55,7 +64,10 @@ def bi_grams_stat(self):
:return: Counter of the bi-grams in obscene words (dict).
"""
bigrams_cnt = Counter()
neg_words = marisa_trie.Trie(rd_wr_module(self.neg_words_filename))
if str(self.neg_words_filename)[-4:] == '.txt':
neg_words = marisa_trie.Trie(rd_wr_module(self.neg_words_filename))
elif str(self.neg_words_filename)[-7:] == '.marisa':
neg_words = marisa_trie.Trie().load(self.neg_words_filename)
for word in neg_words:
for i in range(len(word) - 1):
if word[i] in self.ru_alphabet and word[i + 1] in self.ru_alphabet:
Expand All @@ -71,7 +83,10 @@ def tri_grams_stat(self):
:return: Counter of the tri-grams in obscene words (dict).
"""
trigrams_cnt = Counter()
neg_words = marisa_trie.Trie(rd_wr_module(self.neg_words_filename))
if str(self.neg_words_filename)[-4:] == '.txt':
neg_words = marisa_trie.Trie(rd_wr_module(self.neg_words_filename))
elif str(self.neg_words_filename)[-7:] == '.marisa':
neg_words = marisa_trie.Trie().load(self.neg_words_filename)
for word in neg_words:
for i in range(len(word) - 2):
if word[i] in self.ru_alphabet and word[i + 1] in self.ru_alphabet and word[i + 2] in self.ru_alphabet:
Expand Down Expand Up @@ -99,4 +114,8 @@ def save_n_grams(self):
for group in zip(n_grams, filenames):
if self.debug:
print(f'Filename: {group[1]} Content: {group[0]}')
rd_wr_module(path_dict=group[1], input_dict=group[0], mode='w')
if self.ext == '.txt':
rd_wr_module(path_dict=group[1], input_dict=group[0], mode='w')
elif self.ext == '.marisa':
marisa_trie.Trie(group[0]).save(group[1])

24 changes: 18 additions & 6 deletions russian_uncensor/uncensored.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,25 @@ def __init__(self, dict_path=None, freq_letter_fn=None, bi_grams_fn=None, tri_gr
"""
# Paths:
self.dict_path = Path.joinpath(path_current_file, Path('data')) if dict_path is None else dict_path
self.freq_letters_fn = self.dict_path/'ngrams/freq_letters.txt' if freq_letter_fn is None else self.dict_path/freq_letter_fn
self.bi_grams_fn = self.dict_path/'ngrams/bi_grams.txt' if bi_grams_fn is None else self.dict_path/bi_grams_fn
self.tri_grams_fn = self.dict_path/'ngrams/tri_grams.txt' if tri_grams_fn is None else self.dict_path/tri_grams_fn
self.freq_letters_fn = self.dict_path/'ngrams/freq_letters.marisa' \
if freq_letter_fn is None else self.dict_path/freq_letter_fn
self.bi_grams_fn = self.dict_path/'ngrams/bi_grams.marisa' \
if bi_grams_fn is None else self.dict_path/bi_grams_fn
self.tri_grams_fn = self.dict_path/'ngrams/tri_grams.marisa' \
if tri_grams_fn is None else self.dict_path/tri_grams_fn
# Dictionaries:
self.freq_letters = marisa_trie.Trie(rd_wr_module(path_dict=self.freq_letters_fn))
self.bi_grams = marisa_trie.Trie(rd_wr_module(path_dict=self.bi_grams_fn))
self.tri_grams = marisa_trie.Trie(rd_wr_module(path_dict=self.tri_grams_fn))
if str(self.freq_letters_fn)[-4:] == '.txt':
self.freq_letters = marisa_trie.Trie(rd_wr_module(path_dict=self.freq_letters_fn))
elif str(self.freq_letters_fn)[-7:] == '.marisa':
self.freq_letters = marisa_trie.Trie().load(self.freq_letters_fn)
if str(self.bi_grams_fn)[-4:] == '.txt':
self.bi_grams = marisa_trie.Trie(rd_wr_module(path_dict=self.bi_grams_fn))
elif str(self.bi_grams_fn)[-7:] == '.marisa':
self.bi_grams = marisa_trie.Trie().load(self.bi_grams_fn)
if str(self.tri_grams_fn)[-4:] == '.txt':
self.tri_grams = marisa_trie.Trie(rd_wr_module(path_dict=self.tri_grams_fn))
elif str(self.tri_grams_fn)[-7:] == '.marisa':
self.tri_grams = marisa_trie.Trie().load(self.tri_grams_fn)
# Parameters:
self.win_len = 3
self.delimiters = string.punctuation
Expand Down
18 changes: 5 additions & 13 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
setup(
name='russian_uncensor',
packages=['russian_uncensor'],
version='0.0.12',
version='0.1.0',
license='MIT',
description='Uncensor for russian masked or separated obscene words based on frequent letters, bi- and tri-grams analysis',
long_description=long_description,
Expand All @@ -32,20 +32,12 @@
'Programming Language :: Python :: 3.10',
],
package_data={
'russian_uncensor/data': ['obscene_words.txt'],
'russian_uncensor/data': ['obscene_words.marisa'],
'russian_uncensor/data/ngrams': [
'freq_letters.txt',
'bi_grams.txt',
'tri_grams.txt'
'freq_letters.marisa',
'bi_grams.marisa',
'tri_grams.marisa'
],
},
#data_files=[
# ('russian_uncensor/data', ['russian_uncensor/data/obscene_words.txt']),
# ('russian_uncensor/data/ngrams', [
# 'russian_uncensor/data/ngrams/freq_letters.txt',
# 'russian_uncensor/data/ngrams/bi_grams.txt',
# 'russian_uncensor/data/ngrams/tri_grams.txt',
# ]),
#],
include_package_data=True,
)

0 comments on commit bc3a24a

Please sign in to comment.