-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
147 lines (108 loc) · 3.72 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import operator
import os
import sys
import time
from math import gcd
from typing import Callable
from numpy import linalg, matrix, round
from pandas import read_csv
def disable_print():
    """
    Redirect all future prints into the null device.

    ``sys.stdout`` is replaced with a writable handle on ``os.devnull``.
    Calling this while stdout already points at an open null-device handle
    is a no-op, so repeated calls do not leak one file handle per call.
    """
    current = sys.stdout
    already_silenced = (
        getattr(current, "name", None) == os.devnull
        and not getattr(current, "closed", True)
    )
    if already_silenced:
        return
    sys.stdout = open(os.devnull, 'w')
# Restore
def enable_print():
    """
    Send all future prints back to the console.

    ``sys.stdout`` is reset to ``sys.__stdout__``. If the stream being
    replaced is a handle on ``os.devnull`` (which is what ``disable_print``
    installs), it is closed so the file handle is not leaked.
    """
    silenced = sys.stdout
    sys.stdout = sys.__stdout__
    # Only null-device handles are closed: any other replacement stream
    # (a capture buffer, a log file, ...) belongs to whoever installed it.
    if (
        silenced is not None
        and silenced is not sys.__stdout__
        and getattr(silenced, "name", None) == os.devnull
    ):
        silenced.close()
def are_coprime(a, b):
    """
    Tell whether two integers have no common divisor other than 1.

    :param a: first integer
    :param b: second integer
    :return: True when gcd(a, b) equals 1, otherwise False
    """
    greatest_common_divisor = gcd(a, b)
    return greatest_common_divisor == 1
def mod_inverse_matrix(m: matrix, modulo: int) -> matrix | None:
"""
Modulo inverse of an matrix
:param m: the matrix
:param modulo: the modulo
:return: a matrix inversion
"""
det = round(linalg.det(m))
if gcd(int(det), modulo) != 1:
return None
m_inv = linalg.inv(m)
m_inv_modulo = (m_inv * det * pow(int(det), -1, modulo)) % modulo
m_int = round(m_inv_modulo).astype(int) % modulo
return m_int
def preprocess_text(text: str, alphabet: str) -> str:
    """
    Upper-case the text and drop every character that is not in the alphabet.

    Both the text and the alphabet are upper-cased with ``str.upper()``
    before comparison. Note that ``str.upper()`` can expand a character
    (for example "ß" becomes "SS").

    :param text: raw text
    :param alphabet: alphabet used in text (lower or upper case)
    :return: upper-cased text that contains only characters of the alphabet
    """
    # Upper-case the alphabet once instead of once per character of the text;
    # a set gives constant-time membership tests for single characters.
    allowed = set(alphabet.upper())
    return "".join(c for c in text.upper() if c in allowed)
def quality(callback: Callable, t_: float = 1) -> int:
    """
    Count how many times a callback can be run within a time budget.

    :param callback: function called with no arguments on every iteration
    :param t_: test time in seconds
    :return: number of completed iterations in the test time
    """
    # perf_counter is monotonic, so a system clock adjustment during the test
    # cannot shorten or prolong the measurement as it could with time.time().
    start = time.perf_counter()
    n_iters = 0
    while time.perf_counter() - start < t_:
        callback()
        n_iters += 1
    return n_iters
def get_alphabet(alphabet_file_path: str, encoding: str = "UTF-8"):
    """
    Load the alphabet stored in a text file.

    :param alphabet_file_path: path of the file holding the alphabet
    :param encoding: text encoding used to decode the file
    :return: file content with leading and trailing whitespace removed
    """
    with open(alphabet_file_path, encoding=encoding) as alphabet_file:
        raw = alphabet_file.read()
    return raw.strip()
def generate_grams(in_file_path: str, out_file_path: str, alphabet_file_path: str, n: int = 2):
    """
    Count the n-grams of a text file and save the counts to a file.

    The input text is reduced with ``preprocess_text`` to upper-cased
    characters of the alphabet, every window of ``n`` consecutive characters
    is counted, and the counts are written with ``_save_dict_to_file``
    (one "<gram> <count>" line per gram, highest count first).

    :param in_file_path: path of the UTF-8 encoded input text
    :param out_file_path: path of the file the counts are written to
    :param alphabet_file_path: path of the UTF-8 encoded alphabet file
    :param n: length of the grams
    :raises ValueError: if n is smaller than 1
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    with open(alphabet_file_path, encoding="UTF-8") as file:
        alphabet = file.read().strip()
    with open(in_file_path, encoding='UTF-8') as file_in:
        text = preprocess_text(file_in.read(), alphabet)
    dictionary = dict()
    # The last n-gram starts at index len(text) - n, so the range has to
    # include that index (the range is empty when the text is shorter than n).
    for i in range(len(text) - n + 1):
        gram = text[i:i + n]
        dictionary[gram] = dictionary.get(gram, 0) + 1
    _save_dict_to_file(dictionary, out_file_path)
def _save_dict_to_file(dictionary, out_file_path, with_key=True):
    """
    Write the entries of a dictionary to a text file, highest value first.

    :param dictionary: mapping whose values can be ordered
    :param out_file_path: path of the UTF-8 encoded output file
    :param with_key: when True each line is "<key> <value>", otherwise
        each line holds the value only
    """
    ranked = list(dictionary.items())
    # Stable sort: entries with equal values keep their dictionary order.
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    lines = []
    for key, value in ranked:
        if with_key:
            lines.append(f"{key} {value}")
        else:
            lines.append(str(value))
    # Lines are newline-separated, with no trailing newline.
    content = "\n".join(lines)
    with open(out_file_path, encoding='UTF-8', mode='w+') as out_file:
        out_file.write(content)
def genereate_freqs(in_file_path: str, out_file_path: str, alphabet: str):
    """
    Compute relative letter frequencies of a text file and save them.

    The input text is reduced with ``preprocess_text`` to upper-cased
    characters of the alphabet; each letter's count is divided by the length
    of the reduced text, and the frequencies are written with
    ``_save_dict_to_file`` (values only, highest first).

    :param in_file_path: path of the UTF-8 encoded input text
    :param out_file_path: path of the file the frequencies are written to
    :param alphabet: the alphabet itself (a string of letters, not a path)
    """
    with open(in_file_path, encoding='UTF-8') as file_in:
        text = preprocess_text(file_in.read(), alphabet)
    text_len = len(text)
    counts = dict()
    for letter in text:
        # The first occurrence counts as 1; starting at 0 under-counted
        # every letter by one.
        counts[letter] = counts.get(letter, 0) + 1
    # counts is empty when the text is empty, so no division by zero occurs.
    freqs = {letter: count / text_len for letter, count in counts.items()}
    _save_dict_to_file(freqs, out_file_path, with_key=False)
def get_language_data():
    """
    Collect the German language resources.

    The letter frequencies are read from the ``frequency`` column of
    ``./german_letters.csv``; the bigram and trigram files are not opened,
    only their paths are returned.

    :return: tuple of (alphabet, bigram file path, trigram file path,
        list of letter frequencies)
    """
    frequency_column = read_csv("./german_letters.csv")['frequency']
    return (
        "ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜß",
        "./german_bigrams.txt",
        "./german_trigrams.txt",
        frequency_column.tolist(),
    )
def parse_freqs(freqs_file_path):
    """
    Read a list of frequencies from a text file with one number per line.

    Blank lines, including the empty string produced by a trailing newline,
    are skipped instead of raising ``ValueError``.

    :param freqs_file_path: path of the UTF-8 encoded frequency file
    :return: list of floats in file order (empty for an empty file)
    :raises ValueError: if a non-blank line is not a valid float
    """
    with open(freqs_file_path, encoding="UTF-8") as file:
        return [float(line) for line in file.read().split("\n") if line.strip()]