# Vectorizers_Models.py
# IMPORTS
import pandas as pd
import string
import re
import time
import tensorflow as tf
from keras.layers import TextVectorization
from keras.models import load_model
# import numpy as np
# from keras.preprocessing.sequence import pad_sequences
# import joblib
# DATA extraction (the dataset file has to be in the project folder)
data_path = "Dataset/fra_160K.txt"
df_raw1 = pd.read_csv(
    data_path,
    delimiter="\t",
    encoding="utf-8",
    header=None,
    names=["en", "fr"],
    index_col=False,
)
# df_raw1.head()
lang1 = "en"
lang2 = "fr"
# print(f"Number of sentences : {df_raw1.count()[0]}")
# Return a list of (English, French) sentence pairs, with the French side
# wrapped in [start]/[end] tokens
def Create_pairs(dataframe):
    text_pairs = [
        (row["en"], "[start] " + row["fr"] + " [end]")
        for index, row in dataframe.iterrows()
    ]
    return text_pairs
# Create the pairs
text_pairs = Create_pairs(df_raw1)
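# Optional sanity check, commented out like the other debug prints (the exact
# first pair depends on the dataset file):
# print(text_pairs[0])  # e.g. ('Go.', '[start] Va ! [end]')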
# Define the custom standardization used by the French vectorizer:
strip_chars = list(string.punctuation)
strip_chars.remove("[")
strip_chars.remove("]")
strip_chars = "".join(strip_chars)
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    # strip_chars is already a single string here, so escape it directly
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
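# Quick illustration of the standardizer on a made-up sample (uncomment to
# check): punctuation is stripped while the [start]/[end] brackets survive.
# print(custom_standardization(tf.constant("[start] Bonjour, le monde ! [end]")))
# -> roughly b'[start] bonjour le monde  [end]'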
# Parameters
model_number = 1 # 0-->v6 and 1-->v80
params = [[14, 14000], [20, 20000]] # list of [seq_len, vocab_size]
sequence_length = params[model_number][0]
vocab_size = params[model_number][1]
# Default standardization for English (lowercases and strips string.punctuation)
eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)
# Custom standardization for French (strips punctuation but keeps the
# [start]/[end] brackets)
fre_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)
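# Why sequence_length + 1 for French: one vectorizer call can then supply both
# the decoder input and the one-step-shifted target during training, as in the
# standard Keras translation setup. Illustrative sketch (only valid after the
# adapt() calls below):
# fre_tokens = fre_vectorization(["[start] bonjour [end]"])
# decoder_input = fre_tokens[:, :-1]  # length sequence_length
# target = fre_tokens[:, 1:]          # length sequence_length, shifted by one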
# Adapt (fit) the vectorizers to the data:
train_eng_texts = [pair[0] for pair in text_pairs]
train_fre_texts = [pair[1] for pair in text_pairs]
start = time.time()
eng_vectorization.adapt(train_eng_texts)  # build the English vocabulary from the data
fre_vectorization.adapt(train_fre_texts)  # same for French (adapt only builds the vocabulary; the texts are unchanged)
end = time.time()
print(f"Adapting both vectorizers execution time : {end - start:.2f}")
# Load the models
model_path_v6 = "my_translation_model_gpu_v6.h5"
model_path_v80 = "my_translation_model_gpu_v80.h5"
model_v6 = load_model(model_path_v6) # model number 0
model_v80 = load_model(model_path_v80) # model number 1
model = [model_v6, model_v80][model_number]
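# A minimal greedy-decoding sketch for trying a loaded model. This assumes the
# models follow the usual Keras seq2seq signature
# model([encoder_tokens, decoder_tokens]) -> per-position token probabilities,
# as in the Keras English-to-Spanish Transformer example; the names below are
# illustrative, so adjust if the saved models differ.
# fre_vocab = fre_vectorization.get_vocabulary()
# fre_index_lookup = dict(zip(range(len(fre_vocab)), fre_vocab))
#
# def decode_sequence(input_sentence):
#     tokenized_input = eng_vectorization([input_sentence])
#     decoded_sentence = "[start]"
#     for i in range(sequence_length):
#         tokenized_target = fre_vectorization([decoded_sentence])[:, :-1]
#         predictions = model([tokenized_input, tokenized_target])
#         next_token = fre_index_lookup[int(tf.argmax(predictions[0, i, :]))]
#         decoded_sentence += " " + next_token
#         if next_token == "[end]":
#             break
#     return decoded_sentence
#
# print(decode_sequence("How are you?"))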
#
#
#
#
# -------------------- Commented-out code from the chatbot hackathon (kept in case I reuse the OpenAI API) --------------------
# from io import StringIO
# import os
# import fitz
# # import openai
# from dotenv import load_dotenv
# from nltk.tokenize import sent_tokenize
# # import aspose.words as aw
# load_dotenv()
# def open_file(filepath):
#     with open(filepath, "r", encoding="utf-8") as infile:
#         return infile.read()
# openai.api_key = os.getenv("OPENAI_API_KEY")
# openai.organization = os.getenv("OPENAI_ORGANIZATION")
# def read_pdf(filename):
#     context = ""
#     # Open the PDF file
#     with fitz.open(filename) as pdf_file:
#         # Get the number of pages in the PDF file
#         num_pages = pdf_file.page_count
#         # Loop through each page in the PDF file
#         for page_num in range(num_pages):
#             # Get the current page
#             page = pdf_file[page_num]
#             # Get the text from the current page
#             page_text = page.get_text().replace("\n", "")
#             # Append the text to context
#             context += page_text
#     return context
# def split_text(text, chunk_size=5000):
#     """
#     Splits the given text into chunks of approximately the specified chunk size.
#     Args:
#         text (str): The text to split.
#         chunk_size (int): The desired size of each chunk (in characters).
#     Returns:
#         List[str]: A list of chunks, each of approximately the specified chunk size.
#     """
#     chunks = []
#     current_chunk = StringIO()
#     current_size = 0
#     sentences = sent_tokenize(text)
#     for sentence in sentences:
#         sentence_size = len(sentence)
#         if sentence_size > chunk_size:
#             # Flush anything accumulated so far first, otherwise it would be
#             # silently discarded by the reset below
#             if current_size > 0:
#                 chunks.append(current_chunk.getvalue())
#             while sentence_size > chunk_size:
#                 chunk = sentence[:chunk_size]
#                 chunks.append(chunk)
#                 sentence = sentence[chunk_size:]
#                 sentence_size -= chunk_size
#             current_chunk = StringIO()
#             current_size = 0
#         if current_size + sentence_size < chunk_size:
#             current_chunk.write(sentence)
#             current_size += sentence_size
#         else:
#             chunks.append(current_chunk.getvalue())
#             current_chunk = StringIO()
#             current_chunk.write(sentence)
#             current_size = sentence_size
#     if current_size > 0:  # a bare StringIO is always truthy, so test the size instead
#         chunks.append(current_chunk.getvalue())
#     return chunks
# filename = os.path.join(os.path.dirname(__file__), "filename.pdf")
# document = read_pdf(filename)
# chunks = split_text(document)
# def gpt3_completion(txt):
#     a = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant."},
#             {"role": "user", "content": txt},
#         ],
#     )
#     return a["choices"][0]["message"]["content"]
# text = "Brahim le crétin est une légende de l'école des ponts"
# def ask_question_to_pdf(txt, filename):
#     document = read_pdf(filename)
#     chunks = split_text(document)
#     a = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=[
#             {"role": "system", "content": chunks[0]},
#             {"role": "user", "content": txt},
#         ],
#     )
#     return a["choices"][0]["message"]["content"]
# def verif(question, response, filename):
#     document = read_pdf(filename)
#     chunks = split_text(document)
#     a = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=[
#             {"role": "system", "content": chunks[0]},
#             {
#                 "role": "system",
#                 # i.e. "if my answer is unrelated to the question, tell me I got it all wrong"
#                 "content": "si ma réponse n'a pas de lien avec la question dis moi que j'ai tout faux",
#             },
#             {"role": "assistant", "content": question},
#             {"role": "user", "content": response},
#             {
#                 "role": "user",
#                 # i.e. "is my answer correct, and if not, what was the right answer"
#                 "content": "est ce que ma réponse est correcte et sinon quelle était la bonne réponse",
#             },
#         ],
#     )
#     return a["choices"][0]["message"]["content"]
# def fichier_txt(path):
#     # open_file() (defined above) already returns the file contents as one
#     # string, so there is no need to re-open it or call readlines() on it
#     text = open_file(path)  # e.g. the Essai .txt file shown just above
#     final_text = text.replace("\n", "")
#     return final_text