preprocesing_data.py
from typing import List, Tuple
import os
import pickle
import string

import numpy as np

def read_text(data_path: str) -> List[List[str]]:
    """
    Read tab-separated phrase pairs from a text file.
    """
    output = []
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    for line in lines:
        # Skip blank lines and lines without a tab-separated target phrase
        if '\t' not in line:
            continue
        input_doc, target_doc = line.split('\t')[:2]
        output.append([input_doc, target_doc])
    return output
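
# Usage sketch for read_text (assuming data/bel.txt is a Tatoeba-style
# tab-separated file; the pair shown below is hypothetical):
#   pairs = read_text('data/bel.txt')
#   pairs[0]  # e.g. ['Hi.', 'Прывітанне.']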

def preprocess_data(dat: List[List[str]]) -> np.ndarray:
    """
    Normalize each phrase pair: lowercase, strip punctuation,
    and keep alphabetic tokens only.
    """
    # Translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in dat:
        for i in range(len(pair)):
            phrase = pair[i].lower().translate(table)
            words = [word for word in phrase.split() if word.isalpha()]
            pair[i] = ' '.join(words)
    return np.array(dat)
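
# Effect of preprocess_data on one hypothetical pair (str.isalpha() is
# Unicode-aware, so Cyrillic tokens survive the filter):
#   [['Hi, Tom!', 'Прывітанне, Том!']]  ->  array([['hi tom', 'прывітанне том']])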

def save_preprocessed_data(preprocessed_dat: np.ndarray, filename: str, directory: str = 'data') -> None:
    """
    Save cleaned data to a pickle file inside `directory`.
    """
    # Make sure the target directory exists before writing
    os.makedirs(directory, exist_ok=True)
    filepath = os.path.join(directory, filename)
    # Serialize and save the data to the specified filepath
    with open(filepath, 'wb') as f:
        pickle.dump(preprocessed_dat, f)
    print(f"Preprocessed data saved in {filepath}")

def load_preprocessed_data(filepath: str) -> np.ndarray:
    """
    Load data from a pickle file.
    """
    with open(filepath, 'rb') as f:
        return pickle.load(f)
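
# Save/load round trip (the filename here is illustrative):
#   save_preprocessed_data(arr, 'pairs.pkl')          # writes data/pairs.pkl
#   arr2 = load_preprocessed_data('data/pairs.pkl')   # arr2 equals arr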

def train_test_split(dat: np.ndarray, test_ratio: float = 0.2) -> Tuple[np.ndarray, np.ndarray]:
    """
    Split data into a training set and a test set.
    The split is deterministic, so shuffle the data beforehand if needed.
    """
    split_index = int(len(dat) * (1 - test_ratio))
    train_data, test_data = dat[:split_index], dat[split_index:]
    return train_data, test_data
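
# Example: with 100 rows and test_ratio=0.2, split_index is 80, so train
# gets rows 0-79 and test gets rows 80-99:
#   train, test = train_test_split(dataset, test_ratio=0.2)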

if __name__ == '__main__':
    data_path = "data/bel.txt"
    docs = read_text(data_path)
    preprocessed_docs = preprocess_data(docs)
    print(f"Number of phrase pairs: {len(preprocessed_docs)}")
    # Save the preprocessed dataset
    save_preprocessed_data(preprocessed_docs, 'english-belarusian.pkl')
    # Load the preprocessed dataset
    raw_dataset = load_preprocessed_data("data/english-belarusian.pkl")
    # Reduce the dataset size
    n_pairs = 3883
    dataset = raw_dataset[:n_pairs, :]
    print(f"Number of phrase pairs in the reduced dataset: {len(dataset)}")
    print(f"\nExample 1: {dataset[0]}")
    # Shuffle the dataset so the train/test split is not ordered by file position
    np.random.seed(42)
    np.random.shuffle(dataset)
    print(f"\nShuffled dataset: {dataset}")
    # Split the shuffled dataset into train and test sets
    train, test = train_test_split(dataset, test_ratio=0.1)
    save_preprocessed_data(dataset, 'english-belarusian-both.pkl')
    save_preprocessed_data(train, 'english-belarusian-train.pkl')
    save_preprocessed_data(test, 'english-belarusian-test.pkl')