""" Prepares the dataset for training, by cleaning it, checking its integrity, scaling etc"""
import pandas as pd
from helper_funcs import is_nonzero_file7, get_full_path, append_logs, get_scaling_factors
name4logs = "dataset_preprocessing"
# TODO: make a backup first
def cleanup_data(filename):
    """ Removes corrupted lines (those containing "invalid" or "None") from the dataset file.

    Args:
        filename (str): the dataset filename (e.g. "syntheticData.txt")
    """
    if is_nonzero_file7(filename):
        with open(get_full_path(filename)) as f:
            lines = f.readlines()
    else:
        return  # nothing to clean: the file is missing or empty

    # Keep only the lines without markers of corrupted datapoints,
    # then overwrite the file with the cleaned lines.
    cleaned = [line for line in lines if ("invalid" not in line) and ("None" not in line)]
    with open(get_full_path(filename), "w") as my_file:
        my_file.writelines(cleaned)


def cleanup_dataset(using_synthetic_data7):
    """ Removes corrupted lines from all dataset files of the selected data source. """
    if using_synthetic_data7:
        cleanup_data("dataset/syntheticData.txt")
        cleanup_data("lastNpoints_synthetic")
        cleanup_data("latest_datapoint_synthetic")
    else:
        cleanup_data("dataset/fetchedData.txt")
        cleanup_data("lastNpoints_fetched")
        cleanup_data("latest_datapoint_fetched")


def data_sanity_check(use_synthetic_data7, data_channels):
    """ Checks if the dataset contains corrupted data.

    The checks cover the case where the user has changed the number of channels, but forgot to delete the old data
    that still has the old number of channels.

    Args:
        use_synthetic_data7 (bool): if True, the synthetic data is used
        data_channels (list of strings): channel names
    """
    # TODO: check if there are at least 2 channels in configs, otherwise KitNET will not work
    if use_synthetic_data7:
        dataset_filename = "dataset/syntheticData.txt"
    else:
        dataset_filename = "dataset/fetchedData.txt"

    bad_shape_msg = dataset_filename + " seems to be in a bad shape, as reading it into a dataframe causes an error" \
                                       " or meaningless output. If you changed the number of channels, deleting" \
                                       " the data that has the previous number of channels could help."

    # TODO: remove code duplication, as similar code is used in fetched_data_to_dataframe
    if is_nonzero_file7(dataset_filename):
        cols_number = 3 * len(data_channels)
        my_cols = [str(i) for i in range(cols_number)]  # create some column names
        print("checking...", dataset_filename)
        df = pd.DataFrame()
        try:
            df = pd.read_csv(get_full_path(dataset_filename),
                             sep=";|§",
                             names=my_cols,
                             header=None,
                             engine="python")
        except Exception as e:
            append_logs(bad_shape_msg + " " + str(e), name4logs, "always", "print")
            sys.exit()

        timestamps = pd.DataFrame(df.index).to_numpy()
        latest_timestamp = timestamps[-1]
        if "nan" in str(latest_timestamp):
            append_logs(bad_shape_msg, name4logs, "always", "print")
            sys.exit()
    else:
        append_logs(dataset_filename + " doesn't exist or is of zero size. First launch?", name4logs, "always", "print")


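# Illustrative usage (a sketch; the channel names below are hypothetical): verify
# the fetched dataset before training, aborting if the file is malformed.
# data_sanity_check(use_synthetic_data7=False, data_channels=["cpu_load", "ram_usage"])

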
def normilize_list(ilist):
    """ Normalizes the list to [-0.5, 0.5], according to the min and max values in the list.

    E.g. each of these inputs returns [-0.5, -0.25, 0.0, 0.25, 0.5]:
    [1, 2, 3, 4, 5]
    [-5, -4, -3, -2, -1]
    [-2, -1, 0, 1, 2]

    Args:
        ilist (list of floats): should be of a non-zero length
    """
    min_val = min(ilist)
    max_val = max(ilist)
    if min_val != max_val:
        # Affine map: multiplier * x - offset sends min_val to -0.5 and max_val to 0.5
        multiplier = 1 / (max_val - min_val)
        offset = multiplier * min_val + 0.5
        olist = [multiplier * val - offset for val in ilist]
    else:
        # All values are equal, so map everything to the midpoint of the target range
        olist = [0] * len(ilist)
    return olist


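# A doctest-style illustration of the mapping (the first example comes from the docstring
# above; the second follows from the equal-min-and-max branch):
# >>> normilize_list([1, 2, 3, 4, 5])
# [-0.5, -0.25, 0.0, 0.25, 0.5]
# >>> normilize_list([7, 7, 7])
# [0, 0, 0]

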
def normilize_single_channel_df(raw_df, scaling_factors=None):
    """ Scales a single-column dataframe to [-0.5, 0.5], dropping values outside the scaling range.

    Args:
        raw_df (pandas.DataFrame): a dataframe with a single data column
        scaling_factors (tuple or None): (min_val, max_val) to scale against; if None, they are
            computed from the dataframe itself with get_scaling_factors

    Returns:
        (scaled dataframe, min_val, max_val)
    """
    col_name = list(raw_df.columns.values)[0]

    # The underlying assumption is that there are only FEW anomalies in the dataset that
    # look like a big spike/fall of the value.
    # If the assumption is false, the resulting dataset could be skewed.
    # TODO: exclude known anomalies instead
    if scaling_factors is None:
        min_val, max_val = get_scaling_factors(raw_df)
    else:
        min_val, max_val = scaling_factors
    print(min_val, max_val)

    # removing the values outside of [min_val, max_val]:
    df = raw_df.loc[((raw_df >= min_val) & (raw_df <= max_val)).any(axis=1)]
    output_df = df.copy()

    # the scaling itself happens here:
    if min_val != max_val:
        multiplier = 1 / (max_val - min_val)
        offset = multiplier * min_val + 0.5
        output_df[col_name] = multiplier * df[col_name] - offset
    else:
        output_df = df.replace(to_replace=min_val, value=0)
    return output_df, min_val, max_val


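# A minimal sketch of how the scaling could be exercised in isolation (not part of the
# pipeline). Explicit scaling_factors are passed so the example does not depend on the
# behaviour of get_scaling_factors; the column name "value" is made up for the demo.
if __name__ == "__main__":
    demo_df = pd.DataFrame({"value": [0.0, 2.0, 4.0, 100.0]})  # 100.0 plays the role of a spike
    scaled_df, demo_min, demo_max = normilize_single_channel_df(demo_df, scaling_factors=(0.0, 4.0))
    print(scaled_df)  # the spike row is dropped; the remaining values are mapped to [-0.5, 0.5]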