preprocess_annotated.py
import argparse
import pickle as pkl
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
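
# Command-line arguments for the preprocessing script.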
parser = argparse.ArgumentParser(description='Preprocess annotated dataset')
parser.add_argument(
    '--dataset',
    action='store',
    type=str,
    required=True,
    help='Path to the csv file.')
parser.add_argument(
    '--target-regexs',
    nargs='+',
    type=str,
    required=True,
    help='Regular expressions to select samples by their target.')
parser.add_argument(
    '--new-targets',
    nargs='+',
    type=str,
    required=True,
    help='Targets to use instead.')
parser.add_argument(
    '--log-transform',
    action=argparse.BooleanOptionalAction,
    default=False,
    help='Whether to apply a logarithmic transformation.')
parser.add_argument(
    '--std-samples',
    action=argparse.BooleanOptionalAction,
    default=False,
    help='Whether to standardize samples.')
parser.add_argument(
    '--pkl-path',
    action='store',
    type=str,
    required=True,
    help="Path to the .pkl file to create (a serialized dictionary with " +
         "the following keys: 'X', 'y', 'features_ids', 'samples_ids').")
parser.add_argument(
    '--csv-path',
    action='store',
    type=str,
    required=False,
    help='Path to the .csv file to create.')
args = parser.parse_args()
dataset = args.dataset
target_regexs = args.target_regexs
new_targets_names = args.new_targets
log_transform = args.log_transform
std_samples = args.std_samples
pkl_path = args.pkl_path
csv_path = args.csv_path
# Load the dataset: rows are features, columns are samples, and a row labelled
# 'TARGET' holds each sample's original label. The first CSV column is assumed
# to contain the row IDs and is used as the index.
data = pd.read_csv(dataset, index_col=0).dropna(axis=0)
targets = list(data.loc['TARGET'])
new_targets = []
targets_to_drop = []
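# Map each sample's original target to a new label using the first matching regex;
# samples whose target matches no regex are marked for removal.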
for i, target in enumerate(targets):
    found = False
    for regex, new_t in zip(target_regexs, new_targets_names):
        if re.match(regex, target):
            new_targets.append(new_t)
            found = True
            break
    if not found:
        targets_to_drop.append(i)
data.drop(data.columns[targets_to_drop], axis=1, inplace=True)  # drop samples with no matching target
data = data.drop("TARGET")  # drop the label row, keeping only feature values
data = data.apply(pd.to_numeric)
data = data.groupby(by=data.index).mean() # average features with same id
X = np.transpose(data.to_numpy())
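# Optional log2 transform: shift the data so its minimum is at least 1 before taking logs.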
if log_transform:
    x_min = np.min(X)
    if x_min < 0:
        X += abs(x_min)
        x_min = 0
    if x_min >= 0 and x_min < 1:
        X += 1
    X = np.log2(X)
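# Optional per-sample standardization: scale each sample (row of X) to zero mean
# and unit variance across its features.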
if std_samples:
    scaler = StandardScaler()
    X = np.transpose(scaler.fit_transform(np.transpose(X)))
y = np.array(new_targets)
features_ids = list(data.index)
samples_ids = list(data.columns)
if len(features_ids) != X.shape[1]:
    raise ValueError("Number of feature IDs does not match the number of columns of X")
if len(samples_ids) != X.shape[0]:
    raise ValueError("Number of sample IDs does not match the number of rows of X")
print("# Features = %d"%len(features_ids))
print("# Samples = %d"%len(samples_ids))
print("# Samples per class = %s"%dict(Counter(new_targets)))
dataset = {}
dataset['X'] = X
dataset['y'] = y
dataset['features_ids'] = features_ids
dataset['samples_ids'] = samples_ids
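# Optionally also export a flat CSV with one row per sample.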
if csv_path:
    df = pd.DataFrame(data=X, columns=dataset['features_ids'])
    df.insert(0, 'SAMPLE_ID', dataset['samples_ids'])
    df.insert(0, 'TARGET', dataset['y'])
    df.to_csv(csv_path)
with open(pkl_path, 'wb') as fp:
    pkl.dump(dataset, fp)
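
# Example usage (hypothetical paths, regexes and labels, shown only for illustration):
#
#   python preprocess_annotated.py \
#       --dataset data/annotated.csv \
#       --target-regexs '^tumou?r.*' '^normal.*' \
#       --new-targets tumor normal \
#       --log-transform \
#       --std-samples \
#       --pkl-path data/preprocessed.pkl \
#       --csv-path data/preprocessed.csv
#
# The serialized dictionary can then be loaded back with:
#
#   with open('data/preprocessed.pkl', 'rb') as fp:
#       ds = pkl.load(fp)
#   X, y = ds['X'], ds['y']              # X is (n_samples, n_features), y holds the new targets
#   features_ids = ds['features_ids']    # one feature ID per column of X
#   samples_ids = ds['samples_ids']      # one sample ID per row of X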