-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
executable file
·100 lines (79 loc) · 3.14 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
import pandas as pd
from utils import *
def _print_table_sizes(title, pos_path, df_pos, neg_path, df_neg):
    """Print ``title`` followed by the row count of each clonotype table."""
    print(title)
    print(" - ", pos_path, ":", len(df_pos), "rows")
    print(" - ", neg_path, ":", len(df_neg), "rows")
    print()


def _split_test_train(df, test_size, rng):
    """Hold out ``test_size`` randomly chosen rows of ``df``.

    Args:
        df: Table to split.
        test_size: Number of rows to place in the test part.
        rng: ``np.random.RandomState`` used to draw the row positions.

    Returns:
        ``(test, train)`` where ``test`` holds the sampled rows (in sampled
        order) and ``train`` holds every remaining row (in original order).

    Raises:
        ValueError: Propagated from ``rng.choice`` when ``test_size`` exceeds
            ``len(df)`` (sampling is without replacement).
    """
    positions = rng.choice(len(df), test_size, replace=False)
    test = df.iloc[positions, :]
    # Build the train part positionally as well.  Dropping by *label*
    # (``df.drop(positions)``) only matches the sampled rows when the index
    # is a default 0..n-1 range; on any other index it would leak test rows
    # into train or raise KeyError.
    keep = np.ones(len(df), dtype=bool)
    keep[positions] = False
    train = df.iloc[keep, :]
    return test, train


def load_data(data_tag):
    """Load the positive/negative clonotype tables and split them.

    Supported ``data_tag`` values:

    * ``"initial-full-data"`` -- full tables, 10000 test rows per class.
    * ``"rc1-rc2-5k"`` -- rows filtered on the ``"Read.count"`` column
      (positive: ``!= 1``, negative: ``> 2``), 5000 test rows per class.

    The split is drawn from a generator seeded with 42, so repeated calls
    on the same input tables produce the same split.

    Args:
        data_tag: Name of the dataset recipe to build.

    Returns:
        Tuple ``(df_pos_test, df_pos_train, df_neg_test, df_neg_train,
        oh_dict)`` where ``oh_dict`` is whatever ``load_dict()`` returns.

    Raises:
        ValueError: If ``data_tag`` is not one of the supported values.
    """
    if data_tag == "initial-full-data":
        test_size = 10000
        filter_by_reads = False
    elif data_tag == "rc1-rc2-5k":
        test_size = 5000
        filter_by_reads = True
    else:
        raise ValueError('Unknown --data_tag value')

    #
    # Load the data
    #
    print("Loading and filtering data...")
    pos_path = "data/GSM3155092_P01_CRVstim_CD8_beta.txt.gz"
    neg_path = "data/GSM3155090_P01_unstim_CD8_beta.txt.gz"
    df_pos = load_dataset(pos_path)
    df_neg = load_dataset(neg_path)
    # NOTE(review): presumably removes from the negative table the entries
    # that also occur in the positive one -- confirm in utils.filter_specific.
    df_neg = filter_specific(df_pos, df_neg)
    print()

    if filter_by_reads:
        _print_table_sizes("Clonotype tables before filtering:",
                           pos_path, df_pos, neg_path, df_neg)
        # NOTE(review): the recipe is described as "pos > 1 read"; ``!= 1``
        # is kept as written and only matches that description when counts
        # are positive integers.
        df_pos = df_pos.loc[df_pos["Read.count"] != 1, :].reset_index(drop=True)
        df_neg = df_neg.loc[df_neg["Read.count"] > 2, :].reset_index(drop=True)
        _print_table_sizes("Clonotype tables after filtering:",
                           pos_path, df_pos, neg_path, df_neg)
    else:
        _print_table_sizes("Clonotype tables:",
                           pos_path, df_pos, neg_path, df_neg)

    #
    # Split into test / train parts
    #
    oh_dict = load_dict()
    seed = 42
    # A local generator makes the split reproducible without touching
    # NumPy's global random state.
    rng = np.random.RandomState(seed)
    df_pos_test, df_pos_train = _split_test_train(df_pos, test_size, rng)
    df_neg_test, df_neg_train = _split_test_train(df_neg, test_size, rng)
    print("Train DFs sizes:", len(df_pos_train), len(df_neg_train))
    print("Test DFs sizes:", len(df_pos_test), len(df_neg_test))
    print()

    return df_pos_test, df_pos_train, df_neg_test, df_neg_train, oh_dict