forked from levnikmyskin/SLD-reassessment
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_generation.py
64 lines (48 loc) · 2.79 KB
/
data_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import numpy as np
"""
We want to subsample a given dataset D with a given prevalence p so that the subsample will have a random prevalence p'.
In order to do this we do the following:
- First, we create a number c of groups (we'll call the set of groups G) for every class C, each containing the examples
from D pertaining to c;
- Secondly, we generate a vector V of random probabilities with length |C|, such that its values sum to 1:
each element of this vector will represent the prevalence of a class c in C;
- Finally, we draw samples from G with probabilities coming from V.
This works both for multi and single class datasets.
"""
def randomly_modify_prevalences(x: np.ndarray, y: np.ndarray, train_split=1000, test_split=1000) -> tuple:
    """
    Subsample (x, y) into disjoint training and test samples whose class
    prevalences are drawn at random (see the module docstring for the scheme).

    :param x: feature matrix, indexable by a 1-D integer index array along axis 0.
    :param y: 1-D array of class labels aligned with x.
    :param train_split: number of examples to draw for the training sample.
    :param test_split: number of examples to draw for the test sample.
    :return: (x_train, y_train, x_test, y_test) — note: a 4-tuple, not the
        2-tuple the original annotation claimed.
    """
    labels = set(y)
    # Loop instead of recursing: a degenerate draw simply triggers a redraw,
    # without risking hitting the interpreter's recursion limit.
    while True:
        # Step one: one group of document indices per class label
        train_groups = [(y == label).nonzero()[0] for label in labels]
        # Step two: draw the training sample with random class prevalences
        new_training_sample = __new_random_sample(train_groups, len(labels), train_split)
        # Filter out documents we have already taken for the training set.
        # np.bool was removed in NumPy 1.24; the builtin bool is the correct dtype.
        mask = np.ones_like(y, dtype=bool)
        mask[new_training_sample] = False
        y_test = y[mask]
        x_test = x[mask]
        test_groups = [(y_test == label).nonzero()[0] for label in labels]
        new_test_sample = __new_random_sample(test_groups, len(labels), test_split)
        x_tr, y_tr = x[new_training_sample], y[new_training_sample]
        x_te, y_te = x_test[new_test_sample], y_test[new_test_sample]
        # Redraw if the training sample missed at least one class entirely.
        if len(set(y_tr)) >= len(labels):
            return x_tr, y_tr, x_te, y_te
def subsample_dataset_random_prev(dataset, sample_length: int) -> ((np.array, np.array), (np.array, np.array)):
    """
    Split ``dataset`` into two halves and subsample each half with a random
    class prevalence.

    :param dataset: object exposing ``data`` (features) and ``target`` (labels).
    :param sample_length: number of training examples drawn from each half.
    :return: one (x_train, y_train, x_test, y_test) tuple per half.
    """
    features, targets = dataset.data, dataset.target
    half = targets.shape[0] // 2
    first_half = (features[:half], targets[:half])
    second_half = (features[half:], targets[half:])
    return (randomly_modify_prevalences(first_half[0], first_half[1], sample_length),
            randomly_modify_prevalences(second_half[0], second_half[1], sample_length))
def __new_random_sample(groups, labels_len, sample_size):
    """
    Draw ``sample_size`` distinct indices from the concatenation of ``groups``,
    giving each group (one per class) a randomly generated total probability
    mass — i.e. a sample with a random class prevalence.

    For details on the per-element weighting trick, see
    https://stackoverflow.com/questions/44613347/multidimensional-array-for-random-choice-in-numpy

    :param groups: list of 1-D index arrays, one per class label.
    :param labels_len: number of classes, i.e. ``len(groups)``.
    :param sample_size: number of indices to draw (without replacement).
    :return: 1-D array of ``sample_size`` distinct indices.
    """
    # Create a single Generator and reuse it for both draws, instead of
    # instantiating two independent default_rng() objects as before.
    rng = np.random.default_rng()
    # Random per-class prevalences: integers in [1, 100) normalized to sum
    # to 1, so every class receives a strictly positive mass.
    rand_num = rng.integers(1, 100, labels_len)
    rand_prob_vector = rand_num / rand_num.sum()
    lens = np.array([el.shape[0] for el in groups])
    # Spread each class's mass uniformly over its members, so the element-wise
    # probabilities over the concatenated index array sum to 1.
    # NOTE(review): an empty group makes this divide by zero; the caller only
    # retries on degenerate *training* draws — confirm empty classes can't occur.
    random_vec = np.repeat(np.divide(rand_prob_vector, lens), lens)
    new_arr = np.concatenate(groups)
    return rng.choice(new_arr, size=sample_size, p=random_vec, replace=False)
if __name__ == '__main__':
    # Smoke test: fetch the vectorized 20 Newsgroups corpus (downloads on the
    # first run) and draw random-prevalence subsamples of 3000 training
    # documents from each half of it.
    from sklearn.datasets import fetch_20newsgroups_vectorized
    dataset = fetch_20newsgroups_vectorized()
    subsample_dataset_random_prev(dataset, 3000)