# main_pu_bagging.py
### START ###
## Import Libraries ##
# Trimmed to the libraries this script actually uses (the code was taken from a
# larger project that carried many extra imports).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score,
    f1_score,
    recall_score,
    precision_score,
)
from sklearn.utils import resample
## Import input dataset (target variable must be 1 (positive instance) or 0 (unlabelled instance)) ##
df = pd.read_csv("FILE_NAME.csv")  # CHANGE "FILE_NAME.csv" TO THE CSV YOU WANT TO IMPORT
print("Original Dataset Counts")
print(df['TARGET_VARIABLE'].value_counts()) # CHANGE 'TARGET_VARIABLE' TO NAME OF TARGET VARIABLE
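# Optional (a sketch for smoke-testing this script without a CSV): build a synthetic PU
# dataset by generating labelled data and then "hiding" about half of the positives as
# unlabelled (0). Uncomment to use in place of the CSV load above.
# from sklearn.datasets import make_classification
# X_syn, y_syn = make_classification(n_samples=2000, n_features=10, weights=[0.9], random_state=42)
# hide = np.random.RandomState(42).rand(len(y_syn)) < 0.5
# y_syn[(y_syn == 1) & hide] = 0  # hidden positives now look unlabelled
# df = pd.DataFrame(X_syn).assign(TARGET_VARIABLE=y_syn)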
# Split input dataset into features and target.
y = df["TARGET_VARIABLE"] # CHANGE 'TARGET_VARIABLE' TO NAME OF TARGET VARIABLE
X = df.drop("TARGET_VARIABLE", axis=1) # CHANGE 'TARGET_VARIABLE' TO NAME OF TARGET VARIABLE
# Split input dataset into train (80%) and test (20%) for modelling
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
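# Optional variant: with heavily imbalanced PU data, a stratified split keeps the
# positive/unlabelled ratio the same in train and test.
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )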
## STANDARD RANDOM FOREST (test without PU Bagging) ##
# Fit a standard (class-balanced) random forest classifier, without the PU bagging
# method, as a baseline to compare against the PU bagging results below.
# Create basic random forest classifier (balanced class weights)
rf = RandomForestClassifier(class_weight="balanced", max_leaf_nodes=8, random_state=1)
rf = rf.fit(X_train, y_train)  # note: y_train may contain hidden positives labelled 0
y_pred = rf.predict(X_test)
## Generate Eval metrics for Random Forest ##
# Confusion Matrix
actual = y_test
predicted = y_pred
cm_result = confusion_matrix(actual, predicted)
custom_labels = ["TARGET_VARIABLE_0", "TARGET_VARIABLE_1"]  # CHANGE: TARGET_VARIABLE_0 = unlabelled instance (0), TARGET_VARIABLE_1 = positive instance (1)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm_result, display_labels=custom_labels)
cm_display.plot()
print("Confusion Matrix:")
plt.show()
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.2f}")
# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.2f}")
# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")
## PU BAGGING ##
# High-level summary of the next step in the process:
# This step generates 100 bootstrap samples from the training data by sampling with
# replacement. Each bootstrap sample is then balanced: the unlabelled (majority) class
# is subsampled to match the positive (minority) class within each bootstrap. Once the
# bootstrap is balanced, the process identifies all training cases not included in the
# balanced sample (out-of-bag, or OOB). A random forest classifier is then fitted to the
# rebalanced sample (X and y). The model is applied to the test data, and the predicted
# labels are recorded and stored. The model's OOB prediction probabilities for the
# positive class on the training data are also stored (used later for evaluating hidden
# positives). This concludes the iterative phase.
# Next, the average of all stored OOB predictions is calculated. This helps evaluate the
# model's performance on data unseen during its training, serving as a diagnostic for
# potential overfitting or underfitting, and can substitute for a separate validation set.
# The stored test-set predictions are aggregated using majority voting to make the final
# predictions. The idea is that combining multiple models through voting is more reliable
# than any individual model, reducing variance and making the overall model more robust.
# Set up parameters for bootstrapping
# Bootstrap Sampling (take multiple samples from the training set)
n_iterations = 100  # Number of bootstrap samples
n_size = len(X_train)
# A random forest is used as the base classifier; a fresh instance is created inside the
# loop so each fitted bootstrap model can be stored separately (reusing a single object
# across iterations would leave the list holding 100 references to the same final model)
# Initialize array to store OOB scores for each point in each iteration
oob_scores = np.zeros((n_size, n_iterations))
# Track how many times each training point was OOB (needed for a correct average later)
oob_counts = np.zeros(n_size)
# List to hold test-set predictions from each bootstrap sample (for evaluation)
bootstrap_predictions = []
# Store the fitted models so they can be applied to new data
# (sklearn forests are stored as whole fitted objects, not just "weights")
bootstrap_models = []
# Positional indices into X_train, used for reliable OOB bookkeeping
train_positions = np.arange(n_size)
for i in range(n_iterations):
    # Fresh random forest per iteration, so each fitted bootstrap model is kept separately
    model = RandomForestClassifier(random_state=i)
    # Sample row positions with replacement from the training data
    boot_positions = resample(train_positions, n_samples=n_size, random_state=i)
    y_boot = y_train.iloc[boot_positions]
    # Create balanced bootstrap sample: keep all positives (minority), subsample the
    # unlabelled class (majority) down to the same number within the bootstrap sample
    pos_positions = boot_positions[(y_boot == 1).to_numpy()]
    unlab_positions = boot_positions[(y_boot == 0).to_numpy()]
    # Sample with replacement from the unlabelled class to match the positive count
    neg_positions = resample(unlab_positions, n_samples=len(pos_positions), random_state=i)
    # Combine the positive and subsampled unlabelled positions into the balanced sample
    in_bag_positions = np.concatenate([pos_positions, neg_positions])
    X_bal_resample = X_train.iloc[in_bag_positions]
    y_bal_resample = y_train.iloc[in_bag_positions]
    # Identify OOB points (training rows whose position never appears in the balanced
    # sample; tracking positions avoids fragile row-matching on feature values)
    oob_indices = np.setdiff1d(train_positions, np.unique(in_bag_positions))
    # Train the model on the balanced resampled dataset
    model.fit(X_bal_resample, y_bal_resample)
    # Apply model to OOB points and record the predicted probability of class 1 (positive)
    oob_preds_proba = model.predict_proba(X_train.iloc[oob_indices])[:, 1]
    # Store OOB prediction scores and bump the OOB count for those points
    oob_scores[oob_indices, i] = oob_preds_proba
    oob_counts[oob_indices] += 1
    # Make predictions on the test set
    predictions = model.predict(X_test)
    bootstrap_predictions.append(predictions)
    # Append the fitted model from this bootstrap to the list of models
    bootstrap_models.append(model)
# Aggregate OOB scores: average each point's score over the iterations in which it was
# actually OOB (a plain mean over all iterations would count in-bag zeros) - ONLY FOR QA PURPOSES
average_oob_scores = np.divide(
    oob_scores.sum(axis=1), oob_counts, out=np.zeros(n_size), where=oob_counts > 0
)
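# Optional sketch: the averaged OOB scores can be used to rank unlabelled training
# points as candidate hidden positives ('top_n' is an illustrative choice, not a value
# defined elsewhere in this script).
unlab_mask = (y_train == 0).to_numpy()
oob_ranking = pd.Series(average_oob_scores[unlab_mask], index=X_train.index[unlab_mask])
top_n = 20
print("Unlabelled training rows the ensemble scores most positive:")
print(oob_ranking.sort_values(ascending=False).head(top_n))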
# Aggregate predictions for the test set using majority voting
bootstrap_predictions = np.array(bootstrap_predictions)
# Convert the predictions to integers if necessary (in case they are floats)
bootstrap_predictions = bootstrap_predictions.astype(int)
# Majority voting on the predictions
final_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=bootstrap_predictions)
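# Alternative aggregation (a sketch of "soft voting"): average the predicted probability
# of class 1 across the bootstrap models and threshold at 0.5. This can be less brittle
# than hard majority votes when the individual models disagree.
proba_sum = np.zeros(len(X_test))
for m in bootstrap_models:
    proba_sum += m.predict_proba(X_test)[:, 1]
soft_vote_predictions = (proba_sum / len(bootstrap_models) >= 0.5).astype(int)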
## Generate Eval metrics ##
# Confusion Matrix
actual = y_test
predicted = final_predictions
cm_result = confusion_matrix(actual, predicted) #simple confusion matrix
custom_labels = ["TARGET_VARIABLE_0", "TARGET_VARIABLE_1"]  # CHANGE: TARGET_VARIABLE_0 = unlabelled instance (0), TARGET_VARIABLE_1 = positive instance (1)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm_result, display_labels=custom_labels)
cm_display.plot()
print("Confusion Matrix:")
plt.show()
# Accuracy
accuracy = accuracy_score(y_test, final_predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Precision
precision = precision_score(y_test, final_predictions)
print(f"Precision: {precision:.2f}")
# Recall
recall = recall_score(y_test, final_predictions)
print(f"Recall: {recall:.2f}")
# F1 Score
f1 = f1_score(y_test, final_predictions)
print(f"F1 Score: {f1:.2f}")
## HOW THE FITTED BOOTSTRAP MODELS WOULD BE APPLIED TO NEW DATA ##
# predictions = []
# for model in bootstrap_models:
#     prediction = model.predict(X_new)  # X_new = new records with the same feature columns
#     predictions.append(prediction)
# E.g. each new record (user) is scored by all 100 bootstrap models and the individual
# predictions are then combined by majority voting, as above.
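# A runnable version of the sketch above (X_new is a hypothetical DataFrame of new
# records with the same feature columns as X_train):
def pu_bagging_predict(models, X_new):
    """Score new data with every bootstrap model and majority-vote the results."""
    votes = np.array([m.predict(X_new) for m in models]).astype(int)
    return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=votes)
# Example usage (once X_new exists):
# final_new_predictions = pu_bagging_predict(bootstrap_models, X_new)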
### END ###
## FEEL FREE TO ADD ADJUSTMENTS TO THE CODE TO MAKE IT EASIER FOR THE NEXT USER ##