-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtitanic_survival_prediction.py
122 lines (102 loc) · 3.27 KB
/
titanic_survival_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#Kaggle titanic challenge
import csv
import torch
import torch.nn as nn
import torch.optim as optim
import torch.cuda as cuda
import random
from torch.utils.data import Dataset, DataLoader
import numpy as np
# Fraction of the shuffled rows held out for validation; the remaining rows
# are used for training.
validation_to_training_ratio = 0.15
raw_label_names = []
processed_data = []
train_data_path = 'train.csv'

# Column indices of train.csv that are kept. The first kept column later
# becomes the label vector; column 4 is the only one read as a string.
# NOTE(review): presumably Survived, Pclass and Sex in Kaggle's layout - confirm.
feature_list = [1, 2, 4]

with open(train_data_path, 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The header row is remembered for reference and never parsed as data.
        if row[0] == "PassengerId":
            raw_label_names = row
            continue
        # Column 4 is encoded as 0 for "male" and 1 for anything else; every
        # other kept column is parsed as an integer.
        processed_data.append([
            (0 if row[4] == "male" else 1) if column == 4 else int(row[column])
            for column in feature_list
        ])

processed_data = np.asarray(processed_data, dtype=np.float32)
np.random.shuffle(processed_data)  # shuffles the rows in place

# Peel the first kept column off as the labels, leaving only the features.
processed_labels = processed_data[:, 0]
processed_data = np.delete(processed_data, 0, axis=1)

# The leading `split_index` rows form the validation set, the rest training.
split_index = int(len(processed_data) * validation_to_training_ratio)
split_labels = np.split(processed_labels, [split_index])
val_labels, train_labels = split_labels
split_data = np.split(processed_data, [split_index])
val_data, train_data = split_data
# Minimal map-style dataset pairing samples with labels for a PyTorch DataLoader.
class Generic_Dataset(Dataset):
    """Index-aligned view over a sample container and a label container."""

    def __init__(self, data, labels):
        """Keep references to `data` and `labels`; nothing is copied or converted."""
        self.labels = labels
        self.data = data

    def __len__(self):
        """Return the number of samples, as reported by `data`."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the `(sample, label)` pair stored at `index`."""
        return self.data[index], self.labels[index]
# Serve the training split in shuffled mini-batches of 64 samples.
train_dataset = Generic_Dataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)

# Fully connected network: 2 inputs -> 30 -> 30 -> 1 output, ReLU in between.
model = nn.Sequential(
    nn.Linear(2, 30),
    nn.ReLU(),
    nn.Linear(30, 30),
    nn.ReLU(),
    nn.Linear(30, 1),
)

# Training hyper-parameters: plain SGD minimising mean squared error.
learning_rate = 1e-2
n_epochs = 100
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Run on the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Train the model: one optimiser step per mini-batch, for `n_epochs` passes
# over `train_loader`.
for epoch in range(n_epochs):
    for data, labels in train_loader:
        data = data.to(device=device)
        labels = labels.to(device=device)
        # The model and its input are both on `device`, so the output and the
        # loss are created there too; no further transfers are needed.
        outputs = model(data)
        # Drop only the trailing axis so the prediction keeps its batch axis.
        # A bare squeeze() would also collapse a one-sample batch to a scalar,
        # which no longer matches the shape of `labels`.
        loss = loss_fn(outputs.squeeze(1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Uncomment to trace the last mini-batch loss of every epoch:
    #print("Epoch: %d, Loss: %f" % (epoch, float(loss)))
# Validation run: score the held-out split and report the fraction of
# samples whose rounded prediction equals the label.
val_dataset = Generic_Dataset(val_data, val_labels)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)
correct = 0
total = 0
with torch.no_grad():  # inference only, no gradients required
    for data, labels in val_loader:
        data = data.to(device=device)
        labels = labels.to(device=device)
        # The output is already on `device`; no extra transfer is needed.
        outputs = model(data)
        print(outputs)
        # Round with torch on every device instead of branching on CUDA
        # availability and passing a tensor through NumPy on the CPU path.
        # flatten() turns the output into a 1-D vector for the comparison.
        outputs = torch.round(outputs).flatten()
        print(outputs)
        total += labels.shape[0]
        correct += int((outputs == labels).sum())
# An empty validation split (split_index == 0) would otherwise divide by zero.
if total:
    print("Accuracy: %f" % (correct / total))
else:
    print("Accuracy: n/a (validation set is empty)")