-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJetClassification.R
89 lines (75 loc) · 2.93 KB
/
JetClassification.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# NOTE(review): a hard-coded, machine-specific setwd() is an anti-pattern in
# scripts — run the script from the project root (or use an RStudio project /
# here::here()) instead. Kept commented for reference.
# setwd("~/Clones/jet-classification")

# Load the plotting dependencies up front
library(ggplot2)
library(RColorBrewer)
library(reshape2)

# Importing the signal dataset
# (each row is a jet image: 625 columns = 25x25 pixel intensities)
signal_data <- read.table('signal_PU0_13TeV_MJ-65-95_PTJ-250-300_ext.txt')
print(dim(signal_data))
print(as.matrix(signal_data[1:10, 625]))

# Plot the mean signal image as a heatmap
mean_signal <- colMeans(signal_data)
dim(mean_signal) <- c(25, 25)  # reshape the 625-vector into a 25x25 image
mean_signal.melted <- melt(mean_signal)
colnames(mean_signal.melted) <- c("X", "Y", "Value")
hm.palette <- colorRampPalette(c("#f7fbff", "#08306b"))
ggplot(mean_signal.melted, aes(x = X, y = Y, fill = Value)) +
  geom_tile() +
  coord_equal() +
  scale_fill_gradientn(colours = hm.palette(100))

# Adding "1" in a new column 626, classifying every row as signal
signal_data[, 626] <- 1
print(as.matrix(signal_data[1:10, 626]))
# Repeat the preprocessing steps with the background data
# (fix: use <- for assignment, consistent with the rest of the script)
background_data <- read.table('background_PU0_13TeV_MJ-65-95_PTJ-250-300_ext.txt')
print(dim(background_data))

# Mean background image heatmap (reuses hm.palette defined above)
mean_background <- colMeans(background_data)
dim(mean_background) <- c(25, 25)
mean_background.melted <- melt(mean_background)
colnames(mean_background.melted) <- c("X", "Y", "Value")
ggplot(mean_background.melted, aes(x = X, y = Y, fill = Value)) +
  geom_tile() +
  coord_equal() +
  scale_fill_gradientn(colours = hm.palette(100))

# Adding "0" in column 626, classifying every row as background
background_data[, 626] <- 0
print(as.matrix(background_data[1:10, 626]))
# Concatenating frames and shuffling the rows
full_data <- rbind(signal_data, background_data)
set.seed(123)  # reproduce results
full_data <- full_data[sample(nrow(full_data)), ]
print(full_data[1:10, 626])

# Defining and training the model
## Separating training and test sets (70/30, stratified on the label)
library(caTools)
set.seed(123)
# (fix: <- for assignment; a logical vector needs no `== TRUE` comparison)
split <- sample.split(full_data[[626]], SplitRatio = 0.7)
training_set <- full_data[split, ]
test_set <- full_data[!split, ]
print(dim(training_set))
print(dim(test_set))
## Removing zero-variance columns
## (fix: compute the variance mask once — on the training set only, so the
## same columns are dropped from both sets without leaking test information)
nonzero_var <- apply(training_set, 2, var, na.rm = TRUE) != 0
test_set <- test_set[, nonzero_var]
training_set <- training_set[, nonzero_var]

# Applying PCA
# install.packages('caret')
library(caret)
# install.packages('e1071')
library(e1071)
pcaElem <- 60  # number of principal components to keep

# The label (V626) was the last column and survives the variance filter, so
# it stays last; exclude it from the PCA fit by position.
# (fix: the previous hard-coded index 540 silently breaks as soon as the
# number of surviving columns changes with the input data)
label_col <- ncol(training_set)
pca <- preProcess(x = training_set[-label_col], method = 'pca',
                  pcaComp = pcaElem)

# predict() returns the untouched label column first, then PC1..PCn;
# reorder so the PCs come first and the label is last again
training_set <- predict(pca, training_set)
training_set <- training_set[c(2:(pcaElem + 1), 1)]
test_set <- predict(pca, test_set)
test_set <- test_set[c(2:(pcaElem + 1), 1)]
## Fitting logistic regression to the training set (label column is V626)
classifier <- glm(formula = V626 ~ .,
                  family = binomial,
                  data = training_set)

## Predicting the test set results
label_idx <- pcaElem + 1  # label is the last column after the PCA reorder
prob_pred <- predict(classifier, type = 'response',
                     newdata = test_set[-label_idx])
y_pred <- ifelse(prob_pred > 0.5, 1, 0)

## Making the confusion matrix
# (fix: y_pred is already 0/1 — re-thresholding it with `> 0.5` was
# redundant; `[[` extracts the label as a vector instead of a 1-col frame)
cm <- table(test_set[[label_idx]], y_pred)

## Computing accuracy
misClasificError <- mean(y_pred != test_set[[label_idx]])
print(paste('Accuracy', 1 - misClasificError))