-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcleaner.R
107 lines (73 loc) · 3.74 KB
/
cleaner.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Jess, Anna and Seth Project
# 11/29/19
source('helpers.R')
source('packages.R')
### READ IN THE DATA ###
# Build the full long-format data frame with read_data()
# (not defined in this file; presumably provided by helpers.R).
long.data <- read_data()

# Persist the raw long data frame, then load it back from disk so the rest of
# the script works from the CSV copy.
write.csv(long.data, file = "data/full_data_raw.csv")
full_df_raw <- read.csv("data/full_data_raw.csv")

# Drop predictors that we know from data collection are bad (first row of
# names) along with some highly collinear win/loss splits (second row).
# NOTE(review): X is presumably the row-name column created by the CSV
# round trip above -- confirm.
full_df <- select(
  full_df_raw,
  -c(Pace, ORtg, STL., ORB., MP, X, Rk,
     ConfW, ConfL, HomeW, HomeL, AwayW, AwayL)
)
### FIX SCHOOL NAMES ###
# School names are inconsistent: the data source appends " NCAA" to a team's
# name when it made a tournament appearance, so the same school can appear
# under two spellings. Inspect the raw counts first.
table(full_df$School)

# Keep the raw names, and their distinct values, before cleaning.
schools <- as.character(full_df$School)
unique_schools <- names(table(full_df$School))
head(unique_schools)

# Strip the whitespace-plus-"NCAA" marker so both spellings collapse to one.
schools_clean <- gsub("[[:space:]]+NCAA", "", schools)
unique_schools_clean <- names(table(schools_clean))

# Overwrite the School column with the cleaned names, stored as a factor.
full_df$School <- as.factor(schools_clean)

# Tabulate again to confirm the suffixed names are gone.
table(full_df$School)
### MISSING VALUES ###
# Check that values make sense in the summary
summary(full_df)
# Author's notes from inspecting that summary:
# Fortunately, we don't have any missing values for anything relating to
# 3-pointers. There are a few missing values, but I'm not certain we'll even
# use those columns as predictors, so we'll cross that bridge when we get there.

### SUBSET DF INTO COMPLETE AND INCOMPLETE ###
# Some schools don't have data for all of the years we want to study.
# Split them off into a separate data frame (written to disk) and keep only
# the schools with a row count equal to the number of distinct years.
if (is.null(full_df$year)) {
  stop("full_df has no `year` column; cannot check completeness",
       call. = FALSE)
}
n_years <- length(unique(full_df$year))

# Rows per school; a school counts as complete when it has one row per year.
school_counts <- table(full_df$School)
idx_complete <- which(school_counts == n_years)

# Select incomplete schools with a logical mask on the same table. Negative
# indexing with an empty idx_complete would return NO names (x[-integer(0)]
# is empty), silently treating every school as complete.
schools_incomplete <-
  names(school_counts)[as.vector(school_counts) != n_years]

is_incomplete_row <- full_df$School %in% schools_incomplete

df_incomplete <- full_df[is_incomplete_row, ]
write.csv(df_incomplete, file = "data/incomplete_data_clean.csv")

# Now subset our df for rows belonging to schools with complete data
df_complete <- full_df[!is_incomplete_row, ]

# Author's note: 322 unique schools with complete data from the 2003-2017
# seasons at the time of writing
length(unique(df_complete$School))

# Write Clean Df
write.csv(df_complete, file = "data/complete_data_clean.csv")
### Tournament Teams for Simpler Model ###
# The model has convergence issues, so it might be best to restrict the number
# of teams. Start again from the raw CSV and repeat the cleaning steps.
full_df_raw <- read.csv("data/full_data_raw.csv")
full_df <- select(
  full_df_raw,
  -c(Pace, ORtg, STL., ORB., MP, X, Rk,
     ConfW, ConfL, HomeW, HomeL, AwayW, AwayL)
)

# Raw school names and their distinct values (these still carry the " NCAA"
# marker, which is used afterwards to identify tournament teams).
schools <- as.character(full_df$School)
unique_schools <- names(table(full_df$School))
head(unique_schools)

# Strip the whitespace-plus-"NCAA" marker from every name.
schools_clean <- gsub("[[:space:]]+NCAA", "", schools)
unique_schools_clean <- names(table(schools_clean))

# Overwrite the School column with the cleaned names, stored as a factor.
full_df$School <- as.factor(schools_clean)
# Get just tourney schools: any raw name containing "NCAA" marks a school
# that appears with the tournament marker in at least one row.
tourney_teams_idx <- grep("NCAA", unique_schools)
tourney_teams <- unique_schools[tourney_teams_idx]
tourney_teams_clean <- gsub("[[:space:]]+NCAA", "", tourney_teams)

# Keep every row (all years) for those schools.
df_tourney <- full_df[full_df$School %in% tourney_teams_clean, ]
table(df_tourney$School)

# Check for completeness: a school is complete when its row count equals the
# number of distinct years present in the tourney subset.
n_tourney_years <- length(unique(df_tourney$year))
tourney_counts <- table(df_tourney$School)
idx_complete <- which(tourney_counts == n_tourney_years)

# Take the incomplete names from the same table with a logical mask. Negative
# indexing with an empty idx_complete would return no names at all, and
# indexing a different table's names by position only lines up while unused
# factor levels are retained.
schools_incomplete <-
  names(tourney_counts)[as.vector(tourney_counts) != n_tourney_years]

df_tourney_complete <-
  df_tourney[!(df_tourney$School %in% schools_incomplete), ]
table(df_tourney_complete$School)

# Write csv
write.csv(df_tourney_complete, file = "data/tourney_data_clean.csv")