-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcollectData.R
83 lines (64 loc) · 3.4 KB
/
collectData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# collectData.R
#
# Purpose: Read the batter and pitcher .csv files into data frames and filter
# them by year. Return a single merged data frame containing both batters and
# pitchers.
#
# Author: Jedid Ahn (jedid.ahn@mail.utoronto.ca)
#
# Date: 2018-08-04
#
# ==============================================================================
# Build one data frame of batters and pitchers for a single season.
#
# Reads "data/batter_values.csv" and "data/pitcher_values.csv" (paths are
# relative to the current working directory). Both files must provide the
# columns player_ID, year_ID, WAR, salary and name_common, and must share the
# same column set so that they can be row-bound.
#
# Args:
#   year: Season to keep; compared against the year_ID column with `==`.
#
# Returns:
#   A data frame sorted by last_name with one row per player_ID. Compared with
#   the input files it has an extra character column `last_name`, the `salary`
#   column renamed to `actual_salary`, and `WAR` / `actual_salary` converted to
#   numeric. Rows containing NA in any atomic column are dropped. When no row
#   matches `year`, a zero-row data frame with the same columns is returned.
collectData <- function(year) {
  # Read one values file and keep the usable rows for `year`.
  readSeason <- function(path) {
    values <- read.csv(path, stringsAsFactors = FALSE)

    # A player who switched teams mid season has several rows for the same
    # player_ID / year_ID. duplicated() flags only the second and later rows,
    # so the first row is kept and the remaining ones are dropped.
    # NOTE(review): the kept row carries only that row's WAR and salary, not
    # the season total. If such players should be excluded entirely, also flag
    # the first row with duplicated(..., fromLast = TRUE) - confirm intent.
    repeated <- duplicated(values[, c("player_ID", "year_ID")])
    values <- values[!repeated, ]

    # Missing WAR / salary values are spelled "NULL" in the csv files.
    keep <- (values$year_ID == year) & (values$WAR != "NULL") &
      (values$salary != "NULL")

    # which() discards NA comparisons (blank cells). A logical mask holding
    # NA would otherwise insert phantom all-NA rows into the result.
    values[which(keep), ]
  }

  batterData <- readSeason("data/batter_values.csv")
  pitcherData <- readSeason("data/pitcher_values.csv")

  # Merge data frames.
  combinedData <- rbind(batterData, pitcherData)

  # Last name for the horizontal plot axis: everything after the first space
  # of name_common, or the whole name when it contains no space.
  nameParts <- regmatches(combinedData$name_common,
                          regexpr(" ", combinedData$name_common),
                          invert = TRUE)
  # vapply() keeps the column character even when there are zero rows.
  combinedData$last_name <- vapply(
    nameParts,
    function(parts) {
      if (length(parts) > 0L) parts[[length(parts)]] else NA_character_
    },
    character(1)
  )

  # Sort by last name.
  combinedData <- combinedData[order(combinedData$last_name), ]

  # A player_ID can occur in both files (pitchers who bat, position players
  # who pitch, two-way players). Keep the first row of each player and add the
  # WAR of the later row onto it.
  combinedData$WAR <- as.numeric(combinedData$WAR)
  repeated <- duplicated(combinedData$player_ID)
  extraIds <- combinedData$player_ID[repeated]
  extraWar <- combinedData$WAR[repeated]
  combinedData <- combinedData[!repeated, ]

  # Row positions of the surviving first occurrences.
  firstRows <- match(extraIds, combinedData$player_ID)
  combinedData$WAR[firstRows] <- combinedData$WAR[firstRows] + extraWar

  # Rename "salary" to "actual_salary" to distinguish it from
  # "deserved_salary", then make it numeric.
  colnames(combinedData)[names(combinedData) == "salary"] <- "actual_salary"
  combinedData$actual_salary <- as.numeric(combinedData$actual_salary)

  # Drop any remaining incomplete rows.
  na.omit(combinedData)
}
# [END]