Skip to content

Commit

Permalink
.Rproject
Browse files Browse the repository at this point in the history
  • Loading branch information
saiemgilani committed Apr 5, 2023
1 parent 16c2e51 commit 0c6d80a
Show file tree
Hide file tree
Showing 9 changed files with 1,181 additions and 1 deletion.
16 changes: 16 additions & 0 deletions .gitignore
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
.vscode
__pycache__
*/__pycache__
./__pycache__
.DS_Store
*/.DS_Store
.pytest_cache
*/.pytest_cache
*.pyc
*/*.pyc
*.pyc

# History files
.Rhistory
.Rapp.history
Expand Down
105 changes: 105 additions & 0 deletions R/0000_create_wehoop_releases_init.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@

#--- Release definitions -----
# Creates one GitHub release per data set in sportsdataverse/sportsdataverse-data.
# Each entry maps a release tag (reused as the release name) to its description.
# Releases are created in the order listed here.
local({
  release_bodies <- c(
    #--- ESPN WBB Data -----
    espn_womens_college_basketball_schedules =
      "NCAA Women's College Basketball Schedules Data (from ESPN)",
    espn_womens_college_basketball_team_boxscores =
      "NCAA Women's College Basketball Team Boxscores Data (from ESPN)",
    espn_womens_college_basketball_player_boxscores =
      "NCAA Women's College Basketball Player Boxscores Data (from ESPN)",
    espn_womens_college_basketball_pbp =
      "NCAA Women's College Basketball Play-by-Play Data (from ESPN)",
    #--- ESPN WNBA Data -----
    espn_wnba_schedules = "WNBA Schedules Data (from ESPN)",
    espn_wnba_team_boxscores = "WNBA Team Boxscores Data (from ESPN)",
    espn_wnba_player_boxscores = "WNBA Player Boxscores Data (from ESPN)",
    espn_wnba_pbp = "WNBA Play-by-Play Data (from ESPN)",
    #--- WNBA Stats Data -----
    wnba_stats_schedules = "WNBA Schedules Data (from stats.wnba.com)",
    wnba_stats_team_boxscores = "WNBA Team Boxscores Data (from stats.wnba.com)",
    wnba_stats_player_boxscores = "WNBA Player Boxscores Data (from stats.wnba.com)",
    wnba_stats_pbp = "WNBA Play-by-Play Data (from stats.wnba.com)"
  )

  # The token is read from the GITHUB_PAT environment variable on every call.
  for (release_tag in names(release_bodies)) {
    piggyback::pb_release_create(
      repo = "sportsdataverse/sportsdataverse-data",
      tag = release_tag,
      name = release_tag,
      body = release_bodies[[release_tag]],
      .token = Sys.getenv("GITHUB_PAT")
    )
  }
})
170 changes: 170 additions & 0 deletions R/0001_push_existing_release_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
lib_path <- Sys.getenv("R_LIBS")
if (!requireNamespace("pacman", quietly = TRUE)){
install.packages("pacman",lib=Sys.getenv("R_LIBS"), repos="http://cran.us.r-project.org")
}
suppressPackageStartupMessages(suppressMessages(library(dplyr, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(magrittr, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(jsonlite, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(purrr, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(progressr, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(data.table, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(qs, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(arrow, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(glue, lib.loc=lib_path)))
suppressPackageStartupMessages(suppressMessages(library(optparse, lib.loc=lib_path)))



# Cast the schedule identifier columns to integer and the display clock to
# character so every season file goes out with the same column types.
coerce_schedule_types <- function(sched) {
  sched %>%
    dplyr::mutate(
      id = as.integer(.data$id),
      game_id = as.integer(.data$game_id),
      status_display_clock = as.character(.data$status_display_clock)
    )
}

# Push every file found in `rds_dir` to a sportsdataverse-data release.
#
# For each file the data is read with readRDS(), optionally reshaped by
# `prepare`, passed through wehoop:::make_wehoop_data() together with
# `data_label` and the current time, and handed to
# sportsdataversedata::sportsdataverse_save() as rds, csv and parquet.
#
# @param rds_dir Directory holding the per-season .rds files.
# @param data_label Description string passed to make_wehoop_data().
# @param file_prefix Prefix of the saved file name; the first run of digits in
#   the source file name (presumably the season -- confirm) is appended as
#   `{file_prefix}_{digits}`.
# @param sportsdataverse_type Value for sportsdataverse_save()'s
#   `sportsdataverse_type` argument.
# @param release_tag Release tag the files are saved under.
# @param prepare Function applied to the data right after reading
#   (defaults to `identity`).
# @return The vector of file names found in `rds_dir`, invisibly.
push_existing_release_data <- function(rds_dir,
                                       data_label,
                                       file_prefix,
                                       sportsdataverse_type,
                                       release_tag,
                                       prepare = identity) {
  season_files <- list.files(path = rds_dir)

  # walk() rather than map(): only the upload side effect matters, so there is
  # no need to hold on to every save result until the loop finishes.
  purrr::walk(season_files, function(x) {
    season_data <- readRDS(file.path(rds_dir, x)) %>%
      prepare() %>%
      wehoop:::make_wehoop_data(data_label, Sys.time())
    y <- stringr::str_extract(x, "\\d+")
    sportsdataversedata::sportsdataverse_save(
      data_frame = season_data,
      file_name = glue::glue("{file_prefix}_{y}"),
      sportsdataverse_type = sportsdataverse_type,
      release_tag = release_tag,
      file_types = c("rds", "csv", "parquet"),
      .token = Sys.getenv("GITHUB_PAT")
    )
  })

  invisible(season_files)
}

#--- ESPN WBB Data -----
push_existing_release_data(
  rds_dir = "wbb/schedules/rds",
  data_label = "ESPN WBB Schedule from wehoop data repository",
  file_prefix = "wbb_schedule",
  sportsdataverse_type = "schedule data",
  release_tag = "espn_womens_college_basketball_schedules",
  prepare = coerce_schedule_types
)

push_existing_release_data(
  rds_dir = "wbb/pbp/rds",
  data_label = "ESPN WBB Play-by-Play from wehoop data repository",
  file_prefix = "play_by_play",
  sportsdataverse_type = "Play-by-Play data",
  release_tag = "espn_womens_college_basketball_pbp"
)

push_existing_release_data(
  rds_dir = "wbb/team_box/rds",
  data_label = "ESPN WBB Team Boxscores from wehoop data repository",
  file_prefix = "team_box",
  sportsdataverse_type = "Team Boxscores data",
  release_tag = "espn_womens_college_basketball_team_boxscores"
)

push_existing_release_data(
  rds_dir = "wbb/player_box/rds",
  data_label = "ESPN WBB Player Boxscores from wehoop data repository",
  file_prefix = "player_box",
  sportsdataverse_type = "Player Boxscores data",
  release_tag = "espn_womens_college_basketball_player_boxscores"
)

#--- ESPN WNBA Data -----
push_existing_release_data(
  rds_dir = "wnba/schedules/rds",
  data_label = "ESPN WNBA Schedule from wehoop data repository",
  file_prefix = "wnba_schedule",
  sportsdataverse_type = "schedule data",
  release_tag = "espn_wnba_schedules",
  prepare = coerce_schedule_types
)

push_existing_release_data(
  rds_dir = "wnba/pbp/rds",
  data_label = "ESPN WNBA Play-by-Play from wehoop data repository",
  file_prefix = "play_by_play",
  sportsdataverse_type = "Play-by-Play data",
  release_tag = "espn_wnba_pbp"
)

push_existing_release_data(
  rds_dir = "wnba/team_box/rds",
  data_label = "ESPN WNBA Team Boxscores from wehoop data repository",
  file_prefix = "team_box",
  sportsdataverse_type = "Team Boxscores data",
  release_tag = "espn_wnba_team_boxscores"
)

push_existing_release_data(
  rds_dir = "wnba/player_box/rds",
  data_label = "ESPN WNBA Player Boxscores from wehoop data repository",
  file_prefix = "player_box",
  sportsdataverse_type = "Player Boxscores data",
  release_tag = "espn_wnba_player_boxscores"
)
26 changes: 26 additions & 0 deletions R/minify_json_folders.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@


# Rewrite every `.json` file in `json_dir` in place as compact JSON.
#
# Each file is parsed with jsonlite::fromJSON() and written back to the same
# path with jsonlite::write_json(pretty = FALSE). Files are processed in
# parallel through furrr using whatever future plan is active.
#
# NOTE(review): this is a parse/re-serialise round trip, so the output follows
# jsonlite's defaults for simplification and number formatting rather than
# being a whitespace-only minification of the original text. If byte-faithful
# minification is wanted, jsonlite::minify() on the raw text is the
# alternative -- confirm which is intended.
#
# @param json_dir Directory containing the JSON files to rewrite.
# @return The processed file paths, invisibly.
minify_json_folder <- function(json_dir) {
  # Match on the real ".json" extension and keep the file names as they are;
  # stripping with the regex '.json' and casting to integer breaks on any
  # name that is not a plain integer.
  json_paths <- list.files(
    path = json_dir,
    pattern = "\\.json$",
    full.names = TRUE
  )

  # Side effects only, so walk instead of row-binding a list of NULLs.
  furrr::future_walk(
    json_paths,
    function(json_path) {
      resp <- jsonlite::fromJSON(json_path)
      jsonlite::write_json(resp, json_path, pretty = FALSE)
    },
    .options = furrr::furrr_options(seed = TRUE)
  )

  invisible(json_paths)
}

future::plan("multisession")

minify_json_folder("wbb/json/final")
minify_json_folder("wbb/json/raw")
62 changes: 62 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
lib_path <- Sys.getenv("R_LIBS")

suppressPackageStartupMessages(suppressMessages(library(dplyr, lib.loc = lib_path)))
suppressPackageStartupMessages(suppressMessages(library(httr, lib.loc = lib_path)))
suppressPackageStartupMessages(suppressMessages(library(jsonlite, lib.loc = lib_path)))
suppressPackageStartupMessages(suppressMessages(library(glue, lib.loc = lib_path)))
suppressPackageStartupMessages(suppressMessages(library(purrr, lib.loc = lib_path)))
# Fetch the proxy list for a Proxy Bonanza user package.
#
# Requests the user package JSON from the Proxy Bonanza API, reads the `data`
# element of the response and copies the package-level login and password
# onto every row of its `ippacks` table.
#
# @param api_key Value sent in the Authorization header; defaults to the
#   PROXY_BONANZA_KEY environment variable.
# @param user_package User package id interpolated into the request URL;
#   defaults to the PROXY_BONANZA_USERPKG environment variable.
# @return The `ippacks` table reduced to the columns `ip`, `port_http`,
#   `login` and `password`.
get_proxy_bonanza_ips <- function(
    api_key = Sys.getenv("PROXY_BONANZA_KEY"),
    user_package = Sys.getenv("PROXY_BONANZA_USERPKG")) {
  endpoint <- glue::glue("https://proxybonanza.com/api/v1/userpackages/{user_package}.json")
  response <- httr::RETRY(
    "GET",
    endpoint,
    httr::add_headers(Authorization = paste(api_key))
  )
  body_text <- httr::content(response, as = "text", encoding = "UTF-8")

  package_data <- purrr::pluck(jsonlite::fromJSON(body_text), "data")

  # Credentials live on the package, not on the individual IP entries.
  ip_packs <- package_data$ippacks
  ip_packs$login <- package_data$login
  ip_packs$password <- package_data$password

  dplyr::select(ip_packs, "ip", "port_http", "login", "password")
}

# Pick one proxy at random and build the matching httr proxy configuration.
#
# @param proxies Table with the columns `ip`, `port_http`, `login` and
#   `password`; defaults to the result of get_proxy_bonanza_ips().
# @return The object returned by httr::use_proxy() for the chosen row.
select_proxy <- function(proxies = get_proxy_bonanza_ips()) {
  if (nrow(proxies) == 0) {
    stop("No proxies available to select from.", call. = FALSE)
  }

  # Sample a row rather than an IP value so exactly one row is used even if an
  # IP appears more than once in the table.
  row_idx <- sample.int(nrow(proxies), 1)
  proxy_selected <- proxies[row_idx, , drop = FALSE]

  # The port column is `port_http`; `$port` only resolved through data.frame
  # partial matching and is NULL on a tibble.
  httr::use_proxy(
    url = proxy_selected[["ip"]],
    port = proxy_selected[["port_http"]],
    username = proxy_selected[["login"]],
    password = proxy_selected[["password"]]
  )
}


# Collapse a one-row-per-team game log into one row per game.
#
# Rows whose MATCHUP contains "@" are treated as the away side, all others as
# the home side. The WL column is dropped. Every column other than SEASON_ID,
# GAME_ID, GAME_DATE and MATCHUP is prefixed with "AWAY_" or "HOME_", and the
# home rows are left-joined onto the away rows by game.
#
# @param df Data frame with at least the columns SEASON_ID, GAME_ID,
#   GAME_DATE, MATCHUP and WL.
# @return One row per away entry: the three key columns, the away row's
#   MATCHUP, the AWAY_* columns and the joined HOME_* columns.
rejoin_schedules <- function(df) {
  key_cols <- c("SEASON_ID", "GAME_ID", "GAME_DATE")

  df <- df %>%
    dplyr::mutate(
      HOME_AWAY = ifelse(stringr::str_detect(.data$MATCHUP, "@"), "AWAY", "HOME")
    ) %>%
    dplyr::select(-dplyr::all_of("WL"))

  away_df <- df %>%
    dplyr::filter(.data$HOME_AWAY == "AWAY") %>%
    dplyr::select(-dplyr::all_of("HOME_AWAY")) %>%
    dplyr::select(dplyr::all_of(c(key_cols, "MATCHUP")), tidyr::everything())
  # Prefix by name instead of by position: `5:ncol()` counts backwards when
  # there are no stat columns and would corrupt the column names.
  away_stat <- !(colnames(away_df) %in% c(key_cols, "MATCHUP"))
  if (any(away_stat)) {
    colnames(away_df)[away_stat] <- paste0("AWAY_", colnames(away_df)[away_stat])
  }

  home_df <- df %>%
    dplyr::filter(.data$HOME_AWAY == "HOME") %>%
    dplyr::select(-dplyr::all_of(c("HOME_AWAY", "MATCHUP"))) %>%
    dplyr::select(dplyr::all_of(key_cols), tidyr::everything())
  home_stat <- !(colnames(home_df) %in% key_cols)
  if (any(home_stat)) {
    colnames(home_df)[home_stat] <- paste0("HOME_", colnames(home_df)[home_stat])
  }

  away_df %>%
    dplyr::left_join(home_df, by = c("GAME_ID", "SEASON_ID", "GAME_DATE"))
}
Loading

0 comments on commit 0c6d80a

Please sign in to comment.