Skip to content

Commit

Permalink
idk how but maybe this is more efficient?
Browse files Browse the repository at this point in the history
  • Loading branch information
colebrookson committed Oct 2, 2023
1 parent f27e7a3 commit c732697
Showing 1 changed file with 34 additions and 29 deletions.
63 changes: 34 additions & 29 deletions Code/02_1c_Format GenBank.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,10 @@

library(magrittr)

# Source the helper scripts only when `vdict` / `jvdict` do not already exist
# in the session (presumably each script defines the corresponding object -
# confirm against the helper files). Paths are built with here::here().
if(!exists("vdict")) {source(here::here("./Code/001_TaxizeFunctions.R"))}
if(!exists("jvdict")) {source(here::here("./Code/001_Julia functions.R"))}

# Attaching GenBank: read the gzipped, unformatted GenBank CSV into `gb`
gb <- vroom::vroom(
here::here("./Intermediate/Unformatted/GenBankUnformatted.csv.gz"))

# structure ====================================================================
# Same conditional sourcing as above, but with plain relative paths instead of
# here::here(); the print() calls are progress markers for the console/log.
if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')}
print("vdict")
if(!exists('jvdict')) {source('Code/001_Julia functions.R')}
print("jvdict")

temp <- data.frame(Host = character(),
Virus = character(),
Expand Down Expand Up @@ -51,20 +47,18 @@ temp <- data.frame(Host = character(),
CollectionDay = double(),
stringsAsFactors = FALSE)

## deal with naming and conventions ============================================
# Report how many rows `gb` holds before any formatting is applied
print(paste0("rows in genbank as of this run is: ", nrow(gb)))
# Attaching GenBank: read the gzipped, unformatted GenBank CSV into `gb`
gb <- vroom::vroom("Intermediate/Unformatted/GenBankUnformatted.csv.gz")
print("read in")
# Rename `Accession` to `NCBIAccession`, then normalise `Release_Date`
gb %<>%
dplyr::rename(NCBIAccession = "Accession") %>%
dplyr::rename(NCBIAccession = 'Accession') %>%
dplyr::rename(Release_Date = Release_Date) %>% # renames the column to its own name, i.e. a no-op

# Keep only the text before the first "T" in Release_Date and parse it as a
# date (presumably the raw value is a date-time stamp - confirm).
# TODO(review): unclear whether this reformatting is needed at this point.
dplyr::mutate_at("Release_Date", ~.x %>% # Modifying date column to make sense
stringr::str_split("T") %>% # Splitting at this midpoint
purrr::map_chr(1) %>% # Taking the first component
lubridate::ymd() # Coding as YMD (shouldn't throw errors)
lubridate::ymd() # Coding as YMD (shouldn't throw errors)
)

print("renamed")
print("renamed")

gb[, c(paste0("Collection", c("Year", "Month", "Day")))] <-
data.table::tstrsplit(gb$Collection_Date, "-",
Expand All @@ -75,32 +69,43 @@ gb[, c(paste0("Release", c("Year", "Month", "Day")))] <-
names=paste0("Release",
c("Year", "Month", "Day")))

# gb %<>%
# # the collection date is a string and many observations don't have month
# # or day values, just the year, so many of these components will turn up
# # as missing
# tidyr::separate(Collection_Date, sep = "-",
# into = paste0("Collection", c("Year", "Month", "Day"))) %>%
# tidyr::separate(Release_Date, sep = "-",
# into = paste0("Release", c("Year", "Month", "Day")))
print("separated")

# Convert every column whose name matches Year, Month or Day to numeric, then
# add the host flag and the constant database/detection columns.
gb %<>%
dplyr::mutate_at(dplyr::vars(tidyselect::matches("Year|Month|Day")),
as.numeric) %>%
# NOTE(review): str_detect() treats "cf." as a regular expression, so "."
# matches any character; use stringr::fixed("cf.") or "cf\\." if a literal
# period is intended.
dplyr::mutate(HostFlagID = stringr::str_detect(HostOriginal, "cf."),
Database = "GenBank",
DatabaseVersion = "Aug2021FlatFile",
# Choice to call Nucleotide all sequence and not isolation is
# potentially problematic - revisit
DetectionMethod = "PCR/Sequencing",
# Just to keep separate from EID2 Nucleotide entries # Fix
# the HostSynonyms at the 01 import stage
DetectionOriginal = "GenBank")

Database = "GenBank",
DatabaseVersion = "Aug2021FlatFile",
# Choice to call Nucleotide all sequence and not isolation is
# potentially problematic - revisit
DetectionMethod = "PCR/Sequencing",
# Just to keep separate from EID2 Nucleotide entries # Fix
# the HostSynonyms at the 01 import stage
DetectionOriginal = "GenBank")
print("mutated")
# Coerce VirusTaxID to numeric, prepend the empty `temp` template, and
# lower-case the host and virus taxonomy columns.
gb %<>%
dplyr::mutate(VirusTaxID = as.numeric(VirusTaxID)) %>%
# stitching together the temp and the genbank data that's now been formatted
# stitching together the temp and the genbank data that's now been formatted
dplyr::bind_rows(temp, .) %>%
dplyr::mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder",
"HostClass", "Virus", "VirusGenus", "VirusFamily",
"VirusOrder", "VirusClass"),
tolower)

# write file ===================================================================

# Write the formatted `gb` table to the Intermediate/Formatted folder.
# NOTE(review): vroom_write() defaults to delim = "\t", so the output is
# tab-separated despite the .csv.gz name unless `delim` is set - confirm that
# downstream readers expect this.
vroom::vroom_write(
gb, here::here("./Intermediate/Formatted/GenbankFormatted.csv.gz"))
print("mutate at")
# write intermediate file
vroom::vroom_write(gb, "Intermediate/Formatted/GenbankFormatted.csv.gz")
print("written")

# unused benchmarking ==========================================================

Expand Down

0 comments on commit c732697

Please sign in to comment.