From c732697c6a2dfcc6664dc33dff327e55d602b7d5 Mon Sep 17 00:00:00 2001 From: Cole Brookson Date: Mon, 2 Oct 2023 14:01:39 -0400 Subject: [PATCH] idk how but maybe this is more efficient? --- Code/02_1c_Format GenBank.R | 63 ++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/Code/02_1c_Format GenBank.R b/Code/02_1c_Format GenBank.R index 5fc32a8..00af8c4 100644 --- a/Code/02_1c_Format GenBank.R +++ b/Code/02_1c_Format GenBank.R @@ -8,14 +8,10 @@ library(magrittr) -if(!exists("vdict")) {source(here::here("./Code/001_TaxizeFunctions.R"))} -if(!exists("jvdict")) {source(here::here("./Code/001_Julia functions.R"))} - -# Attaching GenBank -gb <- vroom::vroom( - here::here("./Intermediate/Unformatted/GenBankUnformatted.csv.gz")) - -# structure ==================================================================== +if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')} +print("vdict") +if(!exists('jvdict')) {source('Code/001_Julia functions.R')} +print("jvdict") temp <- data.frame(Host = character(), Virus = character(), @@ -51,20 +47,18 @@ temp <- data.frame(Host = character(), CollectionDay = double(), stringsAsFactors = FALSE) -## deal with naming and conventions ============================================ -print(paste0("rows in genbank as of this run is: ", nrow(gb))) +# Attaching GenBank +gb <- vroom::vroom("Intermediate/Unformatted/GenBankUnformatted.csv.gz") +print("read in") gb %<>% - dplyr::rename(NCBIAccession = "Accession") %>% + dplyr::rename(NCBIAccession = 'Accession') %>% dplyr::rename(Release_Date = Release_Date) %>% # not sure what this is doing? - - # really don't know why we want or need this here????? dplyr::mutate_at("Release_Date", ~.x %>% # Modifying date column to make sense stringr::str_split("T") %>% # Splitting at this midpoint purrr::map_chr(1) %>% # Taking the first component - lubridate::ymd() # Coding as YMD (shouldn"t throw errors) + lubridate::ymd() # Coding as YMD (shouldn't throw errors) ) - -print("renamed") +print("renamed") gb[, c(paste0("Collection", c("Year", "Month", "Day")))] <- data.table::tstrsplit(gb$Collection_Date, "-", @@ -75,32 +69,43 @@ gb[, c(paste0("Release", c("Year", "Month", "Day")))] <- names=paste0("Release", c("Year", "Month", "Day"))) +# gb %<>% +# # known that the collection date is a string and many observations don't +# # have year or month values, just the year, so many of these will turn up +# # as missing +# tidyr::separate(Collection_Date, sep = "-", +# into = paste0("Collection", c("Year", "Month", "Day"))) %>% +# tidyr::separate(Release_Date, sep = "-", +# into = paste0("Release", c("Year", "Month", "Day"))) +print("separated") + gb %<>% dplyr::mutate_at(dplyr::vars(tidyselect::matches("Year|Month|Day")), as.numeric) %>% dplyr::mutate(HostFlagID = stringr::str_detect(HostOriginal, "cf."), - Database = "GenBank", - DatabaseVersion = "Aug2021FlatFile", - # Choice to call Nucleotide all sequence and not isolation is - # potentially problematic - revisit - DetectionMethod = "PCR/Sequencing", - # Just to keep separate from EID2 Nucleotide entries # Fix - # the HostSynonyms at the 01 import stage - DetectionOriginal = "GenBank") - + Database = "GenBank", + DatabaseVersion = "Aug2021FlatFile", + # Choice to call Nucleotide all sequence and not isolation is + # potentially problematic - revisit + DetectionMethod = "PCR/Sequencing", + # Just to keep separate from EID2 Nucleotide entries # Fix + # the HostSynonyms at the 01 import stage + DetectionOriginal = "GenBank") +print("mutated") gb %<>% dplyr::mutate(VirusTaxID = as.numeric(VirusTaxID)) %>% - # stiching together the temp and the genbank data that"s now been formatted + # stiching together the temp and the genbank data that's now been formatted dplyr::bind_rows(temp, .) %>% dplyr::mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder", "HostClass", "Virus", "VirusGenus", "VirusFamily", "VirusOrder", "VirusClass"), tolower) -# write file =================================================================== -vroom::vroom_write( - gb, here::here("./Intermediate/Formatted/GenbankFormatted.csv.gz")) +print("mutate at") +# write intermediate file +vroom::vroom_write(gb, "Intermediate/Formatted/GenbankFormatted.csv.gz") +print("written") # unused benchmarking ==========================================================