From c732697c6a2dfcc6664dc33dff327e55d602b7d5 Mon Sep 17 00:00:00 2001
From: Cole Brookson <cole.brookson@gmail.com>
Date: Mon, 2 Oct 2023 14:01:39 -0400
Subject: [PATCH] idk how but maybe this is more efficient?

---
 Code/02_1c_Format GenBank.R | 63 ++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 29 deletions(-)

diff --git a/Code/02_1c_Format GenBank.R b/Code/02_1c_Format GenBank.R
index 5fc32a8..00af8c4 100644
--- a/Code/02_1c_Format GenBank.R	
+++ b/Code/02_1c_Format GenBank.R	
@@ -8,14 +8,10 @@
 
 library(magrittr)
 
-if(!exists("vdict")) {source(here::here("./Code/001_TaxizeFunctions.R"))}
-if(!exists("jvdict")) {source(here::here("./Code/001_Julia functions.R"))}
-
-# Attaching GenBank
-gb <- vroom::vroom(
-  here::here("./Intermediate/Unformatted/GenBankUnformatted.csv.gz"))
-
-# structure ====================================================================
+if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')}
+print("vdict")
+if(!exists('jvdict')) {source('Code/001_Julia functions.R')}
+print("jvdict")
 
 temp <- data.frame(Host = character(),
                    Virus = character(),
@@ -51,20 +47,18 @@ temp <- data.frame(Host = character(),
                    CollectionDay = double(),
                    stringsAsFactors = FALSE)
 
-## deal with naming and conventions ============================================
-print(paste0("rows in genbank as of this run is: ", nrow(gb)))
+# Attaching GenBank
+gb <- vroom::vroom("Intermediate/Unformatted/GenBankUnformatted.csv.gz") 
+print("read in")
 gb %<>% 
-  dplyr::rename(NCBIAccession = "Accession") %>% 
+  dplyr::rename(NCBIAccession = 'Accession') %>% 
   dplyr::rename(Release_Date = Release_Date) %>% # not sure what this is doing?
-
-  # really don't know why we want or need this here?????
   dplyr::mutate_at("Release_Date", ~.x %>% # Modifying date column to make sense
                      stringr::str_split("T") %>% # Splitting at this midpoint
                      purrr::map_chr(1) %>% # Taking the first component 
-                     lubridate::ymd() # Coding as YMD (shouldn"t throw errors)
+                     lubridate::ymd() # Coding as YMD (shouldn't throw errors)
   ) 
-
-print("renamed") 
+print("renamed")
 
 gb[, c(paste0("Collection", c("Year", "Month", "Day")))] <- 
   data.table::tstrsplit(gb$Collection_Date, "-", 
@@ -75,32 +69,43 @@ gb[, c(paste0("Release", c("Year", "Month", "Day")))] <-
                         names=paste0("Release", 
                                      c("Year", "Month", "Day"))) 
 
+# gb %<>% 
+#   # the collection date is a string, and many observations don't have
+#   # month or day values, just the year, so many of these fields will turn up
+#   # as missing
+#   tidyr::separate(Collection_Date, sep = "-", 
+#                   into = paste0("Collection", c("Year", "Month", "Day"))) %>% 
+#   tidyr::separate(Release_Date, sep = "-", 
+#                   into = paste0("Release", c("Year", "Month", "Day"))) 
+print("separated")
+
 gb %<>% 
   dplyr::mutate_at(dplyr::vars(tidyselect::matches("Year|Month|Day")), 
                    as.numeric) %>% 
   dplyr::mutate(HostFlagID = stringr::str_detect(HostOriginal, "cf."),
-            Database = "GenBank",
-            DatabaseVersion = "Aug2021FlatFile",
-            # Choice to call Nucleotide all sequence and not isolation is 
-            # potentially problematic - revisit 
-            DetectionMethod = "PCR/Sequencing", 
-            # Just to keep separate from EID2 Nucleotide entries # Fix 
-            # the HostSynonyms at the 01 import stage
-            DetectionOriginal = "GenBank") 
-
+                Database = "GenBank",
+                DatabaseVersion = "Aug2021FlatFile",
+                # Choice to call Nucleotide all sequence and not isolation is 
+                # potentially problematic - revisit 
+                DetectionMethod = "PCR/Sequencing", 
+                # Just to keep separate from EID2 Nucleotide entries # Fix 
+                # the HostSynonyms at the 01 import stage
+                DetectionOriginal = "GenBank") 
+print("mutated")
 gb %<>% 
   dplyr::mutate(VirusTaxID = as.numeric(VirusTaxID)) %>% 
-  # stiching together the temp and the genbank data that"s now been formatted
+  # stitching together the temp and the genbank data that's now been formatted
   dplyr::bind_rows(temp, .) %>%  
   dplyr::mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder", 
                      "HostClass", "Virus", "VirusGenus", "VirusFamily", 
                      "VirusOrder", "VirusClass"),
                    tolower)
 
-# write file ===================================================================
 
-vroom::vroom_write(
-  gb, here::here("./Intermediate/Formatted/GenbankFormatted.csv.gz"))
+print("mutate at")
+# write intermediate file
+vroom::vroom_write(gb, "Intermediate/Formatted/GenbankFormatted.csv.gz")
+print("written")
 
 # unused benchmarking ==========================================================