diff --git a/Code/02_1b_Digest GenBank.R b/Code/02_1b_Digest GenBank.R index f80d392..c9808e3 100644 --- a/Code/02_1b_Digest GenBank.R +++ b/Code/02_1b_Digest GenBank.R @@ -45,8 +45,8 @@ gb %<>% dplyr::rename(HostOriginal = "Host") %>% "Myxini", "Reptilia") | HostOrder %in% c("Testudines", "Crocodylia")) - # Reptilia is defunct but left in case GLOBI has something on it - # or it"s reinstituted or something weird + # Reptilia is defunct but left in case + # it's reinstituted or something weird # get vector of the species names virus_vec <- gb %>% diff --git a/Code/02_3a_Download GLOBI.R b/Code/02_3a_Download GLOBI.R deleted file mode 100644 index 846e8b8..0000000 --- a/Code/02_3a_Download GLOBI.R +++ /dev/null @@ -1,56 +0,0 @@ -# install.packages('rglobi') - -library(rglobi) -library(tidyverse) - -# By default, the amount of results are limited. If you'd like to retrieve all results, you can used pagination. For instance, to retrieve parasitic interactions using pagination, you can use: -# -# -# ```r -# otherkeys = list("limit"=10, "skip"=0) -# first_page_of_ten <- get_interactions_by_type(interactiontype = c("hasParasite"), otherkeys = otherkeys) -# otherkeys = list("limit"=10, "skip"=10) -# second_page_of_ten <- get_interactions_by_type(interactiontype = c("hasParasite"), otherkeys = otherkeys) -# ``` -# -# To exhaust all available interactions, you can keep paging results until the size of the page is less than the limit (e.g., ```nrows(interactions) < limit```). - -j = 0 -k = 1 - -while(k > 0) { - -page <- get_interactions_by_taxa(sourcetaxon = 'Virus', - targettaxon = 'Vertebrata', - otherkeys = list("limit" = 1000, "skip" = 1000*j)) -k = nrow(page) - -if(j == 0) {all <- page} else { - all <- rbind(page,all) -} - -print(j) -j = (j + 1) - -} - -j = 0 -k = 1 - -while(k > 0) { - - page <- get_interactions_by_taxa(sourcetaxon = 'Viruses', - targettaxon = 'Vertebrata', - otherkeys = list("limit" = 1000, "skip" = 1000*j)) - k = nrow(page) - - all <- rbind(page,all) - - print(j) - j = (j + 1) - -} - -all %>% unique() -> all - -write_csv(all, 'Source/GLOBI-raw.csv') diff --git a/Code/02_3b_Digest GLOBI.R b/Code/02_3b_Digest GLOBI.R deleted file mode 100644 index 00bf20b..0000000 --- a/Code/02_3b_Digest GLOBI.R +++ /dev/null @@ -1,79 +0,0 @@ -source('Code/001_TaxizeFunctions.R') -source('Code/001_Julia functions.R') -rentrez::set_entrez_key("ec345b39079e565bdfa744c3ef0d4b03ba08") - -library(tidyverse) -library(taxize) -library(magrittr) -library(vroom) -install.ncbi() - -globi <- read_csv('Source/GLOBI-raw.csv') - -globi %>% - select(source_taxon_external_id, - source_taxon_name, - target_taxon_external_id, - target_taxon_name) %>% - rename(Virus.ID = 'source_taxon_external_id', - Virus = 'source_taxon_name', - Host.ID = 'target_taxon_external_id', - Host = 'target_taxon_name') %>% - unique() %>% - mutate(VirusOriginal = Virus, # keep backups - HostOriginal = Host) -> globi - -# How much is already set up for NCBI? - -globi %>% - filter(str_detect(Host.ID, 'NCBI')) %>% - filter(str_detect(Virus.ID, 'NCBI')) -> - ncbi.strict - -# So these are the things that we can double triple check for errors - -globi %>% pull(Host.ID) %>% sapply(function(x) {word(string = x, 1, sep = ":")}) %>% table() -globi %>% pull(Virus.ID) %>% sapply(function(x) {word(string = x, 1, sep = ":")}) %>% table() - -#### Actually call NCBI -# Only pull in names that are resolved enough to make sense of, and none of the messy names - -globi %>% - mutate_cond(str_detect(Virus, "Influenza A"), Virus = "Influenza A") %>% - mutate_cond(str_detect(Virus, "Influenza B"), Virus = "Influenza B") %>% - mutate_cond(str_detect(Virus, "Influenza C"), Virus = "Influenza C") %>% - mutate_cond(str_detect(Virus, "Influenza D"), Virus = "Influenza D") -> globi - -globi %>% pull(Host) %>% unique() %>% sort() -> host.list -host.table <- jhdict(host.list) - -globi %>% pull(Virus) %>% unique() %>% sort() -> virus.list -virus.table <- jvdict(virus.list) - -globi %<>% - rename(HostIntermediate = 'Host') %>% - left_join(host.table, by = c('HostIntermediate' = 'HostOriginal')) %>% - select(-HostIntermediate) %>% - rename(VirusIntermediate = 'Virus') %>% - left_join(virus.table, by = c('VirusIntermediate' = 'VirusOriginal')) %>% - select(-c(VirusIntermediate, Host.ID, Virus.ID)) - -globi %<>% - mutate_cond(is.na(HostGenus), HostGenus = word(Host, 1)) - -# A little cleaning -globi %<>% filter(HostClass %in% c("Actinopteri", - "Actinopterygii", - "Amphibia", - "Aves", - "Chondrichthyes", - "Cladistia", - "Hyperoartia", - "Lepidosauria", - "Mammalia", - "Myxini", - "Reptilia") | HostOrder %in% c("Testudines", "Crocodylia")) - -globi %<>% filter(str_detect(VirusClass, "viricetes")) - -write_csv(globi, "Intermediate/Unformatted/GLOBIUnformatted.csv") diff --git a/Code/02_3c_Format GLOBI.R b/Code/02_3c_Format GLOBI.R deleted file mode 100644 index b746e6e..0000000 --- a/Code/02_3c_Format GLOBI.R +++ /dev/null @@ -1,55 +0,0 @@ - -library(tidyverse) -library(magrittr) - -globi <- read_csv("Intermediate/Unformatted/GLOBIUnformatted.csv") - -temp <- data.frame(Host = character(), - Virus = character(), - HostTaxID = double(), - VirusTaxID = double(), - HostNCBIResolved = logical(), - VirusNCBIResolved = logical(), - HostGenus = character(), - HostFamily = character(), - HostOrder = character(), - HostClass = character(), - HostOriginal = character(), - HostSynonyms = character(), - VirusGenus = character(), - VirusFamily = character(), - VirusOrder = character(), - VirusClass = character(), - VirusOriginal = character(), - HostFlagID = logical(), - DetectionMethod = character(), - DetectionOriginal = character(), - Database = character(), - DatabaseVersion = character(), - PublicationYear = double(), - ReferenceText = character(), - PMID = double(), - NCBIAccession = character(), - ReleaseYear = double(), - ReleaseMonth = double(), - ReleaseDay = double(), - CollectionYear = double(), - CollectionMonth = double(), - CollectionDay = double(), - stringsAsFactors = FALSE) - -globi %<>% mutate(DetectionOriginal = "GLOBI", - Database = "GLOBI", - DatabaseVersion = format(file.info("Source/GLOBI-raw.csv")$ctime, - format = "%b %d, %Y"), - DetectionMethod = "Not specified") - -bind_rows(temp, globi) -> globi - -# Consistency steps: all lowercase names - -globi %<>% mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder", "HostClass", - "Virus", "VirusGenus", "VirusFamily", "VirusOrder", "VirusClass"), - tolower) - -write_csv(globi, 'Intermediate/Formatted/GLOBIFormatted.csv') diff --git a/Code/03_Merge clean files.R b/Code/03_Merge clean files.R index ebeadbf..0443940 100644 --- a/Code/03_Merge clean files.R +++ b/Code/03_Merge clean files.R @@ -12,12 +12,10 @@ clo <- readr::read_csv("./Intermediate/Formatted/CLOVERFormatted.csv", col_type print("clo") pred <- vroom("./Intermediate/Formatted/PREDICTAllFormatted.csv", col_type = cols(PMID = col_double(), PublicationYear = col_double())) print("pred") -globi <- vroom("./Intermediate/Formatted/GLOBIFormatted.csv", col_type = cols(PMID = col_double(), PublicationYear = col_double())) -print("globi") if(is.numeric(clo$NCBIAccession)) {clo %<>% dplyr::mutate(NCBIAccession = as.character(NCBIAccession))} -virion <- dplyr::bind_rows(clo, pred, gb, globi) +virion <- dplyr::bind_rows(clo, pred, gb) # # chr_cols <- names(virion[, sapply(virion, is.character)]) # virion <- virion %>% diff --git a/Code/04_High level VIRION checks.R b/Code/04_High level VIRION checks.R index ded5226..9f32f37 100644 --- a/Code/04_High level VIRION checks.R +++ b/Code/04_High level VIRION checks.R @@ -75,7 +75,7 @@ virion %<>% dplyr::mutate( print("relocate") -# This only applies to CLOVER and GLOBI, which both don't have any other internal flags +# This only applies to CLOVER, which doesn't have any other internal flags virion %<>% dplyr::mutate(HostFlagID = replace_na(HostFlagID, FALSE)) print("mutate1")