version 1.0.3

pr2database · Apr 30, 2022 · 76c6406 · 76c6406
1 parent 3571010
commit 76c6406
Showing 20 changed files with 230 additions and 44 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: metapr2
 Title: Eukaryotic 18S rRNA metabarcode database 
-Version: 1.0.2
+Version: 1.0.3
 Authors@R: 
     person(given = "Daniel",
            family = "Vaulot",
@@ -37,6 +37,7 @@ Imports:
     scrypt,
     shiny,
     shinycssloaders,
+    shinylogs,
     shinymanager,
     shinyvalidate,
     shinyWidgets,

diff --git a/Dockerfile b/Dockerfile
@@ -57,7 +57,7 @@ RUN install2.r --error --skipinstalled \
     ggforce yaml
 
 RUN install2.r --error --skipinstalled \
-    purrr bslib
+    purrr bslib shinylogs
 # Install vsearch
 # https://github.com/FredHutch/docker-vsearch/blob/master/Dockerfile
 

diff --git a/NEWS.md b/NEWS.md
@@ -1,28 +1,67 @@
+# metapr2 1.0.3
+
+Released: 2022-04-30
+
+### Database 
+
+#### version 1.1 - 41 datasets
+* Tara Ocean V9 samples have been reprocessed using the dada2 pipeline.  In version 1.0, the original swarms were used instead of ASVs.
+
+### Tabs of application
+
+#### Datasets
+* Selected datasets appear first ordered by dataset_id
+* Search error fixed
+
+<!--- 
+
+### Taxonomy
+
+### Treemaps
+
+### Maps
+
+### Barplots
+
+### Diversity
+
+-->
+
+#### Query
+* A fasta formatted sequence with header can now be used.
+
+#### Download
+* The zipped file now contains a fasta file with the asv_code and the taxonomy in the header.
+
+---
+
 # metapr2 1.0.2
 
 Released: 2021-12-14
 
-### Datasets
+### Tabs of application
+
+#### Datasets
 * Settings (datasets, type of samples) can be saved and recalled
 
-### Taxonomy
+#### Taxonomy
 * Now more than one taxon can be selected
 * Three divisions can be removed (Fungi, Metazoa and Streptophyta)
 * Taxa (selected and excluded) can be saved and recalled
 * It is necessary to press the "Validate taxa" button to replot after changing taxo selection
 
-### Treemaps
+#### Treemaps
 * Color of taxa now match other panels
 * Add a treemap of ASVs number
 
-### Maps
+#### Maps
 * Add topography
 * Add equator, tropics and polar circle
 
-### Barplots
+#### Barplots
 * Add number of samples for each bar
 
-### Diversity Alpha
+#### Diversity Alpha
 * Use Violin + Sina plot for discrete variable
 * Allow discretization of continuous variables (e.g. depth, latitude)
 
@@ -32,10 +71,12 @@ Released: 2021-12-14
 
 Released: 2021-11-22
 
-### Documentation 
+### Tabs of application
+
+#### Documentation 
 * Using pkgdown: https://pr2database.github.io/metapr2-shiny/
 
-### Barplots
+#### Barplots
 * Make interactive (R plotly library)
 * Add coloring by ecological function
 * Add time series
@@ -47,3 +88,8 @@ Released: 2021-11-22
 Released: 2021-11-19
 
 * Initial release
+
+### Database 
+
+#### version 1.0 - 41 datasets
+* Tara Ocean V9 samples have not been reprocessed and the original swarms are used instead.
diff --git a/R/app.R b/R/app.R
@@ -39,13 +39,17 @@ shinymanager::set_labels(
   "Login" = "Enter metaPR2"
 )
 
+
 # User interface ----------------------------------------------------------
 
 ui <- fluidPage(
 
   # Bootstrap theme: https://rstudio.github.io/shinythemes/
   # theme = bslib::bs_theme(bootswatch = "yeti"),
 
+  # Tracking not necessary in ui
+  # shinylogs::use_tracking(),
+
   # Script to close the windows after some inactivity - ACTIVATE for web application
   tags$script(inactivity),  
 
@@ -100,6 +104,9 @@ server <- function(input, output, session) {
   # Stop the application if the session is closed (after 30 min) - ACTIVATE  for web application
   session$onSessionEnded(stopApp)
 
+  # To track usage
+  shinylogs::track_usage(storage_mode = shinylogs::store_sqlite(path = "logs/"))
+
   # Authentification
 
   authentification <- callModule(module = shinymanager::auth_server,

diff --git a/R/fct_sequences.R b/R/fct_sequences.R
@@ -1,12 +1,24 @@
+# =========================================================================
+# --- Clean sequence (upper case, no fasta header, no line breaks) ------------
+# =========================================================================
+# Takes a query sequence as text and returns it in upper case, with a leading
+# fasta header (if any) and all carriage returns / line feeds removed.
+sequence_clean <- function(sequence){
+  sequence <- str_to_upper(sequence) # Convert to upper case
+  sequence <- str_replace_all(sequence, "^>.*" , "") # Remove fasta header in case it is present
+  sequence <- str_replace_all(sequence, "[\\r\\n]" , "") # Remove line breaks; the value of this last assignment is what the function returns (invisibly)
+}
+
+
+
 
 # =========================================================================
 # --- Check that sequence is valid --------------------------------------------
 # =========================================================================
 
 
 sequence_check <- function(sequence){
-  sequence <- str_to_upper(sequence)
-  sequence <- str_replace_all(sequence, "[\r\n]" , "")
+  sequence <- sequence_clean(sequence)
   ((nchar(sequence) >= 130) &
    (str_detect(sequence, "[^ACGTRYSWKMBDHVN]", negate = TRUE)))
 }
@@ -19,8 +31,7 @@ sequence_check <- function(sequence){
 
 match_asv <- function(fasta.df, query){
 
-  query <- str_to_upper(query)
-  query <- str_replace_all(query, "[\r\n]" , "")
+  query <- sequence_clean(query)
 
   query <-  Biostrings::DNAString(query)
 
@@ -51,6 +62,7 @@ blaster_asv <- function(fasta.df, query,
                         minIdentity = 0.80,
                         maxAccepts = 100){
 
+  query <- sequence_clean(query)
   query <-  data.frame(Id = "query", Seq =query)
 
   db <- fasta.df %>% 
@@ -84,5 +96,59 @@ blaster_asv <- function(fasta.df, query,
   return(df)
 }
 
+# =========================================================================
+# --- Write fasta file with taxo ------------------------------------------
+# =========================================================================
+
+#' @title Write a fasta file with the taxonomy
+#'
+#' @description
+#' Write a fasta file from a set of sequences
+#' Option : add the taxonomy to the definition line, each level separated by the separator character (e.g. |)
+#'
+#' >Otu0001|Alveolata|Dinophyta|Syndiniales|Dino-Group-I|Dino-Group-I-Clade-1|Dino-Group-I-Clade-1_X|Dino-Group-I-Clade-1_X_sp.
+#'
+#' AGCTCCAATAGCGTATATTAAAGTTGTTGCGGTTAAAAAGCTCGTAGTTGGA...
+#' @param df The data frame with the otu names, the taxonomy and the sequences. It should have the following columns (with exactly these names)
+#'
+#'       * seq_name : the sequence name
+#'       * supergroup, division, class, order, family, genus, species : the taxonomic levels (used when taxo_include is TRUE)
+#'       * sequence : the sequence itself (gaps written as - or . are removed before writing)
+#' @param file_name Character, where to save the fasta file
+#' @param compress If TRUE produces a gz file
+#' @param taxo_include If TRUE then add taxo information which must be provided
+#' @param taxo_separator Character used to separate the different taxonomic levels
+#' @return TRUE if it terminates OK
+#'
+#' @examples
+#' fasta_write(df,"otu_taxo.fasta", compress=FALSE, taxo_include=TRUE, taxo_separator=";")
+#' @md
+#' @export
+
+fasta_write <- function(df,file_name, compress=FALSE, taxo_include=TRUE, taxo_separator="|") {
+
+  # First remove the gaps (can be - or .)
+  df <-  df %>%  mutate(sequence = str_replace_all(sequence, "(-|\\.)",""))
+
+  seq_out <- Biostrings::DNAStringSet(df$sequence)
+
+  # Definition line: sequence name followed by the 7 taxonomic levels, or the sequence name alone
+  if (taxo_include==TRUE) {
+    names(seq_out) <- str_c(df$seq_name,
+                            df$supergroup,
+                            df$division,
+                            df$class,
+                            df$order,
+                            df$family,
+                            df$genus,
+                            df$species,
+                            sep=taxo_separator)
+  }
+  else { names(seq_out) <- df$seq_name
+  }
+  # NOTE(review): width = 20000 is presumably meant to keep each sequence on a single line - confirm
+  Biostrings::writeXStringSet(seq_out, file_name, compress=compress, width = 20000)
+
+  return(TRUE)
+}
 
 
diff --git a/R/module_datasets.R b/R/module_datasets.R
@@ -139,8 +139,9 @@ dataServer <- function(id, taxo, authentification) {
       req(asv_set())
       DT::datatable(asv_set()$datasets %>% 
                       select(dataset_id, dataset_name, region, paper_reference, sequencing_technology, sample_number, asv_number, n_reads_mean) %>%
-                      mutate(selected = ifelse(dataset_id %in% input$datasets_selected_id,TRUE, FALSE)) %>% 
-                      arrange(dataset_name) ,
+                      mutate(selected = ifelse(dataset_id %in% input$datasets_selected_id,TRUE, FALSE)) %>%
+                      mutate(paper_reference = iconv(paper_reference, "latin1", to = "UTF-8")) %>% 
+                      arrange(-selected, dataset_name) ,
                     rownames = FALSE ,
                     options = list(
                       autoWidth = FALSE,
@@ -288,6 +289,9 @@ dataServer <- function(id, taxo, authentification) {
       if (authentification$user == "private") {
         dir_asv_set <- "data-qs-private"
       }
+      if (authentification$user == "ge") {
+        dir_asv_set <- "data-qs-ge"
+      }
 
 
       message("User: ", authentification$user)

diff --git a/R/module_download.R b/R/module_download.R
@@ -70,12 +70,18 @@ downloadServer <- function(id, datasets_selected, samples_selected, df_selected,
       file_datasets <- str_c(tmpdir, "/datasets.xlsx")
       file_samples <- str_c(tmpdir, "/samples.xlsx")
       file_asv <- str_c(tmpdir, "/asv.xlsx")
+      file_asv_fasta <- str_c(tmpdir, "/asv.fasta")
+
       # file_asv_reads <- str_c(tmpdir, "/asv_reads.xlsx")
-      files = c(file_datasets, file_samples, file_asv)
+      files = c(file_datasets, file_samples, file_asv, file_asv_fasta)
 
       rio::export(datasets_selected(), file=file_datasets, overwrite = TRUE)
       rio::export(samples_selected(), file=file_samples, overwrite = TRUE)
       rio::export(fasta_selected(), file=file_asv, overwrite = TRUE)
+      # Export fasta file
+      fasta_selected() %>% 
+        rename(seq_name = asv_code) %>% 
+        fasta_write(file_asv_fasta)
       # rio::export(df_selected(), file=file_asv_reads, overwrite = TRUE)
 
       system2("zip", args=(paste("--junk-paths", path, files,sep=" "))) # remove the paths of the files

diff --git a/R/module_query.R b/R/module_query.R
@@ -75,7 +75,7 @@ queryServer <- function(id, samples_selected, df_all, fasta_all) {
         p(),
 
         sliderInput(ns("pct_id_min"), label ="% identity min", min = 80.0, max = 100.0, 
-                    step = 0.2, value = 100, width = "500px"),
+                    step = 0.2, value = 95, width = "500px"),
 
         textAreaInput(ns("query"), label = "Query - at least 130 bp", value = "", 
                       width = "100%", height = "100px",

diff --git a/README.Rmd b/README.Rmd
@@ -25,9 +25,9 @@ knitr::opts_chunk$set(
 
 ## A database of 18S rRNA metabarcodes
 
-**Database version**:  1.0.0 - 41 datasets
+**Database version**:  1.1 - 41 datasets
 
-**Shiny application version**: 1.0.2
+**Shiny application version**: 1.0.3
 
 ### Presentation
 

diff --git a/README.md b/README.md
@@ -10,9 +10,9 @@
 
 ## A database of 18S rRNA metabarcodes
 
-**Database version**: 1.0.0 - 41 datasets
+**Database version**: 1.1 - 41 datasets
 
-**Shiny application version**: 1.0.2
+**Shiny application version**: 1.0.3
 
 ### Presentation
 

diff --git a/docs/articles/index.html b/docs/articles/index.html
diff --git a/docs/articles/vignette-data-processing.html b/docs/articles/vignette-data-processing.html
diff --git a/docs/articles/vignette-query.html b/docs/articles/vignette-query.html
diff --git a/docs/news/index.html b/docs/news/index.html
diff --git a/inst/data-qs/asv_set.qs b/inst/data-qs/asv_set.qs
diff --git a/inst/data-qs/credentials.qs b/inst/data-qs/credentials.qs
diff --git a/inst/data-qs/global.qs b/inst/data-qs/global.qs
diff --git a/inst/readme/download.md b/inst/readme/download.md
@@ -23,6 +23,7 @@ file | content | key fields
 datasets.xlsx | Information on the different datasets selected including reference and GenBank id | dataset_id
 samples.xlsx | List of samples selected with metadata | file_code
 asv.xlsx | ASV selected with taxonomy and sequence | asv_code
+asv.fasta | ASV selected with taxonomy and sequence in fasta format | asv_code
 asv_reads.tsv.gz | Percent of reads (normalized to total number of eukaryotic reads in the sample), for each ASV and each sample (long form). | asv_code, file_code
 phyloseq.rds | File to use with phyloseq R package (https://joey711.github.io/phyloseq/). Use readRDS() function to read | **5000 samples max**
 </div>

diff --git a/vignettes/vignette-data-processing.Rmd b/vignettes/vignette-data-processing.Rmd
@@ -29,11 +29,13 @@ knitr::opts_chunk$set(
 
 * All datasets were processed with cutadapt (Martin et al. 2011) to remove primers and the dada2 R package (Callahan et al. 2016) to compute ASVs.
 
+* Samples with fewer than 1,000 reads after processing were not considered.
+
 * Assignment was done with dada2 assignTaxa using the 18S PR2 4.14.0 as reference
 
 * ASVs with fewer than 100 reads in total and with bootstrap value at the supergroup level < 75 were not considered.
 
-* Total read number per sample has been normalized to 100 with 3 decimals so that the value displayed in the different panels correspond to % of total eukaryotic reads.
+* Total read number per sample has been normalized to 100 with 3 decimals so that the values displayed in the different panels correspond to % of total eukaryotic reads. For this, the number of reads for a given ASV in a given sample was divided by the total number of reads in this sample and then multiplied by 100.
 
 ## References
 

diff --git a/vignettes/vignette-query.Rmd b/vignettes/vignette-query.Rmd
@@ -26,7 +26,7 @@ knitr::opts_chunk$set(
 # - need to build source package to have the vignettes
 ```
 
-This panel allows to query the ASVs in the database. Enter your sequence (only the sequence, do not include the fasta header) and press the `Search`button. All ASVs in the database are searched (not only those from teh selected datasets or samples).  By clicking a sequence in the ASV table (Fig. 1) you can display the distribution of this ASV in the samples selected.  
+This panel allows you to query the ASVs in the database. Enter your sequence (you may include the fasta header) and press the `Search` button. All ASVs in the database are searched (not only those from the selected datasets or samples).  By clicking a sequence in the ASV table (Fig. 1) you can display the distribution of this ASV in the samples selected.  
 
 You can change the number of sequences displayed by decreasing the similarity threshold (Fig. 2).