diff --git a/DESCRIPTION b/DESCRIPTION index e6d3ba1..d44fcde 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: tidysq Type: Package -Title: Tidy processing and analysis of biological sequences +Title: Tidy Processing and Analysis of Biological Sequences Version: 1.0.0 -Date: 2020-12-10 +Date: 2021-02-25 Authors@R: c(person("Dominik", "Rafacz", email = "dominikrafacz@gmail.com", comment = c(ORCID = "0000-0003-0925-1909"), @@ -51,8 +51,8 @@ Imports: tibble (>= 2.1.3), vctrs (>= 0.3.0) Suggests: - AmyloGram (>= 1.0), ape (>= 5.3), + bioseq (>= 0.1.2), Biostrings (>= 2.52.0), covr, knitr, @@ -63,8 +63,8 @@ Suggests: testthat (>= 2.1.0), withr (>= 2.2.0) License: GPL (>= 2) -URL: https://github.com/michbur/tidysq -BugReports: https://github.com/michbur/tidysq/issues +URL: https://github.com/BioGenies/tidysq +BugReports: https://github.com/BioGenies/tidysq/issues SystemRequirements: GNU make, C++17 NeedsCompilation: no @@ -74,5 +74,4 @@ Language: en-US RoxygenNote: 7.1.1 LinkingTo: Rcpp, testthat -RdMacros: lifecycle VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 32de7a5..aebfe35 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,8 @@ S3method(as.sq,character) S3method(as.sq,default) S3method(bite,default) S3method(bite,sq) +S3method(collapse,default) +S3method(collapse,sq) S3method(complement,default) S3method(complement,sq_dna_bsc) S3method(complement,sq_dna_ext) @@ -50,11 +52,16 @@ S3method(import_sq,SeqFastaAA) S3method(import_sq,SeqFastadna) S3method(import_sq,XStringSetList) S3method(import_sq,alignment) +S3method(import_sq,bioseq_aa) +S3method(import_sq,bioseq_dna) +S3method(import_sq,bioseq_rna) S3method(import_sq,default) S3method(import_sq,list) S3method(obj_print_data,sq) S3method(obj_print_footer,sq) S3method(obj_print_header,sq) +S3method(paste,default) +S3method(paste,sq) S3method(pillar_shaft,encsq) S3method(pillar_shaft,sq) S3method(remove_ambiguous,default) @@ -181,6 +188,7 @@ export("sq_type<-") export(alphabet) export(as.sq) export(bite) +export(collapse) export(complement) export(export_sq) export(find_invalid_letters) @@ -202,6 +210,7 @@ export(is.sq_rna_bsc) export(is.sq_rna_ext) export(is.sq_unt) export(is_empty_sq) +export(paste) export(random_sq) export(read_fasta) export(remove_ambiguous) diff --git a/R/RcppExports.R b/R/RcppExports.R index 40f3b59..c2a8b1e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -9,6 +9,10 @@ CPP_bite <- function(x, indices, NA_letter, on_warning) { .Call(`_tidysq_CPP_bite`, x, indices, NA_letter, on_warning) } +CPP_collapse <- function(x, NA_letter) { + .Call(`_tidysq_CPP_collapse`, x, NA_letter) +} + CPP_complement <- function(x, NA_letter) { .Call(`_tidysq_CPP_complement`, x, NA_letter) } @@ -53,6 +57,10 @@ CPP_pack_STRING <- function(proto, alphabet, NA_letter, ignore_case) { .Call(`_tidysq_CPP_pack_STRING`, proto, alphabet, NA_letter, ignore_case) } +CPP_paste <- function(list_of_x, NA_letter) { + .Call(`_tidysq_CPP_paste`, list_of_x, NA_letter) +} + CPP_random_sq <- function(n, len, alphabet, use_gap) { .Call(`_tidysq_CPP_random_sq`, n, len, alphabet, use_gap) } @@ -81,8 +89,8 @@ CPP_substitute_letters <- function(x, encoding, NA_letter) { .Call(`_tidysq_CPP_substitute_letters`, x, encoding, NA_letter) } -CPP_translate <- function(x, table, NA_letter, interpret_as_stop) { - .Call(`_tidysq_CPP_translate`, x, table, NA_letter, interpret_as_stop) +CPP_translate <- function(x, table, NA_letter) { + .Call(`_tidysq_CPP_translate`, x, table, NA_letter) } CPP_typify <- function(x, dest_type, NA_letter) { diff --git a/R/collapse.R b/R/collapse.R new file mode 100644 index 0000000..f7e1976 --- /dev/null +++ b/R/collapse.R @@ -0,0 +1,51 @@ +#' Collapse multiple sequences into one +#' +#' @description Joins sequences from a vector into a single sequence. Sequence +#' type remains unchanged. +#' +#' @template x +#' @template NA_letter +#' @template three-dots +#' +#' @return \code{\link[=sq-class]{sq}} object of the same type as input but with +#' exactly one sequence. +#' +#' @details +#' \code{collapse()} joins sequences from supplied \code{sq} object in the same +#' order as they appear in said vector. That is, if there are three sequences +#' AGGCT, ATCCGT and GAACGT, then resulting sequence will be AGGCTATCCGTGAACGT. +#' This operation does not alter the type of the input object nor its alphabet. +#' +#' @examples +#' # Creating objects to work on: +#' sq_ami <- sq(c("MIAANYTWIL","TIAALGNIIYRAIE", "NYERTGHLI", "MAYXXXIALN"), +#' alphabet = "ami_ext") +#' sq_dna <- sq(c("ATGCAGGA", "GACCGAACGAN", ""), alphabet = "dna_ext") +#' sq_unt <- sq(c("ATGCAGGA?", "TGACGAGCTTA", "", "TIAALGNIIYRAIE")) +#' +#' # Collapsing sequences: +#' collapse(sq_ami) +#' collapse(sq_dna) +#' collapse(sq_unt) +#' +#' # Empty sq objects are collapsed as well (into empty string - ""): +#' sq_empty <- sq(character(), alphabet = "rna_bsc") +#' collapse(sq_empty) +#' +#' @family order_functions +#' @export +collapse <- function(x, ...) + UseMethod("collapse") + +#' @export +collapse.default <- function(x, ...) + stop("method 'collapse' isn't implemented for this type of object", call. = FALSE) + +#' @rdname collapse +#' @export +collapse.sq <- function(x, ..., + NA_letter = getOption("tidysq_NA_letter")) { + assert_string(NA_letter, min.chars = 1) + + CPP_collapse(x, NA_letter) +} diff --git a/R/export_sq.R b/R/export_sq.R index 5d4e6da..d1476e4 100644 --- a/R/export_sq.R +++ b/R/export_sq.R @@ -4,8 +4,8 @@ #' #' @description Converts object of class \code{\link[=sq-class]{sq}} to a class #' from another package. Currently supported packages are \pkg{ape}, -#' \pkg{Bioconductor} and \pkg{seqinr}. For exact list of supported classes and -#' resulting types, see details. +#' \pkg{bioseq}, \pkg{Bioconductor} and \pkg{seqinr}. For exact list of +#' supported classes and resulting types, see details. #' #' @template x #' @param export_format [\code{character(1)}]\cr @@ -19,6 +19,7 @@ #' \item \strong{ami}: #' \itemize{ #' \item \code{"ape::AAbin"} +#' \item \code{"bioseq::bioseq_aa"} #' \item \code{"Biostrings::AAString"} #' \item \code{"Biostrings::AAStringSet"} #' \item \code{"seqinr::SeqFastaAA"} @@ -26,12 +27,14 @@ #' \item \strong{dna}: #' \itemize{ #' \item \code{"ape::DNAbin"} +#' \item \code{"bioseq::bioseq_dna"} #' \item \code{"Biostrings::DNAString"} #' \item \code{"Biostrings::DNAStringSet"} #' \item \code{"seqinr::SeqFastadna"} #' } #' \item \strong{rna}: #' \itemize{ +#' \item \code{"bioseq::bioseq_rna"} #' \item \code{"Biostrings::RNAString"} #' \item \code{"Biostrings::RNAStringSet"} #' } @@ -41,19 +44,22 @@ #' # DNA and amino acid sequences can be exported to most packages #' sq_ami <- sq(c("MVVGL", "LAVPP"), alphabet = "ami_bsc") #' export_sq(sq_ami, "ape::AAbin") +#' export_sq(sq_ami, "bioseq::bioseq_aa") #' export_sq(sq_ami, "Biostrings::AAStringSet", c("one", "two")) #' export_sq(sq_ami, "seqinr::SeqFastaAA") #' #' sq_dna <- sq(c("TGATGAAGCGCA", "TTGATGGGAA"), alphabet = "dna_bsc") #' export_sq(sq_dna, "ape::DNAbin", name = c("one", "two")) +#' export_sq(sq_dna, "bioseq::bioseq_dna") #' export_sq(sq_dna, "Biostrings::DNAStringSet") #' export_sq(sq_dna, "seqinr::SeqFastadna") #' -#' # RNA sequences are limited to Biostrings +#' # RNA sequences are limited to Biostrings and bioseq #' sq_rna <- sq(c("NUARYGCB", "", "DRKCNYBAU"), alphabet = "rna_ext") +#' export_sq(sq_rna, "bioseq::bioseq_rna") #' export_sq(sq_rna, "Biostrings::RNAStringSet") #' -#' # Biostrings accept single sequences as well +#' # Biostrings can export single sequences to simple strings as well #' export_sq(sq_dna[1], "Biostrings::DNAString") #' #' @family output_functions @@ -77,6 +83,10 @@ export_sq.sq_ami_bsc <- function(x, export_format, name = NULL, ...) { assert_package_installed("ape") ape::as.AAbin(setNames(lapply(unpack(x, "STRINGS"), `attributes<-`, NULL), name)) }, + `bioseq::bioseq_aa` = { + assert_package_installed("bioseq") + bioseq::new_aa(setNames(as.character(x), name)) + }, `Biostrings::AAString` = { assert_package_installed("Biostrings") if (vec_size(x) != 1) @@ -113,6 +123,10 @@ export_sq.sq_dna_bsc <- function(x, export_format, name = NULL, ...) { assert_package_installed("ape") ape::as.DNAbin(setNames(lapply(unpack(x, "STRINGS"), `attributes<-`, NULL), name)) }, + `bioseq::bioseq_dna` = { + assert_package_installed("bioseq") + bioseq::new_dna(setNames(as.character(x), name)) + }, `Biostrings::DNAString` = { assert_package_installed("Biostrings") if (vec_size(x) != 1) @@ -145,6 +159,10 @@ export_sq.sq_dna_ext <- export_sq.sq_dna_bsc #' @export export_sq.sq_rna_bsc <- function(x, export_format, name = NULL, ...) { switch (export_format, + `bioseq::bioseq_rna` = { + assert_package_installed("bioseq") + bioseq::new_rna(setNames(as.character(x), name)) + }, `Biostrings::RNAString` = { assert_package_installed("Biostrings") if (vec_size(x) != 1) diff --git a/R/import_sq.R b/R/import_sq.R index c9cfc50..cda4c3a 100644 --- a/R/import_sq.R +++ b/R/import_sq.R @@ -2,8 +2,8 @@ #' #' @description Creates \code{\link[=sq-class]{sq}} object from object of class #' from another package. Currently supported packages are \pkg{ape}, -#' \pkg{Bioconductor} and \pkg{seqinr}. For exact list of supported classes and -#' resulting types, see details. +#' \pkg{bioseq}, \pkg{Bioconductor} and \pkg{seqinr}. For exact list of +#' supported classes and resulting types, see details. #' #' @param object [\code{any(1)}]\cr #' An object of one of supported classes. @@ -25,6 +25,12 @@ #' \item \code{alignment} - exact type is guessed within \code{\link{sq}} #' function #' } +#' \item \code{bioseq}: +#' \itemize{ +#' \item \code{bioseq_aa} - imported as \strong{ami_ext} +#' \item \code{bioseq_dna} - imported as \strong{dna_ext} +#' \item \code{bioseq_rna} - imported as \strong{rna_ext} +#' } #' \item \code{Biostrings}: #' \itemize{ #' \item \code{AAString} - imported as \strong{ami_ext} with exactly one @@ -59,6 +65,11 @@ #' ape_dna <- as.DNAbin(list(one = c("C", "T", "C", "A"), two = c("T", "G", "A", "G", "G"))) #' import_sq(ape_dna) #' +#' # bioseq example +#' library(bioseq) +#' bioseq_rna <- new_rna(c(one = "ANBRY", two = "YUTUGGN")) +#' import_sq(bioseq_rna) +#' #' # Biostrings example #' library(Biostrings) #' Biostrings_ami <- AAStringSet(c(one = "FEAPQLIWY", two = "EGITENAK")) @@ -125,6 +136,24 @@ import_sq.alignment <- function(object, ...) { bind_into_sqibble(sq(object[["seq"]], ...), object[["nam"]]) } +#' @export +import_sq.bioseq_aa <- function(object, ...) { + # From package `bioseq` + bind_into_sqibble(sq(as.character(object), alphabet = "ami_ext"), names(object)) +} + +#' @export +import_sq.bioseq_dna <- function(object, ...) { + # From package `bioseq` + bind_into_sqibble(sq(as.character(object), alphabet = "dna_ext"), names(object)) +} + +#' @export +import_sq.bioseq_rna <- function(object, ...) { + # From package `bioseq` + bind_into_sqibble(sq(as.character(object), alphabet = "rna_ext"), names(object)) +} + #' @export import_sq.AAString <- function(object, ...) { # From package `Biostrings` diff --git a/R/paste.R b/R/paste.R new file mode 100644 index 0000000..853b611 --- /dev/null +++ b/R/paste.R @@ -0,0 +1,52 @@ +#' @export +paste <- function(...) + UseMethod("paste") + +#' @export +paste.default <- function(...) { + base::paste(...) +} + +#' Paste sequences in string-like fashion +#' +#' @description Joins multiple vectors of sequences into one vector. +#' +#' @param ... [\code{sq}]\cr +#' Sequences to paste together. +#' @template NA_letter +#' +#' @return \code{\link[=sq-class]{sq}} object of common type of input objects. +#' Common type is determined in the same process as for +#' \code{\link[=sq-concatenate]{c.sq}()}. +#' +#' @details +#' \code{paste()} joins sequences in the same way as it does with strings. +#' All \code{sq} objects must have the same length, that is, contain the same +#' number of sequences. An exception is made for scalar (length 1) \code{sq} +#' objects, which are replicated instead. +#' +#' @examples +#' # Creating objects to work on: +#' sq_dna_1 <- sq(c("TTCAGGGCTAG", "CGATTGC", "CAGTTTA"), +#' alphabet = "dna_bsc") +#' sq_dna_2 <- sq(c("ATCTTGAAG", "CATATGCGCTA", "ACGTGTCGA"), +#' alphabet = "dna_bsc") +#' sq_unt_1 <- sq(c("ATGCAGGA?", "TGACGAGCTTA", "", "TIAALGNIIYRAIE")) +#' sq_unt_2 <- sq(c("OVNU!!OK!!J", "GOK!MI!N!BB!", "DPOFIN!!", "??!?")) +#' +#' # Pasting sequences: +#' collapse(sq_dna_1, sq_dna_2) +#' collapse(sq_unt_1, sq_unt_2) +#' collapse(sq_dna_2, sq_unt_2, sq_dna_1) +#' +#' @family order_functions +#' @name paste +#' @export +paste.sq <- function(..., + NA_letter = getOption("tidysq_NA_letter")) { + # Throws error when there is no common size + vec_size_common(...) + assert_string(NA_letter, min.chars = 1) + + CPP_paste(vec_cast_common(...), NA_letter) +} diff --git a/R/translate.R b/R/translate.R index 56909fe..a525e11 100644 --- a/R/translate.R +++ b/R/translate.R @@ -11,9 +11,6 @@ #' \href{https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi}{here}. #' @template NA_letter #' @template three-dots -#' @param interpret_as_stop [\code{logical(1)}]\cr -#' Used with tables 27, 28 and 31 and their ambiguous translations. Tells if -#' these ambiguous codons should be interpreted as stop or the other option. #' #' @return An object of \code{\link[=sq-class]{class sq}} with \strong{ami_bsc} #' type. @@ -32,8 +29,8 @@ #' sequences with extended alphabets, as ambiguous letters in most cases cannot #' be translated into exactly one protein. #' -#' Moreover, behavior of this function is undefined whenever input sequence -#' contain either "\code{-}" or \code{NA} value. +#' Moreover, this function raises an error whenever input sequence contain +#' either "\code{-}" or \code{NA} value. #' #' @examples #' sq_dna <- sq(c("TACTGGGCATGA", "CAGGTC", "TAGTCCTAG"), alphabet = "dna_bsc") @@ -45,7 +42,7 @@ #' @export translate <- function(x, table = 1, ...) { assert_int(table) - assert_choice(table, c(1:16, 21:31, 33)) + assert_choice(table, c(1:16, 21:26, 29, 30, 33)) UseMethod("translate") } @@ -57,12 +54,10 @@ translate.default <- function(x, table = 1, ...) #' @rdname translate #' @export translate.sq_dna_bsc <- function(x, table = 1, ..., - NA_letter = getOption("tidysq_NA_letter"), - interpret_as_stop = FALSE) { + NA_letter = getOption("tidysq_NA_letter")) { assert_string(NA_letter, min.chars = 1) - assert_flag(interpret_as_stop) - CPP_translate(x, table, NA_letter, interpret_as_stop) + CPP_translate(x, table, NA_letter) } #' @rdname translate diff --git a/README.md b/README.md index 08a44db..3c61fbb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Github Actions Build Status](https://github.com/BioGenies/tidysq/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/BioGenies/tidysq/actions) [![codecov.io](https://codecov.io/github/BioGenies/tidysq/coverage.svg?branch=master)](https://codecov.io/github/BioGenies/tidysq?branch=master) - [![Lifecycle: maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) + [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental) diff --git a/inst/include/tidysq/collapse.h b/inst/include/tidysq/collapse.h new file mode 100644 index 0000000..8af8625 --- /dev/null +++ b/inst/include/tidysq/collapse.h @@ -0,0 +1,40 @@ +#pragma once + +#include "tidysq/Sq.h" + +namespace tidysq { + template + Sq collapse(const Sq &sq) { + const Alphabet& alph = sq.alphabet(); + const AlphSize& alph_size = alph.alphabet_size(); + Sq ret(1, alph); + + // Early return in case there is exactly one sequence, so that no processing is done + if (sq.size() == 1) { + ret[0] = sq[0]; + return ret; + } + + // First total sequence length (number of elements) is computed + LenSq element_count = 0; + for (LenSq i = 0; i < sq.size(); ++i) { + element_count += sq[i].get().original_length(); + } + Sequence sequence_out = Sequence((alph_size * element_count + 7) / 8, element_count); + + // Next an iterator is created and sequences are inputted one by one + auto out_seq_iter = sequence_out.begin(alph_size); + for (LenSq i = 0; i < sq.size(); ++i) { + const Sequence& sequence_in = sq[i]; + auto in_seq_iter = sequence_in.cbegin(alph_size); + while (out_seq_iter != sequence_out.end(alph_size) && in_seq_iter != sequence_in.cend(alph_size)) { + out_seq_iter.assign(*in_seq_iter); + ++in_seq_iter; + ++out_seq_iter; + } + } + ret[0] = sequence_out; + + return ret; + } +} diff --git a/inst/include/tidysq/constants/complement_tables.h b/inst/include/tidysq/constants/complement_tables.h index f8c95d5..908944f 100644 --- a/inst/include/tidysq/constants/complement_tables.h +++ b/inst/include/tidysq/constants/complement_tables.h @@ -3,14 +3,42 @@ #include "tidysq/tidysq-typedefs.h" namespace tidysq::constants { - const internal::ComplementTable BSC_COMPLEMENT_TABLE = { - {0u, 3u}, {1u, 2u}, {2u, 1u}, {3u, 0u} - }; - - const internal::ComplementTable EXT_COMPLEMENT_TABLE = { - {0u, 3u}, {1u, 2u}, {2u, 1u}, {3u, 0u}, - {4u, 4u}, {5u, 5u}, {6u, 7u}, {7u, 6u}, {8u, 9u}, {9u, 8u}, - {10u, 13u}, {11u, 12u}, {12u, 11u}, {13u, 10u}, - {14u, 14u} - }; + template + inline const LetterValue COMPLEMENT = CODON; + + template + inline const LetterValue COMPLEMENT = 3u; + + template + inline const LetterValue COMPLEMENT = 2u; + + template + inline const LetterValue COMPLEMENT = 1u; + + template + inline const LetterValue COMPLEMENT = 0u; + + template<> + inline const LetterValue COMPLEMENT = 7u; + + template<> + inline const LetterValue COMPLEMENT = 6u; + + template<> + inline const LetterValue COMPLEMENT = 9u; + + template<> + inline const LetterValue COMPLEMENT = 8u; + + template<> + inline const LetterValue COMPLEMENT = 13u; + + template<> + inline const LetterValue COMPLEMENT = 12u; + + template<> + inline const LetterValue COMPLEMENT = 11u; + + template<> + inline const LetterValue COMPLEMENT = 10u; } diff --git a/inst/include/tidysq/constants/translate_tables.h b/inst/include/tidysq/constants/translate_tables.h index 17298cf..043b4cb 100644 --- a/inst/include/tidysq/constants/translate_tables.h +++ b/inst/include/tidysq/constants/translate_tables.h @@ -3,208 +3,303 @@ #include "tidysq/tidysq-typedefs.h" namespace tidysq::constants { - const internal::CodonTable CODON_TABLE_1 = { - {0u, { - {0u, {{0u, 8u}, {1u, 11u}, {2u, 8u}, {3u, 11u}}}, - {1u, {{0u, 16u}, {1u, 16u}, {2u, 16u}, {3u, 16u}}}, - {2u, {{0u, 14u}, {1u, 15u}, {2u, 14u}, {3u, 15u}}}, - {3u, {{0u, 7u}, {1u, 7u}, {2u, 10u}, {3u, 7u}}} - }}, - {1u, { - {0u, {{0u, 13u}, {1u, 6u}, {2u, 13u}, {3u, 6u}}}, - {1u, {{0u, 12u}, {1u, 12u}, {2u, 12u}, {3u, 12u}}}, - {2u, {{0u, 14u}, {1u, 14u}, {2u, 14u}, {3u, 14u}}}, - {3u, {{0u, 9u}, {1u, 9u}, {2u, 9u}, {3u, 9u}}} - }}, - {2u, { - {0u, {{0u, 3u}, {1u, 2u}, {2u, 3u}, {3u, 2u}}}, - {1u, {{0u, 0u}, {1u, 0u}, {2u, 0u}, {3u, 0u}}}, - {2u, {{0u, 5u}, {1u, 5u}, {2u, 5u}, {3u, 5u}}}, - {3u, {{0u, 17u}, {1u, 17u}, {2u, 17u}, {3u, 17u}}} - }}, - {3u, { - {0u, {{0u, 21u}, {1u, 19u}, {2u, 21u}, {3u, 19u}}}, - {1u, {{0u, 15u}, {1u, 15u}, {2u, 15u}, {3u, 15u}}}, - {2u, {{0u, 21u}, {1u, 1u}, {2u, 18u}, {3u, 1u}}}, - {3u, {{0u, 4u}, {1u, 9u}, {2u, 4u}, {3u, 9u}}} - }} - }; - - const std::unordered_map CODON_DIFF_TABLES = { - {2, { - {0u, { - {2u, {{0u, 21u}, {2u, 21u}}}, - {3u, {{0u, 10u}}} - }}, - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {3, { - {0u, { - {3u, {{0u, 10u}}} - }}, - {1u, { - {2u, {{0u, 31u}, {1u, 31u}}}, - {3u, {{0u, 16u}, {1u, 16u}, {2u, 16u}, {3u, 16u}}} - }}, - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {4, { - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {5, { - {0u, { - {2u, {{0u, 15u}, {2u, 15u}}}, - {3u, {{0u, 10u}}} - }}, - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {6, { - {3u, { - {0u, {{0u, 13u}, {2u, 13u}}} - }} - }}, - {9, { - {0u, { - {0u, {{0u, 11u}}}, - {2u, {{0u, 15u}, {2u, 15u}}} - }}, - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {10, { - {3u, { - {2u, {{0u, 1u}}} - }} - }}, - {12, { - {1u, { - {3u, {{2u, 15u}}} - }} - }}, - {13, { - {0u, { - {2u, {{0u, 5u}, {2u, 5u}}}, - {3u, {{0u, 10u}}} - }}, - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {14, { - {0u, { - {0u, {{0u, 11u}}}, - {2u, {{0u, 15u}, {2u, 15u}}} - }}, - {3u, { - {0u, {{0u, 19u}}}, - {2u, {{0u, 18u}}} - }} - }}, - {15, { - {3u, { - {0u, {{2u, 13u}}} - }} - }}, - {16, { - {3u, { - {0u, {{2u, 9u}}} - }} - }}, - {21, { - {0u, { - {0u, {{0u, 11u}}}, - {2u, {{0u, 15u}, {2u, 15u}}}, - {3u, {{0u, 10u}}} - }}, - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {22, { - {3u, { - {0u, {{2u, 9u}}}, - {1u, {{0u, 21u}}} - }} - }}, - {23, { - {3u, { - {3u, {{0u, 21u}}} - }} - }}, - {24, { - {0u, { - {2u, {{0u, 15u}, {2u, 8u}}} - }}, - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {25, { - {3u, { - {2u, {{0u, 5u}}} - }} - }}, - {26, { - {1u, { - {3u, {{2u, 0u}}} - }} - }}, - {27, { - {3u, { - {0u, {{0u, 13u}, {2u, 13u}}}, - }} - }}, - {29, { - {3u, { - {0u, {{0u, 19u}, {2u, 19u}}} - }} - }}, - {30, { - {3u, { - {0u, {{0u, 3u}, {2u, 3u}}} - }} - }}, - {31, { - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {33, { - {0u, { - {2u, {{0u, 15u}, {2u, 8u}}} - }}, - {3u, { - {0u, {{0u, 19u}}}, - {2u, {{0u, 18u}}} - }} - }} - }; - - const std::unordered_map AMB_CODON_DIFF_TABLES = { - {27, { - {3u, { - {2u, {{0u, 18u}}} - }} - }}, - {28, { - {3u, { - {0u, {{0u, 13u}, {2u, 13u}}}, - {2u, {{0u, 18u}}} - }} - }}, - {31, { - {3u, { - {0u, {{0u, 3u}, {2u, 3u}}} - }} - }} - }; + template + inline const LetterValue CODON = 31u; + + template + inline const LetterValue CODON = 8u; + + template + inline const LetterValue CODON = 11u; + + template + inline const LetterValue CODON = 8u; + + template + inline const LetterValue CODON = 11u; + + template + inline const LetterValue CODON = 16u; + + template + inline const LetterValue CODON = 14u; + + template + inline const LetterValue CODON = 15u; + + template + inline const LetterValue CODON = 14u; + + template + inline const LetterValue CODON = 15u; + + template + inline const LetterValue CODON = 7u; + + template + inline const LetterValue CODON = 10u; + + template + inline const LetterValue CODON = 13u; + + template + inline const LetterValue CODON = 6u; + + template + inline const LetterValue CODON = 13u; + + template + inline const LetterValue CODON = 6u; + + template + inline const LetterValue CODON = 12u; + + template + inline const LetterValue CODON = 14u; + + template + inline const LetterValue CODON = 9u; + + template + inline const LetterValue CODON = 3u; + + template + inline const LetterValue CODON = 2u; + + template + inline const LetterValue CODON = 3u; + + template + inline const LetterValue CODON = 2u; + + template + inline const LetterValue CODON = 0u; + + template + inline const LetterValue CODON = 5u; + + template + inline const LetterValue CODON = 17u; + + template + inline const LetterValue CODON = 21u; + + template + inline const LetterValue CODON = 19u; + + template + inline const LetterValue CODON = 21u; + + template + inline const LetterValue CODON = 19u; + + template + inline const LetterValue CODON = 15u; + + template + inline const LetterValue CODON = 21u; + + template + inline const LetterValue CODON = 1u; + + template + inline const LetterValue CODON = 18u; + + template + inline const LetterValue CODON = 1u; + + template + inline const LetterValue CODON = 4u; + + template + inline const LetterValue CODON = 9u; + + template + inline const LetterValue CODON = 4u; + + template + inline const LetterValue CODON = 9u; + + + template<> + inline const LetterValue CODON<2, 0u, 2u, 0u> = 21u; + + template<> + inline const LetterValue CODON<2, 0u, 2u, 2u> = 21u; + + template<> + inline const LetterValue CODON<2, 0u, 3u, 0u> = 10u; + + template<> + inline const LetterValue CODON<2, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<3, 0u, 3u, 0u> = 10u; + + template<> + inline const LetterValue CODON<3, 1u, 2u, 0u> = 31u; + + template<> + inline const LetterValue CODON<3, 1u, 2u, 1u> = 31u; + + template + inline const LetterValue CODON<3, 1u, 3u, C3> = 16u; + + template<> + inline const LetterValue CODON<3, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<4, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<5, 0u, 2u, 0u> = 15u; + + template<> + inline const LetterValue CODON<5, 0u, 2u, 2u> = 15u; + + template<> + inline const LetterValue CODON<5, 0u, 3u, 0u> = 10u; + + template<> + inline const LetterValue CODON<5, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<6, 3u, 0u, 0u> = 13u; + + template<> + inline const LetterValue CODON<6, 3u, 0u, 2u> = 13u; + + + template<> + inline const LetterValue CODON<9, 0u, 0u, 0u> = 11u; + + template<> + inline const LetterValue CODON<9, 0u, 2u, 0u> = 15u; + + template<> + inline const LetterValue CODON<9, 0u, 2u, 2u> = 15u; + + template<> + inline const LetterValue CODON<9, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<10, 3u, 2u, 0u> = 1u; + + + template<> + inline const LetterValue CODON<12, 1u, 3u, 2u> = 15u; + + + template<> + inline const LetterValue CODON<13, 0u, 2u, 0u> = 5u; + + template<> + inline const LetterValue CODON<13, 0u, 2u, 2u> = 5u; + + template<> + inline const LetterValue CODON<13, 0u, 3u, 0u> = 10u; + + template<> + inline const LetterValue CODON<13, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<14, 0u, 0u, 0u> = 11u; + + template<> + inline const LetterValue CODON<14, 0u, 2u, 0u> = 15u; + + template<> + inline const LetterValue CODON<14, 0u, 2u, 2u> = 15u; + + template<> + inline const LetterValue CODON<14, 3u, 0u, 0u> = 19u; + + template<> + inline const LetterValue CODON<14, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<15, 3u, 0u, 2u> = 13u; + + + template<> + inline const LetterValue CODON<16, 3u, 0u, 2u> = 9u; + + + template<> + inline const LetterValue CODON<21, 0u, 0u, 0u> = 11u; + + template<> + inline const LetterValue CODON<21, 0u, 2u, 0u> = 15u; + + template<> + inline const LetterValue CODON<21, 0u, 2u, 2u> = 15u; + + template<> + inline const LetterValue CODON<21, 0u, 3u, 0u> = 10u; + + template<> + inline const LetterValue CODON<21, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<22, 3u, 0u, 2u> = 9u; + + template<> + inline const LetterValue CODON<22, 3u, 1u, 0u> = 21u; + + + template<> + inline const LetterValue CODON<23, 3u, 3u, 0u> = 21u; + + + template<> + inline const LetterValue CODON<24, 0u, 2u, 0u> = 15u; + + template<> + inline const LetterValue CODON<24, 0u, 2u, 2u> = 8u; + + template<> + inline const LetterValue CODON<24, 3u, 2u, 0u> = 18u; + + + template<> + inline const LetterValue CODON<25, 3u, 2u, 0u> = 5u; + + + template<> + inline const LetterValue CODON<26, 1u, 3u, 2u> = 0u; + + + template<> + inline const LetterValue CODON<29, 3u, 0u, 0u> = 19u; + + template<> + inline const LetterValue CODON<29, 3u, 0u, 2u> = 19u; + + + template<> + inline const LetterValue CODON<30, 3u, 0u, 0u> = 3u; + + template<> + inline const LetterValue CODON<30, 3u, 0u, 2u> = 3u; + + + template<> + inline const LetterValue CODON<33, 0u, 2u, 0u> = 15u; + + template<> + inline const LetterValue CODON<33, 0u, 2u, 2u> = 8u; + + template<> + inline const LetterValue CODON<33, 3u, 0u, 0u> = 19u; + + template<> + inline const LetterValue CODON<33, 3u, 2u, 0u> = 18u; } diff --git a/inst/include/tidysq/ops/complement.h b/inst/include/tidysq/ops/complement.h index 66ff46a..73ded97 100644 --- a/inst/include/tidysq/ops/complement.h +++ b/inst/include/tidysq/ops/complement.h @@ -5,42 +5,63 @@ #include "tidysq/constants/complement_tables.h" namespace tidysq { + namespace internal { + constexpr LetterValue read_complement(const SqType &type, const LetterValue& value) { + switch (type) { + case DNA_BSC: + case RNA_BSC: + switch (value) { + case 0u: return constants::COMPLEMENT; + case 1u: return constants::COMPLEMENT; + case 2u: return constants::COMPLEMENT; + case 3u: return constants::COMPLEMENT; + case 4u: return constants::COMPLEMENT; + default: return constants::COMPLEMENT; + } + case DNA_EXT: + case RNA_EXT: + switch (value) { + case 0u: return constants::COMPLEMENT; + case 1u: return constants::COMPLEMENT; + case 2u: return constants::COMPLEMENT; + case 3u: return constants::COMPLEMENT; + case 4u: return constants::COMPLEMENT; + case 5u: return constants::COMPLEMENT; + case 6u: return constants::COMPLEMENT; + case 7u: return constants::COMPLEMENT; + case 8u: return constants::COMPLEMENT; + case 9u: return constants::COMPLEMENT; + case 10u: return constants::COMPLEMENT; + case 11u: return constants::COMPLEMENT; + case 12u: return constants::COMPLEMENT; + case 13u: return constants::COMPLEMENT; + case 14u: return constants::COMPLEMENT; + case 15u: return constants::COMPLEMENT; + default: return constants::COMPLEMENT; + } + default: + throw std::invalid_argument("complement makes sense only for DNA and RNA sequences"); + } + } + } + namespace ops { template class OperationComplement : public OperationSqToSq { const AlphSize alph_size_; - const internal::ComplementTable &table_; - - [[nodiscard]] const internal::ComplementTable & match_table(const SqType &type) const { - switch (type) { - case DNA_BSC: - case RNA_BSC: - return constants::BSC_COMPLEMENT_TABLE; - case DNA_EXT: - case RNA_EXT: - return constants::EXT_COMPLEMENT_TABLE; - default: - throw std::invalid_argument("complement makes sense only for DNA and RNA sequences"); - } - } + const SqType& type_; public: explicit OperationComplement(const AlphSize alph_size, const SqType &type) : alph_size_(alph_size), - table_(match_table(type)) {}; + type_(type) {}; void operator()(const Sequence &sequence_in, Sequence &sequence_out) override { auto in_seq_iter = sequence_in.cbegin(alph_size_); auto out_seq_iter = sequence_out.begin(alph_size_); - while (out_seq_iter != sequence_out.end(alph_size_) || in_seq_iter != sequence_in.cend(alph_size_)) { - LetterValue in_letter = *in_seq_iter; - if (table_.count(in_letter) > 0) { - out_seq_iter.assign(table_.at(in_letter)); - } else { - out_seq_iter.assign(in_letter); - } - ++in_seq_iter; - ++out_seq_iter; + for (; out_seq_iter != sequence_out.end(alph_size_) || in_seq_iter != sequence_in.cend(alph_size_); + ++in_seq_iter, ++out_seq_iter) { + out_seq_iter.assign(internal::read_complement(type_, *in_seq_iter)); } } diff --git a/inst/include/tidysq/ops/random_sq.h b/inst/include/tidysq/ops/random_sq.h index d660d02..0e06ebc 100644 --- a/inst/include/tidysq/ops/random_sq.h +++ b/inst/include/tidysq/ops/random_sq.h @@ -3,6 +3,7 @@ #include "tidysq/ProtoSequence.h" #include "tidysq/ops/Operation.h" #include "tidysq/ops/pack.h" +#include "tidysq/util/random.h" #include "tidysq/sqapply.h" namespace tidysq { @@ -44,7 +45,7 @@ namespace tidysq { inline void operator() (const LenSq &length, Sequence &sequence) override { for (auto it = sequence.begin(alphabet_.alphabet_size()); it != sequence.end(alphabet_.alphabet_size()); ++it) { - it.assign(letter_values_[rand() % letter_values_.size()]); + it.assign(letter_values_[util::random_value(alphabet_.size())]); } } diff --git a/inst/include/tidysq/ops/translate.h b/inst/include/tidysq/ops/translate.h index 690cbcf..3153a7b 100644 --- a/inst/include/tidysq/ops/translate.h +++ b/inst/include/tidysq/ops/translate.h @@ -7,6 +7,99 @@ #include "tidysq/constants/translate_tables.h" namespace tidysq { + namespace internal { + +#define THIRD_CODON(CODON_1, CODON_2) \ +switch (value_3) { \ + case 0u: return constants::CODON; \ + case 1u: return constants::CODON; \ + case 2u: return constants::CODON; \ + case 3u: return constants::CODON; \ + default: throw std::invalid_argument("translation must be made with four standard DNA/RNA letters only"); \ +} + +#define SECOND_CODON(CODON_1) \ +switch (value_2) { \ + case 0u: THIRD_CODON(CODON_1, 0) \ + case 1u: THIRD_CODON(CODON_1, 1) \ + case 2u: THIRD_CODON(CODON_1, 2) \ + case 3u: THIRD_CODON(CODON_1, 3) \ + default: throw std::invalid_argument("translation must be made with four standard DNA/RNA letters only"); \ +} + +#define FIRST_CODON \ +switch (value_1) { \ + case 0u: SECOND_CODON(0) \ + case 1u: SECOND_CODON(1) \ + case 2u: SECOND_CODON(2) \ + case 3u: SECOND_CODON(3) \ + default: throw std::invalid_argument("translation must be made with four standard DNA/RNA letters only"); \ +} + + template + constexpr LetterValue read_codon(LetterValue value_1, LetterValue value_2, LetterValue value_3) { + FIRST_CODON + } + +#undef FIRST_CODON +#undef SECOND_CODON +#undef THIRD_CODON + + constexpr LetterValue read_codon(int table, LetterValue value_1, LetterValue value_2, LetterValue value_3) { + switch (table) { + case 1: + case 8: + case 11: + return read_codon<1>(value_1, value_2, value_3); + case 2: + return read_codon<2>(value_1, value_2, value_3); + case 3: + return read_codon<3>(value_1, value_2, value_3); + case 4: + case 7: + return read_codon<4>(value_1, value_2, value_3); + case 5: + return read_codon<5>(value_1, value_2, value_3); + case 6: + return read_codon<6>(value_1, value_2, value_3); + case 9: + return read_codon<9>(value_1, value_2, value_3); + case 10: + return read_codon<10>(value_1, value_2, value_3); + case 12: + return read_codon<12>(value_1, value_2, value_3); + case 13: + return read_codon<13>(value_1, value_2, value_3); + case 14: + return read_codon<14>(value_1, value_2, value_3); + case 15: + return read_codon<15>(value_1, value_2, value_3); + case 16: + return read_codon<16>(value_1, value_2, value_3); + case 21: + return read_codon<21>(value_1, value_2, value_3); + case 22: + return read_codon<22>(value_1, value_2, value_3); + case 23: + return read_codon<23>(value_1, value_2, value_3); + case 24: + return read_codon<24>(value_1, value_2, value_3); + case 25: + return read_codon<25>(value_1, value_2, value_3); + case 26: + return read_codon<26>(value_1, value_2, value_3); + case 29: + return read_codon<29>(value_1, value_2, value_3); + case 30: + return read_codon<30>(value_1, value_2, value_3); + case 33: + return read_codon<33>(value_1, value_2, value_3); + default: + throw std::invalid_argument("specified table doesn't exist"); + } + } + } + namespace ops { template class OperationTranslate : public OperationSqToSq { @@ -14,50 +107,10 @@ namespace tidysq { const AlphSize AMI_BSC_ALPH_SIZE = Alphabet(AMI_BSC).alphabet_size(); const unsigned int table_; - const bool interpret_as_stop_; - - inline unsigned int reduce_table(const unsigned int table) { - // Some tables are actually identical to some others - if (table == 7) return 4; - if (table == 8) return 1; - if (table == 11) return 1; - return table; - } - - // TODO: issue #58 - inline LetterValue codon_table(const LetterValue &codon_1, - const LetterValue &codon_2, - const LetterValue &codon_3) { - // If this is non-standard table, then we have this kind of @Override - if (table_ != 1) { - // The only way to include ambiguous translation tables (27, 28, 31) - // I don't really like this solution, maybe we should just drop the support for that - if ((table_ == 27 || table_ == 28 || table_ == 31) && !interpret_as_stop_) { - const auto& amb_codon_diff_table = constants::AMB_CODON_DIFF_TABLES.at(table_); - if (amb_codon_diff_table.count(codon_1) > 0 && - amb_codon_diff_table.at(codon_1).count(codon_2) > 0 && - amb_codon_diff_table.at(codon_1).at(codon_2).count(codon_3) > 0) { - return amb_codon_diff_table.at(codon_1).at(codon_2).at(codon_3); - } - } - // Then find correct table of differences - const auto& codon_diff_table = constants::CODON_DIFF_TABLES.at(table_); - if (codon_diff_table.count(codon_1) > 0 && - codon_diff_table.at(codon_1).count(codon_2) > 0 && - codon_diff_table.at(codon_1).at(codon_2).count(codon_3) > 0) { - const auto amino_acid = codon_diff_table.at(codon_1).at(codon_2).at(codon_3); - // TODO: issue #62 - return amino_acid; - } - } - return constants::CODON_TABLE_1.at(codon_1).at(codon_2).at(codon_3); - } public: - explicit OperationTranslate(const unsigned int &table, - const bool &interpret_as_stop) : - table_(reduce_table(table)), - interpret_as_stop_(interpret_as_stop) {}; + explicit OperationTranslate(const unsigned int &table) : + table_(table) {}; [[nodiscard]] Alphabet map_alphabet(const Alphabet &alphabet_in) const override { return Alphabet(AMI_BSC, alphabet_in.NA_letter(), alphabet_in.ignores_case()); @@ -75,7 +128,7 @@ namespace tidysq { auto codon_1 = *input_it++; auto codon_2 = *input_it++; auto codon_3 = *input_it++; - output_it.assign(codon_table(codon_1, codon_2, codon_3)); + output_it.assign(internal::read_codon(table_, codon_1, codon_2, codon_3)); ++output_it; } } @@ -91,15 +144,13 @@ namespace tidysq { template Sq translate(const Sq &sq, - const unsigned int &table = 1, - const bool &interpret_as_stop = false) { - return sqapply(sq, ops::OperationTranslate(table, interpret_as_stop)); + const unsigned int &table = 1) { + return sqapply(sq, ops::OperationTranslate(table)); } template Sequence translate(const Sequence &sequence, - const unsigned int &table = 1, - const bool &interpret_as_stop = false) { - return ops::OperationTranslate(table, interpret_as_stop)(sequence); + const unsigned int &table = 1) { + return ops::OperationTranslate(table)(sequence); } } \ No newline at end of file diff --git a/inst/include/tidysq/paste.h b/inst/include/tidysq/paste.h new file mode 100644 index 0000000..5212f24 --- /dev/null +++ b/inst/include/tidysq/paste.h @@ -0,0 +1,61 @@ +#pragma once + +#include "tidysq/Sq.h" +#include "tidysq/util/find_common_size.h" + +namespace tidysq { + template + Sq paste(const std::vector> &list_of_sq) { + // If there are no sq, then we don't know what SqType to return + if (list_of_sq.empty()) { + throw std::invalid_argument("cannot paste empty vector"); + } + + // Now we can assume that there exists an element under index 0 + const Alphabet& alph = list_of_sq[0].alphabet(); + const AlphSize& alph_size = alph.alphabet_size(); + + if (std::any_of(list_of_sq.cbegin(), list_of_sq.cend(), [alph](const Sq& other) { + return other.alphabet() != alph; + })) { + throw std::invalid_argument("pasting sq objects with different alphabets is not implemented"); + } + + const LenSq& common_size = util::find_common_size(list_of_sq); + Sq ret(common_size, alph); + + // First total sequence lengths (number of elements) is computed + std::vector element_counts(common_size); + for (const Sq& sq : list_of_sq) { + // If sq is a scalar (length 1), then increment all counts by its original_length + if (sq.size() == 1) { + for (LenSq& element_count : element_counts) { + element_count += sq[0].get().original_length(); + } + } else { + for (LenSq i = 0; i < element_counts.size(); ++i) { + element_counts[i] += sq[i].get().original_length(); + } + } + } + + for (LenSq i = 0; i < ret.size(); ++i) { + Sequence sequence_out = + Sequence((alph_size * element_counts[i] + 7) / 8, element_counts[i]); + auto out_seq_iter = sequence_out.begin(alph_size); + for (const Sq& sq : list_of_sq) { + // If sq is a scalar (length 1), then append the only Sequence present + const Sequence& sequence_in = sq[sq.size() == 1 ? 0 : i]; + auto in_seq_iter = sequence_in.cbegin(alph_size); + while (out_seq_iter != sequence_out.end(alph_size) && in_seq_iter != sequence_in.cend(alph_size)) { + out_seq_iter.assign(*in_seq_iter); + ++in_seq_iter; + ++out_seq_iter; + } + } + ret[i] = sequence_out; + } + + return ret; + } +} diff --git a/inst/include/tidysq/util/find_common_size.h b/inst/include/tidysq/util/find_common_size.h new file mode 100644 index 0000000..8589465 --- /dev/null +++ b/inst/include/tidysq/util/find_common_size.h @@ -0,0 +1,20 @@ +#pragma once + +#include "tidysq/tidysq-typedefs.h" + +namespace tidysq::util { + template + inline LenSq find_common_size(const std::vector> &list_of_sq) { + LenSq common_size = 1; + for (const Sq& sq : list_of_sq) { + if (sq.size() != 1) { + if (common_size == 1) { + common_size = sq.size(); + } else if (common_size != sq.size()) { + throw std::invalid_argument("all sq objects must have either the same size or size 1"); + } + } + } + return common_size; + } +} diff --git a/inst/include/tidysq/util/random.h b/inst/include/tidysq/util/random.h new file mode 100644 index 0000000..6346484 --- /dev/null +++ b/inst/include/tidysq/util/random.h @@ -0,0 +1,15 @@ +#pragma once + +#include "tidysq/tidysq-typedefs.h" + +namespace tidysq::util { + template + inline LenSq random_value(LenSq alphabet_length) { + return rand() % alphabet_length; + } + + template<> + inline LenSq random_value(LenSq alphabet_length) { + return R::runif(0, alphabet_length - 1); + } +} \ No newline at end of file diff --git a/man/bite.Rd b/man/bite.Rd index 5ebe107..27ae5b2 100644 --- a/man/bite.Rd +++ b/man/bite.Rd @@ -87,6 +87,8 @@ bite(sq_dna, -1:-4) \code{\link{remove_na}} Functions that affect order of elements: +\code{\link{collapse}()}, +\code{\link{paste}()}, \code{\link{reverse}()} } \concept{order_functions} diff --git a/man/collapse.Rd b/man/collapse.Rd new file mode 100644 index 0000000..5b8f94f --- /dev/null +++ b/man/collapse.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/collapse.R +\name{collapse} +\alias{collapse} +\alias{collapse.sq} +\title{Collapse multiple sequences into one} +\usage{ +collapse(x, ...) + +\method{collapse}{sq}(x, ..., NA_letter = getOption("tidysq_NA_letter")) +} +\arguments{ +\item{x}{[\code{sq}]\cr +An object this function is applied to.} + +\item{...}{further arguments to be passed from or to other methods.} + +\item{NA_letter}{[\code{character(1)}]\cr +A string that is used to interpret and display \code{NA} value in the +context of \code{\link[=sq-class]{sq class}}. Default value equals to +"\code{!}".} +} +\value{ +\code{\link[=sq-class]{sq}} object of the same type as input but with +exactly one sequence. +} +\description{ +Joins sequences from a vector into a single sequence. Sequence +type remains unchanged. +} +\details{ +\code{collapse()} joins sequences from supplied \code{sq} object in the same +order as they appear in said vector. That is, if there are three sequences +AGGCT, ATCCGT and GAACGT, then resulting sequence will be AGGCTATCCGTGAACGT. +This operation does not alter the type of the input object nor its alphabet. +} +\examples{ +# Creating objects to work on: +sq_ami <- sq(c("MIAANYTWIL","TIAALGNIIYRAIE", "NYERTGHLI", "MAYXXXIALN"), + alphabet = "ami_ext") +sq_dna <- sq(c("ATGCAGGA", "GACCGAACGAN", ""), alphabet = "dna_ext") +sq_unt <- sq(c("ATGCAGGA?", "TGACGAGCTTA", "", "TIAALGNIIYRAIE")) + +# Collapsing sequences: +collapse(sq_ami) +collapse(sq_dna) +collapse(sq_unt) + +# Empty sq objects are collapsed as well (into empty string - ""): +sq_empty <- sq(character(), alphabet = "rna_bsc") +collapse(sq_empty) + +} +\seealso{ +Functions that affect order of elements: +\code{\link{bite}()}, +\code{\link{paste}()}, +\code{\link{reverse}()} +} +\concept{order_functions} diff --git a/man/export_sq.Rd b/man/export_sq.Rd index 9b9e844..7764cde 100644 --- a/man/export_sq.Rd +++ b/man/export_sq.Rd @@ -22,8 +22,8 @@ Vector of sequence names. Must be of the same length as \code{sq} object. \description{ Converts object of class \code{\link[=sq-class]{sq}} to a class from another package. Currently supported packages are \pkg{ape}, -\pkg{Bioconductor} and \pkg{seqinr}. For exact list of supported classes and -resulting types, see details. +\pkg{bioseq}, \pkg{Bioconductor} and \pkg{seqinr}. For exact list of +supported classes and resulting types, see details. } \details{ Currently supported formats are as follows (grouped by \code{sq} types): @@ -31,6 +31,7 @@ Currently supported formats are as follows (grouped by \code{sq} types): \item \strong{ami}: \itemize{ \item \code{"ape::AAbin"} + \item \code{"bioseq::bioseq_aa"} \item \code{"Biostrings::AAString"} \item \code{"Biostrings::AAStringSet"} \item \code{"seqinr::SeqFastaAA"} @@ -38,12 +39,14 @@ Currently supported formats are as follows (grouped by \code{sq} types): \item \strong{dna}: \itemize{ \item \code{"ape::DNAbin"} + \item \code{"bioseq::bioseq_dna"} \item \code{"Biostrings::DNAString"} \item \code{"Biostrings::DNAStringSet"} \item \code{"seqinr::SeqFastadna"} } \item \strong{rna}: \itemize{ + \item \code{"bioseq::bioseq_rna"} \item \code{"Biostrings::RNAString"} \item \code{"Biostrings::RNAStringSet"} } @@ -53,19 +56,22 @@ Currently supported formats are as follows (grouped by \code{sq} types): # DNA and amino acid sequences can be exported to most packages sq_ami <- sq(c("MVVGL", "LAVPP"), alphabet = "ami_bsc") export_sq(sq_ami, "ape::AAbin") +export_sq(sq_ami, "bioseq::bioseq_aa") export_sq(sq_ami, "Biostrings::AAStringSet", c("one", "two")) export_sq(sq_ami, "seqinr::SeqFastaAA") sq_dna <- sq(c("TGATGAAGCGCA", "TTGATGGGAA"), alphabet = "dna_bsc") export_sq(sq_dna, "ape::DNAbin", name = c("one", "two")) +export_sq(sq_dna, "bioseq::bioseq_dna") export_sq(sq_dna, "Biostrings::DNAStringSet") export_sq(sq_dna, "seqinr::SeqFastadna") -# RNA sequences are limited to Biostrings +# RNA sequences are limited to Biostrings and bioseq sq_rna <- sq(c("NUARYGCB", "", "DRKCNYBAU"), alphabet = "rna_ext") +export_sq(sq_rna, "bioseq::bioseq_rna") export_sq(sq_rna, "Biostrings::RNAStringSet") -# Biostrings accept single sequences as well +# Biostrings can export single sequences to simple strings as well export_sq(sq_dna[1], "Biostrings::DNAString") } diff --git a/man/import_sq.Rd b/man/import_sq.Rd index 06bcbe9..6bb2d27 100644 --- a/man/import_sq.Rd +++ b/man/import_sq.Rd @@ -22,8 +22,8 @@ another column \code{name} with those names \description{ Creates \code{\link[=sq-class]{sq}} object from object of class from another package. Currently supported packages are \pkg{ape}, -\pkg{Bioconductor} and \pkg{seqinr}. For exact list of supported classes and -resulting types, see details. +\pkg{bioseq}, \pkg{Bioconductor} and \pkg{seqinr}. For exact list of +supported classes and resulting types, see details. } \details{ Currently supported classes are as follows: @@ -35,6 +35,12 @@ Currently supported classes are as follows: \item \code{alignment} - exact type is guessed within \code{\link{sq}} function } +\item \code{bioseq}: + \itemize{ + \item \code{bioseq_aa} - imported as \strong{ami_ext} + \item \code{bioseq_dna} - imported as \strong{dna_ext} + \item \code{bioseq_rna} - imported as \strong{rna_ext} + } \item \code{Biostrings}: \itemize{ \item \code{AAString} - imported as \strong{ami_ext} with exactly one @@ -69,6 +75,11 @@ library(ape) ape_dna <- as.DNAbin(list(one = c("C", "T", "C", "A"), two = c("T", "G", "A", "G", "G"))) import_sq(ape_dna) +# bioseq example +library(bioseq) +bioseq_rna <- new_rna(c(one = "ANBRY", two = "YUTUGGN")) +import_sq(bioseq_rna) + # Biostrings example library(Biostrings) Biostrings_ami <- AAStringSet(c(one = "FEAPQLIWY", two = "EGITENAK")) diff --git a/man/paste.Rd b/man/paste.Rd new file mode 100644 index 0000000..9d71219 --- /dev/null +++ b/man/paste.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/paste.R +\name{paste} +\alias{paste} +\alias{paste.sq} +\title{Paste sequences in string-like fashion} +\usage{ +\method{paste}{sq}(..., NA_letter = getOption("tidysq_NA_letter")) +} +\arguments{ +\item{...}{[\code{sq}]\cr +Sequences to paste together.} + +\item{NA_letter}{[\code{character(1)}]\cr +A string that is used to interpret and display \code{NA} value in the +context of \code{\link[=sq-class]{sq class}}. Default value equals to +"\code{!}".} +} +\value{ +\code{\link[=sq-class]{sq}} object of common type of input objects. +Common type is determined in the same process as for +\code{\link[=sq-concatenate]{c.sq}()}. +} +\description{ +Joins multiple vectors of sequences into one vector. +} +\details{ +\code{paste()} joins sequences in the same way as it does with strings. +All \code{sq} objects must have the same length, that is, contain the same +number of sequences. An exception is made for scalar (length 1) \code{sq} +objects, which are replicated instead. +} +\examples{ +# Creating objects to work on: +sq_dna_1 <- sq(c("TTCAGGGCTAG", "CGATTGC", "CAGTTTA"), + alphabet = "dna_bsc") +sq_dna_2 <- sq(c("ATCTTGAAG", "CATATGCGCTA", "ACGTGTCGA"), + alphabet = "dna_bsc") +sq_unt_1 <- sq(c("ATGCAGGA?", "TGACGAGCTTA", "", "TIAALGNIIYRAIE")) +sq_unt_2 <- sq(c("OVNU!!OK!!J", "GOK!MI!N!BB!", "DPOFIN!!", "??!?")) + +# Pasting sequences: +collapse(sq_dna_1, sq_dna_2) +collapse(sq_unt_1, sq_unt_2) +collapse(sq_dna_2, sq_unt_2, sq_dna_1) + +} +\seealso{ +Functions that affect order of elements: +\code{\link{bite}()}, +\code{\link{collapse}()}, +\code{\link{reverse}()} +} +\concept{order_functions} diff --git a/man/reverse.Rd b/man/reverse.Rd index 4424262..b9802ad 100644 --- a/man/reverse.Rd +++ b/man/reverse.Rd @@ -47,6 +47,8 @@ reverse(sq_unt) } \seealso{ Functions that affect order of elements: -\code{\link{bite}()} +\code{\link{bite}()}, +\code{\link{collapse}()}, +\code{\link{paste}()} } \concept{order_functions} diff --git a/man/translate.Rd b/man/translate.Rd index 9a037e8..ecc6cde 100644 --- a/man/translate.Rd +++ b/man/translate.Rd @@ -8,21 +8,9 @@ \usage{ translate(x, table = 1, ...) -\method{translate}{sq_dna_bsc}( - x, - table = 1, - ..., - NA_letter = getOption("tidysq_NA_letter"), - interpret_as_stop = FALSE -) +\method{translate}{sq_dna_bsc}(x, table = 1, ..., NA_letter = getOption("tidysq_NA_letter")) -\method{translate}{sq_rna_bsc}( - x, - table = 1, - ..., - NA_letter = getOption("tidysq_NA_letter"), - interpret_as_stop = FALSE -) +\method{translate}{sq_rna_bsc}(x, table = 1, ..., NA_letter = getOption("tidysq_NA_letter")) } \arguments{ \item{x}{[\code{sq_dna_bsc} || \code{sq_rna_bsc}]\cr @@ -38,10 +26,6 @@ An object this function is applied to.} A string that is used to interpret and display \code{NA} value in the context of \code{\link[=sq-class]{sq class}}. Default value equals to "\code{!}".} - -\item{interpret_as_stop}{[\code{logical(1)}]\cr -Used with tables 27, 28 and 31 and their ambiguous translations. Tells if -these ambiguous codons should be interpreted as stop or the other option.} } \value{ An object of \code{\link[=sq-class]{class sq}} with \strong{ami_bsc} @@ -66,8 +50,8 @@ Due to how the tables works, \code{translate()} does not support inputting sequences with extended alphabets, as ambiguous letters in most cases cannot be translated into exactly one protein. -Moreover, behavior of this function is undefined whenever input sequence -contain either "\code{-}" or \code{NA} value. +Moreover, this function raises an error whenever input sequence contain +either "\code{-}" or \code{NA} value. } \examples{ sq_dna <- sq(c("TACTGGGCATGA", "CAGGTC", "TAGTCCTAG"), alphabet = "dna_bsc") diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b4a1fb7..5e857b3 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -34,6 +34,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// CPP_collapse +Rcpp::List CPP_collapse(const Rcpp::List& x, const tidysq::Letter& NA_letter); +RcppExport SEXP _tidysq_CPP_collapse(SEXP xSEXP, SEXP NA_letterSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::List& >::type x(xSEXP); + Rcpp::traits::input_parameter< const tidysq::Letter& >::type NA_letter(NA_letterSEXP); + rcpp_result_gen = Rcpp::wrap(CPP_collapse(x, NA_letter)); + return rcpp_result_gen; +END_RCPP +} // CPP_complement Rcpp::List CPP_complement(const Rcpp::List& x, const tidysq::Letter& NA_letter); RcppExport SEXP _tidysq_CPP_complement(SEXP xSEXP, SEXP NA_letterSEXP) { @@ -179,6 +191,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// CPP_paste +Rcpp::List CPP_paste(const Rcpp::List& list_of_x, const tidysq::Letter& NA_letter); +RcppExport SEXP _tidysq_CPP_paste(SEXP list_of_xSEXP, SEXP NA_letterSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::List& >::type list_of_x(list_of_xSEXP); + Rcpp::traits::input_parameter< const tidysq::Letter& >::type NA_letter(NA_letterSEXP); + rcpp_result_gen = Rcpp::wrap(CPP_paste(list_of_x, NA_letter)); + return rcpp_result_gen; +END_RCPP +} // CPP_random_sq Rcpp::List CPP_random_sq(const int& n, const Rcpp::IntegerVector& len, const Rcpp::StringVector& alphabet, const bool& use_gap); RcppExport SEXP _tidysq_CPP_random_sq(SEXP nSEXP, SEXP lenSEXP, SEXP alphabetSEXP, SEXP use_gapSEXP) { @@ -273,16 +297,15 @@ BEGIN_RCPP END_RCPP } // CPP_translate -Rcpp::List CPP_translate(const Rcpp::List& x, const int& table, const tidysq::Letter& NA_letter, const bool& interpret_as_stop); -RcppExport SEXP _tidysq_CPP_translate(SEXP xSEXP, SEXP tableSEXP, SEXP NA_letterSEXP, SEXP interpret_as_stopSEXP) { +Rcpp::List CPP_translate(const Rcpp::List& x, const int& table, const tidysq::Letter& NA_letter); +RcppExport SEXP _tidysq_CPP_translate(SEXP xSEXP, SEXP tableSEXP, SEXP NA_letterSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const Rcpp::List& >::type x(xSEXP); Rcpp::traits::input_parameter< const int& >::type table(tableSEXP); Rcpp::traits::input_parameter< const tidysq::Letter& >::type NA_letter(NA_letterSEXP); - Rcpp::traits::input_parameter< const bool& >::type interpret_as_stop(interpret_as_stopSEXP); - rcpp_result_gen = Rcpp::wrap(CPP_translate(x, table, NA_letter, interpret_as_stop)); + rcpp_result_gen = Rcpp::wrap(CPP_translate(x, table, NA_letter)); return rcpp_result_gen; END_RCPP } @@ -367,6 +390,7 @@ RcppExport SEXP run_testthat_tests(); static const R_CallMethodDef CallEntries[] = { {"_tidysq_CPP_apply_R_function", (DL_FUNC) &_tidysq_CPP_apply_R_function, 4}, {"_tidysq_CPP_bite", (DL_FUNC) &_tidysq_CPP_bite, 4}, + {"_tidysq_CPP_collapse", (DL_FUNC) &_tidysq_CPP_collapse, 2}, {"_tidysq_CPP_complement", (DL_FUNC) &_tidysq_CPP_complement, 2}, {"_tidysq_CPP_find_invalid_letters", (DL_FUNC) &_tidysq_CPP_find_invalid_letters, 3}, {"_tidysq_CPP_find_motifs", (DL_FUNC) &_tidysq_CPP_find_motifs, 4}, @@ -378,6 +402,7 @@ static const R_CallMethodDef CallEntries[] = { {"_tidysq_CPP_pack_INTS", (DL_FUNC) &_tidysq_CPP_pack_INTS, 4}, {"_tidysq_CPP_pack_STRINGS", (DL_FUNC) &_tidysq_CPP_pack_STRINGS, 4}, {"_tidysq_CPP_pack_STRING", (DL_FUNC) &_tidysq_CPP_pack_STRING, 4}, + {"_tidysq_CPP_paste", (DL_FUNC) &_tidysq_CPP_paste, 2}, {"_tidysq_CPP_random_sq", (DL_FUNC) &_tidysq_CPP_random_sq, 4}, {"_tidysq_CPP_read_fasta", (DL_FUNC) &_tidysq_CPP_read_fasta, 4}, {"_tidysq_CPP_sample_fasta", (DL_FUNC) &_tidysq_CPP_sample_fasta, 4}, @@ -385,7 +410,7 @@ static const R_CallMethodDef CallEntries[] = { {"_tidysq_CPP_remove_ambiguous", (DL_FUNC) &_tidysq_CPP_remove_ambiguous, 3}, {"_tidysq_CPP_reverse", (DL_FUNC) &_tidysq_CPP_reverse, 2}, {"_tidysq_CPP_substitute_letters", (DL_FUNC) &_tidysq_CPP_substitute_letters, 3}, - {"_tidysq_CPP_translate", (DL_FUNC) &_tidysq_CPP_translate, 4}, + {"_tidysq_CPP_translate", (DL_FUNC) &_tidysq_CPP_translate, 3}, {"_tidysq_CPP_typify", (DL_FUNC) &_tidysq_CPP_typify, 3}, {"_tidysq_CPP_unpack_RAWS", (DL_FUNC) &_tidysq_CPP_unpack_RAWS, 2}, {"_tidysq_CPP_unpack_INTS", (DL_FUNC) &_tidysq_CPP_unpack_INTS, 2}, diff --git a/src/collapse.cpp b/src/collapse.cpp new file mode 100644 index 0000000..351182f --- /dev/null +++ b/src/collapse.cpp @@ -0,0 +1,11 @@ +#include "tidysq/Rcpp-import.h" +#include "tidysq/Rcpp-export.h" +#include "tidysq/collapse.h" + +using namespace tidysq; + +//[[Rcpp::export]] +Rcpp::List CPP_collapse(const Rcpp::List& x, + const tidysq::Letter &NA_letter) { + return export_to_R(collapse(import_sq_from_R(x, NA_letter))); +} diff --git a/src/paste.cpp b/src/paste.cpp new file mode 100644 index 0000000..1609ca1 --- /dev/null +++ b/src/paste.cpp @@ -0,0 +1,14 @@ +#include "tidysq.h" +#include "tidysq/paste.h" + +using namespace tidysq; + +//[[Rcpp::export]] +Rcpp::List CPP_paste(const Rcpp::List& list_of_x, + const tidysq::Letter &NA_letter) { + std::vector> list_of_sq; + for (const Rcpp::internal::const_generic_proxy<19, Rcpp::PreserveStorage>& x : list_of_x) { + list_of_sq.push_back(import_sq_from_R(x, NA_letter)); + } + return export_to_R(paste(list_of_sq)); +} diff --git a/src/translate.cpp b/src/translate.cpp index 7dfcf75..f4e359a 100644 --- a/src/translate.cpp +++ b/src/translate.cpp @@ -7,10 +7,8 @@ using namespace tidysq; //[[Rcpp::export]] Rcpp::List CPP_translate(const Rcpp::List &x, const int &table, - const tidysq::Letter &NA_letter, - const bool &interpret_as_stop) { + const tidysq::Letter &NA_letter) { return export_to_R(translate( import_sq_from_R(x, NA_letter), - table, - interpret_as_stop)); + table)); } diff --git a/tests/testthat/test-collapse.R b/tests/testthat/test-collapse.R new file mode 100644 index 0000000..bf9b57b --- /dev/null +++ b/tests/testthat/test-collapse.R @@ -0,0 +1,62 @@ +# SETUP ---- +str_dna <- c("TTCAGGGCTAG", "CGATTGC", "CAGTTTA") +str_unt <- c("!!NV!!XFD!", "P!OQ!!-FI", "SP!!I-F-XXS!") +str_atp <- c("mAmYmY", "nbAnsAmA", "") + +alph_atp <- c("mA", "mY", "nbA", "nsA") + +sq_dna <- sq(str_dna, alphabet = "dna_bsc") +sq_unt <- sq(str_unt, alphabet = "unt", NA_letter = "!") +sq_atp <- sq(str_atp, alphabet = alph_atp) +sq_empty <- sq(character(), alphabet = "rna_bsc") +sq_one <- sq("OFUFQBODBUDAFNDBZGFG", alphabet = "ami_ext") + +# PROTOTYPE PRESERVATION ---- +test_that("collapse() preserves type and alphabet of original vector", { + expect_vector(collapse(sq_dna), + ptype = vec_ptype(sq_dna)) + expect_vector(collapse(sq_unt), + ptype = vec_ptype(sq_unt)) + expect_vector(collapse(sq_atp), + ptype = vec_ptype(sq_atp)) + expect_vector(collapse(sq_empty), + ptype = vec_ptype(sq_empty)) +}) + +test_that("collapse() returns sequence vector of length 1", { + expect_vector(collapse(sq_dna), + size = 1) + expect_vector(collapse(sq_unt), + size = 1) + expect_vector(collapse(sq_atp), + size = 1) + expect_vector(collapse(sq_empty), + size = 1) +}) + +# ERROR FOR NON-SQ OBJECTS ---- +test_that("collapse() throws an error whenever passed object of class other that sq", { + expect_error(collapse(1:7)) + expect_error(collapse(LETTERS)) + expect_error(collapse(list(mean, sum, sd))) +}) + +# VALUE COMPUTATION ---- +test_that("collapse() returns sequence equal to all collapsed sequences", { + withr::local_options(list(tidysq_NA_letter = "!")) + expect_equivalent(as.character(collapse(sq_dna)), + paste0(str_dna, collapse = "")) + expect_equivalent(as.character(collapse(sq_unt)), + paste0(str_unt, collapse = "")) + expect_equivalent(as.character(collapse(sq_atp)), + paste0(str_atp, collapse = "")) + # Note that returned value isn't an empty vector, but an empty string + expect_equivalent(as.character(collapse(sq_empty)), + "") +}) + +# NO CHANGES FOR SQ OF LENGTH 1 ---- +test_that("collapse() does nothing to sq objects of length 1", { + expect_identical(collapse(sq_one), + sq_one) +}) diff --git a/tests/testthat/test-paste.R b/tests/testthat/test-paste.R new file mode 100644 index 0000000..365b68b --- /dev/null +++ b/tests/testthat/test-paste.R @@ -0,0 +1,77 @@ +# SETUP ---- +str_dna_1 <- c("TTCAGGGCTAG", "CGATTGC", "CAGTTTA") +str_dna_2 <- c("ATCTTGAAG", "CATATGCGCTA", "ACGTGTCGA") +str_ami <- "OFUFQBODBUDAFNDBZGFG" +str_unt_1 <- c("!!NV!!XFD!", "P!OQ!!-FI", "SP!!I-F-XXS!") +str_unt_2 <- c("OVNU!!OK!!J", "GOK!MI!N!BB!", "DPOFIN!!") +str_unt_3 <- c("!IF", "", "VF!") +str_atp <- c("mAmYmY", "nbAnsAmA", "") + +alph_atp <- c("mA", "mY", "nbA", "nsA") + +sq_dna_1 <- sq(str_dna_1, alphabet = "dna_bsc") +sq_dna_2 <- sq(str_dna_2, alphabet = "dna_bsc") +sq_ami <- sq(str_ami, alphabet = "ami_ext") +sq_unt_1 <- sq(str_unt_1, alphabet = "unt", NA_letter = "!") +sq_unt_2 <- sq(str_unt_2, alphabet = "unt", NA_letter = "!") +sq_unt_3 <- sq(str_unt_3, alphabet = "unt", NA_letter = "!") +sq_atp <- sq(str_atp, alphabet = alph_atp) +sq_empty <- sq(character(), alphabet = "rna_bsc") + +# PROTOTYPE PRESERVATION ---- +test_that("paste() finds common prototype for all arguments (recycling length 1 vectors if necessary)", { + expect_vector(paste(sq_dna_1, sq_dna_2), + ptype = vec_ptype_common(sq_dna_1, sq_dna_2), + size = vec_size_common(sq_dna_1, sq_dna_2)) + expect_vector(paste(sq_unt_1, sq_unt_2, sq_unt_3), + ptype = vec_ptype_common(sq_unt_1, sq_unt_2, sq_unt_3), + size = vec_size_common(sq_unt_1, sq_unt_2, sq_unt_3)) + expect_vector(paste(sq_dna_1, sq_unt_3, sq_dna_2), + ptype = vec_ptype_common(sq_dna_1, sq_unt_3, sq_dna_2), + size = vec_size_common(sq_dna_1, sq_unt_3, sq_dna_2)) + expect_vector(paste(sq_ami, sq_unt_1), + ptype = vec_ptype_common(sq_ami, sq_unt_1), + size = vec_size_common(sq_ami, sq_unt_1)) +}) + +# VALUE COMPUTATION ---- +test_that("paste() correctly merges sq objects of the same type and alphabet", { + expect_identical( + paste(sq_dna_1, sq_dna_2), + sq(paste0(str_dna_1, str_dna_2), alphabet = "dna_bsc") + ) + expect_identical( + paste(sq_atp, sq_atp), + sq(paste0(str_atp, str_atp), alphabet = alph_atp) + ) +}) + +test_that("paste() correctly merges sq objects of different types and alphabets", { + expect_identical( + paste(sq_unt_1, sq_unt_2, sq_unt_3), + vec_cast( + sq(paste0(str_unt_1, str_unt_2, str_unt_3), alphabet = "unt", NA_letter = "!"), + vec_ptype_common(sq_unt_1, sq_unt_2, sq_unt_3) + ) + ) + expect_identical( + paste(sq_dna_1, sq_unt_3, sq_dna_2), + vec_cast( + sq(paste0(str_dna_1, str_unt_3, str_dna_2), alphabet = "unt", NA_letter = "!"), + vec_ptype_common(sq_dna_1, sq_unt_3, sq_dna_2) + ) + ) + expect_identical( + paste(sq_ami, sq_unt_1), + vec_cast( + sq(paste0(str_ami, str_unt_1), alphabet = "unt", NA_letter = "!"), + vec_ptype_common(sq_ami, sq_unt_1) + ) + ) +}) + +# NO CHANGES IF PASSED 1 ARGUMENT ---- +test_that("paste() does nothing to one sq object", { + expect_identical(paste(sq_unt_3), + sq_unt_3) +}) diff --git a/tests/testthat/test-pkg-bioseq.R b/tests/testthat/test-pkg-bioseq.R new file mode 100644 index 0000000..7035888 --- /dev/null +++ b/tests/testthat/test-pkg-bioseq.R @@ -0,0 +1,72 @@ +# SETUP ---- +str_dna <- c("TACTGGGCATG", "CAGGTCGGA", "TAGTAGTCCG", "", "ACGGT") +str_rna <- c("", "KBS-UVW-AWWWG", "YGHHH-", "-CRASH", "MND-KUUBV-MY-") +str_ami <- c("OUTLANDISH", "UNSTRUCTURIZED", "FEAR") + +sq_dna_bsc <- sq(str_dna, alphabet = "dna_bsc") +sq_dna_ext <- sq(str_dna, alphabet = "dna_ext") +sq_rna <- sq(str_rna, alphabet = "rna_ext") +sq_ami <- sq(str_ami, alphabet = "ami_ext") + +names_dna <- c("vengeance", "is", "never", "a", "rubber_duck") +names_rna <- c("Monza", "Imola", "Mugello", "Pescara", "Modena") +names_ami <- c("proteins", "vitamins", "fats") + +bioseq_dna <- bioseq::new_dna(str_dna) +bioseq_dna_n <- bioseq::new_dna(setNames(str_dna, names_dna)) +bioseq_rna <- bioseq::new_rna(str_rna) +bioseq_rna_n <- bioseq::new_rna(setNames(str_rna, names_rna)) +bioseq_ami <- bioseq::new_aa(str_ami) +bioseq_ami_n <- bioseq::new_aa(setNames(str_ami, names_ami)) + +# IMPORT ---- +test_that("correctly imports bioseq::bioseq_dna", { + expect_identical(import_sq(bioseq_dna)[["sq"]], + sq_dna_ext) + expect_identical(import_sq(bioseq_dna_n)[["sq"]], + sq_dna_ext) + expect_identical(import_sq(bioseq_dna_n)[["name"]], + names_dna) +}) + +test_that("correctly imports bioseq::bioseq_rna", { + expect_identical(import_sq(bioseq_rna)[["sq"]], + sq_rna) + expect_identical(import_sq(bioseq_rna_n)[["sq"]], + sq_rna) + expect_identical(import_sq(bioseq_rna_n)[["name"]], + names_rna) +}) + +test_that("correctly imports bioseq::bioseq_aa", { + expect_identical(import_sq(bioseq_ami)[["sq"]], + sq_ami) + expect_identical(import_sq(bioseq_ami_n)[["sq"]], + sq_ami) + expect_identical(import_sq(bioseq_ami_n)[["name"]], + names_ami) +}) + +# EXPORT ---- +test_that("correctly exports sq object to bioseq::bioseq_dna", { + expect_identical(export_sq(sq_dna_bsc, "bioseq::bioseq_dna"), + bioseq_dna) + expect_identical(export_sq(sq_dna_ext, "bioseq::bioseq_dna"), + bioseq_dna) + expect_identical(export_sq(sq_dna_bsc, "bioseq::bioseq_dna", name = names_dna), + bioseq_dna_n) +}) + +test_that("correctly exports sq object to bioseq::bioseq_rna", { + expect_identical(export_sq(sq_rna, "bioseq::bioseq_rna"), + bioseq_rna) + expect_identical(export_sq(sq_rna, "bioseq::bioseq_rna", name = names_rna), + bioseq_rna_n) +}) + +test_that("correctly exports sq object to bioseq::bioseq_aa", { + expect_identical(export_sq(sq_ami, "bioseq::bioseq_aa"), + bioseq_ami) + expect_identical(export_sq(sq_ami, "bioseq::bioseq_aa", name = names_ami), + bioseq_ami_n) +}) diff --git a/tests/testthat/test-random_sq.R b/tests/testthat/test-random_sq.R index 8e79f84..df7779b 100644 --- a/tests/testthat/test-random_sq.R +++ b/tests/testthat/test-random_sq.R @@ -36,6 +36,16 @@ test_that("using sd argument of random_sq() doesn't generate negative-length seq } }) +# SEED SAFETY --- +test_that("generating random sequences with the same seed gives the same sequences", { + set.seed(6125) + sq_1 <- random_sq(10, 100, "ami_bsc") + sq_2 <- random_sq(5, 20, "dna_ext", sd = 5) + set.seed(6125) + expect_equal(random_sq(10, 100, "ami_bsc"), sq_1) + expect_equal(random_sq(5, 20, "dna_ext", sd = 5), sq_2) +}) + # EDGE CASES ---- test_that("random_sq() can generate 0 sequences", { expect_vector(random_sq(0, 13, "rna_ext"), diff --git a/tests/testthat/test-translate.R b/tests/testthat/test-translate.R index 697eade..fe2216a 100644 --- a/tests/testthat/test-translate.R +++ b/tests/testthat/test-translate.R @@ -51,12 +51,3 @@ test_that("translate() correctly handles tables other than default 1", { expect_equivalent(as.character(translate(sq_rna, 24)), str_rna_translated) }) - -test_that("translate() can handle ambiguous tables (27, 28 & 31) with additional parameter", { - expect_equivalent(as.character(translate(sq_dna_2, 31)), - str_dna_2_translated_31) - expect_equivalent(as.character(translate(sq_dna_2, 31, interpret_as_stop = FALSE)), - str_dna_2_translated_31) - expect_equivalent(as.character(translate(sq_dna_2, 31, interpret_as_stop = TRUE)), - str_dna_2_translated_31_as_stop) -})