diff --git a/R/SoilProfileCollection-operators.R b/R/SoilProfileCollection-operators.R index 133aeca29..bca484a1f 100644 --- a/R/SoilProfileCollection-operators.R +++ b/R/SoilProfileCollection-operators.R @@ -14,255 +14,317 @@ #' #' @aliases [,SoilProfileCollection-method #' -#' @description You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j]}. Subset operations are propagated to other slots when they result in removal of sites from a collection. +#' @description You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j, ...]}. Subset operations are propagated to other slots (such as diagnostics or spatial) when they result in removal of sites from a collection. #' -#' \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. +#' - \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. #' -#' \code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. So, for instance, if \code{spc} contained 100 profiles, \code{spc[,2]} would return 100 profiles, but just the second horizon from each of the profiles ... assuming each profile had at least two horizons! The single horizon profiles would be dropped from the collection. +#' - \code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. #' +#' - `...` is an area to specify an expression that is evaluated in the subset. Currently supported +#' +#' - `.LAST` (last horizon in each profile): return the last horizon from each profile. This uses `i` but ignores the regular `j` index. +#' - `.FIRST` (first horizon in each profile): return the last horizon from each profile. This uses `i` but ignores the regular `j` index. +#' - `.HZID` (horizon index not `SoilProfileCollection` result): return the horizon indices corresponding to `i`+`j`+`...` ("k") constraints #' #' @param x a SoilProfileCollection #' @param i a numeric or logical value denoting profile indices to select in a subset -#' @param j a numeric or logical value denoting profile indices to select in a subset +#' @param j a numeric or logical value denoting horizon indices to select in a subset +#' @param ... non-standard expressions to evaluate in a subset +#' +#' @param drop not used #' @rdname singlebracket +# SPC extract: "[" '[' single bracket SPC object extract method setMethod("[", signature(x = "SoilProfileCollection", i = "ANY", - j = "ANY"), - function(x, i, j) { - # check for missing i and j - if (missing(i) & missing(j)) { - stop('must provide either a profile index or horizon/slice index, or both', - call. = FALSE) - } - - # convert to integer - if (!missing(i)) { - if (any(is.na(i))) { - stop('NA not permitted in profile index', call. = FALSE) - } - - # convert logical to integer - # (thanks Jos? Padarian for the suggestion!) - if (is.logical(i)) { - i <- (1:length(x))[i] - } - - can.cast <- is.numeric(i) - if (can.cast) { - if (all(abs(i - round(i)) < .Machine$double.eps ^ 0.5)) { - i <- as.integer(i) - } else { - stop("Numeric site index does not contain whole numbers.") - } - } else { - stop("Failed to coerce site index to integer.") - } - } else { - # if no index is provided, the user wants all profiles - i <- 1:length(x) - } - - if (!missing(j)) { - # AGB added logical handling to horizon index - if (is.logical(j)) { - j <- (1:length(x))[j] - } - - can.cast <- is.numeric(j) - if (can.cast) { - if (all(abs(j - round(j)) < .Machine$double.eps ^ 0.5)) { - j <- as.integer(j) - } else { - stop("Numeric horizon/slice index does not contain whole numbers.") - } - } else { - stop("Failed to coerce horizon/slice index to integer.") - } - - if (any(is.na(j))) { - stop('NA not permitted in horizon/slice index', call. = FALSE) - } - } - - # extract all site and horizon data - h <- x@horizons - s.all <- x@site - - # extract requested profile IDs - p.ids <- s.all[[idname(x)]][unique(i)] - - # keep only the requested horizon data (filtered by profile ID) - h <- .as.data.frame.aqp(h, aqp_df_class(x))[h[[idname(x)]] %in% p.ids,] - - # keep only the requested site data, (filtered by profile ID) - s.i <- which(s.all[[idname(x)]] %in% p.ids) - - # need to use drop=FALSE when @site contains only a single column - s <- s.all[s.i, , drop = FALSE] - - # subset spatial data, but only if valid - if (validSpatialData(x)) { - sp <- x@sp[i] - } else { - # copy empty SpatialPoints object - sp <- x@sp - } - - # subset diagnostic data - d <- diagnostic_hz(x) - if (length(d) > 0) { - d <- d[which(d[[idname(x)]] %in% p.ids),] - } - - # subset restriction data - r <- restrictions(x) - if (length(r) > 0) { - r <- r[which(r[[idname(x)]] %in% p.ids),] - } - - # subset horizons/slices based on j --> only when j is given - if (!missing(j)) { - - # faster replacement of j subsetting of horizon data - if (aqp_df_class(x) == "data.table") { - - # local vars to make R CMD check happy - .N <- NULL - .I <- NULL - V1 <- NULL - - # data.table can do this much more efficiently - if (requireNamespace("data.table", quietly = TRUE)) { - idn <- idname(x) - - # by list @horizons idname (essentially iterating over profiles) - bylist <- list(h[[idn]]) - names(bylist) <- idn - - # figured out the data.table way to do this - # not using := or . anymore - - # determine j indices to KEEP - j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 - - # determine which site indices to keep - # in case all horizons are removed, remove sites too - if (length(j.idx) == 0) { - i.idx <- numeric(0) - } else { - # determine which profile IDs KEEP - pids <- h[, .I[any(1:.N %in% j)][1], by = bylist] - i.idx <- pids[, .I[!is.na(V1)]] - } - } - - } else { - # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) - j.res <- as.list(aggregate( - h[[hzidname(x)]], - by = list(h[[idname(x)]]), - FUN = function(hh) { - list(1:length(hh) %in% j) - }, - drop = FALSE - )$x) - - ## https://github.com/ncss-tech/aqp/issues/89 - # fix #89, where i with no matching j e.g. @site data returned - i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) - - j.idx <- which(do.call('c', j.res)) - } - - # find any index out of bounds and ignore them - # j.idx.bad <- which(abs(j.idx) > nrow(h)) - # i.idx.bad <- which(abs(i.idx) > nrow(s)) - # - # if (length(i.idx)) - # i.idx <- i.idx[-i.idx.bad] - # - # if (length(j.idx)) - # j.idx <- j.idx[-j.idx.bad] - - # do horizon subset with j index - h <- h[j.idx, ] - - # if profiles have been removed based on the j-index constraints - if (length(i.idx) > 0) { - # remove sites that have no matching j - s <- s[i.idx, , drop = FALSE] - h.ids <- s[[idname(x)]] - - # remove also: diagnostics - d.idx <- which(d[[idname(x)]] %in% h.ids) - if (length(d.idx) > 0) { - d <- d[-d.idx, , drop = FALSE] - } - - # restrictions - r.idx <- which(r[[idname(x)]] %in% h.ids) - if (length(r.idx) > 0) { - r <- r[-r.idx, , drop = FALSE] - } - - # spatial - if (validSpatialData(x)) { - sp <- sp[i.idx,] - } - } - } - - rownames(h) <- NULL - - # rebuild SPC object from slots - res <- SoilProfileCollection( - idcol = idname(x), - hzidcol = hzidname(x), - depthcols = horizonDepths(x), - metadata = aqp::metadata(x), - horizons = .as.data.frame.aqp(h, aqp_df_class(x)), - site = .as.data.frame.aqp(s, aqp_df_class(x)), - sp = sp, - diagnostic = .as.data.frame.aqp(d, aqp_df_class(x)), - restrictions = .as.data.frame.aqp(r, aqp_df_class(x)) - ) - - # fill in any missing data.frame class or group var - o.df.class <- aqp::metadata(x)$aqp_df_class - if(length(o.df.class) == 0) { - o.df.class <- "data.frame" - } - - o.group.by <- aqp::metadata(x)$aqp_group_by - if(length(o.group.by) == 0) { - o.group.by <- "" - } - - metadata(res)$aqp_df_class <- o.df.class - metadata(res)$aqp_group_by <- o.group.by - - # preserve slots that may have been customized relative to defaults - # in prototype or resulting from construction of SPC - suppressMessages(hzidname(res) <- hzidname(x)) - suppressMessages(hzdesgnname(res) <- hzdesgnname(x)) - suppressMessages(hztexclname(res) <- hztexclname(x)) - - # there should be as many records in @site as there are profile IDs - pid.res <- profile_id(res) - site.res <- site(res)[[idname(res)]] - - if (length(pid.res) != length(site.res)) { - message("Some profiles have been removed from the collection.") - } - - # the order of profile_ids should be the same as in @site - if (!all(pid.res == site.res)) { - warning("profile ID order does not match order in @site", - call. = FALSE) - } - - return(res) - }) + j = "ANY"), function(x, i, j, ...) { + + # capture k right away + ksubflag <- FALSE + + # 2nd value is first user-supplied expression + kargs <- substitute(list(...)) + ksub <- as.character(kargs)[2:length(kargs)] + + # handle special keywords in "k" index + for(k in ksub) { + if (!is.na(k)) { + switch(k, + ".FIRST" = { + j <- 1 + }, + ".LAST" = { + ksubflag <- TRUE + }, + ".HZID" = { + ksubflag <- TRUE + }) + } + } + + # convert to integer + if (!missing(i)) { + if (any(is.na(i))) { + stop('NA not permitted in profile index', call. = FALSE) + } + + # convert logical to integer + # (thanks Jos? Padarian for the suggestion!) + if (is.logical(i)) { + i <- (1:length(x))[i] + } + + can.cast <- is.numeric(i) + if (can.cast) { + if (all(abs(i - round(i)) < .Machine$double.eps ^ 0.5)) { + i <- as.integer(i) + } else { + stop("Numeric site index does not contain whole numbers.") + } + } else { + stop("Failed to coerce site index to integer.") + } + } else { + # if no index is provided, the user wants all profiles + i <- 1:length(x) + } + + + # this block does processing for logical or numeric j + # not special symbols like .LAST + if (!missing(j) & !ksubflag) { + + # AGB added logical handling to horizon index + if (is.logical(j)) { + j <- (1:length(x))[j] + } + + can.cast <- is.numeric(j) + if (can.cast) { + if (all(abs(j - round(j)) < .Machine$double.eps ^ 0.5)) { + j <- as.integer(j) + } else { + stop("Numeric horizon/slice index does not contain whole numbers.") + } + } else { + stop("Failed to coerce horizon/slice index to integer.") + } + + if (any(is.na(j))) { + stop('NA not permitted in horizon/slice index', call. = FALSE) + } + } + + # extract all site and horizon data + h <- x@horizons + s.all <- x@site + + # extract requested profile IDs + p.ids <- s.all[[idname(x)]][unique(i)] + + # keep only the requested horizon data (filtered by profile ID) + h <- .as.data.frame.aqp(h, aqp_df_class(x)) + pidx <- h[[idname(x)]] %in% p.ids + h <- h[pidx,] + + if (ksubflag && ".HZID" %in% ksub) { + + j.idx.allowed <- seq_len(nrow(x@horizons))[pidx] + + } else { + j.idx.allowed <- seq_len(nrow(h)) + + # keep only the requested site data, (filtered by profile ID) + s.i <- which(s.all[[idname(x)]] %in% p.ids) + + # need to use drop=FALSE when @site contains only a single column + s <- s.all[s.i, , drop = FALSE] + + # subset spatial data, but only if valid + if (validSpatialData(x)) { + sp <- x@sp[i] + } else { + # copy empty SpatialPoints object + sp <- x@sp + } + + # subset diagnostic data + d <- diagnostic_hz(x) + if (length(d) > 0) { + d <- d[which(d[[idname(x)]] %in% p.ids),] + } + + # subset restriction data + r <- restrictions(x) + if (length(r) > 0) { + r <- r[which(r[[idname(x)]] %in% p.ids),] + } + + } + + # subset horizons/slices based on j --> only when j is given + if (!missing(j) | ksubflag) { + + # faster replacement of j subsetting of horizon data + # if (aqp_df_class(x) == "data.table") { + + h <- data.table::as.data.table(h) + + # local vars to make R CMD check happy + .N <- NULL + .I <- NULL + V1 <- NULL + + idn <- idname(x) + + # by list @horizons idname (essentially iterating over profiles) + bylist <- list(h[[idn]]) + names(bylist) <- idn + + # handle special symbols in j index + # currently supported: + # - .LAST: last horizon index per profile + # - .HZID: return horizon slot row index (short circuit) + # - .FIRST: first horizon index per profile + # the above can be combined. .LAST takes precedent over .FIRST. + + # determine j indices to KEEP + + # default is an index to each horizon + j.idx <- j.idx.allowed + + if (ksubflag && ".LAST" %in% ksub) { + # trigger special last horizon case + j.idx <- h[, .I[.N], by = bylist]$V1 + } else { + if (!missing(j)) { + # get row indices of horizon data corresponding to j within profiles + j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + } + } + + # short circuit for horizon-slot indices + if (ksubflag && ".HZID" %in% ksub) { + return(j.idx.allowed[j.idx]) + } + + # determine which site indices to keep + # in case all horizons are removed, remove sites too + if (length(j.idx) == 0) { + i.idx <- numeric(0) + } else { + # determine which profiles to KEEP + i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]])) + } + + # } + # } else { + # # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) + # j.res <- as.list(aggregate( + # h[[hzidname(x)]], + # by = list(h[[idname(x)]]), + # FUN = function(hh) { + # list(1:length(hh) %in% j) + # }, + # drop = FALSE + # )$x) + # + # ## https://github.com/ncss-tech/aqp/issues/89 + # # fix #89, where i with no matching j e.g. @site data returned + # i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) + # + # j.idx <- which(do.call('c', j.res)) + # } + + # find any index out of bounds and ignore them + # j.idx.bad <- which(abs(j.idx) > nrow(h)) + # i.idx.bad <- which(abs(i.idx) > nrow(s)) + # + # if (length(i.idx)) + # i.idx <- i.idx[-i.idx.bad] + # + # if (length(j.idx)) + # j.idx <- j.idx[-j.idx.bad] + + # do horizon subset with j index + h <- h[j.idx, ] + + # if profiles have been removed based on the j-index constraints + if (length(i.idx) > 0) { + # remove sites that have no matching j + i.idx <- unique(c(s.i, i.idx)) + s <- s.all[i.idx, , drop = FALSE] + h.ids <- s[[idname(x)]] + + # remove also: diagnostics + d.idx <- which(d[[idname(x)]] %in% h.ids) + if (length(d.idx) > 0) { + d <- d[-d.idx, , drop = FALSE] + } + + # restrictions + r.idx <- which(r[[idname(x)]] %in% h.ids) + if (length(r.idx) > 0) { + r <- r[-r.idx, , drop = FALSE] + } + + # spatial + if (validSpatialData(x)) { + sp <- sp[i.idx,] + } + } + } + + rownames(h) <- NULL + + # rebuild SPC object from slots + res <- SoilProfileCollection( + idcol = idname(x), + hzidcol = hzidname(x), + depthcols = horizonDepths(x), + metadata = aqp::metadata(x), + horizons = .as.data.frame.aqp(h, aqp_df_class(x)), + site = .as.data.frame.aqp(s, aqp_df_class(x)), + sp = sp, + diagnostic = .as.data.frame.aqp(d, aqp_df_class(x)), + restrictions = .as.data.frame.aqp(r, aqp_df_class(x)) + ) + + # fill in any missing data.frame class or group var + o.df.class <- aqp::metadata(x)$aqp_df_class + if(length(o.df.class) == 0) { + o.df.class <- "data.frame" + } + + o.group.by <- aqp::metadata(x)$aqp_group_by + if(length(o.group.by) == 0) { + o.group.by <- "" + } + + metadata(res)$aqp_df_class <- o.df.class + metadata(res)$aqp_group_by <- o.group.by + + # preserve slots that may have been customized relative to defaults + # in prototype or resulting from construction of SPC + suppressMessages(hzidname(res) <- hzidname(x)) + suppressMessages(hzdesgnname(res) <- hzdesgnname(x)) + suppressMessages(hztexclname(res) <- hztexclname(x)) + + # there should be as many records in @site as there are profile IDs + pid.res <- profile_id(res) + site.res <- site(res)[[idname(res)]] + + if (length(pid.res) != length(site.res)) { + message("Some profiles have been removed from the collection.") + } + + # the order of profile_ids should be the same as in @site + if (!all(pid.res == site.res)) { + warning("profile ID order does not match order in @site", + call. = FALSE) + } + + return(res) + }) #### #### double bracket SoilProfileCollection methods diff --git a/man/singlebracket.Rd b/man/singlebracket.Rd index 8fd8bc6e5..33a40a97f 100644 --- a/man/singlebracket.Rd +++ b/man/singlebracket.Rd @@ -4,19 +4,29 @@ \alias{[,SoilProfileCollection-method} \title{Matrix/data.frame-like access to profiles and horizons in a SoilProfileCollection} \usage{ -\S4method{[}{SoilProfileCollection}(x, i, j) +\S4method{[}{SoilProfileCollection}(x, i, j, ..., drop = TRUE) } \arguments{ \item{x}{a SoilProfileCollection} \item{i}{a numeric or logical value denoting profile indices to select in a subset} -\item{j}{a numeric or logical value denoting profile indices to select in a subset} -} -\description{ -You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j]}. Subset operations are propagated to other slots when they result in removal of sites from a collection. +\item{j}{a numeric or logical value denoting horizon indices to select in a subset} -\code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. +\item{...}{non-standard expressions to evaluate in a subset} -\code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. So, for instance, if \code{spc} contained 100 profiles, \code{spc[,2]} would return 100 profiles, but just the second horizon from each of the profiles ... assuming each profile had at least two horizons! The single horizon profiles would be dropped from the collection. +\item{drop}{not used} +} +\description{ +You can access the contents of a SoilProfileCollection by profile and horizon "index", \code{i} and \code{j}, respectively: \code{spc[i, j, ...]}. Subset operations are propagated to other slots (such as diagnostics or spatial) when they result in removal of sites from a collection. +\itemize{ +\item \code{i} refers to the profile position within the collection. By default the order is based on the C SORT order of the variable that you specified as your unique profile ID at time of object construction. Note that if your ID variable was numeric, then it has been sorted as a character. +\item \code{j} refers to the horizon or "slice" index. This index is most useful when either a) working with \code{slice}'d SoilProfileCollection or b) working with single-profile collections. \code{j} returns the layer in the specified index positions for all profiles in a collection. +\item \code{...} is an area to specify an expression that is evaluated in the subset. Currently supported +\itemize{ +\item \code{.LAST} (last horizon in each profile): return the last horizon from each profile. This uses \code{i} but ignores the regular \code{j} index. +\item \code{.FIRST} (first horizon in each profile): return the last horizon from each profile. This uses \code{i} but ignores the regular \code{j} index. +\item \code{.HZID} (horizon index not \code{SoilProfileCollection} result): return the horizon indices corresponding to \code{i}+\code{j}+\code{...} ("k") constraints +} +} } diff --git a/misc/aqp2/jindex-benchmarks.R b/misc/aqp2/jindex-benchmarks.R new file mode 100644 index 000000000..fad3e4b50 --- /dev/null +++ b/misc/aqp2/jindex-benchmarks.R @@ -0,0 +1,197 @@ +spc_j_DT <- function(x, i, j) { + + # this is already live in AQP for data.table SPCs -- is it worth enabling globally? + + h <- data.table::as.data.table(horizons(x)) + + # local vars to make R CMD check happy + .N <- NULL + .I <- NULL + V1 <- NULL + + # data.table can do this much more efficiently + # if (requireNamespace("data.table", quietly = TRUE)) { + idn <- idname(x) + + # by list @horizons idname (essentially iterating over profiles) + bylist <- list(h[[idn]]) + names(bylist) <- idn + + # figured out the data.table way to do this + # not using := or . anymore + + # determine j indices to KEEP + j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + + # determine which site indices to keep + # in case all horizons are removed, remove sites too + if (length(j.idx) == 0) { + i.idx <- numeric(0) + } else { + # determine which profile IDs KEEP + pids <- h[, .I[any(1:.N %in% j)][1], by = bylist] + i.idx <- pids[, .I[!is.na(V1)]] + } + return(list(i.idx, j.idx)) +} + +# benchmark faster replacement of j subsetting of horizon data + +spc_j_DT_2 <- function(x, i, j) { + + # this is already live in AQP for data.table SPCs + # question: is it worth enabling globally? + + h <- data.table::as.data.table(horizons(x)) + + # local vars to make R CMD check happy + .N <- NULL + .I <- NULL + V1 <- NULL + + idn <- idname(x) + + # by list @horizons idname (essentially iterating over profiles) + bylist <- list(h[[idn]]) + names(bylist) <- idn + + # figured out the data.table way to do this + # not using := or . anymore + + # determine j indices to KEEP + j.idx <- h[, .I[1:.N %in% j], by = bylist]$V1 + + # determine which site indices to keep + # in case all horizons are removed, remove sites too + if (length(j.idx) == 0) { + i.idx <- numeric(0) + } else { + # determine which profile IDs KEEP + i.idx <- which(profile_id(x) %in% unique(h[j.idx,][[idn]])) + # i.idx <- which(vapply(j.res, function(jr) { any(jr) }, FUN.VALUE = logical(1))) + } + return(list(i.idx, j.idx)) +} + +spc_j_base <- function(x, i, j) { + h <- horizons(x) + # retain a base R way of doing things (plenty fast with SPCs up to ~100k or so) + j.res <- as.list(aggregate( + h[[hzidname(x)]], + by = list(h[[idname(x)]]), + FUN = function(hh) { + list(1:length(hh) %in% j) + }, + drop = FALSE + )$x) + + ## https://github.com/ncss-tech/aqp/issues/89 + # fix #89, where i with no matching j e.g. @site data returned + i.idx <- which(as.logical(lapply(j.res, function(jr) { any(jr) }))) + j.idx <- which(do.call('c', j.res)) + + return(list(i.idx, j.idx)) +} + +spc_j_base_2 <- function(x, i, j) { + h <- horizons(x) + idn <- idname(x) + + # slightly more competitive version of base + j.res <- as.list(aggregate( + seq_len(nrow(h)), + by = list(h[[idn]]), + FUN = function(hh) { + (1:length(hh) %in% j) + } + )$x) + + # i.idx <- profile_id(x) %in% unique(h[unlist(j.res), ][[idn]]) # not faster? + i.idx <- which(vapply(j.res, function(jr) { any(jr) }, FUN.VALUE = logical(1))) + + j.idx <- which(unlist(j.res)) + + return(list(i.idx, j.idx)) +} + +spc_j_base_3 <- function(x, i, j) { + h <- horizons(x) + idn <- idname(x) + + # slightly more competitive version of base + j.res <- aggregate( + seq_len(nrow(h)), + by = list(h[[idn]]), + FUN = function(hh) { + (1:length(hh) %in% j) + } + )$x + + i.idx <- which(vapply(j.res, any, FUN.VALUE = logical(1))) + j.idx <- which(unlist(j.res)) + + return(list(i.idx, j.idx)) +} + +library(aqp) + +data(sp4) +depths(sp4) <- id ~ top + bottom + +# base outperforms old DT with 10 profiles, but new solution slightly better +microbenchmark::microbenchmark(spc_j_base(sp4, 1:10, 2:3), + spc_j_base_2(sp4, 1:10, 2:3), + spc_j_base_3(sp4, 1:10, 2:3), + spc_j_DT(sp4, 1:10, 2:3), + spc_j_DT_2(sp4, 1:10, 2:3)) + +# whether the SPC is a "data.table SPC" ahead of time does not affect overhead much +sp4_DT <- data.table::as.data.table(horizons(sp4)) +depths(sp4_DT) <- id ~ top + bottom + +bench::mark(spc_j_base(sp4_DT, 1:10, 2:3), + spc_j_base_2(sp4_DT, 1:10, 2:3), + spc_j_base_3(sp4_DT, 1:10, 2:3), + spc_j_DT(sp4_DT, 1:10, 2:3), + spc_j_DT_2(sp4_DT, 1:10, 2:3)) + +# basically neck-and-neck at 1000 profiles; closing the gap of base over DT from ~50% to about ~5% +# the new solution is 200% faster than both old ones +thousand <- as.data.frame(data.table::rbindlist(lapply(1:1000, random_profile))) +depths(thousand) <- id ~ top + bottom + +bench::mark(spc_j_base(thousand, 1:10, 2:3), + spc_j_base_2(thousand, 1:10, 2:3), + spc_j_base_3(thousand, 1:10, 2:3), + spc_j_DT(thousand, 1:10, 2:3), + spc_j_DT_2(thousand, 1:10, 2:3)) + +# ten thousand profiles, we see a benefit from DT (old: ~20% faster than base; new 300% faster) +tenthousand <- as.data.frame(data.table::rbindlist(lapply(1:10000, random_profile))) +depths(tenthousand) <- id ~ top + bottom + +bench::mark(spc_j_base(tenthousand, 1:10, 2:3), + spc_j_base_2(tenthousand, 1:10, 2:3), + spc_j_base_3(tenthousand, 1:10, 2:3), + spc_j_DT(tenthousand, 1:10, 2:3), + spc_j_DT_2(tenthousand, 1:10, 2:3)) + +# hundred thousand profiles, we see a similar benefit from DT -- 2 to 3 times faster +# hundredthousand <- as.data.frame(data.table::rbindlist(lapply(1:100000, random_profile))) +# depths(hundredthousand) <- id ~ top + bottom +# +# bench::mark(spc_j_base(hundredthousand, 1:10, 2:3), +# spc_j_base_2(hundredthousand, 1:10, 2:3), +# spc_j_base_3(hundredthousand, 1:10, 2:3), +# spc_j_DT(hundredthousand, 1:10, 2:3), +# spc_j_DT_2(hundredthousand, 1:10, 2:3)) + +# # million profiles, we see a benefit from DT +# million <- as.data.frame(data.table::rbindlist(lapply(1:1000000, random_profile))) +# depths(million) <- id ~ top + bottom +# +# bench::mark( +# spc_j_base(million, 1:10, 2:3), +# spc_j_DT(million, 1:10, 2:3), +# spc_j_DT_2(hundredthousand, 1:10, 2:3) +# ) diff --git a/misc/aqp2/test-k-keywords.R b/misc/aqp2/test-k-keywords.R new file mode 100644 index 000000000..c4e79f303 --- /dev/null +++ b/misc/aqp2/test-k-keywords.R @@ -0,0 +1,48 @@ +# DEMO: base non-standard eval of keyword in ... "k-index": SPC[i, j, ...] +# version 1: add support for .LAST +# version 2: add support for .FIRST, .HZID and multiple special keywords + +library(aqp, warn = FALSE) + +data(sp4) +depths(sp4) <- id ~ top + bottom + +# .LAST +sp4[, , .LAST] +sp4[, , .HZID] +sp4[, , .LAST, .HZID] +sp4[, , .FIRST, .HZID] # this just sets j <- 1 +sp4[, 1, , .HZID] +sp4[, 1000, .FIRST, .HZID] # this sets j <- 1; ignores j input if given +sp4[, 1000, .LAST, .HZID] # this ignores j input if given + +# horizon index of 2nd horizon in each profile +sp4[5:10, 2, .HZID] + +# this is more realistic, perhaps: +fivetoten <- sp4[5:10,] # or some more complicated subset() + +# third horizon ID, index to horizon data.frame +horizons(fivetoten)[fivetoten[,3,,.HZID],] + +# use it to implement #199 +getLastHorizonID_v1 <- function(x) { + res <- hzID(x[, , .LAST]) + names(res) <- profile_id(x) + return(res) +} + +# ~3x more efficient using j-index shortcut +getLastHorizonID_v2 <- function(x) { + res <- hzID(x)[x[, , .LAST, .HZID]] + names(res) <- profile_id(x) + return(res) +} + +bench::mark(getLastHorizonID_v1(sp4), + getLastHorizonID_v2(sp4)) + +# bigger <- data.table::rbindlist(lapply(1:1000, random_profile)) +# depths(bigger) <- id ~ top + bottom +# bench::mark(x <- getLastHorizonID_v1(bigger), +# x <- getLastHorizonID_v2(bigger)) diff --git a/tests/testthat/test-SPC-k-keywords.R b/tests/testthat/test-SPC-k-keywords.R new file mode 100644 index 000000000..be34d3176 --- /dev/null +++ b/tests/testthat/test-SPC-k-keywords.R @@ -0,0 +1,35 @@ +# base non-standard eval of keyword in ... "k-index": SPC[i, j, ...] +# support for .LAST, .FIRST, .HZID special keywords +context(".LAST, .FIRST, .HZID k-keywords for SoilProfileCollection objects") + +# define special symbols in global env +# (not needed for tests, but needed wherever they are used in package) +.FIRST <- NULL +.LAST <- NULL +.HZID <- NULL + +data(sp4) +depths(sp4) <- id ~ top + bottom + +# .LAST +expect_equal(length(sp4[, , .LAST]), 10) + +expect_equal(sp4[, , .HZID], 1:30) + +expect_equal(sp4[, , .LAST, .HZID], + c(4L, 6L, 9L, 13L, 16L, 18L, 20L, 22L, 27L, 30L)) +# .FIRST sets j <- 1 +expect_equal(sp4[, , .FIRST, .HZID], + sp4[, 1, , .HZID]) + +# .FIRST ignores j input if given +expect_equal(sp4[, 1000, .FIRST, .HZID], + sp4[, 1, , .HZID]) + +# .LAST ignores j input if given +expect_equal(sp4[, , .LAST, .HZID], + sp4[, 1000, .LAST, .HZID]) + +# horizon index of 2nd horizon in each profile +expect_equal(sp4[5:10, 2, .HZID], + c(15L, 18L, 20L, 22L, 24L, 29L))