diff --git a/NAMESPACE b/NAMESPACE index 116f50ea2..a7ed73c4e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,13 +72,7 @@ importFrom(plyr, rbind.fill ) -importFrom(reshape, - melt, - melt.data.frame, - cast -) - -import(data.table, except=c(melt)) +import(data.table) importFrom(stringr, fixed, diff --git a/NEWS.md b/NEWS.md index dc6bdf3fb..8dc631b8c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,6 @@ # aqp 1.27 (2021-01-14) + * {aqp} no longer imports from {reshape} (less one dependency), all transformations from wide<->long are done via {data.table} + * methods from {data.table} are now imported by {aqp} (new dependency) * Major overhaul of `plotColorQuantiles()`, now using {lattice} graphics * New dataset `munequivalent` and method `equivalentMunsellChips` for "equivalent" Munsell chips lookup list based on all pairwise dE00 contrasts for integer "chips" in `aqp::munsell` data set * Argillic critical clay contents `crit.clay.argillic` rounded to whole numbers per NSSH Part 614, subpart B, sections 614.13 and 614.14 diff --git a/R/colorQuantiles.R b/R/colorQuantiles.R index 2308965b5..e23295a02 100644 --- a/R/colorQuantiles.R +++ b/R/colorQuantiles.R @@ -189,8 +189,16 @@ colorQuantiles <- function(soilColors, p = c(0.05, 0.5, 0.95)) { #' plotColorQuantiles <- function(res, pt.cex = 7, lab.cex = 0.66) { - # long format for plotting in panels - m.long <- melt(res$marginal, id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip')) + # convert wide -> long format for plotting in panels + # using data.table::melt() + m.long <- melt( + as.data.table(res$marginal), + id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip') + ) + + # convert back to data.frame + m.long <- as.data.frame(m.long) + # fake y-variable for plotting marginal vs. L1 m.long$y <- 1 diff --git a/R/evalGenHz.R b/R/evalGenHz.R index 56a3526f8..10a395ba9 100644 --- a/R/evalGenHz.R +++ b/R/evalGenHz.R @@ -15,7 +15,10 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE h[[genhz]] <- factor(h[[genhz]]) # make an index to complete data - no.na.idx <- which(complete.cases(h[, vars])) + no.na.idx <- which(complete.cases(h[, vars, drop = FALSE])) + + ## TODO: all of vars should be numeric or convertable to numeric + # numeric.test <- sapply(vars, function(i) is.numeric(h[[i]])) # test for duplicate data # unique IDs are based on a concatenation of variables used... digest would be safer @@ -56,21 +59,40 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE h$mds.2[no.na.idx] <- mds$points[, 2] h$sil.width[sil.idx] <- sil[, 3] h$neighbor[sil.idx] <- levels(h[[genhz]])[sil[, 2]] - - # melt into long form - m <- melt(h, id.vars = genhz, measure.vars = c(vars, 'sil.width')) - - # compute group-wise summaries-- note that text is returned - m.summary <- ddply(m, c(genhz, 'variable'), function(i) { - stats <- format(paste0(round(mean(i$value, na.rm=TRUE), 2), ' (' , - sd = round(sd(i$value, na.rm=TRUE), 2), ')'), - justify = 'right') - return(data.frame(stats = stats)) - }) - + + ## TODO: enforce / check above + ## important note: all 'vars' should be numeric + + + # convert wide -> long form + # using data.table::melt + # suppressing warnings related to mixture of int / numeric + m <- suppressWarnings( + melt( + as.data.table(h), + id.vars = genhz, + measure.vars = c(vars, 'sil.width') + ) + ) + + # leave as data.table for aggregation + # compute group-wise summaries + m.summary <- m[, list(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE)), by = c((genhz), 'variable')] + + # format text + m.summary$stats <- sprintf( + "%s (%s)", + round(m.summary$mean, 2), + round(m.summary$sd, 2) + ) + + # using data.table::dcast fm <- paste0(genhz, ' ~ variable') - genhz.stats <- cast(m.summary, fm, value = 'stats') - + genhz.stats <- dcast(data = m.summary, formula = fm, value.var = 'stats') + + # convert back to data.frame + genhz.stats <- as.data.frame(genhz.stats) + # composite into a list res <- list(horizons = h[, c('mds.1', 'mds.2', 'sil.width', 'neighbor')], stats = genhz.stats, dist = d) diff --git a/R/get.ml.hz.R b/R/get.ml.hz.R index ad6b856ee..66346a065 100644 --- a/R/get.ml.hz.R +++ b/R/get.ml.hz.R @@ -22,7 +22,11 @@ get.ml.hz <- function(x, o.names = attr(x, which = 'original.levels')) { safe.names <- make.names(o.names) # LUT for names - names.LUT <- data.frame(original=o.names, safe=safe.names, stringsAsFactors = FALSE) + names.LUT <- data.frame( + original = o.names, + safe = safe.names, + stringsAsFactors = FALSE + ) # get index to max probability, # but only when there is at least one value > 0 and all are not NA diff --git a/R/slab.R b/R/slab.R index 4a2e49324..b6f4141c2 100644 --- a/R/slab.R +++ b/R/slab.R @@ -164,14 +164,18 @@ gc() # check variable classes - if(length(vars) > 1) - vars.numeric.test <- sapply(data[, vars], is.numeric) - else - vars.numeric.test <- is.numeric(data[[vars]]) + if(length(vars) > 1) { + vars.numeric.test <- sapply(data[, vars], is.numeric) + } else { + vars.numeric.test <- is.numeric(data[[vars]]) + } + # sanity check: all numeric, or single character/factor - if(any(! vars.numeric.test) & length(vars) > 1) - stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call.=FALSE) + if(any(! vars.numeric.test) & length(vars) > 1) { + stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call. = FALSE) + } + # check for single categorical variable, and convert to factor if(length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) { @@ -183,19 +187,26 @@ } # check for weights - if(!missing(weights)) - stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE) + if(!missing(weights)) { + stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE) + } + # re-set default function, currently no user-supply-able option slab.fun <- .slab.fun.factor.default # add extra arguments required by this function # note that we cannot accept additional arguments when processing categorical values - extra.args <- list(cpm=cpm) + extra.args <- list(cpm = cpm) # save factor levels for later original.levels <- levels(data[[vars]]) - } + + # set a flag for post data.table.melt -> factor level setting + .factorFlag <- TRUE + } else { + .factorFlag <- FALSE + } #### @@ -222,7 +233,7 @@ ## ## TODO: adding weighted-aggregate functionality here ## we can't use aggregate() for this - ## + ## we can use data.table methods # check for weights if(!missing(weights)) @@ -233,8 +244,26 @@ seg.label.is.not.NA <- which(!is.na(data$seg.label)) # convert into long format - d.long <- melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars) - + # d.long.df <- reshape::melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars) + + # convert wide -> long format + # using data.table::melt() + # note that this will not preserve factor levels when 'vars' is categorical + # must call unique() on `id.vars` + d.long <- melt( + as.data.table(data[seg.label.is.not.NA, ]), + id.vars = unique(c(object.ID, 'seg.label', g)), + measure.vars = vars, + ) + + # convert back to data.frame + d.long <- as.data.frame(d.long) + + # reset factor levels in d.long[[value]] + if(.factorFlag) { + d.long[['value']] <- factor(d.long[['value']], levels = original.levels) + } + # make a formula for aggregate() aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep='')) @@ -244,6 +273,11 @@ ## 2. aggregate using seg.label + variable in parallel ## 3. combine results (a list of data.frames) ## + ## NOPE: use data.table which will automatically scale to multiple threads + ## + + + # process chunks according to group -> variable -> segment # NA values are not explicitly dropped if(length(extra.args) == 0) diff --git a/R/soilColorSignature.R b/R/soilColorSignature.R index ddbfbc969..f61b12ed5 100644 --- a/R/soilColorSignature.R +++ b/R/soilColorSignature.R @@ -41,13 +41,26 @@ # make IDs x.medoids$.ids <- paste0('.', 1:k) - # melt and create new variable names - m <- melt(x.medoids, id.var=c(idname(x), '.ids'), measure.vars = c('L', 'A', 'B')) + # convert wide -> long and create new variable names + # using data.table::melt() + m <- melt( + as.data.table(x.medoids), + id.var = c(idname(x), '.ids'), + measure.vars = c('L', 'A', 'B') + ) + + # leave as data.table for dcast + + # new ID m$variable <- paste0(m$variable, m$.ids) - # cast to wide format + # convert long -> wide format + # using data.table::dcast fm <- as.formula(paste0(idname(x), ' ~ variable')) - res <- cast(m, fm, value = 'value') + res <- dcast(m, formula = fm, value.var = 'value') + + # convert back to data.frame + res <- as.data.frame(res) # don't include the id column return(res[, -1]) @@ -83,13 +96,26 @@ # make depth IDs x.slices$depth.id <- paste0('.', p) - # melt and create new variable names - m <- melt(x.slices, id.var=c(idname(x), 'depth.id'), measure.vars = c('L', 'A', 'B')) + # convert wide -> long and create new variable names + # using data.table::melt() + m <- melt( + as.data.table(x.slices), + id.var = c(idname(x), 'depth.id'), + measure.vars = c('L', 'A', 'B') + ) + + # leave as data.table for re-shape + + # new ID m$variable <- paste0(m$variable, m$depth.id) - # cast to wide format + # convert long -> wide format + # using data.table::dcast fm <- as.formula(paste0(idname(x), ' ~ variable')) - res <- cast(m, fm, value = 'value') + res <- dcast(m, formula = fm, value.var = 'value') + + # convert back to data.frame + res <- as.data.frame(res) # don't include the id column return(res[, -1]) diff --git a/man/SPC-slab-methods.Rd b/man/SPC-slab-methods.Rd index 1c13b9b94..f3e57a989 100644 --- a/man/SPC-slab-methods.Rd +++ b/man/SPC-slab-methods.Rd @@ -110,6 +110,7 @@ Harrell FE, Davis CE (1982): A new distribution-free quantile estimator. Biometr ## library(lattice) library(grid) +library(data.table) # load sample data, upgrade to SoilProfileCollection data(sp1) @@ -169,47 +170,97 @@ xyplot(top ~ value | id, data=a, ylab='Depth', ## ## categorical variable example ## -library(reshape) + +data(sp1) +depths(sp1) <- id ~ top + bottom # normalize horizon names: result is a factor -sp1$name <- generalize.hz(sp1$name, -new=c('O','A','B','C'), -pat=c('O', '^A','^B','C')) +sp1$name <- generalize.hz( + sp1$name, + new = c('O','A','B','C'), + pat = c('O', '^A','^B','C') + ) # compute slice-wise probability so that it sums to contributing fraction, from 0-150 a <- slab(sp1, fm= ~ name, cpm=1, slab.structure=0:150) -# reshape into long format for plotting -a.long <- melt(a, id.vars=c('top','bottom'), measure.vars=c('O','A','B','C')) +# convert wide -> long for plotting +# result is a data.table +# genhz factor levels are set by order in `measure.vars` +a.long <- melt( + as.data.table(a), + id.vars = c('top','bottom'), + measure.vars = c('O', 'A', 'B', 'C'), + ) + # plot horizon type proportions using panels -xyplot(top ~ value | variable, data=a.long, subset=value > 0, - ylim=c(150, -5), type=c('S','g'), horizontal=TRUE, layout=c(4,1), col=1 ) +xyplot(top ~ value | variable, + data = a.long, subset=value > 0, + col = 1, lwd = 2, + xlab = 'Class Probability', + ylab = 'Depth (cm)', + strip = strip.custom(bg = grey(0.85)), + scales = list(x = list(alternating = FALSE)), + ylim = c(150, -5), type=c('S','g'), + horizontal = TRUE, layout = c(4,1) + ) # again, this time using groups -xyplot(top ~ value, data=a.long, groups=variable, subset=value > 0, - ylim=c(150, -5), type=c('S','g'), horizontal=TRUE, asp=2) +xyplot(top ~ value, + data = a.long, + groups = variable, + subset = value > 0, + ylim = c(150, -5), + type = c('S','g'), + horizontal = TRUE, + asp = 2, + lwd = 2, + auto.key = list( + lines = TRUE, + points = FALSE, + cex = 0.8, + columns = 1, + space = 'right' + ) +) # adjust probability to size of collection, from 0-150 -a.1 <- slab(sp1, fm= ~ name, cpm=2, slab.structure=0:150) - -# reshape into long format for plotting -a.1.long <- melt(a.1, id.vars=c('top','bottom'), measure.vars=c('O','A','B','C')) +a.1 <- slab(sp1, fm= ~ name, cpm = 2, slab.structure = 0:150) + +# convert wide -> long for plotting +# result is a data.table +# genhz factor levels are set by order in `measure.vars` +a.1.long <- melt( + as.data.table(a.1), + id.vars = c('top','bottom'), + measure.vars = c('O','A','B','C') +) # combine aggregation from `cpm` modes 1 and 2 -g <- make.groups(cmp.mode.1=a.long, cmp.mode.2=a.1.long) +g <- make.groups(cmp.mode.1 = a.long, cmp.mode.2 = a.1.long) # plot horizon type proportions -xyplot(top ~ value | variable, groups=which, data=g, subset=value > 0, - ylim=c(240, -5), type=c('S','g'), horizontal=TRUE, layout=c(4,1), - auto.key=list(lines=TRUE, points=FALSE, columns=2), - par.settings=list(superpose.line=list(col=c(1,2))), - scales=list(alternating=3)) +xyplot(top ~ value | variable, + groups = which, + data = g, subset = value > 0, + ylim = c(240, -5), + type = c('S','g'), + horizontal = TRUE, + layout = c(4,1), + auto.key = list(lines = TRUE, points = FALSE, columns = 2), + par.settings = list(superpose.line = list(col = c(1, 2), lwd = 2)), + scales = list(alternating = 3), + xlab = 'Class Probability', + ylab = 'Depth (cm)', + strip = strip.custom(bg = grey(0.85)) +) # apply slice-wise evaluation of max probability, and assign ML-horizon at each slice (gen.hz.ml <- get.ml.hz(a, c('O','A','B','C'))) + \dontrun{ ## ## HD quantile estimator @@ -217,6 +268,7 @@ xyplot(top ~ value | variable, groups=which, data=g, subset=value > 0, library(soilDB) library(lattice) +library(data.table) # sample data data('loafercreek', package = 'soilDB') @@ -269,34 +321,38 @@ site(sp3) <- ~ group # custom 'slab' function, returning mean +/- 1SD mean.and.sd <- function(values) { - m <- mean(values, na.rm=TRUE) - s <- sd(values, na.rm=TRUE) - upper <- m + s - lower <- m - s - res <- c(mean=m, lower=lower, upper=upper) - return(res) - } + m <- mean(values, na.rm=TRUE) + s <- sd(values, na.rm=TRUE) + upper <- m + s + lower <- m - s + res <- c(mean=m, lower=lower, upper=upper) + return(res) +} # aggregate several variables at once, within 'group' -a <- slab(sp3, fm=group ~ L + A + B, slab.fun=mean.and.sd) +a <- slab(sp3, fm = group ~ L + A + B, slab.fun = mean.and.sd) # check the results: # note that 'group' is the column containing group labels -library(lattice) xyplot( - top ~ mean | variable, data=a, groups=group, - lower=a$lower, upper=a$upper, sync.colors=TRUE, alpha=0.5, - cf=a$contributing_fraction, - ylim=c(125,-5), layout=c(3,1), scales=list(x=list(relation='free')), - par.settings=list(superpose.line=list(lwd=2, col=c('RoyalBlue', 'Orange2'))), - panel=panel.depth_function, - prepanel=prepanel.depth_function, - auto.key=list(columns=2, lines=TRUE, points=FALSE) + top ~ mean | variable, data=a, groups=group, + lower=a$lower, upper=a$upper, + sync.colors=TRUE, alpha=0.5, + cf = a$contributing_fraction, + xlab = 'Mean Bounded by +/- 1SD', + ylab = 'Depth (cm)', + ylim=c(125,-5), layout=c(3,1), + scales=list(x=list(relation='free')), + par.settings = list(superpose.line=list(lwd=2, col=c('RoyalBlue', 'Orange2'))), + panel = panel.depth_function, + prepanel = prepanel.depth_function, + strip = strip.custom(bg=grey(0.85)), + auto.key = list(columns=2, lines=TRUE, points=FALSE) ) # compare a single profile to the group-level aggregate values -a.1 <- slab(sp3[1, ], fm=group ~ L + A + B, slab.fun=mean.and.sd) +a.1 <- slab(sp3[1, ], fm = group ~ L + A + B, slab.fun = mean.and.sd) # manually update the group column a.1$group <- 'profile 1' @@ -306,35 +362,56 @@ g <- rbind(a, a.1) # plot with customized line styles xyplot( - top ~ mean | variable, data=g, groups=group, subscripts=TRUE, - lower=a$lower, upper=a$upper, ylim=c(125,-5), - layout=c(3,1), scales=list(x=list(relation='free')), - panel=panel.depth_function, - prepanel=prepanel.depth_function, - sync.colors=TRUE, alpha=0.25, - par.settings=list(superpose.line=list(col=c('orange', 'royalblue', 'black'), - lwd=2, lty=c(1,1,2))), - auto.key=list(columns=3, lines=TRUE, points=FALSE) + top ~ mean | variable, data=g, groups=group, subscripts=TRUE, + lower=a$lower, upper=a$upper, ylim=c(125,-5), + layout=c(3,1), scales=list(x=list(relation='free')), + xlab = 'Mean Bounded by +/- 1SD', + ylab = 'Depth (cm)', + panel=panel.depth_function, + prepanel=prepanel.depth_function, + sync.colors = TRUE, alpha = 0.25, + par.settings = list( + superpose.line = list( + col = c('orange', 'royalblue', 'black'), + lwd = 2, lty = c(1,1,2) + ) + ), + strip = strip.custom(bg=grey(0.85)), + auto.key = list(columns=3, lines=TRUE, points=FALSE) ) -## convert mean value for each variable into long format -library(reshape) - -# note that depths are no longer in order -a.wide <- cast(a, group + top + bottom ~ variable, value=c('mean')) ## again, this time for a user-defined slab from 40-60 cm -a <- slab(sp3, fm=group ~ L + A + B, slab.structure=c(40,60), slab.fun=mean.and.sd) +a <- slab(sp3, + fm = group ~ L + A + B, + slab.structure = c(40,60), + slab.fun = mean.and.sd +) # now we have weighted average properties (within the defined slab) # for each variable, and each group -(a.wide <- cast(a, group + top + bottom ~ variable, value=c('mean'))) +# convert long -> wide +dcast( + as.data.table(a), + formula = group + top + bottom ~ variable, + value.var = 'mean' +) ## this time, compute the weighted mean of selected properties, by profile ID -a <- slab(sp3, fm= id ~ L + A + B, slab.structure=c(40,60), slab.fun=mean.and.sd) -(a.wide <- cast(a, id + top + bottom ~ variable, value=c('mean'))) +a <- slab(sp3, + fm = id ~ L + A + B, + slab.structure = c(40,60), + slab.fun = mean.and.sd +) + +# convert long -> wide +dcast( + as.data.table(a), + formula = id + top + bottom ~ variable, + value.var = 'mean' +) ## aggregate the entire collection, using default slab function (hdquantile) @@ -342,6 +419,7 @@ a <- slab(sp3, fm= id ~ L + A + B, slab.structure=c(40,60), slab.fun=mean.and.sd a <- slab(sp3, fm= ~ L + A + B) + ## weighted-aggregation -- NOT YET IMPLEMENTED -- # load sample data, upgrade to SoilProfileCollection data(sp1)