Merge pull request #188 from ncss-tech/remove-reshape

Replace {reshape} melt and cast with {data.table} methods.
ncss-tech · Jan 20, 2021 · db2732f · db2732f
2 parents 5ea7b82 + 1d2b13c
commit db2732f
Show file tree

Hide file tree

Showing 8 changed files with 270 additions and 102 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -72,13 +72,7 @@ importFrom(plyr,
   rbind.fill
 )
 
-importFrom(reshape,
-  melt,
-  melt.data.frame,
-  cast
-)
-
-import(data.table, except=c(melt))
+import(data.table)
 
 importFrom(stringr,
   fixed,

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,6 @@
 # aqp 1.27 (2021-01-14)
+ * {aqp} no longer imports from {reshape} (less one dependency), all transformations from wide<->long are done via {data.table}
+ * methods from {data.table} are now imported by {aqp} (new dependency)
  * Major overhaul of `plotColorQuantiles()`, now using {lattice} graphics
  * New dataset `munequivalent` and method `equivalentMunsellChips` for "equivalent" Munsell chips lookup list based on all pairwise dE00 contrasts for integer "chips" in `aqp::munsell` data set
  * Argillic critical clay contents `crit.clay.argillic` rounded to whole numbers per NSSH Part 614, subpart B, sections 614.13 and 614.14

diff --git a/R/colorQuantiles.R b/R/colorQuantiles.R
@@ -189,8 +189,16 @@ colorQuantiles <- function(soilColors, p = c(0.05, 0.5, 0.95)) {
 #' 
 plotColorQuantiles <- function(res, pt.cex = 7, lab.cex = 0.66) {
 
-  # long format for plotting in panels
-  m.long <- melt(res$marginal, id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip'))
+  # convert wide -> long format for plotting in panels
+  # using data.table::melt()
+  m.long <- melt(
+    as.data.table(res$marginal), 
+    id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip')
+    )
+
+  # convert back to data.frame
+  m.long <- as.data.frame(m.long)
+
   # fake y-variable for plotting marginal vs. L1
   m.long$y <- 1
 

diff --git a/R/evalGenHz.R b/R/evalGenHz.R
@@ -15,7 +15,10 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE
     h[[genhz]] <- factor(h[[genhz]])
 
   # make an index to complete data
-  no.na.idx <- which(complete.cases(h[, vars]))
+  no.na.idx <- which(complete.cases(h[, vars, drop = FALSE]))
+
+  ## TODO: all of vars should be numeric or convertable to numeric
+  # numeric.test <- sapply(vars, function(i) is.numeric(h[[i]]))
 
   # test for duplicate data
   # unique IDs are based on a concatenation of variables used... digest would be safer
@@ -56,21 +59,40 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE
   h$mds.2[no.na.idx] <- mds$points[, 2]
   h$sil.width[sil.idx] <- sil[, 3]
   h$neighbor[sil.idx] <- levels(h[[genhz]])[sil[, 2]]
-
-  # melt into long form
-  m <- melt(h, id.vars = genhz, measure.vars = c(vars, 'sil.width'))
-
-  # compute group-wise summaries-- note that text is returned
-  m.summary <- ddply(m, c(genhz, 'variable'), function(i) {
-    stats <- format(paste0(round(mean(i$value, na.rm=TRUE), 2), ' (' ,
-                           sd = round(sd(i$value, na.rm=TRUE), 2), ')'),
-                    justify = 'right')
-    return(data.frame(stats = stats))
-  })
-
+
+  ## TODO: enforce / check above
+  ## important note: all 'vars' should be numeric
+
+
+  # convert wide -> long form
+  # using data.table::melt
+  # suppressing warnings related to mixture of int / numeric
+  m <- suppressWarnings(
+    melt(
+      as.data.table(h), 
+      id.vars = genhz, 
+      measure.vars = c(vars, 'sil.width')
+    )
+  )
+
+  # leave as data.table for aggregation
+  # compute group-wise summaries
+  m.summary <- m[, list(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE)), by = c((genhz), 'variable')]
+
+  # format text
+  m.summary$stats <- sprintf(
+    "%s (%s)", 
+    round(m.summary$mean, 2),
+    round(m.summary$sd, 2)
+  )
+
+  # using data.table::dcast
   fm <- paste0(genhz, ' ~ variable')
-  genhz.stats <- cast(m.summary, fm, value = 'stats')
-
+  genhz.stats <- dcast(data = m.summary, formula = fm, value.var = 'stats')
+
+  # convert back to data.frame
+  genhz.stats <- as.data.frame(genhz.stats)
+
   # composite into a list
   res <- list(horizons = h[, c('mds.1', 'mds.2', 'sil.width', 'neighbor')],
               stats = genhz.stats, dist = d)

diff --git a/R/get.ml.hz.R b/R/get.ml.hz.R
@@ -22,7 +22,11 @@ get.ml.hz <- function(x, o.names = attr(x, which = 'original.levels')) {
   safe.names <- make.names(o.names)
 
   # LUT for names
-  names.LUT <- data.frame(original=o.names, safe=safe.names, stringsAsFactors = FALSE)
+  names.LUT <- data.frame(
+    original = o.names, 
+    safe = safe.names, 
+    stringsAsFactors = FALSE
+    )
 
 	# get index to max probability, 
 	# but only when there is at least one value > 0 and all are not NA

diff --git a/R/slab.R b/R/slab.R
@@ -164,14 +164,18 @@
 	gc()
 
 	# check variable classes
-	if(length(vars) > 1)
-		vars.numeric.test <- sapply(data[, vars], is.numeric)
-	else
-		vars.numeric.test <- is.numeric(data[[vars]])
+	if(length(vars) > 1) {
+	  vars.numeric.test <- sapply(data[, vars], is.numeric)
+	}	else {
+	  vars.numeric.test <- is.numeric(data[[vars]])
+	}
+
 
 	# sanity check: all numeric, or single character/factor
-	if(any(! vars.numeric.test) & length(vars) > 1)
-		stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call.=FALSE)
+	if(any(! vars.numeric.test) & length(vars) > 1) {
+	  stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call. = FALSE)
+	}
+
 
 	# check for single categorical variable, and convert to factor
 	if(length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) {
@@ -183,19 +187,26 @@
 		}
 
 		# check for weights
-		if(!missing(weights))
-			stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE)
+		if(!missing(weights)) {
+		  stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE)
+		}
+
 
 		# re-set default function, currently no user-supply-able option
 		slab.fun <- .slab.fun.factor.default
 
 		# add extra arguments required by this function
 		# note that we cannot accept additional arguments when processing categorical values
-		extra.args <- list(cpm=cpm)
+		extra.args <- list(cpm = cpm)
 
     # save factor levels for later
     original.levels <- levels(data[[vars]])
-		}
+
+    # set a flag for post data.table.melt -> factor level setting
+    .factorFlag <- TRUE
+	} else {
+	  .factorFlag <- FALSE
+	}
 
 
 	####
@@ -222,7 +233,7 @@
 	##
 	## TODO: adding weighted-aggregate functionality here
 	## we can't use aggregate() for this
-	##
+	## we can use data.table methods
 
 	# check for weights
 	if(!missing(weights))
@@ -233,8 +244,26 @@
 	seg.label.is.not.NA <- which(!is.na(data$seg.label))
 
 	# convert into long format
-	d.long <- melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars)
-
+	# d.long.df <- reshape::melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars)
+
+	# convert wide -> long format
+	# using data.table::melt()
+	# note that this will not preserve factor levels when 'vars' is categorical
+	# must call unique() on `id.vars`
+	d.long <- melt(
+	  as.data.table(data[seg.label.is.not.NA, ]),
+	  id.vars = unique(c(object.ID, 'seg.label', g)),
+	  measure.vars = vars,
+	  )
+
+	# convert back to data.frame
+	d.long <- as.data.frame(d.long)
+
+  # reset factor levels in d.long[[value]]
+	if(.factorFlag) {
+	  d.long[['value']] <- factor(d.long[['value']], levels = original.levels)
+	}
+
 	# make a formula for aggregate()
 	aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep=''))
 
@@ -244,6 +273,11 @@
 	##       2. aggregate using seg.label + variable in parallel
 	##       3. combine results (a list of data.frames)
 	##
+	## NOPE: use data.table which will automatically scale to multiple threads
+	##
+
+
+
 	# process chunks according to group -> variable -> segment
 	# NA values are not explicitly dropped
 	if(length(extra.args) == 0)

diff --git a/R/soilColorSignature.R b/R/soilColorSignature.R
@@ -41,13 +41,26 @@
   # make IDs
   x.medoids$.ids <- paste0('.', 1:k)
 
-  # melt and create new variable names
-  m <- melt(x.medoids, id.var=c(idname(x), '.ids'), measure.vars = c('L', 'A', 'B'))
+  # convert wide -> long and create new variable names
+  # using data.table::melt()
+  m <- melt(
+    as.data.table(x.medoids), 
+    id.var = c(idname(x), '.ids'), 
+    measure.vars = c('L', 'A', 'B')
+  )
+
+  # leave as data.table for dcast
+
+  # new ID
   m$variable <- paste0(m$variable, m$.ids)
 
-  # cast to wide format
+  # convert long -> wide format
+  # using data.table::dcast
   fm <- as.formula(paste0(idname(x), ' ~ variable'))
-  res <- cast(m, fm, value = 'value')
+  res <- dcast(m, formula = fm, value.var = 'value')
+
+  # convert back to data.frame
+  res <- as.data.frame(res)
 
   # don't include the id column
   return(res[, -1])
@@ -83,13 +96,26 @@
   # make depth IDs
   x.slices$depth.id <- paste0('.', p)
 
-  # melt and create new variable names
-  m <- melt(x.slices, id.var=c(idname(x), 'depth.id'), measure.vars = c('L', 'A', 'B'))
+  # convert wide -> long and create new variable names
+  # using data.table::melt()
+  m <- melt(
+    as.data.table(x.slices),
+    id.var = c(idname(x), 'depth.id'), 
+    measure.vars = c('L', 'A', 'B')
+  )
+
+  # leave as data.table for re-shape
+
+  # new ID
   m$variable <- paste0(m$variable, m$depth.id)
 
-  # cast to wide format
+  # convert long -> wide format
+  # using data.table::dcast
   fm <- as.formula(paste0(idname(x), ' ~ variable'))
-  res <- cast(m, fm, value = 'value')
+  res <- dcast(m, formula = fm, value.var = 'value')
+
+  # convert back to data.frame
+  res <- as.data.frame(res)
 
   # don't include the id column
   return(res[, -1])