Skip to content

Commit

Permalink
Merge pull request #188 from ncss-tech/remove-reshape
Browse files Browse the repository at this point in the history
Replace {reshape} melt and cast with {data.table} methods.
  • Loading branch information
dylanbeaudette authored Jan 20, 2021
2 parents 5ea7b82 + 1d2b13c commit db2732f
Show file tree
Hide file tree
Showing 8 changed files with 270 additions and 102 deletions.
8 changes: 1 addition & 7 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,7 @@ importFrom(plyr,
rbind.fill
)

importFrom(reshape,
melt,
melt.data.frame,
cast
)

import(data.table, except=c(melt))
import(data.table)

importFrom(stringr,
fixed,
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# aqp 1.27 (2021-01-14)
* {aqp} no longer imports from {reshape} (less one dependency), all transformations from wide<->long are done via {data.table}
* methods from {data.table} are now imported by {aqp} (new dependency)
* Major overhaul of `plotColorQuantiles()`, now using {lattice} graphics
* New dataset `munequivalent` and method `equivalentMunsellChips` for "equivalent" Munsell chips lookup list based on all pairwise dE00 contrasts for integer "chips" in `aqp::munsell` data set
* Argillic critical clay contents `crit.clay.argillic` rounded to whole numbers per NSSH Part 614, subpart B, sections 614.13 and 614.14
Expand Down
12 changes: 10 additions & 2 deletions R/colorQuantiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,16 @@ colorQuantiles <- function(soilColors, p = c(0.05, 0.5, 0.95)) {
#'
plotColorQuantiles <- function(res, pt.cex = 7, lab.cex = 0.66) {

# long format for plotting in panels
m.long <- melt(res$marginal, id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip'))
# convert wide -> long format for plotting in panels
# using data.table::melt()
m.long <- melt(
as.data.table(res$marginal),
id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip')
)

# convert back to data.frame
m.long <- as.data.frame(m.long)

# fake y-variable for plotting marginal vs. L1
m.long$y <- 1

Expand Down
52 changes: 37 additions & 15 deletions R/evalGenHz.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE
h[[genhz]] <- factor(h[[genhz]])

# make an index to complete data
no.na.idx <- which(complete.cases(h[, vars]))
no.na.idx <- which(complete.cases(h[, vars, drop = FALSE]))

## TODO: all of vars should be numeric or convertable to numeric
# numeric.test <- sapply(vars, function(i) is.numeric(h[[i]]))

# test for duplicate data
# unique IDs are based on a concatenation of variables used... digest would be safer
Expand Down Expand Up @@ -56,21 +59,40 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE
h$mds.2[no.na.idx] <- mds$points[, 2]
h$sil.width[sil.idx] <- sil[, 3]
h$neighbor[sil.idx] <- levels(h[[genhz]])[sil[, 2]]

# melt into long form
m <- melt(h, id.vars = genhz, measure.vars = c(vars, 'sil.width'))

# compute group-wise summaries-- note that text is returned
m.summary <- ddply(m, c(genhz, 'variable'), function(i) {
stats <- format(paste0(round(mean(i$value, na.rm=TRUE), 2), ' (' ,
sd = round(sd(i$value, na.rm=TRUE), 2), ')'),
justify = 'right')
return(data.frame(stats = stats))
})


## TODO: enforce / check above
## important note: all 'vars' should be numeric


# convert wide -> long form
# using data.table::melt
# suppressing warnings related to mixture of int / numeric
m <- suppressWarnings(
melt(
as.data.table(h),
id.vars = genhz,
measure.vars = c(vars, 'sil.width')
)
)

# leave as data.table for aggregation
# compute group-wise summaries
m.summary <- m[, list(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE)), by = c((genhz), 'variable')]

# format text
m.summary$stats <- sprintf(
"%s (%s)",
round(m.summary$mean, 2),
round(m.summary$sd, 2)
)

# using data.table::dcast
fm <- paste0(genhz, ' ~ variable')
genhz.stats <- cast(m.summary, fm, value = 'stats')

genhz.stats <- dcast(data = m.summary, formula = fm, value.var = 'stats')

# convert back to data.frame
genhz.stats <- as.data.frame(genhz.stats)

# composite into a list
res <- list(horizons = h[, c('mds.1', 'mds.2', 'sil.width', 'neighbor')],
stats = genhz.stats, dist = d)
Expand Down
6 changes: 5 additions & 1 deletion R/get.ml.hz.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ get.ml.hz <- function(x, o.names = attr(x, which = 'original.levels')) {
safe.names <- make.names(o.names)

# LUT for names
names.LUT <- data.frame(original=o.names, safe=safe.names, stringsAsFactors = FALSE)
names.LUT <- data.frame(
original = o.names,
safe = safe.names,
stringsAsFactors = FALSE
)

# get index to max probability,
# but only when there is at least one value > 0 and all are not NA
Expand Down
60 changes: 47 additions & 13 deletions R/slab.R
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,18 @@
gc()

# check variable classes
if(length(vars) > 1)
vars.numeric.test <- sapply(data[, vars], is.numeric)
else
vars.numeric.test <- is.numeric(data[[vars]])
if(length(vars) > 1) {
vars.numeric.test <- sapply(data[, vars], is.numeric)
} else {
vars.numeric.test <- is.numeric(data[[vars]])
}


# sanity check: all numeric, or single character/factor
if(any(! vars.numeric.test) & length(vars) > 1)
stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call.=FALSE)
if(any(! vars.numeric.test) & length(vars) > 1) {
stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call. = FALSE)
}


# check for single categorical variable, and convert to factor
if(length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) {
Expand All @@ -183,19 +187,26 @@
}

# check for weights
if(!missing(weights))
stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE)
if(!missing(weights)) {
stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE)
}


# re-set default function, currently no user-supply-able option
slab.fun <- .slab.fun.factor.default

# add extra arguments required by this function
# note that we cannot accept additional arguments when processing categorical values
extra.args <- list(cpm=cpm)
extra.args <- list(cpm = cpm)

# save factor levels for later
original.levels <- levels(data[[vars]])
}

# set a flag for post data.table.melt -> factor level setting
.factorFlag <- TRUE
} else {
.factorFlag <- FALSE
}


####
Expand All @@ -222,7 +233,7 @@
##
## TODO: adding weighted-aggregate functionality here
## we can't use aggregate() for this
##
## we can use data.table methods

# check for weights
if(!missing(weights))
Expand All @@ -233,8 +244,26 @@
seg.label.is.not.NA <- which(!is.na(data$seg.label))

# convert into long format
d.long <- melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars)

# d.long.df <- reshape::melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars)

# convert wide -> long format
# using data.table::melt()
# note that this will not preserve factor levels when 'vars' is categorical
# must call unique() on `id.vars`
d.long <- melt(
as.data.table(data[seg.label.is.not.NA, ]),
id.vars = unique(c(object.ID, 'seg.label', g)),
measure.vars = vars,
)

# convert back to data.frame
d.long <- as.data.frame(d.long)

# reset factor levels in d.long[[value]]
if(.factorFlag) {
d.long[['value']] <- factor(d.long[['value']], levels = original.levels)
}

# make a formula for aggregate()
aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep=''))

Expand All @@ -244,6 +273,11 @@
## 2. aggregate using seg.label + variable in parallel
## 3. combine results (a list of data.frames)
##
## NOPE: use data.table which will automatically scale to multiple threads
##



# process chunks according to group -> variable -> segment
# NA values are not explicitly dropped
if(length(extra.args) == 0)
Expand Down
42 changes: 34 additions & 8 deletions R/soilColorSignature.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,26 @@
# make IDs
x.medoids$.ids <- paste0('.', 1:k)

# melt and create new variable names
m <- melt(x.medoids, id.var=c(idname(x), '.ids'), measure.vars = c('L', 'A', 'B'))
# convert wide -> long and create new variable names
# using data.table::melt()
m <- melt(
as.data.table(x.medoids),
id.var = c(idname(x), '.ids'),
measure.vars = c('L', 'A', 'B')
)

# leave as data.table for dcast

# new ID
m$variable <- paste0(m$variable, m$.ids)

# cast to wide format
# convert long -> wide format
# using data.table::dcast
fm <- as.formula(paste0(idname(x), ' ~ variable'))
res <- cast(m, fm, value = 'value')
res <- dcast(m, formula = fm, value.var = 'value')

# convert back to data.frame
res <- as.data.frame(res)

# don't include the id column
return(res[, -1])
Expand Down Expand Up @@ -83,13 +96,26 @@
# make depth IDs
x.slices$depth.id <- paste0('.', p)

# melt and create new variable names
m <- melt(x.slices, id.var=c(idname(x), 'depth.id'), measure.vars = c('L', 'A', 'B'))
# convert wide -> long and create new variable names
# using data.table::melt()
m <- melt(
as.data.table(x.slices),
id.var = c(idname(x), 'depth.id'),
measure.vars = c('L', 'A', 'B')
)

# leave as data.table for re-shape

# new ID
m$variable <- paste0(m$variable, m$depth.id)

# cast to wide format
# convert long -> wide format
# using data.table::dcast
fm <- as.formula(paste0(idname(x), ' ~ variable'))
res <- cast(m, fm, value = 'value')
res <- dcast(m, formula = fm, value.var = 'value')

# convert back to data.frame
res <- as.data.frame(res)

# don't include the id column
return(res[, -1])
Expand Down
Loading

0 comments on commit db2732f

Please sign in to comment.