Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace {reshape} melt and cast with {data.table} methods. #188

Merged
merged 8 commits into from
Jan 20, 2021
8 changes: 1 addition & 7 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,7 @@ importFrom(plyr,
rbind.fill
)

importFrom(reshape,
melt,
melt.data.frame,
cast
)

import(data.table, except=c(melt))
import(data.table)

importFrom(stringr,
fixed,
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# aqp 1.27 (2021-01-14)
* {aqp} no longer imports from {reshape} (less one dependency), all transformations from wide<->long are done via {data.table}
* methods from {data.table} are now imported by {aqp} (new dependency)
* Major overhaul of `plotColorQuantiles()`, now using {lattice} graphics
* New dataset `munequivalent` and method `equivalentMunsellChips` for "equivalent" Munsell chips lookup list based on all pairwise dE00 contrasts for integer "chips" in `aqp::munsell` data set
* Argillic critical clay contents `crit.clay.argillic` rounded to whole numbers per NSSH Part 614, subpart B, sections 614.13 and 614.14
Expand Down
12 changes: 10 additions & 2 deletions R/colorQuantiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,16 @@ colorQuantiles <- function(soilColors, p = c(0.05, 0.5, 0.95)) {
#'
plotColorQuantiles <- function(res, pt.cex = 7, lab.cex = 0.66) {

# long format for plotting in panels
m.long <- melt(res$marginal, id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip'))
# convert wide -> long format for plotting in panels
# using data.table::melt()
m.long <- melt(
as.data.table(res$marginal),
id.var = c('p', 'L_colors', 'A_colors', 'B_colors', 'L_chip', 'A_chip', 'B_chip')
)

# convert back to data.frame
m.long <- as.data.frame(m.long)

# fake y-variable for plotting marginal vs. L1
m.long$y <- 1

Expand Down
52 changes: 37 additions & 15 deletions R/evalGenHz.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE
h[[genhz]] <- factor(h[[genhz]])

# make an index to complete data
no.na.idx <- which(complete.cases(h[, vars]))
no.na.idx <- which(complete.cases(h[, vars, drop = FALSE]))

## TODO: all of vars should be numeric or convertable to numeric
# numeric.test <- sapply(vars, function(i) is.numeric(h[[i]]))

# test for duplicate data
# unique IDs are based on a concatenation of variables used... digest would be safer
Expand Down Expand Up @@ -56,21 +59,40 @@ evalGenHZ <- function(obj, genhz, vars, non.matching.code='not-used', stand=TRUE
h$mds.2[no.na.idx] <- mds$points[, 2]
h$sil.width[sil.idx] <- sil[, 3]
h$neighbor[sil.idx] <- levels(h[[genhz]])[sil[, 2]]

# melt into long form
m <- melt(h, id.vars = genhz, measure.vars = c(vars, 'sil.width'))

# compute group-wise summaries-- note that text is returned
m.summary <- ddply(m, c(genhz, 'variable'), function(i) {
stats <- format(paste0(round(mean(i$value, na.rm=TRUE), 2), ' (' ,
sd = round(sd(i$value, na.rm=TRUE), 2), ')'),
justify = 'right')
return(data.frame(stats = stats))
})


## TODO: enforce / check above
## important note: all 'vars' should be numeric


# convert wide -> long form
# using data.table::melt
# suppressing warnings related to mixture of int / numeric
m <- suppressWarnings(
melt(
as.data.table(h),
id.vars = genhz,
measure.vars = c(vars, 'sil.width')
)
)

# leave as data.table for aggregation
# compute group-wise summaries
m.summary <- m[, list(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE)), by = c((genhz), 'variable')]

# format text
m.summary$stats <- sprintf(
"%s (%s)",
round(m.summary$mean, 2),
round(m.summary$sd, 2)
)

# using data.table::dcast
fm <- paste0(genhz, ' ~ variable')
genhz.stats <- cast(m.summary, fm, value = 'stats')

genhz.stats <- dcast(data = m.summary, formula = fm, value.var = 'stats')

# convert back to data.frame
genhz.stats <- as.data.frame(genhz.stats)

# composite into a list
res <- list(horizons = h[, c('mds.1', 'mds.2', 'sil.width', 'neighbor')],
stats = genhz.stats, dist = d)
Expand Down
6 changes: 5 additions & 1 deletion R/get.ml.hz.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ get.ml.hz <- function(x, o.names = attr(x, which = 'original.levels')) {
safe.names <- make.names(o.names)

# LUT for names
names.LUT <- data.frame(original=o.names, safe=safe.names, stringsAsFactors = FALSE)
names.LUT <- data.frame(
original = o.names,
safe = safe.names,
stringsAsFactors = FALSE
)

# get index to max probability,
# but only when there is at least one value > 0 and all are not NA
Expand Down
60 changes: 47 additions & 13 deletions R/slab.R
Original file line number Diff line number Diff line change
Expand Up @@ -164,14 +164,18 @@
gc()

# check variable classes
if(length(vars) > 1)
vars.numeric.test <- sapply(data[, vars], is.numeric)
else
vars.numeric.test <- is.numeric(data[[vars]])
if(length(vars) > 1) {
vars.numeric.test <- sapply(data[, vars], is.numeric)
} else {
vars.numeric.test <- is.numeric(data[[vars]])
}


# sanity check: all numeric, or single character/factor
if(any(! vars.numeric.test) & length(vars) > 1)
stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call.=FALSE)
if(any(! vars.numeric.test) & length(vars) > 1) {
stop('mixed variable types and multiple categorical variables are not currently supported in the same call to slab', call. = FALSE)
}


# check for single categorical variable, and convert to factor
if(length(vars) == 1 & inherits(data[, vars], c('character', 'factor'))) {
Expand All @@ -183,19 +187,26 @@
}

# check for weights
if(!missing(weights))
stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE)
if(!missing(weights)) {
stop('weighted aggregation of categorical variables not yet implemented', call.=FALSE)
}


# re-set default function, currently no user-supply-able option
slab.fun <- .slab.fun.factor.default

# add extra arguments required by this function
# note that we cannot accept additional arguments when processing categorical values
extra.args <- list(cpm=cpm)
extra.args <- list(cpm = cpm)

# save factor levels for later
original.levels <- levels(data[[vars]])
}

# set a flag for post data.table.melt -> factor level setting
.factorFlag <- TRUE
} else {
.factorFlag <- FALSE
}


####
Expand All @@ -222,7 +233,7 @@
##
## TODO: adding weighted-aggregate functionality here
## we can't use aggregate() for this
##
## we can use data.table methods

# check for weights
if(!missing(weights))
Expand All @@ -233,8 +244,26 @@
seg.label.is.not.NA <- which(!is.na(data$seg.label))

# convert into long format
d.long <- melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars)

# d.long.df <- reshape::melt(data[seg.label.is.not.NA, ], id.vars=c(object.ID, 'seg.label', g), measure.vars=vars)

# convert wide -> long format
# using data.table::melt()
# note that this will not preserve factor levels when 'vars' is categorical
# must call unique() on `id.vars`
d.long <- melt(
as.data.table(data[seg.label.is.not.NA, ]),
id.vars = unique(c(object.ID, 'seg.label', g)),
measure.vars = vars,
)

# convert back to data.frame
d.long <- as.data.frame(d.long)

# reset factor levels in d.long[[value]]
if(.factorFlag) {
d.long[['value']] <- factor(d.long[['value']], levels = original.levels)
}

# make a formula for aggregate()
aggregate.fm <- as.formula(paste('value ~ seg.label + variable + ', g, sep=''))

Expand All @@ -244,6 +273,11 @@
## 2. aggregate using seg.label + variable in parallel
## 3. combine results (a list of data.frames)
##
## NOPE: use data.table which will automatically scale to multiple threads
##



# process chunks according to group -> variable -> segment
# NA values are not explicitly dropped
if(length(extra.args) == 0)
Expand Down
42 changes: 34 additions & 8 deletions R/soilColorSignature.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,26 @@
# make IDs
x.medoids$.ids <- paste0('.', 1:k)

# melt and create new variable names
m <- melt(x.medoids, id.var=c(idname(x), '.ids'), measure.vars = c('L', 'A', 'B'))
# convert wide -> long and create new variable names
# using data.table::melt()
m <- melt(
as.data.table(x.medoids),
id.var = c(idname(x), '.ids'),
measure.vars = c('L', 'A', 'B')
)

# leave as data.table for dcast

# new ID
m$variable <- paste0(m$variable, m$.ids)

# cast to wide format
# convert long -> wide format
# using data.table::dcast
fm <- as.formula(paste0(idname(x), ' ~ variable'))
res <- cast(m, fm, value = 'value')
res <- dcast(m, formula = fm, value.var = 'value')

# convert back to data.frame
res <- as.data.frame(res)

# don't include the id column
return(res[, -1])
Expand Down Expand Up @@ -83,13 +96,26 @@
# make depth IDs
x.slices$depth.id <- paste0('.', p)

# melt and create new variable names
m <- melt(x.slices, id.var=c(idname(x), 'depth.id'), measure.vars = c('L', 'A', 'B'))
# convert wide -> long and create new variable names
# using data.table::melt()
m <- melt(
as.data.table(x.slices),
id.var = c(idname(x), 'depth.id'),
measure.vars = c('L', 'A', 'B')
)

# leave as data.table for re-shape

# new ID
m$variable <- paste0(m$variable, m$depth.id)

# cast to wide format
# convert long -> wide format
# using data.table::dcast
fm <- as.formula(paste0(idname(x), ' ~ variable'))
res <- cast(m, fm, value = 'value')
res <- dcast(m, formula = fm, value.var = 'value')

# convert back to data.frame
res <- as.data.frame(res)

# don't include the id column
return(res[, -1])
Expand Down
Loading