diff --git a/NEWS.md b/NEWS.md index fd13331a2..0ba12fc4b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -84,6 +84,8 @@ 16. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report. +17. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/data.table.R b/R/data.table.R index 9bc04e3ca..5b23169c4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -242,7 +242,7 @@ replace_dot_alias = function(e) { } if (!missing(j)) { jsub = replace_dot_alias(jsub) - root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" + root = root_name(jsub) if (root == ":" || (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || ( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) && @@ -285,7 +285,7 @@ replace_dot_alias = function(e) { if (root=="{") { if (length(jsub) == 2L) { jsub = jsub[[2L]] # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376] - root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" + root = root_name(jsub) } else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") { #2142 -- j can be {} and have length 1 stopf("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}") @@ -298,10 +298,8 @@ replace_dot_alias = function(e) { jsub = eval(jsub[[2L]], parent.frame(), parent.frame()) # this evals the symbol to return the dynamic expression if (is.expression(jsub)) jsub = jsub[[1L]] # if expression, convert it to call # Note that the dynamic expression could now be := (new in v1.9.7) - root = if (is.call(jsub)) { - jsub = replace_dot_alias(jsub) - as.character(jsub[[1L]])[1L] - } else "" + jsub = replace_dot_alias(jsub) + root = root_name(jsub) } if (root == ":=" || root == "let") { # let(...) as alias for :=(...) (#3795) if (root == "let") @@ -1401,7 +1399,7 @@ replace_dot_alias = function(e) { .Call(Cassign,x,irows,cols,newnames,jval) return(suppPrint(x)) } - if ((is.call(jsub) && jsub[[1L]] != "get" && is.list(jval) && !is.object(jval)) || !missingby) { + if ((is.call(jsub) && !jsub %iscall% "get" && is.list(jval) && !is.object(jval)) || !missingby) { # is.call: selecting from a list column should return list # is.object: for test 168 and 168.1 (S4 object result from ggplot2::qplot). Just plain list results should result in data.table @@ -1647,25 +1645,25 @@ replace_dot_alias = function(e) { jsub = as.call(c(quote(list), lapply(sdvars, as.name))) jvnames = sdvars } - } else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with + } else if (is.name(jsub[[1L]])) { # Else expect problems with # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612 subopt = length(jsub) == 3L && - (jsub[[1L]] == "[" || - (jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && + (jsub %iscall% "[" || + (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") - headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail" - firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030 + headopt = jsub %iscall% c("head", "tail") + firstopt = jsub %iscall% c("first", "last") # fix for #2030 if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && (subopt || headopt || firstopt)) { if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) jvnames = sdvars - } else if (jsub[[1L]]=="lapply" && jsub[[2L]]==".SD" && length(xcols)) { + } else if (jsub %iscall% "lapply" && jsub[[2L]]==".SD" && length(xcols)) { deparse_ans = .massageSD(jsub) jsub = deparse_ans[[1L]] jvnames = deparse_ans[[2L]] - } else if (jsub[[1L]] == "c" && length(jsub) > 1L) { + } else if (jsub %iscall% "c" && length(jsub) > 1L) { # TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already. # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains @@ -1770,7 +1768,7 @@ replace_dot_alias = function(e) { GForce = FALSE } else { # Apply GForce - if (jsub[[1L]]=="list") { + if (jsub %iscall% "list") { GForce = TRUE for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { if (!.gforce_ok(jsub[[ii]], SDenv$.SDall)) {GForce = FALSE; break} @@ -1778,7 +1776,7 @@ replace_dot_alias = function(e) { } else GForce = .gforce_ok(jsub, SDenv$.SDall) if (GForce) { - if (jsub[[1L]]=="list") + if (jsub %iscall% "list") for (ii in seq_along(jsub)[-1L]) { if (is.N(jsub[[ii]])) next; # For #334 jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x) @@ -1796,7 +1794,7 @@ replace_dot_alias = function(e) { # Still do the old speedup for mean, for now nomeanopt=FALSE # to be set by .optmean() using <<- inside it oldjsub = jsub - if (jsub[[1L]]=="list") { + if (jsub %iscall% "list") { # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() # jsub[[1]]=="list" so the first item of todo will always be FALSE todo = sapply(jsub, `%iscall%`, 'mean') @@ -1804,7 +1802,7 @@ replace_dot_alias = function(e) { w = which(todo) jsub[w] = lapply(jsub[w], .optmean) } - } else if (jsub[[1L]]=="mean") { + } else if (jsub %iscall% "mean") { jsub = .optmean(jsub) } if (nomeanopt) { @@ -1884,7 +1882,7 @@ replace_dot_alias = function(e) { (q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 else 0 } - if (jsub[[1L]] == "list"){ + if (jsub %iscall% "list"){ q3 = max(sapply(jsub, headTail_arg)) } else if (length(jsub)==3L) { q3 = headTail_arg(jsub) @@ -1986,6 +1984,10 @@ replace_dot_alias = function(e) { setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } +# What's the name of the top-level call in 'j'? +# NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026). +root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else "" + DT = function(x, ...) { #4872 old = getOption("datatable.optimize") if (!is.data.table(x) && old>2L) { diff --git a/R/utils.R b/R/utils.R index ff4766d5e..cf23609ee 100644 --- a/R/utils.R +++ b/R/utils.R @@ -155,7 +155,8 @@ is_utc = function(tz) { `%iscall%` = function(e, f) { if (!is.call(e)) return(FALSE) if (is.name(e1 <- e[[1L]])) return(e1 %chin% f) - e1 %iscall% '::' && e1[[3L]] %chin% f + if (e1 %iscall% c('::', ':::')) return(e1[[3L]] %chin% f) + paste(deparse(e1), collapse = " ") %chin% f # complicated cases e.g. a closure/builtin on LHS of call; note that format() is much (e.g. 40x) slower than deparse() } # nocov start #593 always return a data.table diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 69866f1cd..a94781c09 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18994,3 +18994,11 @@ test(2276.14, fcase(c(TRUE, FALSE), 1L, c(TRUE, TRUE), NA), c(1L, NA_integer_)) # output is missing test(2276.15, fcase(c(TRUE, FALSE), NA_integer_, c(TRUE, TRUE), 2L), c(NA_integer_, 2L)) + +# passing a function in env= doesn't trip up processing 'j', #6026 +DT=data.table(a=1:2, b=3:4) +test(2277.1, DT[, builtin(b), env=list(builtin=sum)], 7L) +test(2277.2, DT[, closure(b), env=list(closure=var)], 0.5) +test(2277.3, DT[, closure(b), env=list(closure=stats::var)], 0.5) +test(2277.4, DT[, closure(b), env=list(closure=stats:::var)], 0.5) +test(2277.5, DT[, lambda(b), env=list(lambda=function(x) sum(x))], 7L)