Skip to content

Commit

Permalink
Use deparse() to cast calls to string when needed (#6027)
Browse files Browse the repository at this point in the history
* Don't cast jsub[[1]] to character if invalid

* Fix for case of lambda in j

* comment on why '(' handling is needed

* switch to format() in progress

* fix %iscall%

* more fixes

* var in earlier test was masking stats::var

* NEWS

* Use deparse() directly to avoid tiny overhead & make it easier to find deparse1() calls later

* discourage f=sum, encourage f="sum"

* Also catch pkg:::fun

* test for ':::' too
  • Loading branch information
MichaelChirico authored Jul 29, 2024
1 parent 680b5e8 commit 0a25b42
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 20 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@

16. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report.

17. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix.

## NOTES

1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1.
Expand Down
40 changes: 21 additions & 19 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ replace_dot_alias = function(e) {
}
if (!missing(j)) {
jsub = replace_dot_alias(jsub)
root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
root = root_name(jsub)
if (root == ":" ||
(root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') ||
( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) &&
Expand Down Expand Up @@ -285,7 +285,7 @@ replace_dot_alias = function(e) {
if (root=="{") {
if (length(jsub) == 2L) {
jsub = jsub[[2L]] # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376]
root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else ""
root = root_name(jsub)
} else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") {
#2142 -- j can be {} and have length 1
stopf("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}")
Expand All @@ -298,10 +298,8 @@ replace_dot_alias = function(e) {
jsub = eval(jsub[[2L]], parent.frame(), parent.frame()) # this evals the symbol to return the dynamic expression
if (is.expression(jsub)) jsub = jsub[[1L]] # if expression, convert it to call
# Note that the dynamic expression could now be := (new in v1.9.7)
root = if (is.call(jsub)) {
jsub = replace_dot_alias(jsub)
as.character(jsub[[1L]])[1L]
} else ""
jsub = replace_dot_alias(jsub)
root = root_name(jsub)
}
if (root == ":=" || root == "let") { # let(...) as alias for :=(...) (#3795)
if (root == "let")
Expand Down Expand Up @@ -1401,7 +1399,7 @@ replace_dot_alias = function(e) {
.Call(Cassign,x,irows,cols,newnames,jval)
return(suppPrint(x))
}
if ((is.call(jsub) && jsub[[1L]] != "get" && is.list(jval) && !is.object(jval)) || !missingby) {
if ((is.call(jsub) && !jsub %iscall% "get" && is.list(jval) && !is.object(jval)) || !missingby) {
# is.call: selecting from a list column should return list
# is.object: for test 168 and 168.1 (S4 object result from ggplot2::qplot). Just plain list results should result in data.table

Expand Down Expand Up @@ -1647,25 +1645,25 @@ replace_dot_alias = function(e) {
jsub = as.call(c(quote(list), lapply(sdvars, as.name)))
jvnames = sdvars
}
} else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with <jsub[[1L]] == >
} else if (is.name(jsub[[1L]])) { # Else expect problems with <jsub[[1L]] == >
# g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612
subopt = length(jsub) == 3L &&
(jsub[[1L]] == "[" ||
(jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) &&
(jsub %iscall% "[" ||
(jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) &&
(is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N")
headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail"
firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030
headopt = jsub %iscall% c("head", "tail")
firstopt = jsub %iscall% c("first", "last") # fix for #2030
if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") &&
(subopt || headopt || firstopt)) {
if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub })))
jvnames = sdvars
} else if (jsub[[1L]]=="lapply" && jsub[[2L]]==".SD" && length(xcols)) {
} else if (jsub %iscall% "lapply" && jsub[[2L]]==".SD" && length(xcols)) {
deparse_ans = .massageSD(jsub)
jsub = deparse_ans[[1L]]
jvnames = deparse_ans[[2L]]
} else if (jsub[[1L]] == "c" && length(jsub) > 1L) {
} else if (jsub %iscall% "c" && length(jsub) > 1L) {
# TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already.
# FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here.
# FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains
Expand Down Expand Up @@ -1770,15 +1768,15 @@ replace_dot_alias = function(e) {
GForce = FALSE
} else {
# Apply GForce
if (jsub[[1L]]=="list") {
if (jsub %iscall% "list") {
GForce = TRUE
for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) {
if (!.gforce_ok(jsub[[ii]], SDenv$.SDall)) {GForce = FALSE; break}
}
} else
GForce = .gforce_ok(jsub, SDenv$.SDall)
if (GForce) {
if (jsub[[1L]]=="list")
if (jsub %iscall% "list")
for (ii in seq_along(jsub)[-1L]) {
if (is.N(jsub[[ii]])) next; # For #334
jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x)
Expand All @@ -1796,15 +1794,15 @@ replace_dot_alias = function(e) {
# Still do the old speedup for mean, for now
nomeanopt=FALSE # to be set by .optmean() using <<- inside it
oldjsub = jsub
if (jsub[[1L]]=="list") {
if (jsub %iscall% "list") {
# Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if()
# jsub[[1]]=="list" so the first item of todo will always be FALSE
todo = sapply(jsub, `%iscall%`, 'mean')
if (any(todo)) {
w = which(todo)
jsub[w] = lapply(jsub[w], .optmean)
}
} else if (jsub[[1L]]=="mean") {
} else if (jsub %iscall% "mean") {
jsub = .optmean(jsub)
}
if (nomeanopt) {
Expand Down Expand Up @@ -1884,7 +1882,7 @@ replace_dot_alias = function(e) {
(q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3
else 0
}
if (jsub[[1L]] == "list"){
if (jsub %iscall% "list"){
q3 = max(sapply(jsub, headTail_arg))
} else if (length(jsub)==3L) {
q3 = headTail_arg(jsub)
Expand Down Expand Up @@ -1986,6 +1984,10 @@ replace_dot_alias = function(e) {
setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line
}

# What's the name of the top-level call in 'j'?
# NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026).
root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else ""

DT = function(x, ...) { #4872
old = getOption("datatable.optimize")
if (!is.data.table(x) && old>2L) {
Expand Down
3 changes: 2 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@ is_utc = function(tz) {
`%iscall%` = function(e, f) {
if (!is.call(e)) return(FALSE)
if (is.name(e1 <- e[[1L]])) return(e1 %chin% f)
e1 %iscall% '::' && e1[[3L]] %chin% f
if (e1 %iscall% c('::', ':::')) return(e1[[3L]] %chin% f)
paste(deparse(e1), collapse = " ") %chin% f # complicated cases e.g. a closure/builtin on LHS of call; note that format() is much (e.g. 40x) slower than deparse()
}

# nocov start #593 always return a data.table
Expand Down
8 changes: 8 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -18994,3 +18994,11 @@ test(2276.14, fcase(c(TRUE, FALSE), 1L, c(TRUE, TRUE), NA), c(1L, NA_integer_))

# output is missing
test(2276.15, fcase(c(TRUE, FALSE), NA_integer_, c(TRUE, TRUE), 2L), c(NA_integer_, 2L))

# passing a function in env= doesn't trip up processing 'j', #6026
DT=data.table(a=1:2, b=3:4)
test(2277.1, DT[, builtin(b), env=list(builtin=sum)], 7L)
test(2277.2, DT[, closure(b), env=list(closure=var)], 0.5)
test(2277.3, DT[, closure(b), env=list(closure=stats::var)], 0.5)
test(2277.4, DT[, closure(b), env=list(closure=stats:::var)], 0.5)
test(2277.5, DT[, lambda(b), env=list(lambda=function(x) sum(x))], 7L)

0 comments on commit 0a25b42

Please sign in to comment.