Skip to content

Commit

Permalink
Refactor GForce helpers outside of [ (#5954)
Browse files Browse the repository at this point in the history
This is mainly to facilitate debugging, and also keep the dependencies of each function clearer (rather than implicitly inheriting from the [.data.table environment).

This feeds in to #5953.

Not a _pure_ copy-paste, note the following minor changes:

 - Use argument name `q` consistently. Particularly important for `.gforce_jsub` where argument `x` is confusing next to argument `names_x` which has nothing to do with `x` in this environment, but the parent environment.
 - Consistently evaluate inside `SDenv$.SDall`, from which we also pull names (at HEAD, we evaluate inside `x` while pulling names from `.SDenv$.SDall`)
 - Rename `gforce_jsub` -> `.gforce_jsub` for consistency with `.gforce_ok`
 - Rename `dotN` to `is.N` to convey its boolean nature
  • Loading branch information
MichaelChirico authored Feb 23, 2024
1 parent 7057795 commit 82f559f
Showing 1 changed file with 83 additions and 80 deletions.
163 changes: 83 additions & 80 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1734,7 +1734,6 @@ replace_dot_alias = function(e) {
else
catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L))
}
dotN = function(x) is.name(x) && x==".N" # For #334. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character?
# FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with
# nomatch=NULL even now.. but not switching it on yet, will deal it separately.
if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) {
Expand All @@ -1748,95 +1747,23 @@ replace_dot_alias = function(e) {
GForce = FALSE
} else {
# Apply GForce
# GForce needs to evaluate all arguments not present in the data.table before calling C part #5547
# Safe cases: variables [i], calls without variables [c(0,1), list(1)] # TODO extend this list
# Unsafe cases: functions containing variables [c(i), abs(i)], .N
is_constantish = function(expr, check_singleton=FALSE) {
if (!is.call(expr)) {
return(!dotN(expr))
}
if (check_singleton) {
return(FALSE)
}
# calls are allowed <=> there's no SYMBOLs in the sub-AST
return(length(all.vars(expr, max.names=1L, unique=FALSE)) == 0L)
}
.gshift_ok = function(q) {
q = match.call(shift, q)
is_constantish(q[["n"]]) &&
is_constantish(q[["fill"]]) &&
is_constantish(q[["type"]]) &&
!"give.names" %chin% names(q)
}
.ghead_ok = function(q) {
length(q) == 3L &&
is_constantish(q[[3L]], check_singleton = TRUE)
}
`.g[_ok` = function(q, x) {
length(q) == 3L &&
is_constantish(q[[3L]], check_singleton = TRUE) &&
(q[[1L]] != "[[" || eval(call('is.atomic', q[[2L]]), envir=x)) &&
!(as.character(q[[3L]]) %chin% names_x) && is.numeric(q3<-eval(q[[3L]], parent.frame(3L))) && length(q3)==1L && q3>0L
}
.gweighted.mean_ok = function(q, x) { #3977
q = match.call(gweighted.mean, q)
is_constantish(q[["na.rm"]]) &&
(is.null(q[["w"]]) || eval(call('is.numeric', q[["w"]]), envir=x))
}
# run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD
.get_gcall = function(q) {
if (!is.call(q)) return(NULL)
# is.symbol() is for #1369, #1974 and #2949
if (!is.symbol(q[[2L]])) return(NULL)
q1 <- q[[1L]]
if (is.symbol(q1)) return(if (q1 %chin% gfuns) q1)
if (!q1 %iscall% "::") return(NULL)
if (q1[[2L]] != "data.table") return(NULL)
return(if (q1[[3L]] %chin% gdtfuns) q1[[3L]])
}
.gforce_ok = function(q) {
if (dotN(q)) return(TRUE) # For #334
q1 <- .get_gcall(q)
if (is.null(q1)) return(FALSE)
if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875
if (length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na") && is_constantish(q[[3L]]))) return(TRUE)
# ^^ base::startWith errors on NULL unfortunately
switch(as.character(q1),
"shift" = .gshift_ok(q),
"weighted.mean" = .gweighted.mean_ok(q, x),
"tail" = , "head" = .ghead_ok(q),
"[[" = , "[" = `.g[_ok`(q, x),
FALSE
)
}
if (jsub[[1L]]=="list") {
GForce = TRUE
for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) {
if (!.gforce_ok(jsub[[ii]])) {GForce = FALSE; break}
}
} else GForce = .gforce_ok(jsub)
gforce_jsub = function(x, names_x) {
call_name <- if (is.symbol(x[[1L]])) x[[1L]] else x[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work.
x[[1L]] = as.name(paste0("g", call_name))
# gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok
# do not evaluate vars present as columns in x
if (length(x) >= 3L) {
for (i in 3:length(x)) {
if (is.symbol(x[[i]]) && !(x[[i]] %chin% names_x)) x[[i]] = eval(x[[i]], parent.frame(2L)) # tests 1187.2 & 1187.4
}
if (!.gforce_ok(jsub[[ii]], SDenv$.SDall)) {GForce = FALSE; break}
}
x
}
} else
GForce = .gforce_ok(jsub, SDenv$.SDall)
if (GForce) {
if (jsub[[1L]]=="list")
for (ii in seq_along(jsub)[-1L]) {
if (dotN(jsub[[ii]])) next; # For #334
jsub[[ii]] = gforce_jsub(jsub[[ii]], names_x)
if (is.N(jsub[[ii]])) next; # For #334
jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x)
}
else {
# adding argument to ghead/gtail if none is supplied to g-optimized head/tail
if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L
jsub = gforce_jsub(jsub, names_x)
jsub = .gforce_jsub(jsub, names_x)
}
if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L))
} else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n");
Expand All @@ -1847,7 +1774,7 @@ replace_dot_alias = function(e) {
nomeanopt=FALSE # to be set by .optmean() using <<- inside it
oldjsub = jsub
if (jsub[[1L]]=="list") {
# Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been dotN() and/or the for-looped if()
# Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if()
# jsub[[1]]=="list" so the first item of todo will always be FALSE
todo = sapply(jsub, `%iscall%`, 'mean')
if (any(todo)) {
Expand Down Expand Up @@ -3085,6 +3012,82 @@ gshift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift", "cyclic")) {
}
gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows)

# GForce needs to evaluate all arguments not present in the data.table before calling C part #5547
# Safe cases: variables [i], calls without variables [c(0,1), list(1)] # TODO extend this list
# Unsafe cases: functions containing variables [c(i), abs(i)], .N
is.N = function(q) is.name(q) && q==".N" # For #334. TODO: Rprof() showed is.N() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character?
is_constantish = function(q, check_singleton=FALSE) {
if (!is.call(q)) {
return(!is.N(q))
}
if (check_singleton) {
return(FALSE)
}
# calls are allowed <=> there's no SYMBOLs in the sub-AST
return(length(all.vars(q, max.names=1L, unique=FALSE)) == 0L)
}
.gshift_ok = function(q) {
q = match.call(shift, q)
is_constantish(q[["n"]]) &&
is_constantish(q[["fill"]]) &&
is_constantish(q[["type"]]) &&
!"give.names" %chin% names(q)
}
.ghead_ok = function(q) {
length(q) == 3L &&
is_constantish(q[[3L]], check_singleton = TRUE)
}
`.g[_ok` = function(q, x) {
length(q) == 3L &&
is_constantish(q[[3L]], check_singleton = TRUE) &&
(q[[1L]] != "[[" || eval(call('is.atomic', q[[2L]]), envir=x)) &&
!(as.character(q[[3L]]) %chin% names(x)) && is.numeric(q3 <- eval(q[[3L]], parent.frame(3L))) && length(q3)==1L && q3>0L
}
.gweighted.mean_ok = function(q, x) { #3977
q = match.call(gweighted.mean, q)
is_constantish(q[["na.rm"]]) &&
(is.null(q[["w"]]) || eval(call('is.numeric', q[["w"]]), envir=x))
}
# run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD
.get_gcall = function(q) {
if (!is.call(q)) return(NULL)
# is.symbol() is for #1369, #1974 and #2949
if (!is.symbol(q[[2L]])) return(NULL)
q1 = q[[1L]]
if (is.symbol(q1)) return(if (q1 %chin% gfuns) q1)
if (!q1 %iscall% "::") return(NULL)
if (q1[[2L]] != "data.table") return(NULL)
return(if (q1[[3L]] %chin% gdtfuns) q1[[3L]])
}
.gforce_ok = function(q, x) {
if (is.N(q)) return(TRUE) # For #334
q1 = .get_gcall(q)
if (is.null(q1)) return(FALSE)
if (!(q2 <- q[[2L]]) %chin% names(x) && q2 != ".I") return(FALSE) # 875
if (length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na") && is_constantish(q[[3L]]))) return(TRUE)
# ^^ base::startWith errors on NULL unfortunately
switch(as.character(q1),
"shift" = .gshift_ok(q),
"weighted.mean" = .gweighted.mean_ok(q, x),
"tail" = , "head" = .ghead_ok(q),
"[[" = , "[" = `.g[_ok`(q, x),
FALSE
)
}

.gforce_jsub = function(q, names_x) {
call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work.
q[[1L]] = as.name(paste0("g", call_name))
# gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok
# do not evaluate vars present as columns in x
if (length(q) >= 3L) {
for (i in 3:length(q)) {
if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], parent.frame(2L)) # tests 1187.2 & 1187.4
}
}
q
}

.prepareFastSubset = function(isub, x, enclos, notjoin, verbose = FALSE){
## helper that decides, whether a fast binary search can be performed, if i is a call
## For details on the supported queries, see \code{\link{datatable-optimize}}
Expand Down

0 comments on commit 82f559f

Please sign in to comment.