From 87d12c204b36654e95507e781c1ec83864bcb2b2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 22 Feb 2024 08:58:31 -0800 Subject: [PATCH] GForce works with data.table::shift (#5944) * GForce works with data.table::shift * use if+implicit else in return for concision * better comment --- NEWS.md | 2 ++ R/data.table.R | 26 +++++++++++++++++++------- inst/tests/tests.Rraw | 8 ++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7ec502902..359fc38ad 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,8 @@ 2. `cedta()` now returns `FALSE` if `.datatable.aware = FALSE` is set in the calling environment, [#5654](https://github.com/Rdatatable/data.table/issues/5654). +3. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/data.table.R b/R/data.table.R index 6368df4eb..de5a73533 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1783,15 +1783,25 @@ replace_dot_alias = function(e) { is_constantish(q[["na.rm"]]) && (is.null(q[["w"]]) || eval(call('is.numeric', q[["w"]]), envir=x)) } + # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD + .get_gcall = function(q) { + if (!is.call(q)) return(NULL) + # is.symbol() is for #1369, #1974 and #2949 + if (!is.symbol(q[[2L]])) return(NULL) + q1 <- q[[1L]] + if (is.symbol(q1)) return(if (q1 %chin% gfuns) q1) + if (!q1 %iscall% "::") return(NULL) + if (q1[[2L]] != "data.table") return(NULL) + return(if (q1[[3L]] %chin% gdtfuns) q1[[3L]]) + } .gforce_ok = function(q) { if (dotN(q)) return(TRUE) # For #334 - # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD - # is.symbol() is for #1369, #1974 and #2949 - if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q[[1L]]) %chin% gfuns)) return(FALSE) + q1 <- .get_gcall(q) + if (is.null(q1)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 if (length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na") && is_constantish(q[[3L]]))) return(TRUE) # ^^ base::startWith errors on NULL unfortunately - switch(as.character(q[[1L]]), + switch(as.character(q1), "shift" = .gshift_ok(q), "weighted.mean" = .gweighted.mean_ok(q, x), "tail" = , "head" = .ghead_ok(q), @@ -1806,7 +1816,8 @@ replace_dot_alias = function(e) { } } else GForce = .gforce_ok(jsub) gforce_jsub = function(x, names_x) { - x[[1L]] = as.name(paste0("g", x[[1L]])) + call_name <- if (is.symbol(x[[1L]])) x[[1L]] else x[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. + x[[1L]] = as.name(paste0("g", call_name)) # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok # do not evaluate vars present as columns in x if (length(x) >= 3L) { @@ -3040,8 +3051,9 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { # (1) add it to gfuns # (2) edit .gforce_ok (defined within `[`) to catch which j will apply the new function # (3) define the gfun = function() R wrapper -gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", - "median", "min", "max", "var", "sd", ".N", "shift", "weighted.mean") # added .N for #334 +gdtfuns = c("first", "last", "shift") # exported by data.table, not generic, thus also accept data.table:: form under GForce, #5942. +gfuns = c(gdtfuns, + "[", "[[", "head", "tail", "sum", "mean", "prod", "median", "min", "max", "var", "sd", ".N", "weighted.mean") # added .N for #334 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. ghead = function(x, n) .Call(Cghead, x, as.integer(n)) gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7e64bdc1a..9d68ff5d8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18278,3 +18278,11 @@ test(2245.1, dt[1], data.table(foo = 1L, bar = 4L)) # Single index should be int test(2245.2, copy(dt[1]), data.table(foo = 1:3)) # Revert to data.frame syntax: Interpret index as column rm(.datatable.aware) test(2245.3, dt[1], data.table(foo = 1L, bar = 4L)) # Default in this environment should be datatable-aware + +# data.table:: doesn't turn off GForce, #5942 +DT = data.table(a = rep(1:5, 2L), b = 1:10) +old = options(datatable.optimize=Inf, datatable.verbose=TRUE) +test(2246.1, DT[, data.table::shift(b), by=a], DT[, shift(b), by=a], output="GForce TRUE") +test(2246.2, DT[, data.table::first(b), by=a], DT[, first(b), by=a], output="GForce TRUE") +test(2246.3, DT[, data.table::last(b), by=a], DT[, last(b), by=a], output="GForce TRUE") +options(old)