From e665d2a89c3c8a81ccafd20f0fa7cd96bb905afd Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 5 Jan 2024 13:01:02 +0100 Subject: [PATCH] gshift cannot eval variables used in [ (#5548) * add fix for escaping gforce using [variables in function call * push * escape gshift * update * add fix * add test for coverage * add news * update shift tests * move tests * update tests * simplify tests * simplify * working version * add comments * update test info * add dropped DT * add raw tests * update tests * add more tests for nested jsub * add more tests * make qforce_ok more robust * update comments * simplify logical * remove comment since n is used * update eval environment * add Jans testcase * add helper functions * remove unused assignments * update test nums * escape evaluating values present int x * update eval of vars * add spaces * overwrite call * all.vars==0L and unique=FALSE * add comment about noCall_noVars * shorten switch Co-authored-by: Michael Chirico * add extra check to noCall_noVars * rename noCall_noVars * simplify switch * update match.call gweighted.mean * simplify * rename zip and name args * remove redundant switch() entry * deduplicate code * update g[_ok signature * whitespace suggestion * just check if 'give.names' in names * Change check= to check_singleton= * name argument * update check constantish * update NEWS item * standardize spelling * infix spacing --------- Co-authored-by: Benjamin Schwendinger <5801475-ben-schwen@users.noreply.gitlab.com> Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 6 ++-- R/data.table.R | 84 +++++++++++++++++++++++++++++++++---------- inst/tests/tests.Rraw | 63 +++++++++++++++++++++++++++++++- src/shift.c | 50 +++++++++++++------------- 4 files changed, 156 insertions(+), 47 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4f521bdd2..bc7f467cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -168,12 +168,12 @@ 28. `setkey()` now supports type `raw` as value columns (not as key columns), [#5100](https://github.com/Rdatatable/data.table/issues/5100). Thanks Hugh Parsonage for requesting, and Benjamin Schwendinger for the PR. -29. `shift()` is now optimised by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. +29. `shift()` is now optimized by group, [#1534](https://github.com/Rdatatable/data.table/issues/1534). Thanks to Gerhard Nachtmann for requesting, and Benjamin Schwendinger for the PR. Thanks to @neovom for testing dev and filing a bug report, [#5547](https://github.com/Rdatatable/data.table/issues/5547) which was fixed before release. This helped also in improving the logic for when to turn on optimization by group in general, making it more robust. ```R N = 1e7 DT = data.table(x=sample(N), y=sample(1e6,N,TRUE)) - shift_no_opt = shift # different name not optimised as a way to compare + shift_no_opt = shift # different name not optimized as a way to compare microbenchmark( DT[, c(NA, head(x,-1)), y], DT[, shift_no_opt(x, 1, type="lag"), y], @@ -263,7 +263,7 @@ # 2: 2 4 b ``` -34. `weighted.mean()` is now optimised by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. +34. `weighted.mean()` is now optimized by group, [#3977](https://github.com/Rdatatable/data.table/issues/3977). Thanks to @renkun-ken for requesting, and Benjamin Schwendinger for the PR. 35. `as.xts.data.table()` now supports non-numeric xts coredata matrixes, [5268](https://github.com/Rdatatable/data.table/issues/5268). Existing numeric only functionality is supported by a new `numeric.only` parameter, which defaults to `TRUE` for backward compatability and the most common use case. To convert non-numeric columns, set this parameter to `FALSE`. Conversions of `data.table` columns to a `matrix` now uses `data.table::as.matrix`, with all its performance benefits. Thanks to @ethanbsmith for the report and fix. diff --git a/R/data.table.R b/R/data.table.R index 76e6c4e0e..77fabf83b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1739,22 +1739,56 @@ replace_dot_alias = function(e) { GForce = FALSE } else { # Apply GForce + # GForce needs to evaluate all arguments not present in the data.table before calling C part #5547 + # Safe cases: variables [i], calls without variables [c(0,1), list(1)] # TODO extend this list + # Unsafe cases: functions containing variables [c(i), abs(i)], .N + is_constantish = function(expr, check_singleton=FALSE) { + if (!is.call(expr)) { + return(!dotN(expr)) + } + if (check_singleton) { + return(FALSE) + } + # calls are allowed <=> there's no SYMBOLs in the sub-AST + return(length(all.vars(expr, max.names=1L, unique=FALSE)) == 0L) + } + .gshift_ok = function(q) { + q = match.call(shift, q) + is_constantish(q[["n"]]) && + is_constantish(q[["fill"]]) && + is_constantish(q[["type"]]) && + !"give.names" %chin% names(q) + } + .ghead_ok = function(q) { + length(q) == 3L && + is_constantish(q[[3L]], check_singleton = TRUE) + } + `.g[_ok` = function(q, x) { + length(q) == 3L && + is_constantish(q[[3L]], check_singleton = TRUE) && + (q[[1L]] != "[[" || eval(call('is.atomic', q[[2L]]), envir=x)) && + eval(q[[3L]], parent.frame(3L)) > 0L + } + .gweighted.mean_ok = function(q, x) { #3977 + q = match.call(gweighted.mean, q) + is_constantish(q[["na.rm"]]) && + (is.null(q[["w"]]) || eval(call('is.numeric', q[["w"]]), envir=x)) + } .gforce_ok = function(q) { if (dotN(q)) return(TRUE) # For #334 # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD # is.symbol() is for #1369, #1974 and #2949 - if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) + if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 - if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) + if (length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na") && is_constantish(q[[3L]]))) return(TRUE) # ^^ base::startWith errors on NULL unfortunately - if (length(q)>=2L && q[[1L]] == "shift") { - q_named = match.call(shift, q) - if (!is.call(q_named[["fill"]]) && is.null(q_named[["give.names"]])) return(TRUE) - } - if (length(q)>=3L && q[[1L]] == "weighted.mean") return(TRUE) #3977 - # otherwise there must be three arguments - length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && - ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) + switch(as.character(q[[1L]]), + "shift" = .gshift_ok(q), + "weighted.mean" = .gweighted.mean_ok(q, x), + "tail" = , "head" = .ghead_ok(q), + "[[" = , "[" = `.g[_ok`(q, x), + FALSE + ) } if (jsub[[1L]]=="list") { GForce = TRUE @@ -1762,18 +1796,27 @@ replace_dot_alias = function(e) { if (!.gforce_ok(jsub[[ii]])) {GForce = FALSE; break} } } else GForce = .gforce_ok(jsub) + gforce_jsub = function(x, names_x) { + x[[1L]] = as.name(paste0("g", x[[1L]])) + # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok + # do not evaluate vars present as columns in x + if (length(x) >= 3L) { + for (i in 3:length(x)) { + if (is.symbol(x[[i]]) && !(x[[i]] %chin% names_x)) x[[i]] = eval(x[[i]], parent.frame(2L)) # tests 1187.2 & 1187.4 + } + } + x + } if (GForce) { if (jsub[[1L]]=="list") for (ii in seq_along(jsub)[-1L]) { if (dotN(jsub[[ii]])) next; # For #334 - jsub[[ii]][[1L]] = as.name(paste0("g", jsub[[ii]][[1L]])) - if (length(jsub[[ii]])>=3L && is.symbol(jsub[[ii]][[3L]]) && !(jsub[[ii]][[3L]] %chin% sdvars)) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 + jsub[[ii]] = gforce_jsub(jsub[[ii]], names_x) } else { # adding argument to ghead/gtail if none is supplied to g-optimized head/tail if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L - jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) - if (length(jsub)>=3L && is.symbol(jsub[[3L]]) && !(jsub[[3L]] %chin% sdvars)) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 + jsub = gforce_jsub(jsub, names_x) } if (verbose) catf("GForce optimized j to '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } else if (verbose) catf("GForce is on, left j unchanged\n"); @@ -1868,7 +1911,7 @@ replace_dot_alias = function(e) { if (!is.symbol(jsub)) { headTail_arg = function(q) { if (length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && - (q1 <- q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 + (q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 else 0 } if (jsub[[1L]] == "list"){ @@ -1882,6 +1925,11 @@ replace_dot_alias = function(e) { g = lapply(g, rep.int, times=grplens) } else if (.is_nrows(jsub)) { g = lapply(g, rep.int, times=len__) + # unpack list of lists for nrows functions + zip_items = function(ll) do.call(mapply, c(list(FUN = c), ll, SIMPLIFY=FALSE, USE.NAMES=FALSE)) + if (all(vapply_1b(ans, is.list))) { + ans = lapply(ans, zip_items) + } } ans = c(g, ans) } else { @@ -3000,13 +3048,13 @@ rleidv = function(x, cols=seq_along(x), prefix=NULL) { gfuns = c("[", "[[", "head", "tail", "first", "last", "sum", "mean", "prod", "median", "min", "max", "var", "sd", ".N", "shift", "weighted.mean") # added .N for #334 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here. -ghead = function(x, n) .Call(Cghead, x, as.integer(n)) # n is not used at the moment -gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) # n is not used at the moment +ghead = function(x, n) .Call(Cghead, x, as.integer(n)) +gtail = function(x, n) .Call(Cgtail, x, as.integer(n)) gfirst = function(x) .Call(Cgfirst, x) glast = function(x) .Call(Cglast, x) gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm) gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm) -gweighted.mean = function(x, w, na.rm=FALSE) { +gweighted.mean = function(x, w, ..., na.rm=FALSE) { if (missing(w)) gmean(x, na.rm) else { if (na.rm) { # take those indices out of the equation by setting them to 0 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f27abfcc0..cb74b3109 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6736,6 +6736,18 @@ test(1463.69, shift(x, -6, type="cyclic"), shift(x, -1, type="cyclic")) # test warning test(1463.70, shift(x, 1, fill=1, type="cyclic"), c(5L, 1L:4L), warning="Provided argument fill=1 will be ignored since type='shift'.") +# test raw #5547 +x=as.raw(1:5) +test(1463.71, shift(x,1L), as.raw(c(0L, 1:4))) +test(1463.72, shift(x,1:2), list(as.raw(c(0L, 1:4)), as.raw(c(0L, 0L, 1:3)))) +test(1463.73, shift(x,1L, fill=0L), as.raw(c(0L, 1:4))) +test(1463.74, shift(x,1L, type="lead"), as.raw(c(2:5, 0L))) +test(1463.75, shift(x,1:2, type="lead"), list(as.raw(c(2:5, 0L)), as.raw(c(3:5, 0L, 0L)))) +test(1463.76, shift(x,1L, fill=0L,type="lead"), as.raw(c(2:5, 0L))) +test(1463.77, shift(x,1L, type="cyclic"), as.raw(c(5, 1:4))) +test(1463.78, shift(x,1:2, type="cyclic"), list(as.raw(c(5, 1:4)), as.raw(c(4:5, 1:3)))) +test(1463.79, shift(x,-1L, type="cyclic"), as.raw(c(2:5, 1))) +test(1463.80, shift(x,-(1:2),type="cyclic"), list(as.raw(c(2:5, 1)), as.raw(c(3:5,1:2)))) # FR #686 DT = data.table(a=rep(c("A", "B", "C", "A", "B"), c(2,2,3,1,2)), foo=1:10) @@ -13628,7 +13640,8 @@ test(1963.07, shift(DT, -1:1), c(NA, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L))) ## some coverage tests for good measure test(1963.08, shift(DT$x, type = 'some_other_type'), error='should be one of.*lag.*lead') -test(1963.09, shift(as.raw(0:1)), error = "Type 'raw' is not supported") +test(1963.09, shift(as.raw(0:1)), as.raw(c(0,0))) +test(1963.095, shift(list(expression(1))), error = "Type 'expression' is not supported") test(1963.10, shift(DT, -1:1, type="shift", give.names = TRUE), # new type="shift" #3223 ans <- list(`x_shift_-1` = c(2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, NA), x_shift_0 = 1:10, @@ -17946,6 +17959,12 @@ test(2231.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) test(2231.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") test(2231.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# let wrongly named arguments get lost in ellipsis #5543 +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.61, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output="GForce optimized j to") +test(2231.62, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") +test(2231.63, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) options(old) # cols argument for unique.data.table, #5243 @@ -18184,3 +18203,45 @@ test(2241.14, r, data.table(id=1:2, x=c(5L,2L))) DT = data.table(a=1, b=2, c=3) test(2242.1, DT[, .SD, .SDcols=2-1], DT[, .(a)]) test(2242.2, DT[, .SD, .SDcols=length(DT)-1], DT[, .SD, .SDcols=2]) + +# turn off GForce where arguments are calls but still allow variables, #5547 +options(datatable.optimize=2L) +dt = data.table(x=c("a","b","c","d"), y=c(1L,1L,2L,2L)) +i = c(0,1) +j = 1L +t = "lead" +f = shift +test(2243.01, dt[, shift(x, i, type=t), by=y, verbose=TRUE], dt[, f(x, c(0,1), type="lead"), by=y], output="GForce optimized j to") +test(2243.02, dt[, shift(x, abs(c(0,1)), type=t), by=y, verbose=TRUE], dt[, f(x, c(0,1), type="lead"), by=y], output="GForce optimized j to") +test(2243.03, dt[, shift(x, abs(i), type=t), by=y, verbose=TRUE], dt[, f(x, c(0,1), type="lead"), by=y], output="GForce FALSE") +test(2243.04, dt[, shift(x, i, type=as.character(t)), by=y, verbose=TRUE], dt[, f(x, c(0,1), type="lead"), by=y], output="GForce FALSE") +test(2243.05, dt[, shift(x, i, type=t, fill=1), by=y, verbose=TRUE], dt[, f(x, c(0,1), type="lead", fill=1), by=y], output="GForce optimized j to") +test(2243.06, dt[, shift(x, i, type=t, fill=abs(1)), by=y, verbose=TRUE], dt[, f(x, c(0,1), type="lead", fill=1), by=y], output="GForce optimized j to") +test(2243.07, dt[, shift(x, i, type=t, fill=abs(j)), by=y, verbose=TRUE], dt[, f(x, c(0,1), type="lead", fill=1), by=y], output="GForce FALSE") +test(2243.08, dt[, .(shift(x, i, type=t)), by=y, verbose=TRUE], dt[, .(f(x, c(0,1), type="lead")), by=y], output="GForce optimized j to") +# GForce only var or call without vars as n of head/tail/"["(x, n) +dt = data.table(id=c(1L,1L,2L), v=1:3) +j = 1L +test(2243.11, dt[, head(v, j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(1L,3L)), output="GForce optimized j to") +test(2243.12, dt[, tail(v, j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(2L,3L)), output="GForce optimized j to") +test(2243.13, dt[, v[j], id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(1L,3L)), output="GForce optimized j to") +# GForce only var or call without vars as na.rm of sum, mean, median, prod, min, max, var +j = FALSE +test(2243.21, dt[, sum(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(3L,3L)), output="GForce optimized j to") +test(2243.22, dt[, mean(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(1.5,3)), output="GForce optimized j to") +test(2243.23, dt[, median(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(1.5,3)), output="GForce optimized j to") +test(2243.24, dt[, prod(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(2,3)), output="GForce optimized j to") +test(2243.25, dt[, min(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(1L,3L)), output="GForce optimized j to") +test(2243.26, dt[, max(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(2L,3L)), output="GForce optimized j to") +test(2243.27, dt[, var(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(0.5,NA)), output="GForce optimized j to") +test(2243.28, dt[, sd(v, na.rm=j), id, verbose=TRUE], data.table(id=c(1L,2L), V1=c(sqrt(0.5),NA)), output="GForce optimized j to") +dt = data.table(g=1:2, y=1:4) +j = TRUE +test(2243.31, dt[, sum(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(4L,6L)), output="GForce FALSE") +test(2243.32, dt[, mean(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,3)), output="GForce FALSE") +test(2243.33, dt[, median(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,3)), output="GForce FALSE") +test(2243.34, dt[, prod(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(3,8)), output="GForce FALSE") +test(2243.35, dt[, min(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(1L,2L)), output="GForce FALSE") +test(2243.36, dt[, max(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(3L,4L)), output="GForce FALSE") +test(2243.37, dt[, var(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,2)), output="GForce FALSE") +test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(sqrt(c(2,2)))), output="GForce FALSE") diff --git a/src/shift.c b/src/shift.c index 30c13a547..e3e4a2b82 100644 --- a/src/shift.c +++ b/src/shift.c @@ -42,11 +42,11 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) R_xlen_t xrows = xlength(elem); SEXP thisfill = PROTECT(coerceAs(fill, elem, ScalarLogical(0))); nprotect++; // #4865 use coerceAs for type coercion switch (TYPEOF(elem)) { - case INTSXP : { + case INTSXP: case LGLSXP: { const int ifill = INTEGER(thisfill)[0]; for (int j=0; j= 0) || (stype == LEAD && kd[j] < 0)) { - if (tailk > 0) memmove(ltmp+thisk, lelem, tailk*size); - if (cycle) { - if (thisk > 0) memmove(ltmp, lelem+tailk, thisk*size); - } else for (int m=0; m 0) memmove(ltmp, lelem+thisk, tailk*size); - if (cycle) { - if (thisk > 0) memmove(ltmp+tailk, lelem, thisk*size); - } else for (int m=tailk; m= 0) || (stype == LEAD && kd[j] < 0)) { + if (tailk > 0) memmove(dtmp+thisk, delem, tailk*size); + if (cycle) { + if (thisk > 0) memmove(dtmp, delem+tailk, thisk*size); + } else for (int m=0; m 0) memmove(dtmp, delem+thisk, tailk*size); + if (cycle) { + if (thisk > 0) memmove(dtmp+tailk, delem, thisk*size); + } else for (int m=tailk; m