Rdatatable · MichaelChirico · Jul 28, 2024 · Apr 18, 2020 · Apr 18, 2020 · Apr 18, 2020
@@ -126,6 +126,46 @@
 
 22. C code was unified more in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter.
 
+22. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions.
+
+Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above.
+
+Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown.
+
+```r
+d2 = data.table(id=2:1, v2=1:2)
+setindexv(d2, "id")
+str(attr(attr(d2, "index"), "__id"))
+# int [1:2] 2 1
+# - attr(*, "starts")= int [1:2] 1 2
+# - attr(*, "maxgrpn")= int 1
+# - attr(*, "anyna")= int 0
+# - attr(*, "anyinfnan")= int 0
+# - attr(*, "anynotascii")= int 0
+# - attr(*, "anynotutf8")= int 0
+
+d2 = data.table(id=2:1, v2=1:2)
+invisible(d2[id==1L])
+str(attr(attr(d2, "index"), "__id"))
+# int [1:2] 2 1
+```
+
+This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code.
+
+```r
+d1 = data.table(id=1:2, v1=1:2)
+d2 = data.table(id=2:1, v2=1:2)
+setindexv(d2, "id")
+d1[d2, on="id", verbose=TRUE]
+#...
+#Starting bmerge ...
+#forderReuseSorting: using existing index: __id
+#forderReuseSorting: opt=2, took 0.000s
+#...
+```
+
+This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing.
+
 ## TRANSLATIONS
 
 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix.

@@ -118,9 +118,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
     }
   }
 
-  ## after all modifications of i, check if i has a proper key on all icols
-  io = identical(icols, head(chmatch(key(i), names(i)), length(icols)))
-
   ## after all modifications of x, check if x has a proper key on all xcols.
   ## If not, calculate the order. Also for non-equi joins, the order must be calculated.
   non_equi = which.first(ops != 1L) # 1 is "==" operator
@@ -180,7 +177,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   }
 
   if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()}
-  ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
+  ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
   if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()}
   # TO DO: xo could be moved inside Cbmerge
 

@@ -3241,13 +3241,13 @@ is_constantish = function(q, check_singleton=FALSE) {
   if (is.null(idx)){
     ## if nothing else helped, auto create a new index that can be used
     if (!getOption("datatable.auto.index")) return(NULL)
-    if (verbose) {catf("Creating new index '%s'\n", paste(names(i), collapse = "__"));flush.console()}
-    if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(names(i), collapse = "__"));flush.console()}
-    setindexv(x, names(i))
-    if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()}
-    if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(names(i), collapse = "__"));flush.console()}
-    idx = attr(attr(x, "index", exact=TRUE), paste("__", names(i), collapse = ""), exact=TRUE)
     idxCols = names(i)
+    if (verbose) {catf("Creating new index '%s'\n", paste(idxCols, collapse = "__"));flush.console()}
+    if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(idxCols, collapse = "__"));flush.console()}
+    idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, reuseSorting=TRUE)
+    maybe_reset_index(x, idx, idxCols) ## forder can write index, but disabled for now, see #4386
+    if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()}
+    if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(idxCols, collapse = "__"));flush.console()}
   }
   if(!is.null(idxCols)){
     setkeyv(i, idxCols)

@@ -50,23 +50,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
   miss = !(cols %chin% colnames(x))
   if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss]))
 
-  ## determine, whether key is already present:
-  if (identical(key(x),cols)) {
-    if (!physical) {
-      ## create index as integer() because already sorted by those columns
-      if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
-      setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer())
-    }
-    return(invisible(x))
-  } else if(identical(head(key(x), length(cols)), cols)){
-    if (!physical) {
-      ## create index as integer() because already sorted by those columns
-      if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
-      setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer())
-    } else {
-      ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key.
-      setattr(x,"sorted",cols)
-    }
+  if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387
+    ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key.
+    setattr(x,"sorted",cols)
     return(invisible(x))
   }
 
@@ -77,26 +63,20 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
   }
   if (!is.character(cols) || length(cols)<1L) stopf("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
 
-  newkey = paste(cols, collapse="__")
-  if (!any(indices(x) == newkey)) {
-    if (verbose) {
-      tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
-      # suppress needed for tests 644 and 645 in verbose mode
-      catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"])
-    } else {
-      o = forderv(x, cols, sort=TRUE, retGrp=FALSE)
-    }
+  if (verbose) {
+    # we now also retGrp=TRUE #4387 for !physical
+    tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE)))  # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
+    # suppress needed for tests 644 and 645 in verbose mode
+    catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"])
   } else {
-    if (verbose) catf("setkey on columns %s using existing index '%s'\n", brackify(cols), newkey)
-    o = getindex(x, newkey)
+    o = forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE)
   }
-  if (!physical) {
-    if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
-    setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o)
+  if (!physical) { # index COULD BE saved from C forderReuseSorting already, but disabled for now
+    maybe_reset_index(x, o, cols)
     return(invisible(x))
   }
-  setattr(x,"index",NULL)   # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear.
   if (length(o)) {
+    setattr(x,"index",NULL)   # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. Only when order changes.
     if (verbose) { last.started.at = proc.time() }
     .Call(Creorder,x,o)
     if (verbose) { catf("reorder took %s\n", timetaken(last.started.at)); flush.console() }
@@ -124,7 +104,7 @@ getindex = function(x, name) {
   if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) {
     stopf("Internal error: index '%s' exists but is invalid", name)   # nocov
   }
-  ans
+  c(ans) ## drop starts and maxgrpn attributes
 }
 
 haskey = function(x) !is.null(key(x))
@@ -160,19 +140,24 @@ is.sorted = function(x, by=NULL) {
   # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1)
 }
 
+maybe_reset_index = function(x, idx, cols) {
+  if (isTRUE(getOption("datatable.forder.auto.index"))) return(invisible())
+  if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
+  setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), idx)
+  invisible(x)
+}
+
 ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character')
-forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE)
-{
+forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, reuseSorting=getOption("datatable.reuse.sorting", NA)) {
   if (is.atomic(x) || is.null(x)) {  # including forderv(NULL) which returns error consistent with base::order(NULL),
     if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense")
     by = NULL
   } else {
     if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L))
     by = colnamesInt(x, by, check_dups=FALSE)
-    if (length(order) == 1L) order = rep(order, length(by))
   }
   order = as.integer(order) # length and contents of order being +1/-1 is checked at C level
-  .Call(Cforder, x, by, retGrp, sort, order, na.last)  # returns integer() if already sorted, regardless of sort=TRUE|FALSE
+  .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting)  # returns integer() if already sorted, regardless of sort=TRUE|FALSE
 }
 
 forder = function(..., na.last=TRUE, decreasing=FALSE)
@@ -209,7 +194,7 @@ forder = function(..., na.last=TRUE, decreasing=FALSE)
     data = eval(sub, parent.frame(), parent.frame())
   }
   stopifnot(isTRUEorFALSE(decreasing))
-  o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order= if (decreasing) -asc else asc, na.last)
+  o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=if (decreasing) -asc else asc, na.last=na.last)
   if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o
   o
 }