From 208e4e15f2c5660aeccdcad4b4b2fdfafb8df588 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 17:24:08 +0000
Subject: [PATCH] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list
---
 R/mergelist.R             | 110 +++++++++++++++++++++
 R/onLoad.R                |   1 +
 inst/tests/mergelist.Rraw | 194 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 305 insertions(+)

diff --git a/R/mergelist.R b/R/mergelist.R
index 435ee0a60..52ce68493 100644
--- a/R/mergelist.R
+++ b/R/mergelist.R
@@ -125,6 +125,116 @@ dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
   return(list(ans=ans, irows=irows, xrows=xrows))
 }
 
+# mergepair: atomic join between two tables 'lhs' and 'rhs'; 'how' selects the join type (inner, left, right, full, semi, anti, cross), 'on' the join columns (taken from table keys when NULL), 'lhs.cols'/'rhs.cols' the columns to carry over, 'copy' whether an output side that still shares columns with its input gets copied; the result is passed through setDT()
+mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=names(rhs), copy=TRUE, join.many=TRUE, verbose=FALSE) {
+  semianti = how=="semi" || how=="anti" ## these joins take no columns from the join-to table, see 'make x side' below
+  innerfull = how=="inner" || how=="full" ## these joins apply 'mult' to both tables, see below
+  {
+    if (how!="cross") {
+      if (is.null(on)) {
+        if (how=="left" || semianti) on = key(rhs)
+        else if (how=="right") on = key(lhs)
+        else if (innerfull) on = onkeys(key(lhs), key(rhs))
+        if (is.null(on))
+          stopf("'on' is missing and necessary key is not present")
+      }
+      if (any(bad.on <- !on %chin% names(lhs)))
+        stopf("'on' argument specify columns to join [%s] that are not present in LHS table [%s]", brackify(on[bad.on]), brackify(names(lhs)))
+      if (any(bad.on <- !on %chin% names(rhs)))
+        stopf("'on' argument specify columns to join [%s] that are not present in RHS table [%s]", brackify(on[bad.on]), brackify(names(rhs)))
+    } else if (is.null(on)) {
+      on = character() ## cross join only
+    }
+  } ## resolve 'on' (from keys when missing) and validate it against both tables
+  {
+    if (how!="right") {
+      jnfm = lhs; fm.cols = lhs.cols; jnto = rhs; to.cols = rhs.cols
+    } else {
+      jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols
+    }
+  } ## join-from (jnfm) and join-to (jnto) tables and their columns, swapped for right outer join
+
+  ## ensure symmetric join for inner|full join: apply mult on both tables, because bmerge does it only for the 'x' table
+  cp.i = FALSE ## copy marker of out.i: TRUE once out.i no longer shares columns with its input
+  if ((innerfull) && !is.null(mult) && (mult=="first" || mult=="last")) {
+    jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE) ## might not copy when already unique by 'on'
+    cp.i = nrow(jnfm)!=nrow(lhs) ## nrow(lhs) bc how='inner|full' so jnfm=lhs
+  } else if (how=="inner" && (is.null(mult) || mult=="error")) { ## this branch runs only to raise the error from bmerge; we cannot use forder to just find duplicates because those duplicates might not have matching rows in the other table; full join checks mult='error' during two non-void bmerges
+    dtmerge(x=jnfm, i=jnto, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many, void=TRUE) ## return value unused, called only for its potential error
+  }
+
+  ## binary merge: jnto plays the role of 'x', jnfm the role of 'i'
+  ans = dtmerge(x=jnto, i=jnfm, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many)
+
+  ## make i side: shallow copy when no row subset is needed (NULL irows), otherwise subset rows by irows
+  out.i = if (is.null(ans$irows))
+    .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on, retain.order=semianti), retain.key=TRUE)
+  else
+    .Call(CsubsetDT, jnfm, ans$irows, someCols(jnfm, fm.cols, keep=on, retain.order=semianti))
+  cp.i = cp.i || !is.null(ans$irows) ## non-NULL irows means out.i came from CsubsetDT rather than .shallow
+
+  ## make x side
+  if (semianti) {
+    out.x = list(); cp.x = TRUE ## semi|anti join: nothing taken from the join-to table
+  } else {
+    out.x = if (is.null(ans$xrows)) ## as of now xrows cannot be NULL #4409 thus nocov below
+      internal_error("dtmerge()$xrows returned NULL, #4409 been resolved but related code has not been updated?") #.shallow(jnto, cols=someCols(jnto, to.cols, drop=on), retain.key=TRUE) # nocov ## as of now nocov does not make difference r-lib/covr#279
+    else
+      .Call(CsubsetDT, jnto, ans$xrows, someCols(jnto, to.cols, drop=on))
+    cp.x = !is.null(ans$xrows)
+    ## ensure no duplicated column names in merge results
+    if (any(dup.i<-names(out.i) %chin% names(out.x)))
+      stopf("merge result has duplicated column names, use 'cols' argument or rename columns in 'l' tables, duplicated column(s): %s", brackify(names(out.i)[dup.i]))
+  }
+
+  ## stack i and x side by side
+  if (how!="full") {
+    if (!cp.i && copy) out.i = copy(out.i) ## copy only when requested and out.i is not marked as copied yet
+    #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here
+    out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
+    if (how=="right") setcolorder(out, neworder=c(on, names(out.x))) ## arrange columns: i.on, x.cols, i.cols
+  } else { # how=="full"
+    ## we made the left join side above, proceed to the right join side, so swap tables
+    jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols
+
+    cp.r = FALSE ## copy marker of out.r, analogous to cp.i
+    if (!is.null(mult) && (mult=="first" || mult=="last")) {
+      jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE)
+      cp.r = nrow(jnfm)!=nrow(rhs) ## nrow(rhs) bc jnfm=rhs
+    } ## mult=="error" check was made on one side already, below we do it on the second side, test 101.43
+
+    ## binary merge anti join: find rhs rows that have no match in lhs
+    bns = dtmerge(x=jnto, i=jnfm, on=on, how="anti", mult=if (!is.null(mult) && mult!="all") mult, verbose=verbose, join.many=join.many)
+
+    ## make anti join side
+    out.r = if (is.null(bns$irows))
+      .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on), retain.key=TRUE) ## retain.key is used only in the edge case when !nrow(out.i)
+    else
+      .Call(CsubsetDT, jnfm, bns$irows, someCols(jnfm, fm.cols, keep=on))
+    cp.r = cp.r || !is.null(bns$irows)
+
+    ## short circuit to avoid rbindlist on empty sets and to retain keys
+    if (!nrow(out.r)) { ## possibly also !nrow(out.i)
+      if (!cp.i && copy) out.i = copy(out.i)
+      #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here
+      out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
+    } else if (!nrow(out.i)) { ## but not !nrow(out.r)
+      if (!cp.r && copy) out.r = copy(out.r)
+      if (length(add<-setdiff(names(out.i), names(out.r)))) { ## add missing columns of proper types NA
+        neworder = copy(names(out.i)) #set(out.r, NULL, add, lapply(unclass(out.i)[add], `[`, 1L)) ## 291.04 overalloc exceed fail during set()
+        out.i = lapply(unclass(out.i)[add], `[`, seq_len(nrow(out.r))) ## could eventually remove this when cbindlist recycles 0 rows up, note that we need out.r not to be copied
+        out.r = .Call(Ccbindlist, list(out.r, out.i), FALSE)
+        setcolorder(out.r, neworder=neworder)
+      }
+      out = out.r
+    } else { ## neither side is empty; all might have not been copied yet, rbindlist will copy
+      out.l = .Call(Ccbindlist, list(out.i, out.x), FALSE)
+      out = rbindlist(list(out.l, out.r), use.names=TRUE, fill=TRUE)
+    }
+  }
+  setDT(out)
+}
+
 # Previously, we had a custom C implementation here, which is ~2x faster,
 #   but this is fast enough we don't bother maintaining a new routine.
 #   Hopefully in the future rep() can recognize the ALTREP and use that, too.
diff --git a/R/onLoad.R b/R/onLoad.R
index 01b159f77..ef6bb4a02 100644
--- a/R/onLoad.R
+++ b/R/onLoad.R
@@ -81,6 +81,7 @@
        "datatable.print.trunc.cols"="FALSE",   # for print.data.table
        "datatable.show.indices"="FALSE",       # for print.data.table
        "datatable.allow.cartesian"="FALSE",    # datatable.<argument name>
+       "datatable.join.many"="TRUE",           # mergelist, [.data.table #4383 #914
        "datatable.dfdispatchwarn"="TRUE",                   # not a function argument
        "datatable.warnredundantby"="TRUE",                  # not a function argument
        "datatable.alloccol"="1024L",           # argument 'n' of alloc.col. Over-allocate 1024 spare column slots
diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw
index a35c4f410..422d8d709 100644
--- a/inst/tests/mergelist.Rraw
+++ b/inst/tests/mergelist.Rraw
@@ -6,6 +6,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 } else {
   require(data.table)
   test = data.table:::test
+  mergepair = data.table:::mergepair
   perhaps.data.table = data.table:::perhaps.data.table
   hasindex = data.table:::hasindex
   fdistinct = data.table:::fdistinct
@@ -13,6 +14,16 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 }
 
 addresses = function(x) vapply(x, address, "")
+copied = function(ans, l) {
+  !any(addresses(ans) %chin% unlist(lapply(l, addresses), recursive=FALSE)) ## TRUE when no element of 'ans' has the same address as any element of the tables in 'l'
+}
+notcopied = function(ans, l, how="left", unless=character()) {
+  if (how %chin% unless) return(copied(ans, l)) ## for join types listed in 'unless' a full copy is expected instead, handy when looping over 'how'
+  shared = function(tbl) all(addresses(tbl) %chin% addresses(ans)) ## TRUE when every element of 'tbl' is found by address in 'ans'
+  first = l[[1L]]; last = l[[length(l)]]
+  if (how=="full") return(shared(first) || shared(last)) ## either side, left|right, notcopied is fine
+  shared(if (how=="right") last else first)
+}
 
 # internal helpers
 
@@ -110,6 +121,189 @@ test(13.04, key(ans), "id1")
 test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
 test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib
 
+# mergepair
+
+## test the 'copy' argument of mergepair
+
+### LHS equal to RHS: no copy in all cases
+num = 21.000
+l = list(
+  lhs = data.table(id1=1:2, v1=1:2),
+  rhs = data.table(id1=1:2, v2=1:2)
+)
+expected = data.table(id1=1:2, v1=1:2, v2=1:2)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected) ## copy=TRUE: no shared columns
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected) ## copy=FALSE: LHS shared but no RHS
+    test(num<-num+0.001, notcopied(ans, l, how=how))
+  }
+}
+### RHS includes LHS: no copy in inner, left, right
+num = 22.000
+unless = "full"
+l = list(
+  lhs = data.table(id1=1:2, v1=1:2),
+  rhs = data.table(id1=1:3, v2=1:3)
+)
+expected = list(
+  inner = data.table(id1=1:2, v1=1:2, v2=1:2),
+  left = data.table(id1=1:2, v1=1:2, v2=1:2),
+  right = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3),
+  full = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3)
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS includes RHS: no copy in left, right, full
+num = 23.000
+unless = "inner"
+l = list(
+  lhs = data.table(id1=1:3, v1=1:3),
+  rhs = data.table(id1=1:2, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=1:2, v1=1:2, v2=1:2),
+  left = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)),
+  right = data.table(id1=1:2, v1=1:2, v2=1:2),
+  full = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS and RHS each have a single non-matching row: no copy in left, right
+num = 24.000
+unless = c("inner","full")
+l = list(
+  lhs = data.table(id1=3:1, v1=1:3),
+  rhs = data.table(id1=c(4L,2:1), v2=1:3)
+)
+expected = list(
+  inner = data.table(id1=2:1, v1=2:3, v2=2:3),
+  left = data.table(id1=3:1, v1=1:3, v2=c(NA,2:3)),
+  right = data.table(id1=c(4L,2:1), v1=c(NA,2:3), v2=1:3),
+  full = data.table(id1=c(3:1,4L), v1=c(1:3,NA), v2=c(NA,2:3,1L))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS zero match RHS: no copy in left, right
+num = 25.000
+unless = c("inner","full")
+l = list(
+  lhs = data.table(id1=2:1, v1=1:2),
+  rhs = data.table(id1=3:4, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
+  right = data.table(id1=3:4, v1=c(NA_integer_,NA), v2=1:2),
+  full = data.table(id1=c(2:1,3:4), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS and RHS zero nrow: no copies
+num = 26.000
+unless = character()
+l = list(
+  lhs = data.table(id1=integer(), v1=integer()),
+  rhs = data.table(id1=integer(), v2=integer())
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=integer(), v1=integer(), v2=integer()),
+  right = data.table(id1=integer(), v1=integer(), v2=integer()),
+  full = data.table(id1=integer(), v1=integer(), v2=integer())
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS has zero nrow: no copies
+num = 27.000
+unless = character()
+l = list(
+  lhs = data.table(id1=integer(), v1=integer()),
+  rhs = data.table(id1=2:1, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=integer(), v1=integer(), v2=integer()),
+  right = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2),
+  full = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### RHS has zero nrow: no copy in left, right, full
+num = 28.000
+unless = "inner"
+l = list(
+  lhs = data.table(id1=2:1, v1=1:2),
+  rhs = data.table(id1=integer(), v2=integer())
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
+  right = data.table(id1=integer(), v1=integer(), v2=integer()),
+  full = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+
 ## fdistinct, another round
 
 dt = data.table(x =