From 976c37222c81ac02a34bf2ed5ab9ea0b7dfdebdd Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 25 Aug 2022 21:12:22 +0200 Subject: [PATCH 01/33] add fix #5309 --- inst/tests/tests.Rraw | 22 ++++++++++++++++++++++ src/rbindlist.c | 11 ++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e05f52281..fa4f13e6b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18814,3 +18814,25 @@ test(2238.6, "a" %notin% integer(), TRUE) test(2238.7, "a" %notin% NULL, TRUE) test(2238.8, NA %notin% 1:5, TRUE) test(2238.9, NA %notin% c(1:5, NA), FALSE) + +# rbind with vectors with class attributes #5309 +x = data.table(a = 1L, b = as.Date("2020-01-01")) +y = data.table(a = 2L, b = as.IDate("2021-01-01")) +z = data.table(a = 3L, b = NA) +test(2239.01, rbind(x, y), data.table(a = 1:2, b = as.Date(c("2020-01-01", "2021-01-01")))) +test(2239.02, rbind(y, x), data.table(a = c(2L, 1L), b = as.IDate(c("2021-01-01", "2020-01-01")))) +test(2239.03, rbind(x, z), data.table(a = c(1L, 3L), b = as.Date(c("2020-01-01", NA)))) +test(2239.04, rbind(z, x), data.table(a = c(3L, 1L), b = as.Date(c(NA, "2020-01-01")))) +test(2239.05, rbind(y, z), data.table(a = c(2L, 3L), b = as.IDate(c("2021-01-01", NA)))) +test(2239.06, rbind(z, y), data.table(a = c(3L, 2L), b = as.IDate(c(NA, "2021-01-01")))) +z[, b := NULL] +test(2239.07, rbind(x, z, fill = TRUE), data.table(a = c(1L, 3L), b = as.Date(c("2020-01-01", NA)))) +test(2239.08, rbind(z, x, fill = TRUE), data.table(a = c(3L, 1L), b = as.Date(c(NA, "2020-01-01")))) +test(2239.09, rbind(y, z, fill = TRUE), data.table(a = c(2L, 3L), b = as.IDate(c("2021-01-01", NA)))) +test(2239.10, rbind(z, y, fill = TRUE), data.table(a = c(3L, 2L), b = as.IDate(c(NA, "2021-01-01")))) +x = data.table(a = 1L, b = as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2239.11, rbind(x, z, fill = TRUE), data.table(a = c(1L, 3L), b = as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2239.12, rbind(z, x, fill = TRUE), data.table(a = c(3L, 1L), b = as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +x = data.table(c = 1L, d = as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2239.11, rbind(x, z, fill = TRUE, use.names = FALSE), data.table(c = c(1L, 3L), d = as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2239.12, setnames(rbind(z, x, fill = TRUE, use.names = FALSE), c("c", "d")), data.table(c = c(3L, 1L), d = as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) diff --git a/src/rbindlist.c b/src/rbindlist.c index 366902883..28560a630 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -275,6 +275,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int longestLen=-1, longestW=-1, longestI=-1; // just for ordered factor; longestLen must be initialized as -1 so that rbind zero-length ordered factor could work #4795 SEXP longestLevels=R_NilValue; // just for ordered factor bool int64=false; + bool date=false; + bool posixct=false; const char *foundName=NULL; bool anyNotStringOrFactor=false; SEXP firstCol=R_NilValue; @@ -305,10 +307,16 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (INHERITS(thisCol, char_integer64)) { if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } // so the integer64 attribute gets copied to target below int64=true; + } else if (INHERITS(thisCol, char_Date)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + date=true; + } else if (INHERITS(thisCol, char_POSIXct)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + posixct=true; } if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; } else { - if (!factor && !int64) { + if (!factor && !int64 && ((!date && !posixct) || (date && posixct))) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), 0)) { @@ -323,6 +331,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option) if (int64 && maxType!=REALSXP) error(_("Internal error: column %d of result is determined to be integer64 but maxType=='%s' != REALSXP"), j+1, type2char(maxType)); // # nocov + if (date && INHERITS(firstCol, char_IDate)) maxType=INTSXP; // first encountered Date determines class and type #5309 SEXP target; SET_VECTOR_ELT(ans, idcol+j, target=allocVector(maxType, nrow)); // does not initialize logical & numerics, but does initialize character and list if (!factor) copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB. From 3754191aace797bcfac5acc1ee1f53052fd77a9f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 25 Aug 2022 21:20:04 +0200 Subject: [PATCH 02/33] fix test numbering --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fa4f13e6b..ccd260b18 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18834,5 +18834,5 @@ x = data.table(a = 1L, b = as.POSIXct("2021-10-06 13:58:00 UTC")) test(2239.11, rbind(x, z, fill = TRUE), data.table(a = c(1L, 3L), b = as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) test(2239.12, rbind(z, x, fill = TRUE), data.table(a = c(3L, 1L), b = as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) x = data.table(c = 1L, d = as.POSIXct("2021-10-06 13:58:00 UTC")) -test(2239.11, rbind(x, z, fill = TRUE, use.names = FALSE), data.table(c = c(1L, 3L), d = as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) -test(2239.12, setnames(rbind(z, x, fill = TRUE, use.names = FALSE), c("c", "d")), data.table(c = c(3L, 1L), d = as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +test(2239.13, rbind(x, z, fill = TRUE, use.names = FALSE), data.table(c = c(1L, 3L), d = as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2239.14, setnames(rbind(z, x, fill = TRUE, use.names = FALSE), c("c", "d")), data.table(c = c(3L, 1L), d = as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) From 40778309d9a6787981f6e30e469586cccc523c4b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 26 Aug 2022 00:44:13 +0200 Subject: [PATCH 03/33] add rbind for ITime --- src/rbindlist.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index 28560a630..a5d994926 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -277,6 +277,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) bool int64=false; bool date=false; bool posixct=false; + bool itime=false; const char *foundName=NULL; bool anyNotStringOrFactor=false; SEXP firstCol=R_NilValue; @@ -313,10 +314,13 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) } else if (INHERITS(thisCol, char_POSIXct)) { if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } posixct=true; + } else if (INHERITS(thisCol, char_ITime)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + itime=true; } if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; } else { - if (!factor && !int64 && ((!date && !posixct) || (date && posixct))) { // prohibit binding of date and posixct + if (!factor && !int64 && ((!date && !posixct) || (date && posixct)) && !itime) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), 0)) { From f015eedd567416b52398797d56fe8215de4766ca Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 26 Aug 2022 01:23:19 +0200 Subject: [PATCH 04/33] more tests --- inst/tests/tests.Rraw | 48 ++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ccd260b18..0afc131bb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18819,20 +18819,36 @@ test(2238.9, NA %notin% c(1:5, NA), FALSE) x = data.table(a = 1L, b = as.Date("2020-01-01")) y = data.table(a = 2L, b = as.IDate("2021-01-01")) z = data.table(a = 3L, b = NA) -test(2239.01, rbind(x, y), data.table(a = 1:2, b = as.Date(c("2020-01-01", "2021-01-01")))) -test(2239.02, rbind(y, x), data.table(a = c(2L, 1L), b = as.IDate(c("2021-01-01", "2020-01-01")))) -test(2239.03, rbind(x, z), data.table(a = c(1L, 3L), b = as.Date(c("2020-01-01", NA)))) -test(2239.04, rbind(z, x), data.table(a = c(3L, 1L), b = as.Date(c(NA, "2020-01-01")))) -test(2239.05, rbind(y, z), data.table(a = c(2L, 3L), b = as.IDate(c("2021-01-01", NA)))) -test(2239.06, rbind(z, y), data.table(a = c(3L, 2L), b = as.IDate(c(NA, "2021-01-01")))) +test(2239.01, rbind(x, y), data.table(a=1:2, b= as.Date(c("2020-01-01", "2021-01-01")))) +test(2239.02, rbind(y, x), data.table(a=c(2L, 1L), b=as.IDate(c("2021-01-01", "2020-01-01")))) +test(2239.03, rbind(x, z), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA)))) +test(2239.04, rbind(z, x), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01")))) +test(2239.05, rbind(y, z), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA)))) +test(2239.06, rbind(z, y), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01")))) z[, b := NULL] -test(2239.07, rbind(x, z, fill = TRUE), data.table(a = c(1L, 3L), b = as.Date(c("2020-01-01", NA)))) -test(2239.08, rbind(z, x, fill = TRUE), data.table(a = c(3L, 1L), b = as.Date(c(NA, "2020-01-01")))) -test(2239.09, rbind(y, z, fill = TRUE), data.table(a = c(2L, 3L), b = as.IDate(c("2021-01-01", NA)))) -test(2239.10, rbind(z, y, fill = TRUE), data.table(a = c(3L, 2L), b = as.IDate(c(NA, "2021-01-01")))) -x = data.table(a = 1L, b = as.POSIXct("2021-10-06 13:58:00 UTC")) -test(2239.11, rbind(x, z, fill = TRUE), data.table(a = c(1L, 3L), b = as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) -test(2239.12, rbind(z, x, fill = TRUE), data.table(a = c(3L, 1L), b = as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) -x = data.table(c = 1L, d = as.POSIXct("2021-10-06 13:58:00 UTC")) -test(2239.13, rbind(x, z, fill = TRUE, use.names = FALSE), data.table(c = c(1L, 3L), d = as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) -test(2239.14, setnames(rbind(z, x, fill = TRUE, use.names = FALSE), c("c", "d")), data.table(c = c(3L, 1L), d = as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +test(2239.07, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA)))) +test(2239.08, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01")))) +test(2239.09, rbind(y, z, fill=TRUE), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA)))) +test(2239.10, rbind(z, y, fill=TRUE), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01")))) +x = data.table(a=1L, b=as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2239.11, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2239.12, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +x = data.table(c=1L, d=as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2239.13, rbind(x, z, fill=TRUE, use.names=FALSE), data.table(c = c(1L, 3L), d=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2239.14, setnames(rbind(z, x, fill=TRUE, use.names=FALSE), c("c", "d")), data.table(c=c(3L, 1L), d=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +x = data.table(a=1L, b=as.ITime(0)) +y = data.table(a=2L, b=NA) +test(2239.15, rbind(x,y), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) +test(2239.16, rbind(y,x), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) +y[, b := NULL] +test(2239.17, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) +test(2239.18, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) +if (test_nanotime) { + x = data.table(a=1L, b=as.nanotime(0)) + y = data.table(a=2L, b=NA) + test(2239.19, rbind(x,y), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(2239.20, rbind(y,x), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) + y[, b := NULL] + test(2239.21, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(2239.22, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) +} From 6ab44b06fc5e8c08bf9f35d033f936c814dd3906 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 26 Aug 2022 01:55:55 +0200 Subject: [PATCH 05/33] add merge tests --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0afc131bb..099726a96 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18852,3 +18852,7 @@ if (test_nanotime) { test(2239.21, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) test(2239.22, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) } +x = data.table(a = 1L, b = as.Date("2020-01-01")) +y = data.table(a = 2L, b = NA) +test(2239.23, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) +test(2239.24, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA)))) From 65e7674172cf52d2da3ce5e950d0cab308361456 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 26 Aug 2022 02:33:09 +0200 Subject: [PATCH 06/33] add AsIs #4934 --- inst/tests/tests.Rraw | 4 ++++ src/data.table.h | 1 + src/init.c | 2 ++ src/rbindlist.c | 9 ++++----- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 099726a96..80f50f66b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18856,3 +18856,7 @@ x = data.table(a = 1L, b = as.Date("2020-01-01")) y = data.table(a = 2L, b = NA) test(2239.23, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) test(2239.24, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA)))) +x = data.table(a = 1L, b=I(3L)) +y = data.table(a = 2L, b=NA) +test(2239.25, rbind(x,y), data.table(a = c(1L, 2L), b=I(c(3L, NA)))) +test(2239.26, rbind(y,x), data.table(a = c(2L, 1L), b=c(NA, 3L))) diff --git a/src/data.table.h b/src/data.table.h index b966e86c0..2068d4570 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -90,6 +90,7 @@ extern SEXP char_datatable; extern SEXP char_dataframe; extern SEXP char_NULL; extern SEXP char_maxString; +extern SEXP char_AsIs; extern SEXP sym_sorted; extern SEXP sym_index; extern SEXP sym_BY; diff --git a/src/init.c b/src/init.c index 284c30b4f..ae2c68243 100644 --- a/src/init.c +++ b/src/init.c @@ -23,6 +23,7 @@ SEXP char_datatable; SEXP char_dataframe; SEXP char_NULL; SEXP char_maxString; +SEXP char_AsIs; SEXP sym_sorted; SEXP sym_index; SEXP sym_BY; @@ -348,6 +349,7 @@ void attribute_visible R_init_data_table(DllInfo *info) char_dataframe = PRINTNAME(install("data.frame")); char_NULL = PRINTNAME(install("NULL")); char_maxString = PRINTNAME(install("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF")); + char_AsIs = PRINTNAME(install("AsIs")); if (TYPEOF(char_integer64) != CHARSXP) { // checking one is enough in case of any R-devel changes diff --git a/src/rbindlist.c b/src/rbindlist.c index a5d994926..20218c3e2 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -274,10 +274,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) bool factor=false, orderedFactor=false; // ordered factor is class c("ordered","factor"). isFactor() is true when isOrdered() is true. int longestLen=-1, longestW=-1, longestI=-1; // just for ordered factor; longestLen must be initialized as -1 so that rbind zero-length ordered factor could work #4795 SEXP longestLevels=R_NilValue; // just for ordered factor - bool int64=false; - bool date=false; - bool posixct=false; - bool itime=false; + bool int64=false, date=false, posixct=false, itime=false, asis=false; const char *foundName=NULL; bool anyNotStringOrFactor=false; SEXP firstCol=R_NilValue; @@ -317,10 +314,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) } else if (INHERITS(thisCol, char_ITime)) { if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } itime=true; + } else if (!asis && INHERITS(thisCol, char_AsIs)) { + asis=true; } if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; } else { - if (!factor && !int64 && ((!date && !posixct) || (date && posixct)) && !itime) { // prohibit binding of date and posixct + if (!factor && !int64 && ((!date && !posixct) || (date && posixct)) && !itime &!asis) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), 0)) { From 2282d9f92097effed7af7a2096081986dddfa294 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 6 Sep 2022 09:51:55 +0200 Subject: [PATCH 07/33] add news --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 4f4a2f417..4dae6a794 100644 --- a/NEWS.md +++ b/NEWS.md @@ -296,6 +296,8 @@ 41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. +54. `rbindlist(l, use.names=TRUE)` allows now the binding of columns with different class attributes for certain classes such such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile for the request and @ben-schwen for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. From 9527b892e2a7a30f4bca4a51da41e439b65ab681 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 6 Sep 2022 09:59:38 +0200 Subject: [PATCH 08/33] news typo --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4dae6a794..6068b68e7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -296,7 +296,7 @@ 41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. -54. `rbindlist(l, use.names=TRUE)` allows now the binding of columns with different class attributes for certain classes such such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile for the request and @ben-schwen for the PR. +54. `rbindlist(l, use.names=TRUE)` allows now the binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile for the request and @ben-schwen for the PR. ## BUG FIXES From a6d9c65ac21187fb68ee9ba303f6e16b2dcfb93f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 30 Dec 2022 17:53:50 +0100 Subject: [PATCH 09/33] add ignore.attr argument --- NEWS.md | 3 ++- R/data.table.R | 8 ++++---- R/merge.R | 2 +- inst/tests/tests.Rraw | 6 ++++++ man/rbindlist.Rd | 6 ++++++ src/rbindlist.c | 9 ++++++--- 6 files changed, 25 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6068b68e7..b4c6e5100 100644 --- a/NEWS.md +++ b/NEWS.md @@ -296,7 +296,8 @@ 41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. -54. `rbindlist(l, use.names=TRUE)` allows now the binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile for the request and @ben-schwen for the PR. +54. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). +`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gained argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index 473cf6e76..4690c45d4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2740,14 +2740,14 @@ chgroup = function(x) { } # plain rbind and cbind methods are registered using S3method() in NAMESPACE only from R>=4.0.0; #3948 -rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL) { +rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL, ignore.attr=FALSE) { l = lapply(list(...), function(x) if (is.list(x)) x else as.data.table(x)) #1626; e.g. psych binds a data.frame|table with a matrix - rbindlist(l, use.names, fill, idcol) + rbindlist(l, use.names, fill, idcol, ignore.attr) } cbind.data.table = data.table .rbind.data.table = rbind.data.table # the workaround using this in FAQ 2.24 is still applied to support R < 4.0.0 -rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { +rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) { if (is.null(l)) return(null.data.table()) if (!is.list(l) || is.data.frame(l)) stopf("Input is %s but should be a plain list of items to be stacked", class(l)[1L]) if (isFALSE(idcol)) { idcol = NULL } @@ -2763,7 +2763,7 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { if (!miss) stopf("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. See ?rbindlist.") use.names = NA } - ans = .Call(Crbindlist, l, use.names, fill, idcol) + ans = .Call(Crbindlist, l, use.names, fill, idcol, ignore.attr) if (!length(ans)) return(null.data.table()) setDT(ans)[] } diff --git a/R/merge.R b/R/merge.R index cbc9b9e29..7e050e1a4 100644 --- a/R/merge.R +++ b/R/merge.R @@ -97,7 +97,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) { - dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE) + dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 80f50f66b..1be63af87 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18860,3 +18860,9 @@ x = data.table(a = 1L, b=I(3L)) y = data.table(a = 2L, b=NA) test(2239.25, rbind(x,y), data.table(a = c(1L, 2L), b=I(c(3L, NA)))) test(2239.26, rbind(y,x), data.table(a = c(2L, 1L), b=c(NA, 3L))) +# rbind ignore attributes #3911 +x = data.table(a = structure(1:2, class=c("a", "integer")), key="a") +y = data.table(a = 2:3, key="a") +test(2239.31, merge(x,y, all.y=TRUE), data.table(a=structure(2:3, class=c("a", "integer")), key="a")) +test(2239.32, rbind(x,y), error="Class attribute .* does not match with .*") +test(2239.33, rbind(x,y, ignore.attr=TRUE), data.table(a=structure(c(1L, 2L, 2L, 3L), class=c("a", "integer")))) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 2ba39a2a9..3aef56eb8 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -15,6 +15,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.} \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}.} \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.} + \item{ignore.attr}{\code{TRUE} allows binding columns with different class attributes. By default \code{FALSE}.} } \details{ Each item of \code{l} can be a \code{data.table}, \code{data.frame} or \code{list}, including \code{NULL} (skipped) or an empty object (0 rows). \code{rbindlist} is most useful when there are an unknown number of (potentially many) objects to stack, such as returned by \code{lapply(fileNames, fread)}. \code{rbind} is most useful to stack two or three objects which you know in advance. \code{\dots} should contain at least one \code{data.table} for \code{rbind(\dots)} to call the fast method and return a \code{data.table}, whereas \code{rbindlist(l)} always returns a \code{data.table} even when stacking a plain \code{list} with a \code{data.frame}, for example. @@ -54,6 +55,11 @@ rbindlist(l, use.names=TRUE, fill=TRUE, idcol=TRUE) setattr(l, 'names', c("a", "b")) rbindlist(l, use.names=TRUE, fill=TRUE, idcol="ID") +# bind different classes +DT1 = data.table(A=1:3,B=letters[1:3]) +DT2 = data.table(A=4:5,B=letters[4:5]) +setattr(DT1[["A"]], "class", c("a", "integer")) +rbind(DT1, DT2, ignore.attr=TRUE) } \keyword{ data } diff --git a/src/rbindlist.c b/src/rbindlist.c index 20218c3e2..588c33960 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -2,16 +2,19 @@ #include #include // for isdigit -SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) +SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignoreattrArg) { if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL) error(_("fill= should be TRUE or FALSE")); if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1) error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA + if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1) + error(_("ignore.attr= should be TRUE or FALSE")); if (!length(l)) return(l); if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists.")); Rboolean usenames = LOGICAL(usenamesArg)[0]; const bool fill = LOGICAL(fillArg)[0]; + const bool ignoreatt = LOGICAL(ignoreattrArg)[0]; if (fill && usenames==NA_LOGICAL) { usenames=TRUE; } @@ -322,8 +325,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (!factor && !int64 && ((!date && !posixct) || (date && posixct)) && !itime &!asis) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), - 0)) { - error(_("Class attribute on column %d of item %d does not match with column %d of item %d."), w+1, i+1, firstw+1, firsti+1); + 0) && !ignoreatt) { + error(_("Class attribute on column %d of item %d does not match with column %d of item %d. You can deactivate this safety-check by using ignore.attr=TRUE"), w+1, i+1, firstw+1, firsti+1); } UNPROTECT(2); } From e2786a520720e50608c5ab313b921a9441404ebc Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 30 Dec 2022 17:56:49 +0100 Subject: [PATCH 10/33] fix news --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index b4c6e5100..d3a5919fe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -296,7 +296,7 @@ 41. New function `%notin%` provides a convenient alternative to `!(x %in% y)`, [#4152](https://github.com/Rdatatable/data.table/issues/4152). Thanks to Jan Gorecki for suggesting and Michael Czekanski for the PR. `%notin%` uses half the memory because it computes the result directly as opposed to `!` which allocates a new vector to hold the negated result. If `x` is long enough to occupy more than half the remaining free memory, this can make the difference between the operation working, or failing with an out-of-memory error. -54. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). +42. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). `rbindlist(l, ignore.attr=TRUE)` and `rbind` also gained argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. ## BUG FIXES From 8c4d19c8a2bc0343cf48ceb500b6a1450ff90259 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:03:21 +0100 Subject: [PATCH 11/33] change arguments of registered rbindlist --- src/data.table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data.table.h b/src/data.table.h index 949c5510c..dac98cdc7 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -283,7 +283,7 @@ SEXP chmatchdup_R(SEXP, SEXP, SEXP); SEXP chin_R(SEXP, SEXP); SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP rbindlist(SEXP, SEXP, SEXP, SEXP); +SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP setlistelt(SEXP, SEXP, SEXP); SEXP address(SEXP); SEXP expandAltRep(SEXP); From 9d6afbc78589bd3236b7965f7ccd3c4e1ed10fce Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:27:00 +0100 Subject: [PATCH 12/33] add attribute to usage --- man/rbindlist.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 3aef56eb8..34aabdb2e 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -7,7 +7,7 @@ Same as \code{do.call("rbind", l)} on \code{data.frame}s, but much faster. } \usage{ -rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) +rbindlist(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) # rbind(..., use.names=TRUE, fill=FALSE, idcol=NULL) } \arguments{ From 5a4823c6b43ab32365c0cd8ddcf8629cc1772b99 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:49:04 +0100 Subject: [PATCH 13/33] move nanotime tests --- inst/tests/other.Rraw | 8 ++++++++ inst/tests/tests.Rraw | 9 --------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 807a67c19..80ca1b398 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -688,6 +688,14 @@ if (loaded[["nanotime"]]) { DT = data.table(time=nanotime(c(1,NA,3))) test(27, na.omit(DT), DT[c(1,3)]) + # was 2239 in tests.Rraw, rbind with vectors with class attributes #5309 + x = data.table(a=1L, b=as.nanotime(0)) + y = data.table(a=2L, b=NA) + test(27.01, rbind(x,y), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(27.02, rbind(y,x), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) + y[, b := NULL] + test(27.03, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(27.04, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) } # that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ba102951e..5bdc00a06 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18123,15 +18123,6 @@ test(2239.16, rbind(y,x), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) y[, b := NULL] test(2239.17, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) test(2239.18, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) -if (test_nanotime) { - x = data.table(a=1L, b=as.nanotime(0)) - y = data.table(a=2L, b=NA) - test(2239.19, rbind(x,y), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) - test(2239.20, rbind(y,x), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) - y[, b := NULL] - test(2239.21, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) - test(2239.22, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) -} x = data.table(a = 1L, b = as.Date("2020-01-01")) y = data.table(a = 2L, b = NA) test(2239.23, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) From 452adc275eb17f453b791ad8113fdca9b5e91dba Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 30 Dec 2022 18:49:26 +0100 Subject: [PATCH 14/33] adjust test numbering --- inst/tests/tests.Rraw | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5bdc00a06..c1819725a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18125,12 +18125,12 @@ test(2239.17, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.ITime(c(0, test(2239.18, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) x = data.table(a = 1L, b = as.Date("2020-01-01")) y = data.table(a = 2L, b = NA) -test(2239.23, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) -test(2239.24, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA)))) +test(2239.19, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) +test(2239.20, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA)))) x = data.table(a = 1L, b=I(3L)) y = data.table(a = 2L, b=NA) -test(2239.25, rbind(x,y), data.table(a = c(1L, 2L), b=I(c(3L, NA)))) -test(2239.26, rbind(y,x), data.table(a = c(2L, 1L), b=c(NA, 3L))) +test(2239.21, rbind(x,y), data.table(a = c(1L, 2L), b=I(c(3L, NA)))) +test(2239.22, rbind(y,x), data.table(a = c(2L, 1L), b=c(NA, 3L))) # rbind ignore attributes #3911 x = data.table(a = structure(1:2, class=c("a", "integer")), key="a") y = data.table(a = 2:3, key="a") From f967ce80d74f2af1f442ea323b19f5fc269fe3ab Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 3 Jan 2023 11:02:52 +0100 Subject: [PATCH 15/33] add test coverage --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c1819725a..a49fd71b5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14330,6 +14330,7 @@ test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TR data.table(a=c(1:4), c=INT(5,6,NA,NA))) test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=TRUE, use.names=FALSE), data.table(a=c(1:4), V1=INT(NA,NA,5,6))) +test(2003.6, rbindlist(list(), ignore.attr=1), error="ignore.attr= should be TRUE or FALSE") # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" From 04ad6eeb3779cf897e2ba85e65028f9ce349a588 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 10 Dec 2023 20:47:39 +0100 Subject: [PATCH 16/33] prohibit NA for ignore.att --- src/rbindlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index 588c33960..88f660006 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -8,7 +8,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor error(_("fill= should be TRUE or FALSE")); if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1) error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA - if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1) + if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1 || LOGICAL(ignoreattrArg)[0] == NA_LOGICAL) error(_("ignore.attr= should be TRUE or FALSE")); if (!length(l)) return(l); if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists.")); From 89f2541db0ac41519f7c8cec17bb46485f8ac388 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 31 Mar 2024 18:59:38 +0200 Subject: [PATCH 17/33] move news --- NEWS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3e28c31c6..9abf840b7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,9 @@ 7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. +8. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). +`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gained argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. @@ -347,8 +350,6 @@ 41. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). `mb=` now accepts any function which accepts a `data.table` and returns a higher and better estimate of its size in bytes, albeit more slowly; e.g. `mb = utils::object.size`. -43. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). -`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gained argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. ## BUG FIXES From 7353bc656739ad31c10379cdf746539d7bcbabf4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 31 Mar 2024 19:03:24 +0200 Subject: [PATCH 18/33] finish todo of #5857 --- R/merge.R | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/R/merge.R b/R/merge.R index 8062d91fc..ea53df3f2 100644 --- a/R/merge.R +++ b/R/merge.R @@ -96,21 +96,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] - # TO DO: replace by following once #5446 is merged - # if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) - if (length(missingyidx)) { - yy = y[missingyidx] - othercolsx = setdiff(nm_x, by) - if (length(othercolsx)) { - # create NA rectangle with correct types and attributes of x to cbind to y - tmp = rep.int(NA_integer_, length(missingyidx)) - # TO DO: use set() here instead.. - yy = cbind(yy, x[tmp, othercolsx, with = FALSE]) - } - # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist - # takes care of #24 without having to save names. This is how it should be, IMHO. - dt = rbind(dt, yy, use.names=FALSE) - } + if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. newend = setdiff(nm_y, by.y) From 2e69fa1e97520e1a7453b6a8073740ab47f8629b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 21 Jul 2024 00:48:48 +0200 Subject: [PATCH 19/33] Update NEWS.md Co-authored-by: Michael Chirico --- NEWS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4b00d74d5..7623e3de0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -434,7 +434,6 @@ 41. `tables()` is faster by default by excluding the size of character strings in R's global cache (which may be shared) and excluding the size of list column items (which also may be shared). `mb=` now accepts any function which accepts a `data.table` and returns a higher and better estimate of its size in bytes, albeit more slowly; e.g. `mb = utils::object.size`. - ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. From 44b44f211c4c27d238acb122ae4b38422bcc8974 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 00:51:32 +0200 Subject: [PATCH 20/33] update comment --- inst/tests/other.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 554e1e8b9..d949efa1f 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -699,7 +699,7 @@ if (loaded[["nanotime"]]) { DT = data.table(time=nanotime(c(1,NA,3))) test(27, na.omit(DT), DT[c(1,3)]) - # was 2239 in tests.Rraw, rbind with vectors with class attributes #5309 + # rbind with vectors with class attributes #5309 x = data.table(a=1L, b=as.nanotime(0)) y = data.table(a=2L, b=NA) test(27.01, rbind(x,y), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) From 677a47ca270755a556ef2c1f8270843b0726c366 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 00:57:49 +0200 Subject: [PATCH 21/33] update doc for ignore.attr --- man/rbindlist.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 31693c7cd..17c5c2205 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -15,7 +15,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.} \item{fill}{\code{TRUE} fills missing columns with NAs, or NULL for missing list columns. By default \code{FALSE}.} \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.} - \item{ignore.attr}{\code{TRUE} allows binding columns with different class attributes. By default \code{FALSE}.} + \item{ignore.attr}{Logical, default \code{FALSE}. When \code{TRUE}, allows binding columns with different attributes (e.g. class).} } \details{ Each item of \code{l} can be a \code{data.table}, \code{data.frame} or \code{list}, including \code{NULL} (skipped) or an empty object (0 rows). \code{rbindlist} is most useful when there are an unknown number of (potentially many) objects to stack, such as returned by \code{lapply(fileNames, fread)}. \code{rbind} is most useful to stack two or three objects which you know in advance. \code{\dots} should contain at least one \code{data.table} for \code{rbind(\dots)} to call the fast method and return a \code{data.table}, whereas \code{rbindlist(l)} always returns a \code{data.table} even when stacking a plain \code{list} with a \code{data.frame}, for example. From 319065138cf36260534381c050fac94053528d6c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 01:00:07 +0200 Subject: [PATCH 22/33] fix nit ignoreattr --- src/rbindlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index d265de80c..16efa5c33 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -14,7 +14,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists.")); Rboolean usenames = LOGICAL(usenamesArg)[0]; const bool fill = LOGICAL(fillArg)[0]; - const bool ignoreatt = LOGICAL(ignoreattrArg)[0]; + const bool ignoreattr = LOGICAL(ignoreattrArg)[0]; if (fill && usenames==NA_LOGICAL) { usenames=TRUE; } @@ -325,7 +325,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor if (!factor && !int64 && ((!date && !posixct) || (date && posixct)) && !itime &!asis) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), - 0) && !ignoreatt) { + 0) && !ignoreattr) { error(_("Class attribute on column %d of item %d does not match with column %d of item %d. You can deactivate this safety-check by using ignore.attr=TRUE"), w+1, i+1, firstw+1, firsti+1); } UNPROTECT(2); From 57572763bb5d25bd64566432738d8abfd494737d Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 01:16:30 +0200 Subject: [PATCH 23/33] fix test consistency --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5abdc46db..7145b8538 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18739,7 +18739,7 @@ test(2268, rbindlist(y, fill=TRUE), rbindlist(x, fill=TRUE)[rep(1:5, N)]) x = data.table(a = 1L, b = as.Date("2020-01-01")) y = data.table(a = 2L, b = as.IDate("2021-01-01")) z = data.table(a = 3L, b = NA) -test(2269.01, rbind(x, y), data.table(a=1:2, b= as.Date(c("2020-01-01", "2021-01-01")))) +test(2269.01, rbind(x, y), data.table(a=c(1L, 2L), b= as.Date(c("2020-01-01", "2021-01-01")))) test(2269.02, rbind(y, x), data.table(a=c(2L, 1L), b=as.IDate(c("2021-01-01", "2020-01-01")))) test(2269.03, rbind(x, z), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA)))) test(2269.04, rbind(z, x), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01")))) From d977e9b637605e4e519969e6ae06e01f0d358a88 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 01:18:04 +0200 Subject: [PATCH 24/33] remove setnames --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7145b8538..c1f45e6b4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18755,7 +18755,7 @@ test(2269.11, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b=as.POSIXct(c("20 test(2269.12, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) x = data.table(c=1L, d=as.POSIXct("2021-10-06 13:58:00 UTC")) test(2269.13, rbind(x, z, fill=TRUE, use.names=FALSE), data.table(c = c(1L, 3L), d=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) -test(2269.14, setnames(rbind(z, x, fill=TRUE, use.names=FALSE), c("c", "d")), data.table(c=c(3L, 1L), d=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +test(2269.14, rbind(z, x, fill=TRUE, use.names=FALSE), data.table(a=c(3L, 1L), d=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) x = data.table(a=1L, b=as.ITime(0)) y = data.table(a=2L, b=NA) test(2269.15, rbind(x,y), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) From 62f9e9a38caeaa4dfbced43df77808d34d813aa3 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 01:22:45 +0200 Subject: [PATCH 25/33] update asis test to use rbindlist --- inst/tests/tests.Rraw | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c1f45e6b4..0e9e09e67 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18767,10 +18767,11 @@ x = data.table(a = 1L, b = as.Date("2020-01-01")) y = data.table(a = 2L, b = NA) test(2269.19, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) test(2269.20, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA)))) +# rbindlist with AsIs x = data.table(a = 1L, b=I(3L)) -y = data.table(a = 2L, b=NA) -test(2269.21, rbind(x,y), data.table(a = c(1L, 2L), b=I(c(3L, NA)))) -test(2269.22, rbind(y,x), data.table(a = c(2L, 1L), b=c(NA, 3L))) +y = data.table(a = 2L, b=4) +test(2269.21, rbindlist(list(x,y)), data.table(a = c(1L, 2L), b=I(c(3L, 4)))) +test(2269.22, rbindlist(list(y,x)), data.table(a = c(2L, 1L), b=c(4, 3))) # rbind ignore attributes #3911 x = data.table(a = structure(1:2, class=c("a", "integer")), key="a") y = data.table(a = 2:3, key="a") From 094bbfc9b353f1afc8113f90e90c46066ef85d6c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 01:28:00 +0200 Subject: [PATCH 26/33] update test comments --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0e9e09e67..521a1579a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18763,6 +18763,7 @@ test(2269.16, rbind(y,x), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) y[, b := NULL] test(2269.17, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) test(2269.18, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) +# follow up to #5263 to simplify merge logic x = data.table(a = 1L, b = as.Date("2020-01-01")) y = data.table(a = 2L, b = NA) test(2269.19, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) From 66fa443c5b5a386003c22e6214c4bd221c2e5fda Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 21 Jul 2024 01:32:52 +0200 Subject: [PATCH 27/33] update NEWS num --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 7623e3de0..56296dc1c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,7 +40,8 @@ 14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. -8. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). +15. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). + `rbindlist(l, ignore.attr=TRUE)` and `rbind` also gained argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. ## BUG FIXES From dbdd16fb0879cb4ebbdca3f186859b0be03a7cb4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 13:00:04 -0700 Subject: [PATCH 28/33] NEWS wording --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 56296dc1c..b690b04cf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,7 +40,7 @@ 14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. -15. `rbindlist(l, use.names=TRUE)` and `rbind` allows now the automatic binding of columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). +15. `rbindlist(l, use.names=TRUE)` and `rbind` now works correctly on columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). `rbindlist(l, ignore.attr=TRUE)` and `rbind` also gained argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. From a4c73cc20f793ff3b2b59528494343677a185cc7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 13:01:21 -0700 Subject: [PATCH 29/33] more NEWS wording --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index b690b04cf..69fa3cddf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -42,7 +42,7 @@ 15. `rbindlist(l, use.names=TRUE)` and `rbind` now works correctly on columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). -`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gained argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. +`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. ## BUG FIXES From 1f6ddf3f09c8288b1cf81ff1968c75a9604fc5b8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 13:03:45 -0700 Subject: [PATCH 30/33] template message for i18n --- src/rbindlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index 16efa5c33..d467feb8f 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -9,7 +9,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1) error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1 || LOGICAL(ignoreattrArg)[0] == NA_LOGICAL) - error(_("ignore.attr= should be TRUE or FALSE")); + error(_("%s should be TRUE or FALSE"), "ignore.attr"); if (!length(l)) return(l); if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists.")); Rboolean usenames = LOGICAL(usenamesArg)[0]; From d77c1eae638c4a921f5bf00c83f08b0ad1cdacd8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 13:27:50 -0700 Subject: [PATCH 31/33] simplify condition (C boolean --> no NA to worry about) --- src/rbindlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index d467feb8f..b7257eb01 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -322,7 +322,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor } if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; } else { - if (!factor && !int64 && ((!date && !posixct) || (date && posixct)) && !itime &!asis) { // prohibit binding of date and posixct + if (!factor && !int64 && date == posixct && !itime &!asis) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), 0) && !ignoreattr) { From 45fa3876c46f73849b0d4555ddabd9f5bf6bb510 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 24 Jul 2024 09:55:34 -0700 Subject: [PATCH 32/33] && not & --- src/rbindlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index b7257eb01..f4405cf62 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -322,7 +322,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor } if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; } else { - if (!factor && !int64 && date == posixct && !itime &!asis) { // prohibit binding of date and posixct + if (!factor && !int64 && date == posixct && !itime && !asis) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), 0) && !ignoreattr) { From 13ec07fff30cc10b8dfb43fff47234f9402e0bb5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 24 Jul 2024 17:08:46 +0000 Subject: [PATCH 33/33] correct error message --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 021ac199b..c7f0833cf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14436,7 +14436,7 @@ test(2003.82, rbind(y, x, fill=TRUE, use.names=TRUE), ans[2:1,]) test(2003.83, rbind(x, y, fill=TRUE, use.names=FALSE), ans) test(2003.84, rbind(y, x, fill=TRUE, use.names=FALSE), ans[2:1,]) # rbindlist ignore attributes #3911 -test(2003.85, rbindlist(list(), ignore.attr=1), error="ignore.attr= should be TRUE or FALSE") +test(2003.85, rbindlist(list(), ignore.attr=1), error="ignore.attr should be TRUE or FALSE") # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile"