From 4fd75e2bc7bd01a4653bd02fcccac8db12522180 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 24 Jul 2024 19:30:27 +0200 Subject: [PATCH] Rbind allow binding of different class attributes (#5446) * add fix #5309 * fix test numbering * add rbind for ITime * more tests * add merge tests * add AsIs #4934 * add news * news typo * add ignore.attr argument * fix news * change arguments of registered rbindlist * add attribute to usage * move nanotime tests * adjust test numbering * add test coverage * prohibit NA for ignore.att * move news * finish todo of #5857 * Update NEWS.md Co-authored-by: Michael Chirico * update comment * update doc for ignore.attr * fix nit ignoreattr * fix test consistency * remove setnames * update asis test to use rbindlist * update test comments * update NEWS num * NEWS wording * more NEWS wording * template message for i18n * simplify condition (C boolean --> no NA to worry about) * && not & * correct error message --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 4 ++++ R/data.table.R | 8 ++++---- R/merge.R | 16 +-------------- inst/tests/other.Rraw | 8 ++++++++ inst/tests/tests.Rraw | 47 +++++++++++++++++++++++++++++++++++++++++++ man/rbindlist.Rd | 8 +++++++- src/data.table.h | 3 ++- src/init.c | 2 ++ src/rbindlist.c | 25 ++++++++++++++++++----- 9 files changed, 95 insertions(+), 26 deletions(-) diff --git a/NEWS.md b/NEWS.md index d1d2f20ae..bb19620cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,10 @@ 14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. +15. `rbindlist(l, use.names=TRUE)` and `rbind` now works correctly on columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). + +`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index 072da0cdc..99d06fad7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2735,14 +2735,14 @@ chgroup = function(x) { } # plain rbind and cbind methods are registered using S3method() in NAMESPACE only from R>=4.0.0; #3948 -rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL) { +rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL, ignore.attr=FALSE) { l = lapply(list(...), function(x) if (is.list(x)) x else as.data.table(x)) #1626; e.g. psych binds a data.frame|table with a matrix - rbindlist(l, use.names, fill, idcol) + rbindlist(l, use.names, fill, idcol, ignore.attr) } cbind.data.table = data.table .rbind.data.table = rbind.data.table # the workaround using this in FAQ 2.24 is still applied to support R < 4.0.0 -rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { +rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) { if (is.null(l)) return(null.data.table()) if (!is.list(l) || is.data.frame(l)) stopf("Input is %s but should be a plain list of items to be stacked", class(l)[1L]) if (isFALSE(idcol)) { idcol = NULL } @@ -2758,7 +2758,7 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { if (!miss) stopf("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. See ?rbindlist.") use.names = NA } - ans = .Call(Crbindlist, l, use.names, fill, idcol) + ans = .Call(Crbindlist, l, use.names, fill, idcol, ignore.attr) if (!length(ans)) return(null.data.table()) setDT(ans)[] } diff --git a/R/merge.R b/R/merge.R index aabaaf740..025488740 100644 --- a/R/merge.R +++ b/R/merge.R @@ -96,21 +96,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] - # TO DO: replace by following once #5446 is merged - # if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) - if (length(missingyidx)) { - yy = y[missingyidx] - othercolsx = setdiff(nm_x, by) - if (length(othercolsx)) { - # create NA rectangle with correct types and attributes of x to cbind to y - tmp = rep.int(NA_integer_, length(missingyidx)) - # TO DO: use set() here instead.. - yy = cbind(yy, x[tmp, othercolsx, with = FALSE]) - } - # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist - # takes care of #24 without having to save names. This is how it should be, IMHO. - dt = rbind(dt, yy, use.names=FALSE) - } + if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. newend = setdiff(nm_y, by.y) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 79f64b487..0a0195279 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -699,6 +699,14 @@ if (loaded[["nanotime"]]) { DT = data.table(time=nanotime(c(1,NA,3))) test(27, na.omit(DT), DT[c(1,3)]) + # rbind with vectors with class attributes #5309 + x = data.table(a=1L, b=as.nanotime(0)) + y = data.table(a=2L, b=NA) + test(27.01, rbind(x,y), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(27.02, rbind(y,x), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) + y[, b := NULL] + test(27.03, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(27.04, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) } # that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bf9bfff39..c7f0833cf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14435,6 +14435,8 @@ test(2003.81, rbind(x, y, fill=TRUE, use.names=TRUE), ans) test(2003.82, rbind(y, x, fill=TRUE, use.names=TRUE), ans[2:1,]) test(2003.83, rbind(x, y, fill=TRUE, use.names=FALSE), ans) test(2003.84, rbind(y, x, fill=TRUE, use.names=FALSE), ans[2:1,]) +# rbindlist ignore attributes #3911 +test(2003.85, rbindlist(list(), ignore.attr=1), error="ignore.attr should be TRUE or FALSE") # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" @@ -18767,3 +18769,48 @@ if (test_R.utils) { test(2273.4, fread(tmp2), DT) } file.remove(tmp, tmp2) + +# rbind with vectors with class attributes #5309 +x = data.table(a = 1L, b = as.Date("2020-01-01")) +y = data.table(a = 2L, b = as.IDate("2021-01-01")) +z = data.table(a = 3L, b = NA) +test(2274.01, rbind(x, y), data.table(a=c(1L, 2L), b= as.Date(c("2020-01-01", "2021-01-01")))) +test(2274.02, rbind(y, x), data.table(a=c(2L, 1L), b=as.IDate(c("2021-01-01", "2020-01-01")))) +test(2274.03, rbind(x, z), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA)))) +test(2274.04, rbind(z, x), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01")))) +test(2274.05, rbind(y, z), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA)))) +test(2274.06, rbind(z, y), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01")))) +z[, b := NULL] +test(2274.07, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA)))) +test(2274.08, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01")))) +test(2274.09, rbind(y, z, fill=TRUE), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA)))) +test(2274.10, rbind(z, y, fill=TRUE), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01")))) +x = data.table(a=1L, b=as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2274.11, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2274.12, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +x = data.table(c=1L, d=as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2274.13, rbind(x, z, fill=TRUE, use.names=FALSE), data.table(c = c(1L, 3L), d=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2274.14, rbind(z, x, fill=TRUE, use.names=FALSE), data.table(a=c(3L, 1L), d=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +x = data.table(a=1L, b=as.ITime(0)) +y = data.table(a=2L, b=NA) +test(2274.15, rbind(x,y), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) +test(2274.16, rbind(y,x), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) +y[, b := NULL] +test(2274.17, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) +test(2274.18, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) +# follow up to #5263 to simplify merge logic +x = data.table(a = 1L, b = as.Date("2020-01-01")) +y = data.table(a = 2L, b = NA) +test(2274.19, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) +test(2274.20, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA)))) +# rbindlist with AsIs +x = data.table(a = 1L, b=I(3L)) +y = data.table(a = 2L, b=4) +test(2274.21, rbindlist(list(x,y)), data.table(a = c(1L, 2L), b=I(c(3L, 4)))) +test(2274.22, rbindlist(list(y,x)), data.table(a = c(2L, 1L), b=c(4, 3))) +# rbind ignore attributes #3911 +x = data.table(a = structure(1:2, class=c("a", "integer")), key="a") +y = data.table(a = 2:3, key="a") +test(2274.31, merge(x,y, all.y=TRUE), data.table(a=structure(2:3, class=c("a", "integer")), key="a")) +test(2274.32, rbind(x,y), error="Class attribute .* does not match with .*") +test(2274.33, rbind(x,y, ignore.attr=TRUE), data.table(a=structure(c(1L, 2L, 2L, 3L), class=c("a", "integer")))) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 9e218c5e1..17c5c2205 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -7,7 +7,7 @@ Same as \code{do.call(rbind, l)} on \code{data.frame}s, but much faster. } \usage{ -rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) +rbindlist(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) # rbind(..., use.names=TRUE, fill=FALSE, idcol=NULL) } \arguments{ @@ -15,6 +15,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.} \item{fill}{\code{TRUE} fills missing columns with NAs, or NULL for missing list columns. By default \code{FALSE}.} \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.} + \item{ignore.attr}{Logical, default \code{FALSE}. When \code{TRUE}, allows binding columns with different attributes (e.g. class).} } \details{ Each item of \code{l} can be a \code{data.table}, \code{data.frame} or \code{list}, including \code{NULL} (skipped) or an empty object (0 rows). \code{rbindlist} is most useful when there are an unknown number of (potentially many) objects to stack, such as returned by \code{lapply(fileNames, fread)}. \code{rbind} is most useful to stack two or three objects which you know in advance. \code{\dots} should contain at least one \code{data.table} for \code{rbind(\dots)} to call the fast method and return a \code{data.table}, whereas \code{rbindlist(l)} always returns a \code{data.table} even when stacking a plain \code{list} with a \code{data.frame}, for example. @@ -54,6 +55,11 @@ rbindlist(l, use.names=TRUE, fill=TRUE, idcol=TRUE) setattr(l, 'names', c("a", "b")) rbindlist(l, use.names=TRUE, fill=TRUE, idcol="ID") +# bind different classes +DT1 = data.table(A=1:3,B=letters[1:3]) +DT2 = data.table(A=4:5,B=letters[4:5]) +setattr(DT1[["A"]], "class", c("a", "integer")) +rbind(DT1, DT2, ignore.attr=TRUE) } \keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index ee4a55d3a..cd9e40efa 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -93,6 +93,7 @@ extern SEXP char_datatable; extern SEXP char_dataframe; extern SEXP char_NULL; extern SEXP char_maxString; +extern SEXP char_AsIs; extern SEXP sym_sorted; extern SEXP sym_index; extern SEXP sym_BY; @@ -286,7 +287,7 @@ SEXP chmatchdup_R(SEXP, SEXP, SEXP); SEXP chin_R(SEXP, SEXP); SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP rbindlist(SEXP, SEXP, SEXP, SEXP); +SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP setlistelt(SEXP, SEXP, SEXP); SEXP address(SEXP); SEXP expandAltRep(SEXP); diff --git a/src/init.c b/src/init.c index 49de93746..48046b8d6 100644 --- a/src/init.c +++ b/src/init.c @@ -23,6 +23,7 @@ SEXP char_datatable; SEXP char_dataframe; SEXP char_NULL; SEXP char_maxString; +SEXP char_AsIs; SEXP sym_sorted; SEXP sym_index; SEXP sym_BY; @@ -260,6 +261,7 @@ void attribute_visible R_init_data_table(DllInfo *info) char_dataframe = PRINTNAME(install("data.frame")); char_NULL = PRINTNAME(install("NULL")); char_maxString = PRINTNAME(install("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF")); + char_AsIs = PRINTNAME(install("AsIs")); if (TYPEOF(char_integer64) != CHARSXP) { // checking one is enough in case of any R-devel changes diff --git a/src/rbindlist.c b/src/rbindlist.c index e206d4ce4..7d7578cde 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -2,16 +2,19 @@ #include #include // for isdigit -SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) +SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignoreattrArg) { if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL) error(_("%s should be TRUE or FALSE"), "fill"); if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1) error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA + if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1 || LOGICAL(ignoreattrArg)[0] == NA_LOGICAL) + error(_("%s should be TRUE or FALSE"), "ignore.attr"); if (!length(l)) return(l); if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists.")); Rboolean usenames = LOGICAL(usenamesArg)[0]; const bool fill = LOGICAL(fillArg)[0]; + const bool ignoreattr = LOGICAL(ignoreattrArg)[0]; if (fill && usenames==NA_LOGICAL) { usenames=TRUE; } @@ -275,7 +278,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) bool factor=false, orderedFactor=false; // ordered factor is class c("ordered","factor"). isFactor() is true when isOrdered() is true. int longestLen=-1, longestW=-1, longestI=-1; // just for ordered factor; longestLen must be initialized as -1 so that rbind zero-length ordered factor could work #4795 SEXP longestLevels=R_NilValue; // just for ordered factor - bool int64=false; + bool int64=false, date=false, posixct=false, itime=false, asis=false; const char *foundName=NULL; bool anyNotStringOrFactor=false; SEXP firstCol=R_NilValue; @@ -306,14 +309,25 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (INHERITS(thisCol, char_integer64)) { if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } // so the integer64 attribute gets copied to target below int64=true; + } else if (INHERITS(thisCol, char_Date)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + date=true; + } else if (INHERITS(thisCol, char_POSIXct)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + posixct=true; + } else if (INHERITS(thisCol, char_ITime)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + itime=true; + } else if (!asis && INHERITS(thisCol, char_AsIs)) { + asis=true; } if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; } else { - if (!factor && !int64) { + if (!factor && !int64 && date == posixct && !itime && !asis) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), - 0)) { - error(_("Class attribute on column %d of item %d does not match with column %d of item %d."), w+1, i+1, firstw+1, firsti+1); + 0) && !ignoreattr) { + error(_("Class attribute on column %d of item %d does not match with column %d of item %d. You can deactivate this safety-check by using ignore.attr=TRUE"), w+1, i+1, firstw+1, firsti+1); } UNPROTECT(2); } @@ -324,6 +338,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option) if (int64 && maxType!=REALSXP) error(_("Internal error: column %d of result is determined to be integer64 but maxType=='%s' != REALSXP"), j+1, type2char(maxType)); // # nocov + if (date && INHERITS(firstCol, char_IDate)) maxType=INTSXP; // first encountered Date determines class and type #5309 SEXP target; SET_VECTOR_ELT(ans, idcol+j, target=allocVector(maxType, nrow)); // does not initialize logical & numerics, but does initialize character and list if (!factor) copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB.