Skip to content

Commit

Permalink
Rbind allow binding of different class attributes (#5446)
Browse files Browse the repository at this point in the history
* add fix #5309

* fix test numbering

* add rbind for ITime

* more tests

* add merge tests

* add AsIs #4934

* add news

* news typo

* add ignore.attr argument

* fix news

* change arguments of registered rbindlist

* add attribute to usage

* move nanotime tests

* adjust test numbering

* add test coverage

* prohibit NA for ignore.att

* move news

* finish todo of #5857

* Update NEWS.md

Co-authored-by: Michael Chirico <chiricom@google.com>

* update comment

* update doc for ignore.attr

* fix nit ignoreattr

* fix test consistency

* remove setnames

* update asis test to use rbindlist

* update test comments

* update NEWS num

* NEWS wording

* more NEWS wording

* template message for i18n

* simplify condition (C boolean --> no NA to worry about)

* && not &

* correct error message

---------

Co-authored-by: Michael Chirico <chiricom@google.com>
Co-authored-by: Michael Chirico <michaelchirico4@gmail.com>
  • Loading branch information
3 people authored Jul 24, 2024
1 parent 0c022e2 commit 4fd75e2
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 26 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@

14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR.

15. `rbindlist(l, use.names=TRUE)` and `rbind` now works correctly on columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391).

`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR.

## BUG FIXES

1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.
Expand Down
8 changes: 4 additions & 4 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -2735,14 +2735,14 @@ chgroup = function(x) {
}

# plain rbind and cbind methods are registered using S3method() in NAMESPACE only from R>=4.0.0; #3948
rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL) {
rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL, ignore.attr=FALSE) {
l = lapply(list(...), function(x) if (is.list(x)) x else as.data.table(x)) #1626; e.g. psych binds a data.frame|table with a matrix
rbindlist(l, use.names, fill, idcol)
rbindlist(l, use.names, fill, idcol, ignore.attr)
}
cbind.data.table = data.table
.rbind.data.table = rbind.data.table # the workaround using this in FAQ 2.24 is still applied to support R < 4.0.0

rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) {
rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) {
if (is.null(l)) return(null.data.table())
if (!is.list(l) || is.data.frame(l)) stopf("Input is %s but should be a plain list of items to be stacked", class(l)[1L])
if (isFALSE(idcol)) { idcol = NULL }
Expand All @@ -2758,7 +2758,7 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) {
if (!miss) stopf("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. See ?rbindlist.")
use.names = NA
}
ans = .Call(Crbindlist, l, use.names, fill, idcol)
ans = .Call(Crbindlist, l, use.names, fill, idcol, ignore.attr)
if (!length(ans)) return(null.data.table())
setDT(ans)[]
}
Expand Down
16 changes: 1 addition & 15 deletions R/merge.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,21 +96,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed
# Perhaps not very commonly used, so not a huge deal that the join is redone here.
missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
# TO DO: replace by following once #5446 is merged
# if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
if (length(missingyidx)) {
yy = y[missingyidx]
othercolsx = setdiff(nm_x, by)
if (length(othercolsx)) {
# create NA rectangle with correct types and attributes of x to cbind to y
tmp = rep.int(NA_integer_, length(missingyidx))
# TO DO: use set() here instead..
yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
}
# empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
# takes care of #24 without having to save names. This is how it should be, IMHO.
dt = rbind(dt, yy, use.names=FALSE)
}
if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
}
# X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
newend = setdiff(nm_y, by.y)
Expand Down
8 changes: 8 additions & 0 deletions inst/tests/other.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,14 @@ if (loaded[["nanotime"]]) {
DT = data.table(time=nanotime(c(1,NA,3)))
test(27, na.omit(DT), DT[c(1,3)])

# rbind with vectors with class attributes #5309
x = data.table(a=1L, b=as.nanotime(0))
y = data.table(a=2L, b=NA)
test(27.01, rbind(x,y), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA))))
test(27.02, rbind(y,x), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0))))
y[, b := NULL]
test(27.03, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA))))
test(27.04, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0))))
}

# that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517
Expand Down
47 changes: 47 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -14435,6 +14435,8 @@ test(2003.81, rbind(x, y, fill=TRUE, use.names=TRUE), ans)
test(2003.82, rbind(y, x, fill=TRUE, use.names=TRUE), ans[2:1,])
test(2003.83, rbind(x, y, fill=TRUE, use.names=FALSE), ans)
test(2003.84, rbind(y, x, fill=TRUE, use.names=FALSE), ans[2:1,])
# rbindlist ignore attributes #3911
test(2003.85, rbindlist(list(), ignore.attr=1), error="ignore.attr should be TRUE or FALSE")

# chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111
x1 = "fa\xE7ile"
Expand Down Expand Up @@ -18767,3 +18769,48 @@ if (test_R.utils) {
test(2273.4, fread(tmp2), DT)
}
file.remove(tmp, tmp2)

# rbind with vectors with class attributes #5309
x = data.table(a = 1L, b = as.Date("2020-01-01"))
y = data.table(a = 2L, b = as.IDate("2021-01-01"))
z = data.table(a = 3L, b = NA)
test(2274.01, rbind(x, y), data.table(a=c(1L, 2L), b= as.Date(c("2020-01-01", "2021-01-01"))))
test(2274.02, rbind(y, x), data.table(a=c(2L, 1L), b=as.IDate(c("2021-01-01", "2020-01-01"))))
test(2274.03, rbind(x, z), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA))))
test(2274.04, rbind(z, x), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01"))))
test(2274.05, rbind(y, z), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA))))
test(2274.06, rbind(z, y), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01"))))
z[, b := NULL]
test(2274.07, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA))))
test(2274.08, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01"))))
test(2274.09, rbind(y, z, fill=TRUE), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA))))
test(2274.10, rbind(z, y, fill=TRUE), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01"))))
x = data.table(a=1L, b=as.POSIXct("2021-10-06 13:58:00 UTC"))
test(2274.11, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA))))
test(2274.12, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC"))))
x = data.table(c=1L, d=as.POSIXct("2021-10-06 13:58:00 UTC"))
test(2274.13, rbind(x, z, fill=TRUE, use.names=FALSE), data.table(c = c(1L, 3L), d=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA))))
test(2274.14, rbind(z, x, fill=TRUE, use.names=FALSE), data.table(a=c(3L, 1L), d=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC"))))
x = data.table(a=1L, b=as.ITime(0))
y = data.table(a=2L, b=NA)
test(2274.15, rbind(x,y), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA))))
test(2274.16, rbind(y,x), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0))))
y[, b := NULL]
test(2274.17, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA))))
test(2274.18, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0))))
# follow up to #5263 to simplify merge logic
x = data.table(a = 1L, b = as.Date("2020-01-01"))
y = data.table(a = 2L, b = NA)
test(2274.19, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a"))
test(2274.20, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA))))
# rbindlist with AsIs
x = data.table(a = 1L, b=I(3L))
y = data.table(a = 2L, b=4)
test(2274.21, rbindlist(list(x,y)), data.table(a = c(1L, 2L), b=I(c(3L, 4))))
test(2274.22, rbindlist(list(y,x)), data.table(a = c(2L, 1L), b=c(4, 3)))
# rbind ignore attributes #3911
x = data.table(a = structure(1:2, class=c("a", "integer")), key="a")
y = data.table(a = 2:3, key="a")
test(2274.31, merge(x,y, all.y=TRUE), data.table(a=structure(2:3, class=c("a", "integer")), key="a"))
test(2274.32, rbind(x,y), error="Class attribute .* does not match with .*")
test(2274.33, rbind(x,y, ignore.attr=TRUE), data.table(a=structure(c(1L, 2L, 2L, 3L), class=c("a", "integer"))))
8 changes: 7 additions & 1 deletion man/rbindlist.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
Same as \code{do.call(rbind, l)} on \code{data.frame}s, but much faster.
}
\usage{
rbindlist(l, use.names="check", fill=FALSE, idcol=NULL)
rbindlist(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE)
# rbind(..., use.names=TRUE, fill=FALSE, idcol=NULL)
}
\arguments{
\item{l}{ A list containing \code{data.table}, \code{data.frame} or \code{list} objects. \code{\dots} is the same but you pass the objects by name separately. }
\item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.}
\item{fill}{\code{TRUE} fills missing columns with NAs, or NULL for missing list columns. By default \code{FALSE}.}
\item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.}
\item{ignore.attr}{Logical, default \code{FALSE}. When \code{TRUE}, allows binding columns with different attributes (e.g. class).}
}
\details{
Each item of \code{l} can be a \code{data.table}, \code{data.frame} or \code{list}, including \code{NULL} (skipped) or an empty object (0 rows). \code{rbindlist} is most useful when there are an unknown number of (potentially many) objects to stack, such as returned by \code{lapply(fileNames, fread)}. \code{rbind} is most useful to stack two or three objects which you know in advance. \code{\dots} should contain at least one \code{data.table} for \code{rbind(\dots)} to call the fast method and return a \code{data.table}, whereas \code{rbindlist(l)} always returns a \code{data.table} even when stacking a plain \code{list} with a \code{data.frame}, for example.
Expand Down Expand Up @@ -54,6 +55,11 @@ rbindlist(l, use.names=TRUE, fill=TRUE, idcol=TRUE)
setattr(l, 'names', c("a", "b"))
rbindlist(l, use.names=TRUE, fill=TRUE, idcol="ID")

# bind different classes
DT1 = data.table(A=1:3,B=letters[1:3])
DT2 = data.table(A=4:5,B=letters[4:5])
setattr(DT1[["A"]], "class", c("a", "integer"))
rbind(DT1, DT2, ignore.attr=TRUE)
}
\keyword{ data }

3 changes: 2 additions & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ extern SEXP char_datatable;
extern SEXP char_dataframe;
extern SEXP char_NULL;
extern SEXP char_maxString;
extern SEXP char_AsIs;
extern SEXP sym_sorted;
extern SEXP sym_index;
extern SEXP sym_BY;
Expand Down Expand Up @@ -286,7 +287,7 @@ SEXP chmatchdup_R(SEXP, SEXP, SEXP);
SEXP chin_R(SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP);
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP setlistelt(SEXP, SEXP, SEXP);
SEXP address(SEXP);
SEXP expandAltRep(SEXP);
Expand Down
2 changes: 2 additions & 0 deletions src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ SEXP char_datatable;
SEXP char_dataframe;
SEXP char_NULL;
SEXP char_maxString;
SEXP char_AsIs;
SEXP sym_sorted;
SEXP sym_index;
SEXP sym_BY;
Expand Down Expand Up @@ -260,6 +261,7 @@ void attribute_visible R_init_data_table(DllInfo *info)
char_dataframe = PRINTNAME(install("data.frame"));
char_NULL = PRINTNAME(install("NULL"));
char_maxString = PRINTNAME(install("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"));
char_AsIs = PRINTNAME(install("AsIs"));

if (TYPEOF(char_integer64) != CHARSXP) {
// checking one is enough in case of any R-devel changes
Expand Down
25 changes: 20 additions & 5 deletions src/rbindlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@
#include <Rdefines.h>
#include <ctype.h> // for isdigit

SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignoreattrArg)
{
if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL)
error(_("%s should be TRUE or FALSE"), "fill");
if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1)
error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA
if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1 || LOGICAL(ignoreattrArg)[0] == NA_LOGICAL)
error(_("%s should be TRUE or FALSE"), "ignore.attr");
if (!length(l)) return(l);
if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists."));
Rboolean usenames = LOGICAL(usenamesArg)[0];
const bool fill = LOGICAL(fillArg)[0];
const bool ignoreattr = LOGICAL(ignoreattrArg)[0];
if (fill && usenames==NA_LOGICAL) {
usenames=TRUE;
}
Expand Down Expand Up @@ -275,7 +278,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
bool factor=false, orderedFactor=false; // ordered factor is class c("ordered","factor"). isFactor() is true when isOrdered() is true.
int longestLen=-1, longestW=-1, longestI=-1; // just for ordered factor; longestLen must be initialized as -1 so that rbind zero-length ordered factor could work #4795
SEXP longestLevels=R_NilValue; // just for ordered factor
bool int64=false;
bool int64=false, date=false, posixct=false, itime=false, asis=false;
const char *foundName=NULL;
bool anyNotStringOrFactor=false;
SEXP firstCol=R_NilValue;
Expand Down Expand Up @@ -306,14 +309,25 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
if (INHERITS(thisCol, char_integer64)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } // so the integer64 attribute gets copied to target below
int64=true;
} else if (INHERITS(thisCol, char_Date)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; }
date=true;
} else if (INHERITS(thisCol, char_POSIXct)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; }
posixct=true;
} else if (INHERITS(thisCol, char_ITime)) {
if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; }
itime=true;
} else if (!asis && INHERITS(thisCol, char_AsIs)) {
asis=true;
}
if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; }
else {
if (!factor && !int64) {
if (!factor && !int64 && date == posixct && !itime && !asis) { // prohibit binding of date and posixct
if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)),
PROTECT(getAttrib(firstCol, R_ClassSymbol)),
0)) {
error(_("Class attribute on column %d of item %d does not match with column %d of item %d."), w+1, i+1, firstw+1, firsti+1);
0) && !ignoreattr) {
error(_("Class attribute on column %d of item %d does not match with column %d of item %d. You can deactivate this safety-check by using ignore.attr=TRUE"), w+1, i+1, firstw+1, firsti+1);
}
UNPROTECT(2);
}
Expand All @@ -324,6 +338,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option)
if (int64 && maxType!=REALSXP)
error(_("Internal error: column %d of result is determined to be integer64 but maxType=='%s' != REALSXP"), j+1, type2char(maxType)); // # nocov
if (date && INHERITS(firstCol, char_IDate)) maxType=INTSXP; // first encountered Date determines class and type #5309
SEXP target;
SET_VECTOR_ELT(ans, idcol+j, target=allocVector(maxType, nrow)); // does not initialize logical & numerics, but does initialize character and list
if (!factor) copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB.
Expand Down

0 comments on commit 4fd75e2

Please sign in to comment.