From cb9ad20bfafb7515271f41d2f9f9bf947f9a92ec Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 4 Apr 2024 11:21:16 -0700 Subject: [PATCH 1/7] Revert "Deprecate key="a,b" and by="a,b" (#6047)" This reverts commit b6d61007d1da8abea3a1555aeaa56b60e9a09aef. --- NEWS.md | 4 -- R/data.table.R | 6 ++- R/fread.R | 2 +- inst/tests/tests.Rraw | 105 +++++++++++++++++++++------------------- man/IDateTime.Rd | 2 +- man/data.table.Rd | 2 +- man/duplicated.Rd | 4 +- man/fread.Rd | 2 +- man/merge.Rd | 8 +-- man/print.data.table.Rd | 2 +- man/setDT.Rd | 2 +- 11 files changed, 71 insertions(+), 68 deletions(-) diff --git a/NEWS.md b/NEWS.md index e62a60104..902f2fecc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,10 +2,6 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) -## BREAKING CHANGES - -1. Usage of comma-separated character strings representing multiple columns in `data.table()`'s `key=` argument and `[`'s `by=`/`keyby=` arguments is deprecated, [#4357](https://github.com/Rdatatable/data.table/issues/4357). While sometimes convenient, ultimately it introduces inconsistency in implementation that is not worth the benefit to maintain. NB: this hard deprecation is temporary in the development version. Before release, it will soften into the normal data.table deprecation cycle starting from introducing the new behavior with an option, then changing the default for the option with a warning, then upgrading the warning to an error before finally removing the option and the error. - ## NEW FEATURES 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. 
diff --git a/R/data.table.R b/R/data.table.R index d55132071..24eff62d5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -62,7 +62,8 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str if (!is.null(key)) { if (!is.character(key)) stopf("key argument of data.table() must be character") if (length(key)==1L) { - if (key != strsplit(key,split=",")[[1L]]) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=") + key = strsplit(key,split=",")[[1L]] + # eg key="A,B"; a syntax only useful in key argument to data.table(), really. } setkeyv(ans,key) } else { @@ -805,7 +806,8 @@ replace_dot_alias = function(e) { if (mode(bysub) == "character") { if (any(grepl(",", bysub, fixed = TRUE))) { - stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "by=") + if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. 
See ?data.table for other possibilities.", length(bysub)) + bysub = strsplit(bysub, split=",", fixed=TRUE)[[1L]] } bysub = gsub("^`(.*)`$", "\\1", bysub) # see test 138 nzidx = nzchar(bysub) diff --git a/R/fread.R b/R/fread.R index b2e55403d..b4086d155 100644 --- a/R/fread.R +++ b/R/fread.R @@ -340,7 +340,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!is.character(key)) stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") if (length(key) == 1L) { - if (key != strsplit(key,split=",")[[1L]]) stopf("Usage of comma-separated literals in %s is deprecated, please split such entries yourself before passing to data.table", "key=") + key = strsplit(key, split = ",", fixed = TRUE)[[1L]] } setkeyv(ans, key) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0b740f605..287d36713 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -334,7 +334,7 @@ test(69.4, names(tables(silent=TRUE, mb=FALSE, index=TRUE)), xenv = new.env() # to control testing tables() xenv$DT = data.table(a = 1) test(69.5, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 1 1 0 a [NULL]\nTotal: 0MB") -xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key=c("A", "D", "F", "G")) +xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key="A,D,F,G") test(69.6, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 2 8 0 A,B,C,D,E,F,... 
A,D,F,G.*Total: 0MB") rm(xenv) test(69.7, tables(order.col='asdf'), error="not a column name of info") @@ -369,7 +369,7 @@ test(82, TESTDT[,c("a","b")], data.table(a=TESTDT[[1]], b=TESTDT[[2]], key=c("a" test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b")) test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)")) test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt}) -test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = c('a', 'b'))) +test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b')) # test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated ## this is very old DT() functionality, completely different than DT() discussed in 2023 test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default @@ -396,8 +396,8 @@ test(97, TESTDT[c("f","i","b"),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c( test(98, TESTDT[SJ(c("f","i","b")),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c("b","f","i"), GroupSum=c(7L,7L,11L), key="b")) # line above is the way to group, sort by group and setkey on the result by group. 
-dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = c("A", "B")) -test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key=c("A", "B"))) +dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") +test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key="A,B")) # test [<- for column assignment dt1 <- dt2 <- dt @@ -419,7 +419,7 @@ test(106, all(dt + dt > 1)) test(107, dt + dt, dt * 2L) # test a few other generics: -test(108, dt, data.table(t(t(dt)),key=c('A', 'B'))) +test(108, dt, data.table(t(t(dt)),key="A,B")) test(109, all(!is.na(dt))) dt2 <- dt dt2$A[1] <- NA # removes key @@ -649,7 +649,7 @@ test(184, xx[a>6,sum(b),by=a], data.table(a=integer(),V1=integer())) # Tests of bug 1015 highlight by Harish # See thread "'by without by' now heeds nomatch=NA" # Tests 185-201 were added in above next to originals -x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key=c('a', 'b')) +x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key="a,b") y <- data.table(g=c("a","b","c","d"),h=c("A","A","A","A")) test(202, x[y], x[y,mult="all"]) test(203, x[y,d], c(1,2,NA,NA)) @@ -775,7 +775,7 @@ test(243, X[Y][,sum(foo*bar)], 195L) # test(245, X[Y,sum(foo*bar),mult="last"], data.table(a=2:3,V1=c(36L,56L))) # joining to less than all X's key colums (in examples but can't see formal test) -X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key=c('a', 'b')) +X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key="a,b") test(246.1, X["A"], X[1:2]) # checks that X[1:2] retains key, too test(246.2, key(X["A"]), c("a","b")) test(247, X["C"]$v, NA_integer_) @@ -959,7 +959,7 @@ test(295.3,DT,data.table(a=1:3,b=4:6,key="a")) # The := was on the local copy # new feature added 1.6.3, that key can be vector. 
-test(296,data.table(a=1:3,b=4:6,key=c('a', 'b')),data.table(a=1:3,b=4:6,key=c("a","b"))) +test(296,data.table(a=1:3,b=4:6,key="a,b"),data.table(a=1:3,b=4:6,key=c("a","b"))) # test .SDcols (not speed, just operation) DT = data.table(grp=1:3,A1=1:9,A2=10:18,A3=19:27,B1=101:109,B2=110:118,B3=119:127,key="grp") @@ -986,7 +986,7 @@ test(299.11, DT[1,c:=42L], data.table(a=1:3, c=TRUE), warning="42.*integer.*at R test(299.12, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=c(TRUE,FALSE,FALSE))) # Test bug fix #1468, combining i and by. -DT = data.table(a=1:3,b=1:9,v=1:9,key=c('a', 'b')) +DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT)) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) @@ -1455,7 +1455,7 @@ unlink(f) # Test CJ problems with v1.7.4, #1689 test(463, all(sapply(CJ(1:2,1:3),length)==6L)) -DT = data.table(x=1:4,y=1:2,cnt=1L,key=c('x', 'y')) +DT = data.table(x=1:4,y=1:2,cnt=1L,key="x,y") test(464, DT[CJ(1:4,1:4)]$cnt, INT(1,rep(NA,4),1,NA,NA,1,rep(NA,4),1,NA,NA)) test(465, DT[CJ(1:4,1:4), sum(cnt>0), by=.EACHI]$y, rep(1:4,4)) f1 = factor(c("READING","MATHEMATICS")) @@ -1534,7 +1534,7 @@ test(483.2, DT, data.table(x=1:4)) # i.e. 
DT as it was before, without foo bein test(484, DT[,c("foo","bar"):=list(20L,numeric())], data.table(x=1:4, foo=20L, bar=NA_real_)) # Test i's key longer than x's -d1 <- data.table(a=1:2, b=11:14, key=c('a', 'b')) +d1 <- data.table(a=1:2, b=11:14, key="a,b") d2 <- data.table(A=0:1, B=1:4, key="A") test(485, d2[d1, allow.cartesian=TRUE], data.table(A=INT(1,1,1,1,2,2),B=INT(2,4,2,4,NA,NA),b=INT(11,11,13,13,12,14),key="A")) test(486, d2[d1,sum(B),by=.EACHI], data.table(A=INT(1,1,2,2),V1=INT(6,6,NA,NA),key="A")) # no allow.cartesian needed due to by-without-by @@ -1601,7 +1601,7 @@ dtA = data.table(i = 1:8, j = rep(1:2, 4), k = rep(1:4, 2), A = 10:17) dtB = data.table(j = rep(1:2, 2), k = 1:4, B = 18:21) test(502, merge(dtA, dtB, by = c("j","k"), all.x = TRUE), data.table(j=rep(1:2,each=4), k=rep(INT(1,3,2,4),each=2), i=INT(1,5,3,7,2,6,4,8), - A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key=c('j', 'k'))) + A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key="j,k")) test(503, dtA$i, 1:8) # check that merge didn't change the order of dtA by reference test(504, dtB$k, 1:4) # or dtB @@ -1686,7 +1686,8 @@ test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) test(542, DT[,sum(v),keyby=c("a","b","c")]$V1, INT(1,3,4,6,5,2)) -# tests 543,544 were of deprecated behavior to allow comma-separated entries to keyby +test(543, DT[,sum(v),keyby="a,b,c"]$V1, INT(1,3,4,6,5,2)) +test(544, DT[,sum(v),keyby=c("a","b,c")], error="but one or more items include a comma") # Test single expressions passed to by, FR#1743 in v1.8.0 DT = data.table(a=1:4,date=as.IDate("2012-02-28")+0:3,v=5:8) @@ -1753,16 +1754,20 @@ test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be gro test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because") test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't 
be grouped because") -# tests 571-573 were of deprecated behavior to allow comma-separated entries in by= +# Test spaces in by="..." format, datatable-help on 31 March +DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) +test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) +test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) +test(573, DT[,sum(v),by="b, a"], error=base_messages$missing_object(" a")) # Test base::unname, used by melt, and only supported by data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) test(574, dim(unname(DT)), 3:2) # Test that CJ retains explicit names (useful if used independently) -test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key=c('x', 'y'))) -test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key=c('V1', 'y'))) -test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key=c('x', 'V2'))) +test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="x,y")) +test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="V1,y")) +test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key="x,V2")) # Test factor to character join when factor contains unused and reverse order levels : X = data.table(a=LETTERS[1:4],v=1:4,key="a") @@ -2289,7 +2294,7 @@ RHS = as.integer(DT$a) test(754.6, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") # Used to test warning on redundant by (#2282) but by=.EACHI has now superseded -DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key=c('a', 'b')) +DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key="a,b") test(755, DT[c("b","c"),sum(x),by=.EACHI], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) test(756, DT[c("b","c"),sum(x),by=a], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) 
test(757, DT[list(c("b","c"),"d"),sum(x),by=a], data.table(a=c("b","c"),V1=2:3,key="a")) # 'by' less than number of join columns @@ -2550,15 +2555,15 @@ test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) options(datatable.optimize = 0L) -test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") +test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)") options(datatable.optimize = 1L) -test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") +test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)") options(datatable.optimize = 2L) -test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], +test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="GForce optimized.*gsum[(]v[)], gsum[(]w[)]") # v1.9.7 treats wrapped {} better, so this is now optimized options(datatable.optimize = Inf) test(866, names(ans1), c("a","b","name1","name2")) -test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a","b","name1","name2")) # list names extracted here +test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by="a,b"]), c("a","b","name1","name2")) # list names extracted here test(868, ans1, ans2) # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") @@ -2750,9 +2755,9 @@ DT = data.table(a=1:3,b=1:6) test(916, DT[,newcol:=logical(0),by=a], data.table(a=1:3,b=1:6,newcol=NA)) # roll join error when non last join column is factor, #2450 -X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key=c('state', 'uid', 'ts')) -Y = data.table(id=3001:3004, 
uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key=c('state', 'uid', 'ts')) -test(917.1, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key=c('state', 'uid', 'ts'))) +X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts') +Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key='state,uid,ts') +test(917.1, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key='state,uid,ts')) test(917.2, X[Y, on=c("id","state"), roll=TRUE], error="Attempting roll join on factor column when joining x.state to i.state") # NA in join column of type double, #2453. @@ -2797,7 +2802,7 @@ DT[,num:=1:.N] # to group each row by itself test(931, DT[,cbind(.SD,dup=1:rep),by="num"], data.table(num=INT(1,2,2,3:7,7,7),x=c(1,1,1,1,1,2,2,3,3,3),y=c(1,1,1,2,3,1,1,2,2,2),rep=INT(1,2,2,1,1,1,1,3,3,3), dup=INT(1,1,2,1,1,1,1,1,2,3))) # New roll=+/- and rollends -DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key=c('a', 'b')) +DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key="a,b") test(932, DT[J(c(0,2,6,8)), roll=+Inf, rollends=TRUE, v], INT(1,1,6,7)) test(933, DT[J(c(0,2,6,8)), roll=-Inf, rollends=TRUE, v], INT(1,2,7,7)) test(934, DT[J(c(0,2,6,8)), roll=+Inf, v], INT(NA,1,6,7)) @@ -2928,7 +2933,7 @@ test(985.2, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C",NA))) ## Allow unique/duplicated to accept custom colum combination to query for ## uniqueness -dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = c('A', 'B')) +dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") df <- 
as.data.frame(dt) test(986, unique(dt, by=key(dt)), dt[!duplicated(df[, key(dt)]),]) test(987, unique(dt, by='A'), dt[!duplicated(df[, 'A'])]) @@ -3572,11 +3577,11 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, test(1102.01, dcast(DT, time ~ variable, fun.aggregate=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21),weight=c(2053,2461,9647,9841), key="time")) test(1102.02, dcast(DT, diet ~ variable, fun.aggregate=sum), data.table(diet=factor(1:4), weight=c(22582, 14714, 17154, 15961), key="diet")) test(1102.03, dcast(DT, diet+chick ~ time, drop=FALSE)[c(1,.N),c(1:4,13:14)], - ans<-data.table(diet=factor(c(1,4)), chick=ordered(c(18,48),levels=levels(DT$chick)), "0"=39, "2"=c(35,50), "20"=c(NA,303), "21"=c(NA,322), key=c('diet', 'chick'))) + ans<-data.table(diet=factor(c(1,4)), chick=ordered(c(18,48),levels=levels(DT$chick)), "0"=39, "2"=c(35,50), "20"=c(NA,303), "21"=c(NA,322), key="diet,chick")) test(1102.04, dcast(DT, diet+chick ~ time, drop=FALSE, fill=0)[c(1,.N),c(1:4,13:14)], ans[1, c("20","21"):=0]) # add test for 'subset=' in dcast test(1102.05, dcast(DT, time + chick ~ variable+diet, fun.aggregate=sum, subset=.(time> 20))[c(1,2,44,.N)], - data.table(time=21, chick=ordered(c(13,9,42,48), levels=levels(DT$chick)), weight_1=c(96,98,0,0), weight_2=0, weight_3=0, weight_4=c(0,0,281,322), key=c('time', 'chick'))) + data.table(time=21, chick=ordered(c(13,9,42,48), levels=levels(DT$chick)), weight_1=c(96,98,0,0), weight_2=0, weight_3=0, weight_4=c(0,0,281,322), key="time,chick")) # testing without aggregation set.seed(3) @@ -3628,7 +3633,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, v=factor(NA, levels=tail(letters,5)), x=factor(NA, levels=tail(letters,5)), y=factor(c(NA,"y",NA), levels=tail(letters,5)), - z=factor(NA, levels=tail(letters,5)), key=c("a1", "a2", "a3"))) + z=factor(NA, levels=tail(letters,5)), key="a1,a2,a3")) # dcast bug fix for 'subset' argument (it doesn't get key set before 
to run C-fcast): DT = data.table(x=c(1,1,1,2,2,2,1,1), y=c(1,2,3,1,2,1,1,2), z=c(1,2,3,NA,4,5,NA,NA)) @@ -3699,17 +3704,17 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L) test(1102.31, dcast(DT, x + y ~ z, fun.aggregate=sum, value.var=c("d1","d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,4), .SDcols=c("d1_a","d1_b")][], - data.table(x=INT(1,5), y=INT(1,1), d1_a=c(0.0,0.4785), d1_b=c(0.8753,0.9804), d2_a=INT(0,1), d2_b=INT(1,3), key=c('x', 'y'))) + data.table(x=INT(1,5), y=INT(1,1), d1_a=c(0.0,0.4785), d1_b=c(0.8753,0.9804), d2_a=INT(0,1), d2_b=INT(1,3), key="x,y")) # multiple fun.agg test(1102.32, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var="d1")[c(1,.N)][, 3:6:=lapply(.SD,round,3), .SDcols=3:6][], - data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327), key=c('x', 'y'))) + data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327), key="x,y")) # multiple fun.agg and value.var (all combinations) test(1102.33, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=c("d1", "d2"))[c(1,.N)][, c(3,4,7:10):=lapply(.SD,round,3), .SDcols=c(3,4,7:10)][], data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_sum_a=INT(0,1),d2_sum_b=INT(1,3), - d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key=c('x', 'y'))) + d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key="x,y")) # multiple fun.agg and value.var (one-to-one) test(1102.34, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=list("d1", "d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,3), .SDcols=3:4][], - data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), 
d1_sum_b=c(0.875,0.980),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key=c('x', 'y'))) + data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key="x,y")) # Additional test after fixing fun.agg creation - using the example here: https://github.com/Rdatatable/data.table/issues/716 DT = data.table(x=1:5, y=paste("v", 1:5, sep=""), v1=6:10, v2=11:15, k1=letters[1:5], k2=letters[6:10]) @@ -4422,7 +4427,7 @@ test(1220, set(DT,j=2:3,value=newVals), data.table(a=1:3,b=16:18,c=19:21)) # Test non-join key columns used in j work again (spotted straight away by Michele on datatable-help when v1.9.2 was released). # Introduced at commit 1030. Very extensive new tests 1136* still all pass (great stuff Arun). -DT = data.table(a=1:2,b=letters[1:6],key=c('a', 'b')) +DT = data.table(a=1:2,b=letters[1:6],key="a,b") test(1221, DT[.(1),b], c("a","c","e")) ########################################################################################### @@ -5230,7 +5235,7 @@ test(1305.13, setDF(dt, rownames=rep("a",5)), error='rownames contains duplicate # .SD retains as much of head(key) as appropriate. 
# by= always keeps data appearance order, so it's which columns are grouped and selected that drive how much of key is retained -DT = data.table(a=1:3,b=1:6,c=1:6,key=c('a', 'b')) +DT = data.table(a=1:3,b=1:6,c=1:6,key="a,b") test(1306, DT[1:2,key(.SD)], c("a","b")) test(1307, DT[2:1,key(.SD)], NULL) test(1308, DT[,key(.SD),by=a], data.table(a=integer())) @@ -5299,9 +5304,9 @@ test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c" # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = -c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = c('x', 'y')) -dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = c('x', 'y')) -test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key=c('x', 'y'))) +c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = "x,y") +dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = "x,y") +test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key="x,y")) # also test where 'i' is not sorted. 
set.seed(1L) @@ -7359,10 +7364,10 @@ x = c(1, 2, 1) y = c(5, 8, 8, 4) w = c(10, 12, 12, 13) # already sorted but has dups; more efficient case to cover # tests 1525.1, 1525.2 tested the now-ineffectual datatable.CJ.names option. -ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key=c('V1', 'z')) +ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key="V1,z") test(1525.3, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) test(1525.4, CJ(x, z=y, unique=TRUE), setnames(copy(ans),c("x","z"))) -test(1525.5, CJ(x, w, unique=TRUE), data.table(x=(rep(c(1,2), each=3)), w=c(10,12,13), key=c('x', 'w'))) +test(1525.5, CJ(x, w, unique=TRUE), data.table(x=(rep(c(1,2), each=3)), w=c(10,12,13), key="x,w")) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) @@ -7682,7 +7687,7 @@ setkey(x1, a1, a2) test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y))) test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2")) # also test shallow here so as to catch future regressions -x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key=c('a1', 'a2')) +x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2") test(1545.01, key(.shallow(x1, cols="a2")), NULL) test(1545.02, key(.shallow(x1, retain.key=FALSE)), NULL) test(1545.03, key(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) @@ -9091,7 +9096,7 @@ test(1630.09, copy(dt1)[id>5, z:=2L, nomatch=0L], copy(dt1)[ test(1630.10, copy(dt1)[id>5, z:=2L, nomatch=NA], copy(dt1)[,z:=NA_integer_], warning="ignoring nomatch") # fix for #1268, on= retains keys correctly. 
-A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key=c('site', 'date')) +A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key="site,date") B = data.table(x=c(10,20), y=c(100,200), key="x") test(1631, key(A[B, on="x"]), NULL) @@ -13153,8 +13158,8 @@ setindex(DT, NULL) test(1942.06, indices(DT), NULL) setindex(DT,id1,id2) test(1942.07, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using uniqlist on index 'id1__id2'") -test(1942.08, DT[,sum(v),keyby=.(id1,id2),verbose=TRUE], data.table(id1=c("A","C","C","D"), id2=INT(9,2,3,3), V1=INT(6,3,5,1), key=c('id1', 'id2')), output="Finding groups using uniqlist on index 'id1__id2'") -test(1942.09, DT[,sum(v),keyby=.(id2,id1),verbose=TRUE], data.table(id2=INT(2,3,3,9), id1=c("C","C","D","A"), V1=INT(3,5,1,6), key=c('id2', 'id1')), output="Finding groups using forderv") +test(1942.08, DT[,sum(v),keyby=.(id1,id2),verbose=TRUE], data.table(id1=c("A","C","C","D"), id2=INT(9,2,3,3), V1=INT(6,3,5,1), key="id1,id2"), output="Finding groups using uniqlist on index 'id1__id2'") +test(1942.09, DT[,sum(v),keyby=.(id2,id1),verbose=TRUE], data.table(id2=INT(2,3,3,9), id1=c("C","C","D","A"), V1=INT(3,5,1,6), key="id2,id1"), output="Finding groups using forderv") options(datatable.use.index=FALSE) test(1942.10, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using forderv") options(datatable.use.index=TRUE) @@ -13164,7 +13169,7 @@ set.seed(2) DT = data.table(real=sample((1:1500)/1000, 10000, replace=TRUE), id=sample(letters, 1000, replace=TRUE), value=1:10000) setkey(DT,id,real) test(1942.11, DT[, .(list(value)), keyby=.(id,real), verbose=TRUE][c(1,6,8744,.N)], - data.table(id=c("a","a","z","z"), real=c(0.004,0.037,1.486,1.497), V1=list(9441L, c(3375L,5983L), c(4901L,5260L,7668L), 4181L), key=c('id', 'real')), + 
data.table(id=c("a","a","z","z"), real=c(0.004,0.037,1.486,1.497), V1=list(9441L, c(3375L,5983L), c(4901L,5260L,7668L), 4181L), key="id,real"), output="Finding groups using uniqlist on key") setindex(DT,real) test(1942.12, DT[, sum(value), keyby=real, verbose=TRUE][c(1,500,1498,.N)], data.table(real=c(0.001, 0.501, 1.499, 1.5), V1=INT(31036,37564,14792,38606), key="real"), @@ -13186,8 +13191,8 @@ DT2 <- data.table( test(1943.1, (ans<-DT1[DT2])[,1:4], DT1) # ok before test(1943.2, DT1[DT2, on=c("id","date","period")], ans) # ok before test(1943.3, DT1[DT2, on=c("id","date","period","year")], ans[,1:4]) # no warning (longer object length is not a multiple) -DT1 = data.table(id=c("A","A","A"), date=1:3, val=7:9, key=c('id', 'date')) -DT2 = data.table(id=c("A","A","A"), date=1:3, date2=3:1, key=c('id', 'date')) +DT1 = data.table(id=c("A","A","A"), date=1:3, val=7:9, key="id,date") +DT2 = data.table(id=c("A","A","A"), date=1:3, date2=3:1, key="id,date") test(1943.4, DT1[DT2, on=c("id",date="date2")], data.table(id="A", date=3:1, val=9:7, i.date=1:3)) # was invalidly keyed by id,date in 1.11.6 @@ -15674,7 +15679,7 @@ test(2069.28, data.table(c='1', d=2)[ , c(a='b'), by=c, verbose=TRUE], output='j test(2069.29, data.table(c = '1', d = 2)[ , .(a = c(nm='b')), by = c, verbose = TRUE], output = 'Column 1 of j is a named vector') DT <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], z = 0:3 + (4:1)*1i) test(2069.30, DT[, .SD[3,], by=b], DT[9:12, .(b, a, z)]) -DT = data.table(x=1:4,y=1:2,lgl=TRUE,key=c('x', 'y')) +DT = data.table(x=1:4,y=1:2,lgl=TRUE,key="x,y") test(2069.31, DT[CJ(1:4,1:4), any(lgl), by=.EACHI]$V1, c(TRUE, NA, NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA, NA, NA, TRUE, NA, NA)) set.seed(45L) @@ -15858,7 +15863,7 @@ test(2074.23, capture.output(print(DT2, topn=1L, col.names='none')), c(" 1: 1", " --- ", "101: 101")) # foverlaps -x = data.table(start=NA_integer_, end=1L, key=c('start', 'end')) +x = data.table(start=NA_integer_, end=1L, key='start,end') y = 
copy(x) test(2074.24, foverlaps(x, y), error="NA values in data.table x 'start' column") x[ , start := 0L] diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd index be7620890..928e732bc 100644 --- a/man/IDateTime.Rd +++ b/man/IDateTime.Rd @@ -241,7 +241,7 @@ identical(as.ITime("10:45"), methods::as("10:45", "ITime")) as.POSIXct("2001-01-01") + as.ITime("10:45") datetime <- seq(as.POSIXct("2001-01-01"), as.POSIXct("2001-01-03"), by = "5 hour") -(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = c("a", "idate", "itime"))) +(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = "a,idate,itime")) af[, mean(a), by = "itime"] af[, mean(a), by = list(hour = hour(itime))] diff --git a/man/data.table.Rd b/man/data.table.Rd index 557139e2f..2e326fed0 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -44,7 +44,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}.} - \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}.} + \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}.} \item{stringsAsFactors}{Logical (default is \code{FALSE}). Convert all \code{character} columns to \code{factor}s?} diff --git a/man/duplicated.Rd b/man/duplicated.Rd index e17d8df0c..daf7c39d5 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -88,7 +88,7 @@ If none exists, 0L is returned. 
} \examples{ DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), - C = rep(1:2, 6), key = c("A", "B")) + C = rep(1:2, 6), key = "A,B") duplicated(DT) unique(DT) @@ -113,7 +113,7 @@ identical(unique(DT),DT[10]) # FALSE # fromLast=TRUE DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), - C = rep(1:2, 6), key = c("A", "B")) + C = rep(1:2, 6), key = "A,B") duplicated(DT, by="B", fromLast=TRUE) unique(DT, by="B", fromLast=TRUE) diff --git a/man/fread.Rd b/man/fread.Rd index 49b187364..b431969dc 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -55,7 +55,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} - \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } + \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. 
Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. } \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. The default for this argument can be changed with \code{options(datatable.fread.datatable=FALSE)}.} diff --git a/man/merge.Rd b/man/merge.Rd index d374da076..d8246668c 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -87,16 +87,16 @@ merge(dt1, dt2, all = TRUE) (dt2 <- data.table(A = letters[rep(2:4, 2)], Y = 6:1, key = "A")) merge(dt1, dt2, allow.cartesian=TRUE) -(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = c("A", "B"))) -(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = c("A", "B"))) +(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = "A,B")) +(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = "A,B")) merge(dt1, dt2) merge(dt1, dt2, by="B", allow.cartesian=TRUE) # test it more: -d1 <- data.table(a=rep(1:2,each=3), b=1:6, key=c("a", "b")) +d1 <- data.table(a=rep(1:2,each=3), b=1:6, key="a,b") d2 <- data.table(a=0:1, bb=10:11, key="a") d3 <- data.table(a=0:1, key="a") -d4 <- data.table(a=0:1, b=0:1, key=c("a", "b")) +d4 <- data.table(a=0:1, b=0:1, key="a,b") merge(d1, d2) merge(d2, d1) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index a39c8c446..bda7a9b78 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -83,7 +83,7 @@ print(DT, row.names = FALSE) #`print.keys` can alert which columns are currently keys - DT <- data.table(a=1:3, b=4:6, c=7:9, key=c("b", "a")) + DT <- data.table(a=1:3, b=4:6, c=7:9, key="b,a") setindexv(DT, c("a", "b")) setindexv(DT, "a") 
print(DT, print.keys=TRUE) diff --git a/man/setDT.Rd b/man/setDT.Rd index 9311d0e3b..c00ba0f46 100644 --- a/man/setDT.Rd +++ b/man/setDT.Rd @@ -13,7 +13,7 @@ setDT(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) \arguments{ \item{x}{ A named or unnamed \code{list}, \code{data.frame} or \code{data.table}. } \item{keep.rownames}{ For \code{data.frame}s, \code{TRUE} retains the \code{data.frame}'s row names under a new column \code{rn}. \code{keep.rownames = "id"} names the column \code{"id"} instead. } - \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. } + \item{key}{Character vector of one or more column names which is passed to \code{\link{setkeyv}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. } \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}. } } From 609c8b358aba62db858ab680b7ed5ee1605049a3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 17:36:26 -0700 Subject: [PATCH 2/7] restore changes to usages of "a,b" in the package --- NEWS.md | 4 ++ inst/tests/tests.Rraw | 105 +++++++++++++++++++--------------------- man/IDateTime.Rd | 2 +- man/data.table.Rd | 2 +- man/duplicated.Rd | 4 +- man/fread.Rd | 2 +- man/merge.Rd | 8 +-- man/print.data.table.Rd | 2 +- man/setDT.Rd | 2 +- 9 files changed, 65 insertions(+), 66 deletions(-) diff --git a/NEWS.md b/NEWS.md index 77388e7fb..3d5b2f81c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) +## BREAKING CHANGES + +1. Usage of comma-separated character strings representing multiple columns in `data.table()`'s `key=` argument and `[`'s `by=`/`keyby=` arguments is deprecated, [#4357](https://github.com/Rdatatable/data.table/issues/4357). 
While sometimes convenient, ultimately it introduces inconsistency in implementation that is not worth the benefit to maintain. NB: this hard deprecation is temporary in the development version. Before release, it will soften into the normal data.table deprecation cycle starting from introducing the new behavior with an option, then changing the default for the option with a warning, then upgrading the warning to an error before finally removing the option and the error. + ## NEW FEATURES 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b526c969f..977b29b5c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -339,7 +339,7 @@ test(69.4, names(tables(silent=TRUE, mb=FALSE, index=TRUE)), xenv = new.env() # to control testing tables() xenv$DT = data.table(a = 1) test(69.5, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 1 1 0 a [NULL]\nTotal: 0MB") -xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key="A,D,F,G") +xenv$DT = data.table(A=1:2, B=3:4, C=5:6, D=7:8, E=9:10, F=11:12, G=13:14, H=15:16, key=c("A", "D", "F", "G")) test(69.6, nrow(tables(env=xenv)), 1L, output="NAME NROW NCOL MB COLS KEY\n1: DT 2 8 0 A,B,C,D,E,F,... 
A,D,F,G.*Total: 0MB") rm(xenv) test(69.7, tables(order.col='asdf'), error="not a column name of info") @@ -374,7 +374,7 @@ test(82, TESTDT[,c("a","b")], data.table(a=TESTDT[[1]], b=TESTDT[[2]], key=c("a" test(83, TESTDT[,list("a","b")], data.table(V1="a",V2="b")) test(83.1, TESTDT[,list("sum(a),sum(b)")], data.table("sum(a),sum(b)")) test(83.2, TESTDT[,list("sum(a),sum(b)"),by=a], {tt=data.table(a=c("a","c","d","g"),V1="sum(a),sum(b)",key="a");tt$V1=as.character(tt$V1);tt}) -test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = 'a,b')) +test(84, TESTDT[1:2,list(a,b)], data.table(a=c("a","c"), b=c("e","e"), key = c('a', 'b'))) # test(85, TESTDT[1:2,DT(a,b)], data.table(a=c("a","c"), b=c("e","e"))) #DT() now deprecated ## this is very old DT() functionality, completely different than DT() discussed in 2023 test(86, TESTDT[,sum(v),by="b"], data.table(b=c("e","f","i","b"),V1=INT(3,7,11,7))) # TESTDT is key'd by a,b, so correct that grouping by b should not be key'd in the result by default @@ -401,8 +401,8 @@ test(97, TESTDT[c("f","i","b"),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c( test(98, TESTDT[SJ(c("f","i","b")),list(GroupSum=sum(v)),by=.EACHI], data.table(b=c("b","f","i"), GroupSum=c(7L,7L,11L), key="b")) # line above is the way to group, sort by group and setkey on the result by group. 
-dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") -test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key="A,B")) +dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = c("A", "B")) +test(99, unique(dt, by=key(dt)), data.table(dt[c(1L, 4L, 5L, 7L, 9L, 10L)], key=c("A", "B"))) # test [<- for column assignment dt1 <- dt2 <- dt @@ -424,7 +424,7 @@ test(106, all(dt + dt > 1)) test(107, dt + dt, dt * 2L) # test a few other generics: -test(108, dt, data.table(t(t(dt)),key="A,B")) +test(108, dt, data.table(t(t(dt)),key=c('A', 'B'))) test(109, all(!is.na(dt))) dt2 <- dt dt2$A[1] <- NA # removes key @@ -654,7 +654,7 @@ test(184, xx[a>6,sum(b),by=a], data.table(a=integer(),V1=integer())) # Tests of bug 1015 highlight by Harish # See thread "'by without by' now heeds nomatch=NA" # Tests 185-201 were added in above next to originals -x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key="a,b") +x <- data.table(a=c("a","b","d","e"),b=c("A","A","B","B"),d=c(1,2,3,4), key=c('a', 'b')) y <- data.table(g=c("a","b","c","d"),h=c("A","A","A","A")) test(202, x[y], x[y,mult="all"]) test(203, x[y,d], c(1,2,NA,NA)) @@ -780,7 +780,7 @@ test(243, X[Y][,sum(foo*bar)], 195L) # test(245, X[Y,sum(foo*bar),mult="last"], data.table(a=2:3,V1=c(36L,56L))) # joining to less than all X's key colums (in examples but can't see formal test) -X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key="a,b") +X=data.table(a=rep(LETTERS[1:2],2:3),b=1:5,v=10:14,key=c('a', 'b')) test(246.1, X["A"], X[1:2]) # checks that X[1:2] retains key, too test(246.2, key(X["A"]), c("a","b")) test(247, X["C"]$v, NA_integer_) @@ -964,7 +964,7 @@ test(295.3,DT,data.table(a=1:3,b=4:6,key="a")) # The := was on the local copy # new feature added 1.6.3, that key can be vector. 
-test(296,data.table(a=1:3,b=4:6,key="a,b"),data.table(a=1:3,b=4:6,key=c("a","b"))) +test(296,data.table(a=1:3,b=4:6,key=c('a', 'b')),data.table(a=1:3,b=4:6,key=c("a","b"))) # test .SDcols (not speed, just operation) DT = data.table(grp=1:3,A1=1:9,A2=10:18,A3=19:27,B1=101:109,B2=110:118,B3=119:127,key="grp") @@ -991,7 +991,7 @@ test(299.11, DT[1,c:=42L], data.table(a=1:3, c=TRUE), warning="42.*integer.*at R test(299.12, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=c(TRUE,FALSE,FALSE))) # Test bug fix #1468, combining i and by. -DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") +DT = data.table(a=1:3,b=1:9,v=1:9,key=c('a', 'b')) test(300, DT[J(1),sum(v),by=b], data.table(b=c(1L,4L,7L),V1=c(1L,4L,7L))) # should not retain key because by= is not on head(key(DT)) test(300.1, DT[J(1:2),sum(v),by=b], data.table(b=c(1L,4L,7L,2L,5L,8L),V1=c(1L,4L,7L,2L,5L,8L))) @@ -1460,7 +1460,7 @@ unlink(f) # Test CJ problems with v1.7.4, #1689 test(463, all(sapply(CJ(1:2,1:3),length)==6L)) -DT = data.table(x=1:4,y=1:2,cnt=1L,key="x,y") +DT = data.table(x=1:4,y=1:2,cnt=1L,key=c('x', 'y')) test(464, DT[CJ(1:4,1:4)]$cnt, INT(1,rep(NA,4),1,NA,NA,1,rep(NA,4),1,NA,NA)) test(465, DT[CJ(1:4,1:4), sum(cnt>0), by=.EACHI]$y, rep(1:4,4)) f1 = factor(c("READING","MATHEMATICS")) @@ -1539,7 +1539,7 @@ test(483.2, DT, data.table(x=1:4)) # i.e. 
DT as it was before, without foo bein test(484, DT[,c("foo","bar"):=list(20L,numeric())], data.table(x=1:4, foo=20L, bar=NA_real_)) # Test i's key longer than x's -d1 <- data.table(a=1:2, b=11:14, key="a,b") +d1 <- data.table(a=1:2, b=11:14, key=c('a', 'b')) d2 <- data.table(A=0:1, B=1:4, key="A") test(485, d2[d1, allow.cartesian=TRUE], data.table(A=INT(1,1,1,1,2,2),B=INT(2,4,2,4,NA,NA),b=INT(11,11,13,13,12,14),key="A")) test(486, d2[d1,sum(B),by=.EACHI], data.table(A=INT(1,1,2,2),V1=INT(6,6,NA,NA),key="A")) # no allow.cartesian needed due to by-without-by @@ -1606,7 +1606,7 @@ dtA = data.table(i = 1:8, j = rep(1:2, 4), k = rep(1:4, 2), A = 10:17) dtB = data.table(j = rep(1:2, 2), k = 1:4, B = 18:21) test(502, merge(dtA, dtB, by = c("j","k"), all.x = TRUE), data.table(j=rep(1:2,each=4), k=rep(INT(1,3,2,4),each=2), i=INT(1,5,3,7,2,6,4,8), - A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key="j,k")) + A=INT(10,14,12,16,11,15,13,17), B=rep(INT(18,20,19,21),each=2), key=c('j', 'k'))) test(503, dtA$i, 1:8) # check that merge didn't change the order of dtA by reference test(504, dtB$k, 1:4) # or dtB @@ -1691,8 +1691,7 @@ test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) test(542, DT[,sum(v),keyby=c("a","b","c")]$V1, INT(1,3,4,6,5,2)) -test(543, DT[,sum(v),keyby="a,b,c"]$V1, INT(1,3,4,6,5,2)) -test(544, DT[,sum(v),keyby=c("a","b,c")], error="but one or more items include a comma") +# tests 543,544 were of deprecated behavior to allow comma-separated entries to keyby # Test single expressions passed to by, FR#1743 in v1.8.0 DT = data.table(a=1:4,date=as.IDate("2012-02-28")+0:3,v=5:8) @@ -1759,20 +1758,16 @@ test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be gro test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because") test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't 
be grouped because") -# Test spaces in by="..." format, datatable-help on 31 March -DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) -test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) -test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) -test(573, DT[,sum(v),by="b, a"], error=base_messages$missing_object(" a")) +# tests 571-573 were of deprecated behavior to allow comma-separated entries in by= # Test base::unname, used by melt, and only supported by data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) test(574, dim(unname(DT)), 3:2) # Test that CJ retains explicit names (useful if used independently) -test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="x,y")) -test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key="V1,y")) -test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key="x,V2")) +test(575, CJ(x=c(1L,2L), y=c("a","b")), data.table(x=c(1L,1L,2L,2L),y=c("a","b","a","b"),key=c('x', 'y'))) +test(576, CJ(c(1L,2L), y=c("a","b")), data.table(V1=c(1L,1L,2L,2L),y=c("a","b","a","b"),key=c('V1', 'y'))) +test(577, CJ(x=c(1L,2L), c("a","b")), data.table(x=c(1L,1L,2L,2L),V2=c("a","b","a","b"),key=c('x', 'V2'))) # Test factor to character join when factor contains unused and reverse order levels : X = data.table(a=LETTERS[1:4],v=1:4,key="a") @@ -2299,7 +2294,7 @@ RHS = as.integer(DT$a) test(754.6, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") # Used to test warning on redundant by (#2282) but by=.EACHI has now superseded -DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key="a,b") +DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key=c('a', 'b')) test(755, DT[c("b","c"),sum(x),by=.EACHI], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) test(756, DT[c("b","c"),sum(x),by=a], data.table(a=c("b","c"),V1=c(7L,9L),key="a")) 
test(757, DT[list(c("b","c"),"d"),sum(x),by=a], data.table(a=c("b","c"),V1=2:3,key="a")) # 'by' less than number of join columns @@ -2560,15 +2555,15 @@ test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) options(datatable.optimize = 0L) -test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)") +test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") options(datatable.optimize = 1L) -test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], output="(GForce FALSE)") +test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") options(datatable.optimize = 2L) -test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by="a,b",verbose=TRUE], +test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="GForce optimized.*gsum[(]v[)], gsum[(]w[)]") # v1.9.7 treats wrapped {} better, so this is now optimized options(datatable.optimize = Inf) test(866, names(ans1), c("a","b","name1","name2")) -test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by="a,b"]), c("a","b","name1","name2")) # list names extracted here +test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a","b","name1","name2")) # list names extracted here test(868, ans1, ans2) # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") @@ -2758,9 +2753,9 @@ DT = data.table(a=1:3,b=1:6) test(916, DT[,newcol:=logical(0),by=a], data.table(a=1:3,b=1:6,newcol=NA)) # roll join error when non last join column is factor, #2450 -X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key='state,uid,ts') -Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), 
state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key='state,uid,ts') -test(917.1, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key='state,uid,ts')) +X = data.table(id=2001:2004, uid=c(1001,1002,1001,1001), state=factor(c('CA','CA','CA','MA')), ts=c(51,52,53,54), key=c('state', 'uid', 'ts')) +Y = data.table(id=3001:3004, uid=c(1001,1003,1002,1001), state=factor(c('CA','CA','CA','CA')), ts=c(51,57,59,59), key=c('state', 'uid', 'ts')) +test(917.1, X[Y,roll=TRUE], data.table(id=INT(2001,2003,2002,NA), uid=c(1001,1001,1002,1003), state=factor('CA'), ts=c(51,59,59,57), i.id=INT(3001,3004,3003,3002), key=c('state', 'uid', 'ts'))) test(917.2, X[Y, on=c("id","state"), roll=TRUE], error="Attempting roll join on factor column when joining x.state to i.state") # NA in join column of type double, #2453. @@ -2805,7 +2800,7 @@ DT[,num:=1:.N] # to group each row by itself test(931, DT[,cbind(.SD,dup=1:rep),by="num"], data.table(num=INT(1,2,2,3:7,7,7),x=c(1,1,1,1,1,2,2,3,3,3),y=c(1,1,1,2,3,1,1,2,2,2),rep=INT(1,2,2,1,1,1,1,3,3,3), dup=INT(1,1,2,1,1,1,1,1,2,3))) # New roll=+/- and rollends -DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key="a,b") +DT = data.table(a=INT(1,3,4,4,4,4,7), b=INT(5,5,6,6,9,9,2), v=1:7, key=c('a', 'b')) test(932, DT[J(c(0,2,6,8)), roll=+Inf, rollends=TRUE, v], INT(1,1,6,7)) test(933, DT[J(c(0,2,6,8)), roll=-Inf, rollends=TRUE, v], INT(1,2,7,7)) test(934, DT[J(c(0,2,6,8)), roll=+Inf, v], INT(NA,1,6,7)) @@ -2936,7 +2931,7 @@ test(985.2, rbindlist(list(data.table(c("A","B")), data.table(factor(c("C",NA))) ## Allow unique/duplicated to accept custom colum combination to query for ## uniqueness -dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = "A,B") +dt <- data.table(A = rep(1:3, each=4), B = rep(11:14, each=3), C = rep(21:22, 6), key = c('A', 'B')) df <- as.data.frame(dt) test(986, 
unique(dt, by=key(dt)), dt[!duplicated(df[, key(dt)]),]) test(987, unique(dt, by='A'), dt[!duplicated(df[, 'A'])]) @@ -3588,11 +3583,11 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, test(1102.01, dcast(DT, time ~ variable, fun.aggregate=sum)[c(1,2,11,.N)], data.table(time=c(0,2,20,21),weight=c(2053,2461,9647,9841), key="time")) test(1102.02, dcast(DT, diet ~ variable, fun.aggregate=sum), data.table(diet=factor(1:4), weight=c(22582, 14714, 17154, 15961), key="diet")) test(1102.03, dcast(DT, diet+chick ~ time, drop=FALSE)[c(1,.N),c(1:4,13:14)], - ans<-data.table(diet=factor(c(1,4)), chick=ordered(c(18,48),levels=levels(DT$chick)), "0"=39, "2"=c(35,50), "20"=c(NA,303), "21"=c(NA,322), key="diet,chick")) + ans<-data.table(diet=factor(c(1,4)), chick=ordered(c(18,48),levels=levels(DT$chick)), "0"=39, "2"=c(35,50), "20"=c(NA,303), "21"=c(NA,322), key=c('diet', 'chick'))) test(1102.04, dcast(DT, diet+chick ~ time, drop=FALSE, fill=0)[c(1,.N),c(1:4,13:14)], ans[1, c("20","21"):=0]) # add test for 'subset=' in dcast test(1102.05, dcast(DT, time + chick ~ variable+diet, fun.aggregate=sum, subset=.(time> 20))[c(1,2,44,.N)], - data.table(time=21, chick=ordered(c(13,9,42,48), levels=levels(DT$chick)), weight_1=c(96,98,0,0), weight_2=0, weight_3=0, weight_4=c(0,0,281,322), key="time,chick")) + data.table(time=21, chick=ordered(c(13,9,42,48), levels=levels(DT$chick)), weight_1=c(96,98,0,0), weight_2=0, weight_3=0, weight_4=c(0,0,281,322), key=c('time', 'chick'))) # testing without aggregation set.seed(3) @@ -3644,7 +3639,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, v=factor(NA, levels=tail(letters,5)), x=factor(NA, levels=tail(letters,5)), y=factor(c(NA,"y",NA), levels=tail(letters,5)), - z=factor(NA, levels=tail(letters,5)), key="a1,a2,a3")) + z=factor(NA, levels=tail(letters,5)), key=c("a1", "a2", "a3"))) # dcast bug fix for 'subset' argument (it doesn't get key set before to run C-fcast): DT = 
data.table(x=c(1,1,1,2,2,2,1,1), y=c(1,2,3,1,2,1,1,2), z=c(1,2,3,NA,4,5,NA,NA)) @@ -3715,17 +3710,17 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, DT = data.table(x=sample(5,20,TRUE), y=sample(2,20,TRUE), z=sample(letters[1:2], 20,TRUE), d1 = runif(20), d2=1L) test(1102.31, dcast(DT, x + y ~ z, fun.aggregate=sum, value.var=c("d1","d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,4), .SDcols=c("d1_a","d1_b")][], - data.table(x=INT(1,5), y=INT(1,1), d1_a=c(0.0,0.4785), d1_b=c(0.8753,0.9804), d2_a=INT(0,1), d2_b=INT(1,3), key="x,y")) + data.table(x=INT(1,5), y=INT(1,1), d1_a=c(0.0,0.4785), d1_b=c(0.8753,0.9804), d2_a=INT(0,1), d2_b=INT(1,3), key=c('x', 'y'))) # multiple fun.agg test(1102.32, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var="d1")[c(1,.N)][, 3:6:=lapply(.SD,round,3), .SDcols=3:6][], - data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327), key="x,y")) + data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327), key=c('x', 'y'))) # multiple fun.agg and value.var (all combinations) test(1102.33, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=c("d1", "d2"))[c(1,.N)][, c(3,4,7:10):=lapply(.SD,round,3), .SDcols=c(3,4,7:10)][], data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_sum_a=INT(0,1),d2_sum_b=INT(1,3), - d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key="x,y")) + d1_mean_a=c(NaN,0.479),d1_mean_b=c(0.875,0.327),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key=c('x', 'y'))) # multiple fun.agg and value.var (one-to-one) test(1102.34, dcast(DT, x + y ~ z, fun.aggregate=list(sum, mean), value.var=list("d1", "d2"))[c(1,.N)][, 3:4:=lapply(.SD,round,3), .SDcols=3:4][], - data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), 
key="x,y")) + data.table(x=INT(1,5), y=INT(1,1), d1_sum_a=c(0.0,0.479), d1_sum_b=c(0.875,0.980),d2_mean_a=c(NaN,1),d2_mean_b=c(1,1), key=c('x', 'y'))) # Additional test after fixing fun.agg creation - using the example here: https://github.com/Rdatatable/data.table/issues/716 DT = data.table(x=1:5, y=paste("v", 1:5, sep=""), v1=6:10, v2=11:15, k1=letters[1:5], k2=letters[6:10]) @@ -4450,7 +4445,7 @@ test(1220, set(DT,j=2:3,value=newVals), data.table(a=1:3,b=16:18,c=19:21)) # Test non-join key columns used in j work again (spotted straight away by Michele on datatable-help when v1.9.2 was released). # Introduced at commit 1030. Very extensive new tests 1136* still all pass (great stuff Arun). -DT = data.table(a=1:2,b=letters[1:6],key="a,b") +DT = data.table(a=1:2,b=letters[1:6],key=c('a', 'b')) test(1221, DT[.(1),b], c("a","c","e")) ########################################################################################### @@ -5258,7 +5253,7 @@ test(1305.13, setDF(dt, rownames=rep("a",5)), error='rownames contains duplicate # .SD retains as much of head(key) as appropriate. 
# by= always keeps data appearance order, so it's which columns are grouped and selected that drive how much of key is retained -DT = data.table(a=1:3,b=1:6,c=1:6,key="a,b") +DT = data.table(a=1:3,b=1:6,c=1:6,key=c('a', 'b')) test(1306, DT[1:2,key(.SD)], c("a","b")) test(1307, DT[2:1,key(.SD)], NULL) test(1308, DT[,key(.SD),by=a], data.table(a=integer())) @@ -5327,9 +5322,9 @@ test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c" # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = -c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = "x,y") -dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = "x,y") -test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key="x,y")) +c("x", "y", "z"), class = "data.frame", row.names = c(NA, -2L)), key = c('x', 'y')) +dt2 <- data.table(structure(list(x = c(7L, 7L, 33L, 33L, 33L, 33L), y = structure(c(15884, 15917, 15884, 15884, 15917, 15917), class = "Date"), w = c(-0.118303, 0.141225, -0.03137, -0.02533, 0.045967, 0.043694)), .Names = c("x", "y", "w"), class = "data.frame", row.names = c(NA, -6L)), key = c('x', 'y')) +test(1317.1, dt1[dt2, roll=TRUE, nomatch=0L], data.table(x=c(7L,33L,33L), y=as.Date(c("2013-07-31", "2013-07-31", "2013-07-31")), z=c(dt1$z[1:2], dt1$z[2]), w=c(dt2$w[2], dt2$w[5:6]), key=c('x', 'y'))) # also test where 'i' is not sorted. 
set.seed(1L) @@ -7383,10 +7378,10 @@ x = c(1, 2, 1) y = c(5, 8, 8, 4) w = c(10, 12, 12, 13) # already sorted but has dups; more efficient case to cover # tests 1525.1, 1525.2 tested the now-ineffectual datatable.CJ.names option. -ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key="V1,z") +ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key=c('V1', 'z')) test(1525.3, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) test(1525.4, CJ(x, z=y, unique=TRUE), setnames(copy(ans),c("x","z"))) -test(1525.5, CJ(x, w, unique=TRUE), data.table(x=(rep(c(1,2), each=3)), w=c(10,12,13), key="x,w")) +test(1525.5, CJ(x, w, unique=TRUE), data.table(x=(rep(c(1,2), each=3)), w=c(10,12,13), key=c('x', 'w'))) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) @@ -7706,7 +7701,7 @@ setkey(x1, a1, a2) test(1544.1, setDF(merge(x1, y)), merge(as.data.frame(x1), as.data.frame(y))) test(1544.2, setDF(merge(x1, y, by="a2")), merge(as.data.frame(x1), as.data.frame(y), by="a2")) # also test shallow here so as to catch future regressions -x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key="a1,a2") +x1 <- data.table(a1 = c('a', 'b', 'c'), a2 = c(1L, 3L, 2L), a3 = c(TRUE, FALSE, TRUE), key=c('a1', 'a2')) test(1545.01, key(.shallow(x1, cols="a2")), NULL) test(1545.02, key(.shallow(x1, retain.key=FALSE)), NULL) test(1545.03, key(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) @@ -9115,7 +9110,7 @@ test(1630.09, copy(dt1)[id>5, z:=2L, nomatch=0L], copy(dt1)[ test(1630.10, copy(dt1)[id>5, z:=2L, nomatch=NA], copy(dt1)[,z:=NA_integer_], warning="ignoring nomatch") # fix for #1268, on= retains keys correctly. 
-A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key="site,date") +A = data.table(site=rep(c("A","B"), each=3), date=rep(1:3, times=2), x=rep(1:3*10, times=2), key=c('site', 'date')) B = data.table(x=c(10,20), y=c(100,200), key="x") test(1631, key(A[B, on="x"]), NULL) @@ -13135,8 +13130,8 @@ setindex(DT, NULL) test(1942.06, indices(DT), NULL) setindex(DT,id1,id2) test(1942.07, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using uniqlist on index 'id1__id2'") -test(1942.08, DT[,sum(v),keyby=.(id1,id2),verbose=TRUE], data.table(id1=c("A","C","C","D"), id2=INT(9,2,3,3), V1=INT(6,3,5,1), key="id1,id2"), output="Finding groups using uniqlist on index 'id1__id2'") -test(1942.09, DT[,sum(v),keyby=.(id2,id1),verbose=TRUE], data.table(id2=INT(2,3,3,9), id1=c("C","C","D","A"), V1=INT(3,5,1,6), key="id2,id1"), output="Finding groups using forderv") +test(1942.08, DT[,sum(v),keyby=.(id1,id2),verbose=TRUE], data.table(id1=c("A","C","C","D"), id2=INT(9,2,3,3), V1=INT(6,3,5,1), key=c('id1', 'id2')), output="Finding groups using uniqlist on index 'id1__id2'") +test(1942.09, DT[,sum(v),keyby=.(id2,id1),verbose=TRUE], data.table(id2=INT(2,3,3,9), id1=c("C","C","D","A"), V1=INT(3,5,1,6), key=c('id2', 'id1')), output="Finding groups using forderv") options(datatable.use.index=FALSE) test(1942.10, DT[,sum(v),keyby=id1,verbose=TRUE], data.table(id1=c("D","A","C"), V1=INT(1,6,8), key="id1"), output="Finding groups using forderv") options(datatable.use.index=TRUE) @@ -13146,7 +13141,7 @@ set.seed(2) DT = data.table(real=sample((1:1500)/1000, 10000, replace=TRUE), id=sample(letters, 1000, replace=TRUE), value=1:10000) setkey(DT,id,real) test(1942.11, DT[, .(list(value)), keyby=.(id,real), verbose=TRUE][c(1,6,8744,.N)], - data.table(id=c("a","a","z","z"), real=c(0.004,0.037,1.486,1.497), V1=list(9441L, c(3375L,5983L), c(4901L,5260L,7668L), 4181L), key="id,real"), + 
data.table(id=c("a","a","z","z"), real=c(0.004,0.037,1.486,1.497), V1=list(9441L, c(3375L,5983L), c(4901L,5260L,7668L), 4181L), key=c('id', 'real')), output="Finding groups using uniqlist on key") setindex(DT,real) test(1942.12, DT[, sum(value), keyby=real, verbose=TRUE][c(1,500,1498,.N)], data.table(real=c(0.001, 0.501, 1.499, 1.5), V1=INT(31036,37564,14792,38606), key="real"), @@ -13168,8 +13163,8 @@ DT2 <- data.table( test(1943.1, (ans<-DT1[DT2])[,1:4], DT1) # ok before test(1943.2, DT1[DT2, on=c("id","date","period")], ans) # ok before test(1943.3, DT1[DT2, on=c("id","date","period","year")], ans[,1:4]) # no warning (longer object length is not a multiple) -DT1 = data.table(id=c("A","A","A"), date=1:3, val=7:9, key="id,date") -DT2 = data.table(id=c("A","A","A"), date=1:3, date2=3:1, key="id,date") +DT1 = data.table(id=c("A","A","A"), date=1:3, val=7:9, key=c('id', 'date')) +DT2 = data.table(id=c("A","A","A"), date=1:3, date2=3:1, key=c('id', 'date')) test(1943.4, DT1[DT2, on=c("id",date="date2")], data.table(id="A", date=3:1, val=9:7, i.date=1:3)) # was invalidly keyed by id,date in 1.11.6 @@ -15645,7 +15640,7 @@ test(2069.28, data.table(c='1', d=2)[ , c(a='b'), by=c, verbose=TRUE], output='j test(2069.29, data.table(c = '1', d = 2)[ , .(a = c(nm='b')), by = c, verbose = TRUE], output = 'Column 1 of j is a named vector') DT <- data.table(a = rep(1:3, each = 4), b = LETTERS[1:4], z = 0:3 + (4:1)*1i) test(2069.30, DT[, .SD[3,], by=b], DT[9:12, .(b, a, z)]) -DT = data.table(x=1:4,y=1:2,lgl=TRUE,key="x,y") +DT = data.table(x=1:4,y=1:2,lgl=TRUE,key=c('x', 'y')) test(2069.31, DT[CJ(1:4,1:4), any(lgl), by=.EACHI]$V1, c(TRUE, NA, NA, NA, NA, TRUE, NA, NA, TRUE, NA, NA, NA, NA, TRUE, NA, NA)) set.seed(45L) @@ -15829,7 +15824,7 @@ test(2074.23, capture.output(print(DT2, topn=1L, col.names='none')), c(" 1: 1", " --- ", "101: 101")) # foverlaps -x = data.table(start=NA_integer_, end=1L, key='start,end') +x = data.table(start=NA_integer_, end=1L, key=c('start', 'end')) y = 
copy(x) test(2074.24, foverlaps(x, y), error="NA values in data.table x 'start' column") x[ , start := 0L] diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd index 928e732bc..be7620890 100644 --- a/man/IDateTime.Rd +++ b/man/IDateTime.Rd @@ -241,7 +241,7 @@ identical(as.ITime("10:45"), methods::as("10:45", "ITime")) as.POSIXct("2001-01-01") + as.ITime("10:45") datetime <- seq(as.POSIXct("2001-01-01"), as.POSIXct("2001-01-03"), by = "5 hour") -(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = "a,idate,itime")) +(af <- data.table(IDateTime(datetime), a = rep(1:2, 5), key = c("a", "idate", "itime"))) af[, mean(a), by = "itime"] af[, mean(a), by = list(hour = hour(itime))] diff --git a/man/data.table.Rd b/man/data.table.Rd index d53ac6f0e..680e25574 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -44,7 +44,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}.} - \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}.} + \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}.} \item{stringsAsFactors}{Logical (default is \code{FALSE}). Convert all \code{character} columns to \code{factor}s?} diff --git a/man/duplicated.Rd b/man/duplicated.Rd index daf7c39d5..e17d8df0c 100644 --- a/man/duplicated.Rd +++ b/man/duplicated.Rd @@ -88,7 +88,7 @@ If none exists, 0L is returned. 
} \examples{ DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), - C = rep(1:2, 6), key = "A,B") + C = rep(1:2, 6), key = c("A", "B")) duplicated(DT) unique(DT) @@ -113,7 +113,7 @@ identical(unique(DT),DT[10]) # FALSE # fromLast=TRUE DT <- data.table(A = rep(1:3, each=4), B = rep(1:4, each=3), - C = rep(1:2, 6), key = "A,B") + C = rep(1:2, 6), key = c("A", "B")) duplicated(DT, by="B", fromLast=TRUE) unique(DT, by="B", fromLast=TRUE) diff --git a/man/fread.Rd b/man/fread.Rd index 07b39e600..d397a441d 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -55,7 +55,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} - \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } + \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. 
Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. } \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. The default for this argument can be changed with \code{options(datatable.fread.datatable=FALSE)}.} diff --git a/man/merge.Rd b/man/merge.Rd index d8246668c..d374da076 100644 --- a/man/merge.Rd +++ b/man/merge.Rd @@ -87,16 +87,16 @@ merge(dt1, dt2, all = TRUE) (dt2 <- data.table(A = letters[rep(2:4, 2)], Y = 6:1, key = "A")) merge(dt1, dt2, allow.cartesian=TRUE) -(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = "A,B")) -(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = "A,B")) +(dt1 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(1:3, 2)], X = 1:6, key = c("A", "B"))) +(dt2 <- data.table(A = c(rep(1L, 5), 2L), B = letters[rep(2:4, 2)], Y = 6:1, key = c("A", "B"))) merge(dt1, dt2) merge(dt1, dt2, by="B", allow.cartesian=TRUE) # test it more: -d1 <- data.table(a=rep(1:2,each=3), b=1:6, key="a,b") +d1 <- data.table(a=rep(1:2,each=3), b=1:6, key=c("a", "b")) d2 <- data.table(a=0:1, bb=10:11, key="a") d3 <- data.table(a=0:1, key="a") -d4 <- data.table(a=0:1, b=0:1, key="a,b") +d4 <- data.table(a=0:1, b=0:1, key=c("a", "b")) merge(d1, d2) merge(d2, d1) diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index bda7a9b78..a39c8c446 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -83,7 +83,7 @@ print(DT, row.names = FALSE) #`print.keys` can alert which columns are currently keys - DT <- data.table(a=1:3, b=4:6, c=7:9, key="b,a") + DT <- data.table(a=1:3, b=4:6, c=7:9, key=c("b", "a")) setindexv(DT, c("a", "b")) setindexv(DT, "a") 
print(DT, print.keys=TRUE) diff --git a/man/setDT.Rd b/man/setDT.Rd index c00ba0f46..9311d0e3b 100644 --- a/man/setDT.Rd +++ b/man/setDT.Rd @@ -13,7 +13,7 @@ setDT(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) \arguments{ \item{x}{ A named or unnamed \code{list}, \code{data.frame} or \code{data.table}. } \item{keep.rownames}{ For \code{data.frame}s, \code{TRUE} retains the \code{data.frame}'s row names under a new column \code{rn}. \code{keep.rownames = "id"} names the column \code{"id"} instead. } - \item{key}{Character vector of one or more column names which is passed to \code{\link{setkeyv}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. } + \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. } \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}. } } From a0d7ac426738d2aacef3f135031f3d5312edcf0f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 17:36:52 -0700 Subject: [PATCH 3/7] rollforward: add options supporting old style (on by default) to avoid hard deprecation --- NEWS.md | 6 ++---- R/data.table.R | 8 +++++--- R/fread.R | 3 ++- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3d5b2f81c..9965c62f5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,10 +2,6 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) -## BREAKING CHANGES - -1. Usage of comma-separated character strings representing multiple columns in `data.table()`'s `key=` argument and `[`'s `by=`/`keyby=` arguments is deprecated, [#4357](https://github.com/Rdatatable/data.table/issues/4357). While sometimes convenient, ultimately it introduces inconsistency in implementation that is not worth the benefit to maintain. NB: this hard deprecation is temporary in the development version. 
Before release, it will soften into the normal data.table deprecation cycle starting from introducing the new behavior with an option, then changing the default for the option with a warning, then upgrading the warning to an error before finally removing the option and the error. - ## NEW FEATURES 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. @@ -76,6 +72,8 @@ 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. +12. Usage of comma-separated character strings representing multiple columns in `data.table()` and `fread()`'s `key=` argument and `[`'s `by=`/`keyby=` arguments is slated for deprecation, [#4357](https://github.com/Rdatatable/data.table/issues/4357). While sometimes convenient, ultimately it introduces inconsistency in implementation that is not worth the benefit to maintain. This release comes with two new options, `datatable.key.split.comma` and `datatable.by.split.comma`, both defaulting in this release to `TRUE`, corresponding to the arguments `key=` (in `data.table()` and `fread()`) and `by=` and `keyby=` (in `[.data.table`), respectively. You can start future-proofing your usage by either changing to supply a vector (e.g. `c("a", "b")` instead of `"a,b"`) or by setting these options to `FALSE`; otherwise, the next release will start warning about the pending behavior change, and the release after that will start erroring. 
+ # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/R/data.table.R b/R/data.table.R index 9181f31c5..56eafecd5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -62,8 +62,9 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str if (!is.null(key)) { if (!is.character(key)) stopf("key argument of data.table() must be character") if (length(key)==1L) { - key = strsplit(key,split=",")[[1L]] - # eg key="A,B"; a syntax only useful in key argument to data.table(), really. + keySplit = strsplit(key, ",", fixed=TRUE)[[1L]] + # eg key="A,B"; marked for deprecation in 1.16.0, mid-2024 + if (isTRUE(getOption("datatable.key.split.comma", default=TRUE))) key = keySplit } setkeyv(ans,key) } else { @@ -807,7 +808,8 @@ replace_dot_alias = function(e) { if (mode(bysub) == "character") { if (any(grepl(",", bysub, fixed = TRUE))) { if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. 
See ?data.table for other possibilities.", length(bysub)) - bysub = strsplit(bysub, split=",", fixed=TRUE)[[1L]] + bySplit = strsplit(bysub, ",", fixed=TRUE)[[1L]] + if (isTRUE(getOption("datatable.by.split.comma", default=TRUE))) bysub = bySplit } bysub = gsub("^`(.*)`$", "\\1", bysub) # see test 138 nzidx = nzchar(bysub) diff --git a/R/fread.R b/R/fread.R index a8a26aa4b..d6387cbe1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -341,7 +341,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!is.character(key)) stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") if (length(key) == 1L) { - key = strsplit(key, split = ",", fixed = TRUE)[[1L]] + keySplit = strsplit(key, ",", fixed=TRUE)[[1L]] + if (isTRUE(getOption("datatable.key.split.comma", default=TRUE))) key = keySplit } setkeyv(ans, key) } From 7b6004d3985b038fb418c6028c2773b98fd51cae Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 21:53:08 -0700 Subject: [PATCH 4/7] tests of new options --- inst/tests/tests.Rraw | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 977b29b5c..4597874bd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18464,3 +18464,11 @@ test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), outpu # helpful error about deleting during grouping, #1873 DT = data.table(id = c(1, 1, 2, 2), a = 1:4, b = 5:8) test(2257, DT[ , c("c", "a") := .(a + 1, NULL), by=id], error="it's not possible to delete parts of a column") + +# new options on the road to deprecating "a,b" splitting in by= and key= in certain places, #4357 +test(2258.1, options=c(datatable.key.split.comma=FALSE), data.table(a=1L, b=2L, key="a,b"), error="some columns are not in the data.table") # tries to find one column named 'a,b' +test(2258.2, options=c(datatable.by.split.comma=FALSE), data.table(a=1L, b=2L, key="a,b"), ans<-data.table(a=1L, b=2L, 
key=c("a", "b"))) # no interference from 2nd option +test(2258.3, options=c(datatable.key.split.comma=FALSE), fread('a,b\n1,2\n', key="a,b"), error="some columns are not in the data.table") +test(2258.4, options=c(datatable.by.split.comma=FALSE), fread('a,b\n1,2\n', key="a,b"), ans) +test(2258.5, options=c(datatable.by.split.comma=FALSE), ans[, .N, by="a,b"], error="object 'a,b' not found") +test(2258.6, options=c(datatable.key.split.comma=FALSE), ans[, .N, by="a,b"], ans[, .N, by=c("a", "b")]) From 8851986cf9477b5ce2eda85f37ae4c3be8f75fb5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Apr 2024 21:54:45 -0700 Subject: [PATCH 5/7] restore original tests of maintained behavior, for now --- inst/tests/tests.Rraw | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4597874bd..911786662 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1691,7 +1691,8 @@ test(540, DT[,sum(v),by=eval(a)], data.table(a=1:0,V1=c(11L,10L))) test(541, DT[,sum(v),keyby=eval(a)], data.table(a=0:1,V1=c(10L,11L),key="a")) test(542, DT[,sum(v),keyby=c("a","b","c")]$V1, INT(1,3,4,6,5,2)) -# tests 543,544 were of deprecated behavior to allow comma-separated entries to keyby +test(543, DT[,sum(v),keyby="a,b,c"]$V1, INT(1,3,4,6,5,2)) +test(544, DT[,sum(v),keyby=c("a","b,c")], error="but one or more items include a comma") # Test single expressions passed to by, FR#1743 in v1.8.0 DT = data.table(a=1:4,date=as.IDate("2012-02-28")+0:3,v=5:8) @@ -1758,7 +1759,11 @@ test(569, DT[,list(.N=.N),list(a,b)][,.N,a], error="The column '.N' can't be gro test(570, DT[,list(.N=.N),list(a,b)][,unique(.N),a], error="The column '.N' can't be grouped because") test(570.1, DT[,list(.I=.I),list(a,b)][,.I,a], error="The column '.I' can't be grouped because") -# tests 571-573 were of deprecated behavior to allow comma-separated entries in by= +# Test spaces in by="..." 
format, datatable-help on 31 March +DT = data.table("a "=1:2, "b"=3:4," b"=5:6, v=1:6) +test(571, DT[,sum(v),by="b, b"], data.table("b"=3:4, " b"=5:6, V1=c(9L,12L))) +test(572, DT[,sum(v),by="a , b"], data.table("a "=1:2, " b"=5:6, V1=c(9L,12L))) +test(573, DT[,sum(v),by="b, a"], error=base_messages$missing_object(" a")) # Test base::unname, used by melt, and only supported by data.table for DF compatibility for non-dtaware packages DT = data.table(a=1:3, b=4:6) From 07119e662976cb4fc1464c6baa138c263f587df9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 10 Jul 2024 18:55:26 +0000 Subject: [PATCH 6/7] ? --- inst/tests/tests.Rraw | 153 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 185d9dcc4..75bf81400 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18561,3 +18561,156 @@ test(2258.8, capture.output(print(DT, na.print=".", topn=2, col.names="none")), # table requires splitting, col.names!="none" test(2258.9, capture.output(print(DT, na.print=".", topn=2)), c(" x", " 1: .", " 2: e", "--- ", " 5: w", " 6: .")) +# split(by = ., sep = ..) 
works like split(f= ., sep = ..), #5417 +x = data.table(rep(1:2, each=5L), 1:5, 1:10) +test(2259.1, names(split(x, by = c("V1", "V2"), sep = "|")), sort(names(split(x, list(x$V1, x$V2), sep = "|")))) +test(2259.2, names(split(x, by = c("V1", "V2"), sep = "||")), sort(names(split(x, list(x$V1, x$V2), sep = "||")))) + +# custom signaling functions +## basics: default signals with/without formats +test(2260.01, tryCatch(stopf("%s", "abc"), error=function(x) conditionMessage(x)), "abc") +test(2260.02, tryCatch(stopf("abc"), error=function(x) conditionMessage(x)), "abc") +test(2260.03, tryCatch(warningf("%s", "abc"), warning=function(x) conditionMessage(x)), "abc") +test(2260.04, tryCatch(warningf("abc"), warning=function(x) conditionMessage(x)), "abc") +test(2260.05, tryCatch(messagef("%s", "abc"), message=function(x) conditionMessage(x)), "abc\n") +test(2260.06, tryCatch(messagef("abc"), message=function(x) conditionMessage(x)), "abc\n") +test(2260.07, tryCatch(messagef("abc", appendLF=FALSE), message=function(x) conditionMessage(x)), "abc") +test(2260.08, tryCatch(packageStartupMessagef("%s", "abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") +test(2260.09, tryCatch(packageStartupMessagef("abc"), packageStartupMessage=function(x) conditionMessage(x)), "abc\n") +test(2260.10, tryCatch(packageStartupMessagef("abc", appendLF=FALSE), packageStartupMessage=function(x) conditionMessage(x)), "abc") + +## custom signal classes +test(2260.11, inherits(tryCatch(stopf("x", class="test_error"), condition=identity), "test_error")) +test(2260.12, inherits(tryCatch(stopf("x", class="test_error"), condition=identity), "error")) +test(2260.13, inherits(tryCatch(warningf("x", class="test_warning"), condition=identity), "test_warning")) +test(2260.14, inherits(tryCatch(warningf("x", class="test_warning"), condition=identity), "warning")) +test(2260.15, inherits(tryCatch(messagef("x", class="test_message"), condition=identity), "test_message")) +test(2260.16, 
inherits(tryCatch(messagef("x", class="test_message"), condition=identity), "message")) +test(2260.17, inherits(tryCatch(packageStartupMessagef("x", class="test_psm"), condition=identity), "test_psm")) +test(2260.18, inherits(tryCatch(packageStartupMessagef("x", class="test_psm"), condition=identity), "packageStartupMessage")) + +# tests for setNumericRounding() returning original value before changing, #6112 +old = setNumericRounding(0L) +test(2261.01, getNumericRounding(), 0L) +test(2261.02, setNumericRounding(1L), 0L) +test(2261.03, getNumericRounding(), 1L) +test(2261.04, setNumericRounding(2L), 1L) +# This test ensures that the function returns invisibly. Note we can't use notOutput here as it calls print, which ignores whether +# or not an object is an invisible copy or not, and prints it anyways. +test(2261.05, capture.output(setNumericRounding(2L)), character(0)) +setNumericRounding(old) + +# Add list column to null data.table #5738 +x = list("a", 1) +dt1 = data.table(a = x) +dt2 = data.table(a = "a", b=1) +dt3 = data.table(a=1:2, b=3:4) +test(2262.1, null.data.table()[, a := x], dt1) +test(2262.2, set(null.data.table(), j="a", value=x), dt1) +test(2262.3, null.data.table()[, c("a","b") := x], dt2) +test(2262.4, set(null.data.table(), j=c("a","b"), value=x), dt2) +test(2262.5, null.data.table()[, c("a","b") := list(1:2, 3:4)], dt3) +test(2262.6, set(null.data.table(), j=c("a","b"), value=list(1:2, 3:4)), dt3) +test(2262.7, data.table(a=1, b=2)[, c("a", "b") := list(NULL, NULL)], null.data.table()) +test(2262.8, data.table(a=1, b=2)[, c("a", "b") := list(NULL)], null.data.table()) + +# GForce retains attributes in by arguments #5567 +dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) +test(2263.1, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce FALSE") +test(2263.2, 
options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce FALSE") +test(2263.3, options=list(datatable.verbose=TRUE, datatable.optimize=0L), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce FALSE") +test(2263.4, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce optimized j to") +test(2263.5, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce optimized j to") +test(2263.6, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce optimized j to") + +# tests for printing indices alongside data.tables +NN = 200 +set.seed(2024) +DT = data.table( + grp1 = sample(100, NN, TRUE), + grp2 = sample(90, NN, TRUE), + grp3 = sample(80, NN, TRUE)) +setkey(DT, grp1, grp2) +setindex(DT, grp1, grp3) +ans = c( + " grp1 grp2 grp3 index:grp1__grp3", + " 1: 1 5 15 1", + " 2: 1 24 60 2", + " 3: 2 26 32 5", + " 4: 2 36 57 3", + " 5: 2 51 30 4", + " --- ", + "196: 98 77 45 195", + "197: 98 87 70 197", + "198: 100 18 21 198", + "199: 100 36 51 199", + "200: 100 38 56 200") +# test printing with 1 index column, no markers for order +test(2264.1, print(DT, show.indices=TRUE), output=ans) +# test that options work as well +test(2264.2, options=list(datatable.show.indices=TRUE), print(DT), output=ans) +setindex(DT, grp3, grp1) +ans = c( + " grp1 grp2 grp3 index1:grp1__grp3 index2:grp3__grp1", + " 1: 1 5 15 1 10", + " 2: 1 24 60 2 119", + " 3: 2 26 32 5 164", + " 4: 2 36 57 3 192", + " 5: 2 51 30 4 63", + " --- ", + "196: 98 77 45 195 11", + "197: 98 87 70 197 66", + "198: 100 18 21 198 31", + "199: 100 36 51 199 139", + "200: 100 38 56 200 159") +# test for two indices, with markers to show order +test(2264.3, print(DT, show.indices=TRUE), output=ans) +test(2264.4, 
options=list(datatable.show.indices=TRUE), print(DT), output=ans) +setindex(DT, NULL) # clear indices +# if no indices are set, simply ignore +test(2264.5, print(DT, show.indices=TRUE), notOutput="index:grp1__grp3") +test(2264.6, options=list(datatable.show.indices=TRUE), print(DT, show.indices=TRUE), notOutput="index:grp1__grp3") +setindex(DT, grp3) +ans = c( + " grp1 grp2 grp3 index:grp3", + " 1: 1 5 15 10", + " 2: 1 24 60 119", + " 3: 2 26 32 164", + " 4: 2 36 57 192", + " 5: 2 51 30 63", + " --- ", + "196: 98 77 45 11", + "197: 98 87 70 66", + "198: 100 18 21 31", + "199: 100 36 51 139", + "200: 100 38 56 159") +test(2264.7, print(DT, show.indices=TRUE), output=ans) +NN = 10 +DT = data.table( + grp1 = sample(100, NN, TRUE), + grp2 = sample(90, NN, TRUE), + grp3 = sample(80, NN, TRUE)) +setindex(DT, grp1, grp3) +setindex(DT, grp3, grp1) +ans = c( + " grp1 grp2 grp3 index1:grp1__grp3 index2:grp3__grp1", + " 1: 77 61 53 3 5", + " 2: 80 66 37 8 4", + " 3: 27 42 8 5 3", + " 4: 66 37 7 4 7", + " 5: 38 69 5 6 2", + " 6: 72 89 69 1 10", + " 7: 86 52 16 2 1", + " 8: 28 35 62 10 8", + " 9: 95 82 80 7 6", + "10: 83 64 41 9 9") +# test where topn isn't necessary +test(2264.8, print(DT, show.indices=TRUE), output=ans) + +# integer64 columns print even when bit64 isn't loaded +if (test_bit64) local({ + DT = data.table(a = 'abc', b = as.integer64(1)) + unloadNamespace("bit64") + on.exit(library(bit64)) + test(2265, DT, output="abc\\s*1$") +}) From 636e91b178d02a6a4e3c4e701a3a64a5c0410e89 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 10 Jul 2024 19:11:06 +0000 Subject: [PATCH 7/7] Restore original code, with a helper. 
--- R/data.table.R | 11 +++-------- R/fread.R | 6 ++---- R/utils.R | 4 ++++ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index ea9e13790..b9154f440 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -53,11 +53,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names, .named=nd$.named) # see comments inside as.data.table.list re copies if (!is.null(key)) { if (!is.character(key)) stopf("key argument of data.table() must be character") - if (length(key)==1L) { - keySplit = strsplit(key, ",", fixed=TRUE)[[1L]] - # eg key="A,B"; marked for deprecation in 1.16.0, mid-2024 - if (isTRUE(getOption("datatable.key.split.comma", default=TRUE))) key = keySplit - } + if (length(key)==1L) key = cols_from_csv(key) setkeyv(ans,key) } else { # retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table(). @@ -799,9 +795,8 @@ replace_dot_alias = function(e) { if (mode(bysub) == "character") { if (any(grepl(",", bysub, fixed = TRUE))) { - if (length(bysub)>1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. See ?data.table for other possibilities.", length(bysub)) - bySplit = strsplit(bysub, ",", fixed=TRUE)[[1L]] - if (isTRUE(getOption("datatable.by.split.comma", default=TRUE))) bysub = bySplit + if (length(bysub) > 1L) stopf("'by' is a character vector length %d but one or more items include a comma. Either pass a vector of column names (which can contain spaces, but no commas), or pass a vector length 1 containing comma separated column names. 
See ?data.table for other possibilities.", length(bysub)) + bysub = cols_from_csv(bysub) } bysub = gsub("^`(.*)`$", "\\1", bysub) # see test 138 nzidx = nzchar(bysub) diff --git a/R/fread.R b/R/fread.R index f07063f3e..3dc4468c6 100644 --- a/R/fread.R +++ b/R/fread.R @@ -340,10 +340,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (!is.null(key) && data.table) { if (!is.character(key)) stopf("key argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)") - if (length(key) == 1L) { - keySplit = strsplit(key, ",", fixed=TRUE)[[1L]] - if (isTRUE(getOption("datatable.key.split.comma", default=TRUE))) key = keySplit - } + if (length(key) == 1L) + key = cols_from_csv(key) setkeyv(ans, key) } if (yaml) setattr(ans, 'yaml_metadata', yaml_header) diff --git a/R/utils.R b/R/utils.R index 7d4128a1d..fa3fd5ad4 100644 --- a/R/utils.R +++ b/R/utils.R @@ -114,6 +114,10 @@ brackify = function(x, quote=FALSE) { sprintf('[%s]', toString(x)) } +# convenience for specifying columns in some cases, e.g. by= and key= +# caller should ensure length(x) == 1 & handle accordingly. +cols_from_csv = function(x) strsplit(x, ',', fixed=TRUE)[[1L]] + # patterns done via NSE in melt.data.table and .SDcols in `[.data.table` # was called do_patterns() before PR#4731 eval_with_cols = function(orig_call, all_cols) {