From b3c24a67e236e11a206d3759704fec8f258acf7f Mon Sep 17 00:00:00 2001 From: Nitish Jha <151559388+Nj221102@users.noreply.github.com> Date: Tue, 30 Jul 2024 01:10:37 +0530 Subject: [PATCH 01/21] updated documentation regarding behavior of `rbindlist` when applied to `difftime` objects with different units (#6309) * updated documentation regarding behavior of rbind when applied to difftime objects with different units * Update rbindlist.Rd * fix mistakes * add dot --------- Co-authored-by: nitish jha Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> --- man/rbindlist.Rd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 17c5c2205..f3b7e6845 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -25,6 +25,8 @@ Columns with duplicate names are bound in the order of occurrence, similar to ba If column \code{i} does not have the same type in each of the list items; e.g, the column is \code{integer} in item 1 while others are \code{numeric}, they are coerced to the highest type. If a column contains factors then a factor is created. If any of the factors are also ordered factors then the longest set of ordered levels are found (the first if this is tied). Then the ordered levels from each list item are checked to be an ordered subset of these longest levels. If any ambiguities are found (e.g. \code{blue Date: Mon, 29 Jul 2024 12:49:26 -0700 Subject: [PATCH 02/21] adding an atime test case; groupby with dogroups (R expression) #PR4558 (#6288) Co-authored-by: Toby Dylan Hocking --- .ci/atime/tests.R | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index ecd5d33d5..f962b6672 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -121,6 +121,24 @@ test.list <- atime::atime_test_list( Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) + # Issue reported in: https://github.com/Rdatatable/data.table/issues/4200 + # To be fixed in: https://github.com/Rdatatable/data.table/pull/4558 + "DT[by] fixed in #4558" = atime::atime_test( + N = 10^seq(1, 20), + setup = { + d <- data.table( + id3 = sample(c(seq.int(N*0.9), sample( N*0.9, N*0.1, TRUE))), + v1 = sample(5L, N, TRUE), + v2 = sample(5L, N, TRUE) + ) + }, + expr = { + expr=data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id3) + }, + Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression + Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression + Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression + # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498 # Fixed in: https://github.com/Rdatatable/data.table/pull/4501 "DT[,.SD] improved in #4501" = atime::atime_test( @@ -136,8 +154,7 @@ test.list <- atime::atime_test_list( }, Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue - Slower = "cacdc92df71b777369a217b6c902c687cf35a70d" # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue - ), + Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue NULL) # nolint end: undesirable_operator_linter. From 4704c82c1cd3f929faeac46774108f33b89c1198 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Mon, 29 Jul 2024 13:02:45 -0700 Subject: [PATCH 03/21] Progress bar/indicator for "by" operations (#6228) Co-authored-by: Michael Chirico Co-authored-by: Toby Dylan Hocking --- NEWS.md | 2 ++ R/data.table.R | 5 +++-- man/data.table.Rd | 5 ++++- src/data.table.h | 2 +- src/dogroups.c | 21 +++++++++++++++++++-- 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0ba12fc4b..e3f7c9f4c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -50,6 +50,8 @@ 16. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks @sindribaldur for the feature request, which has been highly requested, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. +17. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index 5b23169c4..fa07593e9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -127,7 +127,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) +"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive())) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) @@ -224,6 +224,7 @@ replace_dot_alias = function(e) { if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") + if (!isTRUEorFALSE(showProgress)) stopf("%s must be TRUE or FALSE", "showProgress") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE rightcols = leftcols = integer() @@ -1901,7 +1902,7 @@ replace_dot_alias = function(e) { } ans = c(g, ans) } else { - ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) + ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, showProgress) } # unlock any locked data.table components of the answer, #4159 # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 diff --git a/man/data.table.Rd b/man/data.table.Rd index 680e25574..729c0861c 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -35,7 +35,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac .SDcols, verbose = getOption("datatable.verbose"), # default: FALSE allow.cartesian = getOption("datatable.allow.cartesian"), # default: FALSE - drop = NULL, on = NULL, env = NULL) + drop = NULL, on = NULL, env = NULL, + showProgress = getOption("datatable.showProgress", interactive())) } \arguments{ \item{\dots}{ Just as \code{\dots} in \code{\link{data.frame}}. Usual recycling rules are applied to vectors of different lengths to create a list of equal length vectors.} @@ -177,6 +178,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac } \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } + + \item{showProgress}{ \code{TRUE} shows progress indicator with estimated time to completion for lengthy "by" operations. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/src/data.table.h b/src/data.table.h index ed63978d6..49f3f1634 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -189,7 +189,7 @@ void warn_matrix_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, - SEXP on, SEXP verbose); + SEXP on, SEXP verbose, SEXP showProgressArg); // bmerge.c SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, diff --git a/src/dogroups.c b/src/dogroups.c index e03ad84df..2728a8bcf 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -63,7 +63,7 @@ static bool anySpecialStatic(SEXP x) { return false; } -SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg) +SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; @@ -71,6 +71,10 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; const bool verbose = LOGICAL(verboseArg)[0]==1; + const bool showProgress = LOGICAL(showProgressArg)[0]==1; + bool hasPrinted = false; + double startTime = (showProgress) ? wallclock() : 0; + double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov @@ -169,7 +173,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // because it is a rare edge case for it to be true. See #4892. bool anyNA=false, orderedSubset=false; check_idx(order, length(VECTOR_ELT(dt, 0)), &anyNA, &orderedSubset); - for(int i=0; i-1)) continue; @@ -435,6 +438,19 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (copied) UNPROTECT(1); } } + // progress printing, #3060 + // could potentially refactor to use fread's progress() function, however we would lose some information in favor of simplicity. + double now; + if (showProgress && (now=wallclock())>=nextTime) { + double avgTimePerGroup = (now-startTime)/(i+1); + int ETA = (int)(avgTimePerGroup*(ngrp-i-1)); + if (hasPrinted || ETA >= 0) { + if (verbose && !hasPrinted) Rprintf(_("\n")); + Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA); + } + nextTime = now+1; + hasPrinted = true; + } ansloc += maxn; if (firstalloc) { nprotect++; // remember the first jval. If we UNPROTECTed now, we'd unprotect @@ -443,6 +459,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } else UNPROTECT(1); // the jval. Don't want them to build up. The first jval can stay protected till the end ok. } + if (showProgress && hasPrinted) Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds.\n"), ngrp, ngrp, 100.0, (int)(wallclock()-startTime), 0); if (isNull(lhs) && ans!=NULL) { if (ansloc < LENGTH(VECTOR_ELT(ans,0))) { if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0))); From 0ed550249ce132881ac5824152e5516d1b60f808 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Mon, 29 Jul 2024 13:45:26 -0700 Subject: [PATCH 04/21] set() adds new cols when rows aren't updated (#6204) * set() adds new cols when rows aren't updated * new test * review suggestions * review changes * NEWS * Update NEWS.md Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * new test * documentation update * consistent framing * grammar/clarity --------- Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 9 +++++++++ man/assign.Rd | 2 +- src/assign.c | 5 +++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index e3f7c9f4c..59a671410 100644 --- a/NEWS.md +++ b/NEWS.md @@ -178,6 +178,8 @@ d1[d2, on="id", verbose=TRUE] This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. +23. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 094af74c7..1f303c31e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -19002,3 +19002,12 @@ test(2277.2, DT[, closure(b), env=list(closure=var)], 0.5) test(2277.3, DT[, closure(b), env=list(closure=stats::var)], 0.5) test(2277.4, DT[, closure(b), env=list(closure=stats:::var)], 0.5) test(2277.5, DT[, lambda(b), env=list(lambda=function(x) sum(x))], 7L) + +# test that set() correctly adds new columns even if no rows are updated +dt = data.table(a=1L) +test(2278.1, set(copy(dt), 0L, "b", logical(0)), data.table(a=1L, b=NA)) +test(2278.2, set(copy(dt), NA_integer_, "b", NA), data.table(a=1L, b=NA)) +test(2278.3, set(copy(dt), 0L, "b", NA), copy(dt)[0L, b := NA]) +test(2278.4, set(copy(dt), NA_integer_, "b", logical(0)), copy(dt)[NA_integer_, b := logical(0)]) +test(2278.5, set(copy(dt), integer(0), "b", numeric(0)), copy(dt)[integer(0), b := numeric(0)]) +test(2278.6, { set(dt, 0L, "b", logical(0)); set(dt, 1L, "a", 2L); dt }, data.table(a=2L, b=NA)) diff --git a/man/assign.Rd b/man/assign.Rd index daeac56a7..71e954230 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -35,7 +35,7 @@ set(x, i = NULL, j, value) \item{LHS}{ A character vector of column names (or numeric positions) or a variable that evaluates as such. If the column doesn't exist, it is added, \emph{by reference}. } \item{RHS}{ A list of replacement values. It is recycled in the usual way to fill the number of rows satisfying \code{i}, if any. To remove a column use \code{NULL}. } \item{x}{ A \code{data.table}. Or, \code{set()} accepts \code{data.frame}, too. } -\item{i}{ Optional. Indicates the rows on which the values must be updated with. If not provided, implies \emph{all rows}. The \code{:=} form is more powerful as it allows \emph{subsets} and \code{joins} based add/update columns by reference. See \code{Details}. +\item{i}{ Optional. Indicates the rows on which the values must be updated. If not \code{NULL}, implies \emph{all rows}. Missing or zero values are ignored. The \code{:=} form is more powerful as it allows adding/updating columns by reference based on \emph{subsets} and \code{joins}. See \code{Details}. In \code{set}, only integer type is allowed in \code{i} indicating which rows \code{value} should be assigned to. \code{NULL} represents all rows more efficiently than creating a vector such as \code{1:nrow(x)}. } \item{j}{ Column name(s) (character) or number(s) (integer) to be assigned \code{value} when column(s) already exist, and only column name(s) if they are to be created. } diff --git a/src/assign.c b/src/assign.c index b1623875e..a78b9452c 100644 --- a/src/assign.c +++ b/src/assign.c @@ -378,12 +378,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) for (int i=0; inrow) error(_("i[%d] is %d which is out of range [1,nrow=%d]"), i+1, rowsd[i], nrow); // set() reaches here (test 2005.2); := reaches the same error in subset.c first - if (rowsd[i]>=1) numToDo++; + if (rowsd[i]>=0) numToDo++; } if (verbose) Rprintf(_("Assigning to %d row subset of %d rows\n"), numToDo, nrow); // TODO: include in message if any rows are assigned several times (e.g. by=.EACHI with dups in i) if (numToDo==0) { - if (!length(newcolnames)) { + // isString(cols) is exclusive to calls from set() + if (!length(newcolnames) && !isString(cols)) { *_Last_updated = 0; UNPROTECT(protecti); return(dt); // all items of rows either 0 or NA. !length(newcolnames) for #759 From 138477c60f87c27a60ceb3e92a6d65afcf313fe3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 29 Jul 2024 13:50:25 -0700 Subject: [PATCH 05/21] Set _R_CHECK_COMPILATION_FLAGS_KNOWN_ to suppress R CMD check NOTE (#6327) --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8a3fa4d71..77ecc5280 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -110,6 +110,7 @@ test-lin-rel: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-data.table variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" @@ -131,6 +132,8 @@ test-lin-rel: test-lin-rel-vanilla: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc + variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" script: - echo 'CFLAGS=-g -O0 -fno-openmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars @@ -143,6 +146,7 @@ test-lin-rel-cran: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes @@ -163,6 +167,7 @@ test-lin-dev-gcc-strict-cran: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-gcc-strict variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" ## detects S3 method lookup found on search path #4777 @@ -184,6 +189,7 @@ test-lin-dev-clang-cran: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-clang variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" From b8d5f83270d45a47316a41259bedae63f3d2854d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 30 Jul 2024 05:17:26 -0700 Subject: [PATCH 06/21] Setdroplevels for in-place removal of unused levels (#6316) * Add setdroplevels() to handle droplevels.data.table() * Export it * Some tests * NEWS --- NAMESPACE | 2 +- NEWS.md | 8 ++++++-- R/fdroplevels.R | 35 ++++++++++++++++++++--------------- inst/tests/tests.Rraw | 11 +++++++++-- man/fdroplevels.Rd | 2 ++ 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 109336c9e..2bc30543f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -201,5 +201,5 @@ S3method(format_col, expression) export(format_list_item) S3method(format_list_item, default) -export(fdroplevels) +export(fdroplevels, setdroplevels) S3method(droplevels, data.table) diff --git a/NEWS.md b/NEWS.md index 59a671410..d104981f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,9 +2,11 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) -## BREAKING CHANGE +## BREAKING CHANGES + +1. `droplevels(in.place=TRUE)` is deprecated in favor of calling `setdroplevels()`, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Given the associated risks/pain points, we strongly prefer all in-place/by-reference behavior within data.table come from functions `set*` (and `:=`) to make it as clear as possible that inputs are mutable. See below and `?setdroplevels` for more. -1. `` `[.data.table` `` is un-exported again. This was exported to support an experimental feature (`DT()` functional form of `[`) that never made it to release, but we forgot to claw back this export in the NAMESPACE; sorry about that. We didn't find anyone calling the method directly (which is inadvisable to begin with). +2. `` `[.data.table` `` is un-exported again. This was exported to support an experimental feature (`DT()` functional form of `[`) that never made it to release, but we forgot to claw back this export in the NAMESPACE; sorry about that. We didn't find anyone calling the method directly (which is inadvisable to begin with). ## NEW FEATURES @@ -52,6 +54,8 @@ 17. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. +18. New `setdroplevels()` as a by-reference version of the `droplevels()` method, which returns a copy of its input, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Thanks @MichaelChirico for the suggestion and implementation. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/fdroplevels.R b/R/fdroplevels.R index 69f23cb61..3116a3f85 100644 --- a/R/fdroplevels.R +++ b/R/fdroplevels.R @@ -8,19 +8,24 @@ fdroplevels = function(x, exclude = if (anyNA(levels(x))) NULL else NA, ...) { return(ans) } -droplevels.data.table = function(x, except = NULL, exclude, in.place = FALSE, ...){ - stopifnot(is.logical(in.place)) - if (nrow(x)==0L) return(x) - ix = vapply(x, is.factor, NA) - if(!is.null(except)){ - stopifnot(is.numeric(except), except <= length(x)) - ix[except] = FALSE - } - if(!sum(ix)) return(x) - if(!in.place) x = copy(x) - for(nx in names(ix)[ix==TRUE]){ - if (missing(exclude)) set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]])) - else set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]], exclude = exclude)) - } - return(x) +droplevels.data.table = function(x, except=NULL, exclude, in.place=FALSE, ...){ + stopifnot(is.logical(in.place)) + if (isTRUE(in.place)) warningf("droplevels() with in.place=TRUE is deprecated. Use setdroplevels() instead.") + if (!in.place) x = copy(x) + if (missing(exclude)) exclude = NULL + setdroplevels(x, except, exclude)[] +} + +setdroplevels = function(x, except=NULL, exclude=NULL) { + if (!nrow(x)) return(invisible(x)) + ix = vapply_1b(x, is.factor) + if (!is.null(except)) { + stopifnot(is.numeric(except), except >= 1, except <= length(x)) + ix[except] = FALSE + } + if (!any(ix)) return(invisible(x)) + for (nx in names(ix)[ix]) { + set(x, i=NULL, j=nx, value=fdroplevels(x[[nx]], exclude=exclude)) + } + invisible(x) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1f303c31e..b3447b04e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1682,7 +1682,7 @@ test(529.4, set(DT1, i=NULL, j=7L, value=5L), error="Item 1 of column numbers in # Test that data.frame incompability is fixed, came to light in Feb 2012 DT = data.table(name=c('a','b','c'), value=1:3) -test(530, base::droplevels(DT[ name != 'a' ]), data.table(name=c('b','c'),value=2:3)) # base:: because we'll implement a fast droplevels, too. +test(530, droplevels(DT[ name != 'a' ]), data.table(name=c('b','c'),value=2:3)) # Test that .set_row_names() is maintained on .SD for each group DT = data.table(a=INT(1,1,2,2,2,3,3,3,3),b=1:9) @@ -17732,7 +17732,7 @@ if (base::getRversion() >= "3.4.0") { } test(2214.06, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) test(2214.07, droplevels(DT, 1)[["a"]], x[1:5]) -test(2214.08, droplevels(DT, in.place=TRUE), DT) +test(2214.08, droplevels(DT, in.place=TRUE), DT, warning="droplevels() with in.place=TRUE is deprecated.") # support ordered factors in fdroplevels o = factor(letters[1:10], ordered=TRUE) test(2214.09, fdroplevels(o[1:5]), droplevels(o[1:5])) @@ -17740,6 +17740,13 @@ test(2214.09, fdroplevels(o[1:5]), droplevels(o[1:5])) test(2214.10, droplevels(DT[0]), DT[0]) test(2214.11, droplevels(data.table()), data.table()) +# setdroplevels() for in-place operations #6014 +x = factor(letters[1:10]) +DT = data.table(a = x)[1:5] +test(2214.12, setdroplevels(DT, except=1L), DT) # don't do anything +test(2214.13, setdroplevels(DT, except=0L), error="except >= 1") +test(2214.14, setdroplevels(DT, except=2L), error="except <= length(x)") +test(2214.15, setdroplevels(DT), DT) # factor i should be just like character i and work, #1632 DT = data.table(A=letters[1:3], B=4:6, key="A") diff --git a/man/fdroplevels.Rd b/man/fdroplevels.Rd index 98334f011..724399d20 100644 --- a/man/fdroplevels.Rd +++ b/man/fdroplevels.Rd @@ -2,6 +2,7 @@ \alias{fdroplevels} \alias{droplevels} \alias{droplevels.data.table} +\alias{setdroplevels} \title{Fast droplevels} \description{ Similar to \code{base::droplevels} but \emph{much faster}. @@ -9,6 +10,7 @@ \usage{ fdroplevels(x, exclude = if (anyNA(levels(x))) NULL else NA, \dots) +setdroplevels(x, except = NULL, exclude = NULL) \method{droplevels}{data.table}(x, except = NULL, exclude, in.place = FALSE, \dots) } From 488bdd2e068e4e56444916b75a61e029d3d4d296 Mon Sep 17 00:00:00 2001 From: rtobar Date: Thu, 1 Aug 2024 01:40:41 +0800 Subject: [PATCH 07/21] Mark fread's init function as static (#6328) * Mark fread's init function as static The function isn't used elsewhere, and making it publicly accessible opens the door for runtime linking issues -- where the function is served by other libraries exposing the same function. This was seen in a HPC cluster with software built with spack: 0 0x00001555513d8ce0 in init () from /opt/cray/pe/lib64/libsci_gnu_82_mpi.so.5 1 0x00001555433f46ba in parse_double_extended (...) at fread.c:819 2 0x00001555433f3e97 in detect_types (...) at fread.c:1203 3 0x00001555433f7959 in freadMain (...) at fread.c:1852 4 0x00001555433fd84d in freadR (...) at fRead.c:217 Signed-off-by: Rodrigo Tobar * rm ws * correct placement --------- Signed-off-by: Rodrigo Tobar Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ src/fread.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d104981f6..83f48d685 100644 --- a/NEWS.md +++ b/NEWS.md @@ -184,6 +184,8 @@ This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/43 23. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. +24. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix. diff --git a/src/fread.c b/src/fread.c index e301e8cd1..45efa6eeb 100644 --- a/src/fread.c +++ b/src/fread.c @@ -84,7 +84,7 @@ static double NAND; static double INFD; // NAN and INFINITY constants are float, so cast to double once up front. -void init(void) { +static void init(void) { NAND = (double)NAN; INFD = (double)INFINITY; } From 7f18c097bce775793d576eebec2d1a88ba8e78ec Mon Sep 17 00:00:00 2001 From: Rafael Fontenelle Date: Wed, 31 Jul 2024 18:53:07 -0300 Subject: [PATCH 08/21] Use proper language name in R-pt_BR.po (#6332) --- po/R-pt_BR.po | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/po/R-pt_BR.po b/po/R-pt_BR.po index b0db1159c..4fb95ffb8 100644 --- a/po/R-pt_BR.po +++ b/po/R-pt_BR.po @@ -2771,10 +2771,10 @@ msgid "" "**********" msgstr "" "**********\n" -"Executando data.table em inglês; o suporte ao pacote está disponível apenas " +"Executando data.table em português; o suporte ao pacote está disponível apenas " "em inglês. Ao procurar ajuda online, certifique-se de verificar também a " "mensagem de erro em inglês. Isso pode ser obtido examinando os arquivos po/R-" -".po e po/.po no código-fonte do pacote, onde as mensagens de " +"pt_BR.po e po/pt_BR.po no código-fonte do pacote, onde as mensagens de " "erro no idioma nativo e em inglês podem ser encontradas lado a lado.\n" "**********" From c36c84ff4a78313f2c71e176553549c9c3c7a5dd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 31 Jul 2024 20:15:27 -0700 Subject: [PATCH 09/21] Don't include po/ directory in bundled package (#6331) --- .Rbuildignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 6e6b8b401..a6996ac55 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -46,3 +46,6 @@ ^lib$ ^library$ ^devwd$ + +# only the inst/po compressed files are needed, not raw .pot/.po +^po$ From 2349536e0cac05935781f553b31cea4946cf996a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Aug 2024 09:50:24 -0400 Subject: [PATCH 10/21] melt warns for measure.vars=list of length=1 (#6333) Co-authored-by: Michael Chirico --- NEWS.md | 4 ++-- inst/tests/tests.Rraw | 8 ++++---- src/fmelt.c | 7 +++++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 83f48d685..fdc0db931 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,8 +70,6 @@ 6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. -7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. Any users who were relying on this behavior can change `measure.vars=list("col_name")` (output `variable` was column name, now is column index/integer) to `measure.vars="col_name"` (`variable` still is column name). - 8. Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. 9. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. @@ -94,6 +92,8 @@ ## NOTES +7. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. + 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. 2. The documentation for the `fill` argument in `rbind()` and `rbindlist()` now notes the expected behaviour for missing `list` columns when `fill=TRUE`, namely to use `NULL` (not `NA`), [#4198](https://github.com/Rdatatable/data.table/pull/4198). Thanks @sritchie73 for the proposal and fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b3447b04e..57a348ce1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17257,13 +17257,13 @@ exid = data.table(id=1, expected) test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) -test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("1","2")), b=c(1,2))) # measure.vars named list length=1, #5065 +test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("b1","b2")), b=c(1,2)), warning="measure.vars is a list with length=1") # measure.vars named list length=1, #5065 # consistency between measure.vars=list with length=1 and length>1, #5209 -test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor(1), value=2)) +test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2), warning="measure.vars is a list with length=1") test(2182.72, melt(DT.wide, measure.vars=c("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) -test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="1", value=2)) +test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2), warning="measure.vars is a list with length=1") test(2182.74, melt(DT.wide, measure.vars=c("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2)) -test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="1", n=10))#thanks @mnazarov +test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="a", n=10), warning="measure.vars is a list with length=1")#thanks @mnazarov ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues diff --git a/src/fmelt.c b/src/fmelt.c index 51f4fbbb8..77a48f6a4 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -595,9 +595,12 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov if (isNull(data->variable_table)) { + if ((data->lvalues == 1) & data->measure_is_list) { + warning("measure.vars is a list with length=1, which as long documented should return integer indices in the 'variable' column, but currently returns character column names. To increase consistency in the next release, we plan to change 'variable' to integer, so users who were relying on this behavior should change measure.vars=list('col_name') (output variable is column name now, but will become column index/integer) to measure.vars='col_name' (variable is column name before and after the planned change)."); + } if (!varfactor) { SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; @@ -616,7 +619,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++; From d3f1554fb98d5cca8200d31d3b9672814a22b834 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 1 Aug 2024 14:01:31 -0700 Subject: [PATCH 11/21] lin-ancient job for oldest R dep (#5984) * lin-ancient job for oldest R dep * 3.2->3.3 --------- Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> --- .gitlab-ci.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 77ecc5280..83def117b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -203,10 +203,9 @@ test-lin-dev-clang-cran: - >- Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 3 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (size of tarball, installed package size, non-API calls) but ", shQuote(l)) else q("no")' -## R 3.1.0 # stated dependency on R -test-lin-310-cran: - image: registry.gitlab.com/jangorecki/dockerfiles/r-3.1.0 +test-lin-ancient-cran: + image: registry.gitlab.com/jangorecki/dockerfiles/r-3.3.0 <<: *test-lin script: - *install-deps @@ -301,7 +300,7 @@ integration: - saas-linux-medium-amd64 only: - master - needs: ["mirror-packages","build","test-lin-rel","test-lin-rel-cran","test-lin-dev-gcc-strict-cran","test-lin-dev-clang-cran","test-lin-rel-vanilla","test-lin-310-cran","test-win-rel","test-win-dev" ,"test-win-old"] + needs: ["mirror-packages","build","test-lin-rel","test-lin-rel-cran","test-lin-dev-gcc-strict-cran","test-lin-dev-clang-cran","test-lin-rel-vanilla","test-lin-ancient-cran","test-win-rel","test-win-dev" ,"test-win-old"] script: - R --version - *install-deps ## markdown pkg not present in r-pkgdown image From b02c8acabab413918363dda09d9ae6f280c5185b Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Aug 2024 22:08:53 -0400 Subject: [PATCH 12/21] link translation teams page --- .github/CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 211cb063a..706ab2933 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -114,7 +114,7 @@ and then the `commit` and `push` shown above would push to the branch in the mai #### Translations -`data.table` offers some translations so that our users can get feedback (errors, warnings, verbose messages) in their native language. Currently we only support Mandarin Chinese. +`data.table` offers some translations so that our users can get feedback (errors, warnings, verbose messages) in their native language. There is a [translation team](https://github.com/orgs/Rdatatable/teams/translators/teams) for each currently supported translation. The data for these translations lives in the `po` folder. You do not need to make any changes here for your PR -- translations are updated in a batch before each CRAN release. From 8f45cf4dd9185876ad07c5b4cef2b1e27d42c9d4 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 2 Aug 2024 12:55:06 -0400 Subject: [PATCH 13/21] dcast warns when agg fun returns length!=1 and fill is not NULL (#6329) Co-authored-by: Toby Dylan Hocking Co-authored-by: Michael Chirico --- NEWS.md | 6 ++++-- R/fcast.R | 9 ++++++++- inst/tests/tests.Rraw | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index fdc0db931..553d436cb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -92,8 +92,6 @@ ## NOTES -7. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. - 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. 2. The documentation for the `fill` argument in `rbind()` and `rbindlist()` now notes the expected behaviour for missing `list` columns when `fill=TRUE`, namely to use `NULL` (not `NA`), [#4198](https://github.com/Rdatatable/data.table/pull/4198). Thanks @sritchie73 for the proposal and fix. @@ -186,6 +184,10 @@ This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/43 24. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. +25. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. + +26. `dcast()` docs have always required aggregation function to return a single value, and when `fill=NULL`, `dcast` would error if vector with `length!=1` was returned, but silently return an undefined result when fill is not `NULL`. Now `dcast` will additionally warn that this is undefined behavior, when fill is not `NULL`, [#6032](https://github.com/Rdatatable/data.table/issues/6032). In particular, this will warn for `fun.aggregate=identity`, which was observed in several revdeps. We may change this to an error in a future release, so revdeps should fix their code as soon as possible. Thanks to Toby Dylan Hocking for the PR, and Michael Chirico for analysis of GitHub revdeps. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix. diff --git a/R/fcast.R b/R/fcast.R index 9675f0170..71360e135 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -191,7 +191,14 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) maybe_err = function(list.of.columns) { - if (any(lengths(list.of.columns) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") + if (!all(lengths(list.of.columns) == 1L)) { + msg <- gettext("Aggregating function(s) should take a vector as input and return a single value (length=1), but they do not, so the result is undefined. Please fix by modifying your function so that a single value is always returned.") + if (is.null(fill)) { # TODO change to always stopf #6329 + stop(msg, domain=NA, call. = FALSE) + } else { + warning(msg, domain=NA, call. = FALSE) + } + } list.of.columns } dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)] diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 57a348ce1..696697ed9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3704,7 +3704,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # bug git #693 - dcast error message improvement: DT = data.table(x=c(1,1), y=c(2,2), z = 3:4) - test(1102.19, dcast(DT, x ~ y, value.var="z", fun.aggregate=identity), error="should take vector inputs and return a single value") + test(1102.19, dcast(DT, x ~ y, value.var="z", fun.aggregate=identity), error="should take a vector as input and return a single value") # bug #688 - preserving attributes DT = data.table(id = c(1,1,2,2), ty = c("a","b","a","b"), da = as.Date("2014-06-20")) @@ -3727,7 +3727,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # issues/715 DT = data.table(id=rep(1:2, c(3,2)), k=c(letters[1:3], letters[1:2]), v=1:5) - test(1102.25, dcast(DT, id ~ k, fun.aggregate=last, value.var="v"), error="should take vector inputs and return a single value") + test(1102.25, dcast(DT, id ~ k, fun.aggregate=last, value.var="v"), error="should take a vector as input and return a single value") test(1102.26, dcast(DT, id ~ k, fun.aggregate=last, value.var="v", fill=NA_integer_), data.table(id=1:2, a=c(1L, 4L), b=c(2L,5L), c=c(3L,NA_integer_), key="id")) # Fix for #893 From 9ff288592251dc7bf06e1d352e4414e786fa817b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Fri, 2 Aug 2024 23:53:20 +0200 Subject: [PATCH 14/21] lighten name check for lin-ancient (#6338) * lighten name check for lin-ancient * remove slipped in file --------- Co-authored-by: Michael Chirico --- .ci/publish.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/publish.R b/.ci/publish.R index 7d43a44e6..a07a4ce90 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -163,7 +163,7 @@ r.ver <- function(x) { v = tmp[3L] if (identical(v, "rel")) "r-release" else if (identical(v, "dev")) "r-devel" - else if (identical(v, "old")) "r-oldrel" + else if (identical(v, "old") || identical(v, "ancient")) "r-oldrel" else { if (grepl("\\D", v)) stop("third word in test job name must be rel/dev/old or numbers of R version") paste0("r-", paste(strsplit(v, "")[[1L]], collapse=".")) From 268dd5fa7ce8bc2b4d776e8f8aebe40748ae52c9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 2 Aug 2024 22:13:44 +0000 Subject: [PATCH 15/21] Fix numbering of NEWS sections --- NEWS.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/NEWS.md b/NEWS.md index 553d436cb..ee587597d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,25 +70,25 @@ 6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. -8. Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. +7. Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. -9. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. +8. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. -10. `dt[,,by=año]` (i.e., using a column name containing a non-ASCII character in `by` as a plain symbol) no longer errors with "object 'año' not found", #4708. Thanks @pfv07 for the report, and Michael Chirico for the fix. +9. `dt[,,by=año]` (i.e., using a column name containing a non-ASCII character in `by` as a plain symbol) no longer errors with "object 'año' not found", #4708. Thanks @pfv07 for the report, and Michael Chirico for the fix. -11. Fix some memory management issues in the C routines backing `melt()`, `froll()`, and GForce `mean()`, as identified by `rchk`. Thanks Tomas Kalibera and the CRAN team for setting up the `rchk` system, and @MichaelChirico for the fix. +10. Fix some memory management issues in the C routines backing `melt()`, `froll()`, and GForce `mean()`, as identified by `rchk`. Thanks Tomas Kalibera and the CRAN team for setting up the `rchk` system, and @MichaelChirico for the fix. -12. data.table's `all.equal()` method now dispatches to each column's own `all.equal()` method as appropriate, [#4543](https://github.com/Rdatatable/data.table/issues/4543). Thanks @MichaelChirico for the report and fix. Note that this had two noteworthy changes to data.table's own test suite that might affect you: (1) comparisons of POSIXct columns compare absolute, not relative differences, meaning that millisecond-scale differences might trigger a "not equal" report that was hidden before; and (2) comparisons of integer64 columns could be totally wrong since they were being compared on the basis of their representation as doubles, not long integers. The former might be a matter of preference requiring you to specify a different `tolerance=`, while the latter was clearly a bug. +11. data.table's `all.equal()` method now dispatches to each column's own `all.equal()` method as appropriate, [#4543](https://github.com/Rdatatable/data.table/issues/4543). Thanks @MichaelChirico for the report and fix. Note that this had two noteworthy changes to data.table's own test suite that might affect you: (1) comparisons of POSIXct columns compare absolute, not relative differences, meaning that millisecond-scale differences might trigger a "not equal" report that was hidden before; and (2) comparisons of integer64 columns could be totally wrong since they were being compared on the basis of their representation as doubles, not long integers. The former might be a matter of preference requiring you to specify a different `tolerance=`, while the latter was clearly a bug. -13. `rbindlist` and `shift` could lead to a protection stack overflow when applied to a list containing many nested lists exceeding the pointer protection stack size, [#4536](https://github.com/Rdatatable/data.table/issues/4536). Thanks to @ProfFancyPants for reporting, and Benjamin Schwendinger (`rbindlist`) and @MichaelChirico (`shift`) for the fix. +12. `rbindlist` and `shift` could lead to a protection stack overflow when applied to a list containing many nested lists exceeding the pointer protection stack size, [#4536](https://github.com/Rdatatable/data.table/issues/4536). Thanks to @ProfFancyPants for reporting, and Benjamin Schwendinger (`rbindlist`) and @MichaelChirico (`shift`) for the fix. -14. `fread(x, colClasses="POSIXct")` now also works for columns containing only NA values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and Benjamin Schwendinger for the fix. +13. `fread(x, colClasses="POSIXct")` now also works for columns containing only NA values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and Benjamin Schwendinger for the fix. -15. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. +14. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. -16. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report. +15. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report. -17. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. +16. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. ## NOTES @@ -140,7 +140,7 @@ 22. C code was unified more in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. -22. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. +23. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. @@ -180,13 +180,13 @@ d1[d2, on="id", verbose=TRUE] This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. -23. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. +24. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. -24. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. +25. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. -25. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. +26. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. -26. `dcast()` docs have always required aggregation function to return a single value, and when `fill=NULL`, `dcast` would error if vector with `length!=1` was returned, but silently return an undefined result when fill is not `NULL`. Now `dcast` will additionally warn that this is undefined behavior, when fill is not `NULL`, [#6032](https://github.com/Rdatatable/data.table/issues/6032). In particular, this will warn for `fun.aggregate=identity`, which was observed in several revdeps. We may change this to an error in a future release, so revdeps should fix their code as soon as possible. Thanks to Toby Dylan Hocking for the PR, and Michael Chirico for analysis of GitHub revdeps. +27. `dcast()` docs have always required aggregation function to return a single value, and when `fill=NULL`, `dcast` would error if vector with `length!=1` was returned, but silently return an undefined result when fill is not `NULL`. Now `dcast` will additionally warn that this is undefined behavior, when fill is not `NULL`, [#6032](https://github.com/Rdatatable/data.table/issues/6032). In particular, this will warn for `fun.aggregate=identity`, which was observed in several revdeps. We may change this to an error in a future release, so revdeps should fix their code as soon as possible. Thanks to Toby Dylan Hocking for the PR, and Michael Chirico for analysis of GitHub revdeps. ## TRANSLATIONS @@ -346,7 +346,7 @@ This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/43 23. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. -25. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. Also thanks to Manuel López-Ibáñez for testing dev and mentioning needed documentation before release. +24. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. Also thanks to Manuel López-Ibáñez for testing dev and mentioning needed documentation before release. 25. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. From 2dac309515d3d3ec9f645a050c425085082a224a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 2 Aug 2024 22:01:49 -0700 Subject: [PATCH 16/21] Add Ani & Doris as contributors (#6344) --- DESCRIPTION | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b78495edc..9fba4f71a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -86,5 +86,7 @@ Authors@R: c( person("Dmitry", "Shemetov", role="ctb"), person("Nitish", "Jha", role="ctb"), person("Joshua", "Wu", role="ctb"), - person("Iago", "Giné-Vázquez", role="ctb") + person("Iago", "Giné-Vázquez", role="ctb"), + person("Anirban", "Chetia", role="ctb"), + person("Doris", "Amoakohene", role="ctb") ) From 70ebba6a4a2cdafd542fc656b11656aac79055f8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 4 Aug 2024 17:37:43 -0700 Subject: [PATCH 17/21] Take a pass at condensing and tightening the NEWS (#6348) * Take a pass at condensing and tightening the NEWS * fix numbering --------- Co-authored-by: Benjamin Schwendinger --- NEWS.md | 181 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 93 insertions(+), 88 deletions(-) diff --git a/NEWS.md b/NEWS.md index ee587597d..f9ccdaa5f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,21 @@ ## NEW FEATURES -1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. +1. We continue to use user feedback to prioritize development. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) for the current list of most-requested issues. In this release we add five highly-requested features: + + a. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works to update all columns, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. + + b. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). Thanks @mattdowle for the original issue, 50 or more others for expressing support, and @MichaelChirico for the fix. + + c. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#4258](https://github.com/Rdatatable/data.table/issues/4258). Thanks @sindribaldur for the feature request, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. + + d. `[.data.table` gains argument `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). The progress bar reports information such as the number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply to `GForce` optimized operations. Thanks to @eatonya and @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. + + e. `rbindlist(l, use.names=TRUE)` and `rbind` now work correctly on columns with different class attributes across the inputs for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with matched columns of similar classes, e.g., `rbind(data.table(d = Sys.Date()), data.table(d = as.IDate(Sys.Date()-1)))`. The conversion is done automatically and the class attribute of the final column is determined by the first class attribute encountered in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). + + `rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` (default `FALSE`) to manually deactivate the safety net preventing binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. + +2. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. ```R data.table(a=list(NULL, "")) @@ -20,41 +34,31 @@ # 2: ``` -2. `cedta()` now returns `FALSE` if `.datatable.aware = FALSE` is set in the calling environment, [#5654](https://github.com/Rdatatable/data.table/issues/5654). Thanks @dvg-p4 for the request and PR. +3. `cedta()` now returns `FALSE` if `.datatable.aware = FALSE` is set in the calling environment, [#5654](https://github.com/Rdatatable/data.table/issues/5654). Thanks @dvg-p4 for the request and PR. -3. `split.data.table` also accepts a formula for `f`, [#5392](https://github.com/Rdatatable/data.table/issues/5392), mirroring the same in `base::split.data.frame` since R 4.1.0 (May 2021). Thanks to @XiangyunHuang for the request, and @ben-schwen for the PR. - -4. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. - -5. `transpose` gains `list.cols=` argument, [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoids type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and Benjamin Schwendinger for the PR. - -6. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. - -7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Using `fill=Inf` reads the full file for estimating the number of columns [#2727](https://github.com/Rdatatable/data.table/issues/2727). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. - -8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. +4. The `split()` method for `data.table`s is more consistent with that for base methods: -9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. + a. `f` can be a formula, [#5392](https://github.com/Rdatatable/data.table/issues/5392), mirroring the same in `base::split.data.frame` since R 4.1.0 (May 2021). Thanks to @XiangyunHuang for the request, and @ben-schwen for the PR. -10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. + b. `sep=` is recognized when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). Thanks @MichaelChirico for the request and PR. -11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). +5. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. -12. `setDT` is faster for data with many columns, thanks @MichaelChirico for reporting and fixing the issue, [#5426](https://github.com/Rdatatable/data.table/issues/5426). +6. `transpose` gains `list.cols=` argument (default `FALSE`), [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoid type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and @ben-schwen for the PR. -13. `dcast` gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR. +7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values -- an upper bound on the number of columns in the file. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2691](https://github.com/Rdatatable/data.table/issues/2691), [#4130](https://github.com/Rdatatable/data.table/issues/4130), [#3436](https://github.com/Rdatatable/data.table/issues/3436), [#1812](https://github.com/Rdatatable/data.table/issues/1812) and [#5378](https://github.com/Rdatatable/data.table/issues/5378). The suggestion for `fill` to allow a manual estimate of the number of columns instead comes from [#2727](https://github.com/Rdatatable/data.table/issues/2727). Using `fill=Inf` reads the full file for estimating the number of columns. Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and @ben-schwen for the PR. -14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. +8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. -15. `rbindlist(l, use.names=TRUE)` and `rbind` now works correctly on columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). +9. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. -`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. +10. `setDT` is faster for data with many columns, thanks @MichaelChirico for reporting and fixing the issue, [#5426](https://github.com/Rdatatable/data.table/issues/5426). -16. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks @sindribaldur for the feature request, which has been highly requested, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. +11. `dcast` gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR. -17. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. +12. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and @ben-schwen for the PR. -18. New `setdroplevels()` as a by-reference version of the `droplevels()` method, which returns a copy of its input, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Thanks @MichaelChirico for the suggestion and implementation. +13. New `setdroplevels()` as a by-reference version of the `droplevels()` method, which returns a copy of its input, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Thanks @MichaelChirico for the suggestion and implementation. ## BUG FIXES @@ -62,9 +66,9 @@ 2. `dcast` handles coercion of `fill` to `integer64` correctly, [#4561](https://github.com/Rdatatable/data.table/issues/4561). Thanks to @emallickhossain for the bug report and @MichaelChirico for the fix. -3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). Thanks to @renkun-ken for the report and Benjamin Schwendinger for the fix. +3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). Thanks to @renkun-ken for the report and @ben-schwen for the fix. -4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to Toby Dylan Hocking for the fix. +4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings which were potentially confusing (for example, when `fun.aggregate=min` or `max`, warning was "NAs introduced by coercion to integer range"), [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to @tdhock for the report and fix. 5. `fwrite(x, row.names=TRUE)` with `x` a `matrix` writes `row.names` when present, not row numbers, [#5315](https://github.com/Rdatatable/data.table/issues/5315). Thanks to @Liripo for the report, and @ben-schwen for the fix. @@ -74,19 +78,19 @@ 8. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. -9. `dt[,,by=año]` (i.e., using a column name containing a non-ASCII character in `by` as a plain symbol) no longer errors with "object 'año' not found", #4708. Thanks @pfv07 for the report, and Michael Chirico for the fix. +9. `dt[,,by=año]` (i.e., using a column name containing a non-ASCII character in `by` as a plain symbol) no longer errors with "object 'año' not found", #4708. Thanks @pfv07 for the report, and @MichaelChirico for the fix. -10. Fix some memory management issues in the C routines backing `melt()`, `froll()`, and GForce `mean()`, as identified by `rchk`. Thanks Tomas Kalibera and the CRAN team for setting up the `rchk` system, and @MichaelChirico for the fix. +10. Fixed some memory management issues in the C routines backing `melt()`, `froll()`, and GForce `mean()`, as identified by `rchk`. Thanks Tomas Kalibera and the CRAN team for setting up the `rchk` system, and @MichaelChirico for the fix. 11. data.table's `all.equal()` method now dispatches to each column's own `all.equal()` method as appropriate, [#4543](https://github.com/Rdatatable/data.table/issues/4543). Thanks @MichaelChirico for the report and fix. Note that this had two noteworthy changes to data.table's own test suite that might affect you: (1) comparisons of POSIXct columns compare absolute, not relative differences, meaning that millisecond-scale differences might trigger a "not equal" report that was hidden before; and (2) comparisons of integer64 columns could be totally wrong since they were being compared on the basis of their representation as doubles, not long integers. The former might be a matter of preference requiring you to specify a different `tolerance=`, while the latter was clearly a bug. 12. `rbindlist` and `shift` could lead to a protection stack overflow when applied to a list containing many nested lists exceeding the pointer protection stack size, [#4536](https://github.com/Rdatatable/data.table/issues/4536). Thanks to @ProfFancyPants for reporting, and Benjamin Schwendinger (`rbindlist`) and @MichaelChirico (`shift`) for the fix. -13. `fread(x, colClasses="POSIXct")` now also works for columns containing only NA values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and Benjamin Schwendinger for the fix. +13. `fread(x, colClasses="POSIXct")` now also works for columns containing only `NA` values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and @ben-schwen for the fix. -14. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. +14. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte of the file is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. -15. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report. +15. Selecting the key column like `DT[, .(key1, key2)]` will retain the key without a performance penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report and @MichaelChirico for the fix. 16. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. @@ -102,91 +106,92 @@ 5. Input files are now kept open during `mmap()` when running under Emscripten, [emscripten-core/emscripten#20459](https://github.com/emscripten-core/emscripten/issues/20459). This avoids an error in `fread()` when running in WebAssembly, [#5969](https://github.com/Rdatatable/data.table/issues/5969). Thanks to @maek-ies for the report and @georgestagg for the PR. -6. `dcast()` message about `fun.aggregate` defaulting to `length()` when aggregation is necessary, which could be confusing if duplicates were unexpected, does better explaining the behavior and suggesting alternatives, [#5217](https://github.com/Rdatatable/data.table/issues/5217). Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. - -7. Updated a test relying on `>` working for comparing language objects to a string, which will be deprecated by R, [#5977](https://github.com/Rdatatable/data.table/issues/5977); no user-facing effect. Thanks to R-core for continuously improving the language. +6. `dcast()` improves behavior for the situation that the `fun.aggregate` value of `length()` is used but not provided by the user. -8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the uxpected setup. + a. This now triggers a warning, not a message, since relying on this default often signals unexpected duplicates in the data, [#5386](https://github.com/Rdatatable/data.table/issues/5386). The warning is classed as `dt_missing_fun_aggregate_warning`, allowing for more targeted handling in user code. Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. -9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. + b. The warning itself does better explaining the behavior and suggesting alternatives, [#5217](https://github.com/Rdatatable/data.table/issues/5217). Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. -10. `test.data.table()` runs robustly: - + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. - + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix. +7. Updated a test relying on operator `>` working for comparing language objects to a string, which will be deprecated by R, [#5977](https://github.com/Rdatatable/data.table/issues/5977); no user-facing effect. Thanks to R-core for continuously improving the language. -11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. +8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the expected setup. -12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix. +9. `print.data.table`: -13. `test.data.table()` now initialises the numeric rounding value to 0 using `setNumericRounding(0)` to avoid failed tests if the user has set a different value, [#6082](https://github.com/Rdatatable/data.table/issues/6082). The user's value is restored on exit. Thanks to @MichaelChirico for the report and for describing the solution, and @markseeto for implementing. + a. Now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. -14. `setNumericRounding()` now invisibly returns the old rounding value instead of `NULL`, which is now consistent with similar behavior by `setwd()`, `options()`, etc. Thanks @MichaelChirico for the report and @joshhwuu for the fix. + b. Prints the indicator `---` in every value column when truncation is needed and `row.names = FALSE` instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. -15. `dcast()` now issues a warning when `fun.aggregate` is used but not provided by the user. `fun.aggregate` defaults to `length` in this case. Previously, only a message was issued. However, relying on this default often signals unexpected duplicates in the data. Therefore, a stricter class of signal was deemed more appropriate, [#5386](https://github.com/Rdatatable/data.table/issues/5386). The warning is classed as `dt_missing_fun_aggregate_warning`, allowing for more targeted handling in user code. Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. + c. Honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix. -16. `print.data.table` gains new argument `show.indices` and option `datatable.show.indices` that allows the user to print a `data.table`'s indices as columns without having to modify the `data.table` itself. Thanks @MichaelChirico for the report and @joshhwuu for the PR. + d. Gains new argument `show.indices` and option `datatable.show.indices` that allows the user to print a `data.table`'s indices as columns without having to modify the `data.table` itself. Thanks @MichaelChirico for the report and @joshhwuu for the PR. -17. The `measure` and `patterns` functions are now exported for use within `[` and `melt()` to ensure consistency with other non-standard evaluation (NSE) exports like `.N` and `:=`. This change addresses [#5604](https://github.com/Rdatatable/data.table/issues/5604), allowing package developers to import these names and avoid `R CMD check` `NOTE`s about undefined variables. Thanks to @MichaelChirico and @ylelkes for their suggestions, and to @Nj221102 for the implementation. + e. Displays `integer64` columns well even if {bit64} has not yet been loaded, [#6224](https://github.com/Rdatatable/data.table/issues/6224). Thanks @renkun-ken for the report and @MichaelChirico for the fix. - We plan to export similar placeholders for `.` and `J` in roughly one year (e.g. data.table 1.18.0), but excluded them from this release to avoid back-compatibility issues. Specifically, some packages doing `import(plyr)` _and_ `import(data.table)`, and/or with those packages in `Depends`, will error when data.table starts exporting `.` (and similarly for a potential conflict with `rJava::J()`). We discourage using data.table (or any package, really) in Depends; blanket `import()` of package is also generally best avoided. See `vignette("datatable-importing")`. +10. `test.data.table()` runs robustly: + + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. + + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix. + + Initialises the numeric rounding value to 0 using `setNumericRounding(0)` to avoid failed tests if the user has set a different value, [#6082](https://github.com/Rdatatable/data.table/issues/6082). The user's value is restored on exit. Thanks to @MichaelChirico for the report and for describing the solution, and @markseeto for implementing. To enable this, `setNumericRounding()` now invisibly returns the old rounding value instead of `NULL`, which is consistent with similar behavior by `setwd()`, `options()`, etc. Thanks @MichaelChirico for the report and @joshhwuu for the fix. -18. `integer64` columns print well even if {bit64} has not yet been loaded, [#6224](https://github.com/Rdatatable/data.table/issues/6224). Thanks @renkun-ken for the report and @MichaelChirico for the fix. +11. The `measure` and `patterns` functions are now exported for use within `[` and `melt()` to ensure consistency with other non-standard evaluation (NSE) exports like `.N` and `:=`. This change addresses [#5604](https://github.com/Rdatatable/data.table/issues/5604), allowing package developers to import these names and avoid `R CMD check` `NOTE`s about undefined variables. Thanks to @MichaelChirico and @ylelkes for their suggestions, and to @Nj221102 for the implementation. -19. `fwrite()` header names are no longer quoted automatically when `na` argument is given, [#2964](https://github.com/Rdatatable/data.table/issues/2964). Thanks @jangorecki for the report and @joshhwuu for the fix. + We plan to export similar placeholders for `.` and `J` in roughly one year (e.g. data.table 1.18.0), but excluded them from this release to avoid back-compatibility issues. Specifically, some packages doing `import(plyr)` _and_ `import(data.table)`, and/or with those packages in `Depends`, will error when data.table starts exporting `.` (and similarly for a potential conflict with `rJava::J()`). We discourage using data.table (or any package, really) in Depends; blanket `import()` of package is also generally best avoided. See `vignette("datatable-importing")`. -20. Removed a warning about the now totally-obsolete option `datatable.CJ.names`, as discussed in previous releases. +12. `fwrite()` header names are no longer quoted automatically when `na` argument is given, [#2964](https://github.com/Rdatatable/data.table/issues/2964). Thanks @jangorecki for the report and @joshhwuu for the fix. -21. Refactored some non-API calls in the package C code, (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users, R core, and especially Luke Tierney for pushing to have a clearer definition of "API" for R and for offering clear documentation and suggested workarounds. Thanks @MichaelChirico and @TysonStanley for implementing changes for this release; more will follow. +13. Removed a warning about the now totally-obsolete option `datatable.CJ.names`, as discussed in previous releases. -22. C code was unified more in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. +14. Refactored some non-API calls in the package C code, (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users, R core, and especially Luke Tierney for pushing to have a clearer definition of "API" for R and for offering clear documentation and suggested workarounds. Thanks @MichaelChirico and @TysonStanley for implementing changes for this release; more will follow. -23. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. +15. C code is more unified in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. -Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. +16. The internal routine for finding sort order (`forder`) will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. -Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown. + Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. -```r -d2 = data.table(id=2:1, v2=1:2) -setindexv(d2, "id") -str(attr(attr(d2, "index"), "__id")) -# int [1:2] 2 1 -# - attr(*, "starts")= int [1:2] 1 2 -# - attr(*, "maxgrpn")= int 1 -# - attr(*, "anyna")= int 0 -# - attr(*, "anyinfnan")= int 0 -# - attr(*, "anynotascii")= int 0 -# - attr(*, "anynotutf8")= int 0 + Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown. -d2 = data.table(id=2:1, v2=1:2) -invisible(d2[id==1L]) -str(attr(attr(d2, "index"), "__id")) -# int [1:2] 2 1 -``` + ```r + d2 = data.table(id=2:1, v2=1:2) + setindexv(d2, "id") + str(attr(attr(d2, "index"), "__id")) + # int [1:2] 2 1 + # - attr(*, "starts")= int [1:2] 1 2 + # - attr(*, "maxgrpn")= int 1 + # - attr(*, "anyna")= int 0 + # - attr(*, "anyinfnan")= int 0 + # - attr(*, "anynotascii")= int 0 + # - attr(*, "anynotutf8")= int 0 -This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code. + d2 = data.table(id=2:1, v2=1:2) + invisible(d2[id==1L]) + str(attr(attr(d2, "index"), "__id")) + # int [1:2] 2 1 + ``` -```r -d1 = data.table(id=1:2, v1=1:2) -d2 = data.table(id=2:1, v2=1:2) -setindexv(d2, "id") -d1[d2, on="id", verbose=TRUE] -#... -#Starting bmerge ... -#forderReuseSorting: using existing index: __id -#forderReuseSorting: opt=2, took 0.000s -#... -``` + This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code. + + ```r + d1 = data.table(id=1:2, v1=1:2) + d2 = data.table(id=2:1, v2=1:2) + setindexv(d2, "id") + d1[d2, on="id", verbose=TRUE] + #... + #Starting bmerge ... + #forderReuseSorting: using existing index: __id + #forderReuseSorting: opt=2, took 0.000s + #... + ``` -This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. + This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. -24. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. +17. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. -25. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. +18. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. -26. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. +19. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. -27. `dcast()` docs have always required aggregation function to return a single value, and when `fill=NULL`, `dcast` would error if vector with `length!=1` was returned, but silently return an undefined result when fill is not `NULL`. Now `dcast` will additionally warn that this is undefined behavior, when fill is not `NULL`, [#6032](https://github.com/Rdatatable/data.table/issues/6032). In particular, this will warn for `fun.aggregate=identity`, which was observed in several revdeps. We may change this to an error in a future release, so revdeps should fix their code as soon as possible. Thanks to Toby Dylan Hocking for the PR, and Michael Chirico for analysis of GitHub revdeps. +20. `dcast()` docs have always required `fun.aggregate` to return a single value, and when `fill=NULL`, `dcast` would indeed error if vector with `length!=1` was returned, but silently return an undefined result when fill is not `NULL`. Now `dcast` will additionally warn that this is undefined behavior, when fill is not `NULL`, [#6032](https://github.com/Rdatatable/data.table/issues/6032). In particular, this will warn for `fun.aggregate=identity`, which was observed in several revdeps. We may change this to an error in a future release, so revdeps should fix their code as soon as possible. Thanks to Toby Dylan Hocking for the PR, and Michael Chirico for analysis of GitHub revdeps. ## TRANSLATIONS From 3b9dcd21133757a3302176e8e119813c4ba3ae48 Mon Sep 17 00:00:00 2001 From: Rafael Fontenelle Date: Sun, 4 Aug 2024 22:03:11 -0300 Subject: [PATCH 18/21] Fix Leonardo email in the pt_BR po file (#6351) --- po/pt_BR.po | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/po/pt_BR.po b/po/pt_BR.po index 284be4046..af77d9234 100644 --- a/po/pt_BR.po +++ b/po/pt_BR.po @@ -1,7 +1,7 @@ # # Translators: # Italo Santos , 2024 -# Leonardo Fontenelle , 2024 +# Leonardo Fontenelle , 2024 # Rafael Fontenelle , 2024 # msgid "" @@ -9,7 +9,7 @@ msgstr "" "Project-Id-Version: data.table 1.15.99\n" "POT-Creation-Date: 2024-06-23 12:07-0300\n" "PO-Revision-Date: 2024-06-23 16:40-0300\n" -"Last-Translator: Leonardo Fontenelle \n" +"Last-Translator: Leonardo Fontenelle \n" "Language-Team: Brazilian Portuguese\n" "Language: pt_BR\n" "MIME-Version: 1.0\n" From 8e03ca8db18d3a5d10c98b49a1b3457704eb1e9d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 4 Aug 2024 23:47:42 -0700 Subject: [PATCH 19/21] Add a linter for calls to omp_set_num_threads (#6345) Part of #6272 Probably, the step to pre-process the file to strip comments will be repeated by other linters, perhaps we should _just_ work from the preprocessed file for the C linters? Leaving the simple version for now that's more parallel to the existing code. --- .ci/linters/c/omp_set_num_threads_linter.R | 13 +++++++++++++ .dev/CRAN_Release.cmd | 3 --- .github/workflows/code-quality.yaml | 7 ++++--- 3 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 .ci/linters/c/omp_set_num_threads_linter.R diff --git a/.ci/linters/c/omp_set_num_threads_linter.R b/.ci/linters/c/omp_set_num_threads_linter.R new file mode 100644 index 000000000..b3a100b6a --- /dev/null +++ b/.ci/linters/c/omp_set_num_threads_linter.R @@ -0,0 +1,13 @@ + +# Ensure no calls to omp_set_num_threads() [to avoid affecting other packages and base R] +# Only comments referring to it should be in openmp-utils.c +omp_set_num_threads_linter = function(c_file) { + # strip comments, we only care if the function appears in actual code. + processed_lines = system2("gcc", c("-fpreprocessed", "-E", c_file), stdout=TRUE, stderr=FALSE) + idx = grep("omp_set_num_threads", processed_lines, fixed = TRUE) + if (!length(idx)) return() + stop(sprintf( + "In %s, found omp_set_num_threads() usage, which could affect other packages and base R:\n%s", + c_file, paste0(" ", format(idx), ":", processed_lines[idx], collapse = "\n") + )) +} diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index ebf17bcd2..eeddd7271 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -53,9 +53,6 @@ grep -RI --exclude-dir=".git" --exclude="*.md" --exclude="*~" --color='auto' -P # Unicode is now ok. This unicode in tests.Rraw is passing on CRAN. grep -RI --exclude-dir=".git" --exclude="*.md" --exclude="*~" --color='auto' -n "[\]u[0-9]" ./ -# Ensure no calls to omp_set_num_threads() [to avoid affecting other packages and base R] -# Only comments referring to it should be in openmp-utils.c -grep omp_set_num_threads ./src/* # Ensure no calls to omp_set_nested() as i) it's hard to fully honor OMP_THREAD_LIMIT as required by CRAN, and # ii) a simpler non-nested approach is always preferable if possible, as has been the case so far diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index f9066eacd..960315e78 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -40,9 +40,10 @@ jobs: - uses: r-lib/actions/setup-r@v2 - name: Lint run: | - for (f in list.files('.ci/linters/c', full.names=TRUE)) source(f) - for (f in list.files('src', pattern='[.]c$', full.names=TRUE)) { - alloc_linter(f) + linter_env = new.env() + for (f in list.files('.ci/linters/c', full.names=TRUE)) sys.source(f, linter_env) + for (f in list.files('src', pattern='[.][ch]$', full.names=TRUE)) { + for (linter in ls(linter_env)) linter_env[[linter]](f) # TODO(#6272): Incorporate more checks from CRAN_Release } shell: Rscript {0} From bcc015673af7670e11752187b83f9d25fbb08161 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Mon, 5 Aug 2024 10:11:27 -0400 Subject: [PATCH 20/21] fix markdown links --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index f9ccdaa5f..29047f1a1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -141,9 +141,9 @@ 13. Removed a warning about the now totally-obsolete option `datatable.CJ.names`, as discussed in previous releases. -14. Refactored some non-API calls in the package C code, (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users, R core, and especially Luke Tierney for pushing to have a clearer definition of "API" for R and for offering clear documentation and suggested workarounds. Thanks @MichaelChirico and @TysonStanley for implementing changes for this release; more will follow. +14. Refactored some non-API calls in the package C code, [#6180](https://github.com/Rdatatable/data.table/issues/6180). There should be no user-visible change. Thanks to various R users, R core, and especially Luke Tierney for pushing to have a clearer definition of "API" for R and for offering clear documentation and suggested workarounds. Thanks @MichaelChirico and @TysonStanley for implementing changes for this release; more will follow. -15. C code is more unified in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. +15. C code is more unified in how failures to allocate memory (`malloc()`/`calloc()`) are handled, [#1115](https://github.com/Rdatatable/data.table/issues/1115). No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. 16. The internal routine for finding sort order (`forder`) will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. From 377c20268f1537fdb5054b3dca35bd8312990f24 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 5 Aug 2024 13:19:46 -0700 Subject: [PATCH 21/21] More careful PROTECT()/UNPROTECT() in rbindlist for rchk (#6311) * More careful PROTECT()/UNPROTECT() in rbindlist for rchk * use one UNPROTECT * initialize nprotect=2 * smaller diff --- inst/tests/tests.Rraw | 4 ++-- src/rbindlist.c | 27 +++++++++++++++------------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 696697ed9..7d49411c6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3046,8 +3046,8 @@ test(1006, print(as.data.table(M), nrows=10), output="gear NA.*1: 21.0") # rbinding factor with non-factor/character DT1 <- data.table(x=1:5, y=factor("a")) DT2 <- data.table(x=1:5, y=2) -test(1007, rbindlist(list(DT1, DT2)), data.table(x = c(1:5, 1:5), y = factor(c(rep('a', 5), rep('2', 5)), levels = c('a', '2')))) -test(1008, rbindlist(list(DT2, DT1)), data.table(x = c(1:5, 1:5), y = factor(c(rep('2', 5), rep('a', 5))))) +test(1007.1, rbindlist(list(DT1, DT2)), data.table(x = c(1:5, 1:5), y = factor(c(rep('a', 5), rep('2', 5)), levels = c('a', '2')))) +test(1007.2, rbindlist(list(DT2, DT1)), data.table(x = c(1:5, 1:5), y = factor(c(rep('2', 5), rep('a', 5))))) # rbindlist different types DT1 <- data.table(a = 1L, b = 2L) diff --git a/src/rbindlist.c b/src/rbindlist.c index 68c3166e1..40fcd8fe0 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -244,9 +244,9 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor ncol = length(VECTOR_ELT(l, first)); // ncol was increased as if fill=true, so reduce it back given fill=false (fill==false checked above) } - int nprotect = 0; - SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol)); nprotect++; - SEXP ansNames = PROTECT(allocVector(STRSXP, idcol + ncol)); nprotect++; + int nprotect = 2; + SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol)); + SEXP ansNames = PROTECT(allocVector(STRSXP, idcol + ncol)); setAttrib(ans, R_NamesSymbol, ansNames); if (idcol) { SET_STRING_ELT(ansNames, 0, STRING_ELT(idcolArg, 0)); @@ -534,15 +534,18 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871 writeNA(target, ansloc, thisnrow, false); // writeNA is integer64 aware and writes INT64_MIN } else { - bool listprotect = false; - if ((TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target)) { - // do an as.list() on the atomic column; #3528 - thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target))); listprotect = true; + bool listprotect = (TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target); + // do an as.list() on the atomic column; #3528 + if (listprotect) { + thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target))); + // else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909) + const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName); + UNPROTECT(1); // earlier unprotect rbindlist calls with lots of lists #4536 + if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret); + } else { + const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName); + if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret); } - // else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909) - const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName); - if (listprotect) UNPROTECT(1); // earlier unprotect rbindlist calls with lots of lists #4536 - if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret); // e.g. when precision is lost like assigning 3.4 to integer64; test 2007.2 // TODO: but maxType should handle that and this should never warn } @@ -550,6 +553,6 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor } } } - UNPROTECT(nprotect); // ans, coercedForFactor, thisCol + UNPROTECT(nprotect); // ans, ansNames, coercedForFactor? return(ans); }