Merge branch 'master' into i18n-assign

Rdatatable · Sep 26, 2024 · 3cbaa3e · 3cbaa3e
2 parents 7b40653 + e22f774
commit 3cbaa3e
Show file tree

Hide file tree

Showing 16 changed files with 84 additions and 80 deletions.
diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R
@@ -1,12 +1,12 @@
-# #6107 fixed performance across 3 ways to specify a column as Date, test each individually
+# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
+# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
 extra.args.6107 <- c(
   "colClasses=list(Date='date')",
   "colClasses='Date'",
   "select=list(Date='date')")
 extra.test.list <- list()
 for (extra.arg in extra.args.6107){
   this.test <- atime::atime_test(
-    N = 10^seq(1, 7, by=0.25),
     setup = {
       set.seed(1)
       DT = data.table(date=.Date(sample(20000, N, replace=TRUE)))
@@ -40,6 +40,8 @@ for (extra.arg in extra.args.6107){
 # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
 # nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
 test.list <- atime::atime_test_list(
+  # Common N and pkg.edit.fun are defined here, and inherited in all test cases below which do not re-define them.
+  N = as.integer(10^seq(1, 7, by=0.25)),
   # A function to customize R package metadata and source files to facilitate version-specific installation and testing.
   #
   # This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
@@ -96,10 +98,9 @@ test.list <- atime::atime_test_list(
       paste0('useDynLib(', new.Package_))
   },
 
-  # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/4440
+  # Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
   "shallow regression fixed in #4440" = atime::atime_test(
-    N = 10^seq(3, 8),
     setup = {
       set.seed(1L)
       dt <- data.table(a = sample.int(N))
@@ -110,17 +111,16 @@ test.list <- atime::atime_test_list(
     Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
     Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)
 
-  # Test based on: https://github.com/Rdatatable/data.table/issues/5424
-  # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/5463
+  # Test based on https://github.com/Rdatatable/data.table/issues/5424
+  # Performance regression introduced from a commit in https://github.com/Rdatatable/data.table/pull/4491
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5463#issue-1373642456 which is the fix PR.
   "memrecycle regression fixed in #5463" = atime::atime_test(
-    N = 10^seq(3, 8),
     setup = {
-      n <- N/100
+      bigN <- N*100
       set.seed(2L)
       dt <- data.table(
-        g = sample(seq_len(n), N, TRUE),
-        x = runif(N),
+        g = sample(seq_len(N), bigN, TRUE),
+        x = runif(bigN),
         key = "g")
       dt_mod <- copy(dt)
     },
@@ -129,10 +129,9 @@ test.list <- atime::atime_test_list(
     Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
     Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426
-  # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427
+  # Issue reported in https://github.com/Rdatatable/data.table/issues/5426
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5427#issue-1323678063 which is the fix PR.
   "setDT improved in #5427" = atime::atime_test(
-    N = 10^seq(1, 7),
     setup = {
       L <- replicate(N, 1, simplify = FALSE)
       setDT(L)
@@ -144,10 +143,9 @@ test.list <- atime::atime_test_list(
     Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
     Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/4200
-  # To be fixed in: https://github.com/Rdatatable/data.table/pull/4558
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/4200#issuecomment-645980224 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/4558
   "DT[by] fixed in #4558" = atime::atime_test(
-    N = 10^seq(1, 20),
     setup = {
       d <- data.table(
         id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
@@ -160,10 +158,9 @@ test.list <- atime::atime_test_list(
     Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
     Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
 
-  # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/4501
+  # Issue with sorting again when already sorted, as reported in https://github.com/Rdatatable/data.table/issues/4498
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4501#issue-625311918 which is the fix PR.
   "DT[,.SD] improved in #4501" = atime::atime_test(
-    N = 10^seq(1, 10, by=0.5),
     setup = {
       set.seed(1)
       L = as.data.table(as.character(rnorm(N, 1, 0.5)))
@@ -175,10 +172,9 @@ test.list <- atime::atime_test_list(
     Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
     Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/6286
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/6296
-  "DT[by, verbose = TRUE] improved in #6296" = atime::atime_test(
-    N = 10^seq(1, 9),
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/6286#issue-2412141289 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/6296
+  "DT[by,verbose=TRUE] improved in #6296" = atime::atime_test(
     setup = {
       dt = data.table(a = 1:N)
       dt_mod <- copy(dt)
@@ -187,9 +183,9 @@ test.list <- atime::atime_test_list(
     Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
     Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
 
-  # Issue mentioned and fixed in: https://github.com/Rdatatable/data.table/pull/5493
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/5492#issue-1416598382 which is where the issue was reported,
+  # and from https://github.com/Rdatatable/data.table/pull/5493#issue-1416656788 which is the fix PR.
   "transform improved in #5493" = atime::atime_test(
-    N = 10^seq(1, 7),
     setup = {
       df <- data.frame(x = runif(N))
       dt <- as.data.table(df)
@@ -198,9 +194,8 @@ test.list <- atime::atime_test_list(
     Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
     Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
 
-  # Issue mentioned and fixed in: https://github.com/Rdatatable/data.table/pull/5054
+  # Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
   "melt improved in #5054" = atime::atime_test(
-    N = 10^seq(1, 4),
     setup = {
       DT <- as.data.table(as.list(1:N))
       measure.vars <- lapply(1:N, function(i) {

diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml
@@ -1,4 +1,4 @@
-name: Autocomment atime-based performance regression analysis on PRs
+name: atime performance tests
 
 on:
   pull_request:

diff --git a/NEWS.md b/NEWS.md
@@ -63,6 +63,8 @@ rowwiseDT(
     # 15: All values  Total  Total   999   999   NaN    10
     ```
 
+4. `patterns()` in `melt()` combines correctly with user-defined `cols=`, which can be useful to specify a subset of columns to reshape without having to use a regex, for example `patterns("2", cols=c("y1", "y2"))` will only give `y2` even if there are other columns in the input matching `2`, [#6498](https://github.com/Rdatatable/data.table/issues/6498). Thanks to @hongyuanjia for the report, and to @tdhock for the PR.
+
 ## BUG FIXES
 
 1. Using `print.data.table()` with character truncation using `datatable.prettyprint.char` no longer errors with `NA` entries, [#6441](https://github.com/Rdatatable/data.table/issues/6441). Thanks to @r2evans for the bug report, and @joshhwuu for the fix.

diff --git a/R/between.R b/R/between.R
@@ -5,14 +5,14 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)
   if (is.logical(upper)) upper = as.integer(upper)   # typically NA (which is logical type)
   is.px = function(x) inherits(x, "POSIXct")
   is.i64 = function(x) inherits(x, "integer64")   # this is true for nanotime too
-  # POSIX special handling to auto coerce character
+  # POSIXct special handling to auto coerce character
   if (is.px(x) && (is.character(lower) || is.character(upper))) {
     tz = attr(x, "tzone", exact=TRUE)
     if (is.null(tz)) tz = ""
     if (is.character(lower)) lower = tryCatch(as.POSIXct(lower, tz=tz), error=function(e)stopf(
-      "'between' function the 'x' argument is a POSIX class while '%s' was not, coercion to POSIX failed with: %s", 'lower', e$message))
+      "The 'x' argument of the 'between' function is POSIXct while '%s' was not, coercion to POSIXct failed with: %s", 'lower', conditionMessage(e)))
     if (is.character(upper)) upper = tryCatch(as.POSIXct(upper, tz=tz), error=function(e)stopf(
-      "'between' function the 'x' argument is a POSIX class while '%s' was not, coercion to POSIX failed with: %s", 'upper', e$message))
+      "The 'x' argument of the 'between' function is POSIXct while '%s' was not, coercion to POSIXct failed with: %s", 'upper', conditionMessage(e)))
     stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal
   }
   # POSIX check timezone match

diff --git a/R/data.table.R b/R/data.table.R
@@ -292,7 +292,7 @@ replace_dot_alias = function(e) {
         root = root_name(jsub)
       } else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") {
         #2142 -- j can be {} and have length 1
-        stopf("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}")
+        stopf("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}]")
       }
     }
     if (root=="eval" && !any(all.vars(jsub[[2L]]) %chin% names_x)) {
@@ -409,7 +409,7 @@ replace_dot_alias = function(e) {
           "'%s' is not found in calling scope and it is not a column name either",
           as.character(isub)
         ) else gettextf(
-          "'%s' is not found in calling scope, but it is a column of type %s. If you wish to select rows where that column contains TRUE, or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE} is particularly clear and is optimized",
+          "'%s' is not found in calling scope, but it is a column of type %s. If you wish to select rows where that column contains TRUE, or perhaps that column contains row numbers of itself to select, try DT[(col)], DT[DT$col], or DT[col==TRUE] is particularly clear and is optimized",
           as.character(isub), typeof(col)
         )
         stopf("%s. When the first argument inside DT[...] is a single symbol (e.g. DT[var]), data.table looks for var in calling scope.", msg)
@@ -1040,7 +1040,7 @@ replace_dot_alias = function(e) {
           if (anyNA(.SDcols))
             stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols))))
           if (is.logical(.SDcols)) {
-            if (length(.SDcols)!=length(x)) stopf(".SDcols is a logical vector length %d but there are %d columns", length(.SDcols), length(x))
+            if (length(.SDcols)!=length(x)) stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(x))
             ansvals = which_(.SDcols, !negate_sdcols)
             ansvars = sdvars = names_x[ansvals]
           } else if (is.numeric(.SDcols)) {

diff --git a/R/duplicated.R b/R/duplicated.R
@@ -102,7 +102,7 @@ anyDuplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=s
 uniqueN = function(x, by = if (is.list(x)) seq_along(x) else NULL, na.rm=FALSE) { # na.rm, #1455
   if (is.null(x)) return(0L)
   if (!is.atomic(x) && !is.data.frame(x))
-    stopf("x must be an atomic vector or data.frames/data.tables")
+    stopf("x must be an atomic vector or a data.frame/data.table")
   if (is.atomic(x)) {
     if (is.logical(x)) return(.Call(CuniqueNlogical, x, na.rm=na.rm))
     x = as_list(x)