Rdatatable · MichaelChirico · Jul 29, 2024 · May 27, 2020 · May 27, 2020 · May 27, 2020
@@ -119,6 +119,25 @@ test.list <- atime::atime_test_list(
       data.table:::setDT(L)
     },
     Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
-    Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
-)
+    Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
+
+  # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498
+  # Fixed in: https://github.com/Rdatatable/data.table/pull/4501
+  "DT[,.SD] improved in #4501" = atime::atime_test(
+    N = 10^seq(1, 10, by=0.5),
+    setup = {
+      set.seed(1)
+      L = as.data.table(as.character(rnorm(N, 1, 0.5)))
+      setkey(L, V1)
+    },
+    ## New DT can safely retain key.
+    expr = {
+      data.table:::`[.data.table`(L, , .SD)
+    },
+    Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
+    Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
+    Slower = "cacdc92df71b777369a217b6c902c687cf35a70d" # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
+  ),
+
+  NULL)
 # nolint end: undesirable_operator_linter.
@@ -82,6 +82,8 @@
 
 15. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix.
 
+16. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report.
+
 ## NOTES
 
 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1.

@@ -1427,8 +1427,27 @@ replace_dot_alias = function(e) {
       # should set the parent class only when jval is a plain data.table #4324
       if (identical(class(jval), c('data.table', 'data.frame')))
         setattr(jval, 'class', class(x)) # fix for #64
-      if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x)))
+      # can jval be sorted by the same key as x? improved for #4498
+      get_shared_keys = function(jsub, jvnames, sdvars, key) {
+        if (is.null(key)) return(NULL)
+        if (!((SD_only <- jsub == quote(.SD)) || jsub %iscall% "list")) return(NULL)
+        if (SD_only)
+          jvnames = jnames = sdvars
+        else
+          jnames = as.character(Filter(is.name, jsub)[-1L])
+        key_idx = chmatch(key, jnames)
+        missing_keys = which(is.na(key_idx))
+        if (length(missing_keys) && missing_keys[1L] == 1L) return(NULL)
+        if (!length(missing_keys)) return(jvnames[key_idx])
+        jvnames[head(key_idx, missing_keys[1L] - 1L)]
+      }
+      shared_keys = get_shared_keys(jsub, jvnames, sdvars = sdvars, key(x))
+      if (is.null(irows) && !is.null(shared_keys)) {
+        setattr(jval, 'sorted', shared_keys)
+        # potentially inefficient backup -- check if jval is sorted by key(x)
+      } else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) {
         setattr(jval, 'sorted', key(x))
+      }
       if (any(vapply_1b(jval, is.null))) stopf("Internal error: j has created a data.table result containing a NULL column") # nocov
     }
     return(jval)

@@ -3892,7 +3892,7 @@ test(1118, dt[, lapply(.SD, function(y) weighted.mean(y, b2, na.rm=TRUE)), by=x]
 
 # a(nother) test of #295
 DT <- data.table(x=5:1, y=1:5, key="y")
-test(1119, is.null(key(DT[, list(z = y, y = 1/y)])))
+test(1119, key(DT[, list(z = y, y = 1/y)]), 'z')
 
 ## various ordered factor rbind tests
 DT1 = data.table(ordered('a', levels = c('a','b','c')))
@@ -18718,6 +18718,12 @@ DT = data.table(a = rep(1:3, 2))
 # NB: recall we can't use non-ASCII symbols here. the text is a-<n-tilde>-o (year in Spanish)
 setnames(DT, "a", "a\U00F1o")
 test(2266, eval(parse(text="DT[ , .N, a\U00F1o]$N[1L]")), 2L)
+# sub-key can also be retained in plain query, part of #4498
+DT = data.table(id = rep(1:10, 2L), grp = rep(1:2, each=10L), V = 1:20/13, key=c('id', 'grp'))
+test(2266.1, key(DT[ , .(id)]), 'id')
+test(2266.2, key(DT[ , .(grp)]), NULL)
+## renaming also caught
+test(2266.3, key(DT[ , .(newid = id, newgrp = grp)]), c('newid', 'newgrp'))
 
 # all.equal failed to dispatch to methods of columns, #4543
 DT1 = data.table(t = .POSIXct(1590973200, tz='UTC'))