diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index fddfb1347..ecd5d33d5 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -119,6 +119,25 @@ test.list <- atime::atime_test_list( data.table:::setDT(L) }, Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) - Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) -) + Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) + + # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498 + # Fixed in: https://github.com/Rdatatable/data.table/pull/4501 + "DT[,.SD] improved in #4501" = atime::atime_test( + N = 10^seq(1, 10, by=0.5), + setup = { + set.seed(1) + L = as.data.table(as.character(rnorm(N, 1, 0.5))) + setkey(L, V1) + }, + ## New DT can safely retain key. + expr = { + data.table:::`[.data.table`(L, , .SD) + }, + Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue + Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue + Slower = "cacdc92df71b777369a217b6c902c687cf35a70d" # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue + ), + + NULL) # nolint end: undesirable_operator_linter. diff --git a/NEWS.md b/NEWS.md index a54583a3d..fd13331a2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -82,6 +82,8 @@ 15. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. +16. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/data.table.R b/R/data.table.R index cb32836b0..9bc04e3ca 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1427,8 +1427,27 @@ replace_dot_alias = function(e) { # should set the parent class only when jval is a plain data.table #4324 if (identical(class(jval), c('data.table', 'data.frame'))) setattr(jval, 'class', class(x)) # fix for #64 - if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) + # can jval be sorted by the same key as x? improved for #4498 + get_shared_keys = function(jsub, jvnames, sdvars, key) { + if (is.null(key)) return(NULL) + if (!((SD_only <- jsub == quote(.SD)) || jsub %iscall% "list")) return(NULL) + if (SD_only) + jvnames = jnames = sdvars + else + jnames = as.character(Filter(is.name, jsub)[-1L]) + key_idx = chmatch(key, jnames) + missing_keys = which(is.na(key_idx)) + if (length(missing_keys) && missing_keys[1L] == 1L) return(NULL) + if (!length(missing_keys)) return(jvnames[key_idx]) + jvnames[head(key_idx, missing_keys[1L] - 1L)] + } + shared_keys = get_shared_keys(jsub, jvnames, sdvars = sdvars, key(x)) + if (is.null(irows) && !is.null(shared_keys)) { + setattr(jval, 'sorted', shared_keys) + # potentially inefficient backup -- check if jval is sorted by key(x) + } else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) { setattr(jval, 'sorted', key(x)) + } if (any(vapply_1b(jval, is.null))) stopf("Internal error: j has created a data.table result containing a NULL column") # nocov } return(jval) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a614a9e2f..dfec93e89 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3892,7 +3892,7 @@ test(1118, dt[, lapply(.SD, function(y) weighted.mean(y, b2, na.rm=TRUE)), by=x] # a(nother) test of #295 DT <- data.table(x=5:1, y=1:5, key="y") -test(1119, is.null(key(DT[, list(z = y, y = 1/y)]))) +test(1119, key(DT[, list(z = y, y = 1/y)]), 'z') ## various ordered factor rbind tests DT1 = data.table(ordered('a', levels = c('a','b','c'))) @@ -18718,6 +18718,12 @@ DT = data.table(a = rep(1:3, 2)) # NB: recall we can't use non-ASCII symbols here. the text is a--o (year in Spanish) setnames(DT, "a", "a\U00F1o") test(2266, eval(parse(text="DT[ , .N, a\U00F1o]$N[1L]")), 2L) +# sub-key can also be retained in plain query, part of #4498 +DT = data.table(id = rep(1:10, 2L), grp = rep(1:2, each=10L), V = 1:20/13, key=c('id', 'grp')) +test(2266.1, key(DT[ , .(id)]), 'id') +test(2266.2, key(DT[ , .(grp)]), NULL) +## renaming also caught +test(2266.3, key(DT[ , .(newid = id, newgrp = grp)]), c('newid', 'newgrp')) # all.equal failed to dispatch to methods of columns, #4543 DT1 = data.table(t = .POSIXct(1590973200, tz='UTC'))