From 5ea35ee535a584e20267826bb26fa8336d8274b9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 17 Jul 2024 00:02:58 +0000 Subject: [PATCH 01/41] More variable-length arrays in forder --- src/forder.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/forder.c b/src/forder.c index 70b7070e3..bf3db2dcb 100644 --- a/src/forder.c +++ b/src/forder.c @@ -845,7 +845,9 @@ void radix_r(const int from, const int to, const int radix) { #endif uint8_t *restrict my_key = key[radix]+from; // safe to write as we don't use this radix again - uint8_t o[my_n]; + uint8_t *o = (uint8_t *)malloc(my_n * sizeof(uint8_t)); + if (!o) + STOP(_("Failed to allocate %d bytes for '%s'."), (int)(my_n * sizeof(uint8_t)), "o"); // if last key (i.e. radix+1==nradix) there are no more keys to reorder so we could reorder osub by reference directly and save allocating and populating o just // to use it once. However, o's type is uint8_t so many moves within this max-256 vector should be faster than many moves in osub (4 byte or 8 byte ints) [1 byte // type is always aligned] @@ -912,7 +914,11 @@ void radix_r(const int from, const int to, const int radix) { } if (!skip) { // reorder osub and each remaining ksub - int TMP[my_n]; // on stack fine since my_n is very small (<=256) + int *TMP = malloc(my_n * sizeof(int)); + if (!TMP) { + free(o); + STOP(_("Failed to allocate %d bytes for '%s'."), my_n * sizeof(int), "TMP"); + } const int *restrict osub = anso+from; for (int i=0; i Date: Wed, 17 Jul 2024 00:16:56 +0000 Subject: [PATCH 02/41] Last one in fwrite --- src/fwrite.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index dacfc7e11..cd9ef98ee 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -848,7 +848,9 @@ void fwriteMain(fwriteMainArgs args) int failed_write = 0; // same. 
could use +ve and -ve in the same code but separate it out to trace Solaris problem, #3931 #ifndef NOZLIB - z_stream thread_streams[nth]; + z_stream *thread_streams = (z_stream *)malloc(nth * sizeof(z_stream)); + if (!thread_streams) + STOP(_("Failed to allocated %d bytes for '%s'."), (int)(nth * sizeof(z_stream)), "thread_streams"); // VLA on stack should be fine for nth structs; in zlib v1.2.11 sizeof(struct)==112 on 64bit // not declared inside the parallel region because solaris appears to move the struct in // memory when the #pragma omp for is entered, which causes zlib's internal self reference @@ -988,6 +990,7 @@ void fwriteMain(fwriteMainArgs args) } free(buffPool); #ifndef NOZLIB + free(thread_streams); free(zbuffPool); #endif From 518b68ee86aa4f8bdd28a7bd8b726f482239fbcc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 17 Jul 2024 00:20:47 +0000 Subject: [PATCH 03/41] Forgot to (int) cast --- src/forder.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/forder.c b/src/forder.c index bf3db2dcb..70744d091 100644 --- a/src/forder.c +++ b/src/forder.c @@ -917,7 +917,7 @@ void radix_r(const int from, const int to, const int radix) { int *TMP = malloc(my_n * sizeof(int)); if (!TMP) { free(o); - STOP(_("Failed to allocate %d bytes for '%s'."), my_n * sizeof(int), "TMP"); + STOP(_("Failed to allocate %d bytes for '%s'."), (int)(my_n * sizeof(int)), "TMP"); } const int *restrict osub = anso+from; for (int i=0; i Date: Wed, 17 Jul 2024 00:24:09 +0000 Subject: [PATCH 04/41] Revert 3 last commits (to master by mistake) --- src/forder.c | 28 +++++----------------------- src/fwrite.c | 5 +---- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/src/forder.c b/src/forder.c index 70744d091..70b7070e3 100644 --- a/src/forder.c +++ b/src/forder.c @@ -845,9 +845,7 @@ void radix_r(const int from, const int to, const int radix) { #endif uint8_t *restrict my_key = key[radix]+from; // safe to write as we don't use this radix 
again - uint8_t *o = (uint8_t *)malloc(my_n * sizeof(uint8_t)); - if (!o) - STOP(_("Failed to allocate %d bytes for '%s'."), (int)(my_n * sizeof(uint8_t)), "o"); + uint8_t o[my_n]; // if last key (i.e. radix+1==nradix) there are no more keys to reorder so we could reorder osub by reference directly and save allocating and populating o just // to use it once. However, o's type is uint8_t so many moves within this max-256 vector should be faster than many moves in osub (4 byte or 8 byte ints) [1 byte // type is always aligned] @@ -914,11 +912,7 @@ void radix_r(const int from, const int to, const int radix) { } if (!skip) { // reorder osub and each remaining ksub - int *TMP = malloc(my_n * sizeof(int)); - if (!TMP) { - free(o); - STOP(_("Failed to allocate %d bytes for '%s'."), (int)(my_n * sizeof(int)), "TMP"); - } + int TMP[my_n]; // on stack fine since my_n is very small (<=256) const int *restrict osub = anso+from; for (int i=0; i Date: Wed, 17 Jul 2024 02:50:38 +0200 Subject: [PATCH 05/41] MacOS: fix linker flags for SHLIB feature test (#6283) Co-authored-by: Michael Chirico --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index 04fd8104f..853c0d5fa 100755 --- a/configure +++ b/configure @@ -111,7 +111,7 @@ detect_openmp () { # https://mac.r-project.org/openmp printf "%s" "* checking if R installation supports OpenMP with \"-Xclang -fopenmp\" ... 
" - if CPPFLAGS="${CPPFLAGS} -Xclang -fopenmp" LDFLAGS="${LDFLAGS} -lomp" "${R_HOME}/bin/R" CMD SHLIB test-omp.c >> config.log 2>&1; then + if CPPFLAGS="${CPPFLAGS} -Xclang -fopenmp" PKG_LIBS="-lomp" "${R_HOME}/bin/R" CMD SHLIB test-omp.c >> config.log 2>&1; then echo "yes" export PKG_CFLAGS="${PKG_CFLAGS} -Xclang -fopenmp" export PKG_LIBS="${PKG_LIBS} -lomp" From 517045206de1cb892f989f38b9be27abf2c321e5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 17 Jul 2024 07:15:46 -0700 Subject: [PATCH 06/41] Consistently set R_LIBS_USER across steps (#6292) * Need to re-install remotes * Use R_LIBS_USER instead --- .github/workflows/R-CMD-check-occasional.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml index 9efe7bbf4..fe7f3a163 100644 --- a/.github/workflows/R-CMD-check-occasional.yaml +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -1,6 +1,6 @@ on: schedule: - - cron: '17 13 17 * *' # 17th of month at 13:17 UTC + - cron: '17 13 18 * *' # 18th of month at 13:17 UTC # A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release name: R-CMD-check-occasional @@ -83,11 +83,15 @@ jobs: run: brew install gdal proj - name: Install remotes + env: + R_LIBS_USER: /home/runner/work/r-lib run: install.packages("remotes") shell: Rscript {0} - name: Install system dependencies if: runner.os == 'Linux' + env: + R_LIBS_USER: /home/runner/work/r-lib run: | while read -r cmd do @@ -103,6 +107,7 @@ jobs: R_LIBS_USER: /home/runner/work/r-lib run: | options(crayon.enabled = TRUE) + install.packages("remotes") # different R_LIBS_USER now... 
remotes::install_deps(dependencies=TRUE, force=TRUE) # we define this in data.table namespace, but it appears to be exec From ff808ae4caaab48724c0cecbef7ea26ee0c0ea9e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 17 Jul 2024 07:22:59 -0700 Subject: [PATCH 07/41] Handle mean(a,b) under R's startsWith() (#6291) * R's startsWith() doesn't accept non-character input -> regression * Share logic with .gforce_ok --- R/data.table.R | 16 ++++++++++------ inst/tests/tests.Rraw | 4 ++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 133c987fe..3b27b8d7c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1987,11 +1987,11 @@ DT = function(x, ...) { #4872 .optmean = function(expr) { # called by optimization of j inside [.data.table only. Outside for a small speed advantage. if (length(expr)==2L) # no parameters passed to mean, so defaults of trim=0 and na.rm=FALSE - return(call(".External",quote(Cfastmean),expr[[2L]], FALSE)) + return(call(".External", quote(Cfastmean), expr[[2L]], FALSE)) # return(call(".Internal",expr)) # slightly faster than .External, but R now blocks .Internal in coerce.c from apx Sep 2012 - if (length(expr)==3L && startsWith(names(expr)[3L], "na")) # one parameter passed to mean() - return(call(".External",quote(Cfastmean),expr[[2L]], expr[[3L]])) # faster than .Call - assign("nomeanopt",TRUE,parent.frame()) + if (length(expr)==3L && .arg_is_narm(expr)) + return(call(".External", quote(Cfastmean), expr[[2L]], expr[[3L]])) # faster than .Call + assign("nomeanopt", TRUE, parent.frame()) expr # e.g. trim is not optimized, just na.rm } @@ -3072,13 +3072,17 @@ is_constantish = function(q, check_singleton=FALSE) { if (q1[[3L]] %chin% gdtfuns) return(q1[[3L]]) NULL } + +# Check for na.rm= in expr in the expected slot; allows partial matching and +# is robust to unnamed expr. Note that NA names are not possible here. 
+.arg_is_narm <- function(expr, which=3L) !is.null(nm <- names(expr)[which]) && startsWith(nm, "na") + .gforce_ok = function(q, x) { if (is.N(q)) return(TRUE) # For #334 q1 = .get_gcall(q) if (is.null(q1)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(x) && q2 != ".I") return(FALSE) # 875 - if (length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na") && is_constantish(q[[3L]]))) return(TRUE) - # ^^ base::startWith errors on NULL unfortunately + if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]))) return(TRUE) switch(as.character(q1), "shift" = .gshift_ok(q), "weighted.mean" = .gweighted.mean_ok(q, x), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4f388bf78..64a012d6b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18739,3 +18739,7 @@ test(2268, rbindlist(y, fill=TRUE), rbindlist(x, fill=TRUE)[rep(1:5, N)]) dt = data.table(x=as.POSIXct(c(NA, NA))) test(2269.1, fread("x\n \n \n", colClasses="POSIXct"), dt) test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) + +# Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL +DT = data.table(a = 1, b = 2) +test(2270, options=c(datatable.optimize=1L), DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") From cd497408bb4dc6650d871b5076e738420ff431d7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 17 Jul 2024 08:29:36 -0700 Subject: [PATCH 08/41] Throw custom error class for calling := to avoid depending on error text (#6294) --- NEWS.md | 2 ++ R/data.table.R | 14 ++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index bfe423e2e..f23859487 100644 --- a/NEWS.md +++ b/NEWS.md @@ -126,6 +126,8 @@ 2. data.table is now translated into Brazilian Portuguese (`pt_BR`) as well as Mandarin (`zh_CN`). 
Thanks to the [new translation team](https://github.com/orgs/Rdatatable/teams/brazil) consisting initially of @rffontenelle, @leofontenelle, and @italo-07. The team is open if you'd also like to join and support maintenance of these translations. +3. A more helpful error message for using `:=` inside the first argument (`i`) of `[.data.table` is now available in translation, [#6293](https://github.com/Rdatatable/data.table/issues/6293). Previously, the code to display this assumed an earlier message was printed in English. The solution is for calling `:=` directly (i.e., outside the second argument `j` of `[.data.table`) to throw an error of class `dt_invalid_let_error`. Thanks to Spanish translator @rikivillalba for spotting the issue and @MichaelChirico for the fix. + # data.table [v1.15.4](https://github.com/Rdatatable/data.table/milestone/33) (27 March 2024) ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index 3b27b8d7c..3f8ea8a1e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -392,13 +392,11 @@ replace_dot_alias = function(e) { } else if (!is.name(isub)) { ienv = new.env(parent=parent.frame()) - if (getOption("datatable.optimize")>=1L) assign("order", forder, ienv) - i = tryCatch(eval(.massagei(isub), x, ienv), error=function(e) { - if (grepl(":=.*defined for use in j.*only", e$message)) - stopf("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number.") - else - .checkTypos(e, names_x) - }) + if (getOption("datatable.optimize") >= 1L) assign("order", forder, ienv) + i = tryCatch(eval(.massagei(isub), x, ienv), + dt_invalid_let_error = function(e) stopf("Operator := detected in i, the first argument inside DT[...], but is only valid in the second argument, j. 
Most often, this happens when forgetting the first comma (e.g. DT[newvar := 5] instead of DT[ , new_var := 5]). Please double-check the syntax. Run traceback(), and debugger() to get a line number."), + error = function(e) .checkTypos(e, names_x) + ) } else { # isub is a single symbol name such as B in DT[B] i = try(eval(isub, parent.frame(), parent.frame()), silent=TRUE) @@ -2772,7 +2770,7 @@ address = function(x) .Call(Caddress, eval(substitute(x), parent.frame())) ":=" = function(...) { # this error is detected when eval'ing isub and replaced with a more helpful one when using := in i due to forgetting a comma, #4227 - stopf('Check that is.data.table(DT) == TRUE. Otherwise, :=, `:=`(...) and let(...) are defined for use in j, once only and in particular ways. See help(":=").') + stopf('Check that is.data.table(DT) == TRUE. Otherwise, :=, `:=`(...) and let(...) are defined for use in j, once only and in particular ways. See help(":=").', class="dt_invalid_let_error") } # TODO(#6197): Export these. From a374da766e5434d2a9dc5b922da165f7d77af363 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 17 Jul 2024 17:46:32 -0700 Subject: [PATCH 09/41] Newline in verbose output (#6297) * Newline in verbose output * regression test * missing '.' --- R/bmerge.R | 2 +- inst/tests/tests.Rraw | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/R/bmerge.R b/R/bmerge.R index f32a19f42..881b8528e 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -102,7 +102,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (!isReallyReal(i[[ic]])) { # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys # we've always coerced to int and returned int, for convenience. 
- if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s", iname, xname) + if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", iname, xname) val = as.integer(i[[ic]]) if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679 set(i, j=ic, value=val) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 64a012d6b..020a4db3e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18743,3 +18743,8 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) test(2270, options=c(datatable.optimize=1L), DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") + +# Missing newline in verbose output -> harder to read +DT1 = data.table(a=1:2) +DT2 = data.table(a=c(1, 1, 2, 2), b=1:4) +test(2271, options=c(datatable.verbose=TRUE), copy(DT1)[DT2, on='a', v := 4], copy(DT1)[, v := 4], output="x.a.\nAssigning") From 0030b15d1ebc9242e30159351388ef3cc114e344 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:29:55 +0200 Subject: [PATCH 10/41] add orcid for Ben (#6300) --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7035cfcad..b78495edc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ Authors@R: c( person("Jan","Gorecki", role="aut"), person("Michael","Chirico", role="aut", comment = c(ORCID="0000-0003-0787-087X")), person("Toby","Hocking", role="aut", comment = c(ORCID="0000-0002-3146-0865")), - person("Benjamin","Schwendinger",role="aut"), + person("Benjamin","Schwendinger",role="aut", comment = c(ORCID="0000-0003-3315-8114")), person("Pasha","Stetsenko", role="ctb"), 
person("Tom","Short", role="ctb"), person("Steve","Lianoglou", role="ctb"), From 069eed93a27817303a109c1f391b4bc9319bb6f7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 19 Jul 2024 12:43:37 -0700 Subject: [PATCH 11/41] Explicitly request -Wvla to prevent backsliding (#6274) * Explicitly request -Wvla to prevent backsliding * More variable-length arrays in forder * Last one in fwrite * Forgot to (int) cast --- .gitlab-ci.yml | 20 ++++++++++---------- src/forder.c | 28 +++++++++++++++++++++++----- src/fwrite.c | 5 ++++- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ace646734..8a3fa4d71 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -117,8 +117,8 @@ test-lin-rel: OPENBLAS_MAIN_FREE: "1" script: - *install-deps - - echo 'CFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - echo 'CFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O3 -flto=auto -fno-common -fopenmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - R CMD check $(ls -1t data.table_*.tar.gz | head -n 1) - (! 
grep "warning:" data.table.Rcheck/00install.out) @@ -132,8 +132,8 @@ test-lin-rel-vanilla: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc script: - - echo 'CFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - echo 'CFLAGS=-g -O0 -fno-openmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - R CMD check --no-manual --ignore-vignettes $(ls -1t data.table_*.tar.gz | head -n 1) ## R-release on Linux @@ -149,8 +149,8 @@ test-lin-rel-cran: _R_CHECK_PKG_SIZES_THRESHOLD_: "7" ## MB 'checking installed package size' NOTE script: - *install-deps - - echo 'CFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - echo 'CFLAGS=-g -O2 -fopenmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O2 -fopenmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - >- Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: OK")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: OK"), " but ", shQuote(l)) else q("no")' @@ -168,8 +168,8 @@ test-lin-dev-gcc-strict-cran: _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" ## detects S3 method lookup found on search path #4777 _R_S3_METHOD_LOOKUP_REPORT_SEARCH_PATH_USES_: "TRUE" script: - - echo 'CFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > 
~/.R/Makevars - - echo 'CXXFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - echo 'CFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O2 -flto=auto -fno-common -fopenmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - *install-deps - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - (! grep "warning:" data.table.Rcheck/00install.out) @@ -189,8 +189,8 @@ test-lin-dev-clang-cran: _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" _R_S3_METHOD_LOOKUP_REPORT_SEARCH_PATH_USES_: "TRUE" script: - - echo 'CFLAGS=-g -O2 -fno-common -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - - echo 'CXXFLAGS=-g -O2 -fno-common -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars + - echo 'CFLAGS=-g -O2 -fno-common -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars + - echo 'CXXFLAGS=-g -O2 -fno-common -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars - *install-deps - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1) - (! grep "warning:" data.table.Rcheck/00install.out) diff --git a/src/forder.c b/src/forder.c index 70b7070e3..70744d091 100644 --- a/src/forder.c +++ b/src/forder.c @@ -845,7 +845,9 @@ void radix_r(const int from, const int to, const int radix) { #endif uint8_t *restrict my_key = key[radix]+from; // safe to write as we don't use this radix again - uint8_t o[my_n]; + uint8_t *o = (uint8_t *)malloc(my_n * sizeof(uint8_t)); + if (!o) + STOP(_("Failed to allocate %d bytes for '%s'."), (int)(my_n * sizeof(uint8_t)), "o"); // if last key (i.e. radix+1==nradix) there are no more keys to reorder so we could reorder osub by reference directly and save allocating and populating o just // to use it once. 
However, o's type is uint8_t so many moves within this max-256 vector should be faster than many moves in osub (4 byte or 8 byte ints) [1 byte // type is always aligned] @@ -912,7 +914,11 @@ void radix_r(const int from, const int to, const int radix) { } if (!skip) { // reorder osub and each remaining ksub - int TMP[my_n]; // on stack fine since my_n is very small (<=256) + int *TMP = malloc(my_n * sizeof(int)); + if (!TMP) { + free(o); + STOP(_("Failed to allocate %d bytes for '%s'."), (int)(my_n * sizeof(int)), "TMP"); + } const int *restrict osub = anso+from; for (int i=0; i Date: Sun, 21 Jul 2024 01:23:09 +0530 Subject: [PATCH 12/41] Update Error Message to Explain Namespace Restrictions for `:=` and `let` in `data.table` (#6302) * udated the error message * suggested wording --------- Co-authored-by: nitish jha Co-authored-by: Michael Chirico --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 3f8ea8a1e..072da0cdc 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2770,7 +2770,7 @@ address = function(x) .Call(Caddress, eval(substitute(x), parent.frame())) ":=" = function(...) { # this error is detected when eval'ing isub and replaced with a more helpful one when using := in i due to forgetting a comma, #4227 - stopf('Check that is.data.table(DT) == TRUE. Otherwise, :=, `:=`(...) and let(...) are defined for use in j, once only and in particular ways. See help(":=").', class="dt_invalid_let_error") + stopf('Check that is.data.table(DT) == TRUE. Otherwise, :=, `:=`(...) and let(...) are defined for use in j, once only and in particular ways. Note that namespace-qualification like data.table::`:=`(...) is not supported. See help(":=").', class="dt_invalid_let_error") } # TODO(#6197): Export these. 
From eecd4b1488d8b28ae20d6019ca135fcf0265eefa Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 21 Jul 2024 16:46:15 +0200 Subject: [PATCH 13/41] add zed editor to ignores (#6305) --- .Rbuildignore | 1 + .gitignore | 3 +++ 2 files changed, 4 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 5cfaa1ecb..6e6b8b401 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,6 +16,7 @@ ^\.devcontainer$ ^\.graphics$ ^\.github$ +^\.zed$ ^\.gitlab-ci\.yml$ diff --git a/.gitignore b/.gitignore index 9dd72b5c0..572d7352c 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,9 @@ config.log .Rproj.user data.table.Rproj +# zed editor +.zed + # produced vignettes vignettes/*.html vignettes/*.pdf From f952062030e6657bef83de2748c65120990031c1 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 10:20:51 -0700 Subject: [PATCH 14/41] Change to use SHALLOW_DUPLICATE_ATTRIB() since that's API since R3.3.0 (#6264) * More conservative PROTECT() around switch to API usage in asS4 * Use SHALLOW_DUPLICATE_ATTRIB --- src/assign.c | 5 +---- src/dogroups.c | 9 +++++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/assign.c b/src/assign.c index 6eb0866e4..d937601eb 100644 --- a/src/assign.c +++ b/src/assign.c @@ -151,10 +151,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) // where n is set to truelength (i.e. a shallow copy only with no size change) int protecti=0; SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here? - SET_ATTRIB(newdt, shallow_duplicate(ATTRIB(dt))); - SET_OBJECT(newdt, OBJECT(dt)); - if (isS4(dt)) newdt = asS4(newdt, TRUE, 1); // To support S4 objects that include data.table - //SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // SHALLOW_DUPLICATE_ATTRIB would be a bit neater but is only available from R 3.3.0 + SHALLOW_DUPLICATE_ATTRIB(newdt, dt); // TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It // also increases truelength. 
Perhaps make that distinction, then, and split out, but marked diff --git a/src/dogroups.c b/src/dogroups.c index da9a7da17..e03ad84df 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -482,8 +482,13 @@ SEXP keepattr(SEXP to, SEXP from) // Same as R_copyDFattr in src/main/attrib.c, but that seems not exposed in R's api // Only difference is that we reverse from and to in the prototype, for easier calling above SET_ATTRIB(to, ATTRIB(from)); - if (isS4(from)) to = asS4(to, TRUE, 1); - SET_OBJECT(to, OBJECT(from)); + if (isS4(from)) { + to = PROTECT(asS4(to, TRUE, 1)); + SET_OBJECT(to, OBJECT(from)); + UNPROTECT(1); + } else { + SET_OBJECT(to, OBJECT(from)); + } return to; } From b303cd4cea2c31378ee22cea053cd9f60a2fd4e8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 10:23:05 -0700 Subject: [PATCH 15/41] PROTECT() names vector (#6265) --- src/rbindlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rbindlist.c b/src/rbindlist.c index 24a785bde..e206d4ce4 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -243,8 +243,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int nprotect = 0; SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol)); nprotect++; - SEXP ansNames; - setAttrib(ans, R_NamesSymbol, ansNames=allocVector(STRSXP, idcol + ncol)); + SEXP ansNames = PROTECT(allocVector(STRSXP, idcol + ncol)); nprotect++; + setAttrib(ans, R_NamesSymbol, ansNames); if (idcol) { SET_STRING_ELT(ansNames, 0, STRING_ELT(idcolArg, 0)); SEXP idval, listNames=getAttrib(l, R_NamesSymbol); From b754e8035330420b39d4eb92e32dc9ed89ae6f80 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 13:33:10 -0700 Subject: [PATCH 16/41] More careful memory management in shift() (#6258) * More careful PROTECT() usage in shift * ws * actually merits a test+NEWS * Link an issue --- NEWS.md | 2 +- inst/tests/tests.Rraw | 4 ++++ src/shift.c | 9 +++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff 
--git a/NEWS.md b/NEWS.md index f23859487..5898724d5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -66,7 +66,7 @@ 12. data.table's `all.equal()` method now dispatches to each column's own `all.equal()` method as appropriate, [#4543](https://github.com/Rdatatable/data.table/issues/4543). Thanks @MichaelChirico for the report and fix. Note that this had two noteworthy changes to data.table's own test suite that might affect you: (1) comparisons of POSIXct columns compare absolute, not relative differences, meaning that millisecond-scale differences might trigger a "not equal" report that was hidden before; and (2) comparisons of integer64 columns could be totally wrong since they were being compared on the basis of their representation as doubles, not long integers. The former might be a matter of preference requiring you to specify a different `tolerance=`, while the latter was clearly a bug. -13. `rbindlist` could lead to a protection stack overflow when applied to a list containing many nested lists exceeding the pointer protection stack size, [#4536](https://github.com/Rdatatable/data.table/issues/4536). Thanks to @ProfFancyPants for reporting, and Benjamin Schwendinger for the fix. +13. `rbindlist` and `shift` could lead to a protection stack overflow when applied to a list containing many nested lists exceeding the pointer protection stack size, [#4536](https://github.com/Rdatatable/data.table/issues/4536). Thanks to @ProfFancyPants for reporting, and Benjamin Schwendinger (`rbindlist`) and @MichaelChirico (`shift`) for the fix. 14. `fread(x, colClasses="POSIXct")` now also works for columns containing only NA values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and Benjamin Schwendinger for the fix. 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 020a4db3e..a407d7e94 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18748,3 +18748,7 @@ test(2270, options=c(datatable.optimize=1L), DT[, mean(b, 1), by=a], data.table( DT1 = data.table(a=1:2) DT2 = data.table(a=c(1, 1, 2, 2), b=1:4) test(2271, options=c(datatable.verbose=TRUE), copy(DT1)[DT2, on='a', v := 4], copy(DT1)[, v := 4], output="x.a.\nAssigning") + +# shift of many elements accumulated PROTECT() for the fill values instead of releasing as soon as possible. Spotted by rchk in #6257. +l = as.list(seq_len(2e4)) +test(2272, shift(l), as.list(rep(NA_integer_, 2e4))) diff --git a/src/shift.c b/src/shift.c index e3e4a2b82..77d246d97 100644 --- a/src/shift.c +++ b/src/shift.c @@ -40,7 +40,7 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) SEXP elem = VECTOR_ELT(x, i); size_t size = SIZEOF(elem); R_xlen_t xrows = xlength(elem); - SEXP thisfill = PROTECT(coerceAs(fill, elem, ScalarLogical(0))); nprotect++; // #4865 use coerceAs for type coercion + SEXP thisfill = PROTECT(coerceAs(fill, elem, ScalarLogical(0))); // #4865 use coerceAs for type coercion switch (TYPEOF(elem)) { case INTSXP: case LGLSXP: { const int ifill = INTEGER(thisfill)[0]; @@ -170,8 +170,9 @@ SEXP shift(SEXP obj, SEXP k, SEXP fill, SEXP type) default : error(_("Type '%s' is not supported"), type2char(TYPEOF(elem))); } + UNPROTECT(1); // thisfill } - UNPROTECT(nprotect); - return isVectorAtomic(obj) && length(ans) == 1 ? VECTOR_ELT(ans, 0) : ans; + if (isVectorAtomic(obj) && length(ans) == 1) ans = VECTOR_ELT(ans, 0); + UNPROTECT(nprotect); // ans, x? 
+ return ans; } - From 0c022e22232928eb4ca759cd788bc93be1e6ab99 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 24 Jul 2024 00:15:46 -0700 Subject: [PATCH 17/41] More robust detection of bzip2 magic number (#6308) * More robust to false positives checking for bzip signature * NEWS * trailing ws --- NEWS.md | 2 ++ R/fread.R | 33 +++++++++++++++++++++++++++------ inst/tests/tests.Rraw | 15 +++++++++++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5898724d5..d1d2f20ae 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,6 +70,8 @@ 14. `fread(x, colClasses="POSIXct")` now also works for columns containing only NA values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and Benjamin Schwendinger for the fix. +15. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. 
diff --git a/R/fread.R b/R/fread.R index 68ce4830a..d45774b71 100644 --- a/R/fread.R +++ b/R/fread.R @@ -95,10 +95,9 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } # support zip and tar files #3834 - zip_signature = charToRaw("PK\x03\x04") file_signature = readBin(file, raw(), 8L) - if ((w <- endsWithAny(file, c(".zip", ".tar"))) || identical(head(file_signature, 4L), zip_signature)) { + if ((w <- endsWithAny(file, c(".zip", ".tar"))) || is_zip(file_signature)) { FUN = if (w==2L) untar else unzip fnames = FUN(file, list=TRUE) if (is.data.frame(fnames)) fnames = fnames[,1L] @@ -110,12 +109,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(decompFile), add=TRUE) } - gz_signature = as.raw(c(0x1F, 0x8B)) - bz2_signature = as.raw(c(0x42, 0x5A, 0x68)) gzsig = FALSE - if ((w <- endsWithAny(file, c(".gz", ".bgz",".bz2"))) || (gzsig <- identical(head(file_signature, 2L), gz_signature)) || identical(head(file_signature, 3L), bz2_signature)) { + if ((w <- endsWithAny(file, c(".gz", ".bgz",".bz2"))) || (gzsig <- is_gzip(file_signature)) || is_bzip(file_signature)) { if (!requireNamespace("R.utils", quietly = TRUE)) - stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov + stopf("To read %s files directly, fread() requires 'R.utils' package which cannot be found. 
Please install 'R.utils' using 'install.packages('R.utils')'.", if (w<=2L || gzsig) "gz" else "bz2") # nocov FUN = if (w<=2L || gzsig) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download @@ -361,6 +358,30 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") ans } +known_signatures = list( + zip = as.raw(c(0x50, 0x4b, 0x03, 0x04)), # charToRaw("PK\x03\x04") + gzip = as.raw(c(0x1F, 0x8B)), + bzip = as.raw(c(0x42, 0x5A, 0x68)) +) + +# https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers +# not checked: what's a valid 'version' entry to check the 5th+6th bytes +is_zip = function(file_signature) { + identical(file_signature[1:4], known_signatures$zip) +} + +# https://en.wikipedia.org/wiki/Gzip#File_format +# not checked: remaining 8 bytes of header +is_gzip = function(file_signature) { + identical(file_signature[1:2], known_signatures$gzip) +} + +# https://en.wikipedia.org/wiki/Bzip2#File_format +is_bzip = function(file_signature) { + identical(file_signature[1:3], known_signatures$bzip) && + isTRUE(file_signature[4L] %in% charToRaw('123456789')) # for #6304 +} + # simplified but faster version of `factor()` for internal use. as_factor = function(x) { lev = forderv(x, retGrp = TRUE, na.last = NA) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a407d7e94..bf9bfff39 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18752,3 +18752,18 @@ test(2271, options=c(datatable.verbose=TRUE), copy(DT1)[DT2, on='a', v := 4], co # shift of many elements accumulated PROTECT() for the fill values instead of releasing as soon as possible. Spotted by rchk in #6257. 
l = as.list(seq_len(2e4)) test(2272, shift(l), as.list(rep(NA_integer_, 2e4))) + +# false positive detecting the bz2 header in some rare cases, #6304 +tmp = tempfile() +fwrite(data.table(c1 = "BZh"), tmp, col.names=FALSE) +test(2273.1, fread(tmp), data.table(BZh = logical())) +test(2273.2, fread(tmp, header=FALSE), data.table(V1="BZh")) +fwrite(ans<-data.table(BZh=1L, CZi=2L), tmp) +test(2273.3, fread(tmp), ans) +if (test_R.utils) { + DT = data.table(a=1L, b=2L) + fwrite(DT, tmp) + R.utils::bzip2(tmp, tmp2 <- tempfile(), remove=FALSE) # _not_ with .bz2 extension + test(2273.4, fread(tmp2), DT) +} +file.remove(tmp, tmp2) From 4fd75e2bc7bd01a4653bd02fcccac8db12522180 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 24 Jul 2024 19:30:27 +0200 Subject: [PATCH 18/41] Rbind allow binding of different class attributes (#5446) * add fix #5309 * fix test numbering * add rbind for ITime * more tests * add merge tests * add AsIs #4934 * add news * news typo * add ignore.attr argument * fix news * change arguments of registered rbindlist * add attribute to usage * move nanotime tests * adjust test numbering * add test coverage * prohibit NA for ignore.att * move news * finish todo of #5857 * Update NEWS.md Co-authored-by: Michael Chirico * update comment * update doc for ignore.attr * fix nit ignoreattr * fix test consistency * remove setnames * update asis test to use rbindlist * update test comments * update NEWS num * NEWS wording * more NEWS wording * template message for i18n * simplify condition (C boolean --> no NA to worry about) * && not & * correct error message --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 4 ++++ R/data.table.R | 8 ++++---- R/merge.R | 16 +-------------- inst/tests/other.Rraw | 8 ++++++++ inst/tests/tests.Rraw | 47 +++++++++++++++++++++++++++++++++++++++++++ man/rbindlist.Rd | 8 +++++++- src/data.table.h | 3 ++- src/init.c | 2 ++ src/rbindlist.c | 
25 ++++++++++++++++++----- 9 files changed, 95 insertions(+), 26 deletions(-) diff --git a/NEWS.md b/NEWS.md index d1d2f20ae..bb19620cd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,10 @@ 14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. +15. `rbindlist(l, use.names=TRUE)` and `rbind` now works correctly on columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). + +`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. 
diff --git a/R/data.table.R b/R/data.table.R index 072da0cdc..99d06fad7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2735,14 +2735,14 @@ chgroup = function(x) { } # plain rbind and cbind methods are registered using S3method() in NAMESPACE only from R>=4.0.0; #3948 -rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL) { +rbind.data.table = function(..., use.names=TRUE, fill=FALSE, idcol=NULL, ignore.attr=FALSE) { l = lapply(list(...), function(x) if (is.list(x)) x else as.data.table(x)) #1626; e.g. psych binds a data.frame|table with a matrix - rbindlist(l, use.names, fill, idcol) + rbindlist(l, use.names, fill, idcol, ignore.attr) } cbind.data.table = data.table .rbind.data.table = rbind.data.table # the workaround using this in FAQ 2.24 is still applied to support R < 4.0.0 -rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { +rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) { if (is.null(l)) return(null.data.table()) if (!is.list(l) || is.data.frame(l)) stopf("Input is %s but should be a plain list of items to be stacked", class(l)[1L]) if (isFALSE(idcol)) { idcol = NULL } @@ -2758,7 +2758,7 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL) { if (!miss) stopf("use.names='check' cannot be used explicitly because the value 'check' is new in v1.12.2 and subject to change. It is just meant to convey default behavior. 
See ?rbindlist.") use.names = NA } - ans = .Call(Crbindlist, l, use.names, fill, idcol) + ans = .Call(Crbindlist, l, use.names, fill, idcol, ignore.attr) if (!length(ans)) return(null.data.table()) setDT(ans)[] } diff --git a/R/merge.R b/R/merge.R index aabaaf740..025488740 100644 --- a/R/merge.R +++ b/R/merge.R @@ -96,21 +96,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] - # TO DO: replace by following once #5446 is merged - # if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) - if (length(missingyidx)) { - yy = y[missingyidx] - othercolsx = setdiff(nm_x, by) - if (length(othercolsx)) { - # create NA rectangle with correct types and attributes of x to cbind to y - tmp = rep.int(NA_integer_, length(missingyidx)) - # TO DO: use set() here instead.. - yy = cbind(yy, x[tmp, othercolsx, with = FALSE]) - } - # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist - # takes care of #24 without having to save names. This is how it should be, IMHO. - dt = rbind(dt, yy, use.names=FALSE) - } + if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. 
newend = setdiff(nm_y, by.y) diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 79f64b487..0a0195279 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -699,6 +699,14 @@ if (loaded[["nanotime"]]) { DT = data.table(time=nanotime(c(1,NA,3))) test(27, na.omit(DT), DT[c(1,3)]) + # rbind with vectors with class attributes #5309 + x = data.table(a=1L, b=as.nanotime(0)) + y = data.table(a=2L, b=NA) + test(27.01, rbind(x,y), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(27.02, rbind(y,x), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) + y[, b := NULL] + test(27.03, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.nanotime(c(0, NA)))) + test(27.04, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.nanotime(c(NA, 0)))) } # that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bf9bfff39..c7f0833cf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14435,6 +14435,8 @@ test(2003.81, rbind(x, y, fill=TRUE, use.names=TRUE), ans) test(2003.82, rbind(y, x, fill=TRUE, use.names=TRUE), ans[2:1,]) test(2003.83, rbind(x, y, fill=TRUE, use.names=FALSE), ans) test(2003.84, rbind(y, x, fill=TRUE, use.names=FALSE), ans[2:1,]) +# rbindlist ignore attributes #3911 +test(2003.85, rbindlist(list(), ignore.attr=1), error="ignore.attr should be TRUE or FALSE") # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" @@ -18767,3 +18769,48 @@ if (test_R.utils) { test(2273.4, fread(tmp2), DT) } file.remove(tmp, tmp2) + +# rbind with vectors with class attributes #5309 +x = data.table(a = 1L, b = as.Date("2020-01-01")) +y = data.table(a = 2L, b = as.IDate("2021-01-01")) +z = data.table(a = 3L, b = NA) +test(2274.01, rbind(x, y), data.table(a=c(1L, 2L), b= as.Date(c("2020-01-01", 
"2021-01-01")))) +test(2274.02, rbind(y, x), data.table(a=c(2L, 1L), b=as.IDate(c("2021-01-01", "2020-01-01")))) +test(2274.03, rbind(x, z), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA)))) +test(2274.04, rbind(z, x), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01")))) +test(2274.05, rbind(y, z), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA)))) +test(2274.06, rbind(z, y), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01")))) +z[, b := NULL] +test(2274.07, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b= as.Date(c("2020-01-01", NA)))) +test(2274.08, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b= as.Date(c(NA, "2020-01-01")))) +test(2274.09, rbind(y, z, fill=TRUE), data.table(a=c(2L, 3L), b=as.IDate(c("2021-01-01", NA)))) +test(2274.10, rbind(z, y, fill=TRUE), data.table(a=c(3L, 2L), b=as.IDate(c(NA, "2021-01-01")))) +x = data.table(a=1L, b=as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2274.11, rbind(x, z, fill=TRUE), data.table(a=c(1L, 3L), b=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2274.12, rbind(z, x, fill=TRUE), data.table(a=c(3L, 1L), b=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +x = data.table(c=1L, d=as.POSIXct("2021-10-06 13:58:00 UTC")) +test(2274.13, rbind(x, z, fill=TRUE, use.names=FALSE), data.table(c = c(1L, 3L), d=as.POSIXct(c("2021-10-06 13:58:00 UTC", NA)))) +test(2274.14, rbind(z, x, fill=TRUE, use.names=FALSE), data.table(a=c(3L, 1L), d=as.POSIXct(c(NA, "2021-10-06 13:58:00 UTC")))) +x = data.table(a=1L, b=as.ITime(0)) +y = data.table(a=2L, b=NA) +test(2274.15, rbind(x,y), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) +test(2274.16, rbind(y,x), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) +y[, b := NULL] +test(2274.17, rbind(x,y, fill = TRUE), data.table(a = c(1L, 2L), b=as.ITime(c(0, NA)))) +test(2274.18, rbind(y,x, fill = TRUE), data.table(a = c(2L, 1L), b=as.ITime(c(NA, 0)))) +# follow up to #5263 to simplify merge logic +x = data.table(a = 1L, b = as.Date("2020-01-01")) +y = data.table(a = 
2L, b = NA) +test(2274.19, merge(x, y, by="a", all=TRUE), data.table(a=1:2, b.x=as.Date(c("2020-01-01", NA)), b.y=NA, key="a")) +test(2274.20, merge(y, x, by="a", all=TRUE), data.table(a=1:2, b.x=NA, key="a", b.y=as.Date(c("2020-01-01", NA)))) +# rbindlist with AsIs +x = data.table(a = 1L, b=I(3L)) +y = data.table(a = 2L, b=4) +test(2274.21, rbindlist(list(x,y)), data.table(a = c(1L, 2L), b=I(c(3L, 4)))) +test(2274.22, rbindlist(list(y,x)), data.table(a = c(2L, 1L), b=c(4, 3))) +# rbind ignore attributes #3911 +x = data.table(a = structure(1:2, class=c("a", "integer")), key="a") +y = data.table(a = 2:3, key="a") +test(2274.31, merge(x,y, all.y=TRUE), data.table(a=structure(2:3, class=c("a", "integer")), key="a")) +test(2274.32, rbind(x,y), error="Class attribute .* does not match with .*") +test(2274.33, rbind(x,y, ignore.attr=TRUE), data.table(a=structure(c(1L, 2L, 2L, 3L), class=c("a", "integer")))) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 9e218c5e1..17c5c2205 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -7,7 +7,7 @@ Same as \code{do.call(rbind, l)} on \code{data.frame}s, but much faster. } \usage{ -rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) +rbindlist(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=FALSE) # rbind(..., use.names=TRUE, fill=FALSE, idcol=NULL) } \arguments{ @@ -15,6 +15,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL) \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.} \item{fill}{\code{TRUE} fills missing columns with NAs, or NULL for missing list columns. By default \code{FALSE}.} \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. 
\code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.} + \item{ignore.attr}{Logical, default \code{FALSE}. When \code{TRUE}, allows binding columns with different attributes (e.g. class).} } \details{ Each item of \code{l} can be a \code{data.table}, \code{data.frame} or \code{list}, including \code{NULL} (skipped) or an empty object (0 rows). \code{rbindlist} is most useful when there are an unknown number of (potentially many) objects to stack, such as returned by \code{lapply(fileNames, fread)}. \code{rbind} is most useful to stack two or three objects which you know in advance. \code{\dots} should contain at least one \code{data.table} for \code{rbind(\dots)} to call the fast method and return a \code{data.table}, whereas \code{rbindlist(l)} always returns a \code{data.table} even when stacking a plain \code{list} with a \code{data.frame}, for example. 
@@ -54,6 +55,11 @@ rbindlist(l, use.names=TRUE, fill=TRUE, idcol=TRUE) setattr(l, 'names', c("a", "b")) rbindlist(l, use.names=TRUE, fill=TRUE, idcol="ID") +# bind different classes +DT1 = data.table(A=1:3,B=letters[1:3]) +DT2 = data.table(A=4:5,B=letters[4:5]) +setattr(DT1[["A"]], "class", c("a", "integer")) +rbind(DT1, DT2, ignore.attr=TRUE) } \keyword{ data } diff --git a/src/data.table.h b/src/data.table.h index ee4a55d3a..cd9e40efa 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -93,6 +93,7 @@ extern SEXP char_datatable; extern SEXP char_dataframe; extern SEXP char_NULL; extern SEXP char_maxString; +extern SEXP char_AsIs; extern SEXP sym_sorted; extern SEXP sym_index; extern SEXP sym_BY; @@ -286,7 +287,7 @@ SEXP chmatchdup_R(SEXP, SEXP, SEXP); SEXP chin_R(SEXP, SEXP); SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -SEXP rbindlist(SEXP, SEXP, SEXP, SEXP); +SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP setlistelt(SEXP, SEXP, SEXP); SEXP address(SEXP); SEXP expandAltRep(SEXP); diff --git a/src/init.c b/src/init.c index 49de93746..48046b8d6 100644 --- a/src/init.c +++ b/src/init.c @@ -23,6 +23,7 @@ SEXP char_datatable; SEXP char_dataframe; SEXP char_NULL; SEXP char_maxString; +SEXP char_AsIs; SEXP sym_sorted; SEXP sym_index; SEXP sym_BY; @@ -260,6 +261,7 @@ void attribute_visible R_init_data_table(DllInfo *info) char_dataframe = PRINTNAME(install("data.frame")); char_NULL = PRINTNAME(install("NULL")); char_maxString = PRINTNAME(install("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF")); + char_AsIs = PRINTNAME(install("AsIs")); if (TYPEOF(char_integer64) != CHARSXP) { // checking one is enough in case of any R-devel changes diff --git a/src/rbindlist.c b/src/rbindlist.c index e206d4ce4..7d7578cde 100644 --- 
a/src/rbindlist.c +++ b/src/rbindlist.c @@ -2,16 +2,19 @@ #include #include // for isdigit -SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) +SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignoreattrArg) { if (!isLogical(fillArg) || LENGTH(fillArg) != 1 || LOGICAL(fillArg)[0] == NA_LOGICAL) error(_("%s should be TRUE or FALSE"), "fill"); if (!isLogical(usenamesArg) || LENGTH(usenamesArg)!=1) error(_("use.names= should be TRUE, FALSE, or not used (\"check\" by default)")); // R levels converts "check" to NA + if (!isLogical(ignoreattrArg) || LENGTH(ignoreattrArg)!=1 || LOGICAL(ignoreattrArg)[0] == NA_LOGICAL) + error(_("%s should be TRUE or FALSE"), "ignore.attr"); if (!length(l)) return(l); if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists.")); Rboolean usenames = LOGICAL(usenamesArg)[0]; const bool fill = LOGICAL(fillArg)[0]; + const bool ignoreattr = LOGICAL(ignoreattrArg)[0]; if (fill && usenames==NA_LOGICAL) { usenames=TRUE; } @@ -275,7 +278,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) bool factor=false, orderedFactor=false; // ordered factor is class c("ordered","factor"). isFactor() is true when isOrdered() is true. 
int longestLen=-1, longestW=-1, longestI=-1; // just for ordered factor; longestLen must be initialized as -1 so that rbind zero-length ordered factor could work #4795 SEXP longestLevels=R_NilValue; // just for ordered factor - bool int64=false; + bool int64=false, date=false, posixct=false, itime=false, asis=false; const char *foundName=NULL; bool anyNotStringOrFactor=false; SEXP firstCol=R_NilValue; @@ -306,14 +309,25 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (INHERITS(thisCol, char_integer64)) { if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } // so the integer64 attribute gets copied to target below int64=true; + } else if (INHERITS(thisCol, char_Date)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + date=true; + } else if (INHERITS(thisCol, char_POSIXct)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + posixct=true; + } else if (INHERITS(thisCol, char_ITime)) { + if (firsti>=0 && !length(getAttrib(firstCol, R_ClassSymbol))) { firsti=i; firstw=w; firstCol=thisCol; } + itime=true; + } else if (!asis && INHERITS(thisCol, char_AsIs)) { + asis=true; } if (firsti==-1) { firsti=i; firstw=w; firstCol=thisCol; } else { - if (!factor && !int64) { + if (!factor && !int64 && date == posixct && !itime && !asis) { // prohibit binding of date and posixct if (!R_compute_identical(PROTECT(getAttrib(thisCol, R_ClassSymbol)), PROTECT(getAttrib(firstCol, R_ClassSymbol)), - 0)) { - error(_("Class attribute on column %d of item %d does not match with column %d of item %d."), w+1, i+1, firstw+1, firsti+1); + 0) && !ignoreattr) { + error(_("Class attribute on column %d of item %d does not match with column %d of item %d. 
You can deactivate this safety-check by using ignore.attr=TRUE"), w+1, i+1, firstw+1, firsti+1); } UNPROTECT(2); } @@ -324,6 +338,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option) if (int64 && maxType!=REALSXP) error(_("Internal error: column %d of result is determined to be integer64 but maxType=='%s' != REALSXP"), j+1, type2char(maxType)); // # nocov + if (date && INHERITS(firstCol, char_IDate)) maxType=INTSXP; // first encountered Date determines class and type #5309 SEXP target; SET_VECTOR_ELT(ans, idcol+j, target=allocVector(maxType, nrow)); // does not initialize logical & numerics, but does initialize character and list if (!factor) copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB. From 4c5785b9ee4708df54a1a3acfdf30f913b05dd04 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 24 Jul 2024 14:46:19 -0700 Subject: [PATCH 19/41] Set default for PROJ_PATH to '.' if unset (#6307) * Set default for PROJ_PATH to '.' if unset * Sync cc() signature in README --- .dev/README.md | 2 +- R/test.data.table.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.dev/README.md b/.dev/README.md index 9184793b8..a5c201141 100644 --- a/.dev/README.md +++ b/.dev/README.md @@ -17,7 +17,7 @@ source(".dev/cc.R") Developer helper script providing `cc` function. If one starts R session in `data.table` project root directory `.dev/cc.R` file should be automatically sourced (due to local `.Rprofile` file) making `cc()` (and `dd()`) function available straightaway. 
```r -cc(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc") +cc(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, path=Sys.getenv("PROJ_PATH", unset=normalizePath(".")), CC="gcc", quiet=FALSE) ``` Use `cc()` to re-compile all C sources and attach all `data.table` R functions (including non-exported ones). diff --git a/R/test.data.table.R b/R/test.data.table.R index d12ac7166..13f240d75 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -15,7 +15,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # nocov start dev = TRUE if ("package:data.table" %chin% search()) stopf("data.table package is loaded. Unload or start a fresh R session.") - rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH") + rootdir = if (pkg!="." && pkg %chin% dir()) file.path(getwd(), pkg) else Sys.getenv("PROJ_PATH", normalizePath(".")) subdir = file.path("inst","tests") env = new.env(parent=.GlobalEnv) # in dev cc() sources all functions in .GlobalEnv # nocov end From 1600b516c3823abceb4cf68b83f07fb7ab32762e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 25 Jul 2024 01:22:22 -0700 Subject: [PATCH 20/41] Ensure we're not UNPROTECT()ing 'x' in gsumm (#6306) * Ensure we're not UNPROTECT()ing 'x' * UNPROTECT()+PROTECT() approach --------- Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> --- src/gsumm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index 7607f4258..16cc228a3 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -594,8 +594,9 @@ SEXP gmean(SEXP x, SEXP narmArg) x = PROTECT(coerceVector(x, REALSXP)); protecti++; case REALSXP: { if (INHERITS(x, char_integer64)) { - x = PROTECT(coerceAs(x, /*as=*/PROTECT(ScalarReal(1)), /*copyArg=*/ScalarLogical(TRUE))); protecti++; - UNPROTECT(1); // as= input to coerceAs() + SEXP as = PROTECT(ScalarReal(1)); + x = 
PROTECT(coerceAs(x, as, /*copyArg=*/ScalarLogical(TRUE))); protecti++; + UNPROTECT(2); PROTECT(x); // PROTECT() is stack-based, UNPROTECT() back to 'as' then PROTECT() 'x' again } const double *restrict gx = gather(x, &anyNA); ans = PROTECT(allocVector(REALSXP, ngrp)); protecti++; From 41fc24ffb7e3699b7a6033d451950fe6d472a0f4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 25 Jul 2024 23:19:44 -0700 Subject: [PATCH 21/41] Use STRING_PTR_RO, not STRING_PTR (#6312) * Move to STRING_PTR_RO in all sure-fire 'const' cases * Remaining cases compile as well * Conditionally define STRING_PTR_RO on 3.3.0<=R<3.5.0 --- src/assign.c | 20 ++++++++++---------- src/between.c | 6 +++--- src/bmerge.c | 4 ++-- src/chmatch.c | 6 +++--- src/cj.c | 2 +- src/coalesce.c | 4 ++-- src/data.table.h | 3 +++ src/fifelse.c | 10 +++++----- src/fmelt.c | 6 +++--- src/forder.c | 14 +++++++------- src/frank.c | 4 ++-- src/fwriteR.c | 2 +- src/gsumm.c | 8 ++++---- src/rbindlist.c | 14 +++++++------- src/uniqlist.c | 4 ++-- src/utils.c | 8 ++++---- 16 files changed, 59 insertions(+), 56 deletions(-) diff --git a/src/assign.c b/src/assign.c index d937601eb..b1623875e 100644 --- a/src/assign.c +++ b/src/assign.c @@ -799,7 +799,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con if (sourceIsFactor) { sourceLevels=PROTECT(getAttrib(source, R_LevelsSymbol)); protecti++; } if (!sourceIsFactor || !R_compute_identical(sourceLevels, targetLevels, 0)) { // !sourceIsFactor for test 2115.6 const int nTargetLevels=length(targetLevels), nSourceLevels=length(sourceLevels); - const SEXP *targetLevelsD=STRING_PTR(targetLevels), *sourceLevelsD=STRING_PTR(sourceLevels); + const SEXP *targetLevelsD=STRING_PTR_RO(targetLevels), *sourceLevelsD=STRING_PTR_RO(sourceLevels); SEXP newSource = PROTECT(allocVector(INTSXP, length(source))); protecti++; savetl_init(); for (int k=0; k #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and 
VECTOR_ELT +#ifndef STRING_PTR_RO +#define STRING_PTR_RO STRING_PTR +#endif #include // for uint64_t rather than unsigned long long #include #include "types.h" diff --git a/src/fifelse.c b/src/fifelse.c index b5b06200c..72b7f2c01 100644 --- a/src/fifelse.c +++ b/src/fifelse.c @@ -146,9 +146,9 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) { } } break; case STRSXP : { - const SEXP *restrict pa = na_a ? NULL : STRING_PTR(a); - const SEXP *restrict pb = na_b ? NULL : STRING_PTR(b); - const SEXP *restrict pna = na_n ? NULL : STRING_PTR(na); + const SEXP *restrict pa = na_a ? NULL : STRING_PTR_RO(a); + const SEXP *restrict pb = na_b ? NULL : STRING_PTR_RO(b); + const SEXP *restrict pna = na_n ? NULL : STRING_PTR_RO(na); const SEXP na = NA_STRING; for (int64_t i=0; ilmax; ++j) { for (int k=0; knrow; ++k) { SET_STRING_ELT(target, j*data->nrow + k, s[k]); diff --git a/src/forder.c b/src/forder.c index 70744d091..7226f7e45 100644 --- a/src/forder.c +++ b/src/forder.c @@ -277,7 +277,7 @@ static void cradix(SEXP *x, int n) free(cradix_xtmp); cradix_xtmp=NULL; } -static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count) +static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count) // group numbers are left in truelength to be fetched by WRITE_KEY { int na_count=0; @@ -323,7 +323,7 @@ static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int SEXP *ustr3 = (SEXP *)malloc(ustr_n * sizeof(SEXP)); if (!ustr3) STOP(_("Failed to alloc ustr3 when converting strings to UTF8")); // # nocov - memcpy(ustr3, STRING_PTR(ustr2), ustr_n*sizeof(SEXP)); + memcpy(ustr3, STRING_PTR_RO(ustr2), ustr_n*sizeof(SEXP)); // need to reset ustr_maxlen because we need ustr_maxlen for utf8 strings ustr_maxlen = 0; for (int i=0; i a a regular factor because this case isn't yet implemented. 
a Date: Fri, 26 Jul 2024 14:40:23 -0700 Subject: [PATCH 22/41] Remove SET_TYPEOF() call (#6313) --- src/fwriteR.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fwriteR.c b/src/fwriteR.c index c179ea2c8..5a0ab2dc7 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -199,9 +199,8 @@ SEXP fwriteR( DFcoerced = PROTECT(allocVector(VECSXP, args.ncol)); protecti++; // potentially large if ncol=1e6 as reported in #1903 where using large VLA caused stack overflow - SEXP s = PROTECT(allocList(2)); + SEXP s = PROTECT(LCONS(R_NilValue, allocList(1))); // no protecti++ needed here as one-off UNPROTECT(1) a few lines below - SET_TYPEOF(s, LANGSXP); SETCAR(s, install("format.POSIXct")); for (int j=0; j Date: Fri, 26 Jul 2024 17:59:18 -0700 Subject: [PATCH 23/41] Update Chinese translations for release (#6281) * initial update of zh_CN .po files * Remove/migrate(with fuzzy) deprecated R messages * Remove deprecated C messages * Quick stab at fuzzy R messages * updated fuzzy messages * remove fuzzies * mistaken commit of temp file * spurious fuzzy * inferred update * remove fuzzy * message not present in 'master' * Translations for zh_CN.po Work was done in collaboration with my good friend Andy! 
Co-authored-by: YuMing Chen * slight error in variable ordering * last translations/fuzzies --------- Co-authored-by: joshhwuu Co-authored-by: YuMing Chen --- po/R-zh_CN.po | 897 ++++++++++++++++++++++--------------------- po/zh_CN.po | 1002 ++++++++++++++++++++++++++----------------------- 2 files changed, 997 insertions(+), 902 deletions(-) diff --git a/po/R-zh_CN.po b/po/R-zh_CN.po index 5e29eee02..532654b9c 100644 --- a/po/R-zh_CN.po +++ b/po/R-zh_CN.po @@ -1,7 +1,7 @@ msgid "" msgstr "" "Project-Id-Version: data.table 1.12.5\n" -"POT-Creation-Date: 2023-12-28 12:46+0000\n" +"POT-Creation-Date: 2024-06-23 12:07-0300\n" "PO-Revision-Date: 2019-11-16 18:37+0800\n" "Last-Translator: Xianying Tan \n" "Language-Team: Mandarin\n" @@ -129,7 +129,8 @@ msgid "" "'between' function the 'x' argument is a POSIX class while '%s' was not, " "coercion to POSIX failed with: %s" msgstr "" -"'between' 中的 'x' 参数为 POSIX 类,而 '%s' 并不是,将其强制转换成 POSIX 时失败:%s" +"'between' 中的 'x' 参数为 POSIX 类,而 '%s' 并不是,将其强制转换成 POSIX 时失" +"败:%s" #: between.R:27 #, c-format @@ -145,8 +146,9 @@ msgstr "" msgid "" "'between' arguments are all POSIXct but have mismatched tzone attributes: " "%s. The UTC times will be compared." -msgstr "'between' 的参数均为 POSIXct 类型但时区属性(tzone)不匹配:" -"%s。将使用 UTC 时间进行比较。" +msgstr "" +"'between' 的参数均为 POSIXct 类型但时区属性(tzone)不匹配:%s。将使用 UTC 时" +"间进行比较。" #: between.R:36 #, c-format @@ -228,42 +230,54 @@ msgstr "不等长联结还不能执行 roll " msgid "Column name '_nqgrp_' is reserved for non-equi joins." msgstr "列名 '_nqgrp_' 是为不等长联结保留的" -#: data.table.R:63 +#: data.table.R:55 #, c-format msgid "key argument of data.table() must be character" msgstr "data.table() 的key参数必须是字符" -#: data.table.R:132 +#: data.table.R:123 #, c-format msgid "Object '%s' not found. 
Perhaps you intended %s" msgstr "对象 '%s' 不存在, 可能你打算 %s" -#: data.table.R:134 +#: data.table.R:125 #, c-format msgid "Object '%s' not found amongst %s" msgstr "%2$s 中对象 '%1$s' 不存在" -#: data.table.R:157 +#: data.table.R:141 +#, c-format +msgid "" +"[ was called on a data.table in an environment that is not data.table-aware " +"(i.e. cedta()), but '%s' was used, implying the owner of this call really " +"intended for data.table methods to be called. See vignette('datatable-" +"importing') for details on properly importing data.table." +msgstr "" +"[ 在不支持 data.table 的环境中对 data.table 进行调用 (例如 cedta()),但" +"使用了 '%s',这意味着该调用者确实打算调用 data.table 方法。有关正确导入" +" data.table 的详细信息,请参阅 vignette('datatable-importing')。" + +#: data.table.R:152 #, c-format msgid "verbose must be logical or integer" msgstr "verbose必须是一个逻辑向量" -#: data.table.R:158 +#: data.table.R:153 #, c-format msgid "verbose must be length 1 non-NA" msgstr "verbose 的长度必须是 1 和不NA" -#: data.table.R:166 +#: data.table.R:161 #, c-format msgid "Ignoring by/keyby because 'j' is not supplied" msgstr "因为没有提供 j= ,所以忽略 by/keyby" -#: data.table.R:180 +#: data.table.R:175 #, c-format msgid "When by and keyby are both provided, keyby must be TRUE or FALSE" msgstr "当 by 和 keyby 都提供的时候,keyby 必须是 TRUE 或 FALSE" -#: data.table.R:192 +#: data.table.R:187 #, c-format msgid "" "When on= is provided but not i=, on= must be a named list or data.table|" @@ -273,19 +287,19 @@ msgstr "" "当提供 on= 而不提供 i= 的时候, on= 必须是带名称的 list 或者 data.table 或者 " "data.frame,并且会调用自然联结(例如,按照共有名称联结),忽略 on= %s" -#: data.table.R:205 +#: data.table.R:200 #, c-format msgid "" "i and j are both missing so ignoring the other arguments. This warning will " "be upgraded to error in future." 
msgstr "i 和 j 都缺少的时候忽略其他参数。将来此警告信息将升级为错误信息。" -#: data.table.R:209 +#: data.table.R:204 #, c-format msgid "mult argument can only be 'first', 'last' or 'all'" msgstr "mult 参数只能赋值为 'first', 'last' 或 'all'" -#: data.table.R:211 +#: data.table.R:206 #, c-format msgid "" "roll must be a single TRUE, FALSE, positive/negative integer/double " @@ -294,103 +308,109 @@ msgstr "" "roll 必须是单个参数,例如 TRUE, FALSE, 正或负的 integer 或 double包括 " "+Inf , -Inf 或 'nearest'" -#: data.table.R:213 +#: data.table.R:208 #, c-format msgid "roll is '%s' (type character). Only valid character value is 'nearest'." msgstr "roll 是 '%s'(字符类型)。 唯一有效的字符值是'nearest'。" -#: data.table.R:218 +#: data.table.R:213 #, c-format msgid "rollends must be a logical vector" msgstr "rollends必须是一个逻辑向量" -#: data.table.R:219 +#: data.table.R:214 #, c-format msgid "rollends must be length 1 or 2" msgstr "rollends 的长度必须是 1 或者 2" -#: data.table.R:227 +#: data.table.R:222 #, c-format msgid "" "nomatch= must be either NA or NULL (or 0 for backwards compatibility which " "is the same as NULL but please use NULL)" -msgstr "nomatch= 必须是 NA 或 NULL (或者在向后兼容的情形下为 0,这等同于 NULL但是请用 NULL)" +msgstr "" +"nomatch= 必须是 NA 或 NULL (或者在向后兼容的情形下为 0,这等同于 NULL但是请" +"用 NULL)" -#: data.table.R:230 +#: data.table.R:225 #, c-format msgid "which= must be a logical vector length 1. Either FALSE, TRUE or NA." msgstr "which= 必须是一个长度为 1 的逻辑向量。其取值为 FALSE,TRUE 或者 NA。" -#: data.table.R:231 +#: data.table.R:226 #, c-format msgid "" "which==%s (meaning return row numbers) but j is also supplied. Either you " "need row numbers or the result of j, but only one type of result can be " "returned." msgstr "" -"which==%s (表示行数会被返回) 但是 j 也被提供了。你可能需要行数或者是 j 的结果,但是只能" -"返回一种结果。" +"which==%s (表示行数会被返回) 但是 j 也被提供了。你可能需要行数或者是 j 的结" +"果,但是只能返回一种结果。" -#: data.table.R:232 +#: data.table.R:227 #, c-format msgid "" "which=NA with nomatch=0|NULL would always return an empty vector. Please " "change or remove either which or nomatch." 
msgstr "" -"同时使用 which=NA 和 nomatch=0或NULL 会得到一个空向量。请改变或者是移除 which或 " -"nomatch 的取值" +"同时使用 which=NA 和 nomatch=0或NULL 会得到一个空向量。请改变或者是移除 which" +"或 nomatch 的取值" -#: data.table.R:233 +#: data.table.R:228 #, c-format msgid "j must be provided when with=FALSE" msgstr "如果with=FALSE(假),j必须要赋值" -#: data.table.R:273 +#: data.table.R:268 #, c-format msgid "" "The symbol .. is invalid. The .. prefix must be followed by at least one " "character." msgstr "符号 .. 是无效的。前缀 .. 之后必须要有至少一个字符" -#: data.table.R:276 +#: data.table.R:271 #, c-format msgid "" "Variable '..%s' does exist in calling scope though, so please just removed " "the .. prefix from that variable name in calling scope." -msgstr "变量 '%s' 并不存在于调用环境中。所以请移除在调用环境中那个变量名字的..前缀" +msgstr "" +"不过变量 '..%s' 确实存在于调用环境中,所以请直接移除调用环境中该变量名的 .. 前缀" -#: data.table.R:280 +#: data.table.R:275 #, c-format msgid "" "Variable '%s' is not found in calling scope. Looking in calling scope " "because you used the .. prefix.%s" -msgstr "变量 '%s' 并没有存在于调用环境中。之所以在调用环境中寻找是因为你使用了..的前缀.%s" +msgstr "" +"变量 '%s' 并没有存在于调用环境中。之所以在调用环境中寻找是因为你使用了..的前" +"缀.%s" -#: data.table.R:282 +#: data.table.R:277 #, c-format msgid "" "Both '%1$s' and '..%1$s' exist in calling scope. Please remove the '..%1$s' " "variable in calling scope for clarity." -msgstr "'%1$s'和'..%1$s'均在当前调用环境中。为清晰起见,请移除在调用环境中名为" -"..%1$s' 的变量。" +msgstr "" +"'%1$s'和'..%1$s'均在当前调用环境中。为清晰起见,请移除在调用环境中名为 '.." +"%1$s' 的变量。" -#: data.table.R:288 +#: data.table.R:283 #, c-format msgid "" "Internal error: DT[, ..var] should be dealt with by the branch above now." msgstr "内部错误: DT[, ..var]应该被分支处理中。" -#: data.table.R:290 +#: data.table.R:285 #, c-format msgid "" "Variable '%s' is not found in calling scope. Looking in calling scope " "because you set with=FALSE. Also, please use .. symbol prefix and remove " "with=FALSE." msgstr "" -"变量 '%s' 并没有存在于调用环境中。之所以在调用环境中搜索是因为你使用了with=FALSE。请" -"使用 .. 符号前缀并且移除 with=FALSE。" +"变量 '%s' 并没有存在于调用环境中。之所以在调用环境中搜索是因为你使用了" +"with=FALSE。请使用 .. 
符号前缀并且移除 with=FALSE。" -#: data.table.R:298 +#: data.table.R:293 #, c-format msgid "" "You have wrapped := with {} which is ok but then := must be the only thing " @@ -402,7 +422,7 @@ msgstr "" "试将 {} 置于 := 的RHS之上;比如,DT[,someCol:={tmpVar1<-...; tmpVar2<-...; " "tmpVar1*tmpVar2}" -#: data.table.R:318 +#: data.table.R:313 #, c-format msgid "" ":= with keyby is only possible when i is not supplied since you can't setkey " @@ -411,12 +431,12 @@ msgstr "" ":=和keyby的组合只有在i没有赋值下才合理存在。因为你不能在一个行的子集调用" "setkey。要么把keyby换成by或者是移除i" -#: data.table.R:320 +#: data.table.R:315 #, c-format msgid "nomatch isn't relevant together with :=, ignoring nomatch" msgstr "nomatch 并不和 := 有任何的相关,将忽略nomatch" -#: data.table.R:376 +#: data.table.R:371 #, c-format msgid "" "not-join '!' prefix is present on i but nomatch is provided. Please remove " @@ -424,7 +444,7 @@ msgid "" msgstr "" "not-join '!' 前缀在 i 中存在,但是 nomatch 也被提供了。需要移除nomatch。" -#: data.table.R:405 +#: data.table.R:400 #, c-format msgid "" "Operator := detected in i, the first argument inside DT[...], but is only " @@ -438,12 +458,12 @@ msgstr "" "(如错误地将 [DT , new_var := 5] 写作 DT[newvar := 5])。请再次检查语法是否正" "确。运行 trackback(),和 debugger() 来获取发生错误的行号。" -#: data.table.R:416 +#: data.table.R:411 #, c-format msgid "'%s' is not found in calling scope and it is not a column name either" msgstr "'%s' 既不存在于调用环境中,也非列名。" -#: data.table.R:419 +#: data.table.R:414 #, c-format msgid "" "'%s' is not found in calling scope, but it is a column of type %s. If you " @@ -455,16 +475,16 @@ msgstr "" "TRUE 值的列中的行,或者选择本身包含行号的列中的行,尝试 DT[(col)]、" "DT[DT$col],或者 DT[col==TRUE],它们表意非常清晰且效率高。" -#: data.table.R:422 +#: data.table.R:417 #, c-format msgid "" "%s. When the first argument inside DT[...] is a single symbol (e.g. " "DT[var]), data.table looks for var in calling scope." msgstr "" -"%s。 当DT[...]的第一个参数是一个单个的符号(e.g. DT[var]),data.table会在调用环境中" -"搜寻var。" +"%s。 当DT[...]的第一个参数是一个单个的符号(e.g. 
DT[var]),data.table会在调用" +"环境中搜寻var。" -#: data.table.R:434 +#: data.table.R:429 #, c-format msgid "" "i is invalid type (matrix). Perhaps in future a 2 column matrix could return " @@ -476,7 +496,7 @@ msgstr "" "DT (请参考问答集2.14的A[B])。如果你有需求,请将此问题汇报给data.table 问题追" "踪器或者是在FR中留下你的想法" -#: data.table.R:457 +#: data.table.R:452 #, c-format msgid "" "When i is a data.table (or character vector), the columns to join by must be " @@ -490,32 +510,32 @@ msgstr "" "setkey),或者是在x和i共用列的名字(比如,自然连接)。如果x有在内存被排序过,键" "(keyed)连接的速度会在非常大的数据上有较明显的提高。" -#: data.table.R:465 +#: data.table.R:460 #, c-format msgid "Attempting to do natural join but no common columns in provided tables" msgstr "尝试进行自然连接然而并没有找到表格中相同的列" -#: data.table.R:543 +#: data.table.R:538 #, c-format msgid "Internal error. Cannot by=.EACHI when joining to an index, yet" msgstr "内部错误:目前尚无法对索引(index)使用by=.EACH命令" -#: data.table.R:546 +#: data.table.R:541 #, c-format msgid "Internal error. irows has length in by=.EACHI" msgstr "内部错误:by=.EACHI 中 irows 有长度" -#: data.table.R:597 +#: data.table.R:592 #, c-format msgid "logical error. i is not a data.table, but 'on' argument is provided." msgstr "逻辑错误。当 i 并非一个 data.table时,不应提供'on'参数" -#: data.table.R:601 +#: data.table.R:596 #, c-format msgid "i has evaluated to type %s. Expecting logical, integer or double." msgstr "经计算 i 为 %s 类型。需要布尔类型,整型或浮点型。" -#: data.table.R:605 +#: data.table.R:600 #, c-format msgid "" "internal error: notjoin and which=NA (non-matches), huh? please provide " @@ -524,7 +544,7 @@ msgstr "" "内部错误:notjoin 并且 which=NA (非匹配)?请提供可重现的例子给问题追踪器" "(issue tracker)" -#: data.table.R:623 +#: data.table.R:618 #, c-format msgid "" "i evaluates to a logical vector length %d but there are %d rows. Recycling " @@ -532,23 +552,24 @@ msgid "" "rare convenience. Explicitly use rep(...,length=.N) if you really need to " "recycle." 
msgstr "" -"经计算 i 为长度为 %d 的逻辑向量,但数据框有 %d 行。循环补齐" -"循环补齐逻辑向量 i 的特性虽然在少数情况下使用方便,但这种行为会隐藏更多的 bug," -"因此现已不被允许。若确实需要循环补齐,请直接使用 rep(...,length=.N)。" +"经计算 i 为长度为 %d 的逻辑向量,但数据框有 %d 行。循环补齐逻辑向量 " +"i 的特性虽然在少数情况下使用方便,但这种行为会隐藏更多的 bug,因此现已不被允" +"许。若确实需要循环补齐,请直接使用 rep(...,length=.N)。" -#: data.table.R:626 +#: data.table.R:621 +#, c-format msgid "" "Please use nomatch=NULL instead of nomatch=0; see news item 5 in v1.12.0 " "(Jan 2019)" msgstr "" "请使用 nomatch=NULL 而非 nomatch=0;参见 v1.12.0 (2019年1月) 中更新条目 5" -#: data.table.R:639 +#: data.table.R:634 #, c-format msgid "Internal error: notjoin but byjoin or !integer or nomatch==NA" msgstr "内部错误。原因可能为:notjoin 而非 byjoin;非整数;nomatch 为空" -#: data.table.R:699 +#: data.table.R:694 #, c-format msgid "" "with=FALSE together with := was deprecated in v1.9.4 released Oct 2014. " @@ -560,41 +581,41 @@ msgstr "" "侧部分打上括号;例如,DT[,(myVar):=sum(b),by=a]对 myVar 中的列名进行赋值。输" "入 ?':=' 参看其他的例子。正如2014年的版本中所说明的,现在这种用法会出现警告。" -#: data.table.R:702 +#: data.table.R:697 #, c-format msgid "" "with=FALSE ignored, it isn't needed when using :=. See ?':=' for examples." msgstr "" "当使用 :=. 的时候,with=FALSE 是多余的,会被忽略。输入 ?':=' 参看例子。" -#: data.table.R:726 +#: data.table.R:721 #, c-format msgid "column(s) not removed because not found: %s" msgstr "列未被删除因为不存在:%s" -#: data.table.R:740 +#: data.table.R:735 #, c-format msgid "column(s) not found: %s" msgstr "列不存在: %s" -#: data.table.R:746 +#: data.table.R:741 #, c-format msgid "Item %d of j is %d which is outside the column number range [1,ncol=%d]" msgstr "j 中的第 %d 项的数值为 %d,已超出列索引的范围内1,ncol=%d]" -#: data.table.R:749 +#: data.table.R:744 #, c-format msgid "j mixes positives and negatives" msgstr "j 中同时存在正数和负数" -#: data.table.R:757 +#: data.table.R:752 #, c-format msgid "" "When with=FALSE, j-argument should be of type logical/character/integer " "indicating the columns to select." 
msgstr "当 with=FALSE,参数 j 必须为布尔型/字符型/整型之一,表征要选择的列。" -#: data.table.R:771 +#: data.table.R:766 #, c-format msgid "" "'by' contains .I but only the following are currently supported: by=.I, by=." @@ -602,33 +623,22 @@ msgid "" msgstr "" "'by' 包含 .I,但目前仅支持以下用法:by=.I、by=.(.I)、by=c(.I)、by=list(.I)" -#: data.table.R:795 +#: data.table.R:790 #, c-format msgid "by=c(...), key(...) or names(...) must evaluate to 'character'" msgstr "by=c(...), key(...) 或 names(...) 只接受 'character' " -#: data.table.R:805 +#: data.table.R:806 #, c-format -msgid "" -"'by' is a character vector length %d but one or more items include a comma. " -"Either pass a vector of column names (which can contain spaces, but no " -"commas), or pass a vector length 1 containing comma separated column names. " -"See ?data.table for other possibilities." -msgstr "" -"'by' 为长度为 %d 的字符串向量,但是其中一个或多个元素包含逗号。请传入一个由列" -"名组成的向量(可以包含空格,但是不能" "包含逗号),或传入一个长度为 1 的由逗" -"号分隔列名字符串入。可通过 ?data.table 查看其他可能选项。" - -#: data.table.R:812 msgid "At least one entry of by is empty" msgstr "至少有一个 by 的元素为空" -#: data.table.R:853 +#: data.table.R:847 #, c-format msgid "Internal error: irows isn't integer" msgstr "内部错误:irows 不是整型" -#: data.table.R:887 +#: data.table.R:881 #, c-format msgid "" "'by' appears to evaluate to column names but isn't c() or key(). Use " @@ -639,7 +649,7 @@ msgstr "" "by=list(...)。其他情况,by=eval(%s) 应该均可成功运行。执行上述检查是出于运行" "效率原因,以确保 data.table 能够检测计算时所需使用的列。" -#: data.table.R:898 +#: data.table.R:892 #, c-format msgid "" "'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' " @@ -648,7 +658,7 @@ msgstr "" "'by' 或者 'keyby' 参数只接受一个向量或由向量组成的列表(这里 'list'包含 data." "table 和 data.frame,这二者本质也是列表" -#: data.table.R:902 +#: data.table.R:896 #, c-format msgid "" "Column or expression %d of 'by' or 'keyby' is type '%s' which is not " @@ -658,11 +668,11 @@ msgid "" "toString), whilst taking care to maintain distinctness in the process." 
msgstr "" "列或表达式 %d 的 'by' 或 'keyby' 为 '%s' 类型,目前并不支持。如确实需要该功" -"能,请在 https://github.com/Rdatatable/data.table/issues/1597 中提出。" -"暂时的解决办法是,将列转换为支持的类型,例如 by=sapply(list_col, toString)," -"同时请确保在转换后其值仍保持不同。" +"能,请在 https://github.com/Rdatatable/data.table/issues/1597 中提出。暂时的" +"解决办法是,将列转换为支持的类型,例如 by=sapply(list_col, toString),同时请" +"确保在转换后其值仍保持不同。" -#: data.table.R:906 +#: data.table.R:900 #, c-format msgid "" "The items in the 'by' or 'keyby' list are length(s) %s. Each must be length " @@ -672,17 +682,17 @@ msgstr "" "在'by'或'keyby'列表中的项长度为 %s 。每一项的长度须均为%d,即应与 x (或经 i " "筛选后的子集)中所包含行数相同。" -#: data.table.R:940 +#: data.table.R:934 #, c-format msgid "Internal error: drop_dot passed %d items" msgstr "内部错误:drop_dot 传入的参数有 %d" -#: data.table.R:959 +#: data.table.R:953 #, c-format msgid "Item %d of the .() or list() passed to j is missing" msgstr "传递给j的 .() 或 list()中第%d项缺失" -#: data.table.R:965 +#: data.table.R:959 #, c-format msgid "" "j may not evaluate to the same number of columns for each group; if you're " @@ -692,7 +702,7 @@ msgstr "" "j 在每个组中可能不会计算出相同的列数;如果您确定该警告信息是错误的,请将分支" "逻辑放在 [ 之外以提高运行效率" -#: data.table.R:967 +#: data.table.R:961 #, c-format msgid "" "Different branches of j expression produced different auto-named columns: " @@ -702,12 +712,12 @@ msgid "" "call; (2) explicitly provide missing defaults for each branch in all cases; " "or (3) use the same name for each branch and re-name it in a follow-up call." 
msgstr "" -"j 表达式的不同分支产生了不同的自动命名列:%s;将使用“最后”一个得到的名字。" -"如果这是有意为之(如,您知道在给定的查询中只有一个分支会被使用,因为该分支" -"由函数参数控制),请(1)将该分支从调用中分离出来;(2)在所有情况下为每个分" -"支提供缺失的默认值;或(3)为每个分支使用相同的名字,并在后续调用中重新命名。" +"j 表达式的不同分支产生了不同的自动命名列:%s;将使用“最后”一个得到的名字。如" +"果这是有意为之(如,您知道在给定的查询中只有一个分支会被使用,因为该分支由函" +"数参数控制),请(1)将该分支从调用中分离出来;(2)在所有情况下为每个分支提" +"供缺失的默认值;或(3)为每个分支使用相同的名字,并在后续调用中重新命名。" -#: data.table.R:1030 +#: data.table.R:1031 #, c-format msgid "" "When .SDcols is a function, it is applied to each column; the output of this " @@ -717,44 +727,44 @@ msgstr "" "当传入 .SDcols 的参数为一个方程时,该方程将应用于每一列,并须返回单个非缺失值" "的布尔值指示该列是否应当被包含/排除。然而上述条件对如下列并不满足:%s" -#: data.table.R:1036 +#: data.table.R:1037 #, c-format msgid ".SDcols missing at the following indices: %s" msgstr ".SDcols 的如下位置为缺失值:%s" -#: data.table.R:1038 +#: data.table.R:1039 #, c-format msgid ".SDcols is a logical vector length %d but there are %d columns" msgstr ".SDcols 为长度为 %d 的逻辑向量,但共计只有 %d 列" -#: data.table.R:1044 +#: data.table.R:1045 #, c-format msgid ".SDcols is numeric but has both +ve and -ve indices" msgstr ".SDcols 为数值,但同时具有 +ve 和 -ve 索引" -#: data.table.R:1046 +#: data.table.R:1047 #, c-format msgid ".SDcols is numeric but out of bounds [1, %d] at: %s" msgstr ".SDcols 为数值但在 %2$s 超出了 [1, %1$d]" -#: data.table.R:1050 +#: data.table.R:1051 #, c-format msgid ".SDcols should be column numbers or names" msgstr ".SDcols 应为列数或是列名" -#: data.table.R:1052 +#: data.table.R:1053 #, c-format msgid "Some items of .SDcols are not column names: %s" msgstr ".SDcols 中的部份项目不是列名: %s" -#: data.table.R:1094 +#: data.table.R:1095 #, c-format msgid "" "This j doesn't use .SD but .SDcols has been supplied. Ignoring .SDcols. See ?" "data.table." msgstr "此处 j 不使用 .SD 但提供了 .SDcols ,因此忽略 .SDcols详见 ?data.table" -#: data.table.R:1110 +#: data.table.R:1111 #, c-format msgid "" ".SD is locked. 
Using := in .SD's j is reserved for possible future use; a " @@ -764,32 +774,47 @@ msgstr "" ".SD 已锁定,在 .SD 的 j 中使用 := 进行分组修改是较不直观的方式此功能被保留" "以供未来使用请直接在 j 中使用 := 依照引用进行分组修改" -#: data.table.R:1118 data.table.R:1130 +#: data.table.R:1119 #, c-format msgid "In %s(col1=val1, col2=val2, ...) form, all arguments must be named." msgstr "在 %s(col1=val1, col2=val2, ...) 中,所有参数必须被指名" -#: data.table.R:1135 +#: data.table.R:1136 +#, c-format +msgid "" +"In %s(col1=val1, col2=val2, ...) form, all arguments must be named, but the " +"last argument has no name. Did you forget a trailing comma?" +msgstr "在 %s(col1=val1, col2=val2, ...) 中,所有参数必须被指名,但最后参数没名。" +"您是否忘记一个尾随逗号?" + +#: data.table.R:1138 +#, c-format +msgid "" +"In %s(col1=val1, col2=val2, ...) form, all arguments must be named, but " +"these arguments lack names: %s." +msgstr "在 %s(col1=val1, col2=val2, ...) 中,所有参数必须被指名,但这些参数没有名: %s。" + +#: data.table.R:1145 #, c-format msgid "" "LHS of := must be a symbol, or an atomic vector (column names or positions)." msgstr ":= 的 LHS 必须是符号或是原子向量(列名或是列的位置)" -#: data.table.R:1140 +#: data.table.R:1150 #, c-format msgid "" "LHS of := appears to be column positions but are outside [1,ncol] range. New " "columns can only be added by name." 
msgstr ":= 的 LHS 是列的位置但超出了 [1,ncol] 的范围新列只能以名称的方式新增" -#: data.table.R:1143 +#: data.table.R:1153 #, c-format msgid "" "LHS of := isn't column names ('character') or positions ('integer' or " "'numeric')" msgstr ":= 的 LHS 不是列名('字符')或列的位置('整数'或'数值')" -#: data.table.R:1174 +#: data.table.R:1184 #, c-format msgid "" "Invalid .internal.selfref detected and fixed by taking a (shallow) copy of " @@ -808,7 +833,7 @@ msgstr "" "及 ?setattr如果以上讯息无法提供帮助,请回报你的案例至 data.table 问题追踪以助" "于修复根本原因或改进本讯息" -#: data.table.R:1205 +#: data.table.R:1215 #, c-format msgid "" "Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] " @@ -817,17 +842,17 @@ msgstr "" "无法指定配置不足的递归索引列表-- L[[i]][,:=] 语法只有在 i 长度为1时有效,但它" "的長度是 %d" -#: data.table.R:1207 +#: data.table.R:1217 #, c-format msgid "Internal error -- item '%s' not found in names of list" msgstr "内部错误 -- 未能在列表名称中找到名为 '%s' 的项" -#: data.table.R:1236 data.table.R:1249 +#: data.table.R:1246 data.table.R:1259 #, c-format msgid "Internal error -- column(s) not found: %s" msgstr "内部错误 -- 找不到此列: %s" -#: data.table.R:1261 +#: data.table.R:1271 #, c-format msgid "" "strptime() usage detected and wrapped with as.POSIXct(). This is to minimize " @@ -836,24 +861,24 @@ msgid "" "needed internally) to avoid this warning." msgstr "" "侦测到使用 strptime() ,已用 as.POSIXct() 包裹这么做是为了尽量避免列被指定为 " -"POSIXltPOSIXlt用40个以上的位元组储存日期(相较于 POSIXct 只用8位元组)请使用 " +"POSIXlt用40个以上的位元组储存日期(相较于 POSIXct 只用8位元组)请使用 " "as.POSIXct() 以避免本警告 (此函数会根据需求在内部调用 strptime())" -#: data.table.R:1278 +#: data.table.R:1289 #, c-format msgid "" "Variable '%s' is not found in calling scope. Looking in calling scope " "because this symbol was prefixed with .. in the j= parameter." msgstr "" -"未能在调用环境中找到变量 '%s'。在调用环境中查询该变量的原因是因为在 j= 的传入参" -"数中该符号以 .. 为前缀。" +"未能在调用环境中找到变量 '%s'。在调用环境中查询该变量的原因是因为在 j= 的传入" +"参数中该符号以 .. 
为前缀。" -#: data.table.R:1290 +#: data.table.R:1301 #, c-format msgid "Internal error: xcolAns does not pass checks: %d/%d/%d/%s" msgstr "内部错误 : xcolAns 无法通过检查: %d/%d/%d/%s" -#: data.table.R:1300 +#: data.table.R:1311 #, c-format msgid "" "Internal error: irows is NULL when making join result at R level. Should no " @@ -862,7 +887,7 @@ msgstr "" "内部错误 : 在 R 生成连接结果时,irows为 NULL 我们已使用了 CsubsetDT,现在不应" "该再发生了" -#: data.table.R:1364 +#: data.table.R:1375 #, c-format msgid "" "j (the 2nd argument inside [...]) is a single symbol but column name '%1$s' " @@ -874,13 +899,13 @@ msgstr "" "环境中使用变量选择列,请尝试 DT[, ..%1$s]。.. 前缀表示上一级,类似于文件系统" "路径。" -#: data.table.R:1419 +#: data.table.R:1430 #, c-format msgid "" "Internal error: j has created a data.table result containing a NULL column" msgstr "内部错误 : j 创建了一个有列为 NULL 的 data.table" -#: data.table.R:1429 +#: data.table.R:1440 #, c-format msgid "" "The column '.N' can't be grouped because it conflicts with the special .N " @@ -889,7 +914,7 @@ msgstr "" "无法对 '.N' 列进行分组,因为与 data.table 特有的 .N 变量冲突请先尝试 " "setnames(DT,'.N','N')" -#: data.table.R:1430 +#: data.table.R:1441 #, c-format msgid "" "The column '.I' can't be grouped because it conflicts with the special .I " @@ -898,27 +923,27 @@ msgstr "" "无法对 '.I' 列进行分组,因为与 data.table 特有的 .I 变量冲突请先尝试 " "setnames(DT,'.I','I')" -#: data.table.R:1457 +#: data.table.R:1469 #, c-format msgid "logical error. 
i is not data.table, but mult='all' and 'by'=.EACHI" msgstr "逻辑错误: i 不是data.table,但 mult='all' 及 'by'=.EACHI" -#: data.table.R:1480 +#: data.table.R:1492 #, c-format msgid "Internal error: by= is missing" msgstr "内部错误 : 缺少 by=" -#: data.table.R:1521 +#: data.table.R:1533 #, c-format msgid "Internal error: byindex not the index name" msgstr "内部错误 : byindex 不是索引(index)名称" -#: data.table.R:1524 +#: data.table.R:1536 #, c-format msgid "Internal error: byindex not found" msgstr "内部错误 : 找不到 byindex" -#: data.table.R:1797 +#: data.table.R:1790 #, c-format msgid "" "Unable to optimize call to mean() and could be very slow. You must name 'na." @@ -929,12 +954,12 @@ msgstr "" "果您直接使用 mean(x,TRUE)会被认定为 trim=TRUE,trim 是 mean() 中尚未被优化的" "第二顺位参数" -#: data.table.R:1834 +#: data.table.R:1827 #, c-format msgid "Internal error: length(irows)!=length(o__)" msgstr "内部错误:length(irows)!=length(o__)" -#: data.table.R:1935 +#: data.table.R:1936 #, c-format msgid "" "The setkey() normally performed by keyby= has been skipped (as if by= was " @@ -946,83 +971,84 @@ msgstr "" "执行的`setkey()` -- 代码只执行了`by=`。若要避免此警告, 请使用`by=`, 或者提供" "`keyby=`现有的列名" -#: data.table.R:1950 +#: data.table.R:1951 #, c-format msgid "Internal error: jvnames is length %d but ans is %d and bynames is %d" msgstr "内部错误:jvnames 长度为 %d,但是 ans 长度为 %d 且 bynames 为 %d" -#: data.table.R:2020 +#: data.table.R:2021 #, c-format msgid "rownames and rownames.value cannot both be used at the same time" msgstr "rownames和rownames.value 不能同时使用" -#: data.table.R:2025 +#: data.table.R:2026 #, c-format msgid "" "length(rownames)==%d but nrow(DT)==%d. The rownames argument specifies a " "single column name or number. Consider rownames.value= instead." 
-msgstr "length(rownames)==%d 但 nrow(DT)==%d。 rownames参数为单一列名或单一数值。请考虑使用`rownames.values=`。" +msgstr "" +"length(rownames)==%d 但 nrow(DT)==%d。 rownames参数为单一列名或单一数值。请考" +"虑使用`rownames.values=`。" -#: data.table.R:2029 +#: data.table.R:2030 #, c-format msgid "" "length(rownames)==0 but should be a single column name or number, or NULL" msgstr "" "行名长度为零,`length(rownames)==0`,但应该为单一列名,单一数值,或NULL" -#: data.table.R:2033 +#: data.table.R:2034 #, c-format msgid "" "rownames is TRUE but key has multiple columns %s; taking first column x[,1] " "as rownames" msgstr "rownames 为 TRUE但键(key)为多个列 %s;将使用第一列 x[,1] 为行名。" -#: data.table.R:2043 +#: data.table.R:2044 #, c-format msgid "'%s' is not a column of x" msgstr "'%s' 不是x的一个列" -#: data.table.R:2049 +#: data.table.R:2050 #, c-format msgid "" "as.integer(rownames)==%d which is outside the column number range [1," "ncol=%d]." msgstr "as.integer(rownames)==%d 不在列索引范围内 [1,ncol%d]。" -#: data.table.R:2054 +#: data.table.R:2055 #, c-format msgid "length(rownames.value)==%d but should be nrow(x)==%d" msgstr "length(rownames.value)==%d 但应该是 nrow(x)==%d" -#: data.table.R:2116 +#: data.table.R:2115 #, c-format msgid "" "Internal error: as.matrix.data.table length(X)==%d but a dimension is zero" -msgstr "" -"内部错误: as.matrix.data.table length(X)==%d 但有一个维度为零" +msgstr "内部错误: as.matrix.data.table length(X)==%d 但有一个维度为零" -#: data.table.R:2152 +#: data.table.R:2151 #, c-format msgid "" "When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j" msgstr "当i以`DT[i]<-value`的形式出现,不需要提供j" -#: data.table.R:2162 +#: data.table.R:2161 #, c-format msgid "j must be an atomic vector, see ?is.atomic" msgstr "j必须是原子向量,请参考 ?is.atomic" -#: data.table.R:2163 +#: data.table.R:2162 #, c-format msgid "NA in j" msgstr "j里有NA" -#: data.table.R:2169 +#: data.table.R:2168 #, c-format msgid "j must be vector of column name or positions" msgstr "j 必须是列名(column name)或列位(column position)的向量" -#: data.table.R:2170 +#: data.table.R:2169 #, c-format msgid 
"" "Attempt to assign to column position greater than ncol(x). Create the column " @@ -1031,7 +1057,7 @@ msgstr "" "试图指定至比ncol(x)还大的列位(column position)。请指定至列名(column name)。一" "般来说用列名取代列位能解决大部分错误。" -#: data.table.R:2237 +#: data.table.R:2236 #, c-format msgid "" "data.table inherits from data.frame (from v1.5), but this data.table does " @@ -1042,42 +1068,42 @@ msgstr "" "table是不是手动创建的(可能创建时使用了`structure()`而非`data.table()`),或者" "是不是使用了更早的data.table版本创建后存到硬盘了。" -#: data.table.R:2246 +#: data.table.R:2245 #, c-format msgid "attempting to assign invalid object to dimnames of a data.table" msgstr "试图指定无效对象给data.table的维度名(dimnames)" -#: data.table.R:2247 +#: data.table.R:2246 #, c-format msgid "data.tables do not have rownames" msgstr "data.tables没有rownames" -#: data.table.R:2248 data.table.R:2626 +#: data.table.R:2247 data.table.R:2616 #, c-format msgid "Can't assign %d names to a %d-column data.table" msgstr "无法将 %d 个名字赋值给一个包含 %d 列的 data.table" -#: data.table.R:2327 +#: data.table.R:2311 #, c-format msgid "'subset' must evaluate to logical" msgstr "'subset' 必须为logical" -#: data.table.R:2370 +#: data.table.R:2354 #, c-format msgid "Argument 'invert' must be logical TRUE/FALSE" msgstr " 'invert' 的参数是逻辑值,必须是 TRUE/FALSE" -#: data.table.R:2411 +#: data.table.R:2395 #, c-format msgid "x argument must be a data.table" msgstr "参数 x 必须是一个 data.table" -#: data.table.R:2416 +#: data.table.R:2400 #, c-format msgid "group length is 0 but data nrow > 0" msgstr "分组长度为0,但 data nrow > 0" -#: data.table.R:2418 +#: data.table.R:2402 #, c-format msgid "" "passing 'f' argument together with 'by' is not allowed, use 'by' when split " @@ -1086,48 +1112,48 @@ msgstr "" "不可同时指定参数 'f' 和参数 'by' ,当利用 data.table的纵列进行数据分割时,请使" "用参数 'by';当利用外部因子进行数据分割时,请使用参数 'f'" -#: data.table.R:2422 +#: data.table.R:2410 #, c-format msgid "Either 'by' or 'f' argument must be supplied" msgstr "必须提供参数 'by' 或参数 'f'" -#: data.table.R:2424 +#: data.table.R:2412 #, c-format msgid "Column '.ll.tech.split' is reserved for 
split.data.table processing" msgstr "为 split.data.table 进程,纵列 '.ll.tech.split' 被保存" -#: data.table.R:2425 +#: data.table.R:2413 #, c-format msgid "Column '.nm.tech.split' is reserved for split.data.table processing" msgstr "为 split.data.table 进程,纵列 '.nm.tech.split' 被保存" -#: data.table.R:2426 +#: data.table.R:2414 #, c-format msgid "Argument 'by' must refer to column names in x" msgstr "参数 'by' 只适用于 x 中的列名" -#: data.table.R:2427 +#: data.table.R:2415 #, c-format msgid "" "Argument 'by' must refer only to atomic-type columns, but the following " "columns are non-atomic: %s" msgstr "参数 'by' 只适用于原子类型的纵列,但现在关联的纵列不是原子类型: %s" -#: data.table.R:2557 +#: data.table.R:2547 #, c-format msgid "" "x is not a data.table|frame. Shallow copy is a copy of the vector of column " "pointers (only), so is only meaningful for data.table|frame" msgstr "" -"浅拷贝(shallow copy)只是列指针向量的拷贝,因此仅对 data.table或data.frame 有意义,而 x " -"不是 data.table或data.frame" +"浅拷贝(shallow copy)只是列指针向量的拷贝,因此仅对 data.table或data.frame " +"有意义,而 x 不是 data.table或data.frame" -#: data.table.R:2566 +#: data.table.R:2556 #, c-format msgid "setalloccol attempting to modify `*tmp*`" msgstr "setalloccol 试图修改 '*tmp*'" -#: data.table.R:2601 +#: data.table.R:2591 #, c-format msgid "" "Input is a length=1 logical that points to the same address as R's global " @@ -1137,94 +1163,95 @@ msgstr "" "输入值是一个指向与R全局值相同位置的长度为1的逻辑值。因此,该属性是通过副本," "而不是reference 的形式设置。您需要将结果分配回一个变量,参看 issue #1281" -#: data.table.R:2616 +#: data.table.R:2606 #, c-format msgid "x is not a data.table or data.frame" msgstr "x 不是 data.table 或 data.frame." -#: data.table.R:2618 +#: data.table.R:2608 #, c-format msgid "x has %d columns but its names are length %d" msgstr "x有%d列,但其列名的长度为%d" -#: data.table.R:2625 +#: data.table.R:2615 #, c-format msgid "Passed a vector of type '%s'. Needs to be type 'character'." 
msgstr "传入了一个类型为 '%s' 的向量。须为 'character' 类型。" -#: data.table.R:2638 +#: data.table.R:2628 #, c-format msgid "'new' is not a character vector or a function" msgstr "'new' 既不是特征向量也不是 function" -#: data.table.R:2640 +#: data.table.R:2630 #, c-format msgid "NA in 'new' at positions %s" msgstr "在 'new' 中有NA值 %s" -#: data.table.R:2641 +#: data.table.R:2631 #, c-format msgid "Some duplicates exist in 'old': %s" msgstr "在'old' 中存在重复名称: %s" -#: data.table.R:2643 +#: data.table.R:2633 #, c-format msgid "'old' is type %s but should be integer, double or character" msgstr "'old' 为 %s 类型,但只接受整型、浮点型或者字符串型" -#: data.table.R:2644 +#: data.table.R:2634 #, c-format msgid "'old' is length %d but 'new' is length %d" msgstr "'old' 长度为 %d 但 'new' 的长度为 %d" -#: data.table.R:2645 +#: data.table.R:2635 #, c-format msgid "NA (or out of bounds) in 'old' at positions %s" msgstr "NA(或超出界限)出现在'old' 的位置 %s" -#: data.table.R:2648 +#: data.table.R:2638 #, c-format msgid "" "Item %d of 'old' is '%s' which appears several times in column names. Just " "the first will be changed. There are %d other items in 'old' that are also " "duplicated in column names." -msgstr "'old' 中的第 %d 项 '%s' 在列名中重复出现。仅第一个会被修改。在 'old' " -"中仍有 %d 项在列名中重复出现。" +msgstr "" +"'old' 中的第 %d 项 '%s' 在列名中重复出现。仅第一个会被修改。在 'old' 中仍有 " +"%d 项在列名中重复出现。" -#: data.table.R:2656 +#: data.table.R:2646 #, c-format msgid "" "Items of 'old' not found in column names: %s. Consider skip_absent=TRUE." msgstr "在 'old' 中未找到如下列名:%s。请考虑设置 skip_absent=TRUE。" -#: data.table.R:2666 +#: data.table.R:2656 #, c-format msgid "Internal error: length(i)!=length(new)" msgstr "内部错误:length(i)!=length(new)" -#: data.table.R:2695 +#: data.table.R:2685 #, c-format msgid "" "x has some duplicated column name(s): %s. Please remove or rename the " "duplicate(s) and try again." 
msgstr "x 中有如下重复的列名:%s。请移除或者重命名重复项后重试。" -#: data.table.R:2697 +#: data.table.R:2687 #, c-format msgid "Provide either before= or after= but not both" msgstr "提供 before= 或 after= ,但两者不能同时存在" -#: data.table.R:2699 +#: data.table.R:2689 #, c-format msgid "before=/after= accept a single column name or number, not more than one" msgstr "before=/after= 只接受一个列名或列号,不能多于一个" -#: data.table.R:2754 +#: data.table.R:2744 #, c-format msgid "Input is %s but should be a plain list of items to be stacked" msgstr "项是 %s 但应该叠加普通列表项" -#: data.table.R:2758 +#: data.table.R:2748 #, c-format msgid "" "idcol must be a logical or character vector of length 1. If logical TRUE the " @@ -1232,12 +1259,12 @@ msgid "" msgstr "" "idcol必须为逻辑型向量或长度为1的字符型向量.如果逻辑值为TRUEid 列会命名为'.id'" -#: data.table.R:2763 +#: data.table.R:2753 #, c-format msgid "use.names=NA invalid" msgstr "use.names=NA 无效赋值" -#: data.table.R:2765 +#: data.table.R:2755 #, c-format msgid "" "use.names='check' cannot be used explicitly because the value 'check' is new " @@ -1247,42 +1274,54 @@ msgstr "" "请勿直接使用use.names='check',因为值'check'为新增在v1.12.2中后续有所变化,仅" "用表示默认模式详见 ?rbindlist" -#: data.table.R:2780 +#: data.table.R:2770 #, c-format msgid "" "Check that is.data.table(DT) == TRUE. Otherwise, :=, `:=`(...) and let(...) " "are defined for use in j, once only and in particular ways. See help(\":=\")." msgstr "" -"检查是否is.data.table(DT) == TRUE,否则,:=, `:=`(...), 和 let(...) 为被界定在j使用,仅一" -"次以特别的方式使用,详见help(\":=\")" +"检查是否is.data.table(DT) == TRUE,否则,:=, `:=`(...), 和 let(...) 为被界定在j" +"使用,仅一次以特别的方式使用,详见help(\":=\")" -#: data.table.R:2786 +#: data.table.R:2775 +#, c-format +msgid "J() called outside of [.data.table. J() is only intended for use in i." +msgstr "J() 在 [.data.table. 外被使用。J() 仅适用于 i 中。" + +#: data.table.R:2779 +#, c-format +msgid "" +".() called outside of [.data.table. .() is only intended as an alias for " +"list() inside DT[...]." +msgstr ".() 在 [.data.table 外被使用。.() 仅适用于 DT[...] 
内 list() 的别名。" + +#: data.table.R:2785 #, c-format msgid "" "setDF only accepts data.table, data.frame or list of equal length as input" msgstr "setDF仅允许data.table,data.frame或者同样长度的列表作为输入" -#: data.table.R:2787 +#: data.table.R:2786 #, c-format msgid "rownames contains duplicates" msgstr "行名含有重复" -#: data.table.R:2794 data.table.R:2805 data.table.R:2828 +#: data.table.R:2793 data.table.R:2804 data.table.R:2827 #, c-format msgid "rownames incorrect length; expected %d names, got %d" msgstr "行名长度不正确;需要 %d 名 受到 %d" -#: data.table.R:2813 +#: data.table.R:2812 #, c-format msgid "All elements in argument 'x' to 'setDF' must be of same length" msgstr "'setDF'中的参数'x'的所有元素必须同等长度" -#: data.table.R:2842 +#: data.table.R:2841 #, c-format msgid "Cannot find symbol %s" msgstr "无法找到符号 %s" -#: data.table.R:2849 +#: data.table.R:2848 #, c-format msgid "" "Cannot convert '%1$s' to data.table by reference because binding is locked. " @@ -1293,43 +1332,10 @@ msgid "" msgstr "" "无法通过引用直接将 '%1$s' 转换成 data.table,因其与所在环境已绑定。 这有很大" "可能是因为 '%1$s' 存在于一个包(或环境)中,该包(或环境)已被锁定从而无法修" -"改其绑定的变量。可以尝试将该对象复制到你的现有环境中,如:var <- copy(var)" -"然后再运行 setDT。" - -#: data.table.R:2856 -#, c-format -msgid "" -"Some columns are a multi-column type (such as a matrix column): %s. setDT " -"will retain these columns as-is but subsequent operations like grouping and " -"joining may fail. Please consider as.data.table() instead which will create " -"a new column for each embedded column." -msgstr "" -"某些列为包含多列的类型(如矩阵列):%s。setDT 会保留这些列,但后续的操作如分" -"组(grouping)和联接(joining)等操作可能会失败。请考虑使用 as.data.table() 因" -"为它会为每个内嵌列建立一个新列。" - -#: data.table.R:2888 -#, c-format -msgid "" -"Column %d is of POSIXlt type. Please convert it to POSIXct using as.POSIXct " -"and run setDT again. We do not recommend use of POSIXlt at all because it " -"uses 40 bytes to store one date." 
-msgstr "" -"第 %d 列属于 POSIXlt 类型。请使用 as.POSIXct 将其转换为 POSIXct 类型并再次运行" -"setDT。我们非常不推荐使用 POSIXlt 类型,因为它需使用 40 个字节来存储一个日期。" +"改其绑定的变量。可以尝试将该对象复制到你的现有环境中,如:var <- copy(var)然" +"后再运行 setDT。" -#: data.table.R:2894 -#, c-format -msgid "" -"All elements in argument 'x' to 'setDT' must be of same length, but the " -"profile of input lengths (length:frequency) is: %s\n" -"The first entry with fewer than %d entries is %d." -msgstr "" -"'setDT' 参数 'x' 中所有的元素均须为同一长度,但输入元素的长度频数表(长度:频" -"数)为:%s\n" -"x中第一个长度比 %d 少的元素为第 %d 项。" - -#: data.table.R:2911 +#: data.table.R:2901 #, c-format msgid "" "Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'" @@ -1337,27 +1343,27 @@ msgstr "" "'setDT' 参数 'x' 应为一个列表('list'),数据框('data.frame')或 'data." "table'" -#: data.table.R:2926 +#: data.table.R:2916 #, c-format msgid "Item '%s' not found in names of input list" msgstr "元素 '%s' 不存在于输入列表的元素名中" -#: data.table.R:2951 data.table.R:2976 +#: data.table.R:2941 data.table.R:2966 #, c-format msgid "'prefix' must be NULL or a character vector of length 1." msgstr "'prefix' 必须为 空(NULL)或者长度为 1 的字符向量。" -#: data.table.R:2954 data.table.R:2979 +#: data.table.R:2944 data.table.R:2969 #, c-format msgid "x is a single vector, non-NULL 'cols' doesn't make sense." msgstr "x 是单个向量,非空的 'cols' 没有意义。" -#: data.table.R:2958 data.table.R:2983 +#: data.table.R:2948 data.table.R:2973 #, c-format msgid "x is a list, 'cols' cannot be 0-length." msgstr "x 是一个列表(list),'cols' 长度不能为0。" -#: data.table.R:3092 +#: data.table.R:3160 #, c-format msgid "" "RHS of %s is length %d which is not 1 or nrow (%d). For robustness, no " @@ -1366,13 +1372,13 @@ msgstr "" "%s 的右手侧 (RHS) 长度为 %d, 其非 1 或 总行数 nrow (%d)。考虑到程序的稳健性," "只有在右侧元素长度为 1 的情况下,我们才会对之进行循环。考虑改用 %%in%% 。" -#: data.table.R:3120 +#: data.table.R:3188 #, c-format msgid "" "Internal error in .isFastSubsettable. 
Please report to data.table developers" msgstr ".isFastSubsettable 产生了内部错误。请向 data.table 开发者报告" -#: data.table.R:3207 +#: data.table.R:3275 #, c-format msgid "" "'on' argument should be a named atomic vector of column names indicating " @@ -1381,29 +1387,30 @@ msgstr "" "'on' 参数应为一个有子项名字的原子列名向量,指明'i' 中的哪些列应与 'x' 中的哪" "些列联接。" -#: data.table.R:3248 +#: data.table.R:3316 #, c-format msgid "" "Found more than one operator in one 'on' statement: %s. Please specify a " "single operator." -msgstr "在一个 'on' 语句中出现了多个操作符(operator):%s。请仅指定单个操作" -"符。" +msgstr "" +"在一个 'on' 语句中出现了多个操作符(operator):%s。请仅指定单个操作符。" -#: data.table.R:3271 +#: data.table.R:3339 #, c-format msgid "" "'on' contains no column name: %s. Each 'on' clause must contain one or two " "column names." msgstr "'on' 语句中包含非列名的项:%s。每个'on' 语句中必须包含一个或两个列名。" -#: data.table.R:3273 +#: data.table.R:3341 #, c-format msgid "" "'on' contains more than 2 column names: %s. Each 'on' clause must contain " "one or two column names." -msgstr "'on' 语句包含超过 2 个列名:%s。每个'on' 语句中必须包含一个或两个列名。" +msgstr "" +"'on' 语句包含超过 2 个列名:%s。每个'on' 语句中必须包含一个或两个列名。" -#: data.table.R:3278 +#: data.table.R:3346 #, c-format msgid "Invalid join operators %s. Only allowed operators are %s." msgstr "无效联接操作符 %s。只允许如下操作符: %s。" @@ -1452,9 +1459,9 @@ msgid "" "In the next version, this warning will become an error." msgstr "" "data.table 中的 %1$s 泛型方法接收到了一个 %2$s 对象,将尝试重定向使用" -"reshape2 包中相关的方法。请注意 reshape2 包已经被其实包取代且不再积极开发," -"故该重定向目前也不推荐使用。请手动执行该重定向,如 reshape2::%1$s(%3$s)。" -"下一版本中,本警告将变成一个错误。" +"reshape2 包中相关的方法。请注意 reshape2 包已经被其他包取代且不再积极开发,故" +"该重定向目前也不推荐使用。请手动执行该重定向,如 reshape2::%1$s(%3$s)。下一版" +"本中,本警告将变成一个错误。" #: fcast.R:31 #, c-format @@ -1463,17 +1470,17 @@ msgid "" "b ~ c." msgstr "无效的公式。所转换的公式的形式应为LHS ~ RHS,如a + b ~ c。" -#: fcast.R:36 +#: fcast.R:38 #, c-format msgid "data.table to cast must have unique column names" msgstr "要转换的data.table必须具有唯一的列名" -#: fcast.R:60 +#: fcast.R:83 #, c-format msgid "value.var values %s are not found in 'data'."
msgstr "value.var 的值 %s 无法在 'data' 中找到。" -#: fcast.R:76 +#: fcast.R:99 #, c-format msgid "" "When 'fun.aggregate' and 'value.var' are both lists, 'value.var' must be " @@ -1482,32 +1489,58 @@ msgstr "" "当 'fun.aggregate' 和 'value.var' 同为 list时, 'value.var' 的长度必须为 1 或 " "length(fun.aggregate)。" -#: fcast.R:109 +#: fcast.R:132 #, c-format msgid "'data' must be a data.table." msgstr "'data' 必须为 data.table" -#: fcast.R:111 +#: fcast.R:134 #, c-format msgid "'drop' must be logical TRUE/FALSE" msgstr "'drop' 必须为逻辑 TRUE/FALSE" -#: fcast.R:128 +#: fcast.R:136 +#, c-format +msgid "Argument 'value.var.in.dots' should be logical TRUE/FALSE" +msgstr "参数 'value.var.in.dots' 应为逻辑值 TRUE 或 FALSE" + +#: fcast.R:138 +#, c-format +msgid "" +"Arguments 'value.var.in.LHSdots', 'value.var.in.RHSdots' should be logical " +"TRUE/FALSE" +msgstr "参数 'value.var.in.LHSdots' 和 'value.var.in.RHSdots' 应为逻辑值 TRUE 或 FALSE" + +#: fcast.R:155 #, c-format msgid "Column [%s] not found or of unknown type." msgstr "列 [%s]] 无法找到或其类型未知。" -#: fcast.R:143 +#: fcast.R:170 #, c-format msgid "Columns specified in formula can not be of type list" msgstr "在formula中指定的列不应为列表类型" -#: fcast.R:159 +#: fcast.R:185 #, c-format -msgid "Aggregate function missing, defaulting to 'length'" -msgstr "聚合函数缺失,将默认采用'length'" +msgid "" +"'fun.aggregate' is NULL, but found duplicate row/column combinations, so " +"defaulting to length(). That is, the variables %s used in 'formula' do not " +"uniquely identify rows in the input 'data'. In such cases, 'fun.aggregate' " +"is used to derive a single representative value for each combination in the " +"output data.table, for example by summing or averaging (fun.aggregate=sum or " +"fun.aggregate=mean, respectively). Check the resulting table for values " +"larger than 1 to see which combinations were not unique. See ?dcast.data." +"table for more details." 
+msgstr "" +"'fun.aggregate' 为 NULL,但发现重复的行/列组合,因此默认为 length()。也就是说," +"'formula' 中使用的变量 %s 并不单独标识输入'data'中的行。在这种情况下,'fun.aggregate' " +"用于为输出 data.table 中的每个组合导出单个代表值,例如通过求和或平均(分别为 " +"fun.aggregate=sum 或 fun.aggregate=mean)。请检查结果表中是否有大于 1 的值," +"来查看哪些组合不唯一。有关更多详细信息,请参阅 ?dcast.data.table。" -#: fcast.R:165 +#: fcast.R:194 +#, c-format msgid "" "Aggregating function(s) should take vector inputs and return a single value " "(length=1). However, function(s) returns length!=1. This value will have to " @@ -1515,11 +1548,11 @@ msgid "" "Either override by setting the 'fill' argument explicitly or modify your " "function to handle this case appropriately." msgstr "" -"聚合函数应接收向量输入并返回单个值(长度为1)。然而,当前函数返回的长度不为1。" -"因为该单个值将用于填充任何缺失的组合,故必须为长度为1。你可以通过显式设置" -"'fill' 参数来覆盖该值,或修改你的函数以处理该情况。" +"聚合函数应接收向量输入并返回单个值(长度为1)。然而,当前函数返回的长度不为" +"1。因为该单个值将用于填充任何缺失的组合,故必须为长度为1。你可以通过显式设" +"置'fill' 参数来覆盖该值,或修改你的函数以处理该情况。" -#: fcast.R:222 +#: fcast.R:253 #, c-format msgid "Internal error -- empty rhsnames in dcast; please report" msgstr "内部错误:dcast 中 rhsnames 为空;请报告" @@ -1535,17 +1568,17 @@ msgid "" "reshape2::%1$s(%3$s). In the next version, this warning will become an error." msgstr "" "data.table 中的 %1$s 泛型方法接收到了一个 %2$s 对象,将尝试重定向使用" -"reshape2 包中相关的方法。请注意 reshape2 包已经被其实包取代且不再积极开发," -"故该重定向目前也不推荐使用。要在 data.table 和 reshape2 包同时载入的情况下" -"继续使用 reshape2 包中的 melt 方法,如 melt.list,你可指定命名空间,如" -"reshape2::%1$s(%3$s)。下一版本中,本警告将变成一个错误。" +"reshape2 包中相关的方法。请注意 reshape2 包已经被其他包取代且不再积极开发,故" +"该重定向目前也不推荐使用。要在 data.table 和 reshape2 包同时载入的情况下继续" +"使用 reshape2 包中的 melt 方法,如 melt.list,你可指定命名空间,如reshape2::" +"%1$s(%3$s)。下一版本中,本警告将变成一个错误。" #: fmelt.R:28 #, c-format msgid "Input patterns must be of type character." msgstr "输入的 patterns 必须是字符类型。" -#: fmelt.R:32 +#: fmelt.R:31 #, c-format msgid "Pattern(s) not found: [%s]" msgstr "未找到下列 pattern:[%s]" @@ -1563,16 +1596,14 @@ msgstr "" msgid "" "group names specified in ...
conflict with measure argument names; please " "fix by changing group names: %s" -msgstr "" -"在 ... 中指定的组名与 measure 参数名冲突;请通过修改组名来解决:%s" +msgstr "在 ... 中指定的组名与 measure 参数名冲突;请通过修改组名来解决:%s" #: fmelt.R:60 #, c-format msgid "" "each ... argument to measure must be a function with at least one argument, " "problem: %s" -msgstr "" -"measure 中的每个 ... 参数必须为至少带有一个参数的函数,问题:%s" +msgstr "measure 中的每个 ... 参数必须为至少带有一个参数的函数,问题:%s" #: fmelt.R:74 #, c-format @@ -1624,15 +1655,14 @@ msgstr "" #, c-format msgid "" "pattern must contain at least one capture group (parenthesized sub-pattern)" -msgstr "" -"pattern 必须包含至少一个捕获组(括号子模式)" +msgstr "pattern 必须包含至少一个捕获组(括号子模式)" -#: fmelt.R:123 +#: fmelt.R:124 #, c-format msgid "sep must be character string" msgstr "sep 必须是一个字符串" -#: fmelt.R:129 +#: fmelt.R:130 #, c-format msgid "" "each column name results in only one item after splitting using sep, which " @@ -1643,7 +1673,7 @@ msgstr "" "使用 sep 分割后,每个列名只产生一个条目,这意味着所有列都将被融合;请通过直接" "指定所有列同时不使用 measure 来解决,或使用其他 sep/pattern 来解决" -#: fmelt.R:138 +#: fmelt.R:140 #, c-format msgid "" "number of unique column IDs =%d is less than number of melted columns =%d; " @@ -1651,63 +1681,60 @@ msgid "" msgstr "" "唯一列 ID 的数量 =%d 少于融合的列数量 =%d;请通过修改 pattern/sep 来解决" -#: fmelt.R:148 +#: fmelt.R:150 #, c-format msgid "" "in the measurev fun.list, each non-NULL element must be a function with at " "least one argument, problem: %s" msgstr "" -"在 measurev 的 fun.list 中,每个非 NULL 元素必须为至少带有一个参数的函数," -"问题:%s" +"在 measurev 的 fun.list 中,每个非 NULL 元素必须为至少带有一个参数的函数,问" +"题:%s" -#: fmelt.R:152 +#: fmelt.R:154 #, c-format msgid "" "each conversion function must return an atomic vector with same length as " "its first argument, problem: %s" -msgstr "" -"每个转换函数必须返回一个与其第一个参数长度相同的原子向量,问题:%s" +msgstr "每个转换函数必须返回一个与其第一个参数长度相同的原子向量,问题:%s" -#: fmelt.R:155 +#: fmelt.R:157 #, c-format msgid "%s conversion function returned vector of all NA" msgstr "%s 转换函数返回了全为 NA 的向量" -#: fmelt.R:161 +#: fmelt.R:163 #, c-format msgid "" "number of 
unique groups after applying type conversion functions less than " "number of groups, change type conversion" -msgstr "" -"应用类型转换函数后,唯一组数量少于组总数量,请修改类型转换函数" +msgstr "应用类型转换函数后,唯一组数量少于组总数量,请修改类型转换函数" -#: fmelt.R:166 +#: fmelt.R:168 #, c-format msgid "" "%s column class=%s after applying conversion function, but must be character" -msgstr "" -"应用转换函数后,%s 列的类型=%s,但此处要求必须为字符串类型" +msgstr "应用转换函数后,%s 列的类型=%s,但此处要求必须为字符串类型" -#: fmelt.R:170 +#: fmelt.R:172 #, c-format msgid "%s is the only group; fix by creating at least one more group" msgstr "%s 是唯一的组;请通过再创建至少一个组来解决" -#: fmelt.R:193 +#: fmelt.R:195 #, c-format msgid "'data' must be a data.table" msgstr "'data' 必须是一个 data.table" -#: fmelt.R:212 +#: fmelt.R:214 #, c-format msgid "" "'value.name' provided in both 'measure.vars' and 'value.name argument'; " "value provided in 'measure.vars' is given precedence." msgstr "" -"在 'measure.vars' 和 'value.name' 参数中都提供了 'value.name';" -"将优先使用 'measure.vars' 中的值。" +"在 'measure.vars' 和 'value.name' 参数中都提供了 'value.name';将优先使用 " +"'measure.vars' 中的值。" -#: fmelt.R:215 +#: fmelt.R:217 #, c-format msgid "Please provide a name to each element of 'measure.vars'." msgstr "请为 'measure.vars' 中的每个元素提供一个名称。" @@ -1753,8 +1780,8 @@ msgid "" "y must be keyed (i.e., sorted, and, marked as sorted). Call setkey(y, ...) " "first, see ?setkey. Also check the examples in ?foverlaps." msgstr "" -"y 必须有键(key:已经排序并且标记为已排序)。请先用 setkey(y, ...) 设置主" -"键,可以参考 ?setkey 以及 ?foverlaps 中提供的例子。" +"y 必须有键(key:已经排序并且标记为已排序)。请先用 setkey(y, ...) 设置主键," +"可以参考 ?setkey 以及 ?foverlaps 中提供的例子。" #: foverlaps.R:24 #, c-format @@ -1830,16 +1857,17 @@ msgid "" "The last two columns in by.x should correspond to the 'start' and 'end' " "intervals in data.table x and must be integer/numeric type." 
msgstr "" -"'by.x'的最后两列应该与data.table x中的'开始'与'结尾'的间隔对应且必须是整数/" -"数字类型" +"'by.x'的最后两列应该与data.table x中的'开始'与'结尾'的间隔对应且必须是整数/数" +"字类型" #: foverlaps.R:60 foverlaps.R:62 foverlaps.R:69 foverlaps.R:71 #, c-format msgid "" "NA values in data.table %s '%s' column: '%s'. All rows with NA values in the " "range columns must be removed for foverlaps() to work." -msgstr "data.table %s 的 '%s' 列中存在缺失值:'%s'。若使 foverlaps() 运行,必" -"须删除指定范围列内所有包含NA值的行。" +msgstr "" +"data.table %s 的 '%s' 列中存在缺失值:'%s'。若使 foverlaps() 运行,必须删除指" +"定范围列内所有包含NA值的行。" #: foverlaps.R:63 #, c-format @@ -1854,8 +1882,8 @@ msgid "" "The last two columns in by.y should correspond to the 'start' and 'end' " "intervals in data.table y and must be integer/numeric type." msgstr "" -"'by.y'的最后两列应该与data.table y中的'开始'与'结尾'的间隔对应且必须是整数/" -"数字类型" +"'by.y'的最后两列应该与data.table y中的'开始'与'结尾'的间隔对应且必须是整数/数" +"字类型" #: foverlaps.R:72 #, c-format @@ -1922,33 +1950,31 @@ msgstr "x是一个list, 'cols'不能为0长度" msgid "" "Input column '..na_prefix..' conflicts with data.table internal usage; " "please rename" -msgstr "" -"输入列 '..na_prefix..' 与 data.table 内部使用变量冲突,请重命名" +msgstr "输入列 '..na_prefix..' 与 data.table 内部使用变量冲突,请重命名" #: frank.R:46 #, c-format msgid "" "Input column '..stats_runif..' conflicts with data.table internal usage; " "please rename" -msgstr "" -"输入列 '..stats_runif..' 与 data.table 内部使用变量冲突,请重命名" +msgstr "输入列 '..stats_runif..' 与 data.table 内部使用变量冲突,请重命名" #: fread.R:10 #, c-format msgid "Used more than one of the arguments input=, file=, text= and cmd=." msgstr "使用了超过一个参数, 包括input=, file=, text= 和 cmd=." -#: fread.R:22 +#: fread.R:23 #, c-format msgid "Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'." msgstr "参数 'encoding' 必须为 'unknown', 'UTF-8' 或 'Latin-1'." -#: fread.R:40 +#: fread.R:42 #, c-format msgid "'text=' is type %s but must be character." 
msgstr "'text=' 为 %s 类型,但需要其必须为字符串类型。" -#: fread.R:53 +#: fread.R:55 #, c-format msgid "" "input= must be a single character string containing a file name, a system " @@ -1960,7 +1986,7 @@ msgstr "" "令, 以'http[s]://','ftp[s]://' 或 'file://' 开头的URL, 或是本身就包含至少一个" "\\n 或 \\r的输入数据" -#: fread.R:58 +#: fread.R:60 #, c-format msgid "" "input= contains no \\n or \\r, but starts with a space. Please remove the " @@ -1969,7 +1995,7 @@ msgstr "" "input= 不包含 \\n 或 \\r, 但是开头有个空格请移除开头的空格, 或使用text=, " "file= 或 cmd=" -#: fread.R:62 +#: fread.R:64 #, c-format msgid "" "Taking input= as a system command because it contains a space ('%s'). If " @@ -1980,23 +2006,23 @@ msgid "" "app is running as root. Please read item 5 in the NEWS file for v1.11.6 for " "more information and for the option to suppress this message." msgstr "" -"因 input= 中包含空格('%s'),故将其视为一个系统命令处理。如果其为文件名," -"请将空格去除,或直接使用 file=。当一个变量被传给 input= 但其又被当作一个系" -"统命令来处理时,则存在一定的安全隐患:设想一下你在开发一个 app,而该 app 可" -"能存在恶意用户, 同时这个 app 运行在不安全的环境中,如该 app 被以 root 权限" -"运行。若需获取更多信息或了解哪一选项可禁止本信息,请阅读 v1.11.6 版本 NEWS " -"文件里面的第 5 条。" +"因 input= 中包含空格('%s'),故将其视为一个系统命令处理。如果其为文件名,请" +"将空格去除,或直接使用 file=。当一个变量被传给 input= 但其又被当作一个系统命" +"令来处理时,则存在一定的安全隐患:设想一下你在开发一个 app,而该 app 可能存在" +"恶意用户, 同时这个 app 运行在不安全的环境中,如该 app 被以 root 权限运行。若" +"需获取更多信息或了解哪一选项可禁止本信息,请阅读 v1.11.6 版本 NEWS 文件里面的" +"第 5 条。" -#: fread.R:75 +#: fread.R:77 #, c-format msgid "" "file= must be a single character string containing a filename, or URL " "starting 'http[s]://', 'ftp[s]://' or 'file://'" msgstr "" -"file= 须为包含文件名的字符串,或以 'http[s]://'、'ftp[s]://' 或 'file://' 开头" -"的URL字符串。" +"file= 须为包含文件名的字符串,或以 'http[s]://'、'ftp[s]://' 或 'file://' 开" +"头的URL字符串。" -#: fread.R:80 +#: fread.R:82 #, c-format msgid "" "URL requires download.file functionalities from R >=3.2.2. You can still " @@ -2005,29 +2031,28 @@ msgstr "" "URL 需要 R >=3.2.2 的 download.file 功能。不过你仍可手动下载该文件并使用" "fread 来读取它。" -#: fread.R:91 +#: fread.R:93 #, c-format msgid "File '%s' does not exist or is non-readable. 
getwd()=='%s'" msgstr "文件 '%s' 不存在, 或不可读. getwd()=='%s'" -#: fread.R:92 +#: fread.R:94 #, c-format msgid "File '%s' is a directory. Not yet implemented." msgstr "文件 '%s' 是个目录。还没有编程实现。" -#: fread.R:94 +#: fread.R:96 #, c-format msgid "File '%s' has size 0. Returning a NULL %s." msgstr "文件 '%s' 的大小为0. 返回一个NULL %s." -#: fread.R:107 +#: fread.R:109 #, c-format msgid "" "Compressed files containing more than 1 file are currently not supported." -msgstr "" -"目前不支持包含多个文件的压缩文件。" +msgstr "目前不支持包含多个文件的压缩文件。" -#: fread.R:119 +#: fread.R:121 #, c-format msgid "" "To read gz and bz2 files directly, fread() requires 'R.utils' package which " @@ -2037,14 +2062,14 @@ msgstr "" "想要直接读取 gz 和 bz2 文件, fread() 需要 'R.utils' 包.请用 'install." "packages('R.utils')'安装 'R.utils' 包." -#: fread.R:129 +#: fread.R:131 #, c-format msgid "" "'autostart' is now deprecated and ignored. Consider skip='string' or skip=n" msgstr "" "'autostart' 现在已经不再推荐使用且失效, 请考虑用 skip='string' 或 skip=n" -#: fread.R:131 +#: fread.R:133 #, c-format msgid "" "colClasses is type 'logical' which is ok if all NA but it has some TRUE or " @@ -2054,12 +2079,12 @@ msgstr "" "colClasses 只能在全部是NA的情况下为逻辑变量. 请考虑使用 drop= 或 select= . 更" "多信息请参照 ?fread" -#: fread.R:135 +#: fread.R:137 #, c-format msgid "colClasses is not type list or character vector" msgstr "colClasses 不是列表 (list) 或字符向量 (character vector)" -#: fread.R:140 +#: fread.R:142 #, c-format msgid "" "colClasses=\"NULL\" (quoted) is interpreted as colClasses=NULL (the default) " @@ -2068,30 +2093,30 @@ msgstr "" "colClasses=\"NULL\" (带引号) 应该被视为 colClasses=NULL (默认情况)而不是弃掉" "每个列" -#: fread.R:152 +#: fread.R:154 #, c-format msgid "na.strings[%d]==\"%s\" consists only of whitespace, ignoring" msgstr "na.strings[%d]==\"%s\" 仅包含空格, 已忽略" -#: fread.R:155 +#: fread.R:157 #, c-format msgid "" -"%s. strip.white==TRUE (default) and \"\" is present in na.strings, so any " -"number of spaces in string columns will already be read as ." +"%s. 
Since strip.white=TRUE (default), use na.strings=\"\" to specify that " +"any number of spaces in a string column should be read as ." msgstr "" -"%s. na.strings 中包含 strip.white==TRUE (默认情况) 和 \"\", 因此(字符类型的)列中" -"的空格会被当作 ." +"%s. 因为 strip.white=TRUE (默认情况), 请使用 na.strings=\"\" 以使得(字符类型" +"的)列中的空格会被当作 ." -#: fread.R:157 +#: fread.R:159 #, c-format msgid "" -"%s. Since strip.white=TRUE (default), use na.strings=\"\" to specify that " -"any number of spaces in a string column should be read as ." +"%s. strip.white==TRUE (default) and \"\" is present in na.strings, so any " +"number of spaces in string columns will already be read as ." msgstr "" -"%s. 因为 strip.white=TRUE (默认情况), 请使用 na.strings=\"\" 以使得(字符类型的)列" -"中的空格会被当作 ." +"%s. na.strings 中包含 strip.white==TRUE (默认情况) 和 \"\", 因此(字符类型的)" +"列中的空格会被当作 ." -#: fread.R:161 +#: fread.R:163 #, c-format msgid "" "%s. But strip.white=FALSE. Use strip.white=TRUE (default) together with na." @@ -2100,7 +2125,7 @@ msgstr "" "%s. 但是 strip.white=FALSE. 请使用 strip.white=TRUE (默认情况), 同时na." "strings=\"\", 以使得(字符类型的)列中的空格转成 ." -#: fread.R:167 +#: fread.R:169 #, c-format msgid "" "'data.table' relies on the package 'yaml' to parse the file header; please " @@ -2109,7 +2134,7 @@ msgstr "" "'data.table' 依赖 'yaml' 包来分析文件头 (header)请使用 install." "packages('yaml') 来安装然后再重试." -#: fread.R:171 +#: fread.R:173 #, c-format msgid "" "Combining a search string as 'skip' and reading a YAML header may not work " @@ -2121,17 +2146,17 @@ msgstr "" "时, 将从文件的开头开始搜索 'skip', 而 不是 从元数据的结尾开始;如果你想要" "fread 提供更直观的功能支持, 请在 GitHub 上提交一个 issue。" -#: fread.R:181 +#: fread.R:183 #, c-format msgid "" "Encountered <%s%s> at the first unskipped line (%d), which does not " "constitute the start to a valid YAML header (expecting something matching " "regex \"%s\"); please check your input and try again." 
msgstr "" -"在第一个未被跳过的行 (%3$d) 中遇到 <%1$s%2$s>。它不是一个有效的 YAML 文件头的开头" -"(有效的开头应符合正则表达式 \"%4$s\");请检查你的输入并重试。" +"在第一个未被跳过的行 (%3$d) 中遇到 <%1$s%2$s>。它不是一个有效的 YAML 文件头的" +"开头(有效的开头应符合正则表达式 \"%4$s\");请检查你的输入并重试。" -#: fread.R:193 +#: fread.R:195 #, c-format msgid "" "Reached the end of the file before finding a completion to the YAML header. " @@ -2141,19 +2166,19 @@ msgstr "" "至文件的结尾 YAML 文件头仍未结束。一个有效的 YAML 文件头是指符合正则表达式 " "\"%s\"的行。请再次检查输入文件是否为一有效的 csvy 文件。" -#: fread.R:207 +#: fread.R:209 #, c-format msgid "User-supplied 'header' will override that found in metadata." msgstr "用户提供的'header'将覆盖元数据中的表头" -#: fread.R:225 +#: fread.R:227 #, c-format msgid "" "User-supplied column names in 'col.names' will override those found in YAML " "metadata." msgstr "用户在“col.names”中提供的列名将覆盖在YAML元数据中找到的列名" -#: fread.R:234 +#: fread.R:236 #, c-format msgid "" "colClasses dictated by user input and those read from YAML header are in " @@ -2162,31 +2187,31 @@ msgid "" "YAML header; please exclude the column(s) from colClasses if this was " "unintentional." msgstr "" -"用户输入的 colClasses 和从 YAML 文件头中读取的 colClasses 冲突(冲突列为 [%s]" -");程序会假设用户输入是有意要覆盖 YAML 文件头,因此,YAML 文件头中的类型将被" -"忽略;如果这并非有意为之,请在 colClasses 中排除这些列。" +"用户输入的 colClasses 和从 YAML 文件头中读取的 colClasses 冲突(冲突列为 " +"[%s]);程序会假设用户输入是有意要覆盖 YAML 文件头,因此,YAML 文件头中的类型" +"将被忽略;如果这并非有意为之,请在 colClasses 中排除这些列。" -#: fread.R:255 +#: fread.R:257 #, c-format msgid "User-supplied 'sep' will override that found in metadata." msgstr "用户提供的“sep”将覆盖元数据中的分隔符" -#: fread.R:260 +#: fread.R:262 #, c-format msgid "User-supplied 'quote' will override that found in metadata." msgstr "用户提供的“quote”将覆盖元数据中的引号" -#: fread.R:265 +#: fread.R:267 #, c-format msgid "User-supplied 'dec' will override that found in metadata." msgstr "用户提供的“dec”将覆盖元数据中的小数点分隔符" -#: fread.R:269 +#: fread.R:271 #, c-format msgid "User-supplied 'na.strings' will override that found in metadata." 
msgstr "用户提供的“na.strings”将覆盖元数据中对默认值的预处理" -#: fread.R:317 +#: fread.R:319 #, c-format msgid "" "Column '%s' was requested to be '%s' but fread encountered the following " @@ -2194,12 +2219,11 @@ msgid "" "\t%s\n" "so the column has been left as type '%s'" msgstr "" -"列 '%s' 被指定为 '%s' 类型,但 fread 却遇到了以下" -"%s:\n" +"列 '%s' 被指定为 '%s' 类型,但 fread 却遇到了以下%s:\n" "\t%s\n" "故该列的类型将被定为 '%s'" -#: fread.R:340 +#: fread.R:342 #, c-format msgid "" "key argument of data.table() must be a character vector naming columns (NB: " @@ -2207,7 +2231,7 @@ msgid "" msgstr "" "data.table()的key参数必须是字符向量命名的列(NB:col.names在这之前被使用过)" -#: fread.R:349 +#: fread.R:351 #, c-format msgid "" "index argument of data.table() must be a character vector naming columns " @@ -2245,31 +2269,32 @@ msgid "" "remove logicalAsInt in future." msgstr "" "logicalAsInt 参数已重命名为 logical01 参数以与 fread 保持一致。目前它可以正常" -"工作,但请在方便的时候将其改为 logical01,以便我们在未来删除 logicalAsInt 参数。" +"工作,但请在方便的时候将其改为 logical01,以便我们在未来删除 logicalAsInt 参" +"数。" #: fwrite.R:40 #, c-format msgid "x being coerced from class: matrix to data.table" msgstr "x 的类将强制从 matrix 转变为 data.table" -#: fwrite.R:79 +#: fwrite.R:85 #, c-format msgid "" "If you intended to overwrite the file at %s with an empty one, please use " "file.remove first." msgstr "若你想以一空白文件覆盖文件 %s,请先使用 file.remove。" -#: fwrite.R:80 +#: fwrite.R:86 #, c-format msgid "Input has no columns; doing nothing.%s" msgstr "输入没有列,不执行任何操作。%s" -#: fwrite.R:83 +#: fwrite.R:89 #, c-format msgid "Input has no columns; creating an empty file at '%s' and exiting." msgstr "输入中没有任何列,将创建一个空文件 '%s' 并退出。" -#: fwrite.R:90 +#: fwrite.R:96 #, c-format msgid "" "'data.table' relies on the package 'yaml' to write the file header; please " @@ -2351,9 +2376,9 @@ msgid "" "grouping by A,B and B,A will produce the same aggregations. Use " "`sets=unique(lapply(sets, sort))` to eliminate duplicates." 
msgstr "" -"'sets' 中第 %d 个输入中包含重复的元素(即排序结果相同),因此输出的结果中将" -"包含重复的行。请注意,按照 A、B 分组与按照 B、A 分组的结果是一样的。" -"可使用 `sets=unique(lapply(sets, sort))` 来消除重复项。" +"'sets' 中第 %d 个输入中包含重复的元素(即排序结果相同),因此输出的结果中将包" +"含重复的行。请注意,按照 A、B 分组与按照 B、A 分组的结果是一样的。可使用 " +"`sets=unique(lapply(sets, sort))` 来消除重复项。" #: groupingsets.R:73 #, c-format @@ -2466,7 +2491,8 @@ msgstr "列名 %s 在结果中是重复的" msgid "" "data.table %s IN DEVELOPMENT built %s%s using %d threads (see ?getDTthreads)." msgstr "" -"正在开发中的 data.table %s,构建于 %s%s,使用了 %d 个线程(参见 ?getDTthreads)。" +"正在开发中的 data.table %s,构建于 %s%s,使用了 %d 个线程(参见 ?" +"getDTthreads)。" #: onAttach.R:25 #, c-format @@ -2548,9 +2574,9 @@ msgid "" "GitHub issue.\n" "**********" msgstr "" -"这是 %s。此警告一般不应出现在 Windows 或 Linux 平台中,因为data.table 的 configure " -"脚本中已通过向编译器传递 -fopenmp 参数启用了 OpenMP。如果你在 Windows 或 " -"Linux 平台中发现此警告,请在 GitHub 中提交 issue。" +"这是 %s。此警告一般不应出现在 Windows 或 Linux 平台中,因为data.table 的 " +"configure 脚本中已通过向编译器传递 -fopenmp 参数启用了 OpenMP。如果你在 " +"Windows 或 Linux 平台中发现此警告,请在 GitHub 中提交 issue。" #: onAttach.R:40 #, c-format @@ -2566,9 +2592,9 @@ msgid "" msgstr "" "**********\n" "该 data.table 使用 R < 3.4.0(2017年4月)进行编译,已知会存在内存泄露问题。请" -"升级R 后重新安装 data.table 以修复内存泄露问题。维护和测试支持旧版本的代码分支会" -"增加开发时间,所以请升级 R。我们打算将 data.table 的依赖从 8 年前的 R 3.1.0(" -"2014年4月)升级到 5 年前的 R 3.4.0(2017年4月)。\n" +"升级R 后重新安装 data.table 以修复内存泄露问题。维护和测试支持旧版本的代码分" +"支会增加开发时间,所以请升级 R。我们打算将 data.table 的依赖从 8 年前的 R " +"3.1.0(2014年4月)升级到 5 年前的 R 3.4.0(2017年4月)。\n" "**********" #: onLoad.R:9 @@ -2577,10 +2603,10 @@ msgid "" "Option 'datatable.nomatch' is defined but is now ignored. Please see note 11 " "in v1.12.4 NEWS (Oct 2019), and note 14 in v1.14.2." msgstr "" -"选项 'datatable.nomatch' 虽定义,但现在已被忽略。请参见 v1.12.4 NEWS(2019年10月)" -"的第11条说明,以及 v1.14.2 的第14条说明。" +"选项 'datatable.nomatch' 虽定义,但现在已被忽略。请参见 v1.12.4 NEWS(2019年" +"10月)的第11条说明,以及 v1.14.2 的第14条说明。" -#: onLoad.R:26 +#: onLoad.R:27 #, c-format msgid "" "The data_table.%s version (%s) does not match the package (%s). 
Please close " @@ -2603,15 +2629,16 @@ msgstr "" "17478协助我们确认这个Bug。R和C代码之间的这种不匹配可能发生在任何包中,而不仅" "仅是在data.table中。只是data.table添加了这个检查" -#: onLoad.R:30 +#: onLoad.R:31 #, c-format msgid "" "This is R %s but data.table has been installed using R %s. The major version " "must match. Please reinstall data.table." -msgstr "当前 R 的版本为 %s,但 data.table 安装是使用的是 R %s 版本。R 主版本号" -"必须匹配。请重新安装 data.table。" +msgstr "" +"当前 R 的版本为 %s,但 data.table 安装是使用的是 R %s 版本。R 主版本号必须匹" +"配。请重新安装 data.table。" -#: onLoad.R:95 +#: onLoad.R:97 #, c-format msgid "" "Option 'datatable.CJ.names' no longer has any effect, as promised for 4 " @@ -2621,31 +2648,31 @@ msgstr "" "选项 'datatable.CJ.names' 已经没有任何效果。这一变更早在 4 年前就已决定。现在" "该选项已被忽略。如果你仍倾向于旧的行为,请按需手动命名 `...` 条目。" -#: onLoad.R:100 +#: onLoad.R:102 #, c-format msgid "Unexpected base R behaviour: list(x) has copied x" msgstr "意外的base R行为:list(x)已经复制了x" -#: onLoad.R:108 +#: onLoad.R:110 #, c-format msgid "Unexpected base R behaviour: names<- has copied column contents" msgstr "意外的base R行为:names<- 已复制列内容" -#: onLoad.R:118 +#: onLoad.R:120 #, c-format msgid "" "Unexpected base R behaviour: DF[2,2]<- did not copy column 2 which was " "assigned to" msgstr "意外的base R行为:DF[2,2]<- 没有复制第二列它被分配给" -#: onLoad.R:119 +#: onLoad.R:121 #, c-format msgid "" "Unexpected base R behaviour: DF[2,2]<- copied the first column which was not " "assigned to, too" msgstr "意外的base R行为:DF[2,2]<-复制了第一列的内容,它也没有被分配给" -#: onLoad.R:121 +#: onLoad.R:123 #, c-format msgid "Unexpected base R behaviour: DF[2,2]<- has not copied address(DF)" msgstr "意外的base R行为:DF[2,2]<- 还没有复制address(DF)" @@ -2665,22 +2692,22 @@ msgstr "提供了percent=,但为长度 %d" msgid "percent==%d but should be a number between 2 and 100" msgstr "percent==%d 但应为2到100之间的数字" -#: print.data.table.R:17 +#: print.data.table.R:19 #, c-format msgid "Valid options for col.names are 'auto', 'top', and 'none'" msgstr "对col.names有效的参数为'auto', 'top', and 'none'" -#: print.data.table.R:19 +#: print.data.table.R:21 #, c-format msgid "Valid 
options for trunc.cols are TRUE and FALSE" msgstr "对trunc.cols有效的参数为TRUE和FALSE" -#: print.data.table.R:21 +#: print.data.table.R:23 #, c-format msgid "Column classes will be suppressed when col.names is 'none'" msgstr "当col.names为'none'时,列的类型将被抑制" -#: print.data.table.R:146 +#: print.data.table.R:156 #, c-format msgid "" "Internal structure doesn't seem to be a list. Possibly corrupt data.table." @@ -2697,8 +2724,8 @@ msgid "" "Character objects provided in the input are not scalar objects, if you need " "them as character vector rather than a name, then wrap each into 'I' call: %s" msgstr "" -"输入中所提供的字符串对象不是标量(长度为 1 )对象,如果您需要它们作为字符串向量而" -"非名称,请将每个对象放入到 'I' 调用中:%s" +"输入中所提供的字符串对象不是标量(长度为 1 )对象,如果您需要它们作为字符串向量" +"而非名称,请将每个对象放入到 'I' 调用中:%s" #: programming.R:50 #, c-format @@ -2905,8 +2932,8 @@ msgstr "列 '%s' 为 '%s' 类型,目前尚不支持使用该类型排序。" msgid "" "'sorted' is TRUE but element %d is non-atomic, which can't be sorted; try " "setting sorted = FALSE" -msgstr "'sorted' 为 TRUE 但 %d 元素并非原子类型,无法排序。请尝试使用 " -"sorted = FALSE" +msgstr "" +"'sorted' 为 TRUE 但 %d 元素并非原子类型,无法排序。请尝试使用 sorted = FALSE" #: setkey.R:344 #, c-format @@ -3046,12 +3073,9 @@ msgid "Internal error: factor type mismatch should have been caught earlier" msgstr "内部错误:此时不匹配的因子类型应已被发现" #: shift.R:3 -msgid "Provided argument fill=" -msgstr "提供的 fill= 参数" - -#: shift.R:3 -msgid "will be ignored since type='shift'." -msgstr "将被忽略,因为 type='shift'。" +#, c-format +msgid "Provided argument fill=%s will be ignored since type='cyclic'." +msgstr "提供的 fill=%s 参数将被忽略,因为 type='cyclic'。" #: tables.R:46 #, c-format @@ -3077,33 +3101,38 @@ msgstr "" msgid "Neither %s nor %s exist in %s" msgstr "%3$s 中 %1$s 也 %2$s 不存在" -#: test.data.table.R:114 +#: test.data.table.R:118 msgid "object '%s' not found" msgstr "未找到 '%s' 对象" -#: test.data.table.R:138 +#: test.data.table.R:142 #, c-format msgid "" "memtest intended for Linux. Step through data.table:::rss() to see what went " "wrong."
msgstr "memtest 仅适用于 Linux。请逐步执行 data.table:::rss() 以查看错误原因。" -#: test.data.table.R:176 +#: test.data.table.R:196 +#, c-format +msgid "Attempt to subset to %d tests matching '%s' failed, running full suite." +msgstr "尝试进行子集化 %d 个测试匹配 '%s' 失败,正在运行所有测试。" + +#: test.data.table.R:244 #, c-format msgid "Failed in %s after test %s before the next test() call in %s" msgstr "测试于 %s 后失败,失败发生在测试 %s 之后、下一 test() 之前、调用 %s 时" -#: test.data.table.R:186 +#: test.data.table.R:254 #, c-format msgid "%d error(s) out of %d. Search %s for test number(s) %s. Duration: %s." msgstr "%2$d 中共产生 %1$d 个错误。搜索 %3$s 以定位测试编号 %4$s。用时:%5$s。" -#: test.data.table.R:199 +#: test.data.table.R:267 #, c-format msgid "Timings count mismatch: %d vs %d" msgstr "计时不一致: %d 对 %d" -#: test.data.table.R:312 +#: test.data.table.R:396 #, c-format msgid "" "Test %s is invalid: when error= is provided it does not make sense to pass y " @@ -3150,8 +3179,8 @@ msgid "" "specified)." msgstr "" "当参数 'type.convert' 包含一个未命名元素时,此元素应为最后一个元素,且应为一" -"个函数。不允许包含多个未命名元素,除非所有元素都为函数,且所有元素的总个数为" -"%d(即输入列表转置后的长度或 'keep' 参数指定的长度)。" +"个函数。不允许包含多个未命名元素,除非所有元素都为函数,且所有元素的总个数" +"为%d(即输入列表转置后的长度或 'keep' 参数指定的长度)。" #: transpose.R:66 #, c-format @@ -3172,19 +3201,19 @@ msgid "" "specified) or be between %d and %d (if it is not). But '%s' is/are not " "contained in '%s'." msgstr "" -"当参数 'type.convert' 包含转置列表的索引时,它们应该是参数 'keep' (如果" -"提供了的话)中的整数值,或者(如果 'keep' 没有指定的话)应介于 %d 和 %d 之间。" -"但 '%s' 并不包含在 '%s' 中。" +"当参数 'type.convert' 包含转置列表的索引时,它们应该是参数 'keep' (如果提供" +"了的话)中的整数值,或者(如果 'keep' 没有指定的话)应介于 %d 和 %d 之间。但 " +"'%s' 并不包含在 '%s' 中。" #: transpose.R:74 #, c-format msgid "" "In the argument 'type.convert', '%s' was ignored because all elements in the " -"transpose list or elements corrisponding to indices specified in the 'keep' " +"transpose list or elements corresponding to indices specified in the 'keep' " "argument have already been converted." 
msgstr "" -"忽略参数 'type.convert' 中的 '%s',因为转置列表中的所有元素或参数 'keep'" -"中索引对应的元素已经被转换。" +"忽略参数 'type.convert' 中的 '%s',因为转置列表中的所有元素或参数 'keep'中索" +"引对应的元素已经被转换。" #: transpose.R:83 #, c-format @@ -3202,7 +3231,7 @@ msgstr "" msgid "length(names) (= %d) is not equal to length(%s) (= %d)." msgstr "length(names) (= %d) 并不等于 length(%s) (= %d)。" -#: uniqlist.R:12 +#: uniqlist.R:11 #, c-format msgid "l not type list" msgstr "l并非列表(list)类型" @@ -3217,16 +3246,16 @@ msgstr "参数 'nan' 的长度必须为 1" msgid "Argument 'nan' must be NA or NaN" msgstr "参数 'nan' 必须为 NA 或 NaN" -#: utils.R:32 +#: utils.R:28 msgid "Internal error: use endsWithAny instead of base::endsWith" msgstr "内部错误:使用 endsWithAny 而非 base::endsWith" -#: utils.R:43 utils.R:52 +#: utils.R:39 utils.R:48 #, c-format msgid "x not boolean" msgstr "x并非布尔值" -#: utils.R:63 +#: utils.R:59 #, c-format msgid "" "Some columns are type 'integer64' but package bit64 is not installed. Those " @@ -3273,7 +3302,7 @@ msgstr "" msgid "Following columns are not numeric and will be omitted: %s" msgstr "以下的列并非数值类型,将被忽略:%s" -#: print.data.table.R:51 +#: print.data.table.R:53 msgid "Index: %s\n" msgid_plural "Indices: %s\n" msgstr[0] "索引(index): %s\n" diff --git a/po/zh_CN.po b/po/zh_CN.po index b915a63c2..a46981251 100644 --- a/po/zh_CN.po +++ b/po/zh_CN.po @@ -2,7 +2,7 @@ msgid "" msgstr "" "Project-Id-Version: data.table 1.12.5\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2023-12-28 12:46+0000\n" +"POT-Creation-Date: 2024-06-23 12:07-0300\n" "PO-Revision-Date: 2020-10-18 20:39-0400\n" "Last-Translator: Yuhang Chen \n" "Language-Team: Mandarin\n" @@ -49,15 +49,48 @@ msgstr "内部错误: .internal.selfref ptr不为NULL或字符向量" msgid "Internal error: length(names)>0 but =0 and not NA." 
msgstr "getOption('datatable.alloc')值为%d, 其必须大于等于零且不能为NA" -#: assign.c:250 between.c:16 between.c:22 forder.c:459 forder.c:462 frollR.c:40 +#: assign.c:292 between.c:16 between.c:22 forder.c:460 forder.c:463 frollR.c:40 #: frollR.c:94 fsort.c:105 gsumm.c:343 gsumm.c:579 gsumm.c:723 gsumm.c:860 -#: gsumm.c:1016 gsumm.c:1108 openmp-utils.c:79 uniqlist.c:354 utils.c:106 +#: gsumm.c:1016 gsumm.c:1108 nafill.c:103 openmp-utils.c:79 uniqlist.c:354 +#: utils.c:107 utils.c:109 #, c-format msgid "%s must be TRUE or FALSE" msgstr "%s 的参数是逻辑值,必须是 TRUE 或 FALSE" -#: assign.c:298 +#: assign.c:340 msgid "assign has been passed a NULL dt" msgstr "赋值已经被传递给一个空的(NULL)dt" -#: assign.c:299 +#: assign.c:341 msgid "dt passed to assign isn't type VECSXP" msgstr "传递给赋值操作的dt不是VECSXP类型" -#: assign.c:301 +#: assign.c:343 msgid "" ".SD is locked. Updating .SD by reference using := or set are reserved for " "future use. Use := in j directly. Or use copy(.SD) as a (slow) last resort, " @@ -155,15 +189,15 @@ msgstr "" ".SD被锁定。 使用':='更新.SD操作保留将来使用对'j'直接使用':=', 或可以使用" "copy(.SD), 直到导出shallow()" -#: assign.c:309 +#: assign.c:351 msgid "Internal error: dt passed to Cassign is not a data.table or data.frame" msgstr "内部错误: 传递给赋值操作的dt不是data.table或data.frame类型" -#: assign.c:313 +#: assign.c:355 msgid "dt passed to assign has no names" msgstr "传递给赋值操作的dt没有命名" -#: assign.c:317 +#: assign.c:359 msgid "" "data.table is NULL; malformed. A null data.table should be an empty list. " "typeof() should always return 'list' for data.table." @@ -171,18 +205,18 @@ msgstr "" "data.table为空, 格式错误,一个null的data.table应该为空的列表list即对data." "table使用typeof()函数应该返回'list'类型" -#: assign.c:326 +#: assign.c:369 #, c-format msgid "Assigning to all %d rows\n" msgstr "为所有的%d行赋值\n" -#: assign.c:331 +#: assign.c:374 msgid "" "Coerced i from numeric to integer. Please pass integer for efficiency; e.g., " "2L rather than 2" msgstr "将i由数值型强制转换为整数型。请直接传入整数以提高效率,如传入2L而非2" -#: assign.c:334 +#: assign.c:377 #, c-format msgid "" "i is type '%s'. 
Must be integer, or numeric is coerced with warning. If i is " @@ -193,26 +227,26 @@ msgstr "" "整型并发出警告)。如果 i 为一个用于筛选的逻辑(logical)向量,请直接将它传给 " "which(),且如果可能的话将 which() 放置于循环之外以保持高效。" -#: assign.c:340 subset.c:165 +#: assign.c:383 subset.c:165 #, c-format msgid "i[%d] is %d which is out of range [1,nrow=%d]" msgstr "i[%d] 是 %d ,超出 [1,nrow=%d] 的范围" -#: assign.c:343 +#: assign.c:386 #, c-format msgid "Assigning to %d row subset of %d rows\n" msgstr "正在为 %d 行(总数为 %d 行)进行赋值\n" -#: assign.c:351 +#: assign.c:394 #, c-format msgid "Added %d new column%s initialized with all-NA\n" msgstr "添加了 %d 个新列 %s 并全部初始化为 NA\n" -#: assign.c:356 +#: assign.c:399 msgid "length(LHS)==0; no columns to delete or assign RHS to." msgstr "左手侧长度为0(length(LHS)==0);没有列可供删除或赋值给右手侧(RHS)。" -#: assign.c:370 +#: assign.c:413 msgid "" "set() on a data.frame is for changing existing columns, not adding new ones. " "Please use a data.table for that. data.table's are over-allocated and don't " @@ -222,7 +256,7 @@ msgstr "" "table 来添加新列。data.table 的操作是超额分配的(over-allocated)并且不进行浅" "拷贝(shallow copy)。" -#: assign.c:381 +#: assign.c:424 msgid "" "Coerced j from numeric to integer. Please pass integer for efficiency; e.g., " "2L rather than 2" @@ -230,7 +264,7 @@ msgstr "" "将 j 从数值(numeric)型自动转换为整(integer)型。为了保持高效请直接传入整" "型,如2L 而非 2" -#: assign.c:384 +#: assign.c:427 #, c-format msgid "" "j is type '%s'. Must be integer, character, or numeric is coerced with " @@ -239,22 +273,22 @@ msgstr "" "j 为 '%s' 型。j 必须为整(integer)型、字符(character)型,或数值(numeric)" "型(将被自动转换成整型并发出警告)。" -#: assign.c:386 +#: assign.c:429 msgid "" "Can't assign to the same column twice in the same query (duplicates " "detected)." 
msgstr "在一次查询中无法对同一列赋值两次(检测出重复项)。" -#: assign.c:387 +#: assign.c:430 msgid "newcolnames is supplied but isn't a character vector" msgstr "指定了 newcolnames 但其并非一字符串向量" -#: assign.c:389 +#: assign.c:432 #, c-format msgid "RHS_list_of_columns == %s\n" msgstr "RHS_list_of_columns == %s\n" -#: assign.c:394 +#: assign.c:437 #, c-format msgid "" "RHS_list_of_columns revised to true because RHS list has 1 item which is " @@ -263,7 +297,7 @@ msgstr "" "RHS_list_of_columns 改为真(True),因为右手侧列表(RHS list)有一子项为空值" "(NULL)或长度 %d 为 1 或 targetlen(%d)。请拆开右手侧。\n" -#: assign.c:399 +#: assign.c:442 #, c-format msgid "" "Supplied %d columns to be assigned an empty list (which may be an empty data." @@ -274,19 +308,19 @@ msgstr "" "后两者也是列表的一种)。删除多个列时请使用空值(NULL)。添加多个空列表列" "(list columns)时,请使用 list(list())。" -#: assign.c:404 +#: assign.c:447 #, c-format msgid "Recycling single RHS list item across %d columns. Please unwrap RHS.\n" msgstr "" "回收重用(Recycling)单个右手侧(RHS)列表子项于 %d 列。请拆开右手侧。\n" -#: assign.c:406 +#: assign.c:449 #, c-format msgid "" "Supplied %d columns to be assigned %d items. Please see NEWS for v1.12.2." msgstr "试图将 %2$d 项赋值给 %1$d 列。请阅读 v1.12.2 的更新信息(NEWS)。" -#: assign.c:414 +#: assign.c:457 #, c-format msgid "" "Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. " @@ -296,7 +330,7 @@ msgstr "" "j 中的列编号里第 %d 项是 %d,超出了有效范围 [1,ncol=%d]。数据框(data.frame)" "的 set() 是用于修改现有列,而非添加新列。请使用 data.table 来添加新列。" -#: assign.c:415 +#: assign.c:458 #, c-format msgid "" "Item %d of column numbers in j is %d which is outside range [1,ncol=%d]. Use " @@ -305,11 +339,11 @@ msgstr "" "j 中的列编号里第 %d 项是 %d,超出了有效范围 [1,ncol=%d]。请在 j 中使用列名来" "添加新列。" -#: assign.c:420 +#: assign.c:463 msgid "When deleting columns, i should not be provided" msgstr "当删除列时,不应指定 i" -#: assign.c:426 +#: assign.c:469 #, c-format msgid "" "RHS of assignment to existing column '%s' is zero length but not NULL. 
If " @@ -325,23 +359,24 @@ msgstr "" "一个与该列原数据等长的向量,如 vector('list',nrow(DT)),即,用新数据替换" "(plonk)重新生成该列。" -#: assign.c:431 +#: assign.c:474 #, c-format msgid "" "Internal error in assign.c: length(newcolnames)=%d, length(names)=%d, coln=%d" msgstr "assign.c 内部错误:length(newcolnames)=%d, length(names)=%d, coln=%d" -#: assign.c:433 +#: assign.c:476 #, c-format -msgid "Column '%s' does not exist to remove" +msgid "" +"Tried to assign NULL to column '%s', but this column does not exist to remove" msgstr "要删除的列 '%s' 不存在" -#: assign.c:441 +#: assign.c:484 #, c-format msgid "%d column matrix RHS of := will be treated as one vector" msgstr "':=' 右手侧(RHS)%d 列矩阵将被视为一维向量" -#: assign.c:446 +#: assign.c:489 #, c-format msgid "" "Can't assign to column '%s' (type 'factor') a value of type '%s' (not " @@ -350,7 +385,7 @@ msgstr "" "无法给因子(factor)类型列 '%s' 赋类型为 '%s' 的值(不是字符(character)、因" "子(factor)、整数(integer)或数值(numeric)类中的一种)" -#: assign.c:452 +#: assign.c:495 #, c-format msgid "" "Supplied %d items to be assigned to %d items of column '%s'. If you wish to " @@ -360,7 +395,7 @@ msgstr "" "试图将 %d 项赋值给 %d 项(列 '%s')。如果想'回收重用'('recycle')右手侧,请" "使用 rep() 以将该意图清晰地表述给阅读代码的人。" -#: assign.c:462 +#: assign.c:505 msgid "" "This data.table has either been loaded from disk (e.g. using readRDS()/" "load()) or constructed manually (e.g. using structure()). Please run setDT() " @@ -371,7 +406,7 @@ msgstr "" "structure() )。在通过引用的方式进行赋值前,请先运行 setDT() 或 setalloccol() " "来为增加的列预先分配空间" -#: assign.c:463 +#: assign.c:506 #, c-format msgid "" "Internal error: oldtncol(%d) < oldncol(%d). Please report to data.table " @@ -380,7 +415,7 @@ msgstr "" "内部错误: oldtncol(%d) < oldncol(%d)。 请将此问题汇报给 data.table 问题追踪" "器,包括 sessionInfo() 的信息。" -#: assign.c:465 +#: assign.c:508 #, c-format msgid "" "truelength (%d) is greater than 10,000 items over-allocated (length = %d). 
" @@ -392,7 +427,7 @@ msgstr "" "truelength。如果你没有将 datatable.alloccol 设置为非常大的数值,请将此问题汇" "报给 data.table 问题追踪器,包含 sessionInfo() 的信息" -#: assign.c:467 +#: assign.c:510 #, c-format msgid "" "Internal error: DT passed to assign has not been allocated enough column " @@ -400,7 +435,7 @@ msgid "" msgstr "" "内部错误: 传递出去赋值的 DT 没有被分配足够的列槽。 l=%d, tl=%d, 增加 %d" -#: assign.c:469 +#: assign.c:512 msgid "" "It appears that at some earlier point, names of this data.table have been " "reassigned. Please ensure to use setnames() rather than names<- or " @@ -410,18 +445,18 @@ msgstr "" "names<- 或 colnames<- 进行赋值。如果该办法无效,请将此问题汇报给 data.table " "问题追踪器,包含 sessionInfo() 的信息" -#: assign.c:474 +#: assign.c:517 #, c-format msgid "Internal error: selfrefnames is ok but tl names [%lld] != tl [%d]" msgstr "内部错误: selfrefnames 正确,但 tl 的名称 [%lld] != tl [%d]" -#: assign.c:493 +#: assign.c:536 msgid "" "Internal error: earlier error 'When deleting columns, i should not be " "provided' did not happen." msgstr "内部错误: 前期的错误 '当删除列的时候,不应该提供参数 i ' 没有发生" -#: assign.c:504 +#: assign.c:547 #, c-format msgid "" "RHS for item %d has been duplicated because NAMED==%d MAYBE_SHARED==%d, but " @@ -430,12 +465,12 @@ msgstr "" "因为 NAMED==%d MAYBE_SHARED==%d, 所以条目 %d 的 RHS 已经被复制,但是接下来又" "要被替换了。length(values)==%d; length(cols)==%d)\n" -#: assign.c:509 +#: assign.c:552 #, c-format msgid "Direct plonk of unnamed RHS, no copy. NAMED==%d, MAYBE_SHARED==%d\n" msgstr "直接替换没有名字的 RHS,并没有复制。 NAMED==%d, MAYBE_SHARED==%d\n" -#: assign.c:578 +#: assign.c:621 #, c-format msgid "" "Dropping index '%s' as it doesn't have '__' at the beginning of its name. It " @@ -444,29 +479,29 @@ msgstr "" "丢掉索引(index) '%s' 因为它的名字前面没有 '__' 。这个很可能由data.table " "v1.9.4 创建\n" -#: assign.c:586 +#: assign.c:629 msgid "Internal error: index name ends with trailing __" msgstr "内部错误: 索引(index)名称以 __ 结尾" -#: assign.c:591 +#: assign.c:634 msgid "Internal error: Couldn't allocate memory for s4." 
msgstr "内部错误: 不能给 s4 分配内存" -#: assign.c:602 +#: assign.c:645 msgid "Internal error: Couldn't allocate memory for s5." msgstr "内部错误: 不能给 s5 分配内存" -#: assign.c:623 assign.c:639 +#: assign.c:666 assign.c:682 #, c-format msgid "Dropping index '%s' due to an update on a key column\n" msgstr " 因为一个键(key)列的更新,丢掉索引(index) '%s'\n" -#: assign.c:632 +#: assign.c:675 #, c-format msgid "Shortening index '%s' to '%s' due to an update on a key column\n" msgstr "因为一个键(key)列的更新,缩短索引(index) '%s' 到 '%s'\n" -#: assign.c:662 +#: assign.c:705 #, c-format msgid "" "Internal error: %d column numbers to delete not now in strictly increasing " @@ -474,37 +509,37 @@ msgid "" msgstr "" "内部错误:指定 %d 删除列的序号目前并非严格升序排列。重复项已于之前检查过。" -#: assign.c:690 +#: assign.c:733 #, c-format msgid "target vector" msgstr "目标向量" -#: assign.c:690 +#: assign.c:733 #, c-format msgid "column %d named '%s'" msgstr "第 %d 列名为 '%s'" -#: assign.c:706 +#: assign.c:749 #, c-format msgid "" "Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d" msgstr "memrecycle 内部错误:sourceStart=%d sourceLen=%d length(source)=%d" -#: assign.c:708 +#: assign.c:751 #, c-format msgid "Internal error memrecycle: start=%d len=%d length(target)=%d" msgstr "memrecycle 内部错误:start=%d len=%d length(target)=%d" -#: assign.c:711 +#: assign.c:754 #, c-format msgid "Internal error: recycle length error not caught earlier. slen=%d len=%d" msgstr "内部错误: 早期未被发现的循环长度错误 slen=%d len=%d" -#: assign.c:715 +#: assign.c:758 msgid "Internal error: memrecycle has received NULL colname" msgstr "内部错误: memrecycle 接受到的列名为 NULL " -#: assign.c:724 +#: assign.c:767 #, c-format msgid "" "Cannot assign 'factor' to '%s'. Factors can only be assigned to factor, " @@ -512,22 +547,20 @@ msgid "" msgstr "" "不能将 'factor' 赋值为 '%s' 。因子类型只能赋值为因子,字符或者列表其中的列" -#: assign.c:738 +#: assign.c:781 #, c-format msgid "" "Assigning factor numbers to %s. 
But %d is outside the level range [1,%d]" msgstr "%s 赋值为因子。但是 %d 在层次范围[1,%d]之外" -#: assign.c:747 +#: assign.c:790 #, c-format msgid "" "Assigning factor numbers to %s. But %f is outside the level range [1,%d], or " "is not a whole number." -msgstr "" -"%s 赋值为因子。但是 %f 在层次范围[1,%d]之外,或者不是一个完" -"整的数字" +msgstr "%s 赋值为因子。但是 %f 在层次范围[1,%d]之外,或者不是一个完整的数字" -#: assign.c:753 +#: assign.c:796 #, c-format msgid "" "Cannot assign '%s' to 'factor'. Factor columns can be assigned factor, " @@ -535,56 +568,55 @@ msgid "" msgstr "" "不能将 'factor' 赋值为 '%s' 。 因子列可被赋值为因子,字符 ,NA 或者 层次数值" -#: assign.c:774 +#: assign.c:817 msgid "" "Internal error: levels of target are either not unique or have truelength<0" msgstr "内部错误: 目标的层次不是唯一或者长度<0" -#: assign.c:813 +#: assign.c:856 msgid "Unable to allocate working memory of %zu bytes to combine factor levels" msgstr "不能分配 %zu 字节的工作内存来组合因子层次" -#: assign.c:820 +#: assign.c:863 msgid "Internal error: extra level check sum failed" msgstr "内部错误: 额外的层次校验和失败" -#: assign.c:839 +#: assign.c:882 #, c-format msgid "Coercing 'character' RHS to '%s' to match the type of %s." msgstr "将'character' RHS 强制转换成 '%s' 来匹配目标列 %s 的类型" -#: assign.c:846 +#: assign.c:889 #, c-format msgid "Cannot coerce 'list' RHS to 'integer64' to match the type of %s." -msgstr "" -"不能将'list' RHS 强制转换成 'integer64' 来匹配目 %s 的类型" +msgstr "不能将'list' RHS 强制转换成 'integer64' 来匹配目 %s 的类型" -#: assign.c:851 +#: assign.c:894 #, c-format msgid "Coercing 'list' RHS to '%s' to match the type of %s." 
msgstr "将'list' RHS 强制转换成 '%s' 来匹配目 %s 的类型" -#: assign.c:856 +#: assign.c:899 #, c-format msgid "Zero-copy coerce when assigning '%s' to '%s' %s.\n" msgstr "在 %s 中将 '%s' 赋值成 '%s' 时发生了零拷贝强制转换。\n" -#: assign.c:958 +#: assign.c:1001 #, c-format msgid "type '%s' cannot be coerced to '%s'" msgstr "类型 '%s' 不能强制转换成 '%s'" -#: assign.c:1116 +#: assign.c:1159 #, c-format msgid "Unsupported column type in assign.c:memrecycle '%s'" msgstr "assign.c:memrecycle '%s' 里有不支持的列的类型" -#: assign.c:1170 +#: assign.c:1213 #, c-format msgid "Internal error: writeNA passed a vector of type '%s'" msgstr "内部错误:writeNA 函数读取到了一个类型是'%s'的向量" -#: assign.c:1201 +#: assign.c:1244 #, c-format msgid "" "Internal error: savetl_init checks failed (%d %d %p %p). please report to " @@ -593,12 +625,12 @@ msgstr "" "内部错误:savetl_init的校验失败 (%d %d %p %p),请将此问题汇报给data.table 问" "题追踪器。" -#: assign.c:1209 +#: assign.c:1252 #, c-format msgid "Failed to allocate initial %d items in savetl_init" msgstr "不能为 savetl_init 最开始的 %d 个项分配空间" -#: assign.c:1218 +#: assign.c:1261 #, c-format msgid "" "Internal error: reached maximum %d items for savetl. Please report to data." @@ -607,34 +639,34 @@ msgstr "" "内部错误:已经达到了 savetl 能处理的子项上限 %d。请将此问题汇报给data.table问" "题追踪器。" -#: assign.c:1225 +#: assign.c:1268 #, c-format msgid "Failed to realloc saveds to %d items in savetl" msgstr "不能给 savetl 里的 %d 个项重新分配 saveds" -#: assign.c:1231 +#: assign.c:1274 #, c-format msgid "Failed to realloc savedtl to %d items in savetl" msgstr "不能给savetl里的 %d 个项提供 savetl" -#: assign.c:1254 +#: assign.c:1297 msgid "x must be a character vector" msgstr "x 必须是一个字符向量" -#: assign.c:1255 +#: assign.c:1298 msgid "'which' must be an integer vector" msgstr "'which' 必须是一个整数向量" -#: assign.c:1256 +#: assign.c:1299 msgid "'new' must be a character vector" msgstr "'new' 必须是一个字符向量" -#: assign.c:1257 +#: assign.c:1300 #, c-format msgid "'new' is length %d. 
Should be the same as length of 'which' (%d)" msgstr "'new' 的长度是 %d。 它的长度必须和'which' (%d)的长度一致。" -#: assign.c:1260 +#: assign.c:1303 #, c-format msgid "" "Item %d of 'which' is %d which is outside range of the length %d character " @@ -815,7 +847,7 @@ msgid "Internal error: nqgrpArg must be an integer vector" msgstr "内部错误:nqgrpArg 必须为一个整数向量" #: bmerge.c:124 -msgid "Intrnal error: nqmaxgrpArg is not a positive length-1 integer vector" +msgid "Internal error: nqmaxgrpArg is not a positive length-1 integer vector" msgstr "内部错误:nqmaxgrpArg不是长度为1的正整型向量" #: bmerge.c:133 @@ -1028,23 +1060,34 @@ msgstr "length(xSD)[%d] != length(xjiscols)[%d]" msgid "j evaluates to type '%s'. Must evaluate to atomic vector or list." msgstr "j的运算结果为'%s'类型。其运算结果必须为原子向量或列表。" -#: dogroups.c:279 +#: dogroups.c:280 +#, c-format msgid "" -"All items in j=list(...) should be atomic vectors or lists. If you are " -"trying something like j=list(.SD,newcol=mean(colA)) then use := by group " +"Entry %d for group %d in j=list(...) should be atomic vector or list. If you " +"are trying something like j=list(.SD,newcol=mean(colA)) then use := by group " "instead (much quicker), or cbind or merge afterwards." msgstr "" -"j=list(...) 中的所有项目必须是原子向量或列表如果您试图进行 j=list(.SD," +"j=list(...) 里的 %d 组的第 %d 项目本该是原子向量或列表。如果您在试图进行 j=list(.SD," "newcol=mean(colA)) 之类的操作请使用 := by group 代替(更快速),或事后使用 " "cbind()、merge()" -#: dogroups.c:288 +#: dogroups.c:287 +#, c-format +msgid "" +"Entry %d for group %d in j=list(...) is an array with %d dimensions > 1, " +"which is disallowed. \"Break\" the array yourself with c() or as.vector() if " +"that is intentional." +msgstr "" +"j=list(...) 里的 %d 组的第 %d 项目本是一个 %d 维度 > 1 的排列,这" +"是不允许的。如果这是故意的话请使用 c() 或 as.vector() 来分裂该数组" + +#: dogroups.c:297 msgid "" "RHS of := is NULL during grouped assignment, but it's not possible to delete " "parts of a column." 
msgstr "用 := 分组时 RHS 为 NULL但無法刪除部分列" -#: dogroups.c:292 +#: dogroups.c:301 #, c-format msgid "" "Supplied %d items to be assigned to group %d of size %d in column '%s'. The " @@ -1056,7 +1099,7 @@ msgstr "" "须是 1(可以是单个值) 或完全符合 LHS 的长度如果您想回收(recycle) RHS,请使用 " "rep() 向你的代码读者明确表达你的意图" -#: dogroups.c:303 +#: dogroups.c:312 msgid "" "Internal error: Trying to add new column by reference but tl is full; " "setalloccol should have run first at R level before getting to this point in " @@ -1065,16 +1108,16 @@ msgstr "" "内部错误 : 尝试依照引用增加新列但 tl 已满在进入 dogroups 之前,setalloccol 应" "该先在 R 运行" -#: dogroups.c:324 +#: dogroups.c:333 #, c-format msgid "Group %d column '%s': %s" msgstr "列 '%2$s' 第 %1$d 组 : %3$s" -#: dogroups.c:331 +#: dogroups.c:340 msgid "j doesn't evaluate to the same number of columns for each group" msgstr "j 估算出的每组的列数不同" -#: dogroups.c:365 +#: dogroups.c:374 #, c-format msgid "" "Column %d of j's result for the first group is NULL. We rely on the column " @@ -1088,7 +1131,7 @@ msgstr "" "(需要一致性)空 (NULL) 列可以出现在后面的组(适当的以 NA 取代并回收)但不能是第 " "1 组请输入空向量代替,例如 integer() 或 numeric()" -#: dogroups.c:368 +#: dogroups.c:377 msgid "" "j appears to be a named vector. The same names will likely be created over " "and over again for each group and slow things down. Try and pass a named " @@ -1097,7 +1140,7 @@ msgstr "" "j 是名称向量,这可能使相同的名称不停重复创建导致速度变慢请尝试输入名称列表(较" "适合 data.table)或是非名称列表代替\n" -#: dogroups.c:370 +#: dogroups.c:379 #, c-format msgid "" "Column %d of j is a named vector (each item down the rows is named, " @@ -1107,7 +1150,7 @@ msgstr "" "j 的第 %d 列是名称向量(整行的项都是名称)为了效率请移除这些名称(避免在每组重复" "创建这些名称)总之他们被忽略了\n" -#: dogroups.c:378 +#: dogroups.c:387 msgid "" "The result of j is a named list. It's very inefficient to create the same " "names over and over again for each group. When j=list(...), any names are " @@ -1119,17 +1162,17 @@ msgstr "" "j=list(...) 
时侦测到的所有名称会被移出,待分组完成后再放回来可以使用 " "j=transform() 避免这种加速此讯息可能会在未来升级为警告\n" -#: dogroups.c:390 +#: dogroups.c:399 #, c-format msgid "dogroups: growing from %d to %d rows\n" msgstr "dogroups: 从 %d 列增加至 %d 列\n" -#: dogroups.c:391 +#: dogroups.c:400 #, c-format msgid "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" msgstr "dogroups: length(ans)[%d]!=ngrpcols[%d]+njval[%d]" -#: dogroups.c:409 +#: dogroups.c:418 #, c-format msgid "" "Item %d of j's result for group %d is zero length. This will be filled with " @@ -1140,7 +1183,7 @@ msgstr "" "j 的结果第 %d 项在第 %d 组中为零长度(zero length)将使用 %d 个 NA 填入以符合结" "果中最长列的长度后面的分组也有相同问题,但只回报第一组以避免过多警告" -#: dogroups.c:416 +#: dogroups.c:425 #, c-format msgid "" "Column %d of result for group %d is type '%s' but expecting type '%s'. " @@ -1149,7 +1192,7 @@ msgstr "" "结果的第 %d 列在第 %d 组中是 '%s' 类别而非预期的 '%s' 类别所有组的列类别必须" "一致" -#: dogroups.c:418 +#: dogroups.c:427 #, c-format msgid "" "Supplied %d items for column %d of group %d which has %d rows. The RHS " @@ -1161,17 +1204,17 @@ msgstr "" "單個值) 或與 LHS 長度完全匹配如果您想回收(recycle) RHS,请使用 rep() 向你的代" "码读者明确表达你的意图" -#: dogroups.c:439 +#: dogroups.c:448 #, c-format msgid "Wrote less rows (%d) than allocated (%d).\n" msgstr "写入的行 (%d) 少于分配的 (%d)\n" -#: dogroups.c:461 +#: dogroups.c:470 #, c-format msgid "Internal error: block 0 [%d] and block 1 [%d] have both run" msgstr "内部错误 : 区块 0 [%d] 与区块 1 [%d] 都运行了" -#: dogroups.c:463 +#: dogroups.c:472 #, c-format msgid "" "\n" @@ -1180,23 +1223,24 @@ msgstr "" "\n" " %s 花了 %.3fs 在 %d 个组\n" -#: dogroups.c:465 +#: dogroups.c:474 #, c-format msgid " eval(j) took %.3fs for %d calls\n" msgstr " eval(j)取%.3fs给 %d 调用\n" -#: dogroups.c:489 +#: dogroups.c:498 msgid "growVector passed NULL" msgstr "growVector通过NULL" -#: dogroups.c:509 +#: dogroups.c:518 #, c-format msgid "Internal error: growVector doesn't support type '%s'" msgstr "内部错误:growVector 不支持 '%s' 类型" -#: fastmean.c:39 -msgid "narm should be TRUE or FALSE" -msgstr "narm必须是TRUE或FALSE" +#: fastmean.c:39 rbindlist.c:8 
+#, c-format +msgid "%s should be TRUE or FALSE" +msgstr "%s 必须是TRUE或FALSE" #: fastmean.c:45 #, c-format @@ -1208,7 +1252,7 @@ msgstr "传递给 fastmean 的是 %s 类型,而不是数值或逻辑类型" msgid "Internal error: type '%s' not caught earlier in fastmean" msgstr "内部错误:先前fastmean没有侦测到类型 '%s' " -#: fcast.c:92 +#: fcast.c:101 #, c-format msgid "Unsupported column type in fcast val: '%s'" msgstr "fcast val不支持的列类型:'%s'" @@ -1449,7 +1493,7 @@ msgstr "找不到'id.vars'。将指定所有'measure.vars'以外的所有列为' msgid "Assigned 'id.vars' are [%s].\n" msgstr "指定的 'id.vars' 是 [%s].\n" -#: fmelt.c:311 +#: fmelt.c:313 msgid "" "When 'measure.vars' is a list, 'value.name' must be a character vector of " "length =1 or =length(measure.vars)." @@ -1457,7 +1501,7 @@ msgstr "" "当'measure.vars'是一个列表(list), 'value.name' 必须是一个长度为1或者等于" "length(measure.vars)的字符向量" -#: fmelt.c:312 +#: fmelt.c:314 msgid "" "When 'measure.vars' is either not specified or a character/integer vector, " "'value.name' must be a character vector of length =1." @@ -1465,17 +1509,17 @@ msgstr "" "当'measure.vars'未被指定或者是一个字符/整数向量时,'value.name'必须是一个长度" "1的字符/整数向量" -#: fmelt.c:315 +#: fmelt.c:317 msgid "'variable.name' must be a character/integer vector of length 1." 
msgstr "'variable.name' 必须是长度1的字符/整数向量。" -#: fmelt.c:368 +#: fmelt.c:370 msgid "" "variable_table attribute of measure.vars should be a data table with at " "least one column" msgstr "measure.vars 的 variable_table 属性应为一个至少有一列的 data.table" -#: fmelt.c:373 +#: fmelt.c:375 #, c-format msgid "" "variable_table attribute of measure.vars should be a data table with same " @@ -1484,24 +1528,24 @@ msgstr "" "measure.vars 的 variable_table 属性应为一个 data.table,且该 data.table 的行" "应与 measure.vars 向量中的最大长度(=%d)一致" -#: fmelt.c:377 +#: fmelt.c:379 msgid "" "variable_table attribute of measure.vars should be either NULL or a data " "table" msgstr "measure.vars 的 variable_table 属性应为 NULL 或一个 data.table" -#: fmelt.c:394 +#: fmelt.c:396 msgid "" "Internal error: combineFactorLevels in fmelt.c expects all-character input" msgstr "内部错误:fmelt.c里的combineFactorLevels期望输入值为全字符" -#: fmelt.c:397 +#: fmelt.c:399 msgid "" "Internal error: combineFactorLevels in fmelt.c expects a character target to " "factorize" msgstr "内部错误:fmelt.c里的combineFactorLevels期望一个字符来分解" -#: fmelt.c:460 +#: fmelt.c:462 #, c-format msgid "" "'measure.vars' [%s] are not all of the same type. By order of hierarchy, the " @@ -1513,59 +1557,59 @@ msgstr "" "以变量中不是'%3$s'类型的数将被强制转换为'%2$s'类型,更多关于强制转换的信息请" "查看 ?melt.data.table.\n" -#: fmelt.c:572 +#: fmelt.c:574 #, c-format msgid "Unknown column type '%s' for column '%s'." msgstr "'%s'列是未知的纵列类型: '%s'" -#: fmelt.c:593 +#: fmelt.c:595 #, c-format msgid "Internal error: fmelt.c:getvarcols %d %d" msgstr "内部错误:fmelt.c : getvarcols %d %d" -#: fmelt.c:679 +#: fmelt.c:681 #, c-format msgid "variable_table does not support column type '%s' for column '%s'." 
msgstr "variable_table 不支持列 '%2$s' 所包含的数据类型 '%1$s'。" -#: fmelt.c:773 +#: fmelt.c:775 #, c-format msgid "Unknown column type '%s' for column '%s' in 'data'" msgstr "'data' 中的'%s'列是未知列类型:'%s'" -#: fmelt.c:784 +#: fmelt.c:786 msgid "Input is not of type VECSXP, expected a data.table, data.frame or list" msgstr "输入类型不是 VECSXP,输入类型应该是 data.table,data.frame 或 list。" -#: fmelt.c:785 +#: fmelt.c:787 msgid "Argument 'value.factor' should be logical TRUE/FALSE" msgstr "'value.factor' 的参数是逻辑值,必须是 TRUE 或FALSE" -#: fmelt.c:786 +#: fmelt.c:788 msgid "Argument 'variable.factor' should be logical TRUE/FALSE" msgstr "'variable.factor' 的参数是逻辑值,必须是 TRUE 或FALSE" -#: fmelt.c:787 +#: fmelt.c:789 msgid "Argument 'na.rm' should be logical TRUE/FALSE." msgstr "'na.rm' 的参数是逻辑值,必须是 TRUE 或 FALSE" -#: fmelt.c:788 +#: fmelt.c:790 msgid "Argument 'variable.name' must be a character vector" msgstr "'variable.name' 必须是字符串类型" -#: fmelt.c:789 +#: fmelt.c:791 msgid "Argument 'value.name' must be a character vector" msgstr "'value.name' 必须是字符串类型" -#: fmelt.c:790 +#: fmelt.c:792 msgid "Argument 'verbose' should be logical TRUE/FALSE" msgstr "'verbose' 的参数是逻辑值,必须是 TRUE 或 FALSE" -#: fmelt.c:794 +#: fmelt.c:796 msgid "ncol(data) is 0. Nothing to melt. Returning original data.table." msgstr "ncol(data)为0,返回原 data.table" -#: fmelt.c:799 +#: fmelt.c:801 msgid "names(data) is NULL. Please report to data.table-help" msgstr "names(data)为NULL,请向 data.table-help 报告" @@ -1626,16 +1670,16 @@ msgstr "必须是长度为1的整数或数字向量" msgid "Must be 2, 1 or 0" msgstr "必须是2、1或者0" -#: forder.c:404 +#: forder.c:405 msgid "Unknown non-finite value; not NA, NaN, -Inf or +Inf" msgstr "未知的取值范围,不属于 NA, NaN, -Inf 或 +Inf" -#: forder.c:426 +#: forder.c:427 msgid "" "Internal error: input is not either a list of columns, or an atomic vector." 
msgstr "内部错误:输入值既不是列表中的一列,也不是原子向量" -#: forder.c:428 +#: forder.c:429 msgid "" "Internal error: input is an atomic vector (not a list of columns) but by= is " "not NULL" @@ -1643,73 +1687,74 @@ msgstr "" "内部错误:输入值是一个原子向量(而不是列表中的一列),但是'by' 的参数是列表而不" "是NULL" -#: forder.c:430 +#: forder.c:431 msgid "" "Input is an atomic vector (not a list of columns) but order= is not a length " "1 integer" msgstr "" "输入值是一个原子向量(而不是列表中的一列),但参数 order不是长度为1的整数" -#: forder.c:432 +#: forder.c:433 #, c-format msgid "forder.c received a vector type '%s' length %d\n" msgstr "forder.c 接收到一个类型为'%s'长度为%d的向量\n" -#: forder.c:440 +#: forder.c:441 #, c-format msgid "forder.c received %d rows and %d columns\n" msgstr "forder.c 接收到%d行和%d列\n" -#: forder.c:443 +#: forder.c:444 msgid "Internal error: DT is an empty list() of 0 columns" msgstr "内部错误:DT 是一个0列的空 list" -#: forder.c:445 +#: forder.c:446 #, c-format msgid "" "Internal error: DT has %d columns but 'by' is either not integer or is " "length 0" msgstr "内部错误:DT 内部有%d列,但参数 'by' 不是整数或长度为0" -#: forder.c:447 +#: forder.c:448 #, c-format msgid "" "Either order= is not integer or its length (%d) is different to by='s length " "(%d)" msgstr "参数 order 不是整数,或者它的长度(%d)与参数 'by' 指定的长度(%d)不同" -#: forder.c:453 +#: forder.c:454 #, c-format msgid "internal error: 'by' value %d out of range [1,%d]" msgstr "内部错误:参数 'by' 的值%d超出[1,%d]的范围" -#: forder.c:455 +#: forder.c:456 #, c-format msgid "" "Column %d is length %d which differs from length of column 1 (%d), are you " "attempting to order by a list column?\n" -msgstr "第 %d 列的长度是 %d,与第 1 列的长度(%d)不同。你是在尝试通过一个列表类" -"型的列来排序吗?\n" +msgstr "" +"第 %d 列的长度是 %d,与第 1 列的长度(%d)不同。你是在尝试通过一个列表类型的列" +"来排序吗?\n" -#: forder.c:465 +#: forder.c:466 msgid "At least one of retGrp= or sort= must be TRUE" msgstr "retGrp 和sort 的参数中,至少一个必须是 TRUE" -#: forder.c:467 +#: forder.c:468 msgid "na.last must be logical TRUE, FALSE or NA of length 1" msgstr "na.last 的参数必须是逻辑值 TRUE, FALSE 或 NA " -#: forder.c:495 forder.c:599 +#: forder.c:496 
forder.c:600 #, c-format msgid "Unable to allocate % bytes of working memory" msgstr "无法分配%字节的工作内存" -#: forder.c:511 +#: forder.c:512 #, c-format msgid "Item %d of order (ascending/descending) is %d. Must be +1 or -1." msgstr "排序(ascending/descending)选项%d是%d,必须是+1 or -1" -#: forder.c:537 +#: forder.c:538 #, c-format msgid "" "\n" @@ -1721,82 +1766,82 @@ msgstr "" "***传递给 forder 的%d列是一个没有小数的8字节 double 类型的日期数据,请考虑使" "用4字节的整数日期(例如IDate)以节省空间和时间\n" -#: forder.c:553 +#: forder.c:554 #, c-format msgid "Column %d passed to [f]order is type '%s', not yet supported." msgstr "传递给 [f]order 的第%d列为 '%s'类型,目前尚不支持。" -#: forder.c:706 +#: forder.c:707 msgid "Internal error: column not supported, not caught earlier" msgstr "内部错误:列有不支持类型,未被前置识别" -#: forder.c:714 +#: forder.c:715 #, c-format msgid "nradix=%d\n" msgstr "nradix=%d\n" -#: forder.c:721 +#: forder.c:722 #, c-format msgid "" "Failed to allocate TMP or UGRP or they weren't cache line aligned: nth=%d" msgstr "分配TMP或UGRP失败或缓存行不一致: nth=%d" -#: forder.c:727 +#: forder.c:728 msgid "Could not allocate (very tiny) group size thread buffers" msgstr "无法分配(极小)块组大小的线程缓冲区" -#: forder.c:788 +#: forder.c:789 #, c-format msgid "Timing block %2d%s = %8.3f %8d\n" msgstr "定时块 %2d%s = %8.3f %8d\n" -#: forder.c:791 +#: forder.c:792 #, c-format msgid "stat[%03d]==%20\n" msgstr "stat[%03d]==%20\n" -#: forder.c:1047 +#: forder.c:1048 #, c-format msgid "Failed to allocate parallel counts. 
my_n=%d, nBatch=%d" msgstr "分配并行计算失败,my_n=%d, nBatch=%d" -#: forder.c:1156 +#: forder.c:1157 #, c-format msgid "Unable to allocate TMP for my_n=%d items in parallel batch counting" msgstr "无法分配TMP给并行批处理计算的 my_n=%d 项" -#: forder.c:1264 +#: forder.c:1265 msgid "Internal error: issorted 'by' must be NULL or integer vector" msgstr "内部错误:issorted 参数 'by' 须为 NULL 或一个整数向量" -#: forder.c:1268 forder.c:1319 +#: forder.c:1269 forder.c:1320 #, c-format msgid "issorted 'by' [%d] out of range [1,%d]" msgstr "issorted 参数 'by' 的值%d超出[1,%d]的范围" -#: forder.c:1273 +#: forder.c:1274 msgid "is.sorted does not work on list columns" msgstr "is.sorted 不支持列表(list)列" -#: forder.c:1306 forder.c:1336 forder.c:1370 +#: forder.c:1307 forder.c:1337 forder.c:1371 #, c-format msgid "type '%s' is not yet supported" msgstr "类型 '%s' 目前不支持" -#: forder.c:1383 +#: forder.c:1384 msgid "x must be either NULL or an integer vector" msgstr "x 必须为空值或整型向量" -#: forder.c:1385 +#: forder.c:1386 msgid "nrow must be integer vector length 1" msgstr "nrow 必须为长度为1的整型向量" -#: forder.c:1387 +#: forder.c:1388 #, c-format msgid "nrow==%d but must be >=0" msgstr "nrow==%d 但是必须 >=0" -#: forder.c:1404 +#: forder.c:1405 msgid "x must be type 'double'" msgstr "x 必须为浮点数类型" @@ -1818,7 +1863,7 @@ msgid "" "that item which is length %d." 
msgstr "输入列表x的列 %d 长度为 %d,不同于第一列的该项长度为 %d" -#: frank.c:101 frank.c:234 transpose.c:88 +#: frank.c:101 frank.c:234 transpose.c:97 #, c-format msgid "Unsupported column type '%s'" msgstr "不支持的列类型 '%s'" @@ -1836,253 +1881,249 @@ msgstr "" msgid "Internal error: unknown ties value in frank: %d" msgstr "内部错误:frank中有未知的ties值 %d" -#: fread.c:105 +#: fread.c:113 #, c-format msgid "" "Internal error in line %d of fread.c, please report on data.table GitHub: " msgstr "fread.c中%d行出现内部错误,请在 data.table 的 GitHub中提交报告:" -#: fread.c:150 +#: fread.c:159 #, c-format msgid "System error %lu unmapping view of file\n" msgstr "系统错误 %lu 取消映射文件视图\n" -#: fread.c:153 +#: fread.c:162 #, c-format msgid "System errno %d unmapping file: %s\n" msgstr "系统错误 %d 取消映射文件: %s\n" -#: fread.c:213 +#: fread.c:225 #, c-format msgid "Internal error: NUMTYPE(%d) > nLetters(%d)" msgstr "内部错误:NUMTYPE(%d) > nLetters(%d)" -#: fread.c:438 +#: fread.c:450 #, c-format msgid "Unable to allocate %s of contiguous virtual RAM. %s allocation." msgstr "无法分配 %s 的连续虚拟内存。 %s 已分配。" -#: fread.c:443 +#: fread.c:455 #, c-format msgid "Avoidable %.3f seconds. %s time to copy.\n" msgstr "可避免的 %.3f 秒。 %s 复制用时\n" -#: fread.c:444 +#: fread.c:456 #, c-format msgid " File copy in RAM took %.3f seconds.\n" msgstr " 内存上的文件复制耗时 %.3f 秒\n" -#: fread.c:1260 +#: fread.c:1287 msgid "" "Previous fread() session was not cleaned up properly. Cleaned up ok at the " "beginning of this fread() call.\n" msgstr "之前的会话fread()未正确清理。在当前 fread() 会话开始前清理好\n" -#: fread.c:1263 +#: fread.c:1290 msgid "[01] Check arguments\n" msgstr "[01] 参数检查\n" -#: fread.c:1270 +#: fread.c:1297 #, c-format msgid " Using %d threads (omp_get_max_threads()=%d, nth=%d)\n" msgstr " 使用 %d 线程 (omp_get_max_threads()=%d, nth=%d)\n" -#: fread.c:1278 +#: fread.c:1305 msgid "" "Internal error: NAstrings is itself NULL. When empty it should be pointer to " "NULL." 
msgstr "内部错误:NAstrings 自身为空值。当清空该项会指向NULL空值" -#: fread.c:1292 +#: fread.c:1319 #, c-format msgid "freadMain: NAstring <<%s>> has whitespace at the beginning or end" msgstr "freadMain: NAstring <<%s>> 在开始或者结束处有空白" -#: fread.c:1296 +#: fread.c:1323 #, c-format msgid "" "freadMain: NAstring <<%s>> is recognized as type boolean, this is not " "permitted." msgstr "freadMain: NAstring <<%s>> 被识别为布尔型,这是不允许" -#: fread.c:1298 +#: fread.c:1325 #, c-format msgid "freadMain: NAstring <<%s>> and logical01=TRUE, this is not permitted." msgstr "freadMain: NAstring 为 <<%s>> 同时 logical01=TRUE,不允许这种情况。" -#: fread.c:1310 +#: fread.c:1337 msgid " No NAstrings provided.\n" msgstr " 未提供 NAstrings \n" -#: fread.c:1312 +#: fread.c:1339 msgid " NAstrings = [" msgstr " NAstrings = [" -#: fread.c:1315 +#: fread.c:1342 msgid "]\n" msgstr "]\n" -#: fread.c:1317 +#: fread.c:1344 msgid " One or more of the NAstrings looks like a number.\n" msgstr " 一个或多个 NAstrings 类似数值\n" -#: fread.c:1319 +#: fread.c:1346 msgid " None of the NAstrings look like numbers.\n" msgstr " 没有 NAstrings 为数值\n" -#: fread.c:1321 +#: fread.c:1348 #, c-format msgid " skip num lines = %\n" msgstr " 跳过行数为 %\n" -#: fread.c:1322 +#: fread.c:1349 #, c-format msgid " skip to string = <<%s>>\n" msgstr " 跳转至 string = <<%s>>\n" -#: fread.c:1323 +#: fread.c:1350 #, c-format msgid " show progress = %d\n" msgstr " 显示进程 %d\n" -#: fread.c:1324 +#: fread.c:1351 #, c-format msgid " 0/1 column will be read as %s\n" msgstr " 0/1 列被读取为 %s\n" -#: fread.c:1336 +#: fread.c:1363 #, c-format msgid "sep == quote ('%c') is not allowed" msgstr "sep == quote ('%c') 不被允许" -#: fread.c:1337 -msgid "dec='' not allowed. Should be '.' or ','" -msgstr "dec='' 不允许,应该为 '.' 
或者 ','" - -#: fread.c:1338 +#: fread.c:1364 #, c-format msgid "sep == dec ('%c') is not allowed" msgstr "sep == dec ('%c') 不允许" -#: fread.c:1339 +#: fread.c:1365 #, c-format msgid "quote == dec ('%c') is not allowed" msgstr "quote == dec ('%c') 不允许" -#: fread.c:1356 +#: fread.c:1382 msgid "[02] Opening the file\n" msgstr "[02] 打开文件\n" -#: fread.c:1359 +#: fread.c:1385 msgid "" " `input` argument is provided rather than a file name, interpreting as raw " "text to read\n" msgstr "提供 `input` 参数而非文件名,理解为原始的文本读取\n" -#: fread.c:1363 +#: fread.c:1389 msgid "Internal error: last byte of character input isn't \\0" msgstr "内部错误:字符输入的最后一个字节不是 \\0" -#: fread.c:1366 +#: fread.c:1392 #, c-format msgid " Opening file %s\n" msgstr " 打开文件 %s\n" -#: fread.c:1370 fread.c:1395 +#: fread.c:1396 fread.c:1425 #, c-format msgid "File not found: %s" msgstr "文件没有找到:%s" -#: fread.c:1374 +#: fread.c:1400 #, c-format msgid "Opened file ok but couldn't obtain its size: %s" msgstr "文件能够打开但无法获知其大小:%s" -#: fread.c:1377 fread.c:1405 +#: fread.c:1403 fread.c:1435 #, c-format msgid "File is empty: %s" msgstr "文件是空的:%s" -#: fread.c:1378 fread.c:1406 +#: fread.c:1404 fread.c:1436 #, c-format msgid " File opened, size = %s.\n" msgstr " 文件已打开,大小为 %s.\n" -#: fread.c:1401 +#: fread.c:1431 #, c-format msgid "Unable to open file after %d attempts (error %lu): %s" msgstr "经过 %d 次尝试后仍无法打开文件(错误 %lu):%s" -#: fread.c:1403 +#: fread.c:1433 #, c-format msgid "GetFileSizeEx failed (returned 0) on file: %s" msgstr "GetFileSizeEx 未能成功执行(返回值为0)于文件:%s" -#: fread.c:1408 +#: fread.c:1438 #, c-format msgid "This is Windows, CreateFileMapping returned error %lu for file %s" msgstr "现在在Windows下,CreateFileMapping 返回错误 %lu 于文件 %s" -#: fread.c:1415 +#: fread.c:1445 #, c-format msgid "" "Opened %s file ok but could not memory map it. This is a %dbit process. %s." msgstr "能够打开文件 %s 但不能创建内存映射。这是一个 %d 位进程。 %s." 
-#: fread.c:1416 +#: fread.c:1446 msgid "Please upgrade to 64bit" msgstr "请升级到64位" -#: fread.c:1416 +#: fread.c:1446 msgid "There is probably not enough contiguous virtual memory available" msgstr "多半没有足够的连续虚拟内存" -#: fread.c:1419 +#: fread.c:1449 msgid " Memory mapped ok\n" msgstr " 内存映射正常\n" -#: fread.c:1421 +#: fread.c:1451 msgid "" "Internal error: Neither `input` nor `filename` are given, nothing to read." msgstr "" "内部错误:既没有`input`(输入)也没有`filename`(文件名),没有什么可供读入。" -#: fread.c:1438 +#: fread.c:1468 msgid "[03] Detect and skip BOM\n" msgstr "[03] 检测并跳过字节顺序标记(BOM)\n" -#: fread.c:1442 +#: fread.c:1472 msgid "" " UTF-8 byte order mark EF BB BF found at the start of the file and " "skipped.\n" msgstr "在文件头发现了UTF-8 字节顺序标记(BOM)EF BB BF 并已跳过。\n" -#: fread.c:1447 +#: fread.c:1477 msgid "" "GB-18030 encoding detected, however fread() is unable to decode it. Some " "character fields may be garbled.\n" msgstr "检测到GB-18030 编码,但fread() 未能解码。某些 字符字段可能有乱码。\n" -#: fread.c:1450 +#: fread.c:1480 msgid "" "File is encoded in UTF-16, this encoding is not supported by fread(). Please " "recode the file to UTF-8." msgstr "文件编码是UTF-16,fread()不支持此编码。请 将文件转换为UTF-8。" -#: fread.c:1455 +#: fread.c:1485 #, c-format msgid " Last byte(s) of input found to be %s and removed.\n" msgstr " 发现输入的最后字节是 %s 并已去除。\n" -#: fread.c:1458 +#: fread.c:1488 msgid "Input is empty or only contains BOM or terminal control characters" msgstr "输入是空的或只有字节顺序标记(BOM)或终端控制字符" -#: fread.c:1465 +#: fread.c:1495 msgid "[04] Arrange mmap to be \\0 terminated\n" msgstr "[04] 设定mmap为 \\0 终止\n" -#: fread.c:1472 +#: fread.c:1502 msgid "" " No \\n exists in the file at all, so single \\r (if any) will be taken as " "one line ending. 
This is unusual but will happen normally when there is no " @@ -2091,7 +2132,7 @@ msgstr "" " 文件中完全没有换行符\\n,所以单个 \\r(如果有的话)将被当成一行的结束。这不" "太常见但如果没有\\r 的话属于正常;例如单个行没有行尾结束符。\n" -#: fread.c:1473 +#: fread.c:1503 msgid "" " \\n has been found in the input and different lines can end with different " "line endings (e.g. mixed \\n and \\r\\n in one file). This is common and " @@ -2100,7 +2141,7 @@ msgstr "" " 输入中有\\n 并且不同行可以有不同的 行尾结束符(如在一个文件中混合使用 \\n " "和\\r\\n)。这很常见也是理想情况。\n" -#: fread.c:1497 +#: fread.c:1527 #, c-format msgid "" " File ends abruptly with '%c'. Final end-of-line is missing. Using cow page " @@ -2109,7 +2150,7 @@ msgstr "" " 文件突然中止于 '%c'。没有最后一个行尾结束符。正使用写时复制页(cow, copy-" "on-write)写入 0 到最后一个字节。\n" -#: fread.c:1503 +#: fread.c:1533 msgid "" "This file is very unusual: it ends abruptly without a final newline, and " "also its size is a multiple of 4096 bytes. Please properly end the last row " @@ -2118,16 +2159,16 @@ msgstr "" "这个文件非常不正常:它突然中止而没有最后的换行,并且其大小是4096 字节的整数" "倍。请用一个换行(例如 'echo >> file')来恰当地结束最后一行以避免此错误" -#: fread.c:1504 +#: fread.c:1534 #, c-format msgid " File ends abruptly with '%c'. Copying file in RAM. %s copy.\n" msgstr " 文件突然中止于 '%c'。正在从内存中复制文件。%s 复制。\n" -#: fread.c:1538 +#: fread.c:1568 msgid "[05] Skipping initial rows if needed\n" msgstr "[05] 如需要的话跳过起始行\n" -#: fread.c:1544 +#: fread.c:1574 #, c-format msgid "" "skip='%s' not found in input (it is case sensitive and literal; i.e., no " @@ -2136,79 +2177,79 @@ msgstr "" "在输入中没有发现 skip='%s' (这里大小写敏感并需要是字面形式,也就是说不能使用" "模式,适配符或正则表达式)" -#: fread.c:1550 +#: fread.c:1580 #, c-format msgid "" "Found skip='%s' on line %. 
Taking this to be header row or first row " "of data.\n" msgstr "在行 %2$ 发现了 skip='%1$s'。将此当做表头或数据的第一行。\n" -#: fread.c:1563 +#: fread.c:1593 #, c-format msgid " Skipped to line % in the file" msgstr " 跳到文件的第 % 行" -#: fread.c:1564 +#: fread.c:1594 #, c-format msgid "skip=% but the input only has % line%s" msgstr "skip=% 但输入只有 % 行 %s" -#: fread.c:1573 +#: fread.c:1603 msgid "" "Input is either empty, fully whitespace, or skip has been set after the last " "non-whitespace." msgstr "输入是空,或全部为空白,或跳过设置是在最后一个非空白字符之后。" -#: fread.c:1575 +#: fread.c:1605 #, c-format msgid " Moved forward to first non-blank line (%d)\n" msgstr " 前移到第一个非空行 (%d)\n" -#: fread.c:1576 +#: fread.c:1606 #, c-format msgid " Positioned on line %d starting: <<%s>>\n" msgstr " 定位到行 %d 开始于: <<%s>>\n" -#: fread.c:1594 +#: fread.c:1624 msgid "[06] Detect separator, quoting rule, and ncolumns\n" msgstr "[06] 检测分隔符,引用规则,以及列数\n" -#: fread.c:1598 +#: fread.c:1628 msgid " sep='\\n' passed in meaning read lines as single character column\n" msgstr " sep='\\n' 设定意味着将把所有行读作一个字符列\n" -#: fread.c:1617 +#: fread.c:1647 msgid " Detecting sep automatically ...\n" msgstr " 自动检测分隔符中 ...\n" -#: fread.c:1624 +#: fread.c:1654 #, c-format msgid " Using supplied sep '%s'\n" msgstr " 使用提供的分隔符 '%s'\n" -#: fread.c:1658 +#: fread.c:1688 #, c-format msgid " with %d fields using quote rule %d\n" msgstr " 对 %d 个字段使用引用规则 %d\n" -#: fread.c:1708 +#: fread.c:1738 #, c-format msgid " with %d lines of %d fields using quote rule %d\n" msgstr " 对 %d 行的 %d 字段使用引用规则 %d\n" -#: fread.c:1715 +#: fread.c:1745 msgid "" " No sep and quote rule found a block of 2x2 or greater. Single column " "input.\n" msgstr " 没有分隔符并且引用规则发现了一个大于或等于2x2的区块。输入是单列。\n" -#: fread.c:1731 +#: fread.c:1761 msgid "" "Single column input contains invalid quotes. Self healing only effective " "when ncol>1" msgstr "单列输入包含了不合法的引用。自我修正只有在列数大于1(ncol>1)时才有效" -#: fread.c:1736 +#: fread.c:1766 #, c-format msgid "" "Found and resolved improper quoting in first %d rows. 
If the fields are not " @@ -2218,35 +2259,35 @@ msgstr "" "在前 %d 行中发现并修正了不合适的引号用法。如果字段没有加引号(例如字段间隔符" "没有在任何字段内出现),可以尝试使用 quote=\"\" 来避免此警告。" -#: fread.c:1752 +#: fread.c:1782 #, c-format msgid "" "Internal error: ncol==%d line==%d after detecting sep, ncol and first line" msgstr "内部错误:检测分隔符,列数和首行后,ncol==%d line==%d" -#: fread.c:1755 +#: fread.c:1785 #, c-format msgid "Internal error: first line has field count %d but expecting %d" msgstr "内部错误:首行有%d个字段,但应该有%d个" -#: fread.c:1757 +#: fread.c:1787 #, c-format msgid "" " Detected %d columns on line %d. This line is either column names or first " "data row. Line starts as: <<%s>>\n" msgstr "检测到第%2$d行有%1$d列。该行为列名或数据集首行。该行以<<%3$s>>开始\n" -#: fread.c:1759 +#: fread.c:1789 #, c-format msgid " Quote rule picked = %d\n" msgstr "标点符号规则 = %d\n" -#: fread.c:1760 +#: fread.c:1790 #, c-format msgid " fill=%s and the most number of columns found is %d\n" msgstr "fill=%s 且找到的最大列数为 %d\n" -#: fread.c:1766 +#: fread.c:1796 msgid "" "This file is very unusual: it's one single column, ends with 2 or more end-" "of-line (representing several NA at the end), and is a multiple of 4096, too." @@ -2254,12 +2295,12 @@ msgstr "" "该文件极为特殊,仅有一列数据,在结尾处包含多个行结束标记(表示多个空值),且" "长度为4096的整数倍。" -#: fread.c:1767 +#: fread.c:1797 #, c-format msgid " Copying file in RAM. %s\n" msgstr "正在将文件拷贝到RAM。%s\n" -#: fread.c:1773 +#: fread.c:1803 msgid "" " 1-column file ends with 2 or more end-of-line. 
Restoring last eol using " "extra byte in cow page.\n" @@ -2267,37 +2308,41 @@ msgstr "" "该文件包含一列数据,存在多个行结束标记(表示多个空值)。正在使用写时复制页" "(cow, copy-on-write)额外的字节恢复最后一个标记.\n" -#: fread.c:1792 +#: fread.c:1822 msgid "" -"[07] Detect column types, good nrow estimate and whether first row is column " -"names\n" -msgstr "[07] 检测列类型,估计行数以及首行是否为列名\n" +"[07] Detect column types, dec, good nrow estimate and whether first row is " +"column names\n" +msgstr "[07] 检测列类型,小数点分隔,良好的行数估计以及首行是否为列名\n" -#: fread.c:1793 +#: fread.c:1823 #, c-format msgid " 'header' changed by user from 'auto' to %s\n" msgstr " 用户已将'header'(列名)从 'auto' 改为 %s\n" -#: fread.c:1797 +#: fread.c:1827 #, c-format msgid "Failed to allocate 2 x %d bytes for type and tmpType: %s" msgstr "为 %2$s 类型分配 2 x %1$d bytes失败" -#: fread.c:1818 +#: fread.c:1831 +msgid " sep=',' so dec set to '.'\n" +msgstr " sep=',' 所以 dec 被设成 '.'\n" + +#: fread.c:1855 #, c-format msgid " Number of sampling jump points = %d because " msgstr "采样跳点数 = %d 因为" -#: fread.c:1819 +#: fread.c:1856 #, c-format msgid "nrow limit (%) supplied\n" msgstr "指定了nrow 的最大值 (%) \n" -#: fread.c:1820 +#: fread.c:1857 msgid "jump0size==0\n" msgstr "jump0size==0\n" -#: fread.c:1821 +#: fread.c:1858 #, c-format msgid "" "(% bytes from row 1 to eof) / (2 * % jump0size) == " @@ -2305,53 +2350,58 @@ msgid "" msgstr "" "(从首行到结束共 % bytes) / (2 * % jump0size) == %\n" -#: fread.c:1859 +#: fread.c:1897 #, c-format msgid "" " A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. " "%s\n" msgstr "第%5$d个跳点所找到的第%4$d行,该行字段过于%1$s(%2$d/%3$d). %6$s\n" -#: fread.c:1860 +#: fread.c:1898 msgid "few" msgstr "少" -#: fread.c:1860 +#: fread.c:1898 msgid "many" msgstr "多" -#: fread.c:1860 +#: fread.c:1898 msgid "" "Most likely this jump landed awkwardly so type bumps here will be skipped." 
msgstr "很有可能这一跳点的位置并不合适,因此此处的类型转换将被跳过。" -#: fread.c:1886 +#: fread.c:1921 fread.c:1947 +#, c-format +msgid " dec='%c' detected based on a balance of %d parsed fields\n" +msgstr " dec='%c' 因为 %d 个解析字段的余额被检测到\n" + +#: fread.c:1931 #, c-format msgid " Type codes (jump %03d) : %s Quote rule %d\n" msgstr " 类型码(跳点 %03d) : %s 引用规则 %d\n" -#: fread.c:1898 +#: fread.c:1953 #, c-format msgid "" " 'header' determined to be true due to column %d containing a string on row " "1 and a lower type (%s) in the rest of the % sample rows\n" msgstr "" -" 'header' 参数设为真,原因是第%1$d列首行包含字符串,并且在样本中的另外%3$行" -"包含有较底层的数据类型(%2$s)\n" +" 'header' 参数设为真,原因是第%1$d列首行包含字符串,并且在样本中的另" +"外%3$行包含有较底层的数据类型(%2$s)\n" -#: fread.c:1910 +#: fread.c:1965 msgid "" "Internal error: row before first data row has the same number of fields but " "we're not using it." msgstr "内部错误:数据首行的前一行包含相同数量的字段但不会用到该行。" -#: fread.c:1911 +#: fread.c:1966 msgid "" "Internal error: ch!=pos after counting fields in the line before the first " "data row." msgstr "内部错误:对数据首行前一行的字段计数后,ch不等于pos" -#: fread.c:1912 +#: fread.c:1967 #, c-format msgid "" "Types in 1st data row match types in 2nd data row but previous row has %d " @@ -2360,7 +2410,7 @@ msgstr "" "数据第一行的类型与第二行相匹配,但是之前的行有 %d 个字段。故将第一行数据的前" "一行作为列名" -#: fread.c:1915 +#: fread.c:1970 #, c-format msgid "" "Detected %d column names but the data has %d columns (i.e. invalid file). " @@ -2368,7 +2418,7 @@ msgid "" msgstr "" "检测到 %d 个列名,然而数据共有 %d 列(文件不合法)。添加了 %d 个额外列名%s\n" -#: fread.c:1916 +#: fread.c:1971 msgid "" " for the first column which is guessed to be row names or an index. Use " "setnames() afterwards if this guess is not correct, or fix the file write " @@ -2377,17 +2427,17 @@ msgstr "" "作为第一列,并被用于猜测行名或索引。若上述猜测不正确,可在后续使用setnames()" "进行修改,或修复用于生成该文件的文件写入命令以生成有效的文件。" -#: fread.c:1916 +#: fread.c:1971 msgid "s at the end." msgstr "到结尾处" -#: fread.c:1918 +#: fread.c:1973 msgid "" "Internal error: fill=true but there is a previous row which should already " "have been filled." 
msgstr "内部错误:参数fill=true,但是在此之前有一行应当已经被填充。" -#: fread.c:1919 +#: fread.c:1974 #, c-format msgid "" "Detected %d column names but the data has %d columns. Filling rows " @@ -2396,74 +2446,74 @@ msgstr "" "检测到%d个列名,但数据共有%d列。已经自动填充。设置参数fill=TRUE以屏蔽此警" "告。\n" -#: fread.c:1923 +#: fread.c:1978 #, c-format msgid "Failed to realloc 2 x %d bytes for type and tmpType: %s" msgstr "为 %2$s 类型重新分配 2 x %1$d bytes失败" -#: fread.c:1943 +#: fread.c:1998 #, c-format msgid "" " 'header' determined to be %s because there are%s number fields in the " "first and only row\n" msgstr " 参数'header' 被设置为%s, 因为唯一的一行包含 %s 个字段\n" -#: fread.c:1943 +#: fread.c:1998 msgid " no" msgstr "0" -#: fread.c:1946 +#: fread.c:2001 msgid "" " 'header' determined to be true because all columns are type string and a " "better guess is not possible\n" msgstr "参数 'header' 被设置为true,因为所有列类型均为字符串\n" -#: fread.c:1948 +#: fread.c:2003 msgid "" " 'header' determined to be false because there are some number columns and " "those columns do not have a string field at the top of them\n" msgstr "参数 'header' 被设置为false,因为部分字段的首行不为字符串\n" -#: fread.c:1964 +#: fread.c:2019 #, c-format msgid " Type codes (first row) : %s Quote rule %d\n" msgstr " 类型码(第一行) : %s 引用规则 %d\n" -#: fread.c:1973 +#: fread.c:2028 #, c-format msgid "" " All rows were sampled since file is small so we know nrow=% " "exactly\n" msgstr " 文件太小,全部行均被采样到,所以 nrow=%\n" -#: fread.c:1985 fread.c:1992 +#: fread.c:2040 fread.c:2047 msgid " =====\n" msgstr " =====\n" -#: fread.c:1986 +#: fread.c:2041 #, c-format msgid "" " Sampled % rows (handled \\n inside quoted fields) at %d jump " "points\n" msgstr " 已使用了 %2$d个跳点抽样 %1$ 行(处理了字段间的分隔符\\n)\n" -#: fread.c:1987 +#: fread.c:2042 #, c-format msgid "" " Bytes from first data row on line %d to the end of last row: %\n" msgstr " 从第一个数据行(%d)到最后一行的字节: %\n" -#: fread.c:1988 +#: fread.c:2043 #, c-format msgid " Line length: mean=%.2f sd=%.2f min=%d max=%d\n" msgstr "文件每行长度的统计量:均值=%.2f,标准差=%.2f,最小值=%d ,最大值=%d\n" -#: fread.c:1989 
+#: fread.c:2044 #, c-format msgid " Estimated number of rows: % / %.2f = %\n" msgstr "估计数据共有 % / %.2f = % 行\n" -#: fread.c:1990 +#: fread.c:2045 #, c-format msgid "" " Initial alloc = % rows (% + %d%%) using bytes/" @@ -2472,44 +2522,44 @@ msgstr "" "为 % 行 (% + %d%%)分配初始内存,大小为字节数/max(mean-2*sd," "min),并确保该数值落于区间[1.1*estn, 2.0*estn]中\n" -#: fread.c:1994 +#: fread.c:2049 #, c-format msgid "Internal error: sampleLines(%) > allocnrow(%)" msgstr "内部错误:sampleLines(%) > allocnrow(%)" -#: fread.c:1998 +#: fread.c:2053 #, c-format msgid " Alloc limited to lower nrows=% passed in.\n" msgstr " 分配被限制在输入的更小的 nrows=% 值上。\n" -#: fread.c:2010 +#: fread.c:2065 msgid "[08] Assign column names\n" msgstr "[08] 指定列名\n" -#: fread.c:2018 +#: fread.c:2073 #, c-format msgid "Unable to allocate %d*%d bytes for column name pointers: %s" msgstr "无法分配 %d*%d 字节给列名指针: %s" -#: fread.c:2040 +#: fread.c:2095 #, c-format msgid "Internal error: reading colnames ending on '%c'" msgstr "内部错误:读取列名终止于 '%c'" -#: fread.c:2058 +#: fread.c:2113 msgid "[09] Apply user overrides on column types\n" msgstr "[09] 使用用户指定的列类型\n" -#: fread.c:2062 +#: fread.c:2117 msgid " Cancelled by user: userOverride() returned false." 
msgstr " 用户已取消:userOverride() 返回 false。" -#: fread.c:2072 +#: fread.c:2127 #, c-format msgid "Failed to allocate %d bytes for size array: %s" msgstr "无法分配 %d 字节给 size 数组:%s" -#: fread.c:2079 +#: fread.c:2134 #, c-format msgid "" "Attempt to override column %d%s%.*s%s of inherent type '%s' down to '%s' " @@ -2519,55 +2569,65 @@ msgstr "" "试图覆盖第 %d 列 %s%.*s%s,将内部类型 '%s' 降级为 '%s' 的操作被忽略。只支持将" "列类型升为更高阶的类型。如果确定此操作,请完成之后再转换类型。" -#: fread.c:2094 +#: fread.c:2149 #, c-format msgid " After %d type and %d drop user overrides : %s\n" msgstr " 经过 %d 类型和 %d 丢弃用户覆盖:%s\n" -#: fread.c:2102 +#: fread.c:2157 msgid "[10] Allocate memory for the datatable\n" msgstr "[10] 分配内存给 datatable\n" -#: fread.c:2103 +#: fread.c:2158 #, c-format msgid " Allocating %d column slots (%d - %d dropped) with % rows\n" msgstr " 正在分配 %d 列位置(%d - %d 已丢弃),% 行\n" -#: fread.c:2157 +#: fread.c:2213 #, c-format msgid "Buffer size % is too large\n" msgstr "缓冲长度 % 过大\n" -#: fread.c:2160 +#: fread.c:2216 msgid "[11] Read the data\n" msgstr "[11] 读取数据\n" -#: fread.c:2163 +#: fread.c:2219 #, c-format msgid " jumps=[%d..%d), chunk_size=%, total_size=%\n" msgstr " jumps=[%d..%d),chunk_size=%,total_size=%\n" -#: fread.c:2175 +#: fread.c:2231 #, c-format msgid "Internal error: Master thread is not thread 0 but thread %d.\n" msgstr "内部错误:主线程并非线程0而是线程%d\n" -#: fread.c:2386 +#: fread.c:2444 #, c-format msgid "" "Column %d%s%.*s%s bumped from '%s' to '%s' due to <<%.*s>> on row %\n" msgstr "" -"第 %d 列 %s%.*s%s 发生了从 '%s' 到 '%s' 的类型转换,原因是由于 <<%.*s>> 出现在第 " -"% 行\n" +"第 %d 列 %s%.*s%s 发生了从 '%s' 到 '%s' 的类型转换,原因是由于 <<%.*s>> 出现" +"在第 % 行\n" -#: fread.c:2436 +#: fread.c:2494 #, c-format msgid "" "Internal error: invalid head position. 
jump=%d, headPos=%p, " "thisJumpStart=%p, sof=%p" msgstr "内部错误:head 位置无效。jump=%d, headPos=%p, thisJumpStart=%p, sof=%p" -#: fread.c:2509 +#: fread.c:2562 +#, c-format +msgid " Provided number of fill columns: %d but only found %d\n" +msgstr "提供的填充列数: %d 但只找到 %d\n" + +#: fread.c:2563 +#, c-format +msgid " Dropping %d overallocated columns\n" +msgstr " 删除 %d 个过度分配的列\n" + +#: fread.c:2586 #, c-format msgid "" " Too few rows allocated. Allocating additional % rows (now " @@ -2576,42 +2636,52 @@ msgstr "" " 分配的行数太少。正在分配额外的 % 行(当前 nrows=%),并从跳" "跃 %d 继续读取\n" -#: fread.c:2516 +#: fread.c:2593 #, c-format msgid " Restarting team from jump %d. nSwept==%d quoteRule==%d\n" msgstr " 从跳跃 %d 重启组。nSwept==%d quoteRule==%d\n" -#: fread.c:2536 +#: fread.c:2613 #, c-format msgid " %d out-of-sample type bumps: %s\n" msgstr " %d 样本外类型变更:%s\n" -#: fread.c:2571 +#: fread.c:2648 #, c-format msgid "" "Read % rows x %d columns from %s file in %02d:%06.3f wall clock " "time\n" msgstr "读取 % 行 x %d 列,从 %s 文件(时钟时间 %02d:%06.3f)\n" -#: fread.c:2578 +#: fread.c:2655 msgid "[12] Finalizing the datatable\n" msgstr "[12] 最后定型 datatable\n" -#: fread.c:2579 +#: fread.c:2656 msgid " Type counts:\n" msgstr " 类型数量:\n" -#: fread.c:2581 +#: fread.c:2658 #, c-format msgid "%10d : %-9s '%c'\n" msgstr "%10d : %-9s '%c'\n" -#: fread.c:2597 +#: fread.c:2674 #, c-format msgid "Discarded single-line footer: <<%s>>" msgstr "丢弃末尾行:<<%s>>" -#: fread.c:2602 +#: fread.c:2680 +#, c-format +msgid "" +"Stopped early on line %. Expected %d fields but found %d. Consider " +"fill=%d or even more based on your knowledge of the input file. First " +"discarded non-empty line: <<%s>>" +msgstr "" +"在第 % 行提前终止。预期有 %d 个字段但只找到 %d 个。可以考虑设置 " +"fill=%d 甚至更多,基于您对输入文件的了解。 首个丢弃的非空行:<<%s>>" + +#: fread.c:2683 #, c-format msgid "" "Stopped early on line %. Expected %d fields but found %d. 
Consider " @@ -2620,7 +2690,7 @@ msgstr "" "在第 % 行提前终止。预期有 %d 个字段但只找到 %d 个。可以考虑设置 " "fill=TRUE 和 comment.char=。 首个丢弃的非空行:<<%s>>" -#: fread.c:2608 +#: fread.c:2690 #, c-format msgid "" "Found and resolved improper quoting out-of-sample. First healed line " @@ -2631,31 +2701,31 @@ msgstr "" "不在引号内(例如:字段间隔符没有在任何一个字段中出现),尝试用 quote=\"\" 来" "避免该警告。" -#: fread.c:2612 +#: fread.c:2694 msgid "=============================\n" msgstr "=============================\n" -#: fread.c:2614 +#: fread.c:2696 #, c-format msgid "%8.3fs (%3.0f%%) Memory map %.3fGB file\n" msgstr "%8.3fs (%3.0f%%) 内存映射 %.3fGB 文件\n" -#: fread.c:2615 +#: fread.c:2697 #, c-format msgid "%8.3fs (%3.0f%%) sep=" msgstr "%8.3fs (%3.0f%%) sep=" -#: fread.c:2617 +#: fread.c:2699 #, c-format msgid " ncol=%d and header detection\n" msgstr " ncol=%d 和表头检测\n" -#: fread.c:2618 +#: fread.c:2700 #, c-format msgid "%8.3fs (%3.0f%%) Column type detection using % sample rows\n" msgstr "%8.3fs (%3.0f%%) 列类型检测基于 % 个样本行\n" -#: fread.c:2620 +#: fread.c:2702 #, c-format msgid "" "%8.3fs (%3.0f%%) Allocation of % rows x %d cols (%.3fGB) of which " @@ -2664,7 +2734,7 @@ msgstr "" "%8.3fs (%3.0f%%) % 行 x %d 列 (%.3fGB) 的分配中已使用 % " "(%3.0f%%) 行\n" -#: fread.c:2624 +#: fread.c:2706 #, c-format msgid "" "%8.3fs (%3.0f%%) Reading %d chunks (%d swept) of %.3fMB (each chunk %d rows) " @@ -2673,29 +2743,29 @@ msgstr "" "%8.3fs (%3.0f%%) 正在读取 %d 个块 (%d 已扫描) of %.3fMB (每个块 %d 行) 使用 " "%d 个线程\n" -#: fread.c:2626 +#: fread.c:2708 #, c-format msgid "" " + %8.3fs (%3.0f%%) Parse to row-major thread buffers (grown %d times)\n" msgstr " + %8.3fs (%3.0f%%) 解析到行处理线程的缓冲区(已增长 %d 次)\n" -#: fread.c:2627 +#: fread.c:2709 #, c-format msgid " + %8.3fs (%3.0f%%) Transpose\n" msgstr " + %8.3fs (%3.0f%%) 转置\n" -#: fread.c:2628 +#: fread.c:2710 #, c-format msgid " + %8.3fs (%3.0f%%) Waiting\n" msgstr " + %8.3fs (%3.0f%%) 正在等待\n" -#: fread.c:2629 +#: fread.c:2711 #, c-format msgid "" "%8.3fs (%3.0f%%) Rereading %d columns due to out-of-sample type 
exceptions\n" msgstr "%8.3fs (%3.0f%%) 正在重读 %d 列,由于样本外类型异常\n" -#: fread.c:2631 +#: fread.c:2713 #, c-format msgid "%8.3fs Total\n" msgstr "%8.3fs 总计\n" @@ -2731,33 +2801,33 @@ msgid "" "Internal error: freadR dec not a single character. R level catches this." msgstr "内部错误:freadR dec 不是单个字符。R 中应该捕获此错误。" -#: freadR.c:113 +#: freadR.c:114 msgid "quote= must be a single character, blank \"\", or FALSE" msgstr "quote= 必须是单个字符,空白 \"\",或者 FALSE" -#: freadR.c:125 +#: freadR.c:126 msgid "Internal error: freadR nrows not a single real. R level catches this." msgstr "内部错误:freadR nrows 并非为一单一实数。R 中应该捕获此错误。" -#: freadR.c:141 +#: freadR.c:142 msgid "Internal error: skip not integer or string in freadR.c" msgstr "内部错误:freadR.c 中 skip 非整数或字符串" -#: freadR.c:144 +#: freadR.c:145 #, c-format msgid "Internal error: NAstringsArg is type '%s'. R level catches this" msgstr "内部错误:NAstringsArg是'%s'数据类型.R中能够捕获这个信息" -#: freadR.c:157 +#: freadR.c:158 #, c-format msgid "nThread(%d)<1" msgstr "nThread(%1$d)<1(线程数(%1$d)小于1)" -#: freadR.c:165 +#: freadR.c:166 msgid "'integer64' must be a single character string" msgstr "'64整数型'必须是单个字符串" -#: freadR.c:173 +#: freadR.c:174 #, c-format msgid "" "Invalid value integer64='%s'. Must be 'integer64', 'character', 'double' or " @@ -2766,11 +2836,11 @@ msgstr "" "64位整数型有效值='%s'.必须是'64位整数型','字符串','双精度浮点型'或者'数值" "型'" -#: freadR.c:181 +#: freadR.c:182 msgid "Use either select= or drop= but not both." msgstr "select=和drop=不可同时使用" -#: freadR.c:184 +#: freadR.c:185 msgid "" "select= is type list for specifying types in select=, but colClasses= has " "been provided as well. Please remove colClasses=." 
@@ -2778,7 +2848,7 @@ msgstr "" "select=是用于在select=中指定类型的类型列表,但是还提供了colClasses=。请删除" "colClasses=。" -#: freadR.c:186 +#: freadR.c:187 msgid "" "select= is type list but has no names; expecting list(type1=cols1, " "type2=cols2, ...)" @@ -2786,7 +2856,7 @@ msgstr "" "select =是类型列表,但没有名称; 期望列表(type1 = cols1,type2 = " "cols2,...)" -#: freadR.c:193 +#: freadR.c:194 msgid "" "select= is a named vector specifying the columns to select and their types, " "but colClasses= has been provided as well. Please remove colClasses=." @@ -2794,45 +2864,45 @@ msgstr "" "select =是一个命名向量,用于指定要选择的列及其类型,但是还提供了colClasses " "=。 请删除colClasses =。" -#: freadR.c:201 freadR.c:367 +#: freadR.c:202 freadR.c:368 msgid "colClasses is type list but has no names" msgstr "colClasses是类型列表,但没有名称" -#: freadR.c:211 +#: freadR.c:212 #, c-format msgid "encoding='%s' invalid. Must be 'unknown', 'Latin-1' or 'UTF-8'" msgstr "encoding ='%s'无效。 必须为'未知','Latin-1'或'UTF-8'" -#: freadR.c:234 +#: freadR.c:235 #, c-format msgid "Column name '%s' (%s) not found" msgstr "找不到列名'%s'(%s)" -#: freadR.c:236 +#: freadR.c:237 #, c-format msgid "%s is NA" msgstr "%s是缺失值" -#: freadR.c:238 +#: freadR.c:239 #, c-format msgid "%s is %d which is out of range [1,ncol=%d]" msgstr "%s是%d,超出范围[1,ncol =%d]" -#: freadR.c:252 +#: freadR.c:253 msgid "Internal error: typeSize[CT_BOOL8_N] != 1" msgstr "内部错误:类型大小[CT_BOOL8_N]不等于1" -#: freadR.c:253 +#: freadR.c:254 msgid "Internal error: typeSize[CT_STRING] != 1" msgstr "内部错误:类型大小[CT_STRING]不等于1" -#: freadR.c:287 +#: freadR.c:288 #, c-format msgid "" "Column name '%s' not found in column name header (case sensitive), skipping." msgstr "在列名标题中找不到列名'%s'(区分大小写),正在跳过。" -#: freadR.c:297 +#: freadR.c:298 #, c-format msgid "" "Column number %d (select[%d]) is negative but should be in the range [1," @@ -2840,7 +2910,7 @@ msgid "" msgstr "" "列号%d(select [%d])为负,但应在[1,ncol =%d]范围内。考虑drop=用于排除列。" -#: freadR.c:298 +#: freadR.c:299 #, c-format msgid "" "select = 0 (select[%d]) has no meaning. 
All values of select should be in " @@ -2848,19 +2918,19 @@ msgid "" msgstr "" "select=0(select[%d])没有意义。select的所有值都应在[1,ncol=%d]范围内。" -#: freadR.c:299 +#: freadR.c:300 #, c-format msgid "" "Column number %d (select[%d]) is too large for this table, which only has %d " "columns." msgstr "对于此表(仅包含%d列,)列号%d(select [%d])太大。" -#: freadR.c:300 +#: freadR.c:301 #, c-format msgid "Column number %d ('%s') has been selected twice by select=" msgstr "列号%d('%s')已由select =选择两次" -#: freadR.c:323 +#: freadR.c:324 #, c-format msgid "" "colClasses= is an unnamed vector of types, length %d, but there are %d " @@ -2872,11 +2942,11 @@ msgstr "" "定类型,可以使用命名向量,列表格式或使用select=而不是colClasses=。请参阅'?" "fread'中的示例。" -#: freadR.c:343 +#: freadR.c:344 msgid "Internal error: selectInts is NULL but selectColClasses is true" msgstr "内部错误:selectInts为NULL,但selectColClasses为true" -#: freadR.c:345 +#: freadR.c:346 msgid "" "Internal error: length(selectSxp)!=length(colClassesSxp) but " "selectColClasses is true" @@ -2884,22 +2954,27 @@ msgstr "" "内部错误:length(select xp)!=length(colClasses xp),但select ColClasses" "为true" -#: freadR.c:365 +#: freadR.c:366 #, c-format msgid "colClasses is type '%s' but should be list or character" msgstr "colClasses是类型'%s',但应该是列表或字符" -#: freadR.c:389 +#: freadR.c:390 #, c-format msgid "Column name '%s' (colClasses[[%d]][%d]) not found" msgstr "找不到列名'%s'(colClasses[[%d]][%d])" -#: freadR.c:391 +#: freadR.c:392 #, c-format msgid "colClasses[[%d]][%d] is NA" msgstr "colClasses[[%d]][%d]是NA" -#: freadR.c:395 +#: freadR.c:396 +#, c-format +msgid "Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]" +msgstr "列号%d(colClasses[[%d]][%d])超出范围[1,ncol=%d]" + +#: freadR.c:400 #, c-format msgid "" "Column %d ('%s') appears more than once in colClasses. The second time is " @@ -2907,22 +2982,17 @@ msgid "" msgstr "" "Column %d ('%s')在colClasses中出现了多次。第二次是colClasses[[%d]][%d]." 
-#: freadR.c:407 -#, c-format -msgid "Column number %d (colClasses[[%d]][%d]) is out of range [1,ncol=%d]" -msgstr "列号%d(colClasses[[%d]][%d])超出范围[1,ncol=%d]" - -#: freadR.c:625 +#: freadR.c:640 #, c-format msgid "Field size is 1 but the field is of type %d\n" msgstr "字段大小为1,但字段类型为%d \n" -#: freadR.c:634 +#: freadR.c:649 #, c-format msgid "Internal error: unexpected field of size %d\n" msgstr "内部错误:大小为%d 的意外字段\n" -#: freadR.c:702 utils.c:376 +#: freadR.c:717 utils.c:387 #, c-format msgid "%s" msgstr "%s" @@ -3061,8 +3131,8 @@ msgid "" "Internal error: invalid %s argument in %s function should have been caught " "earlier. Please report to the data.table issue tracker." msgstr "" -"内部错误:函数 %s 中的 %s 参数无效,该错误理应已被捕获。请向 data.table" -"开发团队提交 issue 报告此问题。" +"内部错误:函数 %s 中的 %s 参数无效,该错误理应已被捕获。请向 data.table开发团" +"队提交 issue 报告此问题。" #: frollR.c:112 msgid "" @@ -3303,8 +3373,8 @@ msgid "" "the time data.table was compiled. To enable fwrite compression, please " "reinstall data.table and study the output for further guidance." msgstr "" -"fwrite 中的压缩功能使用了 zlib 库。并未在 data.table 编译时找到它的头文件。" -"若想启用 fwrite 的压缩功能,请重新安装 data.table 并查看输出以获得进一步的指" +"fwrite 中的压缩功能使用了 zlib 库。并未在 data.table 编译时找到它的头文件。若" +"想启用 fwrite 的压缩功能,请重新安装 data.table 并查看输出以获得进一步的指" "导。" #: fwrite.c:704 @@ -3445,14 +3515,14 @@ msgstr "列%d的长度(%d)和列1的长度(%)不一致" msgid "Column %d's type is '%s' - not yet implemented in fwrite." msgstr "列%d的类型是'%s' - 尚未在fwrite中实施" -#: fwriteR.c:267 +#: fwriteR.c:268 #, c-format msgid "" -"input has specific integer rownames but their length (%) != nrow " +"input has specific integer rownames but their length (%lld) != nrow " "(%)" -msgstr "输入具有特定的整数行名但是他们的长度(%) != nrow (%)" +msgstr "输入具有特定的整数行名但是他们的长度(%lld) != nrow (%)" -#: fwriteR.c:282 +#: fwriteR.c:283 msgid "" "No list columns are present. Setting sep2='' otherwise quote='auto' would " "quote fields containing sep2.\n" @@ -3460,7 +3530,7 @@ msgstr "" "当前没有列表页. 
设置sep2=''否则quote='auto'会自动为所有包含sep2的字段加上引" "号.\n" -#: fwriteR.c:286 +#: fwriteR.c:287 #, c-format msgid "" "If quote='auto', fields will be quoted if the field contains either sep " @@ -3470,7 +3540,7 @@ msgstr "" "that host lists),所有包含sep('%1$c') 或 sep2 ('%2$c')的字段将会被自动加上引" "号。\n" -#: fwriteR.c:290 +#: fwriteR.c:291 #, c-format msgid "" "sep ('%c'), sep2 ('%c') and dec ('%c') must all be different. Column %d is a " @@ -3574,8 +3644,8 @@ msgid "" "Type '%s' is not supported by GForce %s. Either add the prefix %s or turn " "off GForce optimization using options(datatable.optimize=1)" msgstr "" -"GForce %s 不支持类型'%s',要么添加前缀 %s,要么使用选项" -"datatable.optimize=1来关闭GForce优化。" +"GForce %s 不支持类型'%s',要么添加前缀 %s,要么使用选项datatable.optimize=1来" +"关闭GForce优化。" #: gsumm.c:584 #, c-format @@ -3593,8 +3663,8 @@ msgid "" "Type '%s' not supported by GForce mean (gmean). Either add the prefix base::" "mean(.) or turn off GForce optimization using options(datatable.optimize=1)" msgstr "" -"类型 '%s' 不支持应用 GForce mean(gmin) 优化。你可以添加前缀 base::mean(.) 或" -"者使用 options(datatable.optimize=1) 关闭 GForce 优化" +"类型 '%s' 不支持应用 GForce mean(gmin) 优化。你可以添加前缀 base::mean(.) " +"或者使用 options(datatable.optimize=1) 关闭 GForce 优化" #: gsumm.c:724 msgid "" @@ -3603,10 +3673,10 @@ msgid "" "SD) or turn off GForce optimization using options(datatable.optimize=1). " "More likely, you may be looking for 'DT[,lapply(.SD,min),by=,.SDcols=]'" msgstr "" -"GForce 的 min/max 函数只能在应用在数据列上,无法应用于 .SD 或者其他对象上。为了" -"找到某个 list,如 .SD 里所有元素的最大值/最大值,请使用 base::min(.SD) 或者通过" -"设置 options(datatable.optimize=1) 来关闭 GForce 优化。更有可能的是,你真正想" -"要使用的命令是 'DT[,lapply(.SD,median),by=,.SDcols=]'" +"GForce 的 min/max 函数只能在应用在数据列上,无法应用于 .SD 或者其他对象上。为" +"了找到某个 list,如 .SD 里所有元素的最大值/最大值,请使用 base::min(.SD) 或者" +"通过设置 options(datatable.optimize=1) 来关闭 GForce 优化。更有可能的是,你真" +"正想要使用的命令是 'DT[,lapply(.SD,median),by=,.SDcols=]'" #: gsumm.c:835 msgid "Type 'complex' has no well-defined min/max" @@ -3641,8 +3711,9 @@ msgid "" "the namespace prefix (e.g. 
utils::head(.)) or turn off GForce optimization " "using options(datatable.optimize=1)" msgstr "" -"类型 '%s' 不支持应用 GForce head/tail/first/last/`[` 优化。你可以添加命名空间前" -"缀(如 utils::head(.))或者使用 options(datatable.optimize=1) 关闭 GForce 优化" +"类型 '%s' 不支持应用 GForce head/tail/first/last/`[` 优化。你可以添加命名空间" +"前缀(如 utils::head(.))或者使用 options(datatable.optimize=1) 关闭 GForce " +"优化" #: gsumm.c:995 gsumm.c:1001 msgid "" @@ -3720,9 +3791,8 @@ msgid "" "(e.g. data.table::shift(.)) or turn off GForce optimization using " "options(datatable.optimize=1)" msgstr "" -"类型 '%s' 不支持应用 GForce gshift 优化。你可以添加命名空间前" -"缀(如 data.table::shift(.))或者使用 options(datatable.optimize=1) 关闭 GForce" -"优化" +"类型 '%s' 不支持应用 GForce gshift 优化。你可以添加命名空间前缀(如 data." +"table::shift(.))或者使用 options(datatable.optimize=1) 关闭 GForce优化" #: idatetime.c:126 vecseq.c:13 msgid "x must be an integer vector" @@ -3821,126 +3891,122 @@ msgstr "内部错误:在重叠中出现未知的mult:%d" msgid "Final step, fetching indices in overlaps ... done in %8.3f seconds\n" msgstr "重叠的最后一步:获取索引...在%8.3f秒内完成\n" -#: init.c:163 +#: init.c:165 msgid "" "Pointers are %zu bytes, greater than 8. We have not tested on any " "architecture greater than 64bit yet." msgstr "指针是%zu个字节,大于8。我们尚未在大于64位的任何体系结构上进行测试。" -#: init.c:177 +#: init.c:179 msgid "... failed. Please forward this message to maintainer('data.table')." msgstr "... 
失败。请将此消息转发给 maintainer('data.table')。" -#: init.c:178 +#: init.c:180 #, c-format msgid "Checking NA_INTEGER [%d] == INT_MIN [%d] %s" msgstr "检查NA_INTEGER [%d] == INT_MIN [%d] %s" -#: init.c:179 +#: init.c:181 #, c-format msgid "Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s" msgstr "检查Checking NA_INTEGER [%d] == NA_LOGICAL [%d] %s" -#: init.c:180 init.c:181 init.c:183 init.c:186 init.c:187 init.c:188 init.c:189 -#: init.c:190 init.c:191 init.c:192 +#: init.c:182 init.c:183 init.c:185 init.c:188 init.c:189 init.c:190 init.c:191 +#: init.c:192 init.c:193 init.c:194 #, c-format msgid "Checking sizeof(%s) [%zu] is %d %s" msgstr "检查sizeof(%s)[%zu]是否为%d %s" -#: init.c:184 +#: init.c:186 #, c-format msgid "Checking sizeof(pointer) [%zu] is 4 or 8 %s" msgstr "检查sizeof(pointer) [%zu]是否为4 或者 8 %s" -#: init.c:185 +#: init.c:187 #, c-format msgid "Checking sizeof(SEXP) [%zu] == sizeof(pointer) [%zu] %s" msgstr "检查sizeof(SEXP) [%zu] == sizeof(pointer) [%zu] %s" -#: init.c:195 +#: init.c:197 #, c-format msgid "Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s" msgstr "检查LENGTH(allocVector(INTSXP,2)) [%d]是否为2 %s" -#: init.c:197 +#: init.c:199 #, c-format msgid "Checking TRUELENGTH(allocVector(INTSXP,2)) [%lld] is 0 %s" msgstr "检查TRUELENGTH(allocVector(INTSXP,2)) [%lld]是否为0 %s" -#: init.c:204 +#: init.c:206 #, c-format msgid "Checking memset(&i,0,sizeof(int)); i == (int)0 %s" msgstr "检查memset(&i,0,sizeof(int)); i == (int)0 %s" -#: init.c:207 +#: init.c:209 #, c-format msgid "Checking memset(&ui, 0, sizeof(unsigned int)); ui == (unsigned int)0 %s" msgstr "检查memset(&ui, 0, sizeof(unsigned int)); ui == (unsigned int)0 %s" -#: init.c:210 +#: init.c:212 #, c-format msgid "Checking memset(&d, 0, sizeof(double)); d == (double)0.0 %s" msgstr "检查memset(&d, 0, sizeof(double)); d == (double)0.0 %s" -#: init.c:213 +#: init.c:215 #, c-format msgid "Checking memset(&ld, 0, sizeof(long double)); ld == (long double)0.0 %s" msgstr "检查memset(&ld, 0, sizeof(long double)); ld == (long double)0.0 
%s" -#: init.c:216 +#: init.c:218 msgid "The ascii character '/' is not just before '0'" msgstr "ASCII 字符 '/' 后一个字符并非字符 '0'" -#: init.c:217 +#: init.c:219 msgid "The C expression (uint_fast8_t)('/'-'0')<10 is true. Should be false." msgstr "C表达式 (uint_fast8_t)('/'-'0') <10 为 true. 应该是 false." -#: init.c:218 +#: init.c:220 msgid "The ascii character ':' is not just after '9'" msgstr "ascii字符':'不是在'9'后" -#: init.c:219 +#: init.c:221 msgid "The C expression (uint_fast8_t)('9'-':')<10 is true. Should be false." msgstr "C表达式(uint_fast8_t)('9'-':') < 10 为 true. 应该是 false." -#: init.c:224 +#: init.c:226 #, c-format msgid "Conversion of NA_INT64 via double failed %!=%" msgstr "double类型转化为NA_INT64失败,%!=%" -#: init.c:228 +#: init.c:230 msgid "NA_INT64_D (negative -0.0) is not == 0.0." msgstr "NA_INT64_D (negative -0.0) 不是 == 0.0." -#: init.c:229 +#: init.c:231 msgid "NA_INT64_D (negative -0.0) is not ==-0.0." msgstr "NA_INT64_D (negative -0.0) 不是 ==-0.0." -#: init.c:230 +#: init.c:232 msgid "ISNAN(NA_INT64_D) is TRUE but should not be" msgstr "ISNAN(NA_INT64_D) 不应该是TRUE" -#: init.c:231 +#: init.c:233 msgid "isnan(NA_INT64_D) is TRUE but should not be" msgstr "isnan(NA_INT64_D) 不应该是 TRUE" -#: init.c:264 +#: init.c:266 #, c-format msgid "PRINTNAME(install(\"integer64\")) has returned %s not %s" msgstr "PRINTNAME(install(\"integer64\")) 返回了 %s , 而不是 %s" -#: init.c:318 +#: init.c:320 msgid "verbose option must be length 1 non-NA logical or integer" msgstr "verbose 选项必须为一个长度为 1 的非 NA 逻辑值或整数" -#: init.c:349 +#: init.c:354 msgid ".Last.value in namespace is not a length 1 integer" msgstr "命名空间中,.Last.value 不是一个长度为 1 的整型" -#: nafill.c:103 -msgid "nan_is_na must be TRUE or FALSE" -msgstr "nan_is_na 必须是 TRUE 或者 FALSE" - #: nafill.c:110 msgid "" "'x' argument is atomic vector, in-place update is supported only for list/" @@ -4079,12 +4145,9 @@ msgid "" "name' and 'I' can be used to work out proper substitution, see ?substitute2 " "examples." 
msgstr "" -"尝试用 '%s' 元素替换 '%s' 类型的对象,但当替换调用参数的名称时,它必须是" -"'symbol' 类型,函数 'as.name' 和 'I' 可以用来实现适当的替换,参见 ?substitute2" - -#: rbindlist.c:8 -msgid "fill= should be TRUE or FALSE" -msgstr "fill= 应该是 TRUE 或 FALSE" +"尝试用 '%s' 元素替换 '%s' 类型的对象,但当替换调用参数的名称时,它必须" +"是'symbol' 类型,函数 'as.name' 和 'I' 可以用来实现适当的替换,参见 ?" +"substitute2" #: rbindlist.c:10 msgid "use.names= should be TRUE, FALSE, or not used (\"check\" by default)" @@ -4360,8 +4423,7 @@ msgstr "内部错误: setcolorder读取到的dt有 %d 列但是有 %d 个名字 msgid "" "shift input must not be matrix or array, consider wrapping it into data." "table() or c()" -msgstr "" -"shift 的输入不能为矩阵或数组,考虑将其放入 data.table() 或 c() 中。" +msgstr "shift 的输入不能为矩阵或数组,考虑将其放入 data.table() 或 c() 中。" #: shift.c:17 #, c-format @@ -4515,10 +4577,14 @@ msgstr "keep.names 应该是空(NULL),或者为放置输入名称的结果 msgid "fill must be a length 1 vector, such as the default NA" msgstr "fill 必须是长度为 1 的向量,例如默认值 NA" -#: transpose.c:28 +#: transpose.c:22 +msgid "list.cols should be logical TRUE/FALSE." +msgstr "list.cols 应该是逻辑类型 TRUE 或 FALSE。" + +#: transpose.c:31 #, c-format -msgid "Item %d of list input is not an atomic vector" -msgstr "列表输入的第 %d 项不是原子(atomic)向量" +msgid "Item %d of list input is not either an atomic vector, or a list" +msgstr "列表输入的第 %d 项既不是原子(atomic)向量也不是列表" #: types.c:55 msgid "internal error: status, nx, nk must be integer" @@ -4586,78 +4652,78 @@ msgstr "x不是一个逻辑向量" msgid "Unsupported type '%s' passed to allNA()" msgstr "allNA() 不支持'%s'类型" -#: utils.c:104 +#: utils.c:105 msgid "'x' argument must be data.table compatible" msgstr "'x' 必须为data.table支持的类型" -#: utils.c:122 +#: utils.c:129 msgid "" "argument specifying columns is type 'double' and one or more items in it are " "not whole integers" msgstr "指定列的参数是一个双精度类型而其中至少有一个元素不是整数" -#: utils.c:128 +#: utils.c:135 #, c-format msgid "" "argument specifying columns received non-existing column(s): cols[%d]=%d" msgstr "指定列的参数受到了不存在的列: cols[%d]=%d" -#: utils.c:133 +#: utils.c:142 msgid "'x' argument data.table has no names" msgstr 
"data.table的参数x并没有名字" -#: utils.c:138 +#: utils.c:148 #, c-format msgid "" "argument specifying columns received non-existing column(s): cols[%d]='%s'" msgstr "指定列的参数受到了不存在的列: cols[%d]='%s'" -#: utils.c:141 +#: utils.c:152 msgid "argument specifying columns must be character or numeric" msgstr "指定列的参数必须是字符或者是数值" -#: utils.c:144 +#: utils.c:155 msgid "argument specifying columns received duplicate column(s)" msgstr "指定列的参数受到了重复的列" -#: utils.c:229 +#: utils.c:240 #, c-format msgid "Internal error: type '%s' not supported in %s" msgstr "内部错误:类型 '%s' 不支持应用 %s" -#: utils.c:234 +#: utils.c:245 #, c-format msgid "Internal error: copyAsPlain returning ALTREP for type '%s'" msgstr "内部错误:copyAsPlain 返回了类型为 '%s' 的 ALTREP" -#: utils.c:278 +#: utils.c:289 #, c-format msgid "Found and copied %d column%s with a shared memory address\n" msgstr "发现并拷贝了具有相同的内存地址的%d列%s\n" -#: utils.c:358 +#: utils.c:369 msgid "'x' is not atomic" msgstr "x 不是一个原子向量" -#: utils.c:360 +#: utils.c:371 msgid "'x' must not be matrix or array" msgstr "'x' 不能是矩阵或者数组" -#: utils.c:362 +#: utils.c:373 msgid "input must not be matrix or array" msgstr "输入不能是矩阵或者数组" -#: utils.c:366 +#: utils.c:377 #, c-format msgid "copy=false and input already of expected type and class %s[%s]\n" msgstr "copy=false 并且输入已经是预期的类型和类 %s[%s]\n" -#: utils.c:373 +#: utils.c:384 #, c-format msgid "Coercing %s[%s] into %s[%s]\n" msgstr "强制转换 %s[%s] 为 %s[%s]\n" -#: utils.c:389 +#: utils.c:400 #, c-format msgid "zlib header files were not found when data.table was compiled" msgstr "zlib头文件在 data.table 编译时没有找到" From 822631e985cfcbd29ddae9e63bb75b30c42c5816 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 26 Jul 2024 18:42:32 -0700 Subject: [PATCH 24/41] Fine-tune news for API docs (#6314) --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index bb19620cd..e13ec71fa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -122,7 +122,7 @@ 20. 
Removed a warning about the now totally-obsolete option `datatable.CJ.names`, as discussed in previous releases. -21. Refactored some non-API calls to R macros for S4 objects (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users & R core for pushing to have a clearer definition of "API" for R, and thanks @MichaelChirico for implementing here. +21. Refactored some non-API calls in the package C code, (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users, R core, and especially Luke Tierney for pushing to have a clearer definition of "API" for R and for offering clear documentation and suggested workarounds. Thanks @MichaelChirico and @TysonStanley for implementing changes for this release; more will follow. 22. C code was unified more in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. From e06624879d8ab33036587b35fef947ff460db6bd Mon Sep 17 00:00:00 2001 From: Tyson Barrett Date: Sun, 28 Jul 2024 16:11:39 -0400 Subject: [PATCH 25/41] add !is.na to subset function example (#6317) --- vignettes/datatable-programming.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/datatable-programming.Rmd b/vignettes/datatable-programming.Rmd index 0b01cb5c0..eedb5e992 100644 --- a/vignettes/datatable-programming.Rmd +++ b/vignettes/datatable-programming.Rmd @@ -62,16 +62,16 @@ There are multiple ways to work around this problem. 
#### Avoid *lazy evaluation* -The easiest workaround is to avoid *lazy evaluation* in the first place, and fall back to less intuitive, more error-prone approaches like `df[["variable"]]`, etc. +The easiest workaround is to avoid *lazy evaluation* in the first place, and fall back to less intuitive, more error-prone approaches like `df[["variable"]]`, etc. ```{r subset_nolazy} my_subset = function(data, col, val) { - data[data[[col]] == val, ] + data[data[[col]] == val & !is.na(data[[col]]), ] } my_subset(iris, col = "Species", val = "setosa") ``` -Here, we compute a logical vector of length `nrow(iris)`, then this vector is supplied to the `i` argument of `[.data.frame` to perform ordinary "logical vector"-based subsetting. It works well for this simple example, but it lacks flexibility, introduces variable repetition, and requires user to change the function interface to pass the column name as a character rather than unquoted symbol. The more complex the expression we need to parameterize, the less practical this approach becomes. +Here, we compute a logical vector of length `nrow(iris)`, then this vector is supplied to the `i` argument of `[.data.frame` to perform ordinary "logical vector"-based subsetting. To align with `subset()`, which also drops NAs, we need to include an additional use of `data[[col]]` to catch that. It works well enough for this simple example, but it lacks flexibility, introduces variable repetition, and requires user to change the function interface to pass the column name as a character rather than unquoted symbol. The more complex the expression we need to parameterize, the less practical this approach becomes. 
#### Use of `parse` / `eval` From 1a84514f6d20ff1f9cc614ea9b92ccdee5541506 Mon Sep 17 00:00:00 2001 From: Jan Gorecki Date: Sun, 28 Jul 2024 23:32:07 +0200 Subject: [PATCH 26/41] Teach forder to re-use existing key and index attributes instead of sorting from scratch (#4386) * lazy forder * fix tests * fix tests * respect use.index option * bmerge timings * codecov * helper function * reduce diff to master * rename fix * setindex writes groups (retGrp=TRUE) forder C set index directly smart opt for index retGrp=T/F no tests updated yet * calc order, not groups * expect to reach optimization * skip opt for list * override retGrp=F by retGrp=T is legit use * more backward compatiblility, no retGrp from getindex * more verbose messages during opts in forderLazy * recycle order 1/-1 argument in one place * precise verbose messages * recycle arg once _by_ arg is known * copy paste typo fix * forder writing index disabled * fix tests * code coverage * minor update for safer use of internal option * fix bad name in unit test * retGrp=F requires downgrade idx and it seems to be costly * NA stats from forder * keyOpt fix, and existing tests * fixes for na.last in key and setting idx * filling tests for na.last=T and possible fixes * more stats, any non ascii utf8 * better naming of new stats attributes * add extra escape to escape IS_ASCII checks * update test number after merge * apply minor review feedback * More minor review feedback * use options= to set options * more feedback * Rename forderLazy->forderMaybePresorted * UNPROTECT() more aggressively * maybe_reset_index() helper * Strict prototyping (-Wstrict-prototypes) * fix sloppy refactor for maybe_reset_index() * Fix implicit reliance on datatable.optimize * Fix elsewhere, and encapsulate the logic inside a test() * style * spurious whitespace change * NEWS entry for lazy forder * tidy up NEWS * PROTECT() on key attribute * rename arg/option 'lazy' -> 'reuseSorting' * MaybeSorted->ReuseSorting * other lazy= usage 
--------- Co-authored-by: Matt Dowle Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 40 ++++++ R/bmerge.R | 5 +- R/data.table.R | 12 +- R/setkey.R | 61 +++----- inst/tests/tests.Rraw | 185 +++++++++++++++++++++--- src/bmerge.c | 28 ++-- src/data.table.h | 9 +- src/forder.c | 317 ++++++++++++++++++++++++++++++++++++++++-- src/init.c | 9 ++ 9 files changed, 576 insertions(+), 90 deletions(-) diff --git a/NEWS.md b/NEWS.md index e13ec71fa..10ec47757 100644 --- a/NEWS.md +++ b/NEWS.md @@ -126,6 +126,46 @@ 22. C code was unified more in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. +22. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. + +Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. + +Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown. 
+ +```r +d2 = data.table(id=2:1, v2=1:2) +setindexv(d2, "id") +str(attr(attr(d2, "index"), "__id")) +# int [1:2] 2 1 +# - attr(*, "starts")= int [1:2] 1 2 +# - attr(*, "maxgrpn")= int 1 +# - attr(*, "anyna")= int 0 +# - attr(*, "anyinfnan")= int 0 +# - attr(*, "anynotascii")= int 0 +# - attr(*, "anynotutf8")= int 0 + +d2 = data.table(id=2:1, v2=1:2) +invisible(d2[id==1L]) +str(attr(attr(d2, "index"), "__id")) +# int [1:2] 2 1 +``` + +This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code. + +```r +d1 = data.table(id=1:2, v1=1:2) +d2 = data.table(id=2:1, v2=1:2) +setindexv(d2, "id") +d1[d2, on="id", verbose=TRUE] +#... +#Starting bmerge ... +#forderReuseSorting: using existing index: __id +#forderReuseSorting: opt=2, took 0.000s +#... +``` + +This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix. diff --git a/R/bmerge.R b/R/bmerge.R index 881b8528e..16d58fa53 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -118,9 +118,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } - ## after all modifications of i, check if i has a proper key on all icols - io = identical(icols, head(chmatch(key(i), names(i)), length(icols))) - ## after all modifications of x, check if x has a proper key on all xcols. ## If not, calculate the order. Also for non-equi joins, the order must be calculated. 
non_equi = which.first(ops != 1L) # 1 is "==" operator @@ -180,7 +177,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()} - ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) + ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()} # TO DO: xo could be moved inside Cbmerge diff --git a/R/data.table.R b/R/data.table.R index 99d06fad7..cb32836b0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3241,13 +3241,13 @@ is_constantish = function(q, check_singleton=FALSE) { if (is.null(idx)){ ## if nothing else helped, auto create a new index that can be used if (!getOption("datatable.auto.index")) return(NULL) - if (verbose) {catf("Creating new index '%s'\n", paste(names(i), collapse = "__"));flush.console()} - if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(names(i), collapse = "__"));flush.console()} - setindexv(x, names(i)) - if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} - if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(names(i), collapse = "__"));flush.console()} - idx = attr(attr(x, "index", exact=TRUE), paste("__", names(i), collapse = ""), exact=TRUE) idxCols = names(i) + if (verbose) {catf("Creating new index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} + if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(idxCols, collapse = "__"));flush.console()} + idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, reuseSorting=TRUE) + maybe_reset_index(x, idx, idxCols) ## forder can write index, but disabled for now, see #4386 + if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} + if 
(verbose) {catf("Optimized subsetting with index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} } if(!is.null(idxCols)){ setkeyv(i, idxCols) diff --git a/R/setkey.R b/R/setkey.R index b50cb8da9..f43ed39aa 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -50,23 +50,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU miss = !(cols %chin% colnames(x)) if (any(miss)) stopf("some columns are not in the data.table: %s", brackify(cols[miss])) - ## determine, whether key is already present: - if (identical(key(x),cols)) { - if (!physical) { - ## create index as integer() because already sorted by those columns - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) - } - return(invisible(x)) - } else if(identical(head(key(x), length(cols)), cols)){ - if (!physical) { - ## create index as integer() because already sorted by those columns - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) - } else { - ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. - setattr(x,"sorted",cols) - } + if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387 + ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. + setattr(x,"sorted",cols) return(invisible(x)) } @@ -77,26 +63,20 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } if (!is.character(cols) || length(cols)<1L) stopf("Internal error. 
'cols' should be character at this point in setkey; please report.") # nocov - newkey = paste(cols, collapse="__") - if (!any(indices(x) == newkey)) { - if (verbose) { - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R - # suppress needed for tests 644 and 645 in verbose mode - catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"]) - } else { - o = forderv(x, cols, sort=TRUE, retGrp=FALSE) - } + if (verbose) { + # we now also retGrp=TRUE #4387 for !physical + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + # suppress needed for tests 644 and 645 in verbose mode + catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"]) } else { - if (verbose) catf("setkey on columns %s using existing index '%s'\n", brackify(cols), newkey) - o = getindex(x, newkey) + o = forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE) } - if (!physical) { - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o) + if (!physical) { # index COULD BE saved from C forderReuseSorting already, but disabled for now + maybe_reset_index(x, o, cols) return(invisible(x)) } - setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. if (length(o)) { + setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. Only when order changes. 
if (verbose) { last.started.at = proc.time() } .Call(Creorder,x,o) if (verbose) { catf("reorder took %s\n", timetaken(last.started.at)); flush.console() } @@ -124,7 +104,7 @@ getindex = function(x, name) { if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) { stopf("Internal error: index '%s' exists but is invalid", name) # nocov } - ans + c(ans) ## drop starts and maxgrpn attributes } haskey = function(x) !is.null(key(x)) @@ -160,19 +140,24 @@ is.sorted = function(x, by=NULL) { # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1) } +maybe_reset_index = function(x, idx, cols) { + if (isTRUE(getOption("datatable.forder.auto.index"))) return(invisible()) + if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) + setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), idx) + invisible(x) +} + ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') -forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) -{ +forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, reuseSorting=getOption("datatable.reuse.sorting", NA)) { if (is.atomic(x) || is.null(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL } else { if (!length(x)) return(integer(0L)) # e.g. 
forderv(data.table(NULL)) and forderv(list()) return integer(0L)) by = colnamesInt(x, by, check_dups=FALSE) - if (length(order) == 1L) order = rep(order, length(by)) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(Cforder, x, by, retGrp, sort, order, na.last) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) @@ -209,7 +194,7 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) data = eval(sub, parent.frame(), parent.frame()) } stopifnot(isTRUEorFALSE(decreasing)) - o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order= if (decreasing) -asc else asc, na.last) + o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=if (decreasing) -asc else asc, na.last=na.last) if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o o } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c7f0833cf..b90743e28 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6377,7 +6377,7 @@ test(1419.61, DT$a, c(1,1,2,3,4,5,6,7,8,9)) setkey(DT, NULL) setindex(DT, a) test(1419.62, setkey(DT, a, verbose=TRUE), data.table(a=c(1,1:9), aaa=c(1,1,2,2,1,2,2,2,1,2), key="a"), - output="setkey on columns [a] using existing index 'a'") # checks also that the prior index a is dropped (because y is keyed with no index) + output="using existing index: __a") # setkey picks correct index of multiple indexes (e.g. exact=TRUE is used in internals) DT = data.table(a = c(3,3,4,4,5,6,1,1,7,2), @@ -6392,7 +6392,7 @@ setindex(DT, aaa, a) setindex(DT, aaa) # this aaa not previous aaa_a should be used by setkey(DT,aaa); i.e. 
ensure no partial matching test(1419.65, allIndicesValid(DT), TRUE) test(1419.66, setkey(DT, aaa, verbose=TRUE), data.table(a=c(1,3,3,6,1,2,4,4,5,7), aaa=c(1,1,1,1,2,2,2,2,2,2), bbb=c(1,1,1,1,0,1,2,0,1,1), key="aaa"), - output="setkey on columns [aaa] using existing index 'aaa'") # checks that all indexes are dropped (aaa_a too) + output="using existing index: __aaa") # checks that all indexes are dropped (aaa_a too) # setnames updates secondary key DT = data.table(a=1:5,b=10:6) @@ -11757,7 +11757,7 @@ test(1775.3, capture.output(print(DT2, print.keys = TRUE)), setkey(DT2, a) setindexv(DT2, c("b","a")) test(1775.4, capture.output(print(DT2, print.keys = TRUE)), - c("Key: ", "Index: ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) + c("Key: ", "Indices: , ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) ## index 'b' is still good, so we keep it # dev regression #2285 cat("A B C\n1 2 3\n4 5 6", file=f<-tempfile()) @@ -12756,7 +12756,7 @@ test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) DT = data.table(a = c(3, 2, 1, 2, 3), b = c(1, 2, 1, 1, 2)) setindexv(DT, list('a', c('a', 'b'))) test(1897.1, indices(DT), c("a", "a__b")) -test(1897.2, attributes(attr(DT, 'index')), +test(1897.2, lapply(attributes(attr(DT, 'index')), c), ## lapply(, c) to ensure no starts, maxgrpn attributes list(`__a` = c(3L, 2L, 4L, 1L, 5L), `__a__b` = c(3L, 4L, 2L, 1L, 5L))) @@ -12796,9 +12796,9 @@ test(1899.18, as.matrix(DT, rownames=TRUE, rownames.value=1:nrow(DT)), error="ro # index argument for fread, #2633 DT_str = c('a,b\n3,1\n2,2\n1,1\n2,1\n3,2') -test(1900.1, attributes(attr(fread(DT_str, index = 'a'), 'index')), +test(1900.1, lapply(attributes(attr(fread(DT_str, index = 'a'), 'index')), c), # lapply(, c) to ensure no starts, maxgrpn attributes list(`__a` = c(3L, 2L, 4L, 1L, 5L))) -test(1900.2, attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')), +test(1900.2, lapply(attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')), c), list(`__a__b` = c(3L, 
4L, 2L, 1L, 5L), `__b__a` = c(3L, 4L, 1L, 2L, 5L), `__a` = c(3L, 2L, 4L, 1L, 5L))) @@ -12809,7 +12809,7 @@ test(1900.4, fread(DT_str, index = list('a', 1L)), # col.names applied before index test(1900.5, fread(DT_str, col.names = c('c', 'd'), index = 'a'), error = 'some columns are not in the data.table') -test(1900.6, attributes(attr(fread(DT_str, index = c('a', 'b')), 'index')), +test(1900.6, lapply(attributes(attr(fread(DT_str, index = c('a', 'b')), 'index')), c), list(`__a__b` = c(3L, 4L, 2L, 1L, 5L))) # . within bquote shouldn't be swapped to list, #1912 @@ -13307,7 +13307,7 @@ test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), # appearance order of two low-cardinality columns that were squashed in pr#3124 DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) -test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L)) +test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L)) # skip values that are not present in old, #3030 DT <- data.table(a=1, b=2, d=3) @@ -13546,11 +13546,11 @@ test(1962.042, forderv(DT, na.last = c(TRUE, FALSE)), error='na.last must be lo test(1962.043, forderv(DT$a, by = 'a'), error='x is a single vector, non-NULL') test(1962.044, forderv(DT$a, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1') test(1962.045, forderv(DT$a, order = c(1L, -1L)), error='Input is an atomic vector (not a list of columns) but order= is not a length 1 integer') -test(1962.0461, forderv(DT, order = c(1L, -1L)), error="Either order= is not integer or its length (2) is different to by='s length (1)") +test(1962.0461, forderv(DT, order = c(1L, -1L)), error="length (2) is different to by='s length (1)") test(1962.0462, forderv(DT, order = 2L), error='Item 1 of order (ascending/descending) is 2. 
Must be +1 or -1') test(1962.0471, forderv(mean), error="'x' argument must be data.table compatible") test(1962.0472, forderv(DT, by=mean), error="argument specifying columns must be character or numeric") -test(1962.0473, forderv(NULL), error="DT is an empty list() of 0 columns") +test(1962.0473, forderv(NULL), error="DT is NULL") setDF(DT) test(1962.0481, forder(DT), 3:1) @@ -14292,12 +14292,12 @@ test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not") # forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8. -test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L)) -test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L)) -test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L)) -test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L)) -test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L)) -test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L)) +test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, 
anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3) # 0 length items should not result in no-recycle error, #3386 @@ -18814,3 +18814,156 @@ y = data.table(a = 2:3, key="a") test(2274.31, merge(x,y, all.y=TRUE), data.table(a=structure(2:3, class=c("a", "integer")), key="a")) test(2274.32, rbind(x,y), error="Class attribute .* does not match with .*") test(2274.33, rbind(x,y, ignore.attr=TRUE), data.table(a=structure(c(1L, 2L, 2L, 3L), class=c("a", "integer")))) + +# lazy forder, #4386 +dd = data.table(a=1:2, b=2:1) +d = copy(dd) +test(2275.01, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2275.02, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +setkeyv(d, "b") +test(2275.03, options=c(datatable.verbose=TRUE), forderv(d, "b"), integer(), output="forder.*opt=1.*took") +test(2275.04, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, "b") +test(2275.05, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=2.*took") +test(2275.06, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +test(2275.11, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2275.12, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.13, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2275.14, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +setkeyv(d, 
c("a","b")) +test(2275.21, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") +test(2275.22, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.23, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2275.24, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +setkeyv(d, c("b","a")) +test(2275.25, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") +test(2275.26, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.27, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") +test(2275.28, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, c("a","b")) +test(2275.31, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2275.32, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.33, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2275.34, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, c("b","a")) +test(2275.35, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2275.36, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.37, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2275.38, options=c(datatable.verbose=TRUE), 
forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, list(c("a","b"), c("b","a"))) +test(2275.41, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2275.42, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.43, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2275.44, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setkeyv(d, c("a","b")) +setindexv(d, list(c("a","b"), c("b","a"))) +test(2275.51, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached +test(2275.52, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.53, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2275.54, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = copy(dd) +setindexv(d, list(c("a","b"), c("b","a"))) +test(2275.55, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") +test(2275.56, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +d = copy(dd) +setkeyv(d, c("a","b")) +setindexv(d, list(c("a","b"), c("b","a"))) +ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) +ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) +test(2275.60, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") # c(): 
strip attributes +test(2275.61, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") +test(2275.62, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") +test(2275.63, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") +test(2275.64, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE, reuseSorting=FALSE), ba, output="forder.*opt=0.*took") +test(2275.65, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute +test(2275.66, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.67, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute +test(2275.68, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.69, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2275.70, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") +test(2275.71, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2275.72, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") +test(2275.73, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") +test(2275.74, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.75, 
options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") +test(2275.76, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.77, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") +test(2275.78, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.79, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") +test(2275.80, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.81, options=c(datatable.verbose=TRUE), forderv(1:2), integer(), output="forder.*opt=0.*took") +test(2275.82, options=c(datatable.verbose=TRUE), forderv(1:2, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.83, options=c(datatable.verbose=TRUE), forderv(2:1), 2:1, output="forder.*opt=0.*took") +test(2275.84, options=c(datatable.verbose=TRUE), forderv(2:1, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") +test(2275.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") +ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg +d = copy(ddd) +test(2275.8530, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") +test(2275.8531, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") +test(2275.854, options=c(datatable.verbose=TRUE), o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") +setattr(d, "index", 
setattr(integer(), "__v1", o)) +test(2275.855, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") +test(2275.856, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2275.857, options=c(datatable.verbose=TRUE), forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") +d = copy(ddd) +test(2275.8580, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") # _not_ setindex(d, v1), which will compute retGrp/retStats +test(2275.8581, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") +setindexv(d, "v2") +test(2275.859, options=c(datatable.verbose=TRUE), forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") +d = copy(ddd) +setkeyv(d, "v1") +setindexv(d, list("v2","v3","v4","v5",c("v1","v2"),c("v1","v3"),c("v2","v3"),c("v1","v4"),c("v1","v5"),c("v1","v4","v5"))) +test(2275.861, options=c(datatable.verbose=TRUE), forderv(d, "v1"), integer(), output="forder.*opt=1.*took") +test(2275.862, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last +setindexv(d, "v1") +test(2275.863, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2275.864, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.865, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2275.866, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), 
output="index found but na.last=TRUE and NAs present") +test(2275.867, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2275.868, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.869, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2275.870, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.871, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2275.872, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2275.873, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +d = fread(testDir("1680-fread-header-encoding.csv"), encoding="Latin-1") ## re-use some existing non utf8 data +anyEnc = function(x) unlist(attributes(forderv(x, retStats=TRUE))[c("anynotascii","anynotutf8")]) +test(2275.881, anyEnc(d), c(anynotascii=1L,anynotutf8=1L)) +test(2275.882, anyEnc(d[,-2L]), c(anynotascii=0L,anynotutf8=0L)) +test(2275.883, anyEnc(c("a","b","\u221A")), c(anynotascii=1L,anynotutf8=0L)) +d = copy(dd) +setindexv(d, "b") +test(2275.91, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), + forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2275.92, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), + forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") +d = data.table(x = 2:1) +test(2275.93, options=c(datatable.optimize=Inf), {d[x == 1L]; attr(attr(d, "index"), "__x")}, 2:1) 
+test(2275.94, options=c(datatable.verbose=TRUE), forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") +d = data.table(x = 2:1) +test(2275.95, options=list(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE, datatable.optimize=Inf), + d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") +test(2275.96, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") +setindexv(d, NULL) +test(2275.971, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL) +test(2275.972, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL) +test(2275.973, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2275.974, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2275.975, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2275.99, forderv(data.table(a=1), reuseSorting=c(TRUE, TRUE)), error="reuseSorting must be") diff --git a/src/bmerge.c b/src/bmerge.c index 351baff28..108d82861 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -39,7 +39,11 @@ static Rboolean rollToNearest=FALSE; void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, 
int col, int thisgrp, int lowmax, int uppmax); -SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { +SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { + const bool verbose = GetVerbose(); + double tic=0.0, tic0=0.0; + if (verbose) + tic = omp_get_wtime(); int xN, iN, protecti=0; ctr=0; // needed for non-equi join case SEXP retFirstArg, retLengthArg, retIndexArg, allLen1Arg, allGrp1Arg; @@ -158,17 +162,11 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP allGrp1[0] = TRUE; protecti += 2; - // isorted arg - o = NULL; - if (!LOGICAL(isorted)[0]) { - SEXP order = PROTECT(allocVector(INTSXP, length(icolsArg))); - protecti++; - for (int j=0; j 1 && mult == ALL) { @@ -213,6 +215,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP Free(retLength); Free(retIndex); } + if (verbose) + Rprintf("bmerge: took %.3fs\n", omp_get_wtime()-tic); UNPROTECT(protecti); return (ans); } diff --git a/src/data.table.h b/src/data.table.h index 812ad5f97..a848ef034 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -102,6 +102,10 @@ extern SEXP sym_index; extern SEXP sym_BY; extern SEXP sym_starts, char_starts; extern SEXP sym_maxgrpn; +extern SEXP sym_anyna; +extern SEXP sym_anyinfnan; +extern SEXP sym_anynotascii; +extern SEXP sym_anynotutf8; extern SEXP sym_colClassesAs; extern SEXP sym_verbose; extern SEXP SelfRefSymbol; @@ -138,7 +142,8 @@ int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(double x); -SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); +SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); +SEXP 
forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg); // reuseSorting wrapper to forder int getNumericRounding_C(void); // reorder.c @@ -187,7 +192,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP on, SEXP verbose); // bmerge.c -SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, +SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg); diff --git a/src/forder.c b/src/forder.c index 7226f7e45..564f55ec3 100644 --- a/src/forder.c +++ b/src/forder.c @@ -32,6 +32,7 @@ static int nth = 1; // number of threads to use, throttled by default; used by cleanup() to ensure no mismatch in getDTthreads() calls static bool retgrp = true; // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn : +static bool retstats = true; // return extra flags for any NA, NaN, -Inf, +Inf, non-ASCII, non-UTF8 static int nrow = 0; // used as group size stack allocation limit (when all groups are 1 row) static int *gs = NULL; // gs = final groupsizes e.g. 23,12,87,2,1,34,... 
static int gs_alloc = 0; // allocated size of gs @@ -277,11 +278,11 @@ static void cradix(SEXP *x, int n) free(cradix_xtmp); cradix_xtmp=NULL; } -static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count) +static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8) // group numbers are left in truelength to be fetched by WRITE_KEY { int na_count=0; - bool anyneedutf8=false; + bool anynotascii=false, anynotutf8=false; if (ustr_n!=0) STOP(_("Internal error: ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d"), ustr_n, ustr_alloc); // # nocov if (ustr_maxlen!=0) STOP(_("Internal error: ustr_maxlen isn't 0 when starting range_str")); // # nocov // savetl_init() has already been called at the start of forder @@ -308,16 +309,24 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max ustr[ustr_n++] = s; SET_TRUELENGTH(s, -ustr_n); // unique in any order is fine. 
first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); - if (!anyneedutf8 && NEED2UTF8(s)) anyneedutf8=true; + if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already + !IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one + if (!anynotascii) + anynotascii=true; + if (!IS_UTF8(s)) + anynotutf8=true; + } } } *out_na_count = na_count; + *out_anynotascii = anynotascii; + *out_anynotutf8 = anynotutf8; if (ustr_n==0) { // all na *out_min = 0; *out_max = 0; return; } - if (anyneedutf8) { + if (anynotutf8) { SEXP ustr2 = PROTECT(allocVector(STRSXP, ustr_n)); for (int i=0; i0) + any_na = 1; // may be written multiple times, for each column that has NA, but thats fine + if (infnan_count>0) + any_infnan = 1; + if (anynotascii) + any_notascii = 1; + if (anynotutf8) + any_notutf8 = 1; if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) { // all same value; skip column as nothing to do; [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA) if (na_count==nrow && nalast==-1) { for (int i=0; i0 || attr(idx, "anyinfnan")>0 +bool idxAnyNF(SEXP idx) { + return INTEGER(getAttrib(idx, sym_anyna))[0]>0 || INTEGER(getAttrib(idx, sym_anyinfnan))[0]>0; +} + +// forder, re-use existing key or index if possible, otherwise call forder +SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg) { + const bool verbose = GetVerbose(); + int protecti = 0; + double tic=0.0; + if (verbose) + tic = omp_get_wtime(); + if (isNull(DT)) + error("DT is NULL"); + if (!IS_TRUE_OR_FALSE(retGrpArg)) + error("retGrp must be TRUE or FALSE"); + bool retGrp = (bool)LOGICAL(retGrpArg)[0]; + if (!IS_TRUE_OR_FALSE(retStatsArg)) + error("retStats must be TRUE or FALSE"); + bool retStats = (bool)LOGICAL(retStatsArg)[0]; + 
if (!retStats && retGrp) + error("retStats must be TRUE whenever retGrp is TRUE"); // retStats doesnt cost anything and it will be much easier to optimize use of index + if (!IS_TRUE_OR_FALSE(sortGroupsArg)) + error("sort must be TRUE or FALSE"); + bool sortGroups = (bool)LOGICAL(sortGroupsArg)[0]; + if (!isLogical(naArg) || LENGTH(naArg) != 1) + error("na.last must be logical TRUE, FALSE or NA of length 1"); + bool na = (bool)LOGICAL(naArg)[0]; + if (!isInteger(ascArg)) + error("order must be integer"); // # nocov # coerced to int in R + if (!isLogical(reuseSortingArg) || LENGTH(reuseSortingArg) != 1) + error("reuseSorting must be logical TRUE, FALSE or NA of length 1"); + int reuseSorting = LOGICAL(reuseSortingArg)[0]; + if (!length(DT)) + return allocVector(INTSXP, 0); + int opt = -1; // -1=unknown, 0=none, 1=keyOpt, 2=idxOpt + if (reuseSorting==NA_LOGICAL) { + if (INHERITS(DT, char_datatable) && // unnamed list should not be optimized + sortGroups && + all1(ascArg)) { // could ascArg=-1 be handled by a rev()? 
+ opt = -1; + } else { + if (verbose) + Rprintf("forderReuseSorting: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); + opt = 0; + } + } else if (reuseSorting) { + if (!INHERITS(DT,char_datatable)) + error("internal error: reuseSorting set to TRUE but DT is not a data.table"); // # nocov + if (!sortGroups) + error("internal error: reuseSorting set to TRUE but sort is FALSE"); // # nocov + if (!all1(ascArg)) + error("internal error: reuseSorting set to TRUE but order is not all 1"); // # nocov + opt = -1; + } else if (!reuseSorting) { + opt = 0; + } + SEXP ans = R_NilValue; + if (opt == -1 && !na && !retGrp && colsKeyHead(DT, by)) { + opt = 1; // keyOpt + ans = PROTECT(allocVector(INTSXP, 0)); protecti++; + if (verbose) + Rprintf("forderReuseSorting: using key: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } + if (opt == -1 && GetUseIndex()) { + SEXP idx = getIndex(DT, by); + if (!isNull(idx)) { + bool hasStats = !isNull(getAttrib(idx, sym_anyna)); + if (!na || // na.last=FALSE + (hasStats && !idxAnyNF(idx))) { // na.last=TRUE && !anyNA + bool hasGrp = !isNull(getAttrib(idx, sym_starts)); + if (hasGrp && !hasStats) + error("internal error: index has 'starts' attribute but not 'anyna', please report to issue tracker"); // # nocov + if (hasGrp==retGrp && hasStats==retStats) { + opt = 2; // idxOpt + } else if ( + (hasGrp && !retGrp && !(!hasStats && retStats)) || // !hasStats should never happen when hasGrp + (hasStats && !retStats && !(!hasGrp && retGrp)) + ) { + // shallow_duplicate is faster than copyAsPlain, but shallow_duplicate is AFAIK good for VECSXP, not for INTSXP + // it is still the bottleneck in this opt, it is better to call retGrp=TRUE and just not use those extra attributes + // can we do better here? real shallow for INTSXP? If we could just re-point data pointer... 
like we do for DT columns + // SEXP new; INTEGER(new) = INTEGER(idx); setAttrib(new, ..., R_NilValue) + idx = shallow_duplicate(idx); + if (hasGrp && !retGrp) { + setAttrib(idx, sym_starts, R_NilValue); + setAttrib(idx, sym_maxgrpn, R_NilValue); + } + if (hasStats && !retStats) { + setAttrib(idx, sym_anyna, R_NilValue); + setAttrib(idx, sym_anyinfnan, R_NilValue); + setAttrib(idx, sym_anynotascii, R_NilValue); + setAttrib(idx, sym_anynotutf8, R_NilValue); + } + opt = 2; // idxOpt but need to drop groups or stats + } else if (!hasGrp && retGrp && !hasStats && retStats) { + if (verbose) + Rprintf("forderReuseSorting: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (!hasGrp && retGrp) { + if (verbose) + Rprintf("forderReuseSorting: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (!hasStats && retStats) { + if (verbose) + Rprintf("forderReuseSorting: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else { + error("internal error: reuseSorting forder index optimization unhandled branch of retGrp-retStats, please report to issue tracker"); // # nocov + } + } else { + if (!hasStats) { + if (verbose) + Rprintf("forderReuseSorting: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (idxAnyNF(idx)) { + if (verbose) + Rprintf("forderReuseSorting: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else { + error("internal error: reuseSorting forder index optimization unhandled branch of last.na=T, please report to issue tracker"); // # nocov + } + } + if (opt == 2) { + ans = idx; + if (verbose) + Rprintf("forderReuseSorting: using existing index: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } + } + } + if (opt < 1) { + ans = PROTECT(forder(DT, by, retGrpArg, retStatsArg, sortGroupsArg, ascArg, naArg)); protecti++; + if (opt == -1 && // 
opt==0 means that arguments (sort, asc) were not of type index, or reuseSorting=FALSE + (!na || (retStats && !idxAnyNF(ans))) && // lets create index even if na.last=T used but no NAs detected! + GetUseIndex() && + GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only + putIndex(DT, by, ans); + if (verbose) + Rprintf("forderReuseSorting: setting index (retGrp=%d, retStats=%d) on DT: %s\n", retGrp, retStats, CHAR(STRING_ELT(idxName(DT, by), 0))); + } + } + if (verbose) + Rprintf("forderReuseSorting: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); + UNPROTECT(protecti); + return ans; +} diff --git a/src/init.c b/src/init.c index 48046b8d6..5ab53d091 100644 --- a/src/init.c +++ b/src/init.c @@ -29,6 +29,10 @@ SEXP sym_index; SEXP sym_BY; SEXP sym_starts, char_starts; SEXP sym_maxgrpn; +SEXP sym_anyna; +SEXP sym_anyinfnan; +SEXP sym_anynotascii; +SEXP sym_anynotutf8; SEXP sym_colClassesAs; SEXP sym_verbose; SEXP SelfRefSymbol; @@ -73,6 +77,7 @@ R_CallMethodDef callMethods[] = { {"Cfcast", (DL_FUNC) &fcast, -1}, {"Cuniqlist", (DL_FUNC) &uniqlist, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, +{"CforderReuseSorting", (DL_FUNC) &forderReuseSorting, -1}, {"Cforder", (DL_FUNC) &forder, -1}, {"Cissorted", (DL_FUNC) &issorted, -1}, {"Cgforce", (DL_FUNC) &gforce, -1}, @@ -279,6 +284,10 @@ void attribute_visible R_init_data_table(DllInfo *info) sym_index = install("index"); sym_BY = install(".BY"); sym_maxgrpn = install("maxgrpn"); + sym_anyna = install("anyna"); + sym_anyinfnan = install("anyinfnan"); + sym_anynotascii = install("anynotascii"); + sym_anynotutf8 = install("anynotutf8"); sym_colClassesAs = install("colClassesAs"); sym_verbose = install("datatable.verbose"); SelfRefSymbol = install(".internal.selfref"); From 7bc46a90d9500843fa5bdcc8b3ccec321cb8cfb0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 29 Jul 2024 07:44:14 -0700 Subject: [PATCH 27/41] Remove exported 
[.data.table (#6002) * Remove exported [.data.table * disable related tests * delete commented code * cautious NEWS item --- NAMESPACE | 1 - NEWS.md | 4 ++++ inst/tests/tests.Rraw | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 829dc0800..109336c9e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,7 +62,6 @@ export(substitute2) #export(DT) # mtcars |> DT(i,j,by) #4872 #5472 S3method("[", data.table) -export("[.data.table") # so that functional DT() finds it; PR#5176 S3method("[<-", data.table) # S3method("[[", data.table) # S3method("[[<-", data.table) diff --git a/NEWS.md b/NEWS.md index 10ec47757..baa1d256d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,10 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) +## BREAKING CHANGE + +1. `` `[.data.table` `` is un-exported again. This was exported to support an experimental feature (`DT()` functional form of `[`) that never made it to release, but we forgot to claw back this export in the NAMESPACE; sorry about that. We didn't find anyone calling the method directly (which is inadvisable to begin with). + ## NEW FEATURES 1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b90743e28..643bd6134 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17660,7 +17660,7 @@ for (col in c("a","b","c")) { } # DT() functional form, #4872 #5106 #5107 #5129 -if (base::getRversion() >= "4.1.0") { +if (FALSE) { DT = DTfun # we have to EVAL "|>" here too otherwise this tests.Rraw file won't parse in R<4.1.0 droprn = function(df) { rownames(df)=NULL; df } # TODO: could retain rownames where droprn is currently used below From 3c19d4df0ae8c3479b3ae40312a58bc9d4b01dc0 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Tue, 30 Jul 2024 00:15:54 +0800 Subject: [PATCH 28/41] fcase support scalar condition, vectorize default and lazy-eval default (#4264) * fcase support vector default value * need to explicitly add the `L` to enforce the integer type * should cover all the fcase types * double ; * _() wrap messaging * comestic tweaks due to adding `_()` to errors * @2005m's simpler implementation * should be namask since amask has been used later * fcase supports scalar conditions * rm the na arg in fcase() * add two more tests * the last error message should use TYPEOF(ans) - maybe safer * merge tests first * more files... * more files... * INHERITS copied from master * another manual change to fifelse copied from master * more files copied... * more files copied over * bulk copying files continues until morale improves * bulk copy all other tracked files * something like manual merge complete * obvious things to keep from master * duplicated in bad merge * more dup * Whitespace * better(?) 
var naming * Fixed bad UNPROTECT() issue, tests passing * updated error message for whens type * forgot period at end of error * style * unclutter * natural order by type hierarchy * style * only UNPROTECT() when we're sure compute_identical was run * Corresponding test case * breathe * Improve error * Use (long long)-->%lld for formatter, rename masking indices * correct test numbering * match updated error message * correct again * NEWS * remove unused Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * use 'j' for consistency with other branches; initialize type0 * set l=0 consistently instead of in multiple places --------- Co-authored-by: Michael Chirico Co-authored-by: Tyson Barrett Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> --- NEWS.md | 2 + R/wrappers.R | 2 +- inst/tests/tests.Rraw | 25 ++++- src/data.table.h | 2 +- src/fifelse.c | 217 +++++++++++++++++++++--------------------- 5 files changed, 137 insertions(+), 111 deletions(-) diff --git a/NEWS.md b/NEWS.md index baa1d256d..a54583a3d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -48,6 +48,8 @@ `rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. +16. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#5461](https://github.com/Rdatatable/data.table/issues/5461). 
Thanks @sindribaldur for the feature request, which has been highly requested, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/wrappers.R b/R/wrappers.R index a018b91ae..a339a919e 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -6,7 +6,7 @@ fcoalesce = function(...) .Call(Ccoalesce, list(...), FALSE) setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE) fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) -fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L]) +fcase = function(..., default=NA) .Call(CfcaseR, parent.frame(), as.list(substitute(list(..., TRUE, default)))[-1L]) colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(CcolnamesInt, x, cols, check_dups, skip_absent) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 643bd6134..a614a9e2f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16646,9 +16646,9 @@ test(2127.65, fcase(test_vec1, list(1)), list(1,1,1,1,1, NULL, NULL, NULL, NULL, test(2127.66, fcase(test_vec1, as.Date("2019-10-11")), c(rep(as.Date("2019-10-11"),5),rep(NA,6))) test(2127.67, fcase(test_vec1, factor("a", levels=letters[1:3])), factor(c(rep("a",5),rep("NA",6)), levels=letters[1:3])) test(2127.68, fcase(test_vec1, 1L, default = 1:2), error = "Length of 'default' must be 1.") -test(2127.69, fcase(test_vec1, 1L, test_vec_na1, 2L), error = "Argument #3 has a different length than argument #1. 
Please make sure all logical conditions have the same length.") +test(2127.69, fcase(test_vec1, 1L, test_vec_na1, 2L), error = "Argument #3 has length 12 which differs from that of argument #1 (11). Please make sure all logical conditions have the same length or length 1.") test(2127.70, fcase(test_vec1, as.Date("2019-10-11"), test_vec2, 2), error = "Argument #4 has different class than argument #2, Please make sure all output values have the same class.") -test(2127.71, fcase(test_vec1, 1L, test_vec2, 2:3), error = "Length of output value #4 must either be 1 or length of logical condition.") +test(2127.71, fcase(test_vec1, 1L, test_vec2, 2:3), error = "Length of output value #4 (2) must either be 1 or match the length of the logical condition (11).") test(2127.72, fcase(TRUE, 1L, FALSE, stop("bang!")), 1L) test(2127.73, fcase(test_vec1, 1L, test_vec2, 0:10), as.integer(c( 1, 1, 1, 1, 1, NA, 6, 7, 8, 9, 10))) test(2127.74, fcase(test_vec1, 0:10, test_vec2, 0L), as.integer(c( 0, 1, 2, 3, 4, NA, 0, 0, 0, 0, 0))) @@ -18967,3 +18967,24 @@ test(2275.974, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRU test(2275.975, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") test(2275.99, forderv(data.table(a=1), reuseSorting=c(TRUE, TRUE)), error="reuseSorting must be") + +# fcase supports vector default values, #4258 +## for default +test(2276.01, fcase(c(TRUE, FALSE, NA, NA), 1:4, default=11:13), error="Length of 'default' must be 1 or 4.") +test(2276.02, fcase(c(TRUE, FALSE, NA, NA), 1:4, default=11:14), c(1L, 12:14)) +test(2276.03, fcase(c(TRUE, FALSE, NA, NA), 1:4 + 0.1, default=11:14 + 0.1), c(1L, 12:14) + 0.1) +test(2276.04, fcase(c(TRUE, FALSE, NA, NA), (1:4)+1i, default=(11:14)+1i), c(1L, 12:14)+1i) +test(2276.05, fcase(c(TRUE, FALSE, NA, NA), as.character(1:4), default=as.character(11:14)), as.character(c(1L, 12:14))) +test(2276.06, fcase(c(TRUE, 
FALSE, NA, NA), as.list(1:4), default=as.list(11:14)), as.list(c(1L, 12:14))) +## for scalar condition +test(2276.07, fcase(c(TRUE, FALSE, NA, NA), 1:4, TRUE, 11:13), error="Length of output value #4 (3) must either be 1 or match the length of the logical condition (4).") +test(2276.08, fcase(c(TRUE, FALSE, NA, NA), 1:4, TRUE, 11:14), c(1L, 12:14)) +test(2276.09, fcase(c(TRUE, FALSE, NA, NA), 1:4 + 0.1, TRUE, 11:14 + 0.1), c(1L, 12:14) + 0.1) +test(2276.10, fcase(c(TRUE, FALSE, NA, NA), (1:4)+1i, TRUE, (11:14)+1i), c(1L, 12:14)+1i) +test(2276.11, fcase(c(TRUE, FALSE, NA, NA), as.character(1:4), TRUE, as.character(11:14)), as.character(c(1L, 12:14))) +test(2276.12, fcase(c(TRUE, FALSE, NA, NA), as.list(1:4), TRUE, as.list(11:14)), as.list(c(1L, 12:14))) +test(2276.13, fcase(TRUE, 1L, default=stop("lazy eval")), 1L) # default is lazy eval'ed +test(2276.14, fcase(c(TRUE, FALSE), 1L, c(TRUE, TRUE), NA), c(1L, NA_integer_)) # scalar NA will be converted + +# output is missing +test(2276.15, fcase(c(TRUE, FALSE), NA_integer_, c(TRUE, TRUE), 2L), c(NA_integer_, 2L)) diff --git a/src/data.table.h b/src/data.table.h index a848ef034..ed63978d6 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -267,7 +267,7 @@ SEXP testMsgR(SEXP status, SEXP x, SEXP k); //fifelse.c SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na); -SEXP fcaseR(SEXP na, SEXP rho, SEXP args); +SEXP fcaseR(SEXP rho, SEXP args); //snprintf.c int dt_win_snprintf(char *dest, size_t n, const char *fmt, ...); diff --git a/src/fifelse.c b/src/fifelse.c index 72b7f2c01..d3bc0fdb1 100644 --- a/src/fifelse.c +++ b/src/fifelse.c @@ -201,105 +201,106 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) { return ans; } -SEXP fcaseR(SEXP na, SEXP rho, SEXP args) { - const int narg=length(args); +SEXP fcaseR(SEXP rho, SEXP args) { + const int narg=length(args); // `default` will take the last two positions if (narg % 2) { error(_("Received %d inputs; please supply an even number of arguments in ..., " "consisting of logical 
condition, resulting value pairs (in that order). " - "Note that the default argument must be named explicitly, e.g., default=0"), narg); + "Note that the default argument must be named explicitly, e.g., default=0"), narg - 2); } - if (narg==0) return R_NilValue; - - SEXP cons0 = PROTECT(eval(SEXPPTR_RO(args)[0], rho)); - SEXP value0 = PROTECT(eval(SEXPPTR_RO(args)[1], rho)); // value0 will be compared to from loop so leave it protected throughout - SEXPTYPE type0 = TYPEOF(value0); - int64_t len0=xlength(cons0), len2=len0; - if (isS4(value0) && !INHERITS(value0, char_nanotime)) { - error(_("S4 class objects (except nanotime) are not supported. Please see https://github.com/Rdatatable/data.table/issues/4131.")); - // otherwise 'invalid type/length (S4/1) in vector allocation' from test 2132.3 - } - SEXP ans = PROTECT(allocVector(type0, len0)); - SEXP tracker = PROTECT(allocVector(INTSXP, len0)); - int *restrict p = INTEGER(tracker); - copyMostAttrib(value0, ans); - - bool nonna=!isNull(na); - if (nonna) { - if (xlength(na) != 1) { - error(_("Length of 'default' must be 1.")); - } - SEXPTYPE tn = TYPEOF(na); - if (tn==LGLSXP && LOGICAL(na)[0]==NA_LOGICAL) { - nonna = false; - } else { - if (tn != type0) { - error(_("Resulting value is of type %s but 'default' is of type %s. " - "Please make sure that both arguments have the same type."), type2char(type0), type2char(tn)); - } - if (!R_compute_identical(PROTECT(getAttrib(value0,R_ClassSymbol)), PROTECT(getAttrib(na,R_ClassSymbol)), 0)) { - error(_("Resulting value has different class than 'default'. 
" - "Please make sure that both arguments have the same class.")); - } - UNPROTECT(2); - if (isFactor(value0)) { - if (!R_compute_identical(PROTECT(getAttrib(value0,R_LevelsSymbol)), PROTECT(getAttrib(na,R_LevelsSymbol)), 0)) { - error(_("Resulting value and 'default' are both type factor but their levels are different.")); - } - UNPROTECT(2); - } - } - } - + int nprotect=0, l; + int64_t len0=0, len1=0, len2=0; + SEXP ans=R_NilValue, value0=R_NilValue, tracker=R_NilValue, whens=R_NilValue, thens=R_NilValue; + PROTECT_INDEX Iwhens, Ithens; + PROTECT_WITH_INDEX(whens, &Iwhens); nprotect++; + PROTECT_WITH_INDEX(thens, &Ithens); nprotect++; + SEXPTYPE type0=NILSXP; + // naout means if the output is scalar logic na + bool imask = true, naout = false, idefault = false; + int *restrict p = NULL; const int n = narg/2; for (int i=0; i0) { - if (xlength(cons) != len0) { - error(_("Argument #%d has a different length than argument #1. " - "Please make sure all logical conditions have the same length."), - i*2+1); + const int *restrict pwhens = LOGICAL(whens); + l = 0; + if (i == 0) { + len0 = xlength(whens); + len2 = len0; + type0 = TYPEOF(thens); + value0 = thens; + ans = PROTECT(allocVector(type0, len0)); nprotect++; + copyMostAttrib(thens, ans); + tracker = PROTECT(allocVector(INTSXP, len0)); nprotect++; + p = INTEGER(tracker); + } else { + imask = false; + naout = xlength(thens) == 1 && TYPEOF(thens) == LGLSXP && LOGICAL(thens)[0]==NA_LOGICAL; + if (xlength(whens) != len0 && xlength(whens) != 1) { + // no need to check `idefault` here because the con for default is always `TRUE` + error(_("Argument #%d has length %lld which differs from that of argument #1 (%lld). " + "Please make sure all logical conditions have the same length or length 1."), + i*2+1, (long long)xlength(whens), (long long)len0); } - if (TYPEOF(outs) != type0) { - error(_("Argument #%d is of type %s, however argument #2 is of type %s. 
" - "Please make sure all output values have the same type."), - i*2+2, type2char(TYPEOF(outs)), type2char(type0)); + if (!naout && TYPEOF(thens) != type0) { + if (idefault) { + error(_("Resulting value is of type %s but 'default' is of type %s. " + "Please make sure that both arguments have the same type."), type2char(type0), type2char(TYPEOF(thens))); + } else { + error(_("Argument #%d is of type %s, however argument #2 is of type %s. " + "Please make sure all output values have the same type."), + i*2+2, type2char(TYPEOF(thens)), type2char(type0)); + } } - if (!R_compute_identical(PROTECT(getAttrib(value0,R_ClassSymbol)), PROTECT(getAttrib(outs,R_ClassSymbol)), 0)) { - error(_("Argument #%d has different class than argument #2, " - "Please make sure all output values have the same class."), i*2+2); + if (!naout) { + if (!R_compute_identical(PROTECT(getAttrib(value0, R_ClassSymbol)), PROTECT(getAttrib(thens, R_ClassSymbol)), 0)) { + if (idefault) { + error(_("Resulting value has different class than 'default'. 
" + "Please make sure that both arguments have the same class.")); + } else { + error(_("Argument #%d has different class than argument #2, " + "Please make sure all output values have the same class."), i*2+2); + } + } + UNPROTECT(2); // class(value0), class(thens) } - UNPROTECT(2); - if (isFactor(value0)) { - if (!R_compute_identical(PROTECT(getAttrib(value0,R_LevelsSymbol)), PROTECT(getAttrib(outs,R_LevelsSymbol)), 0)) { - error(_("Argument #2 and argument #%d are both factor but their levels are different."), i*2+2); + if (!naout && isFactor(value0)) { + if (!R_compute_identical(PROTECT(getAttrib(value0, R_LevelsSymbol)), PROTECT(getAttrib(thens, R_LevelsSymbol)), 0)) { + if (idefault) { + error(_("Resulting value and 'default' are both type factor but their levels are different.")); + } else { + error(_("Argument #2 and argument #%d are both factor but their levels are different."), i*2+2); + } } - UNPROTECT(2); + UNPROTECT(2); // levels(value0), levels(thens) } } - int64_t len1 = xlength(outs); - if (len1!=len0 && len1!=1) { - error(_("Length of output value #%d must either be 1 or length of logical condition."), i*2+2); + len1 = xlength(thens); + if (len1 != len0 && len1 != 1) { + if (idefault) { + error(_("Length of 'default' must be 1 or %lld."), (long long)len0); + } else { + error(_("Length of output value #%d (%lld) must either be 1 or match the length of the logical condition (%lld)."), i*2+2, (long long)len1, (long long)len0); + } } - int64_t amask = len1>1 ? INT64_MAX : 0; - const int *restrict pcons = LOGICAL(cons); - const bool imask = i==0; - int64_t l=0; // how many this case didn't satisfy; i.e. left for next case - switch(TYPEOF(outs)) { + int64_t thenMask = len1>1 ? INT64_MAX : 0, whenMask = xlength(whens)>1 ? 
INT64_MAX : 0; + switch(TYPEOF(ans)) { case LGLSXP: { - const int *restrict pouts = LOGICAL(outs); + const int *restrict pthens; + if (!naout) pthens = LOGICAL(thens); // the content is not useful if out is NA_LOGICAL scalar int *restrict pans = LOGICAL(ans); - const int pna = nonna ? LOGICAL(na)[0] : NA_LOGICAL; + const int pna = NA_LOGICAL; for (int64_t j=0; j Date: Mon, 29 Jul 2024 12:55:21 -0400 Subject: [PATCH 29/41] Skip sorting already sorted (#4501) * Update data.table.R * Update data.table.R * Update NEWS.md * improved logic for catching for-sure-sorted jval * Allow .SD; allow original is.sorted portion to escape if irows is.null * partial rollback * Fixed logic of .SD selection * typo * typo * DT[,.SD] improved in #4501 * Slow is a recognized test name with a specific color * fix lint * bigger N and set.seed * nearly->close-to --------- Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico Co-authored-by: Toby Hocking --- .ci/atime/tests.R | 23 +++++++++++++++++++++-- NEWS.md | 2 ++ R/data.table.R | 21 ++++++++++++++++++++- inst/tests/tests.Rraw | 8 +++++++- 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index fddfb1347..ecd5d33d5 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -119,6 +119,25 @@ test.list <- atime::atime_test_list( data.table:::setDT(L) }, Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) - Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) -) + Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) + + # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498 + # Fixed 
in: https://github.com/Rdatatable/data.table/pull/4501 + "DT[,.SD] improved in #4501" = atime::atime_test( + N = 10^seq(1, 10, by=0.5), + setup = { + set.seed(1) + L = as.data.table(as.character(rnorm(N, 1, 0.5))) + setkey(L, V1) + }, + ## New DT can safely retain key. + expr = { + data.table:::`[.data.table`(L, , .SD) + }, + Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue + Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue + Slower = "cacdc92df71b777369a217b6c902c687cf35a70d" # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue + ), + + NULL) # nolint end: undesirable_operator_linter. diff --git a/NEWS.md b/NEWS.md index a54583a3d..fd13331a2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -82,6 +82,8 @@ 15. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. +16. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). 
Thanks to @user9439449 on StackOverflow for the report. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/data.table.R b/R/data.table.R index cb32836b0..9bc04e3ca 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1427,8 +1427,27 @@ replace_dot_alias = function(e) { # should set the parent class only when jval is a plain data.table #4324 if (identical(class(jval), c('data.table', 'data.frame'))) setattr(jval, 'class', class(x)) # fix for #64 - if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) + # can jval be sorted by the same key as x? improved for #4498 + get_shared_keys = function(jsub, jvnames, sdvars, key) { + if (is.null(key)) return(NULL) + if (!((SD_only <- jsub == quote(.SD)) || jsub %iscall% "list")) return(NULL) + if (SD_only) + jvnames = jnames = sdvars + else + jnames = as.character(Filter(is.name, jsub)[-1L]) + key_idx = chmatch(key, jnames) + missing_keys = which(is.na(key_idx)) + if (length(missing_keys) && missing_keys[1L] == 1L) return(NULL) + if (!length(missing_keys)) return(jvnames[key_idx]) + jvnames[head(key_idx, missing_keys[1L] - 1L)] + } + shared_keys = get_shared_keys(jsub, jvnames, sdvars = sdvars, key(x)) + if (is.null(irows) && !is.null(shared_keys)) { + setattr(jval, 'sorted', shared_keys) + # potentially inefficient backup -- check if jval is sorted by key(x) + } else if (haskey(x) && all(key(x) %chin% names(jval)) && is.sorted(jval, by=key(x))) { setattr(jval, 'sorted', key(x)) + } if (any(vapply_1b(jval, is.null))) stopf("Internal error: j has created a data.table result containing a NULL column") # nocov } return(jval) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a614a9e2f..dfec93e89 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3892,7 +3892,7 @@ test(1118, dt[, lapply(.SD, function(y) 
weighted.mean(y, b2, na.rm=TRUE)), by=x] # a(nother) test of #295 DT <- data.table(x=5:1, y=1:5, key="y") -test(1119, is.null(key(DT[, list(z = y, y = 1/y)]))) +test(1119, key(DT[, list(z = y, y = 1/y)]), 'z') ## various ordered factor rbind tests DT1 = data.table(ordered('a', levels = c('a','b','c'))) @@ -18718,6 +18718,12 @@ DT = data.table(a = rep(1:3, 2)) # NB: recall we can't use non-ASCII symbols here. the text is a--o (year in Spanish) setnames(DT, "a", "a\U00F1o") test(2266, eval(parse(text="DT[ , .N, a\U00F1o]$N[1L]")), 2L) +# sub-key can also be retained in plain query, part of #4498 +DT = data.table(id = rep(1:10, 2L), grp = rep(1:2, each=10L), V = 1:20/13, key=c('id', 'grp')) +test(2266.1, key(DT[ , .(id)]), 'id') +test(2266.2, key(DT[ , .(grp)]), NULL) +## renaming also caught +test(2266.3, key(DT[ , .(newid = id, newgrp = grp)]), c('newid', 'newgrp')) # all.equal failed to dispatch to methods of columns, #4543 DT1 = data.table(t = .POSIXct(1590973200, tz='UTC')) From 0a25b42754d33d38c38819a42e1acc66b42c34a6 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 29 Jul 2024 12:32:15 -0700 Subject: [PATCH 30/41] Use deparse() to cast calls to string when needed (#6027) * Don't cast jsub[[1]] to character if invalid * Fix for case of lambda in j * comment on why '(' handling is needed * switch to format() in progress * fix %iscall% * more fixes * var in earlier test was masking stats::var * NEWS * Use deparse() directly to avoid tiny overhead & make it easier to find deparse1() calls later * discourage f=sum, encourage f="sum" * Also catch pkg:::fun * test for ':::' too --- NEWS.md | 2 ++ R/data.table.R | 40 +++++++++++++++++++++------------------- R/utils.R | 3 ++- inst/tests/tests.Rraw | 8 ++++++++ 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index fd13331a2..0ba12fc4b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -84,6 +84,8 @@ 16. 
Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report. +17. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. + ## NOTES 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. diff --git a/R/data.table.R b/R/data.table.R index 9bc04e3ca..5b23169c4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -242,7 +242,7 @@ replace_dot_alias = function(e) { } if (!missing(j)) { jsub = replace_dot_alias(jsub) - root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" + root = root_name(jsub) if (root == ":" || (root %chin% c("-","!") && jsub[[2L]] %iscall% '(' && jsub[[2L]][[2L]] %iscall% ':') || ( (!length(av<-all.vars(jsub)) || all(startsWith(av, ".."))) && @@ -285,7 +285,7 @@ replace_dot_alias = function(e) { if (root=="{") { if (length(jsub) == 2L) { jsub = jsub[[2L]] # to allow {} wrapping of := e.g. [,{`:=`(...)},] [#376] - root = if (is.call(jsub)) as.character(jsub[[1L]])[1L] else "" + root = root_name(jsub) } else if (length(jsub) > 2L && jsub[[2L]] %iscall% ":=") { #2142 -- j can be {} and have length 1 stopf("You have wrapped := with {} which is ok but then := must be the only thing inside {}. You have something else inside {} as well. Consider placing the {} on the RHS of := instead; e.g. 
DT[,someCol:={tmpVar1<-...;tmpVar2<-...;tmpVar1*tmpVar2}") @@ -298,10 +298,8 @@ replace_dot_alias = function(e) { jsub = eval(jsub[[2L]], parent.frame(), parent.frame()) # this evals the symbol to return the dynamic expression if (is.expression(jsub)) jsub = jsub[[1L]] # if expression, convert it to call # Note that the dynamic expression could now be := (new in v1.9.7) - root = if (is.call(jsub)) { - jsub = replace_dot_alias(jsub) - as.character(jsub[[1L]])[1L] - } else "" + jsub = replace_dot_alias(jsub) + root = root_name(jsub) } if (root == ":=" || root == "let") { # let(...) as alias for :=(...) (#3795) if (root == "let") @@ -1401,7 +1399,7 @@ replace_dot_alias = function(e) { .Call(Cassign,x,irows,cols,newnames,jval) return(suppPrint(x)) } - if ((is.call(jsub) && jsub[[1L]] != "get" && is.list(jval) && !is.object(jval)) || !missingby) { + if ((is.call(jsub) && !jsub %iscall% "get" && is.list(jval) && !is.object(jval)) || !missingby) { # is.call: selecting from a list column should return list # is.object: for test 168 and 168.1 (S4 object result from ggplot2::qplot). Just plain list results should result in data.table @@ -1647,25 +1645,25 @@ replace_dot_alias = function(e) { jsub = as.call(c(quote(list), lapply(sdvars, as.name))) jvnames = sdvars } - } else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with + } else if (is.name(jsub[[1L]])) { # Else expect problems with # g[[ only applies to atomic input, for now, was causing #4159. 
be sure to eval with enclos=parent.frame() for #4612 subopt = length(jsub) == 3L && - (jsub[[1L]] == "[" || - (jsub[[1L]] == "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && + (jsub %iscall% "[" || + (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") - headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail" - firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030 + headopt = jsub %iscall% c("head", "tail") + firstopt = jsub %iscall% c("first", "last") # fix for #2030 if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && (subopt || headopt || firstopt)) { if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) jvnames = sdvars - } else if (jsub[[1L]]=="lapply" && jsub[[2L]]==".SD" && length(xcols)) { + } else if (jsub %iscall% "lapply" && jsub[[2L]]==".SD" && length(xcols)) { deparse_ans = .massageSD(jsub) jsub = deparse_ans[[1L]] jvnames = deparse_ans[[2L]] - } else if (jsub[[1L]] == "c" && length(jsub) > 1L) { + } else if (jsub %iscall% "c" && length(jsub) > 1L) { # TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already. # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. # FR #735 tries to optimise j-expressions of the form c(...) as long as ... 
contains @@ -1770,7 +1768,7 @@ replace_dot_alias = function(e) { GForce = FALSE } else { # Apply GForce - if (jsub[[1L]]=="list") { + if (jsub %iscall% "list") { GForce = TRUE for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { if (!.gforce_ok(jsub[[ii]], SDenv$.SDall)) {GForce = FALSE; break} @@ -1778,7 +1776,7 @@ replace_dot_alias = function(e) { } else GForce = .gforce_ok(jsub, SDenv$.SDall) if (GForce) { - if (jsub[[1L]]=="list") + if (jsub %iscall% "list") for (ii in seq_along(jsub)[-1L]) { if (is.N(jsub[[ii]])) next; # For #334 jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x) @@ -1796,7 +1794,7 @@ replace_dot_alias = function(e) { # Still do the old speedup for mean, for now nomeanopt=FALSE # to be set by .optmean() using <<- inside it oldjsub = jsub - if (jsub[[1L]]=="list") { + if (jsub %iscall% "list") { # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() # jsub[[1]]=="list" so the first item of todo will always be FALSE todo = sapply(jsub, `%iscall%`, 'mean') @@ -1804,7 +1802,7 @@ replace_dot_alias = function(e) { w = which(todo) jsub[w] = lapply(jsub[w], .optmean) } - } else if (jsub[[1L]]=="mean") { + } else if (jsub %iscall% "mean") { jsub = .optmean(jsub) } if (nomeanopt) { @@ -1884,7 +1882,7 @@ replace_dot_alias = function(e) { (q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 else 0 } - if (jsub[[1L]] == "list"){ + if (jsub %iscall% "list"){ q3 = max(sapply(jsub, headTail_arg)) } else if (length(jsub)==3L) { q3 = headTail_arg(jsub) @@ -1986,6 +1984,10 @@ replace_dot_alias = function(e) { setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } +# What's the name of the top-level call in 'j'? +# NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026). +root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else "" + DT = function(x, ...) 
{ #4872 old = getOption("datatable.optimize") if (!is.data.table(x) && old>2L) { diff --git a/R/utils.R b/R/utils.R index ff4766d5e..cf23609ee 100644 --- a/R/utils.R +++ b/R/utils.R @@ -155,7 +155,8 @@ is_utc = function(tz) { `%iscall%` = function(e, f) { if (!is.call(e)) return(FALSE) if (is.name(e1 <- e[[1L]])) return(e1 %chin% f) - e1 %iscall% '::' && e1[[3L]] %chin% f + if (e1 %iscall% c('::', ':::')) return(e1[[3L]] %chin% f) + paste(deparse(e1), collapse = " ") %chin% f # complicated cases e.g. a closure/builtin on LHS of call; note that format() is much (e.g. 40x) slower than deparse() } # nocov start #593 always return a data.table diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dfec93e89..094af74c7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18994,3 +18994,11 @@ test(2276.14, fcase(c(TRUE, FALSE), 1L, c(TRUE, TRUE), NA), c(1L, NA_integer_)) # output is missing test(2276.15, fcase(c(TRUE, FALSE), NA_integer_, c(TRUE, TRUE), 2L), c(NA_integer_, 2L)) + +# passing a function in env= doesn't trip up processing 'j', #6026 +DT=data.table(a=1:2, b=3:4) +test(2277.1, DT[, builtin(b), env=list(builtin=sum)], 7L) +test(2277.2, DT[, closure(b), env=list(closure=var)], 0.5) +test(2277.3, DT[, closure(b), env=list(closure=stats::var)], 0.5) +test(2277.4, DT[, closure(b), env=list(closure=stats:::var)], 0.5) +test(2277.5, DT[, lambda(b), env=list(lambda=function(x) sum(x))], 7L) From bb46efe2088e7e87491262a94f891871f9a19af0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 29 Jul 2024 12:33:15 -0700 Subject: [PATCH 31/41] PROTECT() more in forder+bmerge (#6324) --- src/bmerge.c | 6 +++++- src/forder.c | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/bmerge.c b/src/bmerge.c index 108d82861..f12a56f01 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -162,7 +162,11 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r allGrp1[0] = TRUE; protecti += 2; - SEXP oSxp = 
PROTECT(forderReuseSorting(idt, icolsArg, /* retGrpArg= */ScalarLogical(FALSE), /* retStatsArg= */ScalarLogical(FALSE), /* sortGroupsArg= */ScalarLogical(TRUE), /* ascArg= */ScalarInteger(1), /* naArg= */ScalarLogical(FALSE), /* lazyArg= */ScalarLogical(TRUE))); protecti++; + SEXP ascArg = PROTECT(ScalarInteger(1)); + SEXP oSxp = PROTECT(forderReuseSorting(idt, icolsArg, /* retGrpArg= */ScalarLogical(FALSE), /* retStatsArg= */ScalarLogical(FALSE), /* sortGroupsArg= */ScalarLogical(TRUE), ascArg, /* naArg= */ScalarLogical(FALSE), /* lazyArg= */ScalarLogical(TRUE))); protecti++; + UNPROTECT(2); // down stack to 'ascArg' + PROTECT(oSxp); + if (!LENGTH(oSxp)) o = NULL; else diff --git a/src/forder.c b/src/forder.c index 564f55ec3..db2197443 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1540,10 +1540,12 @@ bool colsKeyHead(SEXP x, SEXP cols) { SEXP idxName(SEXP x, SEXP cols) { if (!isInteger(cols)) error("internal error: 'cols' must be an integer"); // # nocov - SEXP dt_names = getAttrib(x, R_NamesSymbol); + SEXP dt_names = PROTECT(getAttrib(x, R_NamesSymbol)); if (!isString(dt_names)) error("internal error: 'DT' has no names"); // # nocov SEXP idx_names = PROTECT(subsetVector(dt_names, cols)); + UNPROTECT(2); // down-stack to 'dt_names' + PROTECT(idx_names); SEXP char_underscore2 = PROTECT(ScalarString(mkChar("__"))); SEXP char_empty = PROTECT(ScalarString(mkChar(""))); SEXP sym_paste0 = install("paste0"); From b3c24a67e236e11a206d3759704fec8f258acf7f Mon Sep 17 00:00:00 2001 From: Nitish Jha <151559388+Nj221102@users.noreply.github.com> Date: Tue, 30 Jul 2024 01:10:37 +0530 Subject: [PATCH 32/41] updated documentation regarding behavior of `rbindlist` when applied to `difftime` objects with different units (#6309) * updated documentation regarding behavior of rbind when applied to difftime objects with different units * Update rbindlist.Rd * fix mistakes * add dot --------- Co-authored-by: nitish jha Co-authored-by: Benjamin Schwendinger 
<52290390+ben-schwen@users.noreply.github.com> --- man/rbindlist.Rd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd index 17c5c2205..f3b7e6845 100644 --- a/man/rbindlist.Rd +++ b/man/rbindlist.Rd @@ -25,6 +25,8 @@ Columns with duplicate names are bound in the order of occurrence, similar to ba If column \code{i} does not have the same type in each of the list items; e.g, the column is \code{integer} in item 1 while others are \code{numeric}, they are coerced to the highest type. If a column contains factors then a factor is created. If any of the factors are also ordered factors then the longest set of ordered levels are found (the first if this is tied). Then the ordered levels from each list item are checked to be an ordered subset of these longest levels. If any ambiguities are found (e.g. \code{blue Date: Mon, 29 Jul 2024 12:49:26 -0700 Subject: [PATCH 33/41] adding an atime test case; groupby with dogroups (R expression) #PR4558 (#6288) Co-authored-by: Toby Dylan Hocking --- .ci/atime/tests.R | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index ecd5d33d5..f962b6672 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -121,6 +121,24 @@ test.list <- atime::atime_test_list( Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits) + # Issue reported in: https://github.com/Rdatatable/data.table/issues/4200 + # To be fixed in: https://github.com/Rdatatable/data.table/pull/4558 + "DT[by] fixed in #4558" = atime::atime_test( + N = 10^seq(1, 20), + setup = { + d <- data.table( + id3 = sample(c(seq.int(N*0.9), sample( N*0.9, N*0.1, TRUE))), + v1 = 
sample(5L, N, TRUE), + v2 = sample(5L, N, TRUE) + ) + }, + expr = { + expr=data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id3) + }, + Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression + Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression + Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression + # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498 # Fixed in: https://github.com/Rdatatable/data.table/pull/4501 "DT[,.SD] improved in #4501" = atime::atime_test( @@ -136,8 +154,7 @@ test.list <- atime::atime_test_list( }, Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue - Slower = "cacdc92df71b777369a217b6c902c687cf35a70d" # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue - ), + Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit 
(https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue NULL) # nolint end: undesirable_operator_linter. From 4704c82c1cd3f929faeac46774108f33b89c1198 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Mon, 29 Jul 2024 13:02:45 -0700 Subject: [PATCH 34/41] Progress bar/indicator for "by" operations (#6228) Co-authored-by: Michael Chirico Co-authored-by: Toby Dylan Hocking --- NEWS.md | 2 ++ R/data.table.R | 5 +++-- man/data.table.Rd | 5 ++++- src/data.table.h | 2 +- src/dogroups.c | 21 +++++++++++++++++++-- 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0ba12fc4b..e3f7c9f4c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -50,6 +50,8 @@ 16. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks @sindribaldur for the feature request, which has been highly requested, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. +17. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. 
Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index 5b23169c4..fa07593e9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -127,7 +127,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) +"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive())) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) @@ -224,6 +224,7 @@ replace_dot_alias = function(e) { if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") + if (!isTRUEorFALSE(showProgress)) stopf("%s must be TRUE or FALSE", "showProgress") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. 
notjoin = FALSE rightcols = leftcols = integer() @@ -1901,7 +1902,7 @@ replace_dot_alias = function(e) { } ans = c(g, ans) } else { - ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) + ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, showProgress) } # unlock any locked data.table components of the answer, #4159 # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 diff --git a/man/data.table.Rd b/man/data.table.Rd index 680e25574..729c0861c 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -35,7 +35,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac .SDcols, verbose = getOption("datatable.verbose"), # default: FALSE allow.cartesian = getOption("datatable.allow.cartesian"), # default: FALSE - drop = NULL, on = NULL, env = NULL) + drop = NULL, on = NULL, env = NULL, + showProgress = getOption("datatable.showProgress", interactive())) } \arguments{ \item{\dots}{ Just as \code{\dots} in \code{\link{data.frame}}. Usual recycling rules are applied to vectors of different lengths to create a list of equal length vectors.} @@ -177,6 +178,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac } \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } + + \item{showProgress}{ \code{TRUE} shows progress indicator with estimated time to completion for lengthy "by" operations. 
} } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/src/data.table.h b/src/data.table.h index ed63978d6..49f3f1634 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -189,7 +189,7 @@ void warn_matrix_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, - SEXP on, SEXP verbose); + SEXP on, SEXP verbose, SEXP showProgressArg); // bmerge.c SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, diff --git a/src/dogroups.c b/src/dogroups.c index e03ad84df..2728a8bcf 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -63,7 +63,7 @@ static bool anySpecialStatic(SEXP x) { return false; } -SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg) +SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; @@ -71,6 +71,10 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; const bool verbose = LOGICAL(verboseArg)[0]==1; + const bool showProgress = LOGICAL(showProgressArg)[0]==1; + bool hasPrinted = false; + double startTime = (showProgress) ? wallclock() : 0; + double nextTime = (showProgress) ? 
startTime+3 : 0; // wait 3 seconds before printing progress if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov @@ -169,7 +173,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // because it is a rare edge case for it to be true. See #4892. bool anyNA=false, orderedSubset=false; check_idx(order, length(VECTOR_ELT(dt, 0)), &anyNA, &orderedSubset); - for(int i=0; i-1)) continue; @@ -435,6 +438,19 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (copied) UNPROTECT(1); } } + // progress printing, #3060 + // could potentially refactor to use fread's progress() function, however we would lose some information in favor of simplicity. + double now; + if (showProgress && (now=wallclock())>=nextTime) { + double avgTimePerGroup = (now-startTime)/(i+1); + int ETA = (int)(avgTimePerGroup*(ngrp-i-1)); + if (hasPrinted || ETA >= 0) { + if (verbose && !hasPrinted) Rprintf(_("\n")); + Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA); + } + nextTime = now+1; + hasPrinted = true; + } ansloc += maxn; if (firstalloc) { nprotect++; // remember the first jval. If we UNPROTECTed now, we'd unprotect @@ -443,6 +459,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } else UNPROTECT(1); // the jval. Don't want them to build up. The first jval can stay protected till the end ok. } + if (showProgress && hasPrinted) Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. 
ETA: %ds.\n"), ngrp, ngrp, 100.0, (int)(wallclock()-startTime), 0); if (isNull(lhs) && ans!=NULL) { if (ansloc < LENGTH(VECTOR_ELT(ans,0))) { if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0))); From 0ed550249ce132881ac5824152e5516d1b60f808 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Mon, 29 Jul 2024 13:45:26 -0700 Subject: [PATCH 35/41] set() adds new cols when rows aren't updated (#6204) * set() adds new cols when rows aren't updated * new test * review suggestions * review changes * NEWS * Update NEWS.md Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> * new test * documentation update * consistent framing * grammar/clarity --------- Co-authored-by: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 9 +++++++++ man/assign.Rd | 2 +- src/assign.c | 5 +++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index e3f7c9f4c..59a671410 100644 --- a/NEWS.md +++ b/NEWS.md @@ -178,6 +178,8 @@ d1[d2, on="id", verbose=TRUE] This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. +23. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). 
Thanks @trafficfan for the report and @MichaelChirico for the fix. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 094af74c7..1f303c31e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -19002,3 +19002,12 @@ test(2277.2, DT[, closure(b), env=list(closure=var)], 0.5) test(2277.3, DT[, closure(b), env=list(closure=stats::var)], 0.5) test(2277.4, DT[, closure(b), env=list(closure=stats:::var)], 0.5) test(2277.5, DT[, lambda(b), env=list(lambda=function(x) sum(x))], 7L) + +# test that set() correctly adds new columns even if no rows are updated +dt = data.table(a=1L) +test(2278.1, set(copy(dt), 0L, "b", logical(0)), data.table(a=1L, b=NA)) +test(2278.2, set(copy(dt), NA_integer_, "b", NA), data.table(a=1L, b=NA)) +test(2278.3, set(copy(dt), 0L, "b", NA), copy(dt)[0L, b := NA]) +test(2278.4, set(copy(dt), NA_integer_, "b", logical(0)), copy(dt)[NA_integer_, b := logical(0)]) +test(2278.5, set(copy(dt), integer(0), "b", numeric(0)), copy(dt)[integer(0), b := numeric(0)]) +test(2278.6, { set(dt, 0L, "b", logical(0)); set(dt, 1L, "a", 2L); dt }, data.table(a=2L, b=NA)) diff --git a/man/assign.Rd b/man/assign.Rd index daeac56a7..71e954230 100644 --- a/man/assign.Rd +++ b/man/assign.Rd @@ -35,7 +35,7 @@ set(x, i = NULL, j, value) \item{LHS}{ A character vector of column names (or numeric positions) or a variable that evaluates as such. If the column doesn't exist, it is added, \emph{by reference}. } \item{RHS}{ A list of replacement values. It is recycled in the usual way to fill the number of rows satisfying \code{i}, if any. To remove a column use \code{NULL}. } \item{x}{ A \code{data.table}. Or, \code{set()} accepts \code{data.frame}, too. } -\item{i}{ Optional. Indicates the rows on which the values must be updated with. If not provided, implies \emph{all rows}. The \code{:=} form is more powerful as it allows \emph{subsets} and \code{joins} based add/update columns by reference. See \code{Details}. +\item{i}{ Optional. 
Indicates the rows on which the values must be updated. If not \code{NULL}, implies \emph{all rows}. Missing or zero values are ignored. The \code{:=} form is more powerful as it allows adding/updating columns by reference based on \emph{subsets} and \code{joins}. See \code{Details}. In \code{set}, only integer type is allowed in \code{i} indicating which rows \code{value} should be assigned to. \code{NULL} represents all rows more efficiently than creating a vector such as \code{1:nrow(x)}. } \item{j}{ Column name(s) (character) or number(s) (integer) to be assigned \code{value} when column(s) already exist, and only column name(s) if they are to be created. } diff --git a/src/assign.c b/src/assign.c index b1623875e..a78b9452c 100644 --- a/src/assign.c +++ b/src/assign.c @@ -378,12 +378,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) for (int i=0; inrow) error(_("i[%d] is %d which is out of range [1,nrow=%d]"), i+1, rowsd[i], nrow); // set() reaches here (test 2005.2); := reaches the same error in subset.c first - if (rowsd[i]>=1) numToDo++; + if (rowsd[i]>=0) numToDo++; } if (verbose) Rprintf(_("Assigning to %d row subset of %d rows\n"), numToDo, nrow); // TODO: include in message if any rows are assigned several times (e.g. by=.EACHI with dups in i) if (numToDo==0) { - if (!length(newcolnames)) { + // isString(cols) is exclusive to calls from set() + if (!length(newcolnames) && !isString(cols)) { *_Last_updated = 0; UNPROTECT(protecti); return(dt); // all items of rows either 0 or NA. 
!length(newcolnames) for #759 From 138477c60f87c27a60ceb3e92a6d65afcf313fe3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 29 Jul 2024 13:50:25 -0700 Subject: [PATCH 36/41] Set _R_CHECK_COMPILATION_FLAGS_KNOWN_ to suppress R CMD check NOTE (#6327) --- .gitlab-ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8a3fa4d71..77ecc5280 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -110,6 +110,7 @@ test-lin-rel: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-data.table variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "FALSE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_CHECK_FORCE_SUGGESTS_: "TRUE" @@ -131,6 +132,8 @@ test-lin-rel: test-lin-rel-vanilla: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base-gcc + variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" script: - echo 'CFLAGS=-g -O0 -fno-openmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars - echo 'CXXFLAGS=-g -O0 -fno-openmp -Wall -Wvla -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' >> ~/.R/Makevars @@ -143,6 +146,7 @@ test-lin-rel-cran: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-base variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though) _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284 _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes @@ -163,6 +167,7 @@ test-lin-dev-gcc-strict-cran: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-gcc-strict variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" ## detects S3 method lookup 
found on search path #4777 @@ -184,6 +189,7 @@ test-lin-dev-clang-cran: <<: *test-lin image: registry.gitlab.com/jangorecki/dockerfiles/r-devel-clang variables: + _R_CHECK_COMPILATION_FLAGS_KNOWN_: "-Wvla" _R_CHECK_CRAN_INCOMING_: "TRUE" _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" _R_S3_METHOD_LOOKUP_BASEENV_AFTER_GLOBALENV_: "FALSE" From b8d5f83270d45a47316a41259bedae63f3d2854d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 30 Jul 2024 05:17:26 -0700 Subject: [PATCH 37/41] Setdroplevels for in-place removal of unused levels (#6316) * Add setdroplevels() to handle droplevels.data.table() * Export it * Some tests * NEWS --- NAMESPACE | 2 +- NEWS.md | 8 ++++++-- R/fdroplevels.R | 35 ++++++++++++++++++++--------------- inst/tests/tests.Rraw | 11 +++++++++-- man/fdroplevels.Rd | 2 ++ 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 109336c9e..2bc30543f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -201,5 +201,5 @@ S3method(format_col, expression) export(format_list_item) S3method(format_list_item, default) -export(fdroplevels) +export(fdroplevels, setdroplevels) S3method(droplevels, data.table) diff --git a/NEWS.md b/NEWS.md index 59a671410..d104981f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,9 +2,11 @@ # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) -## BREAKING CHANGE +## BREAKING CHANGES + +1. `droplevels(in.place=TRUE)` is deprecated in favor of calling `setdroplevels()`, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Given the associated risks/pain points, we strongly prefer all in-place/by-reference behavior within data.table come from functions `set*` (and `:=`) to make it as clear as possible that inputs are mutable. See below and `?setdroplevels` for more. -1. `` `[.data.table` `` is un-exported again. 
This was exported to support an experimental feature (`DT()` functional form of `[`) that never made it to release, but we forgot to claw back this export in the NAMESPACE; sorry about that. We didn't find anyone calling the method directly (which is inadvisable to begin with). +2. `` `[.data.table` `` is un-exported again. This was exported to support an experimental feature (`DT()` functional form of `[`) that never made it to release, but we forgot to claw back this export in the NAMESPACE; sorry about that. We didn't find anyone calling the method directly (which is inadvisable to begin with). ## NEW FEATURES @@ -52,6 +54,8 @@ 17. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. +18. New `setdroplevels()` as a by-reference version of the `droplevels()` method, which returns a copy of its input, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Thanks @MichaelChirico for the suggestion and implementation. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/fdroplevels.R b/R/fdroplevels.R index 69f23cb61..3116a3f85 100644 --- a/R/fdroplevels.R +++ b/R/fdroplevels.R @@ -8,19 +8,24 @@ fdroplevels = function(x, exclude = if (anyNA(levels(x))) NULL else NA, ...) 
{ return(ans) } -droplevels.data.table = function(x, except = NULL, exclude, in.place = FALSE, ...){ - stopifnot(is.logical(in.place)) - if (nrow(x)==0L) return(x) - ix = vapply(x, is.factor, NA) - if(!is.null(except)){ - stopifnot(is.numeric(except), except <= length(x)) - ix[except] = FALSE - } - if(!sum(ix)) return(x) - if(!in.place) x = copy(x) - for(nx in names(ix)[ix==TRUE]){ - if (missing(exclude)) set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]])) - else set(x, i = NULL, j = nx, value = fdroplevels(x[[nx]], exclude = exclude)) - } - return(x) +droplevels.data.table = function(x, except=NULL, exclude, in.place=FALSE, ...){ + stopifnot(is.logical(in.place)) + if (isTRUE(in.place)) warningf("droplevels() with in.place=TRUE is deprecated. Use setdroplevels() instead.") + if (!in.place) x = copy(x) + if (missing(exclude)) exclude = NULL + setdroplevels(x, except, exclude)[] +} + +setdroplevels = function(x, except=NULL, exclude=NULL) { + if (!nrow(x)) return(invisible(x)) + ix = vapply_1b(x, is.factor) + if (!is.null(except)) { + stopifnot(is.numeric(except), except >= 1, except <= length(x)) + ix[except] = FALSE + } + if (!any(ix)) return(invisible(x)) + for (nx in names(ix)[ix]) { + set(x, i=NULL, j=nx, value=fdroplevels(x[[nx]], exclude=exclude)) + } + invisible(x) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1f303c31e..b3447b04e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1682,7 +1682,7 @@ test(529.4, set(DT1, i=NULL, j=7L, value=5L), error="Item 1 of column numbers in # Test that data.frame incompability is fixed, came to light in Feb 2012 DT = data.table(name=c('a','b','c'), value=1:3) -test(530, base::droplevels(DT[ name != 'a' ]), data.table(name=c('b','c'),value=2:3)) # base:: because we'll implement a fast droplevels, too. 
+test(530, droplevels(DT[ name != 'a' ]), data.table(name=c('b','c'),value=2:3)) # Test that .set_row_names() is maintained on .SD for each group DT = data.table(a=INT(1,1,2,2,2,3,3,3,3),b=1:9) @@ -17732,7 +17732,7 @@ if (base::getRversion() >= "3.4.0") { } test(2214.06, droplevels(DT)[["a"]], droplevels(DT[1:5,a])) test(2214.07, droplevels(DT, 1)[["a"]], x[1:5]) -test(2214.08, droplevels(DT, in.place=TRUE), DT) +test(2214.08, droplevels(DT, in.place=TRUE), DT, warning="droplevels() with in.place=TRUE is deprecated.") # support ordered factors in fdroplevels o = factor(letters[1:10], ordered=TRUE) test(2214.09, fdroplevels(o[1:5]), droplevels(o[1:5])) @@ -17740,6 +17740,13 @@ test(2214.09, fdroplevels(o[1:5]), droplevels(o[1:5])) test(2214.10, droplevels(DT[0]), DT[0]) test(2214.11, droplevels(data.table()), data.table()) +# setdroplevels() for in-place operations #6014 +x = factor(letters[1:10]) +DT = data.table(a = x)[1:5] +test(2214.12, setdroplevels(DT, except=1L), DT) # don't do anything +test(2214.13, setdroplevels(DT, except=0L), error="except >= 1") +test(2214.14, setdroplevels(DT, except=2L), error="except <= length(x)") +test(2214.15, setdroplevels(DT), DT) # factor i should be just like character i and work, #1632 DT = data.table(A=letters[1:3], B=4:6, key="A") diff --git a/man/fdroplevels.Rd b/man/fdroplevels.Rd index 98334f011..724399d20 100644 --- a/man/fdroplevels.Rd +++ b/man/fdroplevels.Rd @@ -2,6 +2,7 @@ \alias{fdroplevels} \alias{droplevels} \alias{droplevels.data.table} +\alias{setdroplevels} \title{Fast droplevels} \description{ Similar to \code{base::droplevels} but \emph{much faster}. 
@@ -9,6 +10,7 @@ \usage{ fdroplevels(x, exclude = if (anyNA(levels(x))) NULL else NA, \dots) +setdroplevels(x, except = NULL, exclude = NULL) \method{droplevels}{data.table}(x, except = NULL, exclude, in.place = FALSE, \dots) } From 488bdd2e068e4e56444916b75a61e029d3d4d296 Mon Sep 17 00:00:00 2001 From: rtobar Date: Thu, 1 Aug 2024 01:40:41 +0800 Subject: [PATCH 38/41] Mark fread's init function as static (#6328) * Mark fread's init function as static The function isn't used elsewhere, and making it publicly accessible opens the door for runtime linking issues -- where the function is served by other libraries exposing the same function. This was seen in a HPC cluster with software built with spack: 0 0x00001555513d8ce0 in init () from /opt/cray/pe/lib64/libsci_gnu_82_mpi.so.5 1 0x00001555433f46ba in parse_double_extended (...) at fread.c:819 2 0x00001555433f3e97 in detect_types (...) at fread.c:1203 3 0x00001555433f7959 in freadMain (...) at fread.c:1852 4 0x00001555433fd84d in freadR (...) at fRead.c:217 Signed-off-by: Rodrigo Tobar * rm ws * correct placement --------- Signed-off-by: Rodrigo Tobar Co-authored-by: Michael Chirico Co-authored-by: Michael Chirico --- NEWS.md | 2 ++ src/fread.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d104981f6..83f48d685 100644 --- a/NEWS.md +++ b/NEWS.md @@ -184,6 +184,8 @@ This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/43 23. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. +24. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. 
This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix. diff --git a/src/fread.c b/src/fread.c index e301e8cd1..45efa6eeb 100644 --- a/src/fread.c +++ b/src/fread.c @@ -84,7 +84,7 @@ static double NAND; static double INFD; // NAN and INFINITY constants are float, so cast to double once up front. -void init(void) { +static void init(void) { NAND = (double)NAN; INFD = (double)INFINITY; } From 7f18c097bce775793d576eebec2d1a88ba8e78ec Mon Sep 17 00:00:00 2001 From: Rafael Fontenelle Date: Wed, 31 Jul 2024 18:53:07 -0300 Subject: [PATCH 39/41] Use proper language name in R-pt_BR.po (#6332) --- po/R-pt_BR.po | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/po/R-pt_BR.po b/po/R-pt_BR.po index b0db1159c..4fb95ffb8 100644 --- a/po/R-pt_BR.po +++ b/po/R-pt_BR.po @@ -2771,10 +2771,10 @@ msgid "" "**********" msgstr "" "**********\n" -"Executando data.table em inglês; o suporte ao pacote está disponível apenas " +"Executando data.table em português; o suporte ao pacote está disponível apenas " "em inglês. Ao procurar ajuda online, certifique-se de verificar também a " "mensagem de erro em inglês. 
Isso pode ser obtido examinando os arquivos po/R-" -".po e po/.po no código-fonte do pacote, onde as mensagens de " +"pt_BR.po e po/pt_BR.po no código-fonte do pacote, onde as mensagens de " "erro no idioma nativo e em inglês podem ser encontradas lado a lado.\n" "**********" From c36c84ff4a78313f2c71e176553549c9c3c7a5dd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 31 Jul 2024 20:15:27 -0700 Subject: [PATCH 40/41] Don't include po/ directory in bundled package (#6331) --- .Rbuildignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.Rbuildignore b/.Rbuildignore index 6e6b8b401..a6996ac55 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -46,3 +46,6 @@ ^lib$ ^library$ ^devwd$ + +# only the inst/po compressed files are needed, not raw .pot/.po +^po$ From 2349536e0cac05935781f553b31cea4946cf996a Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 1 Aug 2024 09:50:24 -0400 Subject: [PATCH 41/41] melt warns for measure.vars=list of length=1 (#6333) Co-authored-by: Michael Chirico --- NEWS.md | 4 ++-- inst/tests/tests.Rraw | 8 ++++---- src/fmelt.c | 7 +++++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 83f48d685..fdc0db931 100644 --- a/NEWS.md +++ b/NEWS.md @@ -70,8 +70,6 @@ 6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. -7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. Any users who were relying on this behavior can change `measure.vars=list("col_name")` (output `variable` was column name, now is column index/integer) to `measure.vars="col_name"` (`variable` still is column name). - 8. 
Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. 9. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. @@ -94,6 +92,8 @@ ## NOTES +7. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. + 1. `transform` method for data.table sped up substantially when creating new columns on large tables. Thanks to @OfekShilon for the report and PR. The implemented solution was proposed by @ColeMiller1. 2. The documentation for the `fill` argument in `rbind()` and `rbindlist()` now notes the expected behaviour for missing `list` columns when `fill=TRUE`, namely to use `NULL` (not `NA`), [#4198](https://github.com/Rdatatable/data.table/pull/4198). Thanks @sritchie73 for the proposal and fix. 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b3447b04e..57a348ce1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17257,13 +17257,13 @@ exid = data.table(id=1, expected) test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) -test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("1","2")), b=c(1,2))) # measure.vars named list length=1, #5065 +test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("b1","b2")), b=c(1,2)), warning="measure.vars is a list with length=1") # measure.vars named list length=1, #5065 # consistency between measure.vars=list with length=1 and length>1, #5209 -test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor(1), value=2)) +test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2), warning="measure.vars is a list with length=1") test(2182.72, melt(DT.wide, measure.vars=c("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) -test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="1", value=2)) +test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2), warning="measure.vars is a list with length=1") test(2182.74, melt(DT.wide, measure.vars=c("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2)) -test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="1", n=10))#thanks @mnazarov +test(2182.75, 
melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="a", n=10), warning="measure.vars is a list with length=1")#thanks @mnazarov ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues diff --git a/src/fmelt.c b/src/fmelt.c index 51f4fbbb8..77a48f6a4 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -595,9 +595,12 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov if (isNull(data->variable_table)) { + if ((data->lvalues == 1) & data->measure_is_list) { + warning("measure.vars is a list with length=1, which as long documented should return integer indices in the 'variable' column, but currently returns character column names. To increase consistency in the next release, we plan to change 'variable' to integer, so users who were relying on this behavior should change measure.vars=list('col_name') (output variable is column name now, but will become column index/integer) to measure.vars='col_name' (variable is column name before and after the planned change)."); + } if (!varfactor) { SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? 
length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; @@ -616,7 +619,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++;