diff --git a/.Rbuildignore b/.Rbuildignore index 6e6b8b401..a6996ac55 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -46,3 +46,6 @@ ^lib$ ^library$ ^devwd$ + +# only the inst/po compressed files are needed, not raw .pot/.po +^po$ diff --git a/.ci/linters/c/omp_set_num_threads_linter.R b/.ci/linters/c/omp_set_num_threads_linter.R new file mode 100644 index 000000000..b3a100b6a --- /dev/null +++ b/.ci/linters/c/omp_set_num_threads_linter.R @@ -0,0 +1,13 @@ + +# Ensure no calls to omp_set_num_threads() [to avoid affecting other packages and base R] +# Only comments referring to it should be in openmp-utils.c +omp_set_num_threads_linter = function(c_file) { + # strip comments, we only care if the function appears in actual code. + processed_lines = system2("gcc", c("-fpreprocessed", "-E", c_file), stdout=TRUE, stderr=FALSE) + idx = grep("omp_set_num_threads", processed_lines, fixed = TRUE) + if (!length(idx)) return() + stop(sprintf( + "In %s, found omp_set_num_threads() usage, which could affect other packages and base R:\n%s", + c_file, paste0(" ", format(idx), ":", processed_lines[idx], collapse = "\n") + )) +} diff --git a/.ci/publish.R b/.ci/publish.R index 7d43a44e6..a07a4ce90 100644 --- a/.ci/publish.R +++ b/.ci/publish.R @@ -163,7 +163,7 @@ r.ver <- function(x) { v = tmp[3L] if (identical(v, "rel")) "r-release" else if (identical(v, "dev")) "r-devel" - else if (identical(v, "old")) "r-oldrel" + else if (identical(v, "old") || identical(v, "ancient")) "r-oldrel" else { if (grepl("\\D", v)) stop("third word in test job name must be rel/dev/old or numbers of R version") paste0("r-", paste(strsplit(v, "")[[1L]], collapse=".")) diff --git a/.dev/CRAN_Release.cmd b/.dev/CRAN_Release.cmd index 385e4ef88..6666f709f 100644 --- a/.dev/CRAN_Release.cmd +++ b/.dev/CRAN_Release.cmd @@ -54,9 +54,6 @@ grep -RI --exclude-dir=".git" --exclude="*.md" --exclude="*~" --color='auto' -P # Unicode is now ok. This unicode in tests.Rraw is passing on CRAN. grep -RI --exclude-dir=".git" --exclude="*.md" --exclude="*~" --color='auto' -n "[\]u[0-9]" ./ -# Ensure no calls to omp_set_num_threads() [to avoid affecting other packages and base R] -# Only comments referring to it should be in openmp-utils.c -grep omp_set_num_threads ./src/* # Ensure no calls to omp_set_nested() as i) it's hard to fully honor OMP_THREAD_LIMIT as required by CRAN, and # ii) a simpler non-nested approach is always preferable if possible, as has been the case so far diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 211cb063a..706ab2933 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -114,7 +114,7 @@ and then the `commit` and `push` shown above would push to the branch in the mai #### Translations -`data.table` offers some translations so that our users can get feedback (errors, warnings, verbose messages) in their native language. Currently we only support Mandarin Chinese. +`data.table` offers some translations so that our users can get feedback (errors, warnings, verbose messages) in their native language. There is a [translation team](https://github.com/orgs/Rdatatable/teams/translators/teams) for each currently supported translation. The data for these translations lives in the `po` folder. You do not need to make any changes here for your PR -- translations are updated in a batch before each CRAN release. diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index f9066eacd..960315e78 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -40,9 +40,10 @@ jobs: - uses: r-lib/actions/setup-r@v2 - name: Lint run: | - for (f in list.files('.ci/linters/c', full.names=TRUE)) source(f) - for (f in list.files('src', pattern='[.]c$', full.names=TRUE)) { - alloc_linter(f) + linter_env = new.env() + for (f in list.files('.ci/linters/c', full.names=TRUE)) sys.source(f, linter_env) + for (f in list.files('src', pattern='[.][ch]$', full.names=TRUE)) { + for (linter in ls(linter_env)) linter_env[[linter]](f) # TODO(#6272): Incorporate more checks from CRAN_Release } shell: Rscript {0} diff --git a/.github/workflows/rchk.yaml b/.github/workflows/rchk.yaml new file mode 100644 index 000000000..c799df028 --- /dev/null +++ b/.github/workflows/rchk.yaml @@ -0,0 +1,59 @@ +# adapted from a similar check run by {arrow} +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +on: + push: + branches: + - master + pull_request: + +name: 'rchk' + +jobs: + rchk: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: r-lib/actions/setup-r@v2 + with: + r-version: 'devel' + - name: Build + run: | + ls -l | cat + R CMD build --no-build-vignettes . + mkdir packages + mv data.table_*.tar.gz packages + - name: Run rchk + run: | + docker run -v `pwd`/packages:/rchk/packages kalibera/rchk:latest /rchk/packages/data.table_*.tar.gz |& tee rchk.out + - name: Confirm that rchk has no errors + # Suspicious call, [UP], and [PB] are all of the error types currently at + # https://github.com/kalibera/cran-checks/tree/HEAD/rchk/results + # though this might not be exhaustive, there does not appear to be a way to have rchk return an error code + # CRAN also will remove some of the outputs (especially those related to Rcpp and strptime, e.g. + # ERROR: too many states (abstraction error?)) + # https://github.com/kalibera/rchk + run: | + cat rchk.out + if [ $(grep -Fc "Suspicious call" rchk.out) -gt 0 ] || [ $(grep -F "[UP]" rchk.out | grep -Fvc "results will be incomplete") -gt 0 ] || [ $(grep -Fc "[PB]" rchk.out) -gt 0 ]; then + echo "Found rchk errors" + exit 1 + fi + if: always() + - name: Dump rchk output logs + run: cat rchk.out + if: always() diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 77ecc5280..83def117b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -203,10 +203,9 @@ test-lin-dev-clang-cran: - >- Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 3 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (size of tarball, installed package size, non-API calls) but ", shQuote(l)) else q("no")' -## R 3.1.0 # stated dependency on R -test-lin-310-cran: - image: registry.gitlab.com/jangorecki/dockerfiles/r-3.1.0 +test-lin-ancient-cran: + image: registry.gitlab.com/jangorecki/dockerfiles/r-3.3.0 <<: *test-lin script: - *install-deps @@ -301,7 +300,7 @@ integration: - saas-linux-medium-amd64 only: - master - needs: ["mirror-packages","build","test-lin-rel","test-lin-rel-cran","test-lin-dev-gcc-strict-cran","test-lin-dev-clang-cran","test-lin-rel-vanilla","test-lin-310-cran","test-win-rel","test-win-dev" ,"test-win-old"] + needs: ["mirror-packages","build","test-lin-rel","test-lin-rel-cran","test-lin-dev-gcc-strict-cran","test-lin-dev-clang-cran","test-lin-rel-vanilla","test-lin-ancient-cran","test-win-rel","test-win-dev" ,"test-win-old"] script: - R --version - *install-deps ## markdown pkg not present in r-pkgdown image diff --git a/DESCRIPTION b/DESCRIPTION index b78495edc..9fba4f71a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -86,5 +86,7 @@ Authors@R: c( person("Dmitry", "Shemetov", role="ctb"), person("Nitish", "Jha", role="ctb"), person("Joshua", "Wu", role="ctb"), - person("Iago", "Giné-Vázquez", role="ctb") + person("Iago", "Giné-Vázquez", role="ctb"), + person("Anirban", "Chetia", role="ctb"), + person("Doris", "Amoakohene", role="ctb") ) diff --git a/NEWS.md b/NEWS.md index d104981f6..29047f1a1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,21 @@ ## NEW FEATURES -1. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. +1. We continue to use user feedback to prioritize development. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) for the current list of most-requested issues. In this release we add five highly-requested features: + + a. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works to update all columns, [#795](https://github.com/Rdatatable/data.table/issues/795). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. + + b. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). Thanks @mattdowle for the original issue, 50 or more others for expressing support, and @MichaelChirico for the fix. + + c. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#4258](https://github.com/Rdatatable/data.table/issues/4258). Thanks @sindribaldur for the feature request, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. + + d. `[.data.table` gains argument `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). The progress bar reports information such as the number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply to `GForce` optimized operations. Thanks to @eatonya and @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. + + e. `rbindlist(l, use.names=TRUE)` and `rbind` now work correctly on columns with different class attributes across the inputs for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with matched columns of similar classes, e.g., `rbind(data.table(d = Sys.Date()), data.table(d = as.IDate(Sys.Date()-1)))`. The conversion is done automatically and the class attribute of the final column is determined by the first class attribute encountered in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). + + `rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` (default `FALSE`) to manually deactivate the safety net preventing binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. + +2. `print.data.table()` shows empty (`NULL`) list column entries as `[NULL]` for emphasis. Previously they would just print nothing (same as for empty string). Part of [#4198](https://github.com/Rdatatable/data.table/issues/4198). Thanks @sritchie73 for the proposal and fix. ```R data.table(a=list(NULL, "")) @@ -20,41 +34,31 @@ # 2: ``` -2. `cedta()` now returns `FALSE` if `.datatable.aware = FALSE` is set in the calling environment, [#5654](https://github.com/Rdatatable/data.table/issues/5654). Thanks @dvg-p4 for the request and PR. +3. `cedta()` now returns `FALSE` if `.datatable.aware = FALSE` is set in the calling environment, [#5654](https://github.com/Rdatatable/data.table/issues/5654). Thanks @dvg-p4 for the request and PR. -3. `split.data.table` also accepts a formula for `f`, [#5392](https://github.com/Rdatatable/data.table/issues/5392), mirroring the same in `base::split.data.frame` since R 4.1.0 (May 2021). Thanks to @XiangyunHuang for the request, and @ben-schwen for the PR. - -4. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. - -5. `transpose` gains `list.cols=` argument, [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoids type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and Benjamin Schwendinger for the PR. - -6. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. - -7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Using `fill=Inf` reads the full file for estimating the number of columns [#2727](https://github.com/Rdatatable/data.table/issues/2727). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. - -8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. +4. The `split()` method for `data.table`s is more consistent with that for base methods: -9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. + a. `f` can be a formula, [#5392](https://github.com/Rdatatable/data.table/issues/5392), mirroring the same in `base::split.data.frame` since R 4.1.0 (May 2021). Thanks to @XiangyunHuang for the request, and @ben-schwen for the PR. -10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. + b. `sep=` is recognized when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). Thanks @MichaelChirico for the request and PR. -11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). +5. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. -12. `setDT` is faster for data with many columns, thanks @MichaelChirico for reporting and fixing the issue, [#5426](https://github.com/Rdatatable/data.table/issues/5426). +6. `transpose` gains `list.cols=` argument (default `FALSE`), [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoid type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and @ben-schwen for the PR. -13. `dcast` gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR. +7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values -- an upper bound on the number of columns in the file. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2691](https://github.com/Rdatatable/data.table/issues/2691), [#4130](https://github.com/Rdatatable/data.table/issues/4130), [#3436](https://github.com/Rdatatable/data.table/issues/3436), [#1812](https://github.com/Rdatatable/data.table/issues/1812) and [#5378](https://github.com/Rdatatable/data.table/issues/5378). The suggestion for `fill` to allow a manual estimate of the number of columns instead comes from [#2727](https://github.com/Rdatatable/data.table/issues/2727). Using `fill=Inf` reads the full file for estimating the number of columns. Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and @ben-schwen for the PR. -14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. +8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. -15. `rbindlist(l, use.names=TRUE)` and `rbind` now works correctly on columns with different class attributes for certain classes such as `Date`, `IDate`, `ITime`, `POSIXct` and `AsIs` with other columns of similar classes, e.g., `IDate` and `Date`. The conversion is done automatically and the class attribute of the final column is determined by the first encountered class attribute in the binding list, [#5309](https://github.com/Rdatatable/data.table/issues/5309), [#4934](https://github.com/Rdatatable/data.table/issues/4934), [#5391](https://github.com/Rdatatable/data.table/issues/5391). +9. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. -`rbindlist(l, ignore.attr=TRUE)` and `rbind` also gains argument `ignore.attr` to manually deactivate the safety-net of binding columns with different column classes, [#3911](https://github.com/Rdatatable/data.table/issues/3911), [#5542](https://github.com/Rdatatable/data.table/issues/5542). Thanks to @dcaseykc, @fox34, @adrian-quintario, @berg-michael, @arunsrinivasan, @statquant, @pkress, @jrausch12, @therosko, @OfekShilon, @iMissile, @tdhock for the request and @ben-schwen for the PR. +10. `setDT` is faster for data with many columns, thanks @MichaelChirico for reporting and fixing the issue, [#5426](https://github.com/Rdatatable/data.table/issues/5426). -16. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks @sindribaldur for the feature request, which has been highly requested, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. +11. `dcast` gains `value.var.in.dots`, `value.var.in.LHSdots` and `value.var.in.RHSdots` arguments, [#5824](https://github.com/Rdatatable/data.table/issues/5824). This allows the `value.var` variable(s) in `dcast` to be represented by `...` in the formula (if not otherwise mentioned). Thanks to @iago-pssjd for the report and PR. -17. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. +12. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and @ben-schwen for the PR. -18. New `setdroplevels()` as a by-reference version of the `droplevels()` method, which returns a copy of its input, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Thanks @MichaelChirico for the suggestion and implementation. +13. New `setdroplevels()` as a by-reference version of the `droplevels()` method, which returns a copy of its input, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Thanks @MichaelChirico for the suggestion and implementation. ## BUG FIXES @@ -62,35 +66,33 @@ 2. `dcast` handles coercion of `fill` to `integer64` correctly, [#4561](https://github.com/Rdatatable/data.table/issues/4561). Thanks to @emallickhossain for the bug report and @MichaelChirico for the fix. -3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). Thanks to @renkun-ken for the report and Benjamin Schwendinger for the fix. +3. Optimized `shift` per group produced wrong results when simultaneously subsetting, for example, `DT[i==1L, shift(x), by=group]`, [#5962](https://github.com/Rdatatable/data.table/issues/5962). Thanks to @renkun-ken for the report and @ben-schwen for the fix. -4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings (for example, when fun.aggregate=min or max, warning was NAs introduced by coercion to integer range) which were potentially confusing, [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to Toby Dylan Hocking for the fix. +4. `dcast(fill=NULL)` only computes default fill value if necessary, which eliminates some previous warnings which were potentially confusing (for example, when `fun.aggregate=min` or `max`, warning was "NAs introduced by coercion to integer range"), [#5512](https://github.com/Rdatatable/data.table/issues/5512), [#5390](https://github.com/Rdatatable/data.table/issues/5390). Thanks to @tdhock for the report and fix. 5. `fwrite(x, row.names=TRUE)` with `x` a `matrix` writes `row.names` when present, not row numbers, [#5315](https://github.com/Rdatatable/data.table/issues/5315). Thanks to @Liripo for the report, and @ben-schwen for the fix. 6. `patterns()` helper for `.SDcols` now accepts arguments `ignore.case`, `perl`, `fixed`, and `useBytes`, which are passed to `grep`, #5387. Thanks to @iago-pssjd for the feature request, and @tdhock for the implementation. -7. `melt` returns an integer column for `variable` when `measure.vars` is a list of length=1, consistent with the documented behavior, [#5209](https://github.com/Rdatatable/data.table/issues/5209). Thanks to @tdhock for reporting and fixing. Any users who were relying on this behavior can change `measure.vars=list("col_name")` (output `variable` was column name, now is column index/integer) to `measure.vars="col_name"` (`variable` still is column name). - -8. Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. +7. Adding a list column to an empty `data.table` works consistently with other column types, [#5738](https://github.com/Rdatatable/data.table/issues/5738). Thanks to Benjamin Schwendinger for the report and the fix. -9. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. +8. In `DT[,j,by]`, `by` retains its attributes (e.g. class) when `j` is GForce optimized, [#5567](https://github.com/Rdatatable/data.table/issues/5567). Thanks to @danwwilson for the report, and @ben-schwen for the PR. -10. `dt[,,by=año]` (i.e., using a column name containing a non-ASCII character in `by` as a plain symbol) no longer errors with "object 'año' not found", #4708. Thanks @pfv07 for the report, and Michael Chirico for the fix. +9. `dt[,,by=año]` (i.e., using a column name containing a non-ASCII character in `by` as a plain symbol) no longer errors with "object 'año' not found", #4708. Thanks @pfv07 for the report, and @MichaelChirico for the fix. -11. Fix some memory management issues in the C routines backing `melt()`, `froll()`, and GForce `mean()`, as identified by `rchk`. Thanks Tomas Kalibera and the CRAN team for setting up the `rchk` system, and @MichaelChirico for the fix. +10. Fixed some memory management issues in the C routines backing `melt()`, `froll()`, and GForce `mean()`, as identified by `rchk`. Thanks Tomas Kalibera and the CRAN team for setting up the `rchk` system, and @MichaelChirico for the fix. -12. data.table's `all.equal()` method now dispatches to each column's own `all.equal()` method as appropriate, [#4543](https://github.com/Rdatatable/data.table/issues/4543). Thanks @MichaelChirico for the report and fix. Note that this had two noteworthy changes to data.table's own test suite that might affect you: (1) comparisons of POSIXct columns compare absolute, not relative differences, meaning that millisecond-scale differences might trigger a "not equal" report that was hidden before; and (2) comparisons of integer64 columns could be totally wrong since they were being compared on the basis of their representation as doubles, not long integers. The former might be a matter of preference requiring you to specify a different `tolerance=`, while the latter was clearly a bug. +11. data.table's `all.equal()` method now dispatches to each column's own `all.equal()` method as appropriate, [#4543](https://github.com/Rdatatable/data.table/issues/4543). Thanks @MichaelChirico for the report and fix. Note that this had two noteworthy changes to data.table's own test suite that might affect you: (1) comparisons of POSIXct columns compare absolute, not relative differences, meaning that millisecond-scale differences might trigger a "not equal" report that was hidden before; and (2) comparisons of integer64 columns could be totally wrong since they were being compared on the basis of their representation as doubles, not long integers. The former might be a matter of preference requiring you to specify a different `tolerance=`, while the latter was clearly a bug. -13. `rbindlist` and `shift` could lead to a protection stack overflow when applied to a list containing many nested lists exceeding the pointer protection stack size, [#4536](https://github.com/Rdatatable/data.table/issues/4536). Thanks to @ProfFancyPants for reporting, and Benjamin Schwendinger (`rbindlist`) and @MichaelChirico (`shift`) for the fix. +12. `rbindlist` and `shift` could lead to a protection stack overflow when applied to a list containing many nested lists exceeding the pointer protection stack size, [#4536](https://github.com/Rdatatable/data.table/issues/4536). Thanks to @ProfFancyPants for reporting, and Benjamin Schwendinger (`rbindlist`) and @MichaelChirico (`shift`) for the fix. -14. `fread(x, colClasses="POSIXct")` now also works for columns containing only NA values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and Benjamin Schwendinger for the fix. +13. `fread(x, colClasses="POSIXct")` now also works for columns containing only `NA` values, [#6208](https://github.com/Rdatatable/data.table/issues/6208). Thanks to @markus-schaffer for the report, and @ben-schwen for the fix. -15. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. +14. `fread()` is more careful about detecting that a file is compressed in bzip2 format, [#6304](https://github.com/Rdatatable/data.table/issues/6304). In particular, we also check the 4th byte of the file is a digit; in rare cases, a legitimate uncompressed CSV file could match 'BZh' as the first 3 bytes. We think an uncompressed CSV file matching 'BZh[1-9]' is all the more rare and unlikely to be encountered in "real" examples. Other formats (zip, gzip) are friendly enough to use non-printable characters in their magic numbers. Thanks @grainnemcguire for the report and @MichaelChirico for the fix. -16. Selecting keyed list columns will retain key without a performance penalty, closes [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report. +15. Selecting the key column like `DT[, .(key1, key2)]` will retain the key without a performance penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @user9439449 on StackOverflow for the report and @MichaelChirico for the fix. -17. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. +16. Passing functions programmatically with `env=` doesn't produce an opaque error, e.g. `DT[, f(b), env = list(f=sum)]`, [#6026](https://github.com/Rdatatable/data.table/issues/6026). Note that it's much better to pass functions like `f="sum"` instead. Thanks to @MichaelChirico for the bug report and fix. ## NOTES @@ -104,85 +106,92 @@ 5. Input files are now kept open during `mmap()` when running under Emscripten, [emscripten-core/emscripten#20459](https://github.com/emscripten-core/emscripten/issues/20459). This avoids an error in `fread()` when running in WebAssembly, [#5969](https://github.com/Rdatatable/data.table/issues/5969). Thanks to @maek-ies for the report and @georgestagg for the PR. -6. `dcast()` message about `fun.aggregate` defaulting to `length()` when aggregation is necessary, which could be confusing if duplicates were unexpected, does better explaining the behavior and suggesting alternatives, [#5217](https://github.com/Rdatatable/data.table/issues/5217). Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. +6. `dcast()` improves behavior for the situation that the `fun.aggregate` value of `length()` is used but not provided by the user. -7. Updated a test relying on `>` working for comparing language objects to a string, which will be deprecated by R, [#5977](https://github.com/Rdatatable/data.table/issues/5977); no user-facing effect. Thanks to R-core for continuously improving the language. + a. This now triggers a warning, not a message, since relying on this default often signals unexpected duplicates in the data, [#5386](https://github.com/Rdatatable/data.table/issues/5386). The warning is classed as `dt_missing_fun_aggregate_warning`, allowing for more targeted handling in user code. Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. -8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the uxpected setup. + b. The warning itself does better explaining the behavior and suggesting alternatives, [#5217](https://github.com/Rdatatable/data.table/issues/5217). Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. -9. `print.data.table` now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. +7. Updated a test relying on operator `>` working for comparing language objects to a string, which will be deprecated by R, [#5977](https://github.com/Rdatatable/data.table/issues/5977); no user-facing effect. Thanks to R-core for continuously improving the language. -10. `test.data.table()` runs robustly: - + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. - + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix. +8. OpenMP detection when building from source on Mac is improved, [#4348](https://github.com/Rdatatable/data.table/issues/4348). Thanks @jameshester and @kevinushey for the request and @kevinushey for the PR, @jameslamb for the advice and @s-u of R-core for ensuring CRAN machines are configured to support the expected setup. + +9. `print.data.table`: -11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. + a. Now handles combination multibyte characters correctly when truncating wide string entries, [#5096](https://github.com/Rdatatable/data.table/issues/5096). Thanks to @MichaelChirico for the report and @joshhwuu for the fix. -12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix. + b. Prints the indicator `---` in every value column when truncation is needed and `row.names = FALSE` instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. -13. `test.data.table()` now initialises the numeric rounding value to 0 using `setNumericRounding(0)` to avoid failed tests if the user has set a different value, [#6082](https://github.com/Rdatatable/data.table/issues/6082). The user's value is restored on exit. Thanks to @MichaelChirico for the report and for describing the solution, and @markseeto for implementing. + c. Honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix. -14. `setNumericRounding()` now invisibly returns the old rounding value instead of `NULL`, which is now consistent with similar behavior by `setwd()`, `options()`, etc. Thanks @MichaelChirico for the report and @joshhwuu for the fix. + d. Gains new argument `show.indices` and option `datatable.show.indices` that allows the user to print a `data.table`'s indices as columns without having to modify the `data.table` itself. Thanks @MichaelChirico for the report and @joshhwuu for the PR. -15. `dcast()` now issues a warning when `fun.aggregate` is used but not provided by the user. `fun.aggregate` defaults to `length` in this case. Previously, only a message was issued. However, relying on this default often signals unexpected duplicates in the data. Therefore, a stricter class of signal was deemed more appropriate, [#5386](https://github.com/Rdatatable/data.table/issues/5386). The warning is classed as `dt_missing_fun_aggregate_warning`, allowing for more targeted handling in user code. Thanks @MichaelChirico for the suggestion and @Nj221102 for the fix. + e. Displays `integer64` columns well even if {bit64} has not yet been loaded, [#6224](https://github.com/Rdatatable/data.table/issues/6224). Thanks @renkun-ken for the report and @MichaelChirico for the fix. -16. `print.data.table` gains new argument `show.indices` and option `datatable.show.indices` that allows the user to print a `data.table`'s indices as columns without having to modify the `data.table` itself. Thanks @MichaelChirico for the report and @joshhwuu for the PR. +10. `test.data.table()` runs robustly: + + In sessions where the `digits` or `warn` options are not their defaults (`7` and `0`, respectively), [#5285](https://github.com/Rdatatable/data.table/issues/5285). Thanks @OfekShilon for the report and suggested fix and @MichaelChirico for the PR. + + In locales where `letters != sort(letters)`, e.g. Latvian, [#3502](https://github.com/Rdatatable/data.table/issues/3502). Thanks @minemR for the report and @MichaelChirico for the fix. + + Initialises the numeric rounding value to 0 using `setNumericRounding(0)` to avoid failed tests if the user has set a different value, [#6082](https://github.com/Rdatatable/data.table/issues/6082). The user's value is restored on exit. Thanks to @MichaelChirico for the report and for describing the solution, and @markseeto for implementing. To enable this, `setNumericRounding()` now invisibly returns the old rounding value instead of `NULL`, which is consistent with similar behavior by `setwd()`, `options()`, etc. Thanks @MichaelChirico for the report and @joshhwuu for the fix. -17. The `measure` and `patterns` functions are now exported for use within `[` and `melt()` to ensure consistency with other non-standard evaluation (NSE) exports like `.N` and `:=`. This change addresses [#5604](https://github.com/Rdatatable/data.table/issues/5604), allowing package developers to import these names and avoid `R CMD check` `NOTE`s about undefined variables. Thanks to @MichaelChirico and @ylelkes for their suggestions, and to @Nj221102 for the implementation. +11. The `measure` and `patterns` functions are now exported for use within `[` and `melt()` to ensure consistency with other non-standard evaluation (NSE) exports like `.N` and `:=`. This change addresses [#5604](https://github.com/Rdatatable/data.table/issues/5604), allowing package developers to import these names and avoid `R CMD check` `NOTE`s about undefined variables. Thanks to @MichaelChirico and @ylelkes for their suggestions, and to @Nj221102 for the implementation. We plan to export similar placeholders for `.` and `J` in roughly one year (e.g. data.table 1.18.0), but excluded them from this release to avoid back-compatibility issues. Specifically, some packages doing `import(plyr)` _and_ `import(data.table)`, and/or with those packages in `Depends`, will error when data.table starts exporting `.` (and similarly for a potential conflict with `rJava::J()`). We discourage using data.table (or any package, really) in Depends; blanket `import()` of package is also generally best avoided. See `vignette("datatable-importing")`. -18. `integer64` columns print well even if {bit64} has not yet been loaded, [#6224](https://github.com/Rdatatable/data.table/issues/6224). Thanks @renkun-ken for the report and @MichaelChirico for the fix. +12. `fwrite()` header names are no longer quoted automatically when `na` argument is given, [#2964](https://github.com/Rdatatable/data.table/issues/2964). Thanks @jangorecki for the report and @joshhwuu for the fix. -19. `fwrite()` header names are no longer quoted automatically when `na` argument is given, [#2964](https://github.com/Rdatatable/data.table/issues/2964). Thanks @jangorecki for the report and @joshhwuu for the fix. +13. Removed a warning about the now totally-obsolete option `datatable.CJ.names`, as discussed in previous releases. -20. Removed a warning about the now totally-obsolete option `datatable.CJ.names`, as discussed in previous releases. +14. Refactored some non-API calls in the package C code, [#6180](https://github.com/Rdatatable/data.table/issues/6180). There should be no user-visible change. Thanks to various R users, R core, and especially Luke Tierney for pushing to have a clearer definition of "API" for R and for offering clear documentation and suggested workarounds. Thanks @MichaelChirico and @TysonStanley for implementing changes for this release; more will follow. -21. Refactored some non-API calls in the package C code, (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users, R core, and especially Luke Tierney for pushing to have a clearer definition of "API" for R and for offering clear documentation and suggested workarounds. Thanks @MichaelChirico and @TysonStanley for implementing changes for this release; more will follow. +15. C code is more unified in how failures to allocate memory (`malloc()`/`calloc()`) are handled, [#1115](https://github.com/Rdatatable/data.table/issues/1115). No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. -22. C code was unified more in how failures to allocate memory (`malloc()`/`calloc()`) are handled, (#1115)[https://github.com/Rdatatable/data.table/issues/1115]. No OOM issues were reported, as these regions of code typically request relatively small blocks of memory, but it is good to handle memory pressure consistently. Thanks @elfring for the report and @MichaelChirico for the clean-up effort and future-proofing linter. +16. The internal routine for finding sort order (`forder`) will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. -22. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. + Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. -Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. + Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown. -Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown. + ```r + d2 = data.table(id=2:1, v2=1:2) + setindexv(d2, "id") + str(attr(attr(d2, "index"), "__id")) + # int [1:2] 2 1 + # - attr(*, "starts")= int [1:2] 1 2 + # - attr(*, "maxgrpn")= int 1 + # - attr(*, "anyna")= int 0 + # - attr(*, "anyinfnan")= int 0 + # - attr(*, "anynotascii")= int 0 + # - attr(*, "anynotutf8")= int 0 -```r -d2 = data.table(id=2:1, v2=1:2) -setindexv(d2, "id") -str(attr(attr(d2, "index"), "__id")) -# int [1:2] 2 1 -# - attr(*, "starts")= int [1:2] 1 2 -# - attr(*, "maxgrpn")= int 1 -# - attr(*, "anyna")= int 0 -# - attr(*, "anyinfnan")= int 0 -# - attr(*, "anynotascii")= int 0 -# - attr(*, "anynotutf8")= int 0 + d2 = data.table(id=2:1, v2=1:2) + invisible(d2[id==1L]) + str(attr(attr(d2, "index"), "__id")) + # int [1:2] 2 1 + ``` + + This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code. + + ```r + d1 = data.table(id=1:2, v1=1:2) + d2 = data.table(id=2:1, v2=1:2) + setindexv(d2, "id") + d1[d2, on="id", verbose=TRUE] + #... + #Starting bmerge ... + #forderReuseSorting: using existing index: __id + #forderReuseSorting: opt=2, took 0.000s + #... + ``` -d2 = data.table(id=2:1, v2=1:2) -invisible(d2[id==1L]) -str(attr(attr(d2, "index"), "__id")) -# int [1:2] 2 1 -``` + This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. -This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code. +17. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. -```r -d1 = data.table(id=1:2, v1=1:2) -d2 = data.table(id=2:1, v2=1:2) -setindexv(d2, "id") -d1[d2, on="id", verbose=TRUE] -#... -#Starting bmerge ... -#forderReuseSorting: using existing index: __id -#forderReuseSorting: opt=2, took 0.000s -#... -``` +18. The internal `init()` function in `fread.c` module has been marked as `static`, [#6328](https://github.com/Rdatatable/data.table/pull/6328). This is to avoid name collisions, and the resulting segfaults, with other libraries that might expose the same symbol name, and be already loaded by the R process. This was observed in Cray HPE environments where the `libsci` library providing LAPACK to R already has an `init` symbol. Thanks to @rtobar for the report and fix. -This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. +19. `?melt` has long documented that the returned `variable` column should contain integer column indices when `measure.vars` is a list, but when the list length is 1, `variable` is actually a character column name, which is inconsistent with the documentation, [#5209](https://github.com/Rdatatable/data.table/issues/5209). To increase consistency in the next release, we plan to change `variable` to integer, so users who were relying on this behavior should change `measure.vars=list("col_name")` (output `variable` is column name, will be column index/integer) to `measure.vars="col_name"` (`variable` is column name before and after the planned change). For now, relying on this undocumented behavior throws a new warning. -23. `set()` now adds new columns even if no rows are updated, [#5409](https://github.com/Rdatatable/data.table/issues/5409). This behavior is now consistent with `:=`, thanks to @mb706 for the report and @joshhwuu for the fix. +20. `dcast()` docs have always required `fun.aggregate` to return a single value, and when `fill=NULL`, `dcast` would indeed error if vector with `length!=1` was returned, but silently return an undefined result when fill is not `NULL`. Now `dcast` will additionally warn that this is undefined behavior, when fill is not `NULL`, [#6032](https://github.com/Rdatatable/data.table/issues/6032). In particular, this will warn for `fun.aggregate=identity`, which was observed in several revdeps. We may change this to an error in a future release, so revdeps should fix their code as soon as possible. Thanks to Toby Dylan Hocking for the PR, and Michael Chirico for analysis of GitHub revdeps. ## TRANSLATIONS @@ -342,7 +351,7 @@ This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/43 23. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. -25. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. Also thanks to Manuel López-Ibáñez for testing dev and mentioning needed documentation before release. +24. `setcolorder()` gains `before=` and `after=`, [#4385](https://github.com/Rdatatable/data.table/issues/4358). Thanks to Matthias Gomolka for the request, and both Benjamin Schwendinger and Xianghui Dong for implementing. Also thanks to Manuel López-Ibáñez for testing dev and mentioning needed documentation before release. 25. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, Boniface Kamgang and Martin Binder for testing, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. diff --git a/R/fcast.R b/R/fcast.R index 9675f0170..71360e135 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -191,7 +191,14 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., if (run_agg_funs) { fun.call = aggregate_funs(fun.call, lvals, sep, ...) maybe_err = function(list.of.columns) { - if (any(lengths(list.of.columns) != 1L)) stopf("Aggregating function(s) should take vector inputs and return a single value (length=1). However, function(s) returns length!=1. This value will have to be used to fill any missing combinations, and therefore must be length=1. Either override by setting the 'fill' argument explicitly or modify your function to handle this case appropriately.") + if (!all(lengths(list.of.columns) == 1L)) { + msg <- gettext("Aggregating function(s) should take a vector as input and return a single value (length=1), but they do not, so the result is undefined. Please fix by modifying your function so that a single value is always returned.") + if (is.null(fill)) { # TODO change to always stopf #6329 + stop(msg, domain=NA, call. = FALSE) + } else { + warning(msg, domain=NA, call. = FALSE) + } + } list.of.columns } dat = dat[, maybe_err(eval(fun.call)), by=c(varnames)] diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b3447b04e..7d49411c6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3046,8 +3046,8 @@ test(1006, print(as.data.table(M), nrows=10), output="gear NA.*1: 21.0") # rbinding factor with non-factor/character DT1 <- data.table(x=1:5, y=factor("a")) DT2 <- data.table(x=1:5, y=2) -test(1007, rbindlist(list(DT1, DT2)), data.table(x = c(1:5, 1:5), y = factor(c(rep('a', 5), rep('2', 5)), levels = c('a', '2')))) -test(1008, rbindlist(list(DT2, DT1)), data.table(x = c(1:5, 1:5), y = factor(c(rep('2', 5), rep('a', 5))))) +test(1007.1, rbindlist(list(DT1, DT2)), data.table(x = c(1:5, 1:5), y = factor(c(rep('a', 5), rep('2', 5)), levels = c('a', '2')))) +test(1007.2, rbindlist(list(DT2, DT1)), data.table(x = c(1:5, 1:5), y = factor(c(rep('2', 5), rep('a', 5))))) # rbindlist different types DT1 <- data.table(a = 1L, b = 2L) @@ -3704,7 +3704,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # bug git #693 - dcast error message improvement: DT = data.table(x=c(1,1), y=c(2,2), z = 3:4) - test(1102.19, dcast(DT, x ~ y, value.var="z", fun.aggregate=identity), error="should take vector inputs and return a single value") + test(1102.19, dcast(DT, x ~ y, value.var="z", fun.aggregate=identity), error="should take a vector as input and return a single value") # bug #688 - preserving attributes DT = data.table(id = c(1,1,2,2), ty = c("a","b","a","b"), da = as.Date("2014-06-20")) @@ -3727,7 +3727,7 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # issues/715 DT = data.table(id=rep(1:2, c(3,2)), k=c(letters[1:3], letters[1:2]), v=1:5) - test(1102.25, dcast(DT, id ~ k, fun.aggregate=last, value.var="v"), error="should take vector inputs and return a single value") + test(1102.25, dcast(DT, id ~ k, fun.aggregate=last, value.var="v"), error="should take a vector as input and return a single value") test(1102.26, dcast(DT, id ~ k, fun.aggregate=last, value.var="v", fill=NA_integer_), data.table(id=1:2, a=c(1L, 4L), b=c(2L,5L), c=c(3L,NA_integer_), key="id")) # Fix for #893 @@ -17257,13 +17257,13 @@ exid = data.table(id=1, expected) test(2182.3, melt(DTid, measure.vars=list(a=c(NA,1), b=2:3), id.vars="id"), exid) test(2182.4, melt(DTid, measure.vars=list(a=c(NA,"a2"), b=c("b1","b2")), id.vars="id"), exid) test(2182.5, melt(DT.wide, measure.vars=list(a=c(NA,1), b=2:3), na.rm=TRUE), data.table(variable=factor(2), a=2, b=2)) -test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("1","2")), b=c(1,2))) # measure.vars named list length=1, #5065 +test(2182.6, melt(DT.wide, measure.vars=list(b=c("b1","b2"))), data.table(a2=2, variable=factor(c("b1","b2")), b=c(1,2)), warning="measure.vars is a list with length=1") # measure.vars named list length=1, #5065 # consistency between measure.vars=list with length=1 and length>1, #5209 -test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor(1), value=2)) +test(2182.71, melt(DT.wide, measure.vars=list("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2), warning="measure.vars is a list with length=1") test(2182.72, melt(DT.wide, measure.vars=c("a2"), variable.factor=TRUE), data.table(b1=1, b2=2, variable=factor("a2"), value=2)) -test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="1", value=2)) +test(2182.73, melt(DT.wide, measure.vars=list("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2), warning="measure.vars is a list with length=1") test(2182.74, melt(DT.wide, measure.vars=c("a2"), variable.factor=FALSE), data.table(b1=1, b2=2, variable="a2", value=2)) -test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="1", n=10))#thanks @mnazarov +test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.factor=FALSE), data.table(b=20, variable="a", n=10), warning="measure.vars is a list with length=1")#thanks @mnazarov ### First block testing measurev # new variable_table attribute for measure.vars, PR#4731 for multiple issues diff --git a/po/R-pt_BR.po b/po/R-pt_BR.po index b0db1159c..4fb95ffb8 100644 --- a/po/R-pt_BR.po +++ b/po/R-pt_BR.po @@ -2771,10 +2771,10 @@ msgid "" "**********" msgstr "" "**********\n" -"Executando data.table em inglês; o suporte ao pacote está disponível apenas " +"Executando data.table em português; o suporte ao pacote está disponível apenas " "em inglês. Ao procurar ajuda online, certifique-se de verificar também a " "mensagem de erro em inglês. Isso pode ser obtido examinando os arquivos po/R-" -".po e po/.po no código-fonte do pacote, onde as mensagens de " +"pt_BR.po e po/pt_BR.po no código-fonte do pacote, onde as mensagens de " "erro no idioma nativo e em inglês podem ser encontradas lado a lado.\n" "**********" diff --git a/po/pt_BR.po b/po/pt_BR.po index 284be4046..af77d9234 100644 --- a/po/pt_BR.po +++ b/po/pt_BR.po @@ -1,7 +1,7 @@ # # Translators: # Italo Santos , 2024 -# Leonardo Fontenelle , 2024 +# Leonardo Fontenelle , 2024 # Rafael Fontenelle , 2024 # msgid "" @@ -9,7 +9,7 @@ msgstr "" "Project-Id-Version: data.table 1.15.99\n" "POT-Creation-Date: 2024-06-23 12:07-0300\n" "PO-Revision-Date: 2024-06-23 16:40-0300\n" -"Last-Translator: Leonardo Fontenelle \n" +"Last-Translator: Leonardo Fontenelle \n" "Language-Team: Brazilian Portuguese\n" "Language: pt_BR\n" "MIME-Version: 1.0\n" diff --git a/src/fmelt.c b/src/fmelt.c index 51f4fbbb8..77a48f6a4 100644 --- a/src/fmelt.c +++ b/src/fmelt.c @@ -595,9 +595,12 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str if (data->lvalues==1 && length(VECTOR_ELT(data->valuecols, 0)) != data->lmax) error(_("Internal error: fmelt.c:getvarcols %d %d"), length(VECTOR_ELT(data->valuecols, 0)), data->lmax); // # nocov if (isNull(data->variable_table)) { + if ((data->lvalues == 1) & data->measure_is_list) { + warning("measure.vars is a list with length=1, which as long documented should return integer indices in the 'variable' column, but currently returns character column names. To increase consistency in the next release, we plan to change 'variable' to integer, so users who were relying on this behavior should change measure.vars=list('col_name') (output variable is column name now, but will become column index/integer) to measure.vars='col_name' (variable is column name before and after the planned change)."); + } if (!varfactor) { SET_VECTOR_ELT(ansvars, 0, target=allocVector(STRSXP, data->totlen)); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list const int *thisvaluecols = INTEGER(VECTOR_ELT(data->valuecols, 0)); for (int j=0, ansloc=0; jlmax; ++j) { const int thislen = data->narm ? length(VECTOR_ELT(data->not_NA_indices, j)) : data->nrow; @@ -616,7 +619,7 @@ SEXP getvarcols(SEXP DT, SEXP dtnames, Rboolean varfactor, Rboolean verbose, str SET_VECTOR_ELT(ansvars, 0, target=allocVector(INTSXP, data->totlen)); SEXP levels; int *td = INTEGER(target); - if (!data->measure_is_list) {//one value column to output. + if (data->lvalues == 1) {//one value column to output. TODO #5247 change to !data->measure_is_list SEXP thisvaluecols = VECTOR_ELT(data->valuecols, 0); int len = length(thisvaluecols); levels = PROTECT(allocVector(STRSXP, len)); protecti++; diff --git a/src/fread.c b/src/fread.c index e301e8cd1..45efa6eeb 100644 --- a/src/fread.c +++ b/src/fread.c @@ -84,7 +84,7 @@ static double NAND; static double INFD; // NAN and INFINITY constants are float, so cast to double once up front. -void init(void) { +static void init(void) { NAND = (double)NAN; INFD = (double)INFINITY; } diff --git a/src/rbindlist.c b/src/rbindlist.c index 68c3166e1..40fcd8fe0 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -244,9 +244,9 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor ncol = length(VECTOR_ELT(l, first)); // ncol was increased as if fill=true, so reduce it back given fill=false (fill==false checked above) } - int nprotect = 0; - SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol)); nprotect++; - SEXP ansNames = PROTECT(allocVector(STRSXP, idcol + ncol)); nprotect++; + int nprotect = 2; + SEXP ans = PROTECT(allocVector(VECSXP, idcol + ncol)); + SEXP ansNames = PROTECT(allocVector(STRSXP, idcol + ncol)); setAttrib(ans, R_NamesSymbol, ansNames); if (idcol) { SET_STRING_ELT(ansNames, 0, STRING_ELT(idcolArg, 0)); @@ -534,15 +534,18 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor if (w==-1 || !length(thisCol=VECTOR_ELT(li, w))) { // !length for zeroCol warning above; #1871 writeNA(target, ansloc, thisnrow, false); // writeNA is integer64 aware and writes INT64_MIN } else { - bool listprotect = false; - if ((TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target)) { - // do an as.list() on the atomic column; #3528 - thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target))); listprotect = true; + bool listprotect = (TYPEOF(target)==VECSXP || TYPEOF(target)==EXPRSXP) && TYPEOF(thisCol)!=TYPEOF(target); + // do an as.list() on the atomic column; #3528 + if (listprotect) { + thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target))); + // else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909) + const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName); + UNPROTECT(1); // earlier unprotect rbindlist calls with lots of lists #4536 + if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret); + } else { + const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName); + if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret); } - // else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909) - const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, 0, -1, idcol+j+1, foundName); - if (listprotect) UNPROTECT(1); // earlier unprotect rbindlist calls with lots of lists #4536 - if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret); // e.g. when precision is lost like assigning 3.4 to integer64; test 2007.2 // TODO: but maxType should handle that and this should never warn } @@ -550,6 +553,6 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor } } } - UNPROTECT(nprotect); // ans, coercedForFactor, thisCol + UNPROTECT(nprotect); // ans, ansNames, coercedForFactor? return(ans); }