diff --git a/.dev/cc.R b/.dev/cc.R index f2031ca48..28f398f16 100644 --- a/.dev/cc.R +++ b/.dev/cc.R @@ -38,6 +38,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) { if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly") return(invisible()) } + suppressWarnings(rm("getRversion", envir=.GlobalEnv)) # clean up from previous cc() because parseNamespaceFile() run getRversion() in NAMESPACE in .GlobalEnv nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports))) for (ii in seq_along(nsParsedImports)) { @@ -51,7 +52,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) { return(invisible()) } -cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) { +cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH", unset="."), CC="gcc", quiet=FALSE) { if (!missing(cc_dir)) { warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead") path = cc_dir diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml new file mode 100644 index 000000000..1358f0538 --- /dev/null +++ b/.github/workflows/R-CMD-check-occasional.yaml @@ -0,0 +1,96 @@ +on: + schedule: + - cron: '18 13 8 * *' # 8th of month at 13:18 UTC + +# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release +name: R-CMD-check-occasional + +jobs: + R-CMD-check-occasional: + runs-on: ${{ matrix.os }} + + name: ${{ matrix.os }} (${{ matrix.r }}) + + strategy: + matrix: + os: [macOS-latest, windows-latest, ubuntu-latest] + r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3'] + locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502) + exclude: + - os: ['macOS-latest', 'windows-latest'] # only run non-English locale CI on Ubuntu + locale: ['zh_CN.utf8', 'lv_LV.utf8'] + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Set locale + if: matrix.locale == 'en_US.utf8' + run: | + sudo locale-gen en_US + echo "LC_ALL=en_US.utf8" >> $GITHUB_ENV + + - name: Set locale + if: matrix.locale == 'zh_CN.utf8' + run: | + sudo locale-gen zh_CN + echo "LC_ALL=zh_CN.utf8" >> $GITHUB_ENV + echo "LANGUAGE=zh_CN" >> $GITHUB_ENV + + - name: Set locale + if: matrix.locale == 'lv_LV.utf8' + run: | + sudo locale-gen lv_LV + echo "LC_ALL=lv_LV.utf8" >> $GITHUB_ENV + echo "LANGUAGE=lv_LV" >> $GITHUB_ENV + + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r }} + + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Restore R package cache + uses: actions/cache@v2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + run: | + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: | + options(crayon.enabled = TRUE) + rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@main + with: + name: ${{ runner.os }}-r${{ matrix.r }}-results + path: check diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml new file mode 100644 index 000000000..2fc3a76f5 --- /dev/null +++ b/.github/workflows/performance-tests.yml @@ -0,0 +1,23 @@ +name: Autocomment atime-based performance regression analysis on PRs + +on: + pull_request: + branches: + - '*' + types: + - opened + - reopened + - synchronize + paths: + - 'R/**' + - 'src/**' + +jobs: + comment: + runs-on: ubuntu-latest + container: ghcr.io/iterative/cml:0-dvc2-base1 + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + repo_token: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: Anirban166/Autocomment-atime-results@v1.1.6 diff --git a/NEWS.md b/NEWS.md index d27e4ec9c..e30849114 100644 --- a/NEWS.md +++ b/NEWS.md @@ -32,7 +32,11 @@ 8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix. -9. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). +9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development. + +10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR. + +11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). ## BUG FIXES @@ -74,6 +78,8 @@ 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix. +12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix. + # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024) ## BREAKING CHANGE diff --git a/R/fmelt.R b/R/fmelt.R index 23f07c552..5c50ca26c 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -107,17 +107,18 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na stopf("pattern must be character string") } match.vec = regexpr(pattern, cols, perl=TRUE) - measure.vec = which(0 < match.vec) - if (length(measure.vec) == 0L) { + measure.vec.i = which(0 < match.vec) + if (length(measure.vec.i) == 0L) { stopf("pattern did not match any cols, so nothing would be melted; fix by changing pattern") } - start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE] + start = attr(match.vec, "capture.start")[measure.vec.i, , drop=FALSE] if (is.null(start)) { stopf("pattern must contain at least one capture group (parenthesized sub-pattern)") } err.args.groups("number of capture groups in pattern", ncol(start)) - end = attr(match.vec, "capture.length")[measure.vec,]+start-1L - names.mat = matrix(cols[measure.vec], nrow(start), ncol(start)) + end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L + measure.vec <- cols[measure.vec.i] + names.mat = matrix(measure.vec, nrow(start), ncol(start)) substr(names.mat, start, end) } else { #pattern not specified, so split using sep. if (!is.character(sep)) { @@ -130,10 +131,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification") } err.args.groups("max number of items after splitting column names", n.groups) - measure.vec = which(vector.lengths==n.groups) - do.call(rbind, list.of.vectors[measure.vec]) + measure.vec.i = which(vector.lengths==n.groups) + measure.vec = cols[measure.vec.i] + do.call(rbind, list.of.vectors[measure.vec.i]) } - err.names.unique("measured columns", cols[measure.vec]) + err.names.unique("measured columns", measure.vec) uniq.mat = unique(group.mat) if (nrow(uniq.mat) < nrow(group.mat)) { stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat)) diff --git a/R/fread.R b/R/fread.R index b2e55403d..66bda3fb1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ fread = function( -input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", +input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, @@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") else if (sep=="auto") sep="" # sep=="" at C level means auto sep else stopifnot( nchar(sep)==1L ) # otherwise an actual character to use as sep } - stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L ) + stopifnot( is.character(dec), length(dec)==1L) + if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L) # handle encoding, #563 if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) { stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") diff --git a/R/print.data.table.R b/R/print.data.table.R index dd641f946..9e33e0c4d 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -8,6 +8,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), print.keys=getOption("datatable.print.keys"), trunc.cols=getOption("datatable.print.trunc.cols"), quote=FALSE, + na.print=NULL, timezone=FALSE, ...) { # topn - print the top topn and bottom topn rows with '---' inbetween (5) # nrows - under this the whole (small) table is printed, unless topn is provided (100) @@ -109,6 +110,13 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), # When nrow(toprint) = 1, attributes get lost in the subset, # function below adds those back when necessary toprint = toprint_subset(toprint, cols_to_print) + trunc.cols <- length(not_printed) > 0L + } + print_default = function(x) { + if (col.names != "none") cut_colnames = identity + cut_colnames(print(x, right=TRUE, quote=quote, na.print=na.print)) + # prints names of variables not shown in the print + if (trunc.cols) trunc_cols_message(not_printed, abbs, class, col.names) } if (printdots) { if (isFALSE(row.names)) { @@ -117,30 +125,14 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn)) } rownames(toprint) = format(rownames(toprint), justify="right") - if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote)) - } else { - print(toprint, right=TRUE, quote=quote) - } - if (trunc.cols && length(not_printed) > 0L) - # prints names of variables not shown in the print - trunc_cols_message(not_printed, abbs, class, col.names) - + print_default(toprint) return(invisible(x)) } if (nrow(toprint)>20L && col.names == "auto") # repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them # option to shut this off per request of Oleg Bondar on SO, #1482 - toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 - if (col.names == "none") { - cut_colnames(print(toprint, right=TRUE, quote=quote)) - } else { - print(toprint, right=TRUE, quote=quote) - } - if (trunc.cols && length(not_printed) > 0L) - # prints names of variables not shown in the print - trunc_cols_message(not_printed, abbs, class, col.names) - + toprint = rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97 + print_default(toprint) invisible(x) } diff --git a/R/test.data.table.R b/R/test.data.table.R index e2efe29d9..748e09512 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -1,4 +1,4 @@ -test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, +test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, testPattern=NULL, memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) { stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress)) memtest = as.integer(memtest) @@ -38,7 +38,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F scripts = scripts[!grepl("bench|other", scripts)] scripts = gsub("[.]bz2$","",scripts) return(sapply(scripts, function(fn) { - err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress)) + err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress, testPattern=testPattern)) cat("\n"); isTRUE(err) })) @@ -140,6 +140,68 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.") } + # nocov start: only used interactively -- "production" suites should always run in full + if (!is.null(testPattern)) { + # due to how non-hermetic our tests are, the simple approach (pass this to test(), return early if 'numStr' matches testPattern) + # does not work, or at least getting it to work is not much more efficient (see initial commit of #6040). so instead, + # here we parse the file, extract the tests that match the pattern to a new file, and include other setup lines likely required + # to run the tests successfully. two major drawbacks (1) we can only take a guess which lines are required, so this approach + # can't work (or at least, may need a lot of adjustment) for _every_ test, though not working is also a good sign that test + # should be refactored to be more hermetic (2) not all tests have literal test numbers, meaning we can't always match the + # runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the + # full test expression string, i.e., not just limited to numeric literal test numbers. + arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check + pd = setDT(utils::getParseData(parse(fn, keep.source=TRUE))) + file_lines = readLines(fn) + # NB: a call looks like (with id/parent tracking) + # + # name + # ( + # ... + # ... + # ) + # + ## navigate up two steps from 'test' SYMBOL_FUNCTION_CALL to the overall 'expr' for the call + test_calls = pd[ + pd[ + pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'], + list(call_lhs_id=id, call_id=x.parent), + on=c(id='parent')], + list(line1, id), + on=c(id='call_id')] + ## all the arguments for each call to test() + test_call_args = test_calls[pd[token == 'expr'], list(call_id=parent, arg_line=i.line1, col1, col2), on=c(id='parent'), nomatch=NULL] + ## 2nd argument is the num= argument + test_num_expr = test_call_args[ , .SD[2L], by="call_id"] + # NB: subtle assumption that 2nd arg to test() is all on one line, true as of 2024-Apr and likely to remain so + keep_test_ids = test_num_expr[grepl(testPattern, substring(file_lines[arg_line], col1, col2)), call_id] + # Now find all tests just previous to the keep tests; we want to keep non-test setup lines between them, e.g. + # test(drop, ...) + # setup_line1 # retain + # setup_line2 # retain + # test(keep, ...) # retain + intertest_ranges = test_calls[!id %in% keep_test_ids][ + test_calls[id %in% keep_test_ids], + list(preceding_line=x.line1, test_start_line=i.line1), + on='line1', + roll=TRUE] + # TODO(michaelchirico): this doesn't do well with tests inside control statements. + # those could be included by looking for tests with parent!=0, i.e., not-top-level tests, + # and including the full parent for such tests. omitting for now until needed. + keep_lines = intertest_ranges[, sort(unique(unlist(Map(function(l, u) l:u, preceding_line+1L, test_start_line))))] + header_lines = seq_len(test_calls$line1[1L]-1L) + + tryCatch(error = function(c) warningf("Attempt to subset to %d tests matching '%s' failed, running full suite.", length(keep_test_ids), testPattern), { + new_script = file_lines[c(header_lines, keep_lines)] + parse(text = new_script) # as noted above the static approach is not fool-proof (yet?), so force the script to at least parse before continuing. + fn = tempfile() + on.exit(unlink(fn), add=TRUE) + catf("Running %d of %d tests matching '%s'\n", length(keep_test_ids), nrow(test_calls), testPattern) + writeLines(new_script, fn) + }) + } + # nocov end + err = try(sys.source(fn, envir=env), silent=silent) options(oldOptions) @@ -251,7 +313,19 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) { + if (!is.null(env)) { + old = Sys.getenv(names(env), names=TRUE, unset=NA) + to_unset = !lengths(env) + # NB: Sys.setenv() (no arguments) errors + if (!all(to_unset)) do.call(Sys.setenv, as.list(env[!to_unset])) + Sys.unsetenv(names(env)[to_unset]) + on.exit(add=TRUE, { + is_preset = !is.na(old) + if (any(is_preset)) do.call(Sys.setenv, as.list(old[is_preset])) + Sys.unsetenv(names(old)[!is_preset]) + }) + } if (!is.null(options)) { old_options <- do.call('options', as.list(options)) # as.list(): allow passing named character vector for convenience on.exit(options(old_options), add=TRUE) @@ -418,7 +492,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no } } if (!fail && !length(error) && (!length(output) || !missing(y))) { # TODO test y when output=, too - y = try(y,TRUE) + capture.output(y <- try(y, silent=TRUE)) # y might produce verbose output, just toss it if (identical(x,y)) return(invisible(TRUE)) all.equal.result = TRUE if (is.data.frame(x) && is.data.frame(y)) { diff --git a/R/wrappers.R b/R/wrappers.R index dcf8ba08e..a018b91ae 100644 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -8,7 +8,7 @@ setcoalesce = function(...) .Call(Ccoalesce, list(...), TRUE) fifelse = function(test, yes, no, na=NA) .Call(CfifelseR, test, yes, no, na) fcase = function(..., default=NA) .Call(CfcaseR, default, parent.frame(), as.list(substitute(list(...)))[-1L]) -colnamesInt = function(x, cols, check_dups=FALSE) .Call(CcolnamesInt, x, cols, check_dups) +colnamesInt = function(x, cols, check_dups=FALSE, skip_absent=FALSE) .Call(CcolnamesInt, x, cols, check_dups, skip_absent) testMsg = function(status=0L, nx=2L, nk=2L) .Call(CtestMsgR, as.integer(status)[1L], as.integer(nx)[1L], as.integer(nk)[1L]) diff --git a/inst/atime/tests.R b/inst/atime/tests.R new file mode 100644 index 000000000..a0635d063 --- /dev/null +++ b/inst/atime/tests.R @@ -0,0 +1,108 @@ +# A function to customize R package metadata and source files to facilitate version-specific installation and testing. +# +# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R) +# to support testing across different versions (base and HEAD for PRs, commits associated with historical regressions, etc.) of the package. +# It appends a SHA1 hash to the package name (PKG.SHA), ensuring each version can be installed and tested separately. +# +# @param old.Package Current name of the package. +# @param new.Package New name of the package, including a SHA hash. +# @param sha SHA1 hash used for differentiating versions. +# @param new.pkg.path Path to the package files. +# +# @details +# The function modifies: +# - DESCRIPTION, updating the package name. +# - Makevars, customizing the shared object file name and adjusting the build settings. +# - R/onLoad.R, adapting custom version checking for package loading operations. +# - NAMESPACE, changing namespace settings for dynamic linking. +# +# @examples +# pkg.edit.fun("data.table", "data.table.some_SHA1_hash", "some_SHA1_hash", "/path/to/data.table") +# +# @return None (performs in-place file modifications) +# @note This setup is typically unnecessary for most packages but essential for data.table due to its unique configuration. +pkg.edit.fun = function(old.Package, new.Package, sha, new.pkg.path) { + pkg_find_replace <- function(glob, FIND, REPLACE) { + atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE) + } + Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE) + Package_ <- gsub(".", "_", old.Package, fixed = TRUE) + new.Package_ <- paste0(Package_, "_", sha) + pkg_find_replace( + "DESCRIPTION", + paste0("Package:\\s+", old.Package), + paste("Package:", new.Package)) + pkg_find_replace( + file.path("src", "Makevars.*in"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + Package_regex, + new.Package_) + pkg_find_replace( + file.path("R", "onLoad.R"), + sprintf('packageVersion\\("%s"\\)', old.Package), + sprintf('packageVersion\\("%s"\\)', new.Package)) + pkg_find_replace( + file.path("src", "init.c"), + paste0("R_init_", Package_regex), + paste0("R_init_", gsub("[.]", "_", new.Package_))) + pkg_find_replace( + "NAMESPACE", + sprintf('useDynLib\\("?%s"?', Package_regex), + paste0('useDynLib(', new.Package_)) + } + +# A list of performance tests. +# +# Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments: +# - N: A numeric sequence of data sizes to vary. +# - setup: An expression evaluated for every data size before measuring time/memory. +# - expr: An expression that will be evaluated for benchmarking performance across different git commit versions. +# This must call a function from data.table using a syntax with double or triple colon prefix. +# The package name before the colons will be replaced by a new package name that uses the commit SHA hash. +# (For instance, data.table:::[.data.table will become data.table.some_40_digit_SHA1_hash:::[.data.table) +# +# Optional parameters that may be useful to configure tests: +# - times: Number of times each expression is evaluated (default is 10). +# - seconds.limit: The maximum median timing (in seconds) of an expression. No timings for larger N are computed past that threshold. +# - sha.vec: Named character vector or a list of vectors that specify data.table-specific commit SHAs for testing across those different git commit versions. +# For historical regressions, use 'Before', 'Regression', and 'Fixed' (otherwise something like 'Slow' or 'Fast' ideally). +# @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information. +test.list <- list( + # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311 + # Fixed in: https://github.com/Rdatatable/data.table/pull/4440 + "Test regression fixed in #4440" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3,8), + setup = quote({ + set.seed(1L) + dt <- data.table(a = sample.int(N)) + setindexv(dt, "a") + }), + expr = quote(data.table:::shallow(dt)), + # Before = "", This needs to be updated later as there are two issues here: A) The source of regression (or the particular commit that led to it) is not clear; B) Older versions of data.table are having problems when being installed in this manner (This includes commits from before March 20 2020, when the issue that discovered or first mentioned the regression was created) + Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression + Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440) + + # Test based on: https://github.com/Rdatatable/data.table/issues/5424 + # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491 + # Fixed in: https://github.com/Rdatatable/data.table/pull/5463 + "Test regression fixed in #5463" = list( + pkg.edit.fun = pkg.edit.fun, + N = 10^seq(3, 8), + setup = quote({ + n <- N/100 + set.seed(2L) + dt <- data.table( + g = sample(seq_len(n), N, TRUE), + x = runif(N), + key = "g") + dt_mod <- copy(dt) + }), + expr = quote(data.table:::`[.data.table`(dt_mod, , N := .N, by = g)), + Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) + Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits) + Fixed = "58409197426ced4714af842650b0cc3b9e2cb842") # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits) +) diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index b72c0b506..cf65f61bf 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -149,8 +149,22 @@ test(4.20, colnamesInt(dt, integer()), integer()) test(4.21, colnamesInt(dt, NULL), seq_along(dt)) test(4.22, colnamesInt("asd", 1), error="must be data.table compatible") test(4.23, colnamesInt(dt, 1, check_dups="a"), error="check_dups") +test(4.24, colnamesInt(dt, c("a", "e"), skip_absent=TRUE), c(1L,0L)) +test(4.25, colnamesInt(dt, c(1L, 4L), skip_absent=TRUE), c(1L,0L)) +test(4.26, colnamesInt(dt, c(1, 4), skip_absent=TRUE), c(1L,0L)) +test(4.27, colnamesInt(dt, c("a", NA), skip_absent=TRUE), c(1L,0L)) +test(4.28, colnamesInt(dt, c(1L, 0L), skip_absent=TRUE), error="received non-existing column*.*0") +test(4.29, colnamesInt(dt, c(1, -5), skip_absent=TRUE), error="received non-existing column*.*-5") +test(4.30, colnamesInt(dt, c(1, 4), skip_absent=NULL), error="skip_absent must be TRUE or FALSE") +test(4.31, colnamesInt(dt, c(1L, 1000L), skip_absent=TRUE), c(1L,0L)) +cols=c(1L,100L) +test(4.32, colnamesInt(dt, cols, skip_absent=TRUE), c(1L, 0L)) +test(4.33, cols, c(1L, 100L)) # ensure input was not overwritten with output 0 +cols=c(1,100) +test(4.34, colnamesInt(dt, cols, skip_absent=TRUE), c(1L, 0L)) +test(4.35, cols, c(1, 100)) # ensure input was not overwritten with output 0 names(dt) <- NULL -test(4.24, colnamesInt(dt, "a"), error="has no names") +test(4.36, colnamesInt(dt, "a"), error="has no names") # verbose dt = data.table(a=c(1L, 2L, NA_integer_), b=c(1, 2, NA_real_)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c0e7a8517..e00c4ac6e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2662,13 +2662,13 @@ for (ne in seq_along(eols)) { lines = capture.output(fwrite(headDT, verbose=FALSE)) cat(paste(lines,collapse=eol), file=f, sep="") # so last line abruptly ends (missing last eol) to test that, otherwise could just pass eol to fwrite # on unix we simulate Windows too. On Windows \n will write \r\n (and \r\n will write \r\r\n) - num = 894 + nr/100 + nc/1000 + ne/10000 + num_major = nr/100 + nc/1000 + ne/10000 # if (isTRUE(all.equal(testIDtail, 0.4103))) browser() - test(num+0.00001, fread(f,na.strings=""), headDT) + test(894+num_major+0.00001, fread(f,na.strings=""), headDT) cat(eol,file=f,append=TRUE) # now a normal file properly ending with final \n - test(num+0.00002, fread(f,na.strings=""), headDT) + test(894+num_major+0.00002, fread(f,na.strings=""), headDT) cat(eol,file=f,append=TRUE) # extra \n should be ignored other than for single columns where it is significant - test(num+0.00003, fread(f,na.strings=""), if (nc==1) rbind(headDT, list(NA)) else headDT) + test(894+num_major+0.00003, fread(f,na.strings=""), if (nc==1) rbind(headDT, list(NA)) else headDT) unlink(f) }}} if (test_bit64) { @@ -2681,15 +2681,13 @@ if (test_bit64) { test(897, class(DT$b), "integer64") test(898, fread(f), DT) unlink(f) - DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))] - DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))] - DT[,r:=a/100][,r2:=gsub(" ","",format(r))] - DT[112, a2:=as.integer64(12345678901234)] # start on row 112 to avoid the first 100 - DT[113, a3:=3.14] - DT[114, a4:="123A"] - DT[115, b2:=1234567890123.45] - DT[116, b3:="12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) - DT[117, r2:="3.14A"] + DT[ , a2 := as.integer64(a)][112L, a2 := as.integer64(12345678901234)] # start on row 112 to avoid the first 100 + DT[ , a3 := as.double(a) ][113L, a3 := 3.14] + DT[ , a4 := as.character(a)][114L, a4 := "123A"] + DT[ , b2 := as.double(b) ][115L, b2 := 1234567890123.45] + DT[ , b3 := as.character(b)][116L, b3 := "12345678901234567890A"] # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR) + DT[ , r := a/100] + DT[ , r2 := as.character(r)][117L, r2 := "3.14A"] fwrite(DT,f<-tempfile()) test(899.1, fread(f, verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>") test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"), integer64="a2", double=c("a3","b2")), verbose=TRUE), @@ -3159,6 +3157,13 @@ test(1034, as.data.table(x<-as.character(sample(letters, 5))), data.table(V1=x)) DT_missing_l_2 = data.table(num_1=1, num_2=2, list_1=list(1), list_3=list(3)) test(1035.0186, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=TRUE), data.table(char="1", num=1, list=list(1))) test(1035.0187, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_"), na.rm=FALSE), data.table(char=c("1","2","3"), num=c(1,2,NA), list=list(1,NA,3))) + # measure supports cols arg, #5063 + expected_without_value = data.table(num_1=1,num_2=2,prefix="list",char=c("1","3"),value=list(1,3)) + test(1035.0188, melt(DT_missing_l_2, measure.vars=measure(prefix, char, sep="_", cols=c("list_1","list_3"))), expected_without_value) + test(1035.0189, melt(DT_missing_l_2, measure.vars=measure(prefix, char, pattern="(.*)_(.*)", cols=c("list_1","list_3"))), expected_without_value) + expected_with_value = data.table(num_1=1,num_2=2,char=c("1","3"),list=list(1,3)) + test(1035.0190, melt(DT_missing_l_2, measure.vars=measure(value.name, char, sep="_", cols=c("list_1","list_3"))), expected_with_value) + test(1035.0191, melt(DT_missing_l_2, measure.vars=measure(value.name, char, pattern="(.*)_(.*)", cols=c("list_1","list_3"))), expected_with_value) ans1 = cbind(DT[, c(1,2,8), with=FALSE], variable=factor("l_1")) ans1[, value := DT$l_1] @@ -4466,13 +4471,13 @@ colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -test_no = 1223.0 +test_no = 0L oldnfail = nfail for (nvars in seq_along(names(DT))) { signs = expand.grid(replicate(nvars, c(-1L,1L), simplify=FALSE)) combn(names(DT), nvars, simplify=FALSE, function(x) { # simplify=FALSE needed for R 3.1.0 for (i in seq_len(nrow(signs))) { - test_no <<- signif(test_no+.001, 7) + test_no <<- test_no + 1L ll = as.call(c(as.name("order"), lapply(seq_along(x), function(j) { if (signs[i,j] == 1L) @@ -4485,7 +4490,7 @@ for (nvars in seq_along(names(DT))) { } }) )) - test(test_no, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) + test(1223.0 + test_no*0.001, forderv(DT, by=x, order=signs[i,]), with(DT, eval(ll))) } integer() }) @@ -4617,16 +4622,16 @@ colorder=sample(ncol(DT)) setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") -test_no = 1246.0 +test_no = 0L oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { - test_no <<- signif(test_no+.01, 7) # first without key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) - test_no <<- signif(test_no+.01, 7) + test_no <<- test_no + 1L # first without key + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test_no <<- test_no + 1L setkeyv(DT, jj) # with key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce @@ -4645,11 +4650,11 @@ oldnfail = nfail for (i in seq_along(names(DT))) { cc = combn(names(DT), i) apply(cc, 2L, function(jj) { - test_no <<- signif(test_no+.01, 7) # first without key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) - test_no <<- signif(test_no+.01, 7) + test_no <<- test_no + 1L # first without key + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test_no <<- test_no + 1L setkeyv(DT, jj) # with key - test(test_no, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) + test(1246.0 + test_no*0.01, duplicated(DT, by=jj, fromLast=TRUE), duplicated.data.frame(DT[, jj, with=FALSE], fromLast=TRUE)) }) } if (nfail > oldnfail) cat(seedInfo, "\n") # to reproduce @@ -4734,13 +4739,13 @@ setcolorder(DT, names(DT)[colorder]) seedInfo = paste(seedInfo, "colorder = ", paste(colorder, collapse=","), sep="") ans = vector("list", length(names(DT))) -test_no = 1252 +test_no = 0L oldnfail = nfail for (i in seq_along(names(DT))) { cj = as.matrix(do.call(CJ, split(rep(c(1L,-1L), each=i), 1:i))) ans[[i]] = combn(names(DT), i, function(x) { tmp = apply(cj, 1, function(y) { - test_no <<- signif(test_no+.001, 7) + test_no <<- test_no + 1L ll = as.call(c(as.name("base_order"), lapply(seq_along(x), function(j) { if (y[j] == 1L) @@ -4754,11 +4759,11 @@ for (i in seq_along(names(DT))) { }) )) ans1 = forderv(DT, by=x, order=y, na.last=TRUE) # adding tests for both nalast=TRUE and nalast=NA - test(test_no, ans1, with(DT, eval(ll))) - test_no <<- signif(test_no+.001, 7) + test(1252.0 + test_no*0.001, ans1, with(DT, eval(ll))) + test_no <<- test_no + 1L ll <- as.call(c(as.list(ll), na.last=NA)) ans1 = forderv(DT, by=x, order=y, na.last=NA) # nalast=NA here. - test(test_no, ans1[ans1 != 0], with(DT, eval(ll))) + test(1252.0 + test_no*0.001, ans1[ans1 != 0], with(DT, eval(ll))) }) dim(tmp)=NULL list(tmp) @@ -4885,13 +4890,13 @@ setNumericRounding(old_rounding) # distinguishing small numbers from 0.0 as from v1.9.2, test from Rick # http://stackoverflow.com/questions/22290544/grouping-very-small-numbers-e-g-1e-28-and-0-0-in-data-table-v1-8-10-vs-v1-9-2 old_rounding = getNumericRounding() -test_no = 1278.001 +test_no = 0L for (dround in c(0,2)) { setNumericRounding(dround) # rounding should not affect the result here because although small, it's very accurace (1 s.f.) for (i in c(-30:-1,1:30)) { DT = data.table(c(1 * (10^i),2,9999,-1,0,1)) - test(test_no, nrow(DT[, .N, by=V1]), 6L) - test_no = test_no + 0.001 + test_no = test_no + 1L + test(1278.0 + test_no*0.001, nrow(DT[, .N, by=V1]), 6L) } } setNumericRounding(old_rounding) @@ -5697,7 +5702,7 @@ dt = data.table(AA=sample(c(-2:2), 50, TRUE), DD=sample(c(-2:2), 50, TRUE), EE=sample(as.logical(c(-2:2)), 50, TRUE)) if (test_bit64) dt[, DD := as.integer64(DD)] -test_no = 1368.0 +test_no = 0L for (i in seq_along(dt)) { col = dt[[i]] for (j in list(TRUE, FALSE, "keep")) { @@ -5716,10 +5721,10 @@ for (i in seq_along(dt)) { r3 = frankv(col, ties.method=k, na.last=j) r4 = frankv(col, order=-1L, ties.method=k, na.last=j) - test_no = test_no+.0001 - test(test_no, r1, r3) - test_no = test_no+.0001 - test(test_no, r2, r4) + test_no = test_no + 1L + test(1368.0 + test_no*0.0001, r1, r3) + test_no = test_no + 1L + test(1368.0 + test_no*0.0001, r2, r4) } } } @@ -5730,7 +5735,7 @@ dt = data.table(AA=sample(c(-2:2, NA), 50, TRUE), DD=sample(c(-2:2, NA), 50, TRUE), EE=sample(as.logical(c(-2:2, NA)), 50, TRUE)) if (test_bit64) dt[, DD := as.integer64(DD)] -test_no = 1369.0 +test_no = 0L for (i in seq_along(dt)) { col = dt[[i]] # ensure consistency with base::rank ties.methods as advertised @@ -5748,10 +5753,10 @@ for (i in seq_along(dt)) { r3 = frankv(col, ties.method=k, na.last=NA) r4 = frankv(col, order=-1L, ties.method=k, na.last=NA) - test_no = test_no+.0001 - test(test_no, r1, r3) - test_no = test_no+.0001 - test(test_no, r2, r4) + test_no = test_no + 1L + test(1369.0 + test_no*0.0001, r1, r3) + test_no = test_no + 1L + test(1369.0 + test_no*0.0001, r2, r4) } } @@ -5767,20 +5772,20 @@ dt = list(AA=sample(c(NA,-2:2), 50, TRUE), DD=sample(c(NA,-2:2), 50, TRUE), EE=sample(as.logical(c(NA,-2:2)), 50, TRUE)) if (test_bit64) dt[["DD"]] = as.integer64(dt[["DD"]]) -test_no = 1370.0 +test_no = 0L ans = as.list(na.omit(as.data.table(dt))) for (i in seq_along(dt)) { combn(names(dt), i, function(cols) { ans1 = is_na(dt[cols]) ans2 = rowSums(is.na(as.data.table(dt[cols]))) > 0L - test_no <<- test_no+.0001 - test(test_no, ans1, ans2) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, ans1, ans2) # update: tests for any_na - test_no <<- test_no+.0001 - test(test_no, any_na(dt[cols]), TRUE) - test_no <<- test_no+.0001 - test(test_no, any_na(ans[cols]), FALSE) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, any_na(dt[cols]), TRUE) + test_no <<- test_no + 1L + test(1370.0 + test_no*0.0001, any_na(ans[cols]), FALSE) TRUE }) } @@ -5874,7 +5879,7 @@ types=c("any", "within", "start", "end", "equal") # add 'equal' as well mults=c("all", "first", "last") maxgap=-1L; minoverlap=0L # default has changed in IRanges/GenomicRanges :: findOverlaps verbose=FALSE; which=TRUE -test_no = 1372.0 +test_no = 0L load(testDir("test1372.Rdata")) # Regenerated on 17/02/2019 to include type = 'equal'. Var 'ans' has all the results saved by running GenomicRanges separately using code above, is a list with names of the format type_mult_run set.seed(123) this = 1L @@ -5903,11 +5908,11 @@ for (run in seq_len(times)) { # data.table overlap join nomatch = if(mult == "all") NULL else NA_integer_ thisans = foverlaps(i, x, mult=mult, type=type, nomatch=nomatch, which=which, verbose=verbose) - test_no = test_no+.01 + test_no = test_no + 1L # cat("test =", test_no, ", run = ", run, ", type = ", type, ", mult = ", mult, "\n", sep="") idx = paste(type, mult, run, sep="_") # ans[[idx]] contains fo(gr(i), gr(x), type=type, select=mult) - test(test_no, thisans, ans[[idx]]) + test(1372.0 + test_no*0.01, thisans, ans[[idx]]) this = this+1L } } @@ -6127,13 +6132,13 @@ DT = data.table(a=sample(col, 20, TRUE), b=as.numeric(sample(col,20,TRUE)), c=as # if (test_bit64) { # DT[, e := as.integer64(sample(col,20,TRUE))] # } -test_no = 1394 +test_no = 0L for (i in seq_along(DT)) { combn(names(DT), i, function(cols) { ans1 = na.omit(DT, cols=cols) ans2 = DT[stats::complete.cases(DT[, cols, with=FALSE])] - test_no <<- test_no+.001 - test(test_no, ans1, ans2) + test_no <<- test_no + 1L + test(1394.0 + test_no*0.001, ans1, ans2) 0L }) } @@ -6509,15 +6514,15 @@ for(t in seq_len(nrow(all))){ ansOpt <- DT[eval(parse(text = thisQuery))] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery))] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) ## repeat the test with 'which = TRUE' options("datatable.optimize" = 3L) ansOpt <- DT[eval(parse(text = thisQuery)), which = TRUE] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery)), which = TRUE] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" @@ -6526,8 +6531,8 @@ for(t in seq_len(nrow(all))){ ansOpt <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] options("datatable.optimize" = 2L) ansRef <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] - test_no <- test_no + 0.0001 - test(test_no, ansOpt, ansRef) + test_no <- test_no + 1L + test(1438.0 + test_no*0.0001, ansOpt, ansRef) } } } @@ -12905,10 +12910,10 @@ M <- merge(x, y) m <- merge(as.data.frame(x), as.data.frame(y), by="a") test(1913.09, is.data.table(M) && !is.data.table(m)) test(1913.10, all(names(M) %in% union(names(M), names(m)))) -test_no = 1913.11 +test_no = 0L for (name in names(m)) { - test_no = test_no + 0.0001 - test(test_no, M[[name]], m[[name]]) + test_no = test_no + 1L + test(1913.11 + test_no*0.0001, M[[name]], m[[name]]) } # # Original example that smoked out the bug @@ -12923,10 +12928,10 @@ for (i in 1:3) { } test(1913.12, is.data.table(M) && !is.data.table(m)) test(1913.13, all(names(M) %in% union(names(M), names(m)))) -test_no = 1913.14 +test_no = 0L for (name in names(m)) { - test_no = test_no + 0.0001 - test(test_no, M[[name]], m[[name]]) + test_no = test_no + 1L + test(1913.14 + test_no*0.0001, M[[name]], m[[name]]) } # # simple subset maintains keys @@ -12961,10 +12966,10 @@ t2 <- transform(dt, d=c+4, a=sample(c('x', 'y', 'z'), 20, replace=TRUE)) test(1913.23, is.null(key(t2))) # transforming a key column nukes the key ## This is probably not necessary, but let's just check that transforming ## a key column doesn't twist around the rows in the result. -test_no = 1913.24 +test_no = 0L for (col in c('b', 'c')) { - test_no = test_no + 0.0001 - test(test_no, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns + test_no = test_no + 1L + test(1913.24 + test_no*0.0001, t2[[col]], dt[[col]]) # mutating-key-transform maintains other columns } # Test 1914 of S4 compatibility was moved to S4.Rraw for #3808 @@ -14290,35 +14295,24 @@ test(1997.06, setDTthreads(percent=NULL), error="but is length 0") test(1997.07, setDTthreads(percent=1:2), error="but is length 2") test(1997.08, setDTthreads(restore_after_fork=21), error="must be TRUE, FALSE, or NULL") old = getDTthreads() # (1) -oldenv1 = Sys.getenv("R_DATATABLE_NUM_PROCS_PERCENT") -oldenv2 = Sys.getenv("R_DATATABLE_NUM_THREADS") -Sys.setenv(R_DATATABLE_NUM_THREADS="") # in case user has this set, so we can test PROCS_PERCENT -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="3.0") -test(1997.09, setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") +test(1997.09, env = c(R_DATATABLE_NUM_THREADS="", R_DATATABLE_NUM_PROCS_PERCENT="3.0"), setDTthreads(), old, ignore.warning="Ignoring invalid.*Please remove any.*not a digit") new = getDTthreads() # old above at (1) may not have been default. new now is. test(1997.10, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="1") -test(1997.11, setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") +test(1997.11, env=c(R_DATATABLE_NUM_PROCS_PERCENT="1"), setDTthreads(), new, ignore.warning="Ignoring invalid.*integer between 2 and 100") test(1997.12, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") -test(1997.13, setDTthreads(), new) +test(1997.13, env=c(R_DATATABLE_NUM_PROCS_PERCENT="75"), setDTthreads(), new) new = getDTthreads() setDTthreads(percent=75) test(1997.14, getDTthreads(), new) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="100") -setDTthreads() +test(1997.15, env=c(R_DATATABLE_NUM_PROCS_PERCENT="100"), setDTthreads(), new) allcpu = getDTthreads() -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT="75") -Sys.setenv(R_DATATABLE_NUM_THREADS=allcpu) -setDTthreads() -test(1997.15, getDTthreads(), allcpu) -Sys.setenv(R_DATATABLE_NUM_PROCS_PERCENT=oldenv1) -Sys.setenv(R_DATATABLE_NUM_THREADS=oldenv2) -test(1997.16, setDTthreads(old), allcpu) -test(1997.17, getDTthreads(), old) -test(1997.18, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") +test(1997.16, env=c(R_DATATABLE_NUM_PROCS_PERCENT="75", R_DATATABLE_NUM_THREADS=allcpu), setDTthreads(), allcpu) +test(1997.17, getDTthreads(), allcpu) +test(1997.18, setDTthreads(old), allcpu) +test(1997.19, getDTthreads(), old) +test(1997.20, setDTthreads(throttle=NA), error="throttle.*must be a single number, non-NA, and >=1") setDTthreads(throttle=65536) -test(1997.19, getDTthreads(TRUE), output="throttle==65536") +test(1997.21, getDTthreads(TRUE), output="throttle==65536") setDTthreads(throttle=1024) # test that a copy is being made and output is printed, #3385 after partial revert of #3281 @@ -14377,7 +14371,7 @@ test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logica #rbindlist coverage test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]") -test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE") +test(2003.2, rbindlist(list(), fill=1), error="fill should be TRUE or FALSE") test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE), data.table(a=c(1:4))) test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), @@ -14695,18 +14689,18 @@ test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data. f = tempfile() for (nNUL in 0:3) { writeBin(c(charToRaw("a=b\nA B C\n1 3 5\n"), rep(as.raw(0), nNUL), charToRaw("2 4 6\n")), con=f) - test_no = 2025 + (1+nNUL)/10 - test(test_no + .01, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) - test(test_no + .02, fread(f), ans) # auto detect skip and header works too + num_major = (1+nNUL)/10 + test(2025 + num_major + .01, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) + test(2025 + num_major + .02, fread(f), ans) # auto detect skip and header works too writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f) - test(test_no + .03, fread(f, skip=1, header=TRUE), ans) - test(test_no + .04, fread(f), ans) + test(2025 + num_major + .03, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .04, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A B C\n1 3 5\n2 4 6\n")), con=f) - test(test_no + .05, fread(f, skip=1, header=TRUE), ans) - test(test_no + .06, fread(f), ans) + test(2025 + num_major + .05, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .06, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f) - test(test_no + .07, fread(f, skip=1, header=TRUE), ans) - test(test_no + .08, fread(f), ans) + test(2025 + num_major + .07, fread(f, skip=1, header=TRUE), ans) + test(2025 + num_major + .08, fread(f), ans) } makeNul = function(str){ tt=charToRaw(str); tt[tt==42L]=as.raw(0); writeBin(tt, con=f)} # "*" (42) represents NUL makeNul("A,B,C\n1,foo,5\n2,*bar**,6\n") @@ -16429,14 +16423,9 @@ test(2122.2, DT, data.table(V3=5:6)) dt = data.table(SomeNumberA=c(1,1,1),SomeNumberB=c(1,1,1)) test(2123, dt[, .(.N, TotalA=sum(SomeNumberA), TotalB=sum(SomeNumberB)), by=SomeNumberA], data.table(SomeNumberA=1, N=3L, TotalA=1, TotalB=3)) -# system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085 -oldtz=Sys.getenv('TZ', unset=NA) -Sys.setenv(TZ='Asia/Jakarta') # UTC+7 -t0 = as.POSIXct('2019-10-01') -test(2124.1, format(as.ITime(t0)), '00:00:00') -test(2124.2, format(as.IDate(t0)), '2019-10-01') -if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) -# careful to unset because TZ="" means UTC whereas unset TZ means local, #4261 and #4464 +# system timezone is not usually UTC, so as.ITime.POSIXct shouldn't assume so, #4085, #4261, #4464 +test(2124.1, env=c(TZ='Asia/Jakarta'), format(as.ITime(as.POSIXct('2019-10-01'))), '00:00:00') +test(2124.2, env=c(TZ='Asia/Jakarta'), format(as.IDate(as.POSIXct('2019-10-01'))), '2019-10-01') # trunc.cols in print.data.table, #4074 old_width = options("width" = 40L) @@ -16799,20 +16788,12 @@ if (.Platform$OS.type=="windows") local({ ) x_old = Map(Sys.getlocale, names(x)) invisible(Map(Sys.setlocale, names(x), x)) - old = Sys.getenv('LANGUAGE') - Sys.setenv('LANGUAGE' = 'zh_CN') - on.exit({ - if (nzchar(old)) - Sys.setenv('LANGUAGE' = old) - else - Sys.unsetenv('LANGUAGE') - invisible(Map(Sys.setlocale, names(x_old), x_old)) - }, add = TRUE) + on.exit(Map(Sys.setlocale, names(x_old), x_old)) # triggered segfault here in #4402, Windows-only under translation. # test that the argument order changes correctly (the 'item 2' moves to the beginning of the message) # since the argument order changes in this example (and that was the crash) we don't need to test # the display of the Chinese characters here. Thanks to @shrektan for all his help on this. - test(2143, rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") + test(2143, env=c(LANGUAGE='zh_CN'), rbind(DT,list(c=4L,a=7L)), error="2.*1.*c.*1") }) # test back to English (the argument order is back to 1,c,2,1) test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1") @@ -16871,18 +16852,13 @@ tmp = tempfile() fwrite(DT, tmp) test(2150.01, fread(tmp), DT) # defaults for fwrite/fread simple and preserving fwrite(DT, tmp, dateTimeAs='write.csv') # as write.csv, writes the UTC times as-is not local because the time column has tzone=="UTC", but without the Z marker -oldtz = Sys.getenv("TZ", unset=NA) -Sys.unsetenv("TZ") -test(2150.021, sapply(fread(tmp,tz=""), typeof), c(dates="integer", times="character")) # from v1.14.0 tz="" needed to read datetime as character -test(2150.022, fread(tmp,tz="UTC"), DT) # user can tell fread to interpet the unmarked datetimes as UTC -Sys.setenv(TZ="UTC") -test(2150.023, fread(tmp), DT) # TZ environment variable is also recognized +test(2150.021, env=list(TZ=NULL), sapply(fread(tmp,tz=""), typeof), c(dates="integer", times="character")) # from v1.14.0 tz="" needed to read datetime as character +test(2150.022, env=list(TZ=NULL), fread(tmp,tz="UTC"), DT) # user can tell fread to interpet the unmarked datetimes as UTC +test(2150.023, env=c(TZ='UTC'), fread(tmp), DT) # TZ environment variable is also recognized if (.Platform$OS.type!="windows") { - Sys.setenv(TZ="") # on Windows this unsets TZ, see ?Sys.setenv - test(2150.024, fread(tmp), DT) + test(2150.024, env=c(TZ=''), fread(tmp), DT) # on Windows this unsets TZ, see ?Sys.setenv # blank TZ env variable on non-Windows is recognized as UTC consistent with C and R; but R's tz= argument is the opposite and uses "" for local } -Sys.unsetenv("TZ") # Notes: # - from v1.14.0 tz="" needed # - as.POSIXct puts "" on the result (testing the write.csv version here with missing tzone) @@ -16891,11 +16867,11 @@ Sys.unsetenv("TZ") # as.POSIXct() failure means 'times' is returned as a character, hence no 'tzone' attribute. # fread() will also throw a warning, one substring of which will be the reproduced base R error. test(2150.025, + env=list(TZ=NULL), attr(fread(tmp, colClasses=list(POSIXct="times"), tz="")$times, "tzone"), if (is.null(base_messages$maybe_invalid_old_posixct)) "" else NULL, warning=base_messages$maybe_invalid_old_posixct) # the times will be different though here because as.POSIXct read them as local time. -if (is.na(oldtz)) Sys.unsetenv("TZ") else Sys.setenv(TZ=oldtz) fwrite(copy(DT)[ , times := format(times, '%FT%T+00:00')], tmp) test(2150.03, fread(tmp), DT) fwrite(copy(DT)[ , times := format(times, '%FT%T+0000')], tmp) @@ -17614,19 +17590,21 @@ EVAL = function(...) { # cat(e,"\n") # uncomment to check the queries tested eval(parse(text=e)) } -testnum = 2211.0 +testnum = 0L for (col in c("a","b","c")) { - testnum = testnum+0.1 + testnum = testnum + 100L for (fi in seq_along(funs)) { if (col=="c" && fi<=6L) next # first 6 funs don't support type character f = funs[fi] - testnum = testnum+0.001 - test(testnum, EVAL("DT[i, ",f,"(",col, if(fi>8L)", 1L","), by=grp]"), # segfault before when NA in i - EVAL("DT[i][, ",f,"(",col, if(fi>8L)", 1L","), by=grp]")) # ok before by taking DT[i] subset first + testnum = testnum + 1L + test(2211.0 + testnum*0.001, + EVAL("DT[i, ",f,"(",col, if(fi>8L)", 1L","), by=grp]"), # segfault before when NA in i + EVAL("DT[i][, ",f,"(",col, if(fi>8L)", 1L","), by=grp]")) # ok before by taking DT[i] subset first if (fi<=8L) { - testnum = testnum+0.001 - test(testnum, EVAL("DT[i, ",f,"(",col,", na.rm=TRUE), by=grp]"), - EVAL("DT[i][, ",f,"(",col,", na.rm=TRUE), by=grp]")) + testnum = testnum + 1L + test(2211.0 + testnum*0.001, + EVAL("DT[i, ",f,"(",col,", na.rm=TRUE), by=grp]"), + EVAL("DT[i][, ",f,"(",col,", na.rm=TRUE), by=grp]")) } } } @@ -17730,7 +17708,7 @@ DT2 = data.table(grp = c('a', 'b'), agg = list(c('1' = 4, '2' = 5), c('3' = 6))) test(2217, DT1[, by = grp, .(agg = list(setNames(as.numeric(value), id)))], DT2) # shift integer64 when fill isn't integer32, #4865 -testnum = 2218 +testnum = 0L funs = c(as.integer, as.double, as.complex, as.character, if (test_bit64) as.integer64) # when test_bit64==FALSE these all passed before; now passes with test_bit64==TRUE too # add grouping tests for #5205 @@ -17739,32 +17717,32 @@ options(datatable.optimize = 2L) for (f1 in funs) { DT = data.table(x=f1(1:4), g=g) for (f2 in funs) { - testnum = testnum + 0.001 - test(testnum, DT[, shift(x)], f1(c(NA, 1:3))) - testnum = testnum + 0.001 + testnum = testnum + 1L + test(2218.0 + testnum*0.001, DT[, shift(x)], f1(c(NA, 1:3))) + testnum = testnum + 1L w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" - test(testnum, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) - testnum = testnum + 0.001 + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f2(NA))], f1(c(NA, 1:3)), warning=w) + testnum = testnum + 1L if (identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" - test(testnum, DT[, shift(x, fill="0")], f1(0:3)) + test(2218.0 + testnum*0.001, DT[, shift(x, fill="0")], f1(0:3)) } else { - test(testnum, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f2(0))], f1(0:3), warning=w) } - testnum = testnum + 0.001 - test(testnum, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) - testnum = testnum + 0.001 + testnum = testnum + 1L + test(2218.0 + testnum*0.001, DT[, shift(x), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3)))) + testnum = testnum + 1L w = if (identical(f2,as.character) && !identical(f1,as.character)) "Coercing.*character.*to match the type of target vector" f = f2(NA) - test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) - testnum = testnum + 0.001 + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(NA, 1, NA, 3))), warning=w) + testnum = testnum + 1L if (identical(f1,as.character) && identical(f2,as.complex)) { # one special case due to as.complex(0)=="0+0i"!="0" - test(testnum, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) + test(2218.0 + testnum*0.001, DT[, shift(x, fill="0"), by=g], data.table(g=g, V1=f1(c(0,1,0,3)))) } else { f = f2(0) - test(testnum, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) + test(2218.0 + testnum*0.001, DT[, shift(x, fill=f), by=g], data.table(g=g, V1=f1(c(0,1,0,3))), warning=w) } } } @@ -17778,14 +17756,15 @@ if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) options(datatable.optimize = 2L) funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") -testnum = 2220 +testnum = 0L for (fun in funs) { - testnum = testnum + 0.01 - test(testnum, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") - testnum = testnum + 0.01 - test(testnum, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + testnum = testnum + 1L + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + testnum = testnum + 1L + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } -test(testnum+0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +testnum = testnum + 1L +test(2220.0 + testnum*0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -18067,19 +18046,18 @@ test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=I set.seed(10) n = 100 a = data.table(id1=1:n, id2=sample(1:900,n,replace=TRUE), flag=sample(c(0,0,0,1),n,replace=TRUE)) -testnum = 2233.39 for (opt in c(0,Inf)) { options(datatable.optimize=opt) out = if (opt) "GForce.*gsum" else "GForce FALSE" B = copy(a) A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle - test(testnum+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= + num_bump = (opt>0)/100 + test(2233.39+num_bump+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= setorder(A, id1) - test(testnum+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) - test(testnum+0.003, any(A[,t1!=t2]), FALSE) - test(testnum+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) - test(testnum+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) - testnum = 2233.40 + test(2233.39+num_bump+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) + test(2233.39+num_bump+0.003, any(A[,t1!=t2]), FALSE) + test(2233.39+num_bump+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) + test(2233.39+num_bump+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) } # test from #5337 n=4; k=2 @@ -18099,22 +18077,24 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -old = options(datatable.verbose=TRUE) -testnum = 2233.43 -for (opt in c(0,Inf)) { - options(datatable.optimize=opt) - out = if (opt) "GForce.*gsum" else "GForce FALSE" - test(testnum, - copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") - ][, n_idT :=dim(.SD)[[1]], by=list(t, id) - ][, sum_v2_id :=sum(v2), by=.(id) - ][, sum_v1_idT:=sum(v1), by=c("id", "t") - ][, sum_v1_id :=sum(v1), by=c("id")], - ans, - output=out) - testnum = 2233.44 -} -options(old) +test(2233.43, + options = list(datatable.verbose=TRUE, datatable.optimize=0), + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output="GForce FALSE") +test(2233.44, + options = list(datatable.verbose=TRUE, datatable.optimize=Inf), + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output="GForce.*gsum") # optimized := with gforce functions that can return lists #5403 old = options(datatable.verbose=TRUE) DT = data.table(grp=1:2, x=1:4) @@ -18259,17 +18239,20 @@ test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( dt = data.table(x = c(2,2,1,1), y = 1:4, z=letters[1:4]) i=c(1,2) j=1L -old = options(datatable.optimize=1L) -test(2243.41, dt[, .I[TRUE], x]$V1, 1:4) -test(2243.42, dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) -options(datatable.optimize=2L, datatable.verbose=TRUE) -test(2243.51, dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.52, dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53, dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.54, dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.55, dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56, dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") -options(old) +test(2243.41, options=c(datatable.optimize=1L), dt[, .I[TRUE], x]$V1, 1:4) +test(2243.42, options=c(datatable.optimize=1L), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) +test(2243.51, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.52, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.54, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.55, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18310,7 +18293,12 @@ test(2247.4, split(dt, ~y+z), list("a.c"=dt[1], "b.c"=dt[2], "a.d"=dt[3], "b.d"= if (test_bit64) { i64v = as.integer64(c(12345678901234, 70, 20, NA)) apple = data.table(id = c("a", "b", "b"), time = c(1L, 1L, 2L), y = i64v[1:3]) - test(2248, dcast(apple, id ~ time, value.var = "y"), data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id')) + test(2248.1, dcast(apple, id ~ time, value.var = "y"), ans<-data.table(id = c('a', 'b'), `1` = i64v[1:2], `2` = i64v[4:3], key='id')) + # associated regression test: downtreams used fill=list() which is not directly supported by coerceAs() + DT = data.table(a=1:2, b=2:3, c=3) + test(2248.2, dcast(DT, a ~ b, value.var='c', fill=list(0L)), data.table(a=1:2, `2`=c(3, 0), `3`=c(0, 3), key='a')) + # also ensure list() gets coerced to integer64 correctly + test(2248.3, dcast(apple, id ~ time, value.var = "y", fill=list(NA)), ans) } # Unit tests for DT[, .SD] retaining secondary indices, #1709 @@ -18453,7 +18441,48 @@ DF <- structure( test(2255, as.data.table(DF), output="DF1.V1.*DF1.V2.*DF2.V3.*DF2.V4.*V5") +# automatic detection of dec=',' for #2431 +DT = data.table(a = letters, b = 1:26/6, c = 1:26) +## auto-detect dec=',' +fwrite(DT, f <- tempfile(), dec=',', sep=';') +test(2256.1, fread(f), DT) + +fwrite(DT, f, dec=',', sep='|') +test(2256.2, fread(f), DT) + +## auto-detect dec='.' +fwrite(DT, f) +test(2256.3, fread(f), DT) + +## verbose output +test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'") + +fwrite(DT, f, dec=',', sep=';') +test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18") +test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ") + +# helpful error about deleting during grouping, #1873 +DT = data.table(id = c(1, 1, 2, 2), a = 1:4, b = 5:8) +test(2257, DT[ , c("c", "a") := .(a + 1, NULL), by=id], error="it's not possible to delete parts of a column") + +# testing printing data.tables with na.print, #3152 +DT = data.table(x=c(NA, "a", "b")) +test(2258.1, capture.output(print(DT, na.print=".")), c(" x", "1: .", "2: a", "3: b")) +test(2258.2, capture.output(print(DT, na.print="_")), c(" x", "1: _", "2: a", "3: b")) +test(2258.3, capture.output(print(DT, na.print="NA")), c(" x", "1: NA", "2: a", "3: b")) +test(2258.4, capture.output(print(DT, na.print=TRUE)), error="invalid 'na.print' specification") +test(2258.5, capture.output(print(DT, na.print=".", quote=TRUE)), c(' "x"', "1: .", '2: "a"', '3: "b"')) +test(2258.6, capture.output(print(DT, na.print=".", right=TRUE)), c(" x", "1: .", "2: a", "3: b")) +# tests for other call sites +# col.names="none" +test(2258.7, capture.output(print(DT, na.print=".", col.names="none")), c("1: .", "2: a", "3: b")) +# table requires splitting, col.names="none" +DT = data.table(x = c(NA, "e", "b", "j", "w", NA)) +test(2258.8, capture.output(print(DT, na.print=".", topn=2, col.names="none")), c(" 1: .", " 2: e", "--- ", " 5: w", " 6: .")) +# table requires splitting, col.names!="none" +test(2258.9, capture.output(print(DT, na.print=".", topn=2)), c(" x", " 1: .", " 2: e", "--- ", " 5: w", " 6: .")) + # split(by = ., sep = ..) works like split(f= ., sep = ..), #5417 x = data.table(rep(1:2, each=5L), 1:5, 1:10) -test(2256.1, names(split(x, by = c("V1", "V2"), sep = "|")), sort(names(split(x, list(x$V1, x$V2), sep = "|")))) -test(2256.2, names(split(x, by = c("V1", "V2"), sep = "||")), sort(names(split(x, list(x$V1, x$V2), sep = "||")))) +test(2259.1, names(split(x, by = c("V1", "V2"), sep = "|")), sort(names(split(x, list(x$V1, x$V2), sep = "|")))) +test(2259.2, names(split(x, by = c("V1", "V2"), sep = "||")), sort(names(split(x, list(x$V1, x$V2), sep = "||")))) diff --git a/man/fread.Rd b/man/fread.Rd index 49b187364..d397a441d 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -9,7 +9,7 @@ \code{fread} is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector. } \usage{ -fread(input, file, text, cmd, sep="auto", sep2="auto", dec=".", quote="\"", +fread(input, file, text, cmd, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), # due to change to ""; see NEWS stringsAsFactors=FALSE, verbose=getOption("datatable.verbose", FALSE), @@ -47,7 +47,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" If type coercion results in an error, introduces \code{NA}s, or would result in loss of accuracy, the coercion attempt is aborted for that column with warning and the column's type is left unchanged. If you really desire data loss (e.g. reading \code{3.14} as \code{integer}) you have to truncate such columns afterwards yourself explicitly so that this is clear to future readers of your code. } \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{utils::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". } - \item{dec}{ The decimal separator as in \code{utils::read.csv}. If not "." (default) then usually ",". See details. } + \item{dec}{ The decimal separator as in \code{utils::read.csv}. When \code{"auto"} (the default), an attempt is made to decide whether \code{"."} or \code{","} is more suitable for this input. See details. } \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. } \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.} \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } @@ -79,9 +79,9 @@ If an empty line is encountered then reading stops there with warning if any tex \bold{Line endings:} All known line endings are detected automatically: \code{\\n} (*NIX including Mac), \code{\\r\\n} (Windows CRLF), \code{\\r} (old Mac) and \code{\\n\\r} (just in case). There is no need to convert input files first. \code{fread} running on any architecture will read a file from any architecture. Both \code{\\r} and \code{\\n} may be embedded in character strings (including column names) provided the field is quoted. -\bold{Decimal separator and locale:} \code{fread(\dots,dec=",")} should just work. \code{fread} uses C function \code{strtod} to read numeric data; e.g., \code{1.23} or \code{1,23}. \code{strtod} retrieves the decimal separator (\code{.} or \code{,} usually) from the locale of the R session rather than as an argument passed to the \code{strtod} function. So for \code{fread(\dots,dec=",")} to work, \code{fread} changes this (and only this) R session's locale temporarily to a locale which provides the desired decimal separator. +\bold{Decimal separator:} \code{dec} is used to parse numeric fields as the separator between integral and fractional parts. When \code{dec='auto'}, during column type detection, when a field is a candidate for being numeric (i.e., parsing as lower types has already failed), \code{dec='.'} is tried, and, if it fails to create a numeric field, \code{dec=','} is tried. At the end of the sample lines, if more were successfully parsed with \code{dec=','}, \code{dec} is set to \code{','}; otherwise, \code{dec} is set to \code{'.'}. -On Windows, "French_France.1252" is tried which should be available as standard (any locale with comma decimal separator would suffice) and on unix "fr_FR.utf8" (you may need to install this locale on unix). \code{fread()} is very careful to set the locale back again afterwards, even if the function fails with an error. The choice of locale is determined by \code{options()$datatable.fread.dec.locale}. This may be a \emph{vector} of locale names and if so they will be tried in turn until the desired \code{dec} is obtained; thus allowing more than two different decimal separators to be selected. This is a new feature in v1.9.6 and is experimental. In case of problems, turn it off with \code{options(datatable.fread.dec.experiment=FALSE)}. +Automatic detection of \code{sep} occurs \emph{prior} to column type detection -- as such, it is possible that \code{sep} has been inferred to be \code{','}, in which case \code{dec} is set to \code{'.'}. \bold{Quotes:} diff --git a/man/melt.data.table.Rd b/man/melt.data.table.Rd index 6dd74291d..ad4dfd8dd 100644 --- a/man/melt.data.table.Rd +++ b/man/melt.data.table.Rd @@ -27,7 +27,7 @@ non-measure columns will be assigned to it. If integer, must be positive; see De \item{ When missing, \code{measure.vars} will become all columns outside \code{id.vars}. } \item{ Vector can be \code{integer} (implying column numbers) or \code{character} (column names). } \item{ \code{list} is a generalization of the vector version -- each element of the list (which should be \code{integer} or \code{character} as above) will become a \code{melt}ed column. } - \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} syntax; multiple patterns will produce multiple columns. } + \item{ Pattern-based column matching can be achieved with the regular expression-based \code{\link{patterns}} (regex without capture groups; matching column names are used in the \code{variable.name} output column), or \code{\link{measure}} (regex with capture groups; each capture group becomes an output column). } } For convenience/clarity in the case of multiple \code{melt}ed columns, resulting column names can be supplied as names to the elements \code{measure.vars} (in the \code{list} and \code{patterns} usages). See also \code{Examples}. } @@ -154,6 +154,11 @@ melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, sep="_ # specifying columns to melt via regex. melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="(.)_(.)")) +melt(DT.missing.cols, measure.vars=measure(value.name, number=as.integer, pattern="([dc])_(.)")) + +# cols arg of measure can be used if you do not want to use regex +melt(DT.missing.cols, measure.vars=measure( + value.name, number=as.integer, sep="_", cols=c("d_1","d_2","c_1"))) } \seealso{ \code{\link{dcast}}, \url{https://cran.r-project.org/package=reshape} diff --git a/man/print.data.table.Rd b/man/print.data.table.Rd index a39c8c446..f740de9d9 100644 --- a/man/print.data.table.Rd +++ b/man/print.data.table.Rd @@ -26,6 +26,7 @@ print.keys=getOption("datatable.print.keys"), # default: TRUE trunc.cols=getOption("datatable.print.trunc.cols"), # default: FALSE quote=FALSE, + na.print=NULL, timezone=FALSE, \dots) format_col(x, \dots) @@ -47,6 +48,7 @@ \item{trunc.cols}{ If \code{TRUE}, only the columns that can be printed in the console without wrapping the columns to new lines will be printed (similar to \code{tibbles}). } \item{quote}{ If \code{TRUE}, all output will appear in quotes, as in \code{print.default}. } \item{timezone}{ If \code{TRUE}, time columns of class POSIXct or POSIXlt will be printed with their timezones (if attribute is available). } + \item{na.print}{ The string to be printed in place of \code{NA} values, as in \code{print.default}. } \item{\dots}{ Other arguments ultimately passed to \code{format}. } } \value{ diff --git a/man/setNumericRounding.Rd b/man/setNumericRounding.Rd index 87ce2256b..f9e00de27 100644 --- a/man/setNumericRounding.Rd +++ b/man/setNumericRounding.Rd @@ -18,8 +18,8 @@ Computers cannot represent some floating point numbers (such as 0.6) precisely, using base 2. This leads to unexpected behaviour when joining or grouping columns of type 'numeric'; i.e. 'double', see example below. In cases where this is undesirable, data.table allows rounding such data up to -approximately 11 s.f. which is plenty of digits for many cases. This is -achieved by rounding the last 2 bytes off the significand. Other possible +approximately 11 significant figures which is plenty of digits for many cases. +This is achieved by rounding the last 2 bytes off the significand. Other possible values are 1 byte rounding, or no rounding (full precision, default). It is bytes rather than bits because it is tied in with the radix sort diff --git a/man/test.Rd b/man/test.Rd index ddf1198bf..d264d98af 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL) + options = NULL, env = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -21,6 +21,7 @@ test(num, x, y = TRUE, \item{notOutput}{ Or if you are testing that a feature does \emph{not} print particular console output. Case insensitive (unlike output) so that the test does not incorrectly pass just because the string is not found due to case. } \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to `test()` (usually, `x`, or maybe `y`) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } +\item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. diff --git a/man/test.data.table.Rd b/man/test.data.table.Rd index c36e5f9d4..37496fddd 100644 --- a/man/test.data.table.Rd +++ b/man/test.data.table.Rd @@ -8,6 +8,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", silent = FALSE, showProgress = interactive() && !silent, + testPattern = NULL, memtest = Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id = NULL) } @@ -17,6 +18,7 @@ test.data.table(script = "tests.Rraw", verbose = FALSE, pkg = ".", \item{pkg}{ Root directory name under which all package content (ex: DESCRIPTION, src/, R/, inst/ etc..) resides. Used only in \emph{dev-mode}. } \item{silent}{ Controls what happens if a test fails. Like \code{silent} in \code{\link{try}}, \code{TRUE} causes the error message to be suppressed and \code{FALSE} to be returned, otherwise the error is returned. } \item{showProgress}{ Output 'Running test ...\\r' at the start of each test? } +\item{testPattern}{ When present, a regular expression tested againt the number of each test for inclusion. Useful for running only a small portion of a large test script. } \item{memtest}{ Measure and report memory usage of tests (1:gc before ps, 2:gc after ps) rather than time taken (0) by default. Intended for and tested on Linux. See PR #5515 for more details. } \item{memtest.id}{ An id for which to print memory usage for every sub id. May be a range of ids. } } diff --git a/src/data.table.h b/src/data.table.h index 21b7e30e0..297167d46 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -238,7 +238,7 @@ bool isRealReallyInt(SEXP x); SEXP isRealReallyIntR(SEXP x); SEXP isReallyReal(SEXP x); bool allNA(SEXP x, bool errorForBadType); -SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups); +SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups, SEXP skip_absent); bool INHERITS(SEXP x, SEXP char_); SEXP copyAsPlain(SEXP x); void copySharedColumns(SEXP x); diff --git a/src/fastmean.c b/src/fastmean.c index 2fcc6ebd2..1c9b3eb64 100644 --- a/src/fastmean.c +++ b/src/fastmean.c @@ -36,7 +36,7 @@ SEXP fastmean(SEXP args) if (length(args)>2) { tmp = CADDR(args); if (!isLogical(tmp) || LENGTH(tmp)!=1 || LOGICAL(tmp)[0]==NA_LOGICAL) - error(_("narm should be TRUE or FALSE")); // # nocov ; [.data.table should construct the .External call correctly + error(_("%s should be TRUE or FALSE"), "narm"); // # nocov ; [.data.table should construct the .External call correctly narm=LOGICAL(tmp)[0]; } PROTECT(ans = allocNAVector(REALSXP, 1)); diff --git a/src/fcast.c b/src/fcast.c index d049711bf..334dfd7e8 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -21,14 +21,17 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SEXP thisfill = fill; const SEXPTYPE thistype = TYPEOF(thiscol); int nprotect = 0; - if(some_fill){ + if (some_fill) { if (isNull(fill)) { - if (LOGICAL(is_agg)[0]) { - thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; - } else thisfill = VECTOR_ELT(fill_d, i); + if (LOGICAL(is_agg)[0]) { + thisfill = PROTECT(allocNAVector(thistype, 1)); nprotect++; + } else + thisfill = VECTOR_ELT(fill_d, i); } if (isVectorAtomic(thiscol)) { // defer error handling to below, but also skip on list - thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++; + // #5980: some callers used fill=list(...) and relied on R's coercion mechanics for lists, which are nontrivial, so just dispatch and double-coerce. + if (isNewList(thisfill)) { thisfill = PROTECT(coerceVector(thisfill, TYPEOF(thiscol))); nprotect++; } + thisfill = PROTECT(coerceAs(thisfill, thiscol, /*copyArg=*/ScalarLogical(false))); nprotect++; } } switch (thistype) { diff --git a/src/fread.c b/src/fread.c index a1521fb37..e2602e596 100644 --- a/src/fread.c +++ b/src/fread.c @@ -33,6 +33,7 @@ static const char *sof, *eof; static char sep; static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t') static char quote, dec; +static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.' static bool eol_one_r; // only true very rarely for \r-only files // Quote rule: @@ -1206,11 +1207,16 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped skip_white(&ch); if (eol(&ch)) return 0; // empty line int field=0; + const bool autoDec = dec == '\0'; while (field>(%d)"), strlim(ch,20), quoteRule); skip_white(&ch); const char *fieldStart = ch; while (tmpType[field]<=CT_STRING) { + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '\0') { // guess . first + dec = '.'; + } + fun[tmpType[field]](&fctx); if (end_of_field(ch)) break; skip_white(&ch); @@ -1234,9 +1240,19 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped } } ch = fieldStart; + if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '.') { // . didn't parse a double; try , + dec = ','; + continue; + } while (++tmpType[field]=eof) break; // The 9th jump could reach the end in the same situation and that's ok. As long as the end is sampled is what we want. bool bumped = false; // did this jump find any different types; to reduce verbose output to relevant lines int jumpLine = 0; // line from this jump point start + linesForDecDot = 0; while(ch0, apply the bumps (if any) at the end of the successfully completed jump sample ASSERT(jump>0, "jump(%d)>0", jump); @@ -1906,7 +1936,17 @@ int freadMain(freadMainArgs _args) { if (args.header==NA_BOOL8) { for (int j=0; j0) for (int j=0; jCT_EMPTY) { args.header=true; diff --git a/src/fread.h b/src/fread.h index 1e2783643..89dea2592 100644 --- a/src/fread.h +++ b/src/fread.h @@ -36,6 +36,8 @@ typedef enum { NUMTYPE // placeholder for the number of types including drop; used for allocation and loop bounds } colType; +#define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT) // types where dec matters + extern int8_t typeSize[NUMTYPE]; extern const char typeName[NUMTYPE][10]; extern const long double pow10lookup[301]; diff --git a/src/freadR.c b/src/freadR.c index 97fbfadac..035c76eda 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -102,9 +102,10 @@ SEXP freadR( error(_("Internal error: freadR sep not a single character. R level catches this.")); // # nocov args.sep = CHAR(STRING_ELT(sepArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level - if (!(isString(decArg) && LENGTH(decArg)==1 && strlen(CHAR(STRING_ELT(decArg,0)))==1)) + if (!isString(decArg) || LENGTH(decArg)!=1 || strlen(CHAR(STRING_ELT(decArg,0)))>1) { error(_("Internal error: freadR dec not a single character. R level catches this.")); // # nocov - args.dec = CHAR(STRING_ELT(decArg,0))[0]; + } + args.dec = CHAR(STRING_ELT(decArg,0))[0]; // '\0' when default "auto" was replaced by "" at R level if (IS_FALSE(quoteArg)) { args.quote = '\0'; diff --git a/src/nafill.c b/src/nafill.c index 03aa6d091..8d50f32ea 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -100,7 +100,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S bool binplace = LOGICAL(inplace)[0]; if (!IS_TRUE_OR_FALSE(nan_is_na_arg)) - error(_("nan_is_na must be TRUE or FALSE")); // # nocov + error(_("%s must be TRUE or FALSE"), "nan_is_na"); // # nocov bool nan_is_na = LOGICAL(nan_is_na_arg)[0]; SEXP x = R_NilValue; @@ -114,7 +114,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S obj = PROTECT(allocVector(VECSXP, 1)); protecti++; // wrap into list SET_VECTOR_ELT(obj, 0, obj1); } - SEXP ricols = PROTECT(colnamesInt(obj, cols, ScalarLogical(TRUE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) + SEXP ricols = PROTECT(colnamesInt(obj, cols, /* check_dups= */ ScalarLogical(TRUE), /* skip_absent= */ ScalarLogical(FALSE))); protecti++; // nafill cols=NULL which turns into seq_along(obj) x = PROTECT(allocVector(VECSXP, length(ricols))); protecti++; int *icols = INTEGER(ricols); for (int i=0; inx) || (icols[i]<1)) + for (int i=0; inx) || (icols[i]<1)) error(_("argument specifying columns received non-existing column(s): cols[%d]=%d"), i+1, icols[i]); // handles NAs also + else if(bskip_absent && icols[i]>nx) + icols[i] = 0L; } } else if (isString(cols)) { SEXP xnames = PROTECT(getAttrib(x, R_NamesSymbol)); protecti++; @@ -133,9 +142,11 @@ SEXP colnamesInt(SEXP x, SEXP cols, SEXP check_dups) { error(_("'x' argument data.table has no names")); ricols = PROTECT(chmatch(cols, xnames, 0)); protecti++; int *icols = INTEGER(ricols); - for (int i=0; i