Merge branch 'master' into split-by-sep

Rdatatable · Apr 19, 2024 · 415831c · 415831c
2 parents a00d55e + 47a7f52
commit 415831c
Show file tree

Hide file tree

Showing 27 changed files with 680 additions and 267 deletions.
diff --git a/.dev/cc.R b/.dev/cc.R
@@ -38,6 +38,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) {
     if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly")
     return(invisible())
   }
+  suppressWarnings(rm("getRversion", envir=.GlobalEnv)) # clean up from previous cc() because parseNamespaceFile() run getRversion() in NAMESPACE in .GlobalEnv
   nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function
   if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports)))
   for (ii in seq_along(nsParsedImports)) {
@@ -51,7 +52,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) {
   return(invisible())
 }
 
-cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) {
+cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH", unset="."), CC="gcc", quiet=FALSE) {
   if (!missing(cc_dir)) {
     warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead")
     path = cc_dir

diff --git a/.github/workflows/R-CMD-check-occasional.yaml b/.github/workflows/R-CMD-check-occasional.yaml
@@ -0,0 +1,96 @@
+on:
+  schedule:
+   - cron: '18 13 8 * *' # 8th of month at 13:18 UTC
+
+# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release
+name: R-CMD-check-occasional
+
+jobs:
+  R-CMD-check-occasional:
+    runs-on: ${{ matrix.os }}
+
+    name: ${{ matrix.os }} (${{ matrix.r }})
+
+    strategy:
+      matrix:
+        os: [macOS-latest, windows-latest, ubuntu-latest]
+        r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3']
+        locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502)
+        exclude:
+          - os: ['macOS-latest', 'windows-latest'] # only run non-English locale CI on Ubuntu
+            locale: ['zh_CN.utf8', 'lv_LV.utf8']
+
+    env:
+      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+    steps:
+      - name: Set locale
+        if: matrix.locale == 'en_US.utf8'
+        run: |
+          sudo locale-gen en_US
+          echo "LC_ALL=en_US.utf8" >> $GITHUB_ENV
+
+      - name: Set locale
+        if: matrix.locale == 'zh_CN.utf8'
+        run: |
+          sudo locale-gen zh_CN
+          echo "LC_ALL=zh_CN.utf8" >> $GITHUB_ENV
+          echo "LANGUAGE=zh_CN" >> $GITHUB_ENV
+
+      - name: Set locale
+        if: matrix.locale == 'lv_LV.utf8'
+        run: |
+          sudo locale-gen lv_LV
+          echo "LC_ALL=lv_LV.utf8" >> $GITHUB_ENV
+          echo "LANGUAGE=lv_LV" >> $GITHUB_ENV
+
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.r }}
+
+
+      - name: Query dependencies
+        run: |
+          install.packages('remotes')
+          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
+          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
+        shell: Rscript {0}
+
+      - name: Restore R package cache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.R_LIBS_USER }}
+          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
+
+      - name: Install system dependencies
+        if: runner.os == 'Linux'
+        run: |
+          while read -r cmd
+          do
+            eval sudo $cmd
+          done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))')
+
+      - name: Install dependencies
+        run: |
+          remotes::install_deps(dependencies = TRUE)
+          remotes::install_cran("rcmdcheck")
+        shell: Rscript {0}
+
+      - name: Check
+        env:
+          _R_CHECK_CRAN_INCOMING_REMOTE_: false
+        run: |
+          options(crayon.enabled = TRUE)
+          rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
+        shell: Rscript {0}
+
+      - name: Upload check results
+        if: failure()
+        uses: actions/upload-artifact@main
+        with:
+          name: ${{ runner.os }}-r${{ matrix.r }}-results
+          path: check
diff --git a/.github/workflows/performance-tests.yml b/.github/workflows/performance-tests.yml
@@ -0,0 +1,23 @@
+name: Autocomment atime-based performance regression analysis on PRs
+
+on:
+  pull_request:
+    branches:
+      - '*'
+    types:
+      - opened
+      - reopened
+      - synchronize
+    paths:
+      - 'R/**'
+      - 'src/**'       
+
+jobs:
+  comment:
+    runs-on: ubuntu-latest
+    container: ghcr.io/iterative/cml:0-dvc2-base1
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      repo_token: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: Anirban166/Autocomment-atime-results@v1.1.6
diff --git a/NEWS.md b/NEWS.md
@@ -32,7 +32,11 @@
 
 8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix.
 
-9. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417).
+9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development.
+
+10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR.
+
+11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417).
 
 ## BUG FIXES
 
@@ -74,6 +78,8 @@
 
 11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix.
 
+12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix.
+
 # data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29)  (30 Jan 2024)
 
 ## BREAKING CHANGE

diff --git a/R/fmelt.R b/R/fmelt.R
@@ -107,17 +107,18 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
       stopf("pattern must be character string")
     }
     match.vec = regexpr(pattern, cols, perl=TRUE)
-    measure.vec = which(0 < match.vec)
-    if (length(measure.vec) == 0L) {
+    measure.vec.i = which(0 < match.vec)
+    if (length(measure.vec.i) == 0L) {
       stopf("pattern did not match any cols, so nothing would be melted; fix by changing pattern")
     }
-    start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE]
+    start = attr(match.vec, "capture.start")[measure.vec.i, , drop=FALSE]
     if (is.null(start)) {
       stopf("pattern must contain at least one capture group (parenthesized sub-pattern)")
     }
     err.args.groups("number of capture groups in pattern", ncol(start))
-    end = attr(match.vec, "capture.length")[measure.vec,]+start-1L
-    names.mat = matrix(cols[measure.vec], nrow(start), ncol(start))
+    end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L
+    measure.vec <- cols[measure.vec.i]
+    names.mat = matrix(measure.vec, nrow(start), ncol(start))
     substr(names.mat, start, end)
   } else { #pattern not specified, so split using sep.
     if (!is.character(sep)) {
@@ -130,10 +131,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
       stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
     }
     err.args.groups("max number of items after splitting column names", n.groups)
-    measure.vec = which(vector.lengths==n.groups)
-    do.call(rbind, list.of.vectors[measure.vec])
+    measure.vec.i = which(vector.lengths==n.groups)
+    measure.vec = cols[measure.vec.i]
+    do.call(rbind, list.of.vectors[measure.vec.i])
   }
-  err.names.unique("measured columns", cols[measure.vec])
+  err.names.unique("measured columns", measure.vec)
   uniq.mat = unique(group.mat)
   if (nrow(uniq.mat) < nrow(group.mat)) {
     stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat))

diff --git a/R/fread.R b/R/fread.R
@@ -1,5 +1,5 @@
 fread = function(
-input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto",
+input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
 na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
 skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
 col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
@@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     else if (sep=="auto") sep=""      # sep=="" at C level means auto sep
     else stopifnot( nchar(sep)==1L )  # otherwise an actual character to use as sep
   }
-  stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L )
+  stopifnot( is.character(dec), length(dec)==1L)
+  if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L)
   # handle encoding, #563
   if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) {
     stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")

diff --git a/R/print.data.table.R b/R/print.data.table.R
@@ -8,6 +8,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
                print.keys=getOption("datatable.print.keys"),
                trunc.cols=getOption("datatable.print.trunc.cols"),
                quote=FALSE,
+               na.print=NULL,
                timezone=FALSE, ...) {
   # topn  - print the top topn and bottom topn rows with '---' inbetween (5)
   # nrows - under this the whole (small) table is printed, unless topn is provided (100)
@@ -109,6 +110,13 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
     # When nrow(toprint) = 1, attributes get lost in the subset,
     #   function below adds those back when necessary
     toprint = toprint_subset(toprint, cols_to_print)
+    trunc.cols <- length(not_printed) > 0L
+  }
+  print_default = function(x) {
+    if (col.names != "none") cut_colnames = identity
+    cut_colnames(print(x, right=TRUE, quote=quote, na.print=na.print))
+    # prints names of variables not shown in the print
+    if (trunc.cols) trunc_cols_message(not_printed, abbs, class, col.names)
   }
   if (printdots) {
     if (isFALSE(row.names)) {
@@ -117,30 +125,14 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
       toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn))
     }
     rownames(toprint) = format(rownames(toprint), justify="right")
-    if (col.names == "none") {
-      cut_colnames(print(toprint, right=TRUE, quote=quote))
-    } else {
-      print(toprint, right=TRUE, quote=quote)
-    }
-    if (trunc.cols && length(not_printed) > 0L)
-      # prints names of variables not shown in the print
-      trunc_cols_message(not_printed, abbs, class, col.names)
-
+    print_default(toprint)
     return(invisible(x))
   }
   if (nrow(toprint)>20L && col.names == "auto")
     # repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them
     #   option to shut this off per request of Oleg Bondar on SO, #1482
-    toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97
-  if (col.names == "none") {
-    cut_colnames(print(toprint, right=TRUE, quote=quote))
-  } else {
-    print(toprint, right=TRUE, quote=quote)
-  }
-  if (trunc.cols && length(not_printed) > 0L)
-    # prints names of variables not shown in the print
-    trunc_cols_message(not_printed, abbs, class, col.names)
-
+    toprint = rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97
+  print_default(toprint)
   invisible(x)
 }
 

diff --git a/R/test.data.table.R b/R/test.data.table.R
@@ -1,4 +1,4 @@
-test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent,
+test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, testPattern=NULL,
                            memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) {
   stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress))
   memtest = as.integer(memtest)
@@ -38,7 +38,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
     scripts = scripts[!grepl("bench|other", scripts)]
     scripts = gsub("[.]bz2$","",scripts)
     return(sapply(scripts, function(fn) {
-      err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress))
+      err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress, testPattern=testPattern))
       cat("\n");
       isTRUE(err)
     }))
@@ -140,6 +140,68 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
     if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.")
   }
 
+  # nocov start: only used interactively -- "production" suites should always run in full
+  if (!is.null(testPattern)) {
+    # due to how non-hermetic our tests are, the simple approach (pass this to test(), return early if 'numStr' matches testPattern)
+    #   does not work, or at least getting it to work is not much more efficient (see initial commit of #6040). so instead,
+    #   here we parse the file, extract the tests that match the pattern to a new file, and include other setup lines likely required
+    #   to run the tests successfully. two major drawbacks (1) we can only take a guess which lines are required, so this approach
+    #   can't work (or at least, may need a lot of adjustment) for _every_ test, though not working is also a good sign that test
+    #   should be refactored to be more hermetic (2) not all tests have literal test numbers, meaning we can't always match the
+    #   runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the
+    #   full test expression string, i.e., not just limited to numeric literal test numbers.
+    arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check
+    pd = setDT(utils::getParseData(parse(fn, keep.source=TRUE)))
+    file_lines = readLines(fn)
+    # NB: a call looks like (with id/parent tracking)
+    # <expr>
+    #   <expr "lhs"><SYMBOL_FUNCTION_CALL>name</SYMBOL_FUNCTION_CALL></expr>
+    #   <LEFT_PAREN>(</LEFT_PAREN>
+    #   <expr "arg1"> ... </expr>
+    #   ...
+    #   <RIGHT_PAREN>)</RIGHT_PAREN>
+    # </expr>
+    ## navigate up two steps from 'test' SYMBOL_FUNCTION_CALL to the overall 'expr' for the call
+    test_calls = pd[
+      pd[
+        pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'],
+        list(call_lhs_id=id, call_id=x.parent),
+        on=c(id='parent')],
+      list(line1, id),
+      on=c(id='call_id')]
+    ## all the arguments for each call to test()
+    test_call_args = test_calls[pd[token == 'expr'], list(call_id=parent, arg_line=i.line1, col1, col2), on=c(id='parent'), nomatch=NULL]
+    ## 2nd argument is the num= argument
+    test_num_expr = test_call_args[ , .SD[2L], by="call_id"]
+    # NB: subtle assumption that 2nd arg to test() is all on one line, true as of 2024-Apr and likely to remain so
+    keep_test_ids = test_num_expr[grepl(testPattern, substring(file_lines[arg_line], col1, col2)), call_id]
+    # Now find all tests just previous to the keep tests; we want to keep non-test setup lines between them, e.g.
+    # test(drop, ...)
+    # setup_line1     # retain
+    # setup_line2     # retain
+    # test(keep, ...) # retain
+    intertest_ranges = test_calls[!id %in% keep_test_ids][
+      test_calls[id %in% keep_test_ids],
+      list(preceding_line=x.line1, test_start_line=i.line1),
+      on='line1',
+      roll=TRUE]
+    # TODO(michaelchirico): this doesn't do well with tests inside control statements.
+    #   those could be included by looking for tests with parent!=0, i.e., not-top-level tests,
+    #   and including the full parent for such tests. omitting for now until needed.
+    keep_lines = intertest_ranges[, sort(unique(unlist(Map(function(l, u) l:u, preceding_line+1L, test_start_line))))]
+    header_lines = seq_len(test_calls$line1[1L]-1L)
+
+    tryCatch(error = function(c) warningf("Attempt to subset to %d tests matching '%s' failed, running full suite.", length(keep_test_ids), testPattern), {
+      new_script = file_lines[c(header_lines, keep_lines)]
+      parse(text = new_script) # as noted above the static approach is not fool-proof (yet?), so force the script to at least parse before continuing.
+      fn = tempfile()
+      on.exit(unlink(fn), add=TRUE)
+      catf("Running %d of %d tests matching '%s'\n", length(keep_test_ids), nrow(test_calls), testPattern)
+      writeLines(new_script, fn)
+    })
+  }
+  # nocov end
+
   err = try(sys.source(fn, envir=env), silent=silent)
 
   options(oldOptions)
@@ -251,7 +313,19 @@ gc_mem = function() {
   # nocov end
 }
 
-test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL) {
+test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) {
+  if (!is.null(env)) {
+    old = Sys.getenv(names(env), names=TRUE, unset=NA)
+    to_unset = !lengths(env)
+    # NB: Sys.setenv() (no arguments) errors
+    if (!all(to_unset)) do.call(Sys.setenv, as.list(env[!to_unset]))
+    Sys.unsetenv(names(env)[to_unset])
+    on.exit(add=TRUE, {
+      is_preset = !is.na(old)
+      if (any(is_preset)) do.call(Sys.setenv, as.list(old[is_preset]))
+      Sys.unsetenv(names(old)[!is_preset])
+    })
+  }
   if (!is.null(options)) {
     old_options <- do.call('options', as.list(options)) # as.list(): allow passing named character vector for convenience
     on.exit(options(old_options), add=TRUE)
@@ -418,7 +492,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
     }
   }
   if (!fail && !length(error) && (!length(output) || !missing(y))) {   # TODO test y when output=, too
-    y = try(y,TRUE)
+    capture.output(y <- try(y, silent=TRUE)) # y might produce verbose output, just toss it
     if (identical(x,y)) return(invisible(TRUE))
     all.equal.result = TRUE
     if (is.data.frame(x) && is.data.frame(y)) {