Merge branch 'master' into i18n-r-fragments2

Rdatatable · Sep 27, 2024 · b1b9bf7 · b1b9bf7
2 parents d378f7f + 0c957c7
commit b1b9bf7
Show file tree

Hide file tree

Showing 33 changed files with 248 additions and 159 deletions.
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -0,0 +1,55 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+name: test-coverage.yaml
+
+permissions: read-all
+
+jobs:
+  test-coverage:
+    runs-on: ubuntu-latest
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+          r-version: '4.3' # TODO(r-lib/covr#567): Go back to using r-devel
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::covr, any::xml2
+          needs: coverage
+
+      - name: Test coverage
+        run: |
+          cov <- covr::package_coverage(
+            quiet = FALSE,
+            clean = FALSE,
+            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
+          )
+          covr::to_cobertura(cov)
+        shell: Rscript {0}
+
+      - uses: codecov/codecov-action@v4
+        with:
+          fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }}
+          file: ./cobertura.xml
+          plugin: noop
+          disable_search: true
+          token: ${{ secrets.CODECOV_TOKEN }}
+
+      - name: Upload test results
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-test-failures
+          path: ${{ runner.temp }}/package
diff --git a/NEWS.md b/NEWS.md
@@ -63,6 +63,8 @@ rowwiseDT(
     # 15: All values  Total  Total   999   999   NaN    10
     ```
 
+4. `patterns()` in `melt()` combines correctly with user-defined `cols=`, which can be useful to specify a subset of columns to reshape without having to use a regex, for example `patterns("2", cols=c("y1", "y2"))` will only give `y2` even if there are other columns in the input matching `2`, [#6498](https://github.com/Rdatatable/data.table/issues/6498). Thanks to @hongyuanjia for the report, and to @tdhock for the PR.
+
 ## BUG FIXES
 
 1. Using `print.data.table()` with character truncation using `datatable.prettyprint.char` no longer errors with `NA` entries, [#6441](https://github.com/Rdatatable/data.table/issues/6441). Thanks to @r2evans for the bug report, and @joshhwuu for the fix.

diff --git a/R/data.table.R b/R/data.table.R
@@ -2256,8 +2256,8 @@ tail.data.table = function(x, n=6L, ...) {
 
 "$<-.data.table" = function(x, name, value) {
   if (!cedta()) {
-    ans = `$<-.data.frame`(x, name, value)
-    return(setalloccol(ans))           # over-allocate (again)
+    ans = `$<-.data.frame`(x, name, value) # nocov
+    return(setalloccol(ans))           # nocov. over-allocate (again)
   }
   x = copy(x)
   set(x,j=name,value=value)  # important i is missing here
@@ -2430,7 +2430,7 @@ which_ = function(x, bool = TRUE) {
 }
 
 is.na.data.table = function(x) {
-  if (!cedta()) return(is.na.data.frame(x))
+  if (!cedta()) return(is.na.data.frame(x)) # nocov
   do.call(cbind, lapply(x, is.na))
 }
 

diff --git a/R/fmelt.R b/R/fmelt.R
@@ -29,7 +29,7 @@ patterns = function(..., cols=character(0L), ignore.case=FALSE, perl=FALSE, fixe
   p = unlist(L, use.names = any(nzchar(names(L))))
   if (!is.character(p))
     stopf("Input patterns must be of type character.")
-  matched = lapply(p, grep, cols, ignore.case=ignore.case, perl=perl, fixed=fixed, useBytes=useBytes)
+  matched = lapply(p, grep, cols, ignore.case=ignore.case, perl=perl, fixed=fixed, useBytes=useBytes, value=TRUE)
   if (length(idx <- which(lengths(matched) == 0L)))
     stopf(ngettext(length(idx), 'Pattern not found: [%s]', 'Patterns not found: [%s]'), brackify(p[idx]), domain=NA)
   if (length(matched) == 1L) return(matched[[1L]])
@@ -64,14 +64,11 @@ measure = function(..., sep="_", pattern, cols, multiple.keyword="value.name") {
     }
     fun.list[[fun.i]] = fun
   }
-  measurev.args = c(
-    list(fun.list),
-    L[formal.i.vec],
-    list(group.desc="... arguments to measure"))
+  measurev.args = c(list(fun.list), L[formal.i.vec])
   do.call(measurev, measurev.args)
 }
 
-measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name", group.desc="elements of fun.list"){
+measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.name"){
   # 1. basic error checking.
   if (!missing(sep) && !missing(pattern)) {
     stopf("both sep and pattern arguments used; must use either sep or pattern (not both)")
@@ -88,21 +85,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
     which(!nzchar(names(fun.list)))
   }
   if (length(prob.i)) {
-    stopf("in measurev, %s must be named, problems: %s", group.desc, brackify(prob.i))
+    stopf("in measurev, elements of fun.list must be named, problems: %s", brackify(prob.i))
   }
-  err.names.unique = function(err.what, name.vec) {
-    name.tab = table(name.vec)
-    bad.counts = name.tab[1 < name.tab]
-    if (length(bad.counts)) {
-      stopf("%s should be uniquely named, problems: %s", err.what, brackify(names(bad.counts)))
-    }
-  }
-  err.args.groups = function(type, N){
-    if (N != length(fun.list)) {
-      stopf("number of %s =%d must be same as %s =%d", group.desc, length(fun.list), type, N)
-    }
+  if (length(dup.funs <- duplicated_values(names(fun.list)))) {
+    stopf("elements of fun.list should be uniquely named, problems: %s", brackify(dup.funs))
   }
-  err.names.unique(group.desc, names(fun.list))
   # 2. compute initial group data table, used as variable_table attribute.
   group.mat = if (!missing(pattern)) {
     if (!is.character(pattern)) {
@@ -117,7 +104,9 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
     if (is.null(start)) {
       stopf("pattern must contain at least one capture group (parenthesized sub-pattern)")
     }
-    err.args.groups("number of capture groups in pattern", ncol(start))
+    if (ncol(start) != length(fun.list)) {
+      stopf("number of elements of fun.list (%d) must be the same as the number of capture groups in pattern (%d)", length(fun.list), ncol(start))
+    }
     end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L
     measure.vec <- cols[measure.vec.i]
     names.mat = matrix(measure.vec, nrow(start), ncol(start))
@@ -132,12 +121,16 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
     if (n.groups == 1) {
       stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
     }
-    err.args.groups("max number of items after splitting column names", n.groups)
+    if (n.groups != length(fun.list)) {
+      stopf("number of elements of fun.list (%d) must be the same as the max number of items after splitting column names (%d)", length(fun.list), n.groups)
+    }
     measure.vec.i = which(vector.lengths==n.groups)
     measure.vec = cols[measure.vec.i]
     do.call(rbind, list.of.vectors[measure.vec.i])
   }
-  err.names.unique("measured columns", measure.vec)
+  if (length(dup.measures <- duplicated_values(measure.vec))) {
+    stopf("measured columns should be uniquely named, problems: %s", brackify(dup.measures))
+  }
   uniq.mat = unique(group.mat)
   if (nrow(uniq.mat) < nrow(group.mat)) {
     stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat))

diff --git a/R/fread.R b/R/fread.R
@@ -158,6 +158,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     }
     # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804
   }
+  # nocov start. Tested in other.Rraw tests 16, not in the main suite.
   if (yaml) {
     if (!requireNamespace('yaml', quietly = TRUE))
       stopf("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
@@ -267,6 +268,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     }
     if (is.integer(skip)) skip = skip + n_read
   }
+  # nocov end
   warnings2errors = getOption("warn") >= 2
   stopifnot(identical(tz,"UTC") || identical(tz,""))
   if (tz=="") {
@@ -347,7 +349,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
       key = cols_from_csv(key)
     setkeyv(ans, key)
   }
-  if (yaml) setattr(ans, 'yaml_metadata', yaml_header)
+  if (yaml) setattr(ans, 'yaml_metadata', yaml_header) # nocov
   if (!is.null(index) && data.table) {
     if (!all(vapply_1b(index, is.character)))
       stopf("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")

diff --git a/R/fwrite.R b/R/fwrite.R
@@ -91,6 +91,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
       return(invisible())
     }
   }
+  # nocov start. See test 17 in other.Rraw, not tested in the main suite.
   yaml = if (!yaml) "" else {
     if (!requireNamespace('yaml', quietly=TRUE))
       stopf("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
@@ -114,6 +115,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto",
     )
     paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline
   }
+  # nocov end
   file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
   .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
         row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread,

diff --git a/R/last.R b/R/last.R
@@ -1,6 +1,7 @@
 # data.table defined last(x) with no arguments, just for last. If you need the last 10 then use tail(x,10).
 # for xts class objects it will dispatch to xts::last
 # reworked to avoid loading xts namespace (#3857) then again to fix dispatching of xts class (#4053)
+# nocov start. Tests 19.* in other.Rraw, not in the main suite.
 last = function(x, n=1L, ...) {
   verbose = isTRUE(getOption("datatable.verbose", FALSE))
   if (!inherits(x, "xts")) {
@@ -82,3 +83,4 @@ first = function(x, n=1L, ...) {
     xts::first(x, n=n, ...)
   }
 }
+# nocov end
diff --git a/R/utils.R b/R/utils.R
@@ -34,11 +34,17 @@ check_duplicate_names = function(x, table_name=deparse(substitute(x))) {
   if (!anyDuplicated(nm <- names(x))) return(invisible())
   duplicate_names = unique(nm[duplicated(nm)])
   stopf(ngettext(length(duplicate_names),
-                  "%s has duplicated column name %s. Please remove or rename the duplicate and try again.",
-                  "%s has duplicated column names %s. Please remove or rename the duplicates and try again."),
+                 "%s has duplicated column name %s. Please remove or rename the duplicate and try again.",
+                 "%s has duplicated column names %s. Please remove or rename the duplicates and try again."),
         table_name, brackify(duplicate_names), domain=NA)
 }
 
+duplicated_values = function(x) {
+  # fast anyDuplicated for the typical/non-error case; second duplicated() pass for (usually) error case
+  if (!anyDuplicated(x)) return(vector(typeof(x)))
+  unique(x[duplicated(x)])
+}
+
 # TODO(R>=4.0.0): Remove this workaround. From R 4.0.0, rep_len() dispatches rep.Date(), which we need.
 #   Before that, rep_len() strips attributes --> breaks data.table()'s internal recycle() helper.
 #   This also impacts test 2 in S4.Rraw, because the error message differs for rep.int() vs. rep_len().

diff --git a/R/xts.R b/R/xts.R
@@ -1,3 +1,4 @@
+# nocov start. See tests 18.* in other.Rraw, not in the main suite.
 as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) {
   stopifnot(requireNamespace("xts"), !missing(x), xts::is.xts(x))
   if (length(keep.rownames) != 1L) stopf("keep.rownames must be length 1")
@@ -26,3 +27,4 @@ as.xts.data.table = function(x, numeric.only = TRUE, ...) {
   }
   return(xts::xts(as.matrix(r), order.by = if (inherits(x[[1L]], "IDate")) as.Date(x[[1L]]) else x[[1L]]))
 }
+# nocov end
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -12427,6 +12427,9 @@ DTout = data.table(
   value2 = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j")
 )
 test(1866.6, melt(DT, measure.vars = patterns("^x", "^y", cols=names(DT))), DTout)
+# patterns supports cols arg, #6498
+test(1866.71, melt(data.table(x1=1,x2=2,y1=3,y2=4),measure.vars=patterns("2",cols=c("y1","y2"))), data.table(x1=1,x2=2,y1=3,variable=factor("y2"),value=4))
+test(1866.72, DT[, lapply(.SD, sum), .SDcols=patterns("2",cols=c("x1","x2"))], data.table(x2=40L))
 
 # informative errors for bad user-provided cols arg to patterns
 DT = data.table(x1=1,x2=2,y1=3,y2=4)
@@ -17301,11 +17304,11 @@ test(2183.00020, melt(iris.dt, measure.vars=measurev(value.name, dim, sep=".", p
 test(2183.000201, melt(iris.dt, measure.vars=measurev(list(NULL, dim=NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1]")
 test(2183.000202, melt(iris.dt, measure.vars=measurev(list(NULL, NULL), sep=".")), error="in measurev, elements of fun.list must be named, problems: [1, 2]")
 test(2183.00027, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim="bar"), sep=".")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: dim")
-test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list =3 must be same as max number of items after splitting column names =2")
+test(2183.00028, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), sep=".")), error="number of elements of fun.list (3) must be the same as the max number of items after splitting column names (2)")
 test(2183.00042, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function()1), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr")
 test(2183.00043, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=interactive), pattern="([ab])([12])")), error="in the measurev fun.list, each non-NULL element must be a function with at least one argument, problem: istr")
 test(2183.00044, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=function(x)1), pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr")
-test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list =3 must be same as number of capture groups in pattern =2")
+test(2183.00045, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, dim=NULL, baz=NULL), pattern="(.*)[.](.*)")), error="number of elements of fun.list (3) must be the same as the number of capture groups in pattern (2)")
 test(2183.00048, melt(iris.dt, measure.vars=measurev(list(value.name=NULL, value.name=NULL), sep=".")), error="elements of fun.list should be uniquely named, problems: [value.name]")
 # measure with factor conversion.
 myfac = function(x)factor(x)#user-defined conversion function.
@@ -17348,7 +17351,7 @@ test(2183.24, names(melt(iris.dt, measure.vars=measure(value.name, dim, sep=".")
 test(2183.25, names(melt(iris.dt, measure.vars=measure(part, value.name, sep="."))), c("Species", "part", "Length", "Width"))
 test(2183.26, names(melt(iris.dt, measure.vars=measure(part, dim, sep="."))), c("Species", "part", "dim", "value"))
 test(2183.27, melt(iris.dt, measure.vars=measure(value.name, dim="bar", sep=".")), error="each ... argument to measure must be a function with at least one argument, problem: dim")
-test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of ... arguments to measure =3 must be same as max number of items after splitting column names =2")
+test(2183.28, melt(iris.dt, measure.vars=measure(value.name, dim, baz, sep=".")), error="number of elements of fun.list (3) must be the same as the max number of items after splitting column names (2)")
 test(2183.29, melt(iris.dt, measure.vars=measure()), error="each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
 # patterns with iris data.
 test(2183.40, names(melt(iris.dt, measure.vars=patterns("[.]"))), c("Species", "variable", "value"))
@@ -17357,10 +17360,10 @@ test(2183.41, melt(DTid, measure.vars=measure(value.name, istr="bar", pattern="(
 test(2183.42, melt(DTid, measure.vars=measure(value.name, istr=function()1, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr")
 test(2183.43, melt(DTid, measure.vars=measure(value.name, istr=interactive, pattern="([ab])([12])")), error="each ... argument to measure must be a function with at least one argument, problem: istr")
 test(2183.44, melt(DTid, measure.vars=measure(value.name, istr=function(x)1, pattern="([ab])([12])")), error="each conversion function must return an atomic vector with same length as its first argument, problem: istr")
-test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of ... arguments to measure =3 must be same as number of capture groups in pattern =2")
+test(2183.45, melt(iris.dt, measure.vars=measure(value.name, dim, baz, pattern="(.*)[.](.*)")), error="number of elements of fun.list (3) must be the same as the number of capture groups in pattern (2)")
 test(2183.46, melt(iris.dt, measure.vars=measure(function(x)factor(x), dim, pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]")
 test(2183.47, melt(iris.dt, measure.vars=measure(function(x)factor(x), pattern="(.*)[.](.*)")), error="each ... argument to measure must be either a symbol without argument name, or a function with argument name, problems: [1]")
-test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="... arguments to measure should be uniquely named, problems: [value.name]")
+test(2183.48, melt(iris.dt, measure.vars=measure(value.name, value.name, sep=".")), error="elements of fun.list should be uniquely named, problems: [value.name]")
 # measure with factor conversion.
 myfac = function(x)factor(x)#user-defined conversion function.
 test(2183.60, melt(DTid, measure.vars=measure(letter=myfac, value.name, pattern="([ab])([12])")), data.table(id=1, letter=factor(c("a","b")), "2"=c(2,2), "1"=c(NA,1)))

diff --git a/man/measure.Rd b/man/measure.Rd
@@ -18,8 +18,7 @@
 }
 \usage{
 measure(\dots, sep, pattern, cols, multiple.keyword="value.name")
-measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name",
-  group.desc="elements of fun.list")
+measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name")
 }
 \arguments{
   \item{\dots}{One or more (1) symbols (without argument name; symbol
@@ -44,7 +43,6 @@ measurev(fun.list, sep, pattern, cols, multiple.keyword="value.name",
     value columns (with names defined by the unique values in that
     group). Otherwise if the string not used as a group name, then
     measure returns a vector and melt returns a single value column.}
-  \item{group.desc}{Internal, used in error messages.}
 }
 \seealso{
   \code{\link{melt}},

diff --git a/man/patterns.Rd b/man/patterns.Rd
@@ -2,8 +2,8 @@
 \alias{patterns}
 \title{Obtain matching indices corresponding to patterns}
 \description{
-\code{patterns} returns the matching indices in the argument \code{cols}
-corresponding to the regular expression patterns provided. The patterns must be
+\code{patterns} returns the elements of \code{cols}
+that match the regular expression patterns, which must be
 supported by \code{\link[base]{grep}}.
 
 From \code{v1.9.6}, \code{\link{melt.data.table}} has an enhanced functionality