Skip to content

Commit

Permalink
Merge branch 'master' into split-by-sep
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico committed Apr 19, 2024
2 parents a00d55e + 47a7f52 commit 415831c
Show file tree
Hide file tree
Showing 27 changed files with 680 additions and 267 deletions.
3 changes: 2 additions & 1 deletion .dev/cc.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) {
if (!quiet) warning("No NAMESPACE file found, required to guarantee imports resolve correctly")
return(invisible())
}
suppressWarnings(rm("getRversion", envir=.GlobalEnv)) # clean up from previous cc() because parseNamespaceFile() run getRversion() in NAMESPACE in .GlobalEnv
nsParsedImports = parseNamespaceFile(basename(path), "..")$imports # weird signature to this function
if (!quiet && length(nsParsedImports)) cat(sprintf("Ensuring objects from %d import entries in NAMESPACE resolve correctly\n", length(nsParsedImports)))
for (ii in seq_along(nsParsedImports)) {
Expand All @@ -51,7 +52,7 @@ sourceImports = function(path=getwd(), quiet=FALSE) {
return(invisible())
}

cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH"), CC="gcc", quiet=FALSE) {
cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, cc_dir, path=Sys.getenv("PROJ_PATH", unset="."), CC="gcc", quiet=FALSE) {
if (!missing(cc_dir)) {
warning("'cc_dir' arg is deprecated, use 'path' argument or 'PROJ_PATH' env var instead")
path = cc_dir
Expand Down
96 changes: 96 additions & 0 deletions .github/workflows/R-CMD-check-occasional.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
on:
schedule:
- cron: '18 13 8 * *' # 8th of month at 13:18 UTC

# A more complete suite of checks to run monthly; each PR/merge need not pass all these, but they should pass before CRAN release
name: R-CMD-check-occasional

jobs:
R-CMD-check-occasional:
runs-on: ${{ matrix.os }}

name: ${{ matrix.os }} (${{ matrix.r }})

strategy:
matrix:
os: [macOS-latest, windows-latest, ubuntu-latest]
r: ['devel', 'release', '3.2', '3.3', '3.4', '3.5', '3.6', '4.0', '4.1', '4.2', '4.3']
locale: ['en_US.utf8', 'zh_CN.utf8', 'lv_LV.utf8'] # Chinese for translations, Latvian for collate order (#3502)
exclude:
- os: ['macOS-latest', 'windows-latest'] # only run non-English locale CI on Ubuntu
locale: ['zh_CN.utf8', 'lv_LV.utf8']

env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- name: Set locale
if: matrix.locale == 'en_US.utf8'
run: |
sudo locale-gen en_US
echo "LC_ALL=en_US.utf8" >> $GITHUB_ENV
- name: Set locale
if: matrix.locale == 'zh_CN.utf8'
run: |
sudo locale-gen zh_CN
echo "LC_ALL=zh_CN.utf8" >> $GITHUB_ENV
echo "LANGUAGE=zh_CN" >> $GITHUB_ENV
- name: Set locale
if: matrix.locale == 'lv_LV.utf8'
run: |
sudo locale-gen lv_LV
echo "LC_ALL=lv_LV.utf8" >> $GITHUB_ENV
echo "LANGUAGE=lv_LV" >> $GITHUB_ENV
- uses: actions/checkout@v2

- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.r }}


- name: Query dependencies
run: |
install.packages('remotes')
saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
shell: Rscript {0}

- name: Restore R package cache
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-

- name: Install system dependencies
if: runner.os == 'Linux'
run: |
while read -r cmd
do
eval sudo $cmd
done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))')
- name: Install dependencies
run: |
remotes::install_deps(dependencies = TRUE)
remotes::install_cran("rcmdcheck")
shell: Rscript {0}

- name: Check
env:
_R_CHECK_CRAN_INCOMING_REMOTE_: false
run: |
options(crayon.enabled = TRUE)
rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
shell: Rscript {0}

- name: Upload check results
if: failure()
uses: actions/upload-artifact@main
with:
name: ${{ runner.os }}-r${{ matrix.r }}-results
path: check
23 changes: 23 additions & 0 deletions .github/workflows/performance-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: Autocomment atime-based performance regression analysis on PRs

on:
pull_request:
branches:
- '*'
types:
- opened
- reopened
- synchronize
paths:
- 'R/**'
- 'src/**'

jobs:
comment:
runs-on: ubuntu-latest
container: ghcr.io/iterative/cml:0-dvc2-base1
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
repo_token: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: Anirban166/Autocomment-atime-results@v1.1.6
8 changes: 7 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@
8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix.
9. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417).
9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development.
10. `measure` now supports user-specified `cols` argument, which can be useful to specify a subset of columns to `melt`, without having to use a regex, [#5063](https://github.com/Rdatatable/data.table/issues/5063). Thanks to @UweBlock and @Henrik-P for reporting, and @tdhock for the PR.
11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417).
## BUG FIXES
Expand Down Expand Up @@ -74,6 +78,8 @@
11. Using `print.data.table` when truncation is needed with `row.names = FALSE` prints the indicator `---` in every value column instead of adding a blank column where the `rownames` would have been just to include `---`, [#4083](https://github.com/Rdatatable/data.table/issues/4083). Thanks @MichaelChirico for the report and @joshhwuu for the fix.
12. `print.data.table` now honors `na.print`, as seen in `print.default`, allowing for string replacement of `NA` values when printing. Thanks @HughParsonage for the report and @joshhwuu for the fix.
# data.table [v1.15.0](https://github.com/Rdatatable/data.table/milestone/29) (30 Jan 2024)
## BREAKING CHANGE
Expand Down
18 changes: 10 additions & 8 deletions R/fmelt.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,18 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
stopf("pattern must be character string")
}
match.vec = regexpr(pattern, cols, perl=TRUE)
measure.vec = which(0 < match.vec)
if (length(measure.vec) == 0L) {
measure.vec.i = which(0 < match.vec)
if (length(measure.vec.i) == 0L) {
stopf("pattern did not match any cols, so nothing would be melted; fix by changing pattern")
}
start = attr(match.vec, "capture.start")[measure.vec, , drop=FALSE]
start = attr(match.vec, "capture.start")[measure.vec.i, , drop=FALSE]
if (is.null(start)) {
stopf("pattern must contain at least one capture group (parenthesized sub-pattern)")
}
err.args.groups("number of capture groups in pattern", ncol(start))
end = attr(match.vec, "capture.length")[measure.vec,]+start-1L
names.mat = matrix(cols[measure.vec], nrow(start), ncol(start))
end = attr(match.vec, "capture.length")[measure.vec.i,]+start-1L
measure.vec <- cols[measure.vec.i]
names.mat = matrix(measure.vec, nrow(start), ncol(start))
substr(names.mat, start, end)
} else { #pattern not specified, so split using sep.
if (!is.character(sep)) {
Expand All @@ -130,10 +131,11 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")
}
err.args.groups("max number of items after splitting column names", n.groups)
measure.vec = which(vector.lengths==n.groups)
do.call(rbind, list.of.vectors[measure.vec])
measure.vec.i = which(vector.lengths==n.groups)
measure.vec = cols[measure.vec.i]
do.call(rbind, list.of.vectors[measure.vec.i])
}
err.names.unique("measured columns", cols[measure.vec])
err.names.unique("measured columns", measure.vec)
uniq.mat = unique(group.mat)
if (nrow(uniq.mat) < nrow(group.mat)) {
stopf("number of unique column IDs =%d is less than number of melted columns =%d; fix by changing pattern/sep", nrow(uniq.mat), nrow(group.mat))
Expand Down
5 changes: 3 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
fread = function(
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto",
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
Expand All @@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
else if (sep=="auto") sep="" # sep=="" at C level means auto sep
else stopifnot( nchar(sep)==1L ) # otherwise an actual character to use as sep
}
stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L )
stopifnot( is.character(dec), length(dec)==1L)
if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L)
# handle encoding, #563
if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) {
stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
Expand Down
30 changes: 11 additions & 19 deletions R/print.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
print.keys=getOption("datatable.print.keys"),
trunc.cols=getOption("datatable.print.trunc.cols"),
quote=FALSE,
na.print=NULL,
timezone=FALSE, ...) {
# topn - print the top topn and bottom topn rows with '---' inbetween (5)
# nrows - under this the whole (small) table is printed, unless topn is provided (100)
Expand Down Expand Up @@ -109,6 +110,13 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
# When nrow(toprint) = 1, attributes get lost in the subset,
# function below adds those back when necessary
toprint = toprint_subset(toprint, cols_to_print)
trunc.cols <- length(not_printed) > 0L
}
print_default = function(x) {
if (col.names != "none") cut_colnames = identity
cut_colnames(print(x, right=TRUE, quote=quote, na.print=na.print))
# prints names of variables not shown in the print
if (trunc.cols) trunc_cols_message(not_printed, abbs, class, col.names)
}
if (printdots) {
if (isFALSE(row.names)) {
Expand All @@ -117,30 +125,14 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"),
toprint = rbind(head(toprint, topn + isTRUE(class)), "---"="", tail(toprint, topn))
}
rownames(toprint) = format(rownames(toprint), justify="right")
if (col.names == "none") {
cut_colnames(print(toprint, right=TRUE, quote=quote))
} else {
print(toprint, right=TRUE, quote=quote)
}
if (trunc.cols && length(not_printed) > 0L)
# prints names of variables not shown in the print
trunc_cols_message(not_printed, abbs, class, col.names)

print_default(toprint)
return(invisible(x))
}
if (nrow(toprint)>20L && col.names == "auto")
# repeat colnames at the bottom if over 20 rows so you don't have to scroll up to see them
# option to shut this off per request of Oleg Bondar on SO, #1482
toprint=rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97
if (col.names == "none") {
cut_colnames(print(toprint, right=TRUE, quote=quote))
} else {
print(toprint, right=TRUE, quote=quote)
}
if (trunc.cols && length(not_printed) > 0L)
# prints names of variables not shown in the print
trunc_cols_message(not_printed, abbs, class, col.names)

toprint = rbind(toprint, matrix(if (quote) old else colnames(toprint), nrow=1L)) # fixes bug #97
print_default(toprint)
invisible(x)
}

Expand Down
82 changes: 78 additions & 4 deletions R/test.data.table.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent,
test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=FALSE, showProgress=interactive()&&!silent, testPattern=NULL,
memtest=Sys.getenv("TEST_DATA_TABLE_MEMTEST", 0), memtest.id=NULL) {
stopifnot(isTRUEorFALSE(verbose), isTRUEorFALSE(silent), isTRUEorFALSE(showProgress))
memtest = as.integer(memtest)
Expand Down Expand Up @@ -38,7 +38,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
scripts = scripts[!grepl("bench|other", scripts)]
scripts = gsub("[.]bz2$","",scripts)
return(sapply(scripts, function(fn) {
err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress))
err = try(test.data.table(script=fn, verbose=verbose, pkg=pkg, silent=silent, showProgress=showProgress, testPattern=testPattern))
cat("\n");
isTRUE(err)
}))
Expand Down Expand Up @@ -140,6 +140,68 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
if (is.na(rss())) stopf("memtest intended for Linux. Step through data.table:::rss() to see what went wrong.")
}

# nocov start: only used interactively -- "production" suites should always run in full
if (!is.null(testPattern)) {
# due to how non-hermetic our tests are, the simple approach (pass this to test(), return early if 'numStr' matches testPattern)
# does not work, or at least getting it to work is not much more efficient (see initial commit of #6040). so instead,
# here we parse the file, extract the tests that match the pattern to a new file, and include other setup lines likely required
# to run the tests successfully. two major drawbacks (1) we can only take a guess which lines are required, so this approach
# can't work (or at least, may need a lot of adjustment) for _every_ test, though not working is also a good sign that test
# should be refactored to be more hermetic (2) not all tests have literal test numbers, meaning we can't always match the
# runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the
# full test expression string, i.e., not just limited to numeric literal test numbers.
arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check
pd = setDT(utils::getParseData(parse(fn, keep.source=TRUE)))
file_lines = readLines(fn)
# NB: a call looks like (with id/parent tracking)
# <expr>
# <expr "lhs"><SYMBOL_FUNCTION_CALL>name</SYMBOL_FUNCTION_CALL></expr>
# <LEFT_PAREN>(</LEFT_PAREN>
# <expr "arg1"> ... </expr>
# ...
# <RIGHT_PAREN>)</RIGHT_PAREN>
# </expr>
## navigate up two steps from 'test' SYMBOL_FUNCTION_CALL to the overall 'expr' for the call
test_calls = pd[
pd[
pd[token == 'SYMBOL_FUNCTION_CALL' & text == 'test'],
list(call_lhs_id=id, call_id=x.parent),
on=c(id='parent')],
list(line1, id),
on=c(id='call_id')]
## all the arguments for each call to test()
test_call_args = test_calls[pd[token == 'expr'], list(call_id=parent, arg_line=i.line1, col1, col2), on=c(id='parent'), nomatch=NULL]
## 2nd argument is the num= argument
test_num_expr = test_call_args[ , .SD[2L], by="call_id"]
# NB: subtle assumption that 2nd arg to test() is all on one line, true as of 2024-Apr and likely to remain so
keep_test_ids = test_num_expr[grepl(testPattern, substring(file_lines[arg_line], col1, col2)), call_id]
# Now find all tests just previous to the keep tests; we want to keep non-test setup lines between them, e.g.
# test(drop, ...)
# setup_line1 # retain
# setup_line2 # retain
# test(keep, ...) # retain
intertest_ranges = test_calls[!id %in% keep_test_ids][
test_calls[id %in% keep_test_ids],
list(preceding_line=x.line1, test_start_line=i.line1),
on='line1',
roll=TRUE]
# TODO(michaelchirico): this doesn't do well with tests inside control statements.
# those could be included by looking for tests with parent!=0, i.e., not-top-level tests,
# and including the full parent for such tests. omitting for now until needed.
keep_lines = intertest_ranges[, sort(unique(unlist(Map(function(l, u) l:u, preceding_line+1L, test_start_line))))]
header_lines = seq_len(test_calls$line1[1L]-1L)

tryCatch(error = function(c) warningf("Attempt to subset to %d tests matching '%s' failed, running full suite.", length(keep_test_ids), testPattern), {
new_script = file_lines[c(header_lines, keep_lines)]
parse(text = new_script) # as noted above the static approach is not fool-proof (yet?), so force the script to at least parse before continuing.
fn = tempfile()
on.exit(unlink(fn), add=TRUE)
catf("Running %d of %d tests matching '%s'\n", length(keep_test_ids), nrow(test_calls), testPattern)
writeLines(new_script, fn)
})
}
# nocov end

err = try(sys.source(fn, envir=env), silent=silent)

options(oldOptions)
Expand Down Expand Up @@ -251,7 +313,19 @@ gc_mem = function() {
# nocov end
}

test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL) {
test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) {
if (!is.null(env)) {
old = Sys.getenv(names(env), names=TRUE, unset=NA)
to_unset = !lengths(env)
# NB: Sys.setenv() (no arguments) errors
if (!all(to_unset)) do.call(Sys.setenv, as.list(env[!to_unset]))
Sys.unsetenv(names(env)[to_unset])
on.exit(add=TRUE, {
is_preset = !is.na(old)
if (any(is_preset)) do.call(Sys.setenv, as.list(old[is_preset]))
Sys.unsetenv(names(old)[!is_preset])
})
}
if (!is.null(options)) {
old_options <- do.call('options', as.list(options)) # as.list(): allow passing named character vector for convenience
on.exit(options(old_options), add=TRUE)
Expand Down Expand Up @@ -418,7 +492,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
}
}
if (!fail && !length(error) && (!length(output) || !missing(y))) { # TODO test y when output=, too
y = try(y,TRUE)
capture.output(y <- try(y, silent=TRUE)) # y might produce verbose output, just toss it
if (identical(x,y)) return(invisible(TRUE))
all.equal.result = TRUE
if (is.data.frame(x) && is.data.frame(y)) {
Expand Down
Loading

0 comments on commit 415831c

Please sign in to comment.