diff --git a/NEWS.md b/NEWS.md index 0ba12fc4b..e3f7c9f4c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -50,6 +50,8 @@ 16. `fcase()` supports scalars in conditions (e.g. supplying just `TRUE`), vectors in `default=` (so the default can vary by row), and `default=` is now lazily evaluated, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks @sindribaldur for the feature request, which has been highly requested, @shrektan for doing most of the implementation, and @MichaelChirico for sewing things up. +17. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index 5b23169c4..fa07593e9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -127,7 +127,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) +"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive())) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) @@ -224,6 +224,7 @@ replace_dot_alias = function(e) { if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") + if (!isTRUEorFALSE(showProgress)) stopf("%s must be TRUE or FALSE", "showProgress") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE rightcols = leftcols = integer() @@ -1901,7 +1902,7 @@ replace_dot_alias = function(e) { } ans = c(g, ans) } else { - ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) + ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, showProgress) } # unlock any locked data.table components of the answer, #4159 # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 diff --git a/man/data.table.Rd b/man/data.table.Rd index 680e25574..729c0861c 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -35,7 +35,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac .SDcols, verbose = getOption("datatable.verbose"), # default: FALSE allow.cartesian = getOption("datatable.allow.cartesian"), # default: FALSE - drop = NULL, on = NULL, env = NULL) + drop = NULL, on = NULL, env = NULL, + showProgress = getOption("datatable.showProgress", interactive())) } \arguments{ \item{\dots}{ Just as \code{\dots} in \code{\link{data.frame}}. Usual recycling rules are applied to vectors of different lengths to create a list of equal length vectors.} @@ -177,6 +178,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac } \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } + + \item{showProgress}{ \code{TRUE} shows progress indicator with estimated time to completion for lengthy "by" operations. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/src/data.table.h b/src/data.table.h index ed63978d6..49f3f1634 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -189,7 +189,7 @@ void warn_matrix_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, - SEXP on, SEXP verbose); + SEXP on, SEXP verbose, SEXP showProgressArg); // bmerge.c SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, diff --git a/src/dogroups.c b/src/dogroups.c index e03ad84df..2728a8bcf 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -63,7 +63,7 @@ static bool anySpecialStatic(SEXP x) { return false; } -SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg) +SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; @@ -71,6 +71,10 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; const bool verbose = LOGICAL(verboseArg)[0]==1; + const bool showProgress = LOGICAL(showProgressArg)[0]==1; + bool hasPrinted = false; + double startTime = (showProgress) ? wallclock() : 0; + double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov @@ -169,7 +173,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // because it is a rare edge case for it to be true. See #4892. bool anyNA=false, orderedSubset=false; check_idx(order, length(VECTOR_ELT(dt, 0)), &anyNA, &orderedSubset); - for(int i=0; i-1)) continue; @@ -435,6 +438,19 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (copied) UNPROTECT(1); } } + // progress printing, #3060 + // could potentially refactor to use fread's progress() function, however we would lose some information in favor of simplicity. + double now; + if (showProgress && (now=wallclock())>=nextTime) { + double avgTimePerGroup = (now-startTime)/(i+1); + int ETA = (int)(avgTimePerGroup*(ngrp-i-1)); + if (hasPrinted || ETA >= 0) { + if (verbose && !hasPrinted) Rprintf(_("\n")); + Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA); + } + nextTime = now+1; + hasPrinted = true; + } ansloc += maxn; if (firstalloc) { nprotect++; // remember the first jval. If we UNPROTECTed now, we'd unprotect @@ -443,6 +459,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } else UNPROTECT(1); // the jval. Don't want them to build up. The first jval can stay protected till the end ok. } + if (showProgress && hasPrinted) Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds.\n"), ngrp, ngrp, 100.0, (int)(wallclock()-startTime), 0); if (isNull(lhs) && ans!=NULL) { if (ansloc < LENGTH(VECTOR_ELT(ans,0))) { if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0)));