From 20b213738fdb500369201f6ff584dae3a1bcd44b Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Mon, 8 Jul 2024 14:29:41 -0700 Subject: [PATCH 1/7] progress prototype --- src/dogroups.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/dogroups.c b/src/dogroups.c index a72a7e8c5..689a8d544 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,6 +3,8 @@ #include #include +extern void progress(int, int); + static bool anySpecialStatic(SEXP x) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups @@ -169,7 +171,8 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // because it is a rare edge case for it to be true. See #4892. bool anyNA=false, orderedSubset=false; check_idx(order, length(VECTOR_ELT(dt, 0)), &anyNA, &orderedSubset); - + double start_time; + double total_time = 0; for(int i=0; i-1)) continue; @@ -178,6 +181,8 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // In cases were no i rows match, '|| estn>-1' ensures that the last empty group creates an empty result. // TODO: revisit and tidy + start_time = wallclock(); + if (!isNull(lhs) && (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER))) @@ -435,6 +440,11 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (copied) UNPROTECT(1); } } + double now = wallclock(); + double time_of_group = (double)(now-start_time); + total_time += time_of_group; + double average_time = (double)total_time/(i+1); + progress((int)(100.0*(i+1)/ngrp), (int)(average_time*(ngrp-i-1))); ansloc += maxn; if (firstalloc) { nprotect++; // remember the first jval. If we UNPROTECTed now, we'd unprotect @@ -443,6 +453,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } else UNPROTECT(1); // the jval. Don't want them to build up. The first jval can stay protected till the end ok. } + progress(100, 0); if (isNull(lhs) && ans!=NULL) { if (ansloc < LENGTH(VECTOR_ELT(ans,0))) { if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0))); From 6b09d1b1e5fea454179c90d8400405dd3b35a0fe Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Mon, 15 Jul 2024 12:14:06 -0700 Subject: [PATCH 2/7] fwrite-esque reporting --- src/dogroups.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 689a8d544..a3b07fb44 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -73,6 +73,9 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; const bool verbose = LOGICAL(verboseArg)[0]==1; + bool showProgress = true, hasPrinted = false; + double startTime = wallclock(); + double nextTime = startTime+1; if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov @@ -171,8 +174,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // because it is a rare edge case for it to be true. See #4892. bool anyNA=false, orderedSubset=false; check_idx(order, length(VECTOR_ELT(dt, 0)), &anyNA, &orderedSubset); - double start_time; - double total_time = 0; for(int i=0; i-1)) continue; @@ -181,8 +182,6 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX // In cases were no i rows match, '|| estn>-1' ensures that the last empty group creates an empty result. // TODO: revisit and tidy - start_time = wallclock(); - if (!isNull(lhs) && (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER))) @@ -440,11 +439,18 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (copied) UNPROTECT(1); } } - double now = wallclock(); - double time_of_group = (double)(now-start_time); - total_time += time_of_group; - double average_time = (double)total_time/(i+1); - progress((int)(100.0*(i+1)/ngrp), (int)(average_time*(ngrp-i-1))); + double now; + if (showProgress && (now=wallclock())>=nextTime) { + double avgTimePerGroup = (now-startTime) / (i+1); + int ETA = (int)(avgTimePerGroup*(ngrp-i-1)); + if (hasPrinted || ETA >= 0) { + if (verbose && !hasPrinted) Rprintf(_("\n")); + Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), + i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA); + } + nextTime = now+1; + hasPrinted = true; + } ansloc += maxn; if (firstalloc) { nprotect++; // remember the first jval. If we UNPROTECTed now, we'd unprotect @@ -453,7 +459,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } else UNPROTECT(1); // the jval. Don't want them to build up. The first jval can stay protected till the end ok. } - progress(100, 0); + if (showProgress) Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds.\n"), ngrp, ngrp, 100.0, (int)(wallclock()-startTime), 0); if (isNull(lhs) && ans!=NULL) { if (ansloc < LENGTH(VECTOR_ELT(ans,0))) { if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0))); From dc2320d2427ddaf7b98e21d9bada0e9b4a2a62bc Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Tue, 16 Jul 2024 13:23:30 -0700 Subject: [PATCH 3/7] add as an argument like in fread, update data.table.Rd --- R/data.table.R | 5 +++-- man/data.table.Rd | 2 ++ src/data.table.h | 2 +- src/dogroups.c | 16 +++++++--------- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 1689d8592..ea9409398 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -129,7 +129,7 @@ replace_dot_alias = function(e) { } } -"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL) +"[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive())) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could # test explicitly if the caller is [.data.table (even stronger test. TO DO.) @@ -226,6 +226,7 @@ replace_dot_alias = function(e) { if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") + if (!isTRUEorFALSE(showProgress)) stopf("showProgress must be TRUE or FALSE") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE rightcols = leftcols = integer() @@ -1882,7 +1883,7 @@ replace_dot_alias = function(e) { } ans = c(g, ans) } else { - ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) + ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, showProgress) } # unlock any locked data.table components of the answer, #4159 # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 diff --git a/man/data.table.Rd b/man/data.table.Rd index 680e25574..1ca87859c 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -177,6 +177,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac } \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } + + \item{showProgress}{ \code{TRUE} shows progress indicator with estimated time to completion for lengthy "by" operations. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/src/data.table.h b/src/data.table.h index ee4a55d3a..f7c67b302 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -180,7 +180,7 @@ void warn_matrix_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, - SEXP on, SEXP verbose); + SEXP on, SEXP verbose, SEXP showProgressArg); // bmerge.c SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, diff --git a/src/dogroups.c b/src/dogroups.c index a3b07fb44..d14515150 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,8 +3,6 @@ #include #include -extern void progress(int, int); - static bool anySpecialStatic(SEXP x) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups @@ -65,7 +63,7 @@ static bool anySpecialStatic(SEXP x) { return false; } -SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg) +SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; @@ -73,9 +71,10 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; const bool verbose = LOGICAL(verboseArg)[0]==1; - bool showProgress = true, hasPrinted = false; - double startTime = wallclock(); - double nextTime = startTime+1; + const bool showProgress = LOGICAL(showProgressArg)[0]==1; + bool hasPrinted = false; + double startTime = (showProgress) ? wallclock() : 0; + double nextTime = (showProgress) ? startTime+1 : 0; if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov @@ -445,8 +444,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX int ETA = (int)(avgTimePerGroup*(ngrp-i-1)); if (hasPrinted || ETA >= 0) { if (verbose && !hasPrinted) Rprintf(_("\n")); - Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), - i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA); + Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds."), i+1, ngrp, 100.0*(i+1)/ngrp, (int)(now-startTime), ETA); } nextTime = now+1; hasPrinted = true; @@ -459,7 +457,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } else UNPROTECT(1); // the jval. Don't want them to build up. The first jval can stay protected till the end ok. } - if (showProgress) Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds.\n"), ngrp, ngrp, 100.0, (int)(wallclock()-startTime), 0); + if (showProgress && hasPrinted) Rprintf(_("\rProcessed %d groups out of %d. %.0f%% done. Time elapsed: %ds. ETA: %ds.\n"), ngrp, ngrp, 100.0, (int)(wallclock()-startTime), 0); if (isNull(lhs) && ans!=NULL) { if (ansloc < LENGTH(VECTOR_ELT(ans,0))) { if (verbose) Rprintf(_("Wrote less rows (%d) than allocated (%d).\n"),ansloc,LENGTH(VECTOR_ELT(ans,0))); From 2bafec95cbf127c71edce1d881dac81ef768869e Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Tue, 16 Jul 2024 13:33:46 -0700 Subject: [PATCH 4/7] rd change --- man/data.table.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/data.table.Rd b/man/data.table.Rd index 1ca87859c..729c0861c 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -35,7 +35,8 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac .SDcols, verbose = getOption("datatable.verbose"), # default: FALSE allow.cartesian = getOption("datatable.allow.cartesian"), # default: FALSE - drop = NULL, on = NULL, env = NULL) + drop = NULL, on = NULL, env = NULL, + showProgress = getOption("datatable.showProgress", interactive())) } \arguments{ \item{\dots}{ Just as \code{\dots} in \code{\link{data.frame}}. Usual recycling rules are applied to vectors of different lengths to create a list of equal length vectors.} From b3d9a8d151982616ef46bb058554d65c1eb4fbc3 Mon Sep 17 00:00:00 2001 From: Joshua Wu Date: Wed, 17 Jul 2024 13:29:37 -0700 Subject: [PATCH 5/7] Update R/data.table.R Co-authored-by: Michael Chirico --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index ea9409398..8d4651d88 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -226,7 +226,7 @@ replace_dot_alias = function(e) { if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") - if (!isTRUEorFALSE(showProgress)) stopf("showProgress must be TRUE or FALSE") + if (!isTRUEorFALSE(showProgress)) stopf("%s must be TRUE or FALSE", "showProgress") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE rightcols = leftcols = integer() From f30bf9e8530d20c0ede6bff4d4d85117f2e3a247 Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Thu, 18 Jul 2024 12:40:38 -0700 Subject: [PATCH 6/7] progress printing starts after >3s --- src/dogroups.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index d14515150..69ac8a9a0 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -74,7 +74,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX const bool showProgress = LOGICAL(showProgressArg)[0]==1; bool hasPrinted = false; double startTime = (showProgress) ? wallclock() : 0; - double nextTime = (showProgress) ? startTime+1 : 0; + double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov @@ -440,7 +440,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX } double now; if (showProgress && (now=wallclock())>=nextTime) { - double avgTimePerGroup = (now-startTime) / (i+1); + double avgTimePerGroup = (now-startTime)/(i+1); int ETA = (int)(avgTimePerGroup*(ngrp-i-1)); if (hasPrinted || ETA >= 0) { if (verbose && !hasPrinted) Rprintf(_("\n")); From ad76d6151d175bb53e2e943b2b77b1c188af6d2d Mon Sep 17 00:00:00 2001 From: joshhwuu Date: Mon, 29 Jul 2024 12:47:32 -0700 Subject: [PATCH 7/7] NEWS entry --- NEWS.md | 2 ++ src/dogroups.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index f23859487..6d0900e81 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,8 @@ 14. `fread` loads `.bgz` files directly, [#5461](https://github.com/Rdatatable/data.table/issues/5461). Thanks to @TMRHarrison for the request with proposed fix, and Benjamin Schwendinger for the PR. +15. `[.data.table` gains `showProgress`, allowing users to toggle progress printing for large "by" operations, [#3060](https://github.com/Rdatatable/data.table/issues/3060). Reports information such as number of groups processed, total groups, total time elapsed and estimated time until completion. This feature doesn't apply for `GForce` optimized operations. Thanks to @eatonya, @zachmayer for filing FRs, and to everyone else that up-voted/chimed in on the issue. Thanks to @joshhwuu for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/src/dogroups.c b/src/dogroups.c index 9171866ad..7474345f4 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -438,6 +438,8 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (copied) UNPROTECT(1); } } + // progress printing, #3060 + // could potentially refactor to use fread's progress() function, however we would lose some information in favor of simplicity. double now; if (showProgress && (now=wallclock())>=nextTime) { double avgTimePerGroup = (now-startTime)/(i+1);