Rdatatable · ben-schwen · Sep 18, 2021 · Sep 18, 2021 · Sep 18, 2021 · Sep 18, 2021
@@ -56,6 +56,78 @@
 
 18. New `setdroplevels()` as a by-reference version of the `droplevels()` method, which returns a copy of its input, [#6014](https://github.com/Rdatatable/data.table/issues/6014). Thanks @MichaelChirico for the suggestion and implementation.
 
+19. `first()` and `last()` gain `na.rm` taking values `FALSE` (default), `TRUE` or `"row"`, [#4239](https://github.com/Rdatatable/data.table/issues/4239). For vector input, `TRUE` and `"row"` are the same. For `data.table` or `data.frame` input, `TRUE` returns the first/last non-NA observation in each column, while `"row"` returns the first/last row where all columns are non-NA. `TRUE` is optimized by group; `"row"` may be optimized by group in the future. `n>1` with `na.rm=TRUE` is also optimized by group. Thanks to Nicolas Bennett and Michael Chirico for the requests, and Benjamin Schwendinger for the PR.
+
+    ```R
+    x
+    # [1] NA  1  2 NA
+
+    first(x)
+    # NA
+
+    first(x, na.rm=TRUE)
+    # 1
+
+    last(x, na.rm=TRUE)
+    # 2
+
+    DT
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+    #2:     1     4    NA
+    #3:     2     5    NA
+    #4:     2     6    NA
+
+    last(DT, na.rm=TRUE)
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     2     6     7
+
+    last(DT, na.rm="row")
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+
+    DT[, last(.SD, na.rm=TRUE), by=grp]
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     4     7
+    #2:     2     6    NA
+
+    DT[, last(.SD, na.rm="row"), by=grp]
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+    #2:     2    NA    NA
+
+    DT[, last(na.omit(.SD)), by=grp]  # same as na.rm='row' but drops all-NA groups
+    #     grp     A     B
+    #   <int> <int> <int>
+    #1:     1     3     7
+
+    set.seed(1)
+    DT = data.table(id=rep(1:1e6, each=10),
+                     v=sample(c(1:5,NA), 10e6, replace=TRUE))
+    DT
+    #                id     v
+    #             <int> <int>
+    #        1:       1     2
+    #        2:       1     3
+    #        3:       1     4
+    #        4:       1    NA
+    #        5:       1     2
+    #       ---              
+    #  9999996: 1000000     3
+    #  9999997: 1000000    NA
+    #  9999998: 1000000    NA
+    #  9999999: 1000000     1
+    # 10000000: 1000000     4
+    ans1 = DT[, last(na.omit(v)), by=id]       # 18.7 sec
+    ans2 = DT[, last(v, na.rm=TRUE), by=id]    #  0.1 sec
+    identical(ans1, ans2)                      # TRUE
+    ```
+
 ## BUG FIXES
 
 1. `unique()` returns a copy in the case when `nrow(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.

@@ -1653,7 +1653,8 @@ replace_dot_alias = function(e) {
            (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) &&
         (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N")
       headopt = jsub %iscall% c("head", "tail")
-      firstopt = jsub %iscall% c("first", "last") # fix for #2030
+      firstopt = jsub %iscall% c("first", "last") &&                    # 2030, 4239
+                 !identical(match.call(first, jsub)[["na.rm"]], "row")  # first's signature same as last's
       if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") &&
           (subopt || headopt || firstopt)) {
         if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462
@@ -1860,7 +1861,9 @@ replace_dot_alias = function(e) {
     assign(".N", len__, thisEnv) # For #334
     #fix for #1683
     if (use.I) assign(".I", seq_len(nrow(x)), thisEnv)
-    ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971.
+    ans = gforce(thisEnv, jsub, o__, f__, len__, irows,  # irows needed for #971
+                 .Call(CsubsetVector, groups, grpcols),  # just a list() subset to make C level neater; doesn't copy column contents
+                 lhs)  # for now this just prevents := with new feature first/last n>1; in future see TODO below
     gi = if (length(o__)) o__[f__] else f__
     g = lapply(grpcols, function(i) .Call(CsubsetVector, groups[[i]], gi)) # use CsubsetVector instead of [ to preserve attributes #5567
 
@@ -1922,10 +1925,10 @@ replace_dot_alias = function(e) {
   # Grouping by by: i is by val, icols NULL, o__ may be subset of x, f__ points to o__ (or x if !length o__)
   # TO DO: setkey could mark the key whether it is unique or not.
   if (!is.null(lhs)) {
-    if (GForce) { # GForce should work with := #1414
-      vlen = length(ans[[1L]])
+    if (GForce) { # GForce should work with := #1414. TODO: move down into gforce at C level to save creating/rep'ing ans and grpcols wastefully
+      vlen = length(ans[[1L]])   # TODO: this might be ngrp when na.rm=TRUE and one group has 2 and another 0, so needs enhancing here (by passing all-1 back from gans?)
       # replicate vals if GForce returns 1 value per group
-      jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep, times=len__) else tail(ans, -length(g))  # see comment in #4245 for why rep instead of rep.int
+      jvals = if (vlen==length(len__)) lapply(tail(ans, -length(grpcols)), rep, times=len__) else tail(ans, -length(grpcols))  # see comment in #4245 for why rep instead of rep.int
       jrows = vecseq(f__,len__,NULL)
       if (length(o__)) jrows = o__[jrows]
       if (length(irows)) jrows = irows[jrows]
@@ -3016,8 +3019,8 @@ gfuns = c(gdtfuns,
 `g[` = `g[[` = function(x, n) .Call(Cgnthvalue, x, as.integer(n)) # n is of length=1 here.
 ghead = function(x, n) .Call(Cghead, x, as.integer(n))
 gtail = function(x, n) .Call(Cgtail, x, as.integer(n))
-gfirst = function(x) .Call(Cgfirst, x)
-glast = function(x) .Call(Cglast, x)
+gfirst = function(x, n=1L, na.rm=FALSE) .Call(Cgfirst, x, as.integer(n), na.rm)  # hands x to C routine Cgfirst with n coerced to integer; na.rm is passed through without any R-level check -- presumably validated in C, TODO confirm
+glast = function(x, n=1L, na.rm=FALSE) .Call(Cglast, x, as.integer(n), na.rm)  # mirror of gfirst for C routine Cglast: same arguments, same integer coercion of n
 gsum = function(x, na.rm=FALSE) .Call(Cgsum, x, na.rm)
 gmean = function(x, na.rm=FALSE) .Call(Cgmean, x, na.rm)
 gweighted.mean = function(x, w, ..., na.rm=FALSE) {
@@ -3042,7 +3045,7 @@ gshift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift", "cyclic")) {
   stopifnot(is.numeric(n))
   .Call(Cgshift, x, as.integer(n), fill, type)
 }
-gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows)
+gforce = function(env, jsub, o, f, l, rows, grpcols, lhs) .Call(Cgforce, env, jsub, o, f, l, rows, grpcols, lhs)  # passes all eight arguments unchanged, in order, to C routine Cgforce; grpcols and lhs are the trailing two
 
 # GForce needs to evaluate all arguments not present in the data.table before calling C part #5547
 # Safe cases: variables [i], calls without variables [c(0,1), list(1)] # TODO extend this list

@@ -1,84 +1,82 @@
-# data.table defined last(x) with no arguments, just for last. If you need the last 10 then use tail(x,10).
-# for xts class objects it will dispatch to xts::last
-# reworked to avoid loading xts namespace (#3857) then again to fix dispatching of xts class (#4053)
-last = function(x, n=1L, ...) {
-  verbose = isTRUE(getOption("datatable.verbose", FALSE))
-  if (!inherits(x, "xts")) {
-    if (nargs()>1L) {
-      if ("package:xts" %chin% search()) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "xts::last", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-        xts::last(x, n=n, ...)
-      } else {
-        # nocov start
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-        utils::tail(x, n=n, ...)
-        # nocov end
-      }
+# data.table originally defined first(x) and last(x) with no further arguments, returning just the
+# single first/last observation. n= was added later because xts::last has n, so it now makes
+# sense to support n here too. The differences from head/tail are the default (n=1 vs n=6) and
+# that first/last are not generic, which keeps them fast when called by group.
+
+first = function(x, n=1L, na.rm=FALSE, ...) {  # first n items of x (default n=1); all work is delegated to .firstlast() with first=TRUE
+  .firstlast(x, n=n, na.rm=na.rm, first=TRUE, ...)
+}
+
+last = function(x, n=1L, na.rm=FALSE, ...) {  # last n items of x (default n=1); all work is delegated to .firstlast() with first=FALSE
+  .firstlast(x, n=n, na.rm=na.rm, first=FALSE, ...)
+}
+
+.firstlast = function(x, n=1L, na.rm=FALSE, first=TRUE, ...) {  # shared implementation behind first() and last(); first=TRUE selects from the start of x, FALSE from the end
+  if (inherits(x, "xts")) {
+    if (isTRUE(getOption("datatable.verbose", FALSE)))
+      catf("using %s\n", if (first) "xts::first" else "xts::last")
+    return((if (first) xts::first else xts::last)(x, n=n, na.rm=na.rm, ...))  # xts input is handed to xts::first/xts::last before any argument validation below
+  }
+  stopifnot(isTRUEorFALSE(na.rm) || identical(na.rm,"row"))  # na.rm must be TRUE, FALSE or the string "row"
+  stopifnot(is.numeric(n), length(n)==1L, n>=0L)  # n must be a single non-negative number
+  n = as.integer(n)
+  if (is.data.frame(x)) {
+    if (!nrow(x)) return(x)  # zero-row input is returned unchanged
+    if (identical(na.rm, "row")) {   # any NA on the row removes that row
+      nna = which_(.Call(Cdt_na, x, seq_along(x)), bool=FALSE)  # presumably the positions of rows with no NA in any column -- confirm against Cdt_na and which_
+      # very similar to na.omit.data.table
+      # TODO: n and first/last could be passed to Cdt_na and it could stop after finding n (it already does that in gsumm.c when gforce optimized)
+      nna = .firstlastVector(nna, n=n, first=first, na.rm=FALSE)  # keep only the first/last n of those row positions
+      ans = .Call(CsubsetDT, x, nna, seq_along(x))  # works on DF too
     } else {
-      dx = dim(x)
-      if (is.null(dx)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "'x[[length(x)]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))")
-        lx = length(x)
-        if (!lx) x else x[[lx]]
-      } else if (is.data.frame(x)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "'x[nrow(x),]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)")
-        x[dx[1L], , drop=FALSE]
-      } else {
-        if (verbose)
-          catf("%s: using %s: %s\n", "last", "utils::tail", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
-        utils::tail(x, n=n, ...)
+      ans = lapply(x, .firstlastVector, n=n, first=first, na.rm=na.rm)  # each column is processed on its own, so with na.rm=TRUE the columns can come back with different lengths
+      if (na.rm) {  # na.rm is TRUE or FALSE here; "row" was handled in the branch above
+        l = vapply_1i(ans, length)
+        m = max(l)
+        for (i in which(l<m)) {
+          ans[[i]] = c(ans[[i]], rep(NA, m-l[i]))  # pad shorter columns with trailing NA up to the longest column's length m
+        }
+        # any row.names won't align to the values now in the result so don't retain them
       }
     }
-  } else {
-    if (!requireNamespace("xts", quietly=TRUE))
-      stopf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::last") # nocov
-    if (verbose)
-      catf("%s: using %s: %s\n", "last", "xts::last", "is.xts(x)")
-    xts::last(x, n=n, ...)
+    if (is.data.table(x)) setDT(ans) else setDF(ans)  # result container follows the input: data.table in, data.table out; otherwise data.frame
+    setattr(ans, "class", class(x))  # then copy x's full class vector onto the result
+    if (!isTRUE(na.rm) && length(rn<-attr(x,"row.names")))  # row.names are carried over only for na.rm=FALSE or "row"
+      setattr(ans, "row.names", if (isFALSE(na.rm)) .firstlastVector(rn, n=n, first=first, na.rm=FALSE)
+                                               else rn[nna])  # "row": names of exactly the rows selected via nna
+    return(ans)
+  }
+  if (!length(x))  # empty non-data.frame input is returned unchanged
+    return(x)
+  if (!is.vector(x)) {
+    if (!isFALSE(na.rm))
+      stopf("na.rm=TRUE|'row' is not currently supported for '%s'", class(x)[1L])
+    return((if (first) utils::head else utils::tail)(x, n=n, ...))  # e.g. matrix; anything is.vector() rejects is handed to utils::head/tail
   }
+  return(.firstlastVector(x, n=n, first=first, na.rm=!isFALSE(na.rm)))  # !isFALSE to convert 'row' to TRUE
 }
 
-first = function(x, n=1L, ...) {
-  verbose = isTRUE(getOption("datatable.verbose", FALSE))
-  if (!inherits(x, "xts")) {
-    if (nargs()>1L) {
-      if ("package:xts" %chin% search()) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "xts::first", "!is.xts(x) & nargs>1 & 'package:xts'%in%search()")
-        xts::first(x, n=n, ...)
-      } else {
-        # nocov start
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & nargs>1 & !'package:xts'%in%search()")
-        utils::head(x, n=n, ...)
-        # nocov end
-      }
-    } else {
-      dx = dim(x)
-      if (is.null(dx)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "'x[[1L]]'", "!is.xts(x) & !nargs>1 & is.null(dim(x))")
-        lx = length(x)
-        if (!lx) x else x[[1L]]
-      } else if (is.data.frame(x)) {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "'x[1L,]'", "!is.xts(x) & !nargs>1 & is.data.frame(x)")
-        if (!dx[1L]) x else x[1L, , drop=FALSE]
-      } else {
-        if (verbose)
-          catf("%s: using %s: %s\n", "first", "utils::head", "!is.xts(x) & !nargs>1 & !is.null(dim(x)) & !is.data.frame(x)")
-        utils::head(x, n=n, ...)
-      }
-    }
+.firstlastVector = function(x, n, first, na.rm) {  # up to n elements from the start (first=TRUE) or end (first=FALSE) of x; with na.rm=TRUE, NA elements are excluded before selecting
+  if (!length(x)) return(x)  # empty input is returned as-is
+  if (n==0L) return(x[0L])  # n=0 gives a zero-length subset of x
+  ans = if (na.rm) {
+    nna = which_(if (is.list(x)) vapply_1b(x,function(y){is.null(y)||(length(y)==1L&&is.na(y))})
+                            else is.na(x), bool=FALSE)   # list items count as NA when NULL or a length-1 NA; nna is presumably the positions of non-NA items (confirm which_ semantics). TODO: again, n and first/last could be passed to C here
+    if (!length(nna)) x[0L]
+    else {y=min(n,length(nna)); x[nna[if (first) seq.int(1L,y) else seq.int(length(nna)-y+1L,length(nna))]]}
   } else {
-    if (!requireNamespace("xts", quietly=TRUE))
-      stopf("'xts' class passed to %s function but 'xts' is not available, you should have 'xts' installed already", "data.table::first") # nocov
-    if (verbose)
-      catf("%s: using %s: %s\n", "first", "xts::first", "is.xts(x)")
-    xts::first(x, n=n, ...)
+    y=min(n,length(x)); x[if (first) seq.int(1L,y) else seq.int(length(x)-y+1L,length(x))] # y caps n at length(x); NAs are kept in this branch
   }
+  if (n>1L || na.rm)   # n!=length(ans)
+    .Call("Csettruelength", ans, length(ans))  # NOTE(review): the routine is looked up by name string here, whereas other .Call sites in this change pass a symbol (e.g. Cdt_na, CsubsetDT) -- confirm this is intended
+  # for dogroups.c to know that shorter results (including when na.rm results in a length-1) should be padded with NA to match the length of longer items
+  # head and tail with na.rm=TRUE are by their nature returning a vector and therefore shouldn't be recycled when length-1; test 2240.81
+  # TODO: new function pad() could be provided so user can do things like DT[, .(pad(na.omit(B)), pad(na.omit(C))), by=grp]
+  #         to avoid the error 'Supplied 2 items for column 1 of group 1 which has 3 rows ...'
+  #       and/or pad= could be added to [.data.table to allow padding all results
+  # Since gforce_dynamic optimizes head/tail it knows to pad and that's optimized. However, default last(x) and first(x) (i.e. n=1 na.rm=FALSE) are
+  # single-valued like mean,median etc and are recycled in the same way. This is consistent with n=1 na.rm=FALSE already not being treated as
+  # gforce_dynamic in gsumm.c either. n=1 na.rm=TRUE returns empty when all-NA so is still a vector result not recycled when length-1.
+  ans
 }
+
@@ -23,6 +23,7 @@ shift = function(x, n=1L, fill, type=c("lag", "lead", "shift", "cyclic"), give.n
     }
     setattr(ans, "names",  paste(rep(nx,each=length(n)), type, n, sep="_"))
   }
+  if (length(n)>1L) setDT(ans)
   ans
 }
 

@@ -496,6 +496,40 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
       # nocov end
     }
   }
+  if (!fail) for (type in c("warning","error","message")) {
+    observed = actual[[type]]
+    expected = get(type)
+    if (type=="warning" && length(observed) && !is.null(ignore.warning)) {
+      # if a warning containing this string occurs, ignore it. First need for #4182 where warning about 'timedatectl' only
+      # occurs in R 3.4, and maybe only on docker too not for users running test.data.table().
+      stopifnot(length(ignore.warning)==1L, is.character(ignore.warning), !is.na(ignore.warning), nchar(ignore.warning)>=1L)
+      observed = grep(ignore.warning, observed, value=TRUE, invert=TRUE)
+    }
+    if (length(expected) != length(observed)) {
+      # nocov start
+      catf("Test %s produced %d %ss but expected %d\n%s\n%s\n", numStr, length(observed), type, length(expected),
+            paste("Expected:", expected, collapse="\n"),
+            paste("Observed:", observed, collapse="\n"))
+      fail = TRUE
+      # nocov end
+    } else {
+      # the expected type occurred and, if more than 1 of that type, in the expected order
+      for (i in seq_along(expected)) {
+        if (!foreign && !string_match(expected[i], observed[i])) {
+          # nocov start
+          catf("Test %s didn't produce the correct %s:\nExpected: %s\nObserved: %s\n", numStr, type, expected[i], observed[i])
+          fail = TRUE
+          # nocov end
+        }
+      }
+    }
+  }
+  if (fail && exists("out",inherits=FALSE)) {
+    # nocov start
+    catf("Output captured before unexpected warning/error/message:\n")
+    writeLines(out)
+    # nocov end
+  }
   if (!fail && !length(error) && (!length(output) || !missing(y))) {   # TODO test y when output=, too
     capture.output(y <- try(y, silent=TRUE)) # y might produce verbose output, just toss it
     if (identical(x,y)) return(invisible(TRUE))