Rdatatable · MichaelChirico · May 4, 2024 · Dec 20, 2023 · Jan 6, 2024 · Jan 7, 2024
@@ -223,7 +223,7 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) {
       setnames(ans, 'rn', keep.rownames[1L])
     return(ans)
   }
-  if (any(vapply_1i(x, function(xi) length(dim(xi))))) { # not is.atomic because is.atomic(matrix) is true
+  if (any(cols_with_dims(x))) {
     # a data.frame with a column that is data.frame needs to be expanded; test 2013.4
     # x may be a class with [[ method that behaves differently, so as.list first for default [[, #4526
     return(as.data.table.list(as.list(x), keep.rownames=keep.rownames, ...))
@@ -245,7 +245,7 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) {
 
 as.data.table.data.table = function(x, ...) {
   # as.data.table always returns a copy, automatically takes care of #473
-  if (any(vapply_1i(x, function(xi) length(dim(xi))))) { # for test 2089.2
+  if (any(cols_with_dims(x))) { # for test 2089.2
     return(as.data.table.list(x, ...))
   }
   x = copy(x) # #1681

@@ -1,11 +1,3 @@
-if (!exists("trimws", "package:base")) {
-  # trimws was new in R 3.2.0. Backport it for internal data.table use in R 3.1.0
-  trimws = function(x) {
-    mysub = function(re, x) sub(re, "", x, perl = TRUE)
-    mysub("[ \t\r\n]+$", mysub("^[ \t\r\n]+", x))
-  }
-}
-
 dim.data.table = function(x)
 {
   .Call(Cdim, x)
@@ -356,8 +348,8 @@ replace_dot_alias = function(e) {
     # Fixes 4994: a case where quoted expression with a "!", ex: expr = quote(!dt1); dt[eval(expr)] requires
     # the "eval" to be checked before `as.name("!")`. Therefore interchanged.
     restore.N = remove.N = FALSE
-    if (exists(".N", envir=parent.frame(), inherits=FALSE)) {
-      old.N = get(".N", envir=parent.frame(), inherits=FALSE)
+    old.N = get0(".N", envir=parent.frame(), inherits=FALSE)
+    if (!is.null(old.N)) {
       locked.N = bindingIsLocked(".N", parent.frame())
       if (locked.N) eval(call("unlockBinding", ".N", parent.frame()))  # eval call to pass R CMD check NOTE until we find cleaner way
       assign(".N", nrow(x), envir=parent.frame(), inherits=FALSE)
@@ -899,12 +891,12 @@ replace_dot_alias = function(e) {
         }
         if (!is.list(byval)) stopf("'by' or 'keyby' must evaluate to a vector or a list of vectors (where 'list' includes data.table and data.frame which are lists, too)")
         if (length(byval)==1L && is.null(byval[[1L]])) bynull=TRUE #3530 when by=(function()NULL)()
-        if (!bynull) for (jj in seq_len(length(byval))) {
+        if (!bynull) for (jj in seq_along(byval)) {
           if (!(this_type <- typeof(byval[[jj]])) %chin% ORDERING_TYPES) {
             stopf("Column or expression %d of 'by' or 'keyby' is type '%s' which is not currently supported. If you have a compelling use case, please add it to https://github.com/Rdatatable/data.table/issues/1597. As a workaround, consider converting the column to a supported type, e.g. by=sapply(list_col, toString), whilst taking care to maintain distinctness in the process.", jj, this_type)
           }
         }
-        tt = vapply_1i(byval,length)
+        tt = lengths(byval)
         if (any(tt!=xnrow)) stopf("The items in the 'by' or 'keyby' list are length(s) %s. Each must be length %d; the same length as there are rows in x (after subsetting if i is provided).", brackify(tt), xnrow)
         if (is.null(bynames)) bynames = rep.int("",length(byval))
         if (length(idx <- which(!nzchar(bynames))) && !bynull) {
@@ -1034,7 +1026,7 @@ replace_dot_alias = function(e) {
               # allow filtering via function in .SDcols, #3950
               if (is.function(.SDcols)) {
                 .SDcols = lapply(x, .SDcols)
-                if (any(idx <- vapply_1i(.SDcols, length) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA)))
+                if (any(idx <- lengths(.SDcols) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA)))
                   stopf("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: %s", brackify(names(x)[idx]))
                 .SDcols = unlist(.SDcols, use.names = FALSE)
               }
@@ -1290,11 +1282,12 @@ replace_dot_alias = function(e) {
       # warningf(sym," in j is looking for ",getName," in calling scope, but a column '", sym, "' exists. Column names should not start with ..")
     }
     getName = substr(sym, 3L, nchar(sym))
-    if (!exists(getName, parent.frame())) {
+    getNameVal <- get0(getName, parent.frame())
+    if (is.null(getNameVal)) {
       if (exists(sym, parent.frame())) next  # user did 'manual' prefix; i.e. variable in calling scope has .. prefix
       stopf("Variable '%s' is not found in calling scope. Looking in calling scope because this symbol was prefixed with .. in the j= parameter.", getName)
     }
-    assign(sym, get(getName, parent.frame()), SDenv)
+    assign(sym, getNameVal, SDenv)
   }
   # hash=TRUE (the default) does seem better as expected using e.g. test 645.  TO DO experiment with 'size' argument
   if (missingby || bynull || (!byjoin && !length(byval))) {
@@ -1460,7 +1453,7 @@ replace_dot_alias = function(e) {
   #   is a more general issue but the former can be fixed by forcing units='secs'
   SDenv$`-.POSIXt` = function(e1, e2) {
     if (inherits(e2, 'POSIXt')) {
-      if (verbose && !exists('done_units_report', parent.frame())) {
+      if (verbose && !get0('done_units_report', parent.frame(), ifnotfound = FALSE)) {
         catf('\nNote: forcing units="secs" on implicit difftime by group; call difftime explicitly to choose custom units\n')
         assign('done_units_report', TRUE, parent.frame())
       }
@@ -2804,7 +2797,7 @@ setDF = function(x, rownames=NULL) {
     }
     x
   } else {
-    n = vapply_1i(x, length)
+    n = lengths(x)
     mn = max(n)
     if (any(n<mn))
       stopf("All elements in argument 'x' to 'setDF' must be of same length")
@@ -3184,7 +3177,7 @@ is_constantish = function(q, check_singleton=FALSE) {
   }
   if (length(i) == 0L) stopf("Internal error in .isFastSubsettable. Please report to data.table developers") # nocov
   ## convert i to data.table with all combinations in rows.
-  if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){
+  if(length(i) > 1L && prod(lengths(i)) > 1e4){
     ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635
     if (verbose) {catf("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()}
     return(NULL)

@@ -235,7 +235,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
       .Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
       lhs = lhs_; rhs = rhs_
     }
-    maplen = vapply_1i(mapunique, length)
+    maplen = lengths(mapunique)
     idx = do.call("CJ", mapunique)[map, 'I' := .I][["I"]] # TO DO: move this to C and avoid materialising the Cross Join.
     some_fill = anyNA(idx)
     fill.default = if (run_agg_funs && is.null(fill) && some_fill) dat_for_default_fill[, maybe_err(eval(fun.call))]

@@ -27,8 +27,7 @@ patterns = function(..., cols=character(0L), ignore.case=FALSE, perl=FALSE, fixe
   if (!is.character(p))
     stopf("Input patterns must be of type character.")
   matched = lapply(p, grep, cols, ignore.case=ignore.case, perl=perl, fixed=fixed, useBytes=useBytes)
-  # replace with lengths when R 3.2.0 dependency arrives
-  if (length(idx <- which(sapply(matched, length) == 0L)))
+  if (length(idx <- which(lengths(matched) == 0L)))
     stopf('Pattern(s) not found: [%s]', brackify(p[idx]))
   if (length(matched) == 1L) return(matched[[1L]])
   matched
@@ -125,7 +124,7 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na
       stopf("sep must be character string")
     }
     list.of.vectors = strsplit(cols, sep, fixed=TRUE)
-    vector.lengths = sapply(list.of.vectors, length)
+    vector.lengths = lengths(list.of.vectors)
     n.groups = max(vector.lengths)
     if (n.groups == 1) {
       stopf("each column name results in only one item after splitting using sep, which means that all columns would be melted; to fix please either specify melt on all columns directly without using measure, or use a different sep/pattern specification")

@@ -91,7 +91,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     }
     file_info = file.info(file)
     if (is.na(file_info$size)) stopf("File '%s' does not exist or is non-readable. getwd()=='%s'", file, getwd())
-    if (isTRUE(file_info$isdir)) stopf("File '%s' is a directory. Not yet implemented.", file) # dir.exists() requires R v3.2+, #989
+    if (isTRUE(file_info$isdir)) stopf("File '%s' is a directory. Not yet implemented.", file) # Could use dir.exists(), but we already ran file.info().
     if (!file_info$size) {
       warningf("File '%s' has size 0. Returning a NULL %s.", file, if (data.table) 'data.table' else 'data.frame')
       return(if (data.table) data.table(NULL) else data.frame(NULL))
@@ -350,7 +350,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     if (!all(vapply_1b(index, is.character)))
       stopf("index argument of data.table() must be a character vector naming columns (NB: col.names are applied before this)")
     if (is.list(index)) {
-      to_split = vapply_1i(index, length) == 1L
+      to_split = lengths(index) == 1L
       if (any(to_split))
         index[to_split] = sapply(index[to_split], strsplit, split = ",", fixed = TRUE)
     } else {

@@ -18,7 +18,7 @@ list2lang = function(x) {
   char = vapply(x, is.character, FALSE)
   to.name = !asis & char
   if (any(to.name)) { ## turns "my_name" character scalar into `my_name` symbol, for convenience
-    if (any(non.scalar.char <- vapply(x[to.name], length, 0L)!=1L)) {
+    if (any(non.scalar.char <- lengths(x[to.name])!=1L)) {
       stopf("Character objects provided in the input are not scalar objects, if you need them as character vector rather than a name, then wrap each into 'I' call: %s", brackify(names(non.scalar.char)[non.scalar.char]))
     }
     x[to.name] = lapply(x[to.name], as.name)

@@ -340,7 +340,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE)
       if (unique) l[[i]] = unique(y)
     }
   }
-  nrow = prod( vapply_1i(l, length) )  # lengths(l) will work from R 3.2.0
+  nrow = prod(lengths(l))
   if (nrow > .Machine$integer.max) stopf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max)
   l = .Call(Ccj, l)
   setDT(l)

@@ -349,11 +349,11 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no
   #    from all.equal and different to identical related to row.names and unused factor levels
   # 3) each test has a unique id which we refer to in commit messages, emails etc.
   # 4) test that a query generates exactly 2 warnings, that they are both the correct warning messages, and that the result is the one expected
-  .test.data.table = exists("nfail", parent.frame()) # test() can be used inside functions defined in tests.Rraw, so inherits=TRUE (default) here
+  nfail = get0("nfail", parent.frame()) # test() can be used inside functions defined in tests.Rraw, so inherits=TRUE (default) here
+  .test.data.table = !is.null(nfail)
   numStr = sprintf("%.8g", num)
   if (.test.data.table) {
     prevtest = get("prevtest", parent.frame())
-    nfail = get("nfail", parent.frame())   # to cater for both test.data.table() and stepping through tests in dev
     whichfail = get("whichfail", parent.frame())
     assign("ntest", get("ntest", parent.frame()) + if (num>0) 1L else 0L, parent.frame(), inherits=TRUE)   # bump number of tests run
     lasttime = get("lasttime", parent.frame())

@@ -21,10 +21,6 @@ nan_is_na = function(x) {
   stopf("Argument 'nan' must be NA or NaN")
 }
 
-if (base::getRversion() < "3.2.0") {  # Apr 2015
-  isNamespaceLoaded = function(x) x %chin% loadedNamespaces()
-}
-
 if (!exists('startsWith', 'package:base', inherits=FALSE)) {  # R 3.3.0; Apr 2016
   startsWith = function(x, stub) substr(x, 1L, nchar(stub))==stub
 }
@@ -67,20 +63,23 @@ require_bit64_if_needed = function(DT) {
 }
 
 # vapply for return value character(1)
-vapply_1c = function (x, fun, ..., use.names = TRUE) {
+vapply_1c = function(x, fun, ..., use.names = TRUE) {
   vapply(X = x, FUN = fun, ..., FUN.VALUE = NA_character_, USE.NAMES = use.names)
 }
 
 # vapply for return value logical(1)
-vapply_1b = function (x, fun, ..., use.names = TRUE) {
+vapply_1b = function(x, fun, ..., use.names = TRUE) {
   vapply(X = x, FUN = fun, ..., FUN.VALUE = NA, USE.NAMES = use.names)
 }
 
 # vapply for return value integer(1)
-vapply_1i = function (x, fun, ..., use.names = TRUE) {
+vapply_1i = function(x, fun, ..., use.names = TRUE) {
   vapply(X = x, FUN = fun, ..., FUN.VALUE = NA_integer_, USE.NAMES = use.names)
 }
 
+# not is.atomic because is.atomic(matrix) is true
+cols_with_dims = function(x) vapply_1i(x, function(j) length(dim(j))) > 0L
+
 more = function(f) system(paste("more",f))    # nocov  (just a dev helper)
 
 # helper used to auto-name columns in data.table(x,y) as c("x","y"), CJ(x,y) and similar

@@ -520,7 +520,7 @@ if (loaded[["xts"]]) {  # was 1465 in tests.Rraw, #5516
 
 # was 2108 in tests.Rraw, #5516
 # first and last should no longer load xts namespace, #3857, below commented test for interactive validation when xts present but not loaded or attached
-# stopifnot("xts"%in%installed.packages(), !"xts"%in%loadedNamespaces()); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!"xts" %in% loadedNamespaces())
+# stopifnot("xts"%in%installed.packages(), !isNamespaceLoaded("xts")); library(data.table); x=as.POSIXct("2019-01-01"); last(x); stopifnot(!isNamespaceLoaded("xts"))
 x = as.POSIXct("2019-09-09")+0:1
 old = options(datatable.verbose=TRUE)
 test(19.01, last(x), x[length(x)], output="!is.xts(x)")

@@ -67,7 +67,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   shallow = data.table:::shallow # until exported
   .shallow = data.table:::.shallow
   split.data.table = data.table:::split.data.table
-  if (!exists('startsWith', 'package:base', inherits=FALSE)) startsWith = data.table:::startsWith
+  if (!exists('startsWith', 'package:base', inherits=FALSE)) startsWith = data.table:::startsWith # R 3.3.0
   stopf = data.table:::stopf
   test = data.table:::test
   uniqlengths = data.table:::uniqlengths
@@ -1472,13 +1472,13 @@ test(462, DT[,foo:=10L], data.table(a=1:3,v=4:9,foo=10L,key="a"))
 unlink(f)
 
 # Test CJ problems with v1.7.4, #1689
-test(463, all(sapply(CJ(1:2,1:3),length)==6L))
+test(463, all(lengths(CJ(1:2,1:3)) == 6L))
 DT = data.table(x=1:4,y=1:2,cnt=1L,key=c('x', 'y'))
 test(464, DT[CJ(1:4,1:4)]$cnt, INT(1,rep(NA,4),1,NA,NA,1,rep(NA,4),1,NA,NA))
 test(465, DT[CJ(1:4,1:4), sum(cnt>0), by=.EACHI]$y, rep(1:4,4))
 f1 = factor(c("READING","MATHEMATICS"))
 f2 = factor(c("2010_2011","2009_2010","2008_2009"), levels=paste(2006:2010,2007:2011,sep="_"))
-test(466, all(sapply(CJ(f1, f2),length)==6L))
+test(466, all(lengths(CJ(f1, f2))==6L))
 
 # Test list(.SD,newcol=..) gives error with guidance
 DT = data.table(a=1:2,v=3:6)
@@ -2882,7 +2882,7 @@ test(966, fread(input, colClasses=list(character=2:4)), data.table(A=1:2, B=c("f
                        warning="Column number 4 (colClasses[[1]][3]) is out of range [1,ncol=3]")
 
 # Character input more than 4096 bytes (used to be passed through path.expand which imposed the limit), #2649
-test(967, nrow(fread( paste( rep('a\tb\n', 10000), collapse=''), header=FALSE)), 10000L)
+test(967, nrow(fread( strrep('a\tb\n', 10000L), header=FALSE)), 10000L)
 
 # Test fread warns about removal of any footer (and autostart skips up over it)
 test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2")
@@ -7005,7 +7005,7 @@ test(1477.22, transpose(la, list.cols=NA), error="list.cols should be logical TR
 ll = list(data.frame(a=1), data.frame(x=1, y=2), NULL, list())
 ll <- lapply(ll, setDT)
 test(1478.1, sapply(ll, truelength), c(1025L, 1026L, 1024L, 1024L))
-test(1478.2, sapply(ll, length), INT(1,2,0,0))
+test(1478.2, lengths(ll), INT(1,2,0,0))
 
 # rbindlist stack imbalance issue, #980.
 test(1479, rbindlist(replicate(4,rbindlist(replicate(47, NULL),
@@ -11630,23 +11630,23 @@ set.seed(1L)
 ar.dimnames = list(color = sort(c("green","yellow","red")),
                    year = as.character(2011:2015),
                    status = sort(c("active","inactive","archived","removed")))
-ar.dim = sapply(ar.dimnames, length)
+ar.dim = lengths(ar.dimnames)
 ar = array(sample(c(rep(NA, 4), 4:7/2), prod(ar.dim), TRUE),
            unname(ar.dim), # array() having length(dims) < 3 will be created as matrix in R so will not be dispatched here but as.data.table.matrix
            ar.dimnames)
 dt = as.data.table(ar, na.rm=FALSE)
 dimcols = head(names(dt), -1L)
 test(1774.01, TRUE, all(
   nrow(dt) == 60L,
-  prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
+  prod(lengths(ar.dimnames)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
   dt[is.na(value), .N] == 30L,
   dt[, .N==1L, c(dimcols)]$V1
 ))
 dt = as.data.table(ar)
 dimcols = head(names(dt), -1L)
 test(1774.02, TRUE, all(
   nrow(dt) == 30L,
-  prod(sapply(ar.dimnames, length)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
+  prod(lengths(ar.dimnames)) == dt[, prod(sapply(.SD, uniqueN)), .SDcols = dimcols],
   dt[is.na(value), .N] == 0L,
   dt[, .N==1L, c(dimcols)]$V1
 ))
@@ -11856,7 +11856,7 @@ test(1800.2, fread("A\n1e55555555\n-1e+234056\n2e-59745"), data.table(A=c("1e555
 # Test files with "round" sizes (different multiples of 2, from 512B to 64KB)
 for (mul in c(16, 128, 512, 1024, 2048)) {
   ff = file(f<-tempfile(), open="wb")
-  cat(paste(rep("1234,5678,9012,3456,7890,abcd,4\x0A", mul), collapse=""), file=ff)
+  cat(strrep("1234,5678,9012,3456,7890,abcd,4\x0A", mul), file=ff)
   close(ff)
   DT = data.table(V1=rep(1234L, mul), V2=5678L, V3=9012L, V4=3456L, V5=7890L, V6="abcd", V7=4L)
   test(1801 + log2(mul)/100 + 0.001, file.info(f)$size, mul*32)
@@ -12067,8 +12067,8 @@ if (test_longdouble) {  #3258
 
 # Test that integers just above 128 or 256 characters in length parse as strings, not as integers/floats
 # This guards against potential overflows in the count of digits
-src1 = paste0(rep("1234567890", 13), collapse="")  # length = 130, slightly above 128
-src2 = paste0(rep("12345678900987654321", 13), collapse="")  # length = 260, slightly above 256
+src1 = strrep("1234567890", 13L)  # length = 130, slightly above 128
+src2 = strrep("12345678900987654321", 13L)  # length = 260, slightly above 256
 test(1831.1, fread(paste0("A\n", src1)), data.table(A=src1))
 test(1831.2, fread(paste0("A\n", src2)), data.table(A=src2))
 test(1831.3, fread(paste0("A\n", src2, ".33")), data.table(A=1.2345678900987655e+259))
@@ -16996,7 +16996,7 @@ test(2156.1, DT[,list(list({attr(value,"class")<-"newclass";value})),by=series]$
              DT[1,value])
 test(2156.2, truelength(DT[,list(list(value)),by=series]$V1[[1L]])>=0L)  # not -64 carried over by duplicate() of the .SD column
 # cover NULL case in copyAsPlain by putting a NULL alongside a dogroups .SD column. The 'if(.GRP==1L)' is just for fun.
-test(2156.3, sapply(DT[, list(if (.GRP==1L) list(value,NULL) else list(NULL,value)), by=series]$V1, length),
+test(2156.3, lengths(DT[, list(if (.GRP==1L) list(value,NULL) else list(NULL,value)), by=series]$V1),
              INT(64,0,0,64))
 
 # CornerstoneR usage revealed copySharedColumns needed work afer PR#4655