Skip to content

Commit

Permalink
Closes #5510: undefined behaviour (#5832)
Browse files Browse the repository at this point in the history
* Fix UB via int conversion (#5510)

Note that some double -> integer64 conversions
are collapsed into the one warning, affecting
tests, so these have been updated.

* Ensure nan is not coerced. Closes #5510

* Fix '1' mistakenly removed from earlier commit

* Re #5834

* Rename function more sensibly

* Incorporate finite checks into the logic

* Suspend int64 coerce tests subject to #5834

* Update CODEOWNERS

* Revert CODEOWNERS

* Update CODEOWNERS

With extant file endings

* Simplify logic for integer check

* Check for NA_INTEGER superfluous

* Test 6.53 should emit warning on -2^31

* C++ toolkit included by default

* first pass at fix

* Simplify/unite with within_int64_repres()

* weak inequality

---------

Co-authored-by: Michael Chirico <chiricom@google.com>
  • Loading branch information
HughParsonage and MichaelChirico authored Dec 26, 2023
1 parent 92105e8 commit 19e1798
Show file tree
Hide file tree
Showing 9 changed files with 50 additions and 20 deletions.
7 changes: 6 additions & 1 deletion .devcontainer/r-devel-clang-ubsan/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
{
"build": { "dockerfile": "Dockerfile", "context": "../.." },
"customizations": { "vscode": { "extensions": [ "REditorSupport.r" ] } }
"customizations": { "vscode": {
"extensions": [
"REditorSupport.r",
"ms-vscode.cpptools-extension-pack"
]
}}
}
7 changes: 6 additions & 1 deletion .devcontainer/r-devel-gcc/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
{
"build": { "dockerfile": "Dockerfile", "context": "../.." },
"customizations": { "vscode": { "extensions": [ "REditorSupport.r" ] } }
"customizations": { "vscode": {
"extensions": [
"REditorSupport.r",
"ms-vscode.cpptools-extension-pack"
]
}}
}
6 changes: 6 additions & 0 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,9 @@

# .SD vignette
/vignettes/datatable-sd-usage.Rmd @michaelchirico

# assign
/src/assign.c @HughParsonage

# utils
/src/utils.c @HughParsonage
2 changes: 1 addition & 1 deletion inst/tests/nafill.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ if (test_bit64) {
test(6.50, identical(coerceFill(x<-as.integer64(-2147483648)), list(NA_integer_, -2147483648, x)), warning="out-of-range")
test(6.51, identical(coerceFill(x<-as.integer64(-2147483649)), list(NA_integer_, -2147483649, x)), warning="out-of-range")
test(6.52, identical(coerceFill(-2147483647), list(-2147483647L, -2147483647, as.integer64("-2147483647"))))
test(6.53, identical(coerceFill(-2147483648), list(NA_integer_, -2147483648, as.integer64("-2147483648"))))
test(6.53, identical(coerceFill(-2147483648), list(NA_integer_, -2147483648, as.integer64("-2147483648"))), warning="precision lost")
test(6.54, identical(coerceFill(-2147483649), list(NA_integer_, -2147483649, as.integer64("-2147483649"))), warning="precision lost")
}

Expand Down
14 changes: 7 additions & 7 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
year = data.table::year # lubridate
yearmon = data.table::yearmon # zoo
yearqtr = data.table::yearqtr # zoo

rm_all = function(env=parent.frame()) {
tt = setdiff(ls(envir=env), .do_not_rm)
rm(list=tt, envir=env)
Expand Down Expand Up @@ -3886,7 +3886,7 @@ test(1133.75, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,
# on a new column with warning on 2nd assign
DT[,new:=NULL]
test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)),
warning="Group 2 column 'new': 3.4.*double.*at RHS position 1 truncated.*precision lost.*integer")
warning="Group 2 column 'new': 3.4.*double.*at RHS position 1.*truncated.*precision lost.*integer")

# Fix for FR #2496 - catch `{` in `:=` expression in `j`:
DT <- data.table(x=c("A", "A", "B", "B"), val =1:4)
Expand Down Expand Up @@ -5071,7 +5071,7 @@ dt <- data.table(a=1:3, b=c(7,8,9), c=c(TRUE, NA, FALSE), d=as.list(4:6), e=c("a

test(1294.01, dt[, a := 1]$a, rep(1L, 3L))
test(1294.02, dt[, a := 1.5]$a, rep(1L, 3L),
warning="1.5.*double.*position 1 truncated.*integer.*column 1 named 'a'")
warning="1.5.*double.*position 1.*truncated.*integer.*column 1 named 'a'")
test(1294.03, dt[, a := NA]$a, rep(NA_integer_, 3L))
test(1294.04, dt[, a := "a"]$a, rep(NA_integer_, 3L),
warning=c("Coercing 'character' RHS to 'integer'.*column 1 named 'a'",
Expand Down Expand Up @@ -9668,7 +9668,7 @@ nqjoin_test <- function(x, y, k=1L, test_no, mult="all") {
gc() # no longer needed but left in place just in case, no harm
}

dt1 = nq_fun(100L) # 400 reduced to 100, #5517
dt1 = nq_fun(100L) # 400 reduced to 100, #5517
dt2 = nq_fun(50L)
x = na.omit(dt1)
y = na.omit(dt2)
Expand Down Expand Up @@ -14365,7 +14365,7 @@ DT[,foo:=factor(c("a","b","c"))]
test(2005.05, DT[2, foo:=8i], error="Can't assign to column 'foo' (type 'factor') a value of type 'complex' (not character, factor, integer or numeric)")
test(2005.06, DT[2, a:=9, verbose=TRUE], notOutput="Coerced")
test(2005.07, DT[2, a:=NA, verbose=TRUE], notOutput="Coerced")
test(2005.08, DT[2, a:=9.9]$a, INT(1,9,3), warning="9.9.*double.*position 1 truncated.*integer.*column 1 named 'a'")
test(2005.08, DT[2, a:=9.9]$a, INT(1,9,3), warning="9.9.*double.*position 1.*truncated.*integer.*column 1 named 'a'")
test(2005.09, set(DT, 1L, "c", expression(x+2)), error="type 'expression' cannot be coerced to 'raw'")
test(2005.10, set(DT, 1L, "d", expression(x+2)), error="type 'expression' cannot be coerced to 'logical'")
test(2005.11, set(DT, 1L, "e", expression(x+2)), error="type 'expression' cannot be coerced to 'double'")
Expand Down Expand Up @@ -14417,7 +14417,7 @@ test(2006.2, rbindlist(list(data.table(x = as.raw(1:2), y=as.raw(5:6)), data.tab
if (test_bit64) {
test(2007.1, rbindlist(list( list(a=as.integer64(1), b=3L), list(a=2L, b=4L) )), data.table(a=as.integer64(1:2), b=3:4))
test(2007.2, rbindlist(list( list(a=3.4, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6),
warning="Column 1 of item 1: 3.4.*double.*position 1 truncated.*precision lost.*when assigning.*integer64.*column 1 named 'a'")
warning="Column 1 of item 1: 3.4.*double.*position 1.*truncated.*precision lost.*when assigning.*integer64.*column 1 named 'a'")
test(2007.3, rbindlist(list( list(a=3.0, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6))
test(2007.4, rbindlist(list( list(b=5:6), list(a=as.integer64(4), b=7L)), fill=TRUE), data.table(b=5:7, a=as.integer64(c(NA,NA,4)))) # tests writeNA of integer64
test(2007.5, rbindlist(list( list(a=INT(1,NA,-2)), list(a=as.integer64(c(3,NA))) )), data.table(a=as.integer64(c(1,NA,-2,3,NA)))) # int NAs combined with int64 NA
Expand Down Expand Up @@ -18004,7 +18004,7 @@ test(2233.34, copy(DT)[, same_value:=value[1], by=.(by1, by2), verbose=TRUE],
test(2233.35, copy(DT)[, same_value:=value[1], by=.(by2, by1), verbose=TRUE], ans, output=out)
test(2233.36, copy(DT)[, same_value:=value[1], keyby=.(by2, by1), verbose=TRUE], setkey(ans,by2,by1), output=out)
# similar to #5307 using integer
DT = data.table(A=INT(2,1,2,1), B=6:3, v=11:14)
DT = data.table(A=INT(2,1,2,1), B=6:3, v=11:14)
test(2233.37, copy(DT)[, val:=v[1L], by=.(A,B), verbose=TRUE], copy(DT)[, val:=11:14], output=out)
test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=INT(1,1,2,2), B=INT(3,5,4,6), v=INT(14,12,13,11), val=INT(14,12,13,11), key=c("A","B")), output=out)
# test from #5326 but with n=100 rather than n=100000; confirmed that n=100 fails tests 2233.403-405 before fix
Expand Down
17 changes: 9 additions & 8 deletions src/assign.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ int checkOverAlloc(SEXP x)
error(_("getOption('datatable.alloccol') should be a number, by default 1024. But its type is '%s'."), type2char(TYPEOF(x)));
if (LENGTH(x) != 1)
error(_("getOption('datatable.alloc') is a numeric vector ok but its length is %d. Its length should be 1."), LENGTH(x));
int ans = isInteger(x) ? INTEGER(x)[0] : (int)REAL(x)[0];
int ans = asInteger(x);
if (ans<0)
error(_("getOption('datatable.alloc')==%d. It must be >=0 and not NA."), ans);
return ans;
Expand Down Expand Up @@ -742,7 +742,8 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
const double *sd = REAL(source);
for (int i=0; i<slen; ++i) {
const double val = sd[i+soff];
if (!ISNAN(val) && (!R_FINITE(val) || val!=(int)val || (int)val<1 || (int)val>nlevel)) {
// Since nlevel is an int, val < 1 || val > nlevel will deflect UB guarded against in PR #5832
if (!ISNAN(val) && (val < 1 || val > nlevel || val != (int)val)) {
error(_("Assigning factor numbers to %s. But %f is outside the level range [1,%d], or is not a whole number."), targetDesc(colnum, colname), val, nlevel);
}
}
Expand Down Expand Up @@ -897,14 +898,14 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
switch (TYPEOF(source)) {
case REALSXP: if (sourceIsI64)
CHECK_RANGE(int64_t, REAL, val!=NA_INTEGER64 && (val<=NA_INTEGER || val>INT_MAX), PRId64, "out-of-range (NA)", val)
else CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "f", "truncated (precision lost)", val)
else CHECK_RANGE(double, REAL, !ISNAN(val) && (!within_int32_repres(val) || (int)val!=val), "f", "out-of-range(NA) or truncated (precision lost)", val)
case CPLXSXP: CHECK_RANGE(Rcomplex, COMPLEX, !((ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)) &&
(ISNAN(val.r) || (R_FINITE(val.r) && (int)val.r==val.r))), "f", "either imaginary part discarded or real part truncated (precision lost)", val.r)
(ISNAN(val.r) || (within_int32_repres(val.r) && (int)val.r==val.r))), "f", "either imaginary part discarded or real part truncated (precision lost)", val.r)
} break;
case REALSXP:
switch (TYPEOF(source)) {
case REALSXP: if (targetIsI64 && !sourceIsI64)
CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int64_t)val!=val), "f", "truncated (precision lost)", val)
CHECK_RANGE(double, REAL, !ISNAN(val) && (!within_int64_repres(val) || (int64_t)val!=val), "f", "out-of-range(NA) or truncated (precision lost)", val)
break;
case CPLXSXP: if (targetIsI64)
CHECK_RANGE(Rcomplex, COMPLEX, !((ISNAN(val.i) || (R_FINITE(val.i) && val.i==0.0)) &&
Expand Down Expand Up @@ -1004,8 +1005,8 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
case REALSXP:
if (sourceIsI64)
BODY(int64_t, REAL, int, (val==NA_INTEGER64||val>INT_MAX||val<=NA_INTEGER) ? NA_INTEGER : (int)val, td[i]=cval)
else BODY(double, REAL, int, ISNAN(val) ? NA_INTEGER : (int)val, td[i]=cval)
case CPLXSXP: BODY(Rcomplex, COMPLEX, int, ISNAN(val.r) ? NA_INTEGER : (int)val.r, td[i]=cval)
else BODY(double, REAL, int, (ISNAN(val) || !within_int32_repres(val)) ? NA_INTEGER : (int)val, td[i]=cval)
case CPLXSXP: BODY(Rcomplex, COMPLEX, int, (ISNAN(val.r) || !within_int32_repres(val.r)) ? NA_INTEGER : (int)val.r, td[i]=cval)
default: COERCE_ERROR("integer"); // test 2005.4
}
} break;
Expand All @@ -1021,7 +1022,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
if (mc) {
memcpy(td, (int64_t *)REAL(source), slen*sizeof(int64_t)); break;
} else BODY(int64_t, REAL, int64_t, val, td[i]=cval)
} else BODY(double, REAL, int64_t, R_FINITE(val) ? val : NA_INTEGER64, td[i]=cval)
} else BODY(double, REAL, int64_t, within_int64_repres(val) ? val : NA_INTEGER64, td[i]=cval)
case CPLXSXP: BODY(Rcomplex, COMPLEX, int64_t, ISNAN(val.r) ? NA_INTEGER64 : (int64_t)val.r, td[i]=cval)
default: COERCE_ERROR("integer64");
}
Expand Down
2 changes: 2 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ SEXP between(SEXP x, SEXP lower, SEXP upper, SEXP incbounds, SEXP NAbounds, SEXP
SEXP coalesce(SEXP x, SEXP inplace);

// utils.c
bool within_int32_repres(double x);
bool within_int64_repres(double x);
bool isRealReallyInt(SEXP x);
SEXP isRealReallyIntR(SEXP x);
SEXP isReallyReal(SEXP x);
Expand Down
2 changes: 1 addition & 1 deletion src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1164,7 +1164,7 @@ SEXP gprod(SEXP x, SEXP narmArg) {
if (INHERITS(x, char_integer64)) {
int64_t *ansd = (int64_t *)REAL(ans);
for (int i=0; i<ngrp; ++i) {
ansd[i] = (s[i]>INT64_MAX || s[i]<=INT64_MIN) ? NA_INTEGER64 : (int64_t)s[i];
ansd[i] = (ISNAN(s[i]) || s[i]>INT64_MAX || s[i]<=INT64_MIN) ? NA_INTEGER64 : (int64_t)s[i];
}
} else {
double *ansd = REAL(ans);
Expand Down
13 changes: 12 additions & 1 deletion src/utils.c
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
#include "data.table.h"

bool within_int32_repres(double x) {
// N.B. (int)2147483647.99 is not undefined behaviour since s 6.3.1.4 of the C
// standard states that behaviour is undefined only if the integral part of a
// finite value of standard floating type cannot be represented.
return R_FINITE(x) && x < 2147483648 && x > -2147483648;
}

bool within_int64_repres(double x) {
return R_FINITE(x) && x <= (double)INT64_MAX && x >= (double)INT64_MIN;
}

static R_xlen_t firstNonInt(SEXP x) {
R_xlen_t n=xlength(x), i=0;
const double *dx = REAL(x);
while (i<n &&
( ISNA(dx[i]) ||
( R_FINITE(dx[i]) && dx[i]==(int)(dx[i]) && (int)(dx[i])!=NA_INTEGER))) { // NA_INTEGER == INT_MIN == -2147483648
(within_int32_repres(dx[i]) && dx[i]==(int)(dx[i])))) {
i++;
}
return i==n ? 0 : i+1;
Expand Down

0 comments on commit 19e1798

Please sign in to comment.