diff --git a/NEWS.md b/NEWS.md index 32097b4ad..4e1790710 100644 --- a/NEWS.md +++ b/NEWS.md @@ -57,6 +57,8 @@ rowwiseDT( 6. Fixed a segfault in `fcase()`, [#6448](https://github.com/Rdatatable/data.table/issues/6448). Thanks @ethanbsmith for reporting with reprex, @aitap for finding the root cause, and @MichaelChirico for the PR. +7. `fread()` performance improves when specifying `Date` among `colClasses`, [#6105](https://github.com/Rdatatable/data.table/issues/6105). One implication of the change is that the column will be an `IDate` (which also inherits from `Date`), which may affect code strongly relying on the column class to be `Date` exactly; computations with `IDate` and `Date` columns should otherwise be the same. If you strongly prefer the `Date` class, run `as.Date()` explicitly following `fread()`. Thanks @scipima for the report and @MichaelChirico for the fix. + ## NOTES 1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix. diff --git a/inst/tests/S4.Rraw b/inst/tests/S4.Rraw index 35cee55da..7e0b8111d 100644 --- a/inst/tests/S4.Rraw +++ b/inst/tests/S4.Rraw @@ -90,14 +90,14 @@ ans_print = utils::capture.output(print(ans)) if (TZnotUTC) { test(5.2, options=list(datatable.old.fread.datetime.character=NULL), fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct"), tz=""), - ans, output=ans_print) + copy(ans)[, a := as.IDate(a)], output=ans_print) test(5.3, options=list(datatable.old.fread.datetime.character=NULL), fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA), tz=""), - data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print) + data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c="2015-01-03 01:02:03"), output=ans_print) } else { test(5.4, options=list(datatable.old.fread.datetime.character=NULL), fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date","IDate","POSIXct")), - ans<-data.table(a=as.Date("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print) + ans<-data.table(a=as.IDate("2015-01-01"), b=as.IDate("2015-01-02"), c=as.POSIXct("2015-01-03 01:02:03", tz="UTC")), output=ans_print) test(5.5, options=list(datatable.old.fread.datetime.character=NULL), fread("a,b,c\n2015-01-01,2015-01-02,2015-01-03 01:02:03", colClasses=c("Date",NA,NA)), ans, output=ans_print) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a6bbdbf86..b1b46f7ad 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11042,20 +11042,20 @@ test(1743.044, fread("a\n1", colClasses=""), data.table(a=1L)) # Issue #1634: 'fread doesn't check colClasses to be valid type' # Currently using BioGenerics, which doesn't support USE.NAMES ## Date supported character/list colClasses -test(1743.05, sapply(fread("a,b\n2017-01-01,1", colClasses=c("Date", "integer")), class), c(a="Date", b="integer")) -test(1743.06, sapply(fread("a,b\n2017-01-01,1", colClasses=list(Date = 1L, integer = 2L)), class), c(a="Date", b="integer")) -test(1743.07, sapply(fread("a,b\n2017-01-01,2017-01-02", colClasses=list(Date = 1:2)), class), c(a="Date", b="Date")) -test(1743.08, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex")) -test(1743.09, sapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), c(a="Date", b="integer", c="complex")) -test(1743.10, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), c(a="Date",b="integer",c="complex",d="integer")) -test(1743.11, sapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), c(a="Date",b="integer",c="complex",d="raw")) +test(1743.05, lapply(fread("a,b\n2017-01-01,1", colClasses=c("Date", "integer")), class), list(a=c("IDate", "Date"), b="integer")) +test(1743.06, lapply(fread("a,b\n2017-01-01,1", colClasses=list(Date = 1L, integer = 2L)), class), list(a=c("IDate", "Date"), b="integer")) +test(1743.07, lapply(fread("a,b\n2017-01-01,2017-01-02", colClasses=list(Date = 1:2)), class), list(a=c("IDate", "Date"), b=c("IDate", "Date"))) +test(1743.08, lapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), list(a=c("IDate", "Date"), b="integer", c="complex")) +test(1743.09, lapply(fread("a,b,c\n2017-01-01,1,1+3i", colClasses=c("Date", "integer", "complex")), class), list(a=c("IDate", "Date"), b="integer", c="complex")) +test(1743.10, lapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", NA)), class), list(a=c("IDate", "Date"), b="integer", c="complex", d="integer")) +test(1743.11, lapply(fread("a,b,c,d\n2017-01-01,1,1+3i,05", colClasses=c("Date", "integer", "complex", "raw")), class), list(a=c("IDate", "Date"), b="integer", c="complex", d="raw")) test(1743.121, sapply(fread("a,b\n2015-01-01,2015-01-01", colClasses=c(NA,"IDate")), inherits, what="IDate"), c(a=TRUE, b=TRUE)) -test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.Date("2015-01-01"))) +test(1743.122, fread("a,b\n2015-01-01,2015-01-01", colClasses=c("POSIXct","Date")), data.table(a=as.POSIXct("2015-01-01"), b=as.IDate("2015-01-01"))) test(1743.123, fread("a,b\n1+3i,2015-01-01", colClasses=c(NA,"IDate")), data.table(a="1+3i", b=as.IDate("2015-01-01"))) ## Attempts to impose incompatible colClasses is a warning (not an error) ## and does not change the value of the columns -test(1743.13, sapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=c(a="character", b="Date"), warning=base_messages$ambiguous_date_fmt) +test(1743.13, lapply(fread("a,b\n09/05/98,2015-01-01", colClasses = "Date"), class), y=list(a="character", b=c("IDate", "Date")), warning=base_messages$ambiguous_date_fmt) ## Just invalid test(1743.14, options = c(useFancyQuotes = FALSE), @@ -11085,6 +11085,9 @@ test(1743.201, fread("A,B,C,D\nA,B,X,4", select=list(factor="A"), colClasses="ch test(1743.202, fread("A,B,C,D\nA,B,X,4", select=c(A="factor"), colClasses="character"), error="select= is a named vector.*but colClasses= has been provided as well. Please remove colClasses=.") test(1743.203, fread("A,B,C,D\nA,B,X,4", select=list(character="D", factor="B")), data.table(D="4", B=factor("B"))) test(1743.204, fread("A,B,C,D\nA,B,X,4", select=list(character=4, character=2)), data.table(D="4", B="B")) +## Date+select +test(1743.205, fread("a,b\n2017-01-01,1", colClasses="Date", select="a"), data.table(a=as.IDate("2017-01-01"))) +test(1743.206, fread("a,b\n2017-01-01,1", select=list(Date="a")), data.table(a=as.IDate("2017-01-01"))) ## factors test(1743.211, sapply(fread("a,b,c\n2,2,f", colClasses = list(factor = 1L), select = 2:3), class), y = c(b="integer", c="character")) @@ -16958,7 +16961,7 @@ test(2150.10, fread(tmp), copy(DT)[ , times := format(times, '%FT%T+00:XX')]) test(2150.11,fread("a,b\n2015-01-01,2015-01-01", colClasses="POSIXct"), # local time for backwards compatibility data.table(a=as.POSIXct("2015-01-01"), b=as.POSIXct("2015-01-01"))) test(2150.12,fread("a,b\n2015-01-01,2015-01-01", select=c(a="Date",b="POSIXct")), # select colClasses form, for coverage - data.table(a=as.Date("2015-01-01"), b=as.POSIXct("2015-01-01"))) + data.table(a=as.IDate("2015-01-01"), b=as.POSIXct("2015-01-01"))) test(2150.13, fread("a,b\n2015-01-01,1.1\n2015-01-02 01:02:03,1.2", tz=""), # no Z, tz="" needed for this test from v1.14.0 if (TZnotUTC) data.table(a=c("2015-01-01","2015-01-02 01:02:03"), b=c(1.1, 1.2)) else data.table(a=setattr(as.POSIXct(c("2015-01-01 00:00:00", "2015-01-02 01:02:03"), tz="UTC"), "tzone", "UTC"), b=c(1.1, 1.2))) diff --git a/src/freadR.c b/src/freadR.c index 7ea24e63b..8063ca707 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -335,10 +335,10 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int type[i]=CT_STRING; // e.g. CT_ISO8601_DATE changed to character here so that as.POSIXct treats the date-only as local time in tests 1743.122 and 2150.11 SET_STRING_ELT(colClassesAs, i, tt); } - } else { + } else if (type[i] != CT_ISO8601_DATE || tt != char_Date) { type[i] = typeEnum[w-1]; // freadMain checks bump up only not down if (w==NUT) SET_STRING_ELT(colClassesAs, i, tt); - } + } // else (when colClasses="Date" and fread found an IDate), don't update type[i] and don't signal any coercion needed on R side } } else { // selectColClasses==true if (!selectInts) internal_error(__func__, "selectInts is NULL but selectColClasses is true"); // # nocov @@ -355,7 +355,7 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int type[y-1]=CT_STRING; SET_STRING_ELT(colClassesAs, y-1, tt); } - } else { + } else if (type[i] != CT_ISO8601_DATE || tt != char_Date) { type[y-1] = typeEnum[w-1]; if (w==NUT) SET_STRING_ELT(colClassesAs, y-1, tt); } @@ -405,7 +405,9 @@ bool userOverride(int8_t *type, lenOff *colNames, const char *anchor, const int } if (selectRankD) selectRankD[colIdx-1] = rank++; // NB: mark as negative to indicate 'seen' - if (colClassType == CT_ISO8601_TIME && type[colIdx-1]!=CT_ISO8601_TIME) { + if (type[colIdx-1]==CT_ISO8601_DATE && colClassType==CT_STRING && STRING_ELT(listNames, i) == char_Date) { + type[colIdx-1] *= -1; + } else if (colClassType == CT_ISO8601_TIME && type[colIdx-1]!=CT_ISO8601_TIME) { type[colIdx-1] = -CT_STRING; // don't use in-built UTC parser, defer to character and as.POSIXct afterwards which reads in local time SET_STRING_ELT(colClassesAs, colIdx-1, STRING_ELT(listNames, i)); } else {