Automatic detection of dec (. or ,) (#4482)

* initial progress on automatic dec=, detection * if sep=, detected, turn off auto-dec * first pass at NEWS and man * add comments, tests * improve man * add verbose output, tests --------- Co-authored-by: Michael Chirico <michael.chirico@grabtaxi.com>
Rdatatable · Apr 11, 2024 · 6f008bd · 6f008bd
1 parent 0711e40
commit 6f008bd
Show file tree

Hide file tree

Showing 7 changed files with 86 additions and 22 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -32,6 +32,8 @@
 
 8. Computations in `j` can return a matrix or array _if it is one-dimensional_, e.g. a row or column vector, when `j` is a list of columns during grouping, [#783](https://github.com/Rdatatable/data.table/issues/783). Previously a matrix could be provided `DT[, expr, by]` form, but not `DT[, list(expr), by]` form; this resolves that inconsistency. It is still an error to return a "true" array, e.g. a `2x3` matrix.
 
+9. `fread` now supports automatic detection of `dec` (as either `.` or `,`, the latter being [common in many places in Europe, Africa, and South America](https://en.wikipedia.org/wiki/Decimal_separator)); this behavior is now the default, i.e. `dec='auto'`, [#2431](https://github.com/Rdatatable/data.table/issues/2431). This was our #2 most-requested issue. See [#3189](https://github.com/Rdatatable/data.table/issues/3189) and please do peruse this list and show support to the issues that would help you the most as we continue to use this metric to help prioritize development.
+
 ## BUG FIXES
 
 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix.

diff --git a/R/fread.R b/R/fread.R
@@ -1,5 +1,5 @@
 fread = function(
-input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto",
+input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
 na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
 skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
 col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
@@ -16,7 +16,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC")
     else if (sep=="auto") sep=""      # sep=="" at C level means auto sep
     else stopifnot( nchar(sep)==1L )  # otherwise an actual character to use as sep
   }
-  stopifnot( is.character(dec), length(dec)==1L, nchar(dec)==1L )
+  stopifnot( is.character(dec), length(dec)==1L)
+  if (dec == "auto") dec = "" else stopifnot(nchar(dec) == 1L)
   # handle encoding, #563
   if (length(encoding) != 1L || !encoding %chin% c("unknown", "UTF-8", "Latin-1")) {
     stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -2681,15 +2681,13 @@ if (test_bit64) {
   test(897, class(DT$b), "integer64")
   test(898, fread(f), DT)
   unlink(f)
-  DT[,a2:=as.integer64(a)][,a3:=as.double(a)][,a4:=gsub(" ","",format(a))]
-  DT[,b2:=as.double(b)][,b3:=gsub(" ","",format(b))]
-  DT[,r:=a/100][,r2:=gsub(" ","",format(r))]
-  DT[112, a2:=as.integer64(12345678901234)]   # start on row 112 to avoid the first 100
-  DT[113, a3:=3.14]
-  DT[114, a4:="123A"]
-  DT[115, b2:=1234567890123.45]
-  DT[116, b3:="12345678901234567890A"]  # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR)
-  DT[117, r2:="3.14A"]
+  DT[ , a2 := as.integer64(a)][112L, a2 := as.integer64(12345678901234)]   # start on row 112 to avoid the first 100
+  DT[ , a3 := as.double(a)   ][113L, a3 := 3.14]
+  DT[ , a4 := as.character(a)][114L, a4 := "123A"]
+  DT[ , b2 := as.double(b)   ][115L, b2 := 1234567890123.45]
+  DT[ , b3 := as.character(b)][116L, b3 := "12345678901234567890A"]  # A is needed otherwise read as double with loss of precision (TO DO: should detect and bump to STR)
+  DT[ , r := a/100]
+  DT[ , r2 := as.character(r)][117L, r2 := "3.14A"]
   fwrite(DT,f<-tempfile())
   test(899.1, fread(f, verbose=TRUE), DT, output="Rereading 6 columns.*out-of-sample.*Column 4.*a2.*int32.*int64.*<<12345678901234>>.*Column 10.*r2.*float64.*string.*<<3.14A>>")
   test(899.2, fread(f, colClasses=list(character=c("a4","b3","r2"), integer64="a2", double=c("a3","b2")), verbose=TRUE),
@@ -18432,3 +18430,23 @@ DF <- structure(
 )
 
 test(2255, as.data.table(DF), output="DF1.V1.*DF1.V2.*DF2.V3.*DF2.V4.*V5")
+
+# automatic detection of dec=',' for #2431
+DT = data.table(a = letters, b = 1:26/6, c = 1:26)
+## auto-detect dec=','
+fwrite(DT, f <- tempfile(), dec=',', sep=';')
+test(2256.1, fread(f), DT)
+
+fwrite(DT, f, dec=',', sep='|')
+test(2256.2, fread(f), DT)
+
+## auto-detect dec='.'
+fwrite(DT, f)
+test(2256.3, fread(f), DT)
+
+## verbose output
+test(2256.4, fread(f, verbose=TRUE), DT, output="sep=',' so dec set to '.'")
+
+fwrite(DT, f, dec=',', sep=';')
+test(2256.5, fread(f, verbose=TRUE), DT, output="dec=',' detected based on a balance of 18")
+test(2256.6, fread('a;b\n1,14;5', verbose=TRUE), data.table(a=1.14, b=5L), output="dec=',' detected based on a balance of 1 ")
diff --git a/man/fread.Rd b/man/fread.Rd
@@ -9,7 +9,7 @@
    \code{fread} is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector.
 }
 \usage{
-fread(input, file, text, cmd, sep="auto", sep2="auto", dec=".", quote="\"",
+fread(input, file, text, cmd, sep="auto", sep2="auto", dec="auto", quote="\"",
 nrows=Inf, header="auto",
 na.strings=getOption("datatable.na.strings","NA"),  # due to change to ""; see NEWS
 stringsAsFactors=FALSE, verbose=getOption("datatable.verbose", FALSE),
@@ -47,7 +47,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC"
     If type coercion results in an error, introduces \code{NA}s, or would result in loss of accuracy, the coercion attempt is aborted for that column with warning and the column's type is left unchanged. If you really desire data loss (e.g. reading \code{3.14} as \code{integer}) you have to truncate such columns afterwards yourself explicitly so that this is clear to future readers of your code.
   }
   \item{integer64}{ "integer64" (default) reads columns detected as containing integers larger than 2^31 as type \code{bit64::integer64}. Alternatively, \code{"double"|"numeric"} reads as \code{utils::read.csv} does; i.e., possibly with loss of precision and if so silently. Or, "character". }
-  \item{dec}{ The decimal separator as in \code{utils::read.csv}. If not "." (default) then usually ",". See details. }
+  \item{dec}{ The decimal separator as in \code{utils::read.csv}. When \code{"auto"} (the default), an attempt is made to decide whether \code{"."} or \code{","} is more suitable for this input. See details. }
   \item{col.names}{ A vector of optional names for the variables (columns). The default is to use the header column if present or detected, or if not "V" followed by the column number. This is applied after \code{check.names} and before \code{key} and \code{index}. }
   \item{check.names}{default is \code{FALSE}. If \code{TRUE} then the names of the variables in the \code{data.table} are checked to ensure that they are syntactically valid variable names. If necessary they are adjusted (by \code{\link{make.names}}) so that they are, and also to ensure that there are no duplicates.}
   \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}.  Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. }
@@ -79,9 +79,9 @@ If an empty line is encountered then reading stops there with warning if any tex
 
 \bold{Line endings:} All known line endings are detected automatically: \code{\\n} (*NIX including Mac), \code{\\r\\n} (Windows CRLF), \code{\\r} (old Mac) and \code{\\n\\r} (just in case). There is no need to convert input files first. \code{fread} running on any architecture will read a file from any architecture. Both \code{\\r} and \code{\\n} may be embedded in character strings (including column names) provided the field is quoted.
 
-\bold{Decimal separator and locale:} \code{fread(\dots,dec=",")} should just work. \code{fread} uses C function \code{strtod} to read numeric data; e.g., \code{1.23} or \code{1,23}. \code{strtod} retrieves the decimal separator (\code{.} or \code{,} usually) from the locale of the R session rather than as an argument passed to the \code{strtod} function. So for \code{fread(\dots,dec=",")} to work, \code{fread} changes this (and only this) R session's locale temporarily to a locale which provides the desired decimal separator.
+\bold{Decimal separator:} \code{dec} is used to parse numeric fields as the separator between integral and fractional parts. When \code{dec='auto'}, during column type detection, when a field is a candidate for being numeric (i.e., parsing as lower types has already failed), \code{dec='.'} is tried, and, if it fails to create a numeric field, \code{dec=','} is tried. At the end of the sample lines, if more were successfully parsed with \code{dec=','}, \code{dec} is set to \code{','}; otherwise, \code{dec} is set to \code{'.'}.
 
-On Windows, "French_France.1252" is tried which should be available as standard (any locale with comma decimal separator would suffice) and on unix "fr_FR.utf8" (you may need to install this locale on unix). \code{fread()} is very careful to set the locale back again afterwards, even if the function fails with an error. The choice of locale is determined by \code{options()$datatable.fread.dec.locale}. This may be a \emph{vector} of locale names and if so they will be tried in turn until the desired \code{dec} is obtained; thus allowing more than two different decimal separators to be selected. This is a new feature in v1.9.6 and is experimental. In case of problems, turn it off with \code{options(datatable.fread.dec.experiment=FALSE)}.
+Automatic detection of \code{sep} occurs \emph{prior} to column type detection -- as such, it is possible that \code{sep} has been inferred to be \code{','}, in which case \code{dec} is set to \code{'.'}.
 
 \bold{Quotes:}
 

diff --git a/src/fread.c b/src/fread.c
@@ -33,6 +33,7 @@ static const char *sof, *eof;
 static char sep;
 static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t')
 static char quote, dec;
+static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.'
 static bool eol_one_r;  // only true very rarely for \r-only files
 
 // Quote rule:
@@ -1206,11 +1207,16 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
   skip_white(&ch);
   if (eol(&ch)) return 0;  // empty line
   int field=0;
+  const bool autoDec = dec == '\0';
   while (field<ncol) {
     // DTPRINT(_("<<%s>>(%d)"), strlim(ch,20), quoteRule);
     skip_white(&ch);
     const char *fieldStart = ch;
     while (tmpType[field]<=CT_STRING) {
+      if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '\0') { // guess . first
+        dec = '.';
+      }
+
       fun[tmpType[field]](&fctx);
       if (end_of_field(ch)) break;
       skip_white(&ch);
@@ -1234,9 +1240,19 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
         }
       }
       ch = fieldStart;
+      if (autoDec && IS_DEC_TYPE(tmpType[field]) && dec == '.') { // . didn't parse a double; try ,
+        dec = ',';
+        continue;
+      }
       while (++tmpType[field]<CT_STRING && disabled_parsers[tmpType[field]]) {};
       *bumped = true;
     }
+    if (autoDec && dec != '\0') { // double was attempted
+      if (IS_DEC_TYPE(tmpType[field])) { // double succeeded
+        linesForDecDot += dec == '.' ? 1 : -1;
+      }
+      dec = '\0'; // reset for next parse
+    }
     field++;
     if (sep==' ' && *ch==sep) {
       while (ch[1]==' ') ch++;
@@ -1345,9 +1361,8 @@ int freadMain(freadMainArgs _args) {
   dec = args.dec;
   quote = args.quote;
   if (args.sep == quote && quote!='\0') STOP(_("sep == quote ('%c') is not allowed"), quote);
-  if (dec=='\0') STOP(_("dec='' not allowed. Should be '.' or ','"));
-  if (args.sep == dec) STOP(_("sep == dec ('%c') is not allowed"), dec);
-  if (quote == dec) STOP(_("quote == dec ('%c') is not allowed"), dec);
+  if (args.sep == dec && dec != '\0') STOP(_("sep == dec ('%c') is not allowed"), dec);
+  if (quote == dec && dec != '\0') STOP(_("quote == dec ('%c') is not allowed"), dec);
   // since quote=='\0' when user passed quote="", the logic in this file uses '*ch==quote && quote' otherwise
   //   the ending \0 at eof could be treated as a quote (test xxx)
 
@@ -1794,7 +1809,7 @@ int freadMain(freadMainArgs _args) {
   }
 
   //*********************************************************************************************
-  // [7] Detect column types, good nrow estimate and whether first row is column names
+  // [7] Detect column types, dec, good nrow estimate and whether first row is column names
   //*********************************************************************************************
   int nJumps;             // How many jumps to use when pre-scanning the file
   int64_t sampleLines;     // How many lines were sampled during the initial pre-scan
@@ -1804,13 +1819,20 @@ int freadMain(freadMainArgs _args) {
   double meanLineLen=0.0; // Average length (in bytes) of a single line in the input file
   size_t bytesRead=0;     // Bytes in the data section (i.e. excluding column names, header and footer, if any)
   {
-  if (verbose) DTPRINT(_("[07] Detect column types, good nrow estimate and whether first row is column names\n"));
+  if (verbose) DTPRINT(_("[07] Detect column types, dec, good nrow estimate and whether first row is column names\n"));
   if (verbose && args.header!=NA_BOOL8) DTPRINT(_("  'header' changed by user from 'auto' to %s\n"), args.header?"true":"false");
 
   type =    (int8_t *)malloc((size_t)ncol * sizeof(int8_t));
   tmpType = (int8_t *)malloc((size_t)ncol * sizeof(int8_t));  // used i) in sampling to not stop on errors when bad jump point and ii) when accepting user overrides
   if (!type || !tmpType) STOP(_("Failed to allocate 2 x %d bytes for type and tmpType: %s"), ncol, strerror(errno));
 
+  if (sep == ',' && dec == '\0') { // if sep=',' detected, don't attempt to detect dec [NB: . is not par of seps]
+    if (verbose) {
+      DTPRINT(_("  sep=',' so dec set to '.'\n"));
+    }
+    dec = '.';
+  }
+
   int8_t type0 = 1;
   while (disabled_parsers[type0]) type0++;
   for (int j=0; j<ncol; j++) {
@@ -1861,6 +1883,7 @@ int freadMain(freadMainArgs _args) {
     if (ch>=eof) break;                // The 9th jump could reach the end in the same situation and that's ok. As long as the end is sampled is what we want.
     bool bumped = false;  // did this jump find any different types; to reduce verbose output to relevant lines
     int jumpLine = 0;    // line from this jump point start
+    linesForDecDot = 0;
 
     while(ch<eof && jumpLine++<jumpLines) {
       const char *lineStart = ch;
@@ -1892,6 +1915,13 @@ int freadMain(freadMainArgs _args) {
         bumped = false;  // detect_types() only updates &bumped when it's true. So reset to false here.
       }
     }
+    if (dec == '\0') {
+      dec = linesForDecDot < 0 ? ',' : '.'; // in a tie, dec=. is more likely
+      if (verbose) {
+        DTPRINT(_("  dec='%c' detected based on a balance of %d parsed fields\n"), dec, abs(linesForDecDot));
+      }
+    }
+
     if (bumped) {
       // when jump>0, apply the bumps (if any) at the end of the successfully completed jump sample
       ASSERT(jump>0, "jump(%d)>0", jump);
@@ -1906,7 +1936,17 @@ int freadMain(freadMainArgs _args) {
   if (args.header==NA_BOOL8) {
     for (int j=0; j<ncol; j++) tmpType[j]=type0;   // reuse tmpType
     bool bumped=false;
+
+    if (dec == '\0') { // in files without jumps, dec could still be undecided
+      linesForDecDot = 0;
+    }
     detect_types(&ch, tmpType, ncol, &bumped);
+    if (dec == '\0') {
+      dec = linesForDecDot < 0 ? ',' : '.';
+      if (verbose) {
+        DTPRINT(_("  dec='%c' detected based on a balance of %d parsed fields\n"), dec, abs(linesForDecDot));
+      }
+    }
     if (sampleLines>0) for (int j=0; j<ncol; j++) {
       if (tmpType[j]==CT_STRING && type[j]<CT_STRING && type[j]>CT_EMPTY) {
         args.header=true;

diff --git a/src/fread.h b/src/fread.h
@@ -36,6 +36,8 @@ typedef enum {
   NUMTYPE          // placeholder for the number of types including drop; used for allocation and loop bounds
 } colType;
 
+#define IS_DEC_TYPE(x) ((x) == CT_FLOAT64 || (x) == CT_FLOAT64_EXT) // types where dec matters
+
 extern int8_t typeSize[NUMTYPE];
 extern const char typeName[NUMTYPE][10];
 extern const long double pow10lookup[301];

diff --git a/src/freadR.c b/src/freadR.c
@@ -102,9 +102,10 @@ SEXP freadR(
     error(_("Internal error: freadR sep not a single character. R level catches this."));  // # nocov
   args.sep = CHAR(STRING_ELT(sepArg,0))[0];   // '\0' when default "auto" was replaced by "" at R level
 
-  if (!(isString(decArg) && LENGTH(decArg)==1 && strlen(CHAR(STRING_ELT(decArg,0)))==1))
+  if (!isString(decArg) || LENGTH(decArg)!=1 || strlen(CHAR(STRING_ELT(decArg,0)))>1) {
     error(_("Internal error: freadR dec not a single character. R level catches this."));  // # nocov
-  args.dec = CHAR(STRING_ELT(decArg,0))[0];
+  }
+  args.dec = CHAR(STRING_ELT(decArg,0))[0];   // '\0' when default "auto" was replaced by "" at R level
 
   if (IS_FALSE(quoteArg)) {
     args.quote = '\0';