[misc] cleanups and avoid some unneeded Rcpp::String (#1224)

* [load] inline and reuse code * [load] we only need Rcpp::String for potential unicode issues * [read] minor tweaks * [misc] tweak col2int * [misc] similar for ox_int_to_col * [load] cleanup only when loading workbooks * [read] tweak guess_col_type() * [misc] cleanup * [write] similar to read, only use Rcpp::String where required * [misc] update NEWS
JanMarvin · Dec 28, 2024 · a6d173c · a6d173c
1 parent 3d4ce6e
commit a6d173c
Show file tree

Hide file tree

Showing 11 changed files with 145 additions and 108 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -2,7 +2,8 @@
 
 ## Fixes
 
-* Formulas that do not have a `A1` cell reference to increase, are now shareable too. `wb_add_formula(x = "1", dims = "A1:A2", shared = TRUE)`
+* The first formula in a workbook can now be a shared formula. [1223](https://github.com/JanMarvin/openxlsx2/pull/1223)
+* Avoid passing ASCII strings through `Rcpp::String()`. Previously all `cc` columns were passed through `Rcpp::String()` to avoid encoding issues on non unicode systems. [1224](https://github.com/JanMarvin/openxlsx2/pull/1224)
 
 
 ***************************************************************************

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -16,8 +16,8 @@ col_to_int <- function(x) {
     .Call(`_openxlsx2_col_to_int`, x)
 }
 
-ox_int_to_col <- function(cell) {
-    .Call(`_openxlsx2_ox_int_to_col`, cell)
+ox_int_to_col <- function(x) {
+    .Call(`_openxlsx2_ox_int_to_col`, x)
 }
 
 rbindlist <- function(x) {

diff --git a/R/converters.R b/R/converters.R
@@ -14,7 +14,7 @@ int2col <- function(x) {
     stop("x must be finite and numeric.")
   }
 
-  sapply(x, ox_int_to_col)
+  ox_int_to_col(x)
 }
 
 #' Convert Excel column to integer

diff --git a/R/read.R b/R/read.R
@@ -235,14 +235,16 @@ wb_to_df <- function(
   # If no dims are requested via named_region, simply construct them from min
   # and max columns and row found on worksheet
   # TODO it would be useful to have both named_region and dims?
+  has_dims <- TRUE
   if (missing(named_region) && missing(dims)) {
+    has_dims <- FALSE
 
     sd <- wb$worksheets[[sheet]]$sheet_data$cc[c("row_r", "c_r")]
-    sd$row <- as.integer(sd$row_r)
-    sd$col <- col2int(sd$c_r)
+    row <- range(as.integer(unique(sd$row_r)))
+    col <- range(col2int(unique(sd$c_r)))
 
-    dims <- paste0(int2col(min(sd$col)), min(sd$row), ":",
-                   int2col(max(sd$col)), max(sd$row))
+    dims <- paste0(int2col(col[1]), row[1], ":",
+                   int2col(col[2]), row[2])
 
   }
 
@@ -336,7 +338,7 @@ wb_to_df <- function(
   keep_rows <- keep_rows[keep_rows %in% rnams]
 
   # reduce data to selected cases only
-  if (length(keep_rows) && length(keep_cols))
+  if (has_dims && length(keep_rows) && length(keep_cols))
     cc <- cc[cc$row_r %in% keep_rows & cc$c_r %in% keep_cols, ]
 
   cc$val <- NA_character_
@@ -430,7 +432,8 @@ wb_to_df <- function(
   }
 
   # remaining values are numeric?
-  if (any(sel <- is.na(cc$typ))) {
+  if (any(cc_tab %in% c("n", ""))) {
+    sel <- which(is.na(cc$typ))
     cc$val[sel] <- cc$v[sel]
     cc$typ[sel] <- "n"
   }
@@ -484,7 +487,7 @@ wb_to_df <- function(
   zz$cols <- match(cc$c_r, colnames(z)) - 1L
   zz$rows <- match(cc$row_r, rownames(z)) - 1L
 
-  zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
+  # zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
   if (any(zz$val == "", na.rm = TRUE)) zz <- zz[zz$val != "", ]
   long_to_wide(z, tt, zz)
 

diff --git a/R/wb_functions.R b/R/wb_functions.R
@@ -82,11 +82,11 @@ dims_to_dataframe <- function(dims, fill = FALSE, empty_rm = FALSE) {
       if (!is.null(full_cols)) full_cols <- seq_along(cols_out) - 1L
     } else {
       # somehow we have to make sure that all columns are covered
-      col_ints <- col2int(cols_out)
-      cols_out <- int2col(seq.int(from = min(col_ints), to = max(col_ints)))
+      col_ints <- range(col2int(cols_out))
+      cols_out <- int2col(seq.int(from = col_ints[1], to = col_ints[2]))
 
-      row_ints <- rows_out
-      rows_out <- seq.int(from = min(row_ints), to = max(row_ints))
+      row_ints <- range(rows_out)
+      rows_out <- seq.int(from = row_ints[1], to = row_ints[2])
     }
   }
 
@@ -156,34 +156,49 @@ dataframe_to_dims <- function(df, dim_break = FALSE) {
 #'
 #' @noRd
 guess_col_type <- function(tt) {
-
-  # all columns are character
+  # Initialize types vector with numeric type (default to 0 for character)
   types <- vector("numeric", NCOL(tt))
   names(types) <- names(tt)
 
-  # but some values are numeric
-  col_num <- vapply(tt, function(x) all(x == "n", na.rm = TRUE), NA)
-  types[names(col_num[col_num])] <- 1
+  # Function to check column type
+  check_col_type <- function(x, type_char) {
+    all(unique(x) == type_char, na.rm = TRUE)
+  }
+
+  # Identify the unique types present in the data frame
+  unique_types <- unique(unlist(lapply(tt, unique)))
+  unique_types[is.na(unique_types)] <- "n"
+
+  # Check for each type and update types vector accordingly
+  if ("n" %in% unique_types) {
+    col_num <- vapply(tt, check_col_type, NA, type_char = "n")
+    types[col_num] <- 1
+  }
 
-  # or even date
-  col_dte <- vapply(tt[!col_num], function(x) all(x == "d", na.rm = TRUE), NA)
-  types[names(col_dte[col_dte])] <- 2
+  if ("d" %in% unique_types) {
+    col_dte <- vapply(tt, check_col_type, NA, type_char = "d")
+    types[col_dte & types == 0] <- 2
+  }
 
-  # or even posix
-  col_dte <- vapply(tt[!col_num], function(x) all(x == "p", na.rm = TRUE), NA)
-  types[names(col_dte[col_dte])] <- 3
+  if ("p" %in% unique_types) {
+    col_posix <- vapply(tt, check_col_type, NA, type_char = "p")
+    types[col_posix & types == 0] <- 3
+  }
 
-  # there are bools as well
-  col_log <- vapply(tt[!col_num], function(x) all(x == "b", na.rm = TRUE), NA)
-  types[names(col_log[col_log])] <- 4
+  if ("b" %in% unique_types) {
+    col_log <- vapply(tt, check_col_type, NA, type_char = "b")
+    types[col_log & types == 0] <- 4
+  }
 
-  # or even hms
-  col_dte <- vapply(tt[!col_num], function(x) all(x == "h", na.rm = TRUE), NA)
-  types[names(col_dte[col_dte])] <- 5
+  if ("h" %in% unique_types) {
+    col_hms <- vapply(tt, check_col_type, NA, type_char = "h")
+    types[col_hms & types == 0] <- 5
+  }
 
-  # or formula
-  col_fml <- vapply(tt[!col_num], function(x) all(x == "f", na.rm = TRUE), NA)
-  types[names(col_fml[col_fml])] <- 6
+  if ("f" %in% unique_types) {
+    col_fml <- vapply(tt, check_col_type, NA, type_char = "f")
+    types[col_fml & types == 0] <- 6
+  }
 
   types
 }

diff --git a/R/wb_load.R b/R/wb_load.R
@@ -216,11 +216,15 @@ wb_load <- function(
   ## feature property bag
   featureProperty   <- grep_xml("featurePropertyBag.xml$")
 
+  cleanup_dir <- function(data_only) {
+    grep_xml("media|vmlDrawing|customXml|embeddings|activeX|vbaProject", ignore.case = TRUE, invert = TRUE)
+  }
+
   ## remove all EXCEPT media and charts
-  on.exit(
+  if (!data_only) on.exit(
     unlink(
-      # TODO: this removes all files, the folders remain. grep instead grep_xml?
-      grep_xml("media|vmlDrawing|customXml|embeddings|activeX|vbaProject", ignore.case = TRUE, invert = TRUE),
+      # TODO: this removes all files, the folders remain
+      cleanup_dir(data_only),
       recursive = TRUE, force = TRUE
     ),
     add = TRUE

diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -45,13 +45,13 @@ BEGIN_RCPP
 END_RCPP
 }
 // ox_int_to_col
-std::string ox_int_to_col(int32_t cell);
-RcppExport SEXP _openxlsx2_ox_int_to_col(SEXP cellSEXP) {
+Rcpp::CharacterVector ox_int_to_col(Rcpp::NumericVector x);
+RcppExport SEXP _openxlsx2_ox_int_to_col(SEXP xSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
-    Rcpp::traits::input_parameter< int32_t >::type cell(cellSEXP);
-    rcpp_result_gen = Rcpp::wrap(ox_int_to_col(cell));
+    Rcpp::traits::input_parameter< Rcpp::NumericVector >::type x(xSEXP);
+    rcpp_result_gen = Rcpp::wrap(ox_int_to_col(x));
     return rcpp_result_gen;
 END_RCPP
 }

diff --git a/src/helper_functions.cpp b/src/helper_functions.cpp
@@ -118,40 +118,60 @@ SEXP openxlsx2_type(SEXP x) {
   return type;
 }
 
-
 // [[Rcpp::export]]
 Rcpp::IntegerVector col_to_int(Rcpp::CharacterVector x) {
 
   // This function converts the Excel column letter to an integer
   R_xlen_t n = static_cast<R_xlen_t>(x.size());
-
-  std::string a;
+  std::unordered_map<std::string, int> col_map;
   Rcpp::IntegerVector colNums(n);
 
-  for (R_xlen_t i = 0; i < n; i++) {
-    a = x[i];
+  for (R_xlen_t i = 0; i < n; ++i) {
+    std::string a = Rcpp::as<std::string>(x[i]);
 
     // check if the value is digit only, if yes, add it and continue the loop
     // at the top. This avoids slow:
     // suppressWarnings(isTRUE(as.character(as.numeric(x)) == x))
-    if (std::all_of(a.begin(), a.end(), ::isdigit))
-    {
+    if (std::all_of(a.begin(), a.end(), ::isdigit)) {
       colNums[i] = std::stoi(a);
       continue;
     }
 
-    // return index from column name
-    colNums[i] = cell_to_colint(a);
+    // Check if the column name is already in the map
+    if (col_map.find(a) != col_map.end()) {
+      colNums[i] = col_map[a];
+    } else {
+      // Compute the integer value and store it in the map
+      int col_int = cell_to_colint(a);
+      col_map[a] = col_int;
+      colNums[i] = col_int;
+    }
   }
 
   return colNums;
-
 }
 
 // [[Rcpp::export]]
-std::string ox_int_to_col(int32_t cell) {
-  uint32_t cell_u32 = static_cast<uint32_t>(cell);
-  return int_to_col(cell_u32);
+Rcpp::CharacterVector ox_int_to_col(Rcpp::NumericVector x) {
+  R_xlen_t n = static_cast<R_xlen_t>(x.size());
+  Rcpp::CharacterVector colNames(n);
+  std::unordered_map<int, std::string> cache;
+
+  for (R_xlen_t i = 0; i < n; ++i) {
+    uint32_t num = static_cast<uint32_t>(x[i]);
+
+    // Check if the column name is already in the cache
+    if (cache.find(num) != cache.end()) {
+      colNames[i] = cache[num];
+    } else {
+      // Compute the column name and store it in the cache
+      std::string col_name = int_to_col(num);
+      cache[num] = col_name;
+      colNames[i] = col_name;
+    }
+  }
+
+  return colNames;
 }
 
 // provide a basic rbindlist for lists of named characters
@@ -323,8 +343,8 @@ void long_to_wide(Rcpp::DataFrame z, Rcpp::DataFrame tt, Rcpp::DataFrame zz) {
   R_xlen_t n = static_cast<R_xlen_t>(zz.nrow());
   R_xlen_t col = 0, row = 0;
 
-  Rcpp::IntegerVector rows = zz["rows"];
   Rcpp::IntegerVector cols = zz["cols"];
+  Rcpp::IntegerVector rows = zz["rows"];
   Rcpp::CharacterVector vals = zz["val"];
   Rcpp::CharacterVector typs = zz["typ"];
 

diff --git a/src/load_workbook.cpp b/src/load_workbook.cpp
@@ -98,7 +98,7 @@ Rcpp::CharacterVector df_to_xml(std::string name, Rcpp::DataFrame df_col) {
 }
 
 
-Rcpp::DataFrame row_to_df(XPtrXML doc) {
+inline Rcpp::DataFrame row_to_df(XPtrXML doc) {
 
   auto ws = doc->child("worksheet").child("sheetData");
 
@@ -245,22 +245,11 @@ void loadvals(Rcpp::Environment sheet_data, XPtrXML doc) {
           single_xml_col.r = buffer;
 
           // get col name
-          std::string colrow = buffer;
-          colrow.erase(std::remove_if(colrow.begin(),
-                                      colrow.end(),
-                                      &isdigit),
-                                      colrow.end());
-          single_xml_col.c_r = colrow;
+          single_xml_col.c_r = rm_rownum(buffer);
           has_colname = true;
 
           // get colnum
-          colrow = buffer;
-          // remove numeric from string
-          colrow.erase(std::remove_if(colrow.begin(),
-                                      colrow.end(),
-                                      &isalpha),
-                                      colrow.end());
-          single_xml_col.row_r = colrow;
+          single_xml_col.row_r = rm_colnum(buffer);
 
           // if some cells of the workbook have colnames but other dont,
           // this will increase itr_cols and avoid duplicates in cc

diff --git a/src/openxlsx2_types.h b/src/openxlsx2_types.h
@@ -85,22 +85,27 @@ inline SEXP wrap(const std::vector<xml_col> &x) {
   Rcpp::CharacterVector is(no_init(n));        // <is> tag
 
   // struct to vector
+  // We have to convert utf8 inputs via Rcpp::String for non unicode R sessions
+  // Ideally there would be a function that calls Rcpp::String only if needed
   for (R_xlen_t i = 0; i < n; ++i) {
     size_t ii = static_cast<size_t>(i);
-    if (!x[ii].r.empty())     r[i]     = Rcpp::String(x[ii].r);
-    if (!x[ii].row_r.empty()) row_r[i] = Rcpp::String(x[ii].row_r);
-    if (!x[ii].c_r.empty())   c_r[i]   = Rcpp::String(x[ii].c_r);
-    if (!x[ii].c_s.empty())   c_s[i]   = Rcpp::String(x[ii].c_s);
-    if (!x[ii].c_t.empty())   c_t[i]   = Rcpp::String(x[ii].c_t);
-    if (!x[ii].c_cm.empty())  c_cm[i]  = Rcpp::String(x[ii].c_cm);
+    if (!x[ii].r.empty())     r[i]     = std::string(x[ii].r);
+    if (!x[ii].row_r.empty()) row_r[i] = std::string(x[ii].row_r);
+    if (!x[ii].c_r.empty())   c_r[i]   = std::string(x[ii].c_r);
+    if (!x[ii].c_s.empty())   c_s[i]   = std::string(x[ii].c_s);
+    if (!x[ii].c_t.empty())   c_t[i]   = std::string(x[ii].c_t);
+    if (!x[ii].c_cm.empty())  c_cm[i]  = std::string(x[ii].c_cm);
     if (!x[ii].c_ph.empty())  c_ph[i]  = Rcpp::String(x[ii].c_ph);
-    if (!x[ii].c_vm.empty())  c_vm[i]  = Rcpp::String(x[ii].c_vm);
-    if (!x[ii].v.empty())     v[i]     = Rcpp::String(x[ii].v);
+    if (!x[ii].c_vm.empty())  c_vm[i]  = std::string(x[ii].c_vm);
+    if (!x[ii].v.empty()) { // can only be utf8 if c_t = "str"
+        if (x[ii].c_t.empty()) v[i]    = std::string(x[ii].v);
+        else                   v[i]    = Rcpp::String(x[ii].v);
+    }
     if (!x[ii].f.empty())     f[i]     = Rcpp::String(x[ii].f);
-    if (!x[ii].f_t.empty())   f_t[i]   = Rcpp::String(x[ii].f_t);
-    if (!x[ii].f_ref.empty()) f_ref[i] = Rcpp::String(x[ii].f_ref);
-    if (!x[ii].f_ca.empty())  f_ca[i]  = Rcpp::String(x[ii].f_ca);
-    if (!x[ii].f_si.empty())  f_si[i]  = Rcpp::String(x[ii].f_si);
+    if (!x[ii].f_t.empty())   f_t[i]   = std::string(x[ii].f_t);
+    if (!x[ii].f_ref.empty()) f_ref[i] = std::string(x[ii].f_ref);
+    if (!x[ii].f_ca.empty())  f_ca[i]  = std::string(x[ii].f_ca);
+    if (!x[ii].f_si.empty())  f_si[i]  = std::string(x[ii].f_si);
     if (!x[ii].is.empty())    is[i]    = Rcpp::String(x[ii].is);
   }