Skip to content

Commit

Permalink
[misc] cleanups and avoid some unneeded Rcpp::String (#1224)
Browse files Browse the repository at this point in the history
* [load] inline and reuse code

* [load] we only need Rcpp::String for potential unicode issues

* [read] minor tweaks

* [misc] tweak col2int

* [misc] similar for ox_int_to_col

* [load] cleanup only when loading workbooks

* [read] tweak guess_col_type()

* [misc] cleanup

* [write] similar to read, only use Rcpp::String where required

* [misc] update NEWS
  • Loading branch information
JanMarvin authored Dec 28, 2024
1 parent 3d4ce6e commit a6d173c
Show file tree
Hide file tree
Showing 11 changed files with 145 additions and 108 deletions.
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

## Fixes

* Formulas that do not have a `A1` cell reference to increase, are now shareable too. `wb_add_formula(x = "1", dims = "A1:A2", shared = TRUE)`
* The first formula in a workbook can now be a shared formula. [1223](https://github.com/JanMarvin/openxlsx2/pull/1223)
* Avoid passing ASCII strings through `Rcpp::String()`. Previously all `cc` columns were passed through `Rcpp::String()` to avoid encoding issues on non unicode systems. [1224](https://github.com/JanMarvin/openxlsx2/pull/1224)


***************************************************************************
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ col_to_int <- function(x) {
.Call(`_openxlsx2_col_to_int`, x)
}

ox_int_to_col <- function(cell) {
.Call(`_openxlsx2_ox_int_to_col`, cell)
ox_int_to_col <- function(x) {
.Call(`_openxlsx2_ox_int_to_col`, x)
}

rbindlist <- function(x) {
Expand Down
2 changes: 1 addition & 1 deletion R/converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ int2col <- function(x) {
stop("x must be finite and numeric.")
}

sapply(x, ox_int_to_col)
ox_int_to_col(x)
}

#' Convert Excel column to integer
Expand Down
17 changes: 10 additions & 7 deletions R/read.R
Original file line number Diff line number Diff line change
Expand Up @@ -235,14 +235,16 @@ wb_to_df <- function(
# If no dims are requested via named_region, simply construct them from min
# and max columns and row found on worksheet
# TODO it would be useful to have both named_region and dims?
has_dims <- TRUE
if (missing(named_region) && missing(dims)) {
has_dims <- FALSE

sd <- wb$worksheets[[sheet]]$sheet_data$cc[c("row_r", "c_r")]
sd$row <- as.integer(sd$row_r)
sd$col <- col2int(sd$c_r)
row <- range(as.integer(unique(sd$row_r)))
col <- range(col2int(unique(sd$c_r)))

dims <- paste0(int2col(min(sd$col)), min(sd$row), ":",
int2col(max(sd$col)), max(sd$row))
dims <- paste0(int2col(col[1]), row[1], ":",
int2col(col[2]), row[2])

}

Expand Down Expand Up @@ -336,7 +338,7 @@ wb_to_df <- function(
keep_rows <- keep_rows[keep_rows %in% rnams]

# reduce data to selected cases only
if (length(keep_rows) && length(keep_cols))
if (has_dims && length(keep_rows) && length(keep_cols))
cc <- cc[cc$row_r %in% keep_rows & cc$c_r %in% keep_cols, ]

cc$val <- NA_character_
Expand Down Expand Up @@ -430,7 +432,8 @@ wb_to_df <- function(
}

# remaining values are numeric?
if (any(sel <- is.na(cc$typ))) {
if (any(cc_tab %in% c("n", ""))) {
sel <- which(is.na(cc$typ))
cc$val[sel] <- cc$v[sel]
cc$typ[sel] <- "n"
}
Expand Down Expand Up @@ -484,7 +487,7 @@ wb_to_df <- function(
zz$cols <- match(cc$c_r, colnames(z)) - 1L
zz$rows <- match(cc$row_r, rownames(z)) - 1L

zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
# zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
if (any(zz$val == "", na.rm = TRUE)) zz <- zz[zz$val != "", ]
long_to_wide(z, tt, zz)

Expand Down
63 changes: 39 additions & 24 deletions R/wb_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,11 @@ dims_to_dataframe <- function(dims, fill = FALSE, empty_rm = FALSE) {
if (!is.null(full_cols)) full_cols <- seq_along(cols_out) - 1L
} else {
# somehow we have to make sure that all columns are covered
col_ints <- col2int(cols_out)
cols_out <- int2col(seq.int(from = min(col_ints), to = max(col_ints)))
col_ints <- range(col2int(cols_out))
cols_out <- int2col(seq.int(from = col_ints[1], to = col_ints[2]))

row_ints <- rows_out
rows_out <- seq.int(from = min(row_ints), to = max(row_ints))
row_ints <- range(rows_out)
rows_out <- seq.int(from = row_ints[1], to = row_ints[2])
}
}

Expand Down Expand Up @@ -156,34 +156,49 @@ dataframe_to_dims <- function(df, dim_break = FALSE) {
#'
#' @noRd
guess_col_type <- function(tt) {

# all columns are character
# Initialize types vector with numeric type (default to 0 for character)
types <- vector("numeric", NCOL(tt))
names(types) <- names(tt)

# but some values are numeric
col_num <- vapply(tt, function(x) all(x == "n", na.rm = TRUE), NA)
types[names(col_num[col_num])] <- 1
# Function to check column type
check_col_type <- function(x, type_char) {
all(unique(x) == type_char, na.rm = TRUE)
}

# Identify the unique types present in the data frame
unique_types <- unique(unlist(lapply(tt, unique)))
unique_types[is.na(unique_types)] <- "n"

# Check for each type and update types vector accordingly
if ("n" %in% unique_types) {
col_num <- vapply(tt, check_col_type, NA, type_char = "n")
types[col_num] <- 1
}

# or even date
col_dte <- vapply(tt[!col_num], function(x) all(x == "d", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 2
if ("d" %in% unique_types) {
col_dte <- vapply(tt, check_col_type, NA, type_char = "d")
types[col_dte & types == 0] <- 2
}

# or even posix
col_dte <- vapply(tt[!col_num], function(x) all(x == "p", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 3
if ("p" %in% unique_types) {
col_posix <- vapply(tt, check_col_type, NA, type_char = "p")
types[col_posix & types == 0] <- 3
}

# there are bools as well
col_log <- vapply(tt[!col_num], function(x) all(x == "b", na.rm = TRUE), NA)
types[names(col_log[col_log])] <- 4
if ("b" %in% unique_types) {
col_log <- vapply(tt, check_col_type, NA, type_char = "b")
types[col_log & types == 0] <- 4
}

# or even hms
col_dte <- vapply(tt[!col_num], function(x) all(x == "h", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 5
if ("h" %in% unique_types) {
col_hms <- vapply(tt, check_col_type, NA, type_char = "h")
types[col_hms & types == 0] <- 5
}

# or formula
col_fml <- vapply(tt[!col_num], function(x) all(x == "f", na.rm = TRUE), NA)
types[names(col_fml[col_fml])] <- 6
if ("f" %in% unique_types) {
col_fml <- vapply(tt, check_col_type, NA, type_char = "f")
types[col_fml & types == 0] <- 6
}

types
}
Expand Down
10 changes: 7 additions & 3 deletions R/wb_load.R
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,15 @@ wb_load <- function(
## feature property bag
featureProperty <- grep_xml("featurePropertyBag.xml$")

cleanup_dir <- function(data_only) {
grep_xml("media|vmlDrawing|customXml|embeddings|activeX|vbaProject", ignore.case = TRUE, invert = TRUE)
}

## remove all EXCEPT media and charts
on.exit(
if (!data_only) on.exit(
unlink(
# TODO: this removes all files, the folders remain. grep instead grep_xml?
grep_xml("media|vmlDrawing|customXml|embeddings|activeX|vbaProject", ignore.case = TRUE, invert = TRUE),
# TODO: this removes all files, the folders remain
cleanup_dir(data_only),
recursive = TRUE, force = TRUE
),
add = TRUE
Expand Down
8 changes: 4 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ BEGIN_RCPP
END_RCPP
}
// ox_int_to_col
std::string ox_int_to_col(int32_t cell);
RcppExport SEXP _openxlsx2_ox_int_to_col(SEXP cellSEXP) {
Rcpp::CharacterVector ox_int_to_col(Rcpp::NumericVector x);
RcppExport SEXP _openxlsx2_ox_int_to_col(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< int32_t >::type cell(cellSEXP);
rcpp_result_gen = Rcpp::wrap(ox_int_to_col(cell));
Rcpp::traits::input_parameter< Rcpp::NumericVector >::type x(xSEXP);
rcpp_result_gen = Rcpp::wrap(ox_int_to_col(x));
return rcpp_result_gen;
END_RCPP
}
Expand Down
48 changes: 34 additions & 14 deletions src/helper_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,40 +118,60 @@ SEXP openxlsx2_type(SEXP x) {
return type;
}


// [[Rcpp::export]]
Rcpp::IntegerVector col_to_int(Rcpp::CharacterVector x) {

// This function converts the Excel column letter to an integer
R_xlen_t n = static_cast<R_xlen_t>(x.size());

std::string a;
std::unordered_map<std::string, int> col_map;
Rcpp::IntegerVector colNums(n);

for (R_xlen_t i = 0; i < n; i++) {
a = x[i];
for (R_xlen_t i = 0; i < n; ++i) {
std::string a = Rcpp::as<std::string>(x[i]);

// check if the value is digit only, if yes, add it and continue the loop
// at the top. This avoids slow:
// suppressWarnings(isTRUE(as.character(as.numeric(x)) == x))
if (std::all_of(a.begin(), a.end(), ::isdigit))
{
if (std::all_of(a.begin(), a.end(), ::isdigit)) {
colNums[i] = std::stoi(a);
continue;
}

// return index from column name
colNums[i] = cell_to_colint(a);
// Check if the column name is already in the map
if (col_map.find(a) != col_map.end()) {
colNums[i] = col_map[a];
} else {
// Compute the integer value and store it in the map
int col_int = cell_to_colint(a);
col_map[a] = col_int;
colNums[i] = col_int;
}
}

return colNums;

}

// [[Rcpp::export]]
std::string ox_int_to_col(int32_t cell) {
uint32_t cell_u32 = static_cast<uint32_t>(cell);
return int_to_col(cell_u32);
Rcpp::CharacterVector ox_int_to_col(Rcpp::NumericVector x) {
R_xlen_t n = static_cast<R_xlen_t>(x.size());
Rcpp::CharacterVector colNames(n);
std::unordered_map<int, std::string> cache;

for (R_xlen_t i = 0; i < n; ++i) {
uint32_t num = static_cast<uint32_t>(x[i]);

// Check if the column name is already in the cache
if (cache.find(num) != cache.end()) {
colNames[i] = cache[num];
} else {
// Compute the column name and store it in the cache
std::string col_name = int_to_col(num);
cache[num] = col_name;
colNames[i] = col_name;
}
}

return colNames;
}

// provide a basic rbindlist for lists of named characters
Expand Down Expand Up @@ -323,8 +343,8 @@ void long_to_wide(Rcpp::DataFrame z, Rcpp::DataFrame tt, Rcpp::DataFrame zz) {
R_xlen_t n = static_cast<R_xlen_t>(zz.nrow());
R_xlen_t col = 0, row = 0;

Rcpp::IntegerVector rows = zz["rows"];
Rcpp::IntegerVector cols = zz["cols"];
Rcpp::IntegerVector rows = zz["rows"];
Rcpp::CharacterVector vals = zz["val"];
Rcpp::CharacterVector typs = zz["typ"];

Expand Down
17 changes: 3 additions & 14 deletions src/load_workbook.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ Rcpp::CharacterVector df_to_xml(std::string name, Rcpp::DataFrame df_col) {
}


Rcpp::DataFrame row_to_df(XPtrXML doc) {
inline Rcpp::DataFrame row_to_df(XPtrXML doc) {

auto ws = doc->child("worksheet").child("sheetData");

Expand Down Expand Up @@ -245,22 +245,11 @@ void loadvals(Rcpp::Environment sheet_data, XPtrXML doc) {
single_xml_col.r = buffer;

// get col name
std::string colrow = buffer;
colrow.erase(std::remove_if(colrow.begin(),
colrow.end(),
&isdigit),
colrow.end());
single_xml_col.c_r = colrow;
single_xml_col.c_r = rm_rownum(buffer);
has_colname = true;

// get colnum
colrow = buffer;
// remove numeric from string
colrow.erase(std::remove_if(colrow.begin(),
colrow.end(),
&isalpha),
colrow.end());
single_xml_col.row_r = colrow;
single_xml_col.row_r = rm_colnum(buffer);

// if some cells of the workbook have colnames but other dont,
// this will increase itr_cols and avoid duplicates in cc
Expand Down
29 changes: 17 additions & 12 deletions src/openxlsx2_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,27 @@ inline SEXP wrap(const std::vector<xml_col> &x) {
Rcpp::CharacterVector is(no_init(n)); // <is> tag

// struct to vector
// We have to convert utf8 inputs via Rcpp::String for non unicode R sessions
// Ideally there would be a function that calls Rcpp::String only if needed
for (R_xlen_t i = 0; i < n; ++i) {
size_t ii = static_cast<size_t>(i);
if (!x[ii].r.empty()) r[i] = Rcpp::String(x[ii].r);
if (!x[ii].row_r.empty()) row_r[i] = Rcpp::String(x[ii].row_r);
if (!x[ii].c_r.empty()) c_r[i] = Rcpp::String(x[ii].c_r);
if (!x[ii].c_s.empty()) c_s[i] = Rcpp::String(x[ii].c_s);
if (!x[ii].c_t.empty()) c_t[i] = Rcpp::String(x[ii].c_t);
if (!x[ii].c_cm.empty()) c_cm[i] = Rcpp::String(x[ii].c_cm);
if (!x[ii].r.empty()) r[i] = std::string(x[ii].r);
if (!x[ii].row_r.empty()) row_r[i] = std::string(x[ii].row_r);
if (!x[ii].c_r.empty()) c_r[i] = std::string(x[ii].c_r);
if (!x[ii].c_s.empty()) c_s[i] = std::string(x[ii].c_s);
if (!x[ii].c_t.empty()) c_t[i] = std::string(x[ii].c_t);
if (!x[ii].c_cm.empty()) c_cm[i] = std::string(x[ii].c_cm);
if (!x[ii].c_ph.empty()) c_ph[i] = Rcpp::String(x[ii].c_ph);
if (!x[ii].c_vm.empty()) c_vm[i] = Rcpp::String(x[ii].c_vm);
if (!x[ii].v.empty()) v[i] = Rcpp::String(x[ii].v);
if (!x[ii].c_vm.empty()) c_vm[i] = std::string(x[ii].c_vm);
if (!x[ii].v.empty()) { // can only be utf8 if c_t = "str"
if (x[ii].c_t.empty()) v[i] = std::string(x[ii].v);
else v[i] = Rcpp::String(x[ii].v);
}
if (!x[ii].f.empty()) f[i] = Rcpp::String(x[ii].f);
if (!x[ii].f_t.empty()) f_t[i] = Rcpp::String(x[ii].f_t);
if (!x[ii].f_ref.empty()) f_ref[i] = Rcpp::String(x[ii].f_ref);
if (!x[ii].f_ca.empty()) f_ca[i] = Rcpp::String(x[ii].f_ca);
if (!x[ii].f_si.empty()) f_si[i] = Rcpp::String(x[ii].f_si);
if (!x[ii].f_t.empty()) f_t[i] = std::string(x[ii].f_t);
if (!x[ii].f_ref.empty()) f_ref[i] = std::string(x[ii].f_ref);
if (!x[ii].f_ca.empty()) f_ca[i] = std::string(x[ii].f_ca);
if (!x[ii].f_si.empty()) f_si[i] = std::string(x[ii].f_si);
if (!x[ii].is.empty()) is[i] = Rcpp::String(x[ii].is);
}

Expand Down
Loading

0 comments on commit a6d173c

Please sign in to comment.