Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[misc] cleanups and avoid some unneeded Rcpp::String #1224

Merged
merged 10 commits into from
Dec 28, 2024
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

## Fixes

* Formulas that do not have a `A1` cell reference to increase, are now shareable too. `wb_add_formula(x = "1", dims = "A1:A2", shared = TRUE)`
* The first formula in a workbook can now be a shared formula. [1223](https://github.com/JanMarvin/openxlsx2/pull/1223)
* Avoid passing ASCII strings through `Rcpp::String()`. Previously all `cc` columns were passed through `Rcpp::String()` to avoid encoding issues on non unicode systems. [1224](https://github.com/JanMarvin/openxlsx2/pull/1224)


***************************************************************************
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ col_to_int <- function(x) {
.Call(`_openxlsx2_col_to_int`, x)
}

ox_int_to_col <- function(cell) {
.Call(`_openxlsx2_ox_int_to_col`, cell)
ox_int_to_col <- function(x) {
.Call(`_openxlsx2_ox_int_to_col`, x)
}

rbindlist <- function(x) {
Expand Down
2 changes: 1 addition & 1 deletion R/converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ int2col <- function(x) {
stop("x must be finite and numeric.")
}

sapply(x, ox_int_to_col)
ox_int_to_col(x)
}

#' Convert Excel column to integer
Expand Down
17 changes: 10 additions & 7 deletions R/read.R
Original file line number Diff line number Diff line change
Expand Up @@ -235,14 +235,16 @@ wb_to_df <- function(
# If no dims are requested via named_region, simply construct them from min
# and max columns and row found on worksheet
# TODO it would be useful to have both named_region and dims?
has_dims <- TRUE
if (missing(named_region) && missing(dims)) {
has_dims <- FALSE

sd <- wb$worksheets[[sheet]]$sheet_data$cc[c("row_r", "c_r")]
sd$row <- as.integer(sd$row_r)
sd$col <- col2int(sd$c_r)
row <- range(as.integer(unique(sd$row_r)))
col <- range(col2int(unique(sd$c_r)))

dims <- paste0(int2col(min(sd$col)), min(sd$row), ":",
int2col(max(sd$col)), max(sd$row))
dims <- paste0(int2col(col[1]), row[1], ":",
int2col(col[2]), row[2])

}

Expand Down Expand Up @@ -336,7 +338,7 @@ wb_to_df <- function(
keep_rows <- keep_rows[keep_rows %in% rnams]

# reduce data to selected cases only
if (length(keep_rows) && length(keep_cols))
if (has_dims && length(keep_rows) && length(keep_cols))
cc <- cc[cc$row_r %in% keep_rows & cc$c_r %in% keep_cols, ]

cc$val <- NA_character_
Expand Down Expand Up @@ -430,7 +432,8 @@ wb_to_df <- function(
}

# remaining values are numeric?
if (any(sel <- is.na(cc$typ))) {
if (any(cc_tab %in% c("n", ""))) {
sel <- which(is.na(cc$typ))
cc$val[sel] <- cc$v[sel]
cc$typ[sel] <- "n"
}
Expand Down Expand Up @@ -484,7 +487,7 @@ wb_to_df <- function(
zz$cols <- match(cc$c_r, colnames(z)) - 1L
zz$rows <- match(cc$row_r, rownames(z)) - 1L

zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
# zz <- zz[order(zz[, "cols"], zz[, "rows"]), ]
if (any(zz$val == "", na.rm = TRUE)) zz <- zz[zz$val != "", ]
long_to_wide(z, tt, zz)

Expand Down
63 changes: 39 additions & 24 deletions R/wb_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,11 @@ dims_to_dataframe <- function(dims, fill = FALSE, empty_rm = FALSE) {
if (!is.null(full_cols)) full_cols <- seq_along(cols_out) - 1L
} else {
# somehow we have to make sure that all columns are covered
col_ints <- col2int(cols_out)
cols_out <- int2col(seq.int(from = min(col_ints), to = max(col_ints)))
col_ints <- range(col2int(cols_out))
cols_out <- int2col(seq.int(from = col_ints[1], to = col_ints[2]))

row_ints <- rows_out
rows_out <- seq.int(from = min(row_ints), to = max(row_ints))
row_ints <- range(rows_out)
rows_out <- seq.int(from = row_ints[1], to = row_ints[2])
}
}

Expand Down Expand Up @@ -156,34 +156,49 @@ dataframe_to_dims <- function(df, dim_break = FALSE) {
#'
#' @noRd
guess_col_type <- function(tt) {

# all columns are character
# Initialize types vector with numeric type (default to 0 for character)
types <- vector("numeric", NCOL(tt))
names(types) <- names(tt)

# but some values are numeric
col_num <- vapply(tt, function(x) all(x == "n", na.rm = TRUE), NA)
types[names(col_num[col_num])] <- 1
# Function to check column type
check_col_type <- function(x, type_char) {
all(unique(x) == type_char, na.rm = TRUE)
}

# Identify the unique types present in the data frame
unique_types <- unique(unlist(lapply(tt, unique)))
unique_types[is.na(unique_types)] <- "n"

# Check for each type and update types vector accordingly
if ("n" %in% unique_types) {
col_num <- vapply(tt, check_col_type, NA, type_char = "n")
types[col_num] <- 1
}

# or even date
col_dte <- vapply(tt[!col_num], function(x) all(x == "d", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 2
if ("d" %in% unique_types) {
col_dte <- vapply(tt, check_col_type, NA, type_char = "d")
types[col_dte & types == 0] <- 2
}

# or even posix
col_dte <- vapply(tt[!col_num], function(x) all(x == "p", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 3
if ("p" %in% unique_types) {
col_posix <- vapply(tt, check_col_type, NA, type_char = "p")
types[col_posix & types == 0] <- 3
}

# there are bools as well
col_log <- vapply(tt[!col_num], function(x) all(x == "b", na.rm = TRUE), NA)
types[names(col_log[col_log])] <- 4
if ("b" %in% unique_types) {
col_log <- vapply(tt, check_col_type, NA, type_char = "b")
types[col_log & types == 0] <- 4
}

# or even hms
col_dte <- vapply(tt[!col_num], function(x) all(x == "h", na.rm = TRUE), NA)
types[names(col_dte[col_dte])] <- 5
if ("h" %in% unique_types) {
col_hms <- vapply(tt, check_col_type, NA, type_char = "h")
types[col_hms & types == 0] <- 5
}

# or formula
col_fml <- vapply(tt[!col_num], function(x) all(x == "f", na.rm = TRUE), NA)
types[names(col_fml[col_fml])] <- 6
if ("f" %in% unique_types) {
col_fml <- vapply(tt, check_col_type, NA, type_char = "f")
types[col_fml & types == 0] <- 6
}

types
}
Expand Down
10 changes: 7 additions & 3 deletions R/wb_load.R
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,15 @@ wb_load <- function(
## feature property bag
featureProperty <- grep_xml("featurePropertyBag.xml$")

cleanup_dir <- function(data_only) {
grep_xml("media|vmlDrawing|customXml|embeddings|activeX|vbaProject", ignore.case = TRUE, invert = TRUE)
}

## remove all EXCEPT media and charts
on.exit(
if (!data_only) on.exit(
unlink(
# TODO: this removes all files, the folders remain. grep instead grep_xml?
grep_xml("media|vmlDrawing|customXml|embeddings|activeX|vbaProject", ignore.case = TRUE, invert = TRUE),
# TODO: this removes all files, the folders remain
cleanup_dir(data_only),
recursive = TRUE, force = TRUE
),
add = TRUE
Expand Down
8 changes: 4 additions & 4 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,13 @@ BEGIN_RCPP
END_RCPP
}
// ox_int_to_col
std::string ox_int_to_col(int32_t cell);
RcppExport SEXP _openxlsx2_ox_int_to_col(SEXP cellSEXP) {
Rcpp::CharacterVector ox_int_to_col(Rcpp::NumericVector x);
RcppExport SEXP _openxlsx2_ox_int_to_col(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< int32_t >::type cell(cellSEXP);
rcpp_result_gen = Rcpp::wrap(ox_int_to_col(cell));
Rcpp::traits::input_parameter< Rcpp::NumericVector >::type x(xSEXP);
rcpp_result_gen = Rcpp::wrap(ox_int_to_col(x));
return rcpp_result_gen;
END_RCPP
}
Expand Down
48 changes: 34 additions & 14 deletions src/helper_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,40 +118,60 @@ SEXP openxlsx2_type(SEXP x) {
return type;
}


// [[Rcpp::export]]
Rcpp::IntegerVector col_to_int(Rcpp::CharacterVector x) {

// This function converts the Excel column letter to an integer
R_xlen_t n = static_cast<R_xlen_t>(x.size());

std::string a;
std::unordered_map<std::string, int> col_map;
Rcpp::IntegerVector colNums(n);

for (R_xlen_t i = 0; i < n; i++) {
a = x[i];
for (R_xlen_t i = 0; i < n; ++i) {
std::string a = Rcpp::as<std::string>(x[i]);

// check if the value is digit only, if yes, add it and continue the loop
// at the top. This avoids slow:
// suppressWarnings(isTRUE(as.character(as.numeric(x)) == x))
if (std::all_of(a.begin(), a.end(), ::isdigit))
{
if (std::all_of(a.begin(), a.end(), ::isdigit)) {
colNums[i] = std::stoi(a);
continue;
}

// return index from column name
colNums[i] = cell_to_colint(a);
// Check if the column name is already in the map
if (col_map.find(a) != col_map.end()) {
colNums[i] = col_map[a];
} else {
// Compute the integer value and store it in the map
int col_int = cell_to_colint(a);
col_map[a] = col_int;
colNums[i] = col_int;
}
}

return colNums;

}

// [[Rcpp::export]]
std::string ox_int_to_col(int32_t cell) {
uint32_t cell_u32 = static_cast<uint32_t>(cell);
return int_to_col(cell_u32);
Rcpp::CharacterVector ox_int_to_col(Rcpp::NumericVector x) {
R_xlen_t n = static_cast<R_xlen_t>(x.size());
Rcpp::CharacterVector colNames(n);
std::unordered_map<int, std::string> cache;

for (R_xlen_t i = 0; i < n; ++i) {
uint32_t num = static_cast<uint32_t>(x[i]);

// Check if the column name is already in the cache
if (cache.find(num) != cache.end()) {
colNames[i] = cache[num];
} else {
// Compute the column name and store it in the cache
std::string col_name = int_to_col(num);
cache[num] = col_name;
colNames[i] = col_name;
}
}

return colNames;
}

// provide a basic rbindlist for lists of named characters
Expand Down Expand Up @@ -323,8 +343,8 @@ void long_to_wide(Rcpp::DataFrame z, Rcpp::DataFrame tt, Rcpp::DataFrame zz) {
R_xlen_t n = static_cast<R_xlen_t>(zz.nrow());
R_xlen_t col = 0, row = 0;

Rcpp::IntegerVector rows = zz["rows"];
Rcpp::IntegerVector cols = zz["cols"];
Rcpp::IntegerVector rows = zz["rows"];
Rcpp::CharacterVector vals = zz["val"];
Rcpp::CharacterVector typs = zz["typ"];

Expand Down
17 changes: 3 additions & 14 deletions src/load_workbook.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ Rcpp::CharacterVector df_to_xml(std::string name, Rcpp::DataFrame df_col) {
}


Rcpp::DataFrame row_to_df(XPtrXML doc) {
inline Rcpp::DataFrame row_to_df(XPtrXML doc) {

auto ws = doc->child("worksheet").child("sheetData");

Expand Down Expand Up @@ -245,22 +245,11 @@ void loadvals(Rcpp::Environment sheet_data, XPtrXML doc) {
single_xml_col.r = buffer;

// get col name
std::string colrow = buffer;
colrow.erase(std::remove_if(colrow.begin(),
colrow.end(),
&isdigit),
colrow.end());
single_xml_col.c_r = colrow;
single_xml_col.c_r = rm_rownum(buffer);
has_colname = true;

// get colnum
colrow = buffer;
// remove numeric from string
colrow.erase(std::remove_if(colrow.begin(),
colrow.end(),
&isalpha),
colrow.end());
single_xml_col.row_r = colrow;
single_xml_col.row_r = rm_colnum(buffer);

// if some cells of the workbook have colnames but other dont,
// this will increase itr_cols and avoid duplicates in cc
Expand Down
29 changes: 17 additions & 12 deletions src/openxlsx2_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,22 +85,27 @@ inline SEXP wrap(const std::vector<xml_col> &x) {
Rcpp::CharacterVector is(no_init(n)); // <is> tag

// struct to vector
// We have to convert utf8 inputs via Rcpp::String for non unicode R sessions
// Ideally there would be a function that calls Rcpp::String only if needed
for (R_xlen_t i = 0; i < n; ++i) {
size_t ii = static_cast<size_t>(i);
if (!x[ii].r.empty()) r[i] = Rcpp::String(x[ii].r);
if (!x[ii].row_r.empty()) row_r[i] = Rcpp::String(x[ii].row_r);
if (!x[ii].c_r.empty()) c_r[i] = Rcpp::String(x[ii].c_r);
if (!x[ii].c_s.empty()) c_s[i] = Rcpp::String(x[ii].c_s);
if (!x[ii].c_t.empty()) c_t[i] = Rcpp::String(x[ii].c_t);
if (!x[ii].c_cm.empty()) c_cm[i] = Rcpp::String(x[ii].c_cm);
if (!x[ii].r.empty()) r[i] = std::string(x[ii].r);
if (!x[ii].row_r.empty()) row_r[i] = std::string(x[ii].row_r);
if (!x[ii].c_r.empty()) c_r[i] = std::string(x[ii].c_r);
if (!x[ii].c_s.empty()) c_s[i] = std::string(x[ii].c_s);
if (!x[ii].c_t.empty()) c_t[i] = std::string(x[ii].c_t);
if (!x[ii].c_cm.empty()) c_cm[i] = std::string(x[ii].c_cm);
if (!x[ii].c_ph.empty()) c_ph[i] = Rcpp::String(x[ii].c_ph);
if (!x[ii].c_vm.empty()) c_vm[i] = Rcpp::String(x[ii].c_vm);
if (!x[ii].v.empty()) v[i] = Rcpp::String(x[ii].v);
if (!x[ii].c_vm.empty()) c_vm[i] = std::string(x[ii].c_vm);
if (!x[ii].v.empty()) { // can only be utf8 if c_t = "str"
if (x[ii].c_t.empty()) v[i] = std::string(x[ii].v);
else v[i] = Rcpp::String(x[ii].v);
}
if (!x[ii].f.empty()) f[i] = Rcpp::String(x[ii].f);
if (!x[ii].f_t.empty()) f_t[i] = Rcpp::String(x[ii].f_t);
if (!x[ii].f_ref.empty()) f_ref[i] = Rcpp::String(x[ii].f_ref);
if (!x[ii].f_ca.empty()) f_ca[i] = Rcpp::String(x[ii].f_ca);
if (!x[ii].f_si.empty()) f_si[i] = Rcpp::String(x[ii].f_si);
if (!x[ii].f_t.empty()) f_t[i] = std::string(x[ii].f_t);
if (!x[ii].f_ref.empty()) f_ref[i] = std::string(x[ii].f_ref);
if (!x[ii].f_ca.empty()) f_ca[i] = std::string(x[ii].f_ca);
if (!x[ii].f_si.empty()) f_si[i] = std::string(x[ii].f_si);
if (!x[ii].is.empty()) is[i] = Rcpp::String(x[ii].is);
}

Expand Down
Loading
Loading