Skip to content

Commit

Permalink
Merge pull request #26 from alexsanjoseph/v2.0
Browse files Browse the repository at this point in the history
V2.0
  • Loading branch information
alexsanjoseph authored Jan 5, 2020
2 parents 84b49da + 0713e2f commit 44f5635
Show file tree
Hide file tree
Showing 12 changed files with 462 additions and 278 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
Package: compareDF
Type: Package
Title: Do a Git Style Diff of the Rows Between Two Dataframes with Similar Structure
Version: 1.8.0
Date: 2019-06-02
Version: 2.0.0
Date: 2020-01-05
Authors@R: person("Alex", "Joseph", email = "alexsanjoseph@gmail.com", role = c("aut", "cre"))
Description: Compares two dataframes which have the same column
structure to show the rows that have changed. Also gives a git style diff format
to quickly see what has changes in addition to summary statistics.
to quickly see what has changed in addition to summary statistics.
License: MIT + file LICENSE
Depends:
R (>= 3.4.0)
R (>= 3.5.0)
Imports:
dplyr (>= 0.4.3),
magrittr (>= 1.5),
htmlTable (>= 1.5),
openxlsx (>= 4.1),
tidyr (>= 0.4.1),
stringr (>= 1.0.0)
Suggests:
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(compare_df)
export(create_output_table)
export(view_html)
import(dplyr)
importFrom(stats,na.omit)
Expand Down
42 changes: 24 additions & 18 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,55 +1,61 @@
# compareDF 1.8.0
## compareDF 2.0.0
* New Major Version! Contains some breaking changes
* Support for `XLSX` format
* Write output to file directly
* Separate functions to compare output and create output tables
* Cleaner abstractions in functions
* More bugs squashed
* Color blind friendly default colors

## compareDF 1.8.0
* Added new option to keep only the columns which have changed using `keep_unchanged_cols`.
* changed option `keep_unchanged` to `keep_unchanged_rows`

# compareDF 1.7.3
## compareDF 1.7.3
* Fixed tests to work with dplyr 0.8.2 and on Linux systems

# compareDF 1.7.2
## compareDF 1.7.2
* Fixed tests to work with dplyr 0.8.1

# compareDF 1.7.1
## compareDF 1.7.1
* Fixed tests to work with dplyr 0.8.0

# compareDF 1.7.0
## compareDF 1.7.0
* Provided options to name the columns in the HTML output
* Provided option change column name
* Provided option to change group column name

# compareDF 1.6.0
## compareDF 1.6.0
* Added option to specify different types of tolerances. Now you can use `difference` as an argument to use difference rather than ratio
* Fixed some bugs
* Lot more tests

# compareDF 1.5.0
* Added an option to preserve the rows that have not changed in the anlysis using the `keep_unchanged_rows` argument
## compareDF 1.5.0
* Added an option to preserve the rows that have not changed in the analysis using the `keep_unchanged_rows` argument
* Added an option to set the color scheme in the HTML using the `color_scheme` argument.
* Updated Documentation
* Fixed some bugs
* Fixed dependencies

# compareDF 1.3.1
## compareDF 1.3.1
* added a test dependency(stringr) as notified by CRAN

# compareDF 1.3.0
## compareDF 1.3.0
* added an option to ignore errors `stop_on_error`

# compareDF 1.2.0
## compareDF 1.2.0
* Fixed bugs
* view_html function to view the html file created for setups that cannot print html directly

# compareDF 1.1.2
## compareDF 1.1.2
* compareDF can now handle negative inputs correctly

# compareDF 1.1.1
## compareDF 1.1.1
* Fixed some bugs when the two frames are similar except reordering

# compareDF 1.1.0
## compareDF 1.1.0
* Fixed some bugs

# compareDF 1.0.0
## compareDF 1.0.0

* First Release!



129 changes: 17 additions & 112 deletions R/fnsComparison.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,13 @@
#' @param group_col A character vector of a string of character vector showing the columns
#' by which to group_by.
#' @param exclude The columns which should be excluded from the comparison
#' @param limit_html maximum number of rows to show in the html diff. >1000 not recommended
#' @param stop_on_error Whether to stop on acceptable errors on not
#' @param tolerance The amount in fraction to which changes are ignored while showing the
#' visual representation. By default, the value is 0 and any change in the value of variables
#' is shown off. Doesn't apply to categorical variables.
#' @param tolerance_type Defaults to 'ratio'. The type of comparison for numeric values, can be 'ratio' or 'difference'
#' @param keep_unchanged_rows whether to preserve unchanged values or not. Defaults to \code{FALSE}
#' @param keep_unchanged_cols whether to preserve unchanged values or not. Defaults to \code{TRUE}
#' @param color_scheme What color scheme to use for the HTML output. Should be a vector/list with
#' named_elements. Default - \code{c("addition" = "green", "removal" = "red", "unchanged_cell" = "gray", "unchanged_row" = "deepskyblue")}
#' @param html_headers A character vector of column names to be used in the table. Defaults to \code{colnames}.
#' @param html_change_col_name Name of the change column to use in the HTML table. Defaults to \code{chng_type}.
#' @param html_group_col_name Name of the group column to be used in the table (if there are multiple grouping vars). Defaults to \code{grp}.
#' @param round_output_to Number of digits to round the output to. Defaults to 3.
#' @import dplyr
#' @export
Expand All @@ -32,10 +26,8 @@
#' ctable = compare_df(new_df, old_df, c("var1"))
#' print(ctable$comparison_df)
#' ctable$html_output
compare_df <- function(df_new, df_old, group_col, exclude = NULL, limit_html = 100, tolerance = 0, tolerance_type = 'ratio',
compare_df <- function(df_new, df_old, group_col, exclude = NULL, tolerance = 0, tolerance_type = 'ratio',
stop_on_error = TRUE, keep_unchanged_rows = FALSE, keep_unchanged_cols = TRUE,
color_scheme = c("addition" = "green", "removal" = "red", "unchanged_cell" = "gray", "unchanged_row" = "deepskyblue"),
html_headers = NULL, html_change_col_name = "chng_type", html_group_col_name = "grp",
round_output_to = 3){

both_tables = list(df_new = df_new, df_old = df_old)
Expand Down Expand Up @@ -84,20 +76,18 @@ compare_df <- function(df_new, df_old, group_col, exclude = NULL, limit_html = 1
if(nrow(comparison_table) == 0) stop_or_warn("The two data frames are the same after accounting for tolerance!", stop_on_error)
if(nrow(comparison_table_diff) == 0) stop_or_warn("The two data frames are the same after accounting for tolerance!", stop_on_error)

html_headers_all = get_headers_for_html_table(html_headers, html_change_col_name, html_group_col_name, comparison_table_diff)

if (limit_html > 0 & nrow(comparison_table_diff) > 0 & nrow(comparison_table) > 0)
html_table = create_html_table(comparison_table_diff, comparison_table_ts2char, group_col, limit_html, color_scheme, html_headers_all) else
html_table = NULL
change_count = create_change_count(comparison_table, group_col)
change_summary = create_change_summary(change_count, both_tables)

comparison_table$chng_type = comparison_table$chng_type %>% replace_numbers_with_symbols()
comparison_table_diff = comparison_table_diff %>% replace_numbers_with_symbols()
comparison_table_diff_symbols = comparison_table_diff %>% replace_numbers_with_symbols()

list(comparison_df = comparison_table, html_output = html_table,
comparison_table_diff = comparison_table_diff,
change_count = change_count, change_summary = change_summary)
list(comparison_df = comparison_table,
comparison_table_diff = comparison_table_diff_symbols,
change_count = change_count, change_summary = change_summary,
group_col = group_col,
comparison_table_ts2char = comparison_table_ts2char,
comparison_table_diff_numbers = comparison_table_diff)

}

Expand Down Expand Up @@ -127,7 +117,9 @@ exclude_columns <- function(both_tables, exclude){
group_columns <- function(both_tables, group_col){
message("Grouping grouping columns")
df_combined = rbind(both_tables$df_new %>% mutate(from = "new"), both_tables$df_old %>% mutate(from = "old"))
df_combined = df_combined %>% piped.do.call(group_by_, group_col) %>% data.frame(grp = group_indices(.), .) %>% ungroup
df_combined = df_combined %>%
group_by_at(group_col) %>%
data.frame(grp = group_indices(.), .) %>% ungroup()
list(df_new = df_combined %>% filter(from == "new") %>% select(-from),
df_old = df_combined %>% filter(from == "old") %>% select(-from))
}
Expand Down Expand Up @@ -155,14 +147,15 @@ create_comparison_table <- function(both_diffs, group_col, round_output_to){
if(nrow(both_diffs$df1_2) != 0) mixed_df = mixed_df %>% rbind(data.frame(chng_type = "1", both_diffs$df1_2))
if(nrow(both_diffs$df2_1) != 0) mixed_df = mixed_df %>% rbind(data.frame(chng_type = "2", both_diffs$df2_1))
mixed_df %>%
arrange(desc(chng_type)) %>% arrange_(group_col) %>%
arrange(desc(chng_type)) %>%
arrange_at(group_col) %>%
# mutate(chng_type = ifelse(chng_type == 1, "1", "2")) %>%
select(one_of(group_col), everything()) %>% round_num_cols(round_output_to)
}


create_comparison_table_diff <- function(comparison_table_ts2char, group_col, tolerance, tolerance_type){
comparison_table_ts2char %>% group_by_(group_col) %>%
comparison_table_ts2char %>% group_by_at(group_col) %>%
do(.diff_type_df(., tolerance = tolerance, tolerance_type = tolerance_type)) %>% as.data.frame
}

Expand All @@ -172,34 +165,6 @@ eliminate_tolerant_rows <- function(comparison_table, comparison_table_diff){
comparison_table %>% filter(!rows_inside_tolerance)
}

#' @importFrom utils head
create_html_table <- function(comparison_table_diff, comparison_table_ts2char, group_col, limit_html, color_scheme, html_headers_all){

comparison_table_ts2char$chng_type = comparison_table_ts2char$chng_type %>% replace_numbers_with_symbols()

if(limit_html > 1000 & comparison_table_diff %>% nrow > 1000)
warning("Creating HTML diff for a large dataset (>1000 rows) could take a long time!")

if(limit_html < nrow(comparison_table_diff))
message("Truncating HTML diff table to ", limit_html, " rows...")

requireNamespace("htmlTable")
comparison_table_color_code = comparison_table_diff %>% do(.colour_coding_df(., color_scheme)) %>% as.data.frame

shading = ifelse(sequence_order_vector(comparison_table_ts2char[[group_col]]) %% 2, "#dedede", "white")

table_css = lapply(comparison_table_color_code, function(x)
paste0("padding: .2em; color: ", x, ";")) %>% data.frame %>% head(limit_html) %>% as.matrix()

colnames(comparison_table_ts2char) <- html_headers_all

message("Creating HTML table for first ", limit_html, " rows")
html_table = htmlTable::htmlTable(comparison_table_ts2char %>% head(limit_html),
col.rgroup = shading,
rnames = F, css.cell = table_css,
padding.rgroup = rep("5em", length(shading))
)
}

check_if_comparable <- function(df_new, df_old, group_col, stop_on_error){

Expand All @@ -223,15 +188,6 @@ round_num_cols <- function(df, round_digits = 2)
df
}

.colour_coding_df <- function(df, color_scheme){
if(nrow(df) == 0) return(df)
df[df == 2] = color_scheme[['addition']]
df[df == 1] = color_scheme[['removal']]
df[df == 0] = color_scheme[['unchanged_cell']]
df[df == -1] = color_scheme[['unchanged_row']]
df
}

#' @importFrom stats na.omit
.diff_type_df <- function(df, tolerance = 1e-6, tolerance_type = 'ratio'){

Expand Down Expand Up @@ -278,8 +234,6 @@ rowdiff <- function(x.1,x.2,...){
df
}

piped.do.call = function(x, fname, largs) do.call(fname, c(list(x), largs))

is.POSIXct <- function(x) inherits(x, "POSIXct")

sequence_order_vector <- function(data)
Expand All @@ -289,7 +243,7 @@ sequence_order_vector <- function(data)
}

create_change_count <- function(comparison_table_ts2char, group_col){
change_count = comparison_table_ts2char %>% group_by_(group_col, "chng_type") %>% tally()
change_count = comparison_table_ts2char %>% group_by_at(c(group_col, "chng_type")) %>% tally()
change_count_replace = change_count %>% tidyr::spread(key = chng_type, value = n) %>% data.frame
change_count_replace[is.na(change_count_replace)] = 0

Expand All @@ -298,7 +252,7 @@ create_change_count <- function(comparison_table_ts2char, group_col){
change_count_replace = change_count_replace %>% as.data.frame %>%
tidyr::gather_("variable", "value", c("X2", "X1"))

change_count = change_count_replace %>% group_by_(group_col) %>% arrange_('variable') %>%
change_count = change_count_replace %>% group_by_at(group_col) %>% arrange_at('variable') %>%
summarize(changes = min(value), additions = value[2] - value[1], removals = value[1] - value[2]) %>%
mutate(additions = replace(additions, is.na(additions) | additions < 0, 0)) %>%
mutate(removals = replace(removals, is.na(removals) | removals < 0, 0))
Expand All @@ -312,7 +266,7 @@ create_change_summary <- function(change_count, both_tables){
changes = sum(change_count$changes), additions = sum(change_count$additions), removals = sum(change_count$removals))
}

get_headers_for_html_table <- function(headers, change_col_name, group_col_name, comparison_table_diff) {
get_headers_for_table <- function(headers, change_col_name, group_col_name, comparison_table_diff) {
# if (is.null(headers)) return(names(comparison_table_diff))

headers_all = names(comparison_table_diff) %>%
Expand All @@ -324,52 +278,3 @@ get_headers_for_html_table <- function(headers, change_col_name, group_col_name,

headers_all
}

# nocov start
#' @title View Comparison output HTML
#'
#' @description Some versions of Rstudio doesn't automatically show the html pane for the html output. This is a workaround
#'
#' @param comparison_output output from the comparisonDF compare function
#' @export
#' @examples
#' old_df = data.frame(var1 = c("A", "B", "C"),
#' val1 = c(1, 2, 3))
#' new_df = data.frame(var1 = c("A", "B", "C"),
#' val1 = c(1, 2, 4))
#' ctable = compare_df(new_df, old_df, c("var1"))
#' # Not Run::
#' # view_html(ctable)
view_html <- function(comparison_output){
temp_dir = tempdir()
temp_file <- paste0(temp_dir, "/temp.html")
cat(comparison_output$html_output, file = temp_file)
getOption("viewer")(temp_file)
unlink("temp.html")
}
# nocov end

# Deprecated. Will bring it back in a letter version if deemed necessary
# create_change_detail_summary <- function(){
# change_detail = comparison_table_diff
# change_detail[[group_col]] = comparison_table_ts2char[[group_col]]
# change_detail = change_detail %>% reshape::melt.data.frame(group_col)
#
# change_detail_replace = change_detail %>% group_by_(group_col, "variable", "value") %>% tally()
# change_detail_replace = change_detail_replace %>% group_by_(group_col, "variable") %>% tidyr::spread(key = value, value = n)
# change_detail_replace[is.na(change_detail_replace)] = 0
# change_detail_summary_replace = change_detail_replace %>% data.frame %>% dplyr::rename(param = variable) %>%
# mutate(param = as.character(param)) %>% tidyr::gather("variable", "value", 3:ncol(.))
#
# change_detail_count = change_detail_summary_replace %>% group_by_(group_col, "param") %>% arrange(desc(variable)) %>%
# summarize(changes = min(value[1:2]), additions = value[1] - value[2], removals = value[2] - value[1]) %>%
# mutate(additions = replace(additions, is.na(additions), 0)) %>%
# mutate(removals = replace(removals, is.na(removals), 0))
# change_detail_count = change_detail_count %>%
# mutate(replace(changes, changes < 0, 0)) %>%
# mutate(replace(removals, removals < 0, 0)) %>%
# mutate(replace(additions, additions < 0, 0))
#
# change_detail_count_summary = change_detail_count %>% group_by(param) %>%
# summarize(total_changes = sum(changes), total_additions = sum(additions), tot_removals = sum(removals))
# }
Loading

0 comments on commit 44f5635

Please sign in to comment.