kaybenleroll
diff --git a/‎Dockerfile
+2-1 b/‎Dockerfile
+2-1
diff --git a/‎Makefile
+1 b/‎Makefile
+1
diff --git a/‎build_models.Rmd
+35-13 b/‎build_models.Rmd
+35-13
diff --git a/‎build_models.html
+250-231 b/‎build_models.html
+250-231
diff --git a/‎exploring_graph_data.Rmd
+9-5 b/‎exploring_graph_data.Rmd
+9-5
diff --git a/‎exploring_graph_data.html
+109-104 b/‎exploring_graph_data.html
+109-104
diff --git a/‎exploring_retail_data.html
+123-123 b/‎exploring_retail_data.html
+123-123
diff --git a/‎exploring_retail_dataexplorer.html
+39-39 b/‎exploring_retail_dataexplorer.html
+39-39
diff --git a/‎initial_arules_models.Rmd
+2-4 b/‎initial_arules_models.Rmd
+2-4
diff --git a/‎initial_arules_models.html
+90-92 b/‎initial_arules_models.html
+90-92
diff --git a/‎initial_btyd_models.html
+70-70 b/‎initial_btyd_models.html
+70-70
diff --git a/‎initial_rfm_models.html
+57-57 b/‎initial_rfm_models.html
+57-57
diff --git a/‎initial_timeseries_models.html
+53-52 b/‎initial_timeseries_models.html
+53-52
diff --git a/‎retrieve_retail_data.Rmd
+16-1 b/‎retrieve_retail_data.Rmd
+16-1
diff --git a/‎retrieve_retail_data.html
+52-31 b/‎retrieve_retail_data.html
+52-31
@@ -1,4 +1,4 @@
-FROM rocker/verse:4.1.0
+FROM rocker/verse:4.1.1
 
 RUN apt-get update \
   && apt-get upgrade -y \
@@ -42,6 +42,7 @@ RUN apt-get update \
     ggwordcloud \
     kableExtra \
     pryr \
+    revealjs \
     rfm \
     rmdformats \
     sessioninfo \
 
@@ -48,6 +48,7 @@ initial_rfm_models.html: exploring_retail_data.html
 initial_timeseries_models.html: exploring_retail_data.html
 build_models.html: initial_arules_models.html initial_btyd_models.html \
   initial_rfm_models.html initial_timeseries_models.html
+summary_slides.html: build_models.html
 
 
 clean-html:
 
@@ -303,8 +303,7 @@ to see which clustering may be the
 ```{r create_largest_subgraph_clusters, echo=TRUE, cache=TRUE}
 run_subgraph_clusters <- function(graph_cluster_func, rules_tblgraph, ...) {
   subgraph_clusters_tbl <- rules_tblgraph %>%
-    to_subgraph(component_size == max(component_size)) %>%
-    use_series(subgraph) %>%
+    convert(to_subgraph, component_size == max(component_size)) %>%
     morph(to_undirected) %>%
     mutate(
       sub_id = graph_cluster_func(...)
@@ -387,8 +386,7 @@ algorithm and use this to create our various product groups.
 
 ```{r construct_fast_greedy_clusters, echo=TRUE}
 subgraph_groups_tbl <- apriori_rules_tblgraph %>%
-  to_subgraph(component_size == max(component_size)) %>%
-  use_series(subgraph) %>%
+  convert(to_subgraph, component_size == max(component_size)) %>%
   morph(to_undirected) %>%
   mutate(
     sub_id = group_louvain()
@@ -509,7 +507,13 @@ nodes_tbl <- list(stock_nodes_tbl, invoice_nodes_tbl) %>%
   bind_rows()
 
 edges_tbl <- tnx_purchase_tbl %>%
-  select(stock_code, invoice_id, quantity, price)
+  group_by(stock_code, invoice_id) %>%
+  summarise(
+    .groups = "drop",
+    
+    total_quantity = sum(quantity),
+    total_cost     = sum(quantity * price)
+    )
 
 
 basket_tblgraph <- tbl_graph(
@@ -583,8 +587,7 @@ cluster_func <- c(
     )
 
 largecomp_tblgraph <- basket_tblgraph %>%
-  to_subgraph(component_size == max(component_size)) %>%
-  use_series(subgraph)
+  convert(to_subgraph, component_size == max(component_size))
 
 cluster_data_tbl <- tibble(cluster_func_name = cluster_func) %>%
   mutate(
@@ -1328,6 +1331,7 @@ make_df_matrix <- function(data_tbl) {
 
 ```{r create_segment_group_frequency_data, echo=TRUE}
 segment_group_freq_tbl <- tnx_correspondence_tbl %>%
+  filter(product_group != "TNX_011") %>%
   count(segment, product_group, name = "freq_count") %>%
   pivot_wider(
     id_cols     = segment,
@@ -1373,19 +1377,19 @@ segment_group_ca %>%
 
 
 According to the biplots, there is a suggested relationship between customers
-in the "Champions" category and those products in grouping "TNX_009" - we
-ignore "TNX_010" as it is very small.
+in the "Champions" category and those products in grouping "TNX_007" - we may
+also want to look at the products in group "TNX_001".
 
 As before, let us look at a wordcloud on the types of items in that.
 
-```{r plot_tnx_009_word_cloud, echo=TRUE}
-wc_009_tbl <- product_group_tokens_tbl %>%
-  filter(product_group == "TNX_009") %>%
+```{r plot_tnx_007_word_cloud, echo=TRUE}
+wc_007_tbl <- product_group_tokens_tbl %>%
+  filter(product_group == "TNX_007") %>%
   count(word, name = "freq") %>%
   slice_max(order_by = freq, n = 100)
 
 wc_plot <- ggwordcloud2(
-    data    = wc_009_tbl,
+    data    = wc_007_tbl,
     shuffle = FALSE,
     size    = 4,
     seed    = 42421
@@ -1414,6 +1418,24 @@ wc_plot %>% plot()
 ```
 
 
+# Write Data to Disk
+
+We now want to write this data to the disk for later use.
+
+```{r write_data_disk, echo=TRUE}
+product_group_tnxgroups_tbl %>% write_rds("data/product_group_tnxgroups_tbl.rds")
+
+customer_rfmdata      %>% write_rds("data/customer_rfmdata.rds")
+customer_segments_tbl %>% write_rds("data/customer_segments_tbl.rds")
+
+validation_rfm_data_tbl %>% write_rds("data/validation_rfm_data_tbl.rds")
+
+segment_group_mat %>% write_rds("data/segment_group_mat.rds")
+
+product_group_tokens_tbl %>% write_rds("data/product_group_tokens_tbl.rds")
+```
+
+
 # R Environment
 
 ```{r show_session_info, echo=TRUE, message=TRUE}
 
@@ -173,7 +173,13 @@ nodes_tbl <- list(stock_nodes_tbl, invoice_nodes_tbl) %>%
   bind_rows()
 
 edges_tbl <- tnx_purchase_tbl %>%
-  select(stock_code, invoice_id, quantity, price)
+  group_by(stock_code, invoice_id) %>%
+  summarise(
+    .groups = "drop",
+    
+    total_quantity = sum(quantity),
+    total_cost     = sum(quantity * price)
+    )
 
 
 basket_tblgraph <- tbl_graph(
@@ -247,8 +253,7 @@ cluster_func <- c(
     )
 
 largecomp_tblgraph <- basket_tblgraph %>%
-  to_subgraph(component_size == max(component_size)) %>%
-  use_series(subgraph)
+  convert(to_subgraph, component_size == max(component_size))
 
 cluster_data_tbl <- tibble(cluster_func_name = cluster_func) %>%
   mutate(
@@ -364,8 +369,7 @@ pairwise_largecomp_tblgraph <- pairwise_tblgraph %>%
   group_by(component_id) %>%
   mutate(component_size = n()) %>%
   ungroup() %>%
-  to_subgraph(component_size == max(component_size)) %>%
-  use_series(subgraph)
+  convert(to_subgraph, component_size == max(component_size))
 
 pairwise_largecomp_tblgraph %>% print()
 ```
 
@@ -481,8 +481,7 @@ further graph clustering algorithms to create further groupings.
 
 ```{r create_large_component_clusters, echo=TRUE}
 apriori_rules_large_tblgraph <- apriori_rules_tblgraph %>%
-  to_subgraph(component_size == max(component_size)) %>%
-  use_series(subgraph) %>%
+  convert(to_subgraph, component_size == max(component_size)) %>%
   morph(to_undirected) %>%
   mutate(
     sub_id = group_louvain()
@@ -713,8 +712,7 @@ product_groups_lower_all_tbl %>% glimpse()
 
 ```{r construct_largest_subgraph_groups, echo=TRUE}
 apriori_lower_rules_bigcomp_tblgraph <- apriori_lower_rules_tblgraph %>%
-  to_subgraph(component_size == max(component_size)) %>%
-  use_series(subgraph) %>%
+  convert(to_subgraph, component_size == max(component_size)) %>%
   mutate(
     sub_id = group_louvain()
     )
 
@@ -133,10 +133,25 @@ retail_data_tbl %>% glimpse()
 ```
 
 
+A number of invoice entries have been duplicated, so we only keep one set of
+this data.
+
+```{r deduplicate_rows, echo=TRUE}
+dedupe_data_tbl <- retail_data_tbl %>%
+  group_nest(excel_sheet, Invoice, .key = "invoice_data") %>%
+  group_by(Invoice) %>%
+  slice_max(order_by = excel_sheet, n = 1, with_ties = FALSE) %>%
+  ungroup() %>%
+  unnest(invoice_data)  
+
+dedupe_data_tbl %>% glimpse()
+```
+
+
 Finally, we output this data to the disk.
 
 ```{r write_data_to_disk, echo=TRUE}
-retail_data_tbl %>% write_rds("data/retail_data_raw_tbl.rds")
+dedupe_data_tbl %>% write_rds("data/retail_data_raw_tbl.rds")
 ```