Skip to content

Commit b9d0120

Browse files
committed
WORKSHOP8 - Added code for the final slides
1 parent b57c893 commit b9d0120

17 files changed

+1895
-823
lines changed

Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM rocker/verse:4.1.0
1+
FROM rocker/verse:4.1.1
22

33
RUN apt-get update \
44
&& apt-get upgrade -y \
@@ -42,6 +42,7 @@ RUN apt-get update \
4242
ggwordcloud \
4343
kableExtra \
4444
pryr \
45+
revealjs \
4546
rfm \
4647
rmdformats \
4748
sessioninfo \

Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ initial_rfm_models.html: exploring_retail_data.html
4848
initial_timeseries_models.html: exploring_retail_data.html
4949
build_models.html: initial_arules_models.html initial_btyd_models.html \
5050
initial_rfm_models.html initial_timeseries_models.html
51+
summary_slides.html: build_models.html
5152

5253

5354
clean-html:

build_models.Rmd

+35-13
Original file line numberDiff line numberDiff line change
@@ -303,8 +303,7 @@ to see which clustering may be the
303303
```{r create_largest_subgraph_clusters, echo=TRUE, cache=TRUE}
304304
run_subgraph_clusters <- function(graph_cluster_func, rules_tblgraph, ...) {
305305
subgraph_clusters_tbl <- rules_tblgraph %>%
306-
to_subgraph(component_size == max(component_size)) %>%
307-
use_series(subgraph) %>%
306+
convert(to_subgraph, component_size == max(component_size)) %>%
308307
morph(to_undirected) %>%
309308
mutate(
310309
sub_id = graph_cluster_func(...)
@@ -387,8 +386,7 @@ algorithm and use this to create our various product groups.
387386

388387
```{r construct_fast_greedy_clusters, echo=TRUE}
389388
subgraph_groups_tbl <- apriori_rules_tblgraph %>%
390-
to_subgraph(component_size == max(component_size)) %>%
391-
use_series(subgraph) %>%
389+
convert(to_subgraph, component_size == max(component_size)) %>%
392390
morph(to_undirected) %>%
393391
mutate(
394392
sub_id = group_louvain()
@@ -509,7 +507,13 @@ nodes_tbl <- list(stock_nodes_tbl, invoice_nodes_tbl) %>%
509507
bind_rows()
510508
511509
edges_tbl <- tnx_purchase_tbl %>%
512-
select(stock_code, invoice_id, quantity, price)
510+
group_by(stock_code, invoice_id) %>%
511+
summarise(
512+
.groups = "drop",
513+
514+
total_quantity = sum(quantity),
515+
total_cost = sum(quantity * price)
516+
)
513517
514518
515519
basket_tblgraph <- tbl_graph(
@@ -583,8 +587,7 @@ cluster_func <- c(
583587
)
584588
585589
largecomp_tblgraph <- basket_tblgraph %>%
586-
to_subgraph(component_size == max(component_size)) %>%
587-
use_series(subgraph)
590+
convert(to_subgraph, component_size == max(component_size))
588591
589592
cluster_data_tbl <- tibble(cluster_func_name = cluster_func) %>%
590593
mutate(
@@ -1328,6 +1331,7 @@ make_df_matrix <- function(data_tbl) {
13281331

13291332
```{r create_segment_group_frequency_data, echo=TRUE}
13301333
segment_group_freq_tbl <- tnx_correspondence_tbl %>%
1334+
filter(product_group != "TNX_011") %>%
13311335
count(segment, product_group, name = "freq_count") %>%
13321336
pivot_wider(
13331337
id_cols = segment,
@@ -1373,19 +1377,19 @@ segment_group_ca %>%
13731377

13741378

13751379
According to the biplots, there is a suggested relationship between customers
1376-
in the "Champions" category and those products in grouping "TNX_009" - we
1377-
ignore "TNX_010" as it is very small.
1380+
in the "Champions" category and those products in grouping "TNX_007" - we may
1381+
also want to look at customers in group "TNX_001".
13781382

13791383
As before, let us look at a wordcloud on the types of items in that.
13801384

1381-
```{r plot_tnx_009_word_cloud, echo=TRUE}
1382-
wc_009_tbl <- product_group_tokens_tbl %>%
1383-
filter(product_group == "TNX_009") %>%
1385+
```{r plot_tnx_007_word_cloud, echo=TRUE}
1386+
wc_007_tbl <- product_group_tokens_tbl %>%
1387+
filter(product_group == "TNX_007") %>%
13841388
count(word, name = "freq") %>%
13851389
slice_max(order_by = freq, n = 100)
13861390
13871391
wc_plot <- ggwordcloud2(
1388-
data = wc_009_tbl,
1392+
data = wc_007_tbl,
13891393
shuffle = FALSE,
13901394
size = 4,
13911395
seed = 42421
@@ -1414,6 +1418,24 @@ wc_plot %>% plot()
14141418
```
14151419

14161420

1421+
# Write Data to Disk
1422+
1423+
We now want to write this data to the disk for later use.
1424+
1425+
```{r write_data_disk, echo=TRUE}
1426+
product_group_tnxgroups_tbl %>% write_rds("data/product_group_tnxgroups_tbl.rds")
1427+
1428+
customer_rfmdata %>% write_rds("data/customer_rfmdata.rds")
1429+
customer_segments_tbl %>% write_rds("data/customer_segments_tbl.rds")
1430+
1431+
validation_rfm_data_tbl %>% write_rds("data/validation_rfm_data_tbl.rds")
1432+
1433+
segment_group_mat %>% write_rds("data/segment_group_mat.rds")
1434+
1435+
product_group_tokens_tbl %>% write_rds("data/product_group_tokens_tbl.rds")
1436+
```
1437+
1438+
14171439
# R Environment
14181440

14191441
```{r show_session_info, echo=TRUE, message=TRUE}

build_models.html

+250-231
Large diffs are not rendered by default.

exploring_graph_data.Rmd

+9-5
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,13 @@ nodes_tbl <- list(stock_nodes_tbl, invoice_nodes_tbl) %>%
173173
bind_rows()
174174
175175
edges_tbl <- tnx_purchase_tbl %>%
176-
select(stock_code, invoice_id, quantity, price)
176+
group_by(stock_code, invoice_id) %>%
177+
summarise(
178+
.groups = "drop",
179+
180+
total_quantity = sum(quantity),
181+
total_cost = sum(quantity * price)
182+
)
177183
178184
179185
basket_tblgraph <- tbl_graph(
@@ -247,8 +253,7 @@ cluster_func <- c(
247253
)
248254
249255
largecomp_tblgraph <- basket_tblgraph %>%
250-
to_subgraph(component_size == max(component_size)) %>%
251-
use_series(subgraph)
256+
convert(to_subgraph, component_size == max(component_size))
252257
253258
cluster_data_tbl <- tibble(cluster_func_name = cluster_func) %>%
254259
mutate(
@@ -364,8 +369,7 @@ pairwise_largecomp_tblgraph <- pairwise_tblgraph %>%
364369
group_by(component_id) %>%
365370
mutate(component_size = n()) %>%
366371
ungroup() %>%
367-
to_subgraph(component_size == max(component_size)) %>%
368-
use_series(subgraph)
372+
convert(to_subgraph, component_size == max(component_size))
369373
370374
pairwise_largecomp_tblgraph %>% print()
371375
```

exploring_graph_data.html

+109-104
Large diffs are not rendered by default.

exploring_retail_data.html

+123-123
Large diffs are not rendered by default.

exploring_retail_dataexplorer.html

+39-39
Large diffs are not rendered by default.

initial_arules_models.Rmd

+2-4
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,7 @@ further graph clustering algorithms to create further groupings.
481481

482482
```{r create_large_component_clusters, echo=TRUE}
483483
apriori_rules_large_tblgraph <- apriori_rules_tblgraph %>%
484-
to_subgraph(component_size == max(component_size)) %>%
485-
use_series(subgraph) %>%
484+
convert(to_subgraph, component_size == max(component_size)) %>%
486485
morph(to_undirected) %>%
487486
mutate(
488487
sub_id = group_louvain()
@@ -713,8 +712,7 @@ product_groups_lower_all_tbl %>% glimpse()
713712

714713
```{r construct_largest_subgraph_groups, echo=TRUE}
715714
apriori_lower_rules_bigcomp_tblgraph <- apriori_lower_rules_tblgraph %>%
716-
to_subgraph(component_size == max(component_size)) %>%
717-
use_series(subgraph) %>%
715+
convert(to_subgraph, component_size == max(component_size)) %>%
718716
mutate(
719717
sub_id = group_louvain()
720718
)

initial_arules_models.html

+90-92
Large diffs are not rendered by default.

initial_btyd_models.html

+70-70
Large diffs are not rendered by default.

initial_rfm_models.html

+57-57
Large diffs are not rendered by default.

initial_timeseries_models.html

+53-52
Large diffs are not rendered by default.

retrieve_retail_data.Rmd

+16-1
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,25 @@ retail_data_tbl %>% glimpse()
133133
```
134134

135135

136+
A number of invoice entries have been duplicated so we only keep on set of
137+
this data.
138+
139+
```{r deduplicate_rows, echo=TRUE}
140+
dedupe_data_tbl <- retail_data_tbl %>%
141+
group_nest(excel_sheet, Invoice, .key = "invoice_data") %>%
142+
group_by(Invoice) %>%
143+
slice_max(order_by = excel_sheet, n = 1, with_ties = FALSE) %>%
144+
ungroup() %>%
145+
unnest(invoice_data)
146+
147+
dedupe_data_tbl %>% glimpse()
148+
```
149+
150+
136151
Finally, we output this data to the disk.
137152

138153
```{r write_data_to_disk, echo=TRUE}
139-
retail_data_tbl %>% write_rds("data/retail_data_raw_tbl.rds")
154+
dedupe_data_tbl %>% write_rds("data/retail_data_raw_tbl.rds")
140155
```
141156

142157

retrieve_retail_data.html

+52-31
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)