Skip to content

Commit 0fbf8e8

Browse files
committed
Adding code and output for use in Workshop 4
1 parent 6437693 commit 0fbf8e8

11 files changed

+923
-511
lines changed

Dockerfile

+10-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ RUN apt-get update \
55
graphviz \
66
less \
77
liblapack-dev \
8+
libtk8.6 \
9+
pbzip2 \
810
p7zip-full \
11+
tk8.6 \
912
&& apt-get clean \
1013
&& rm -rf /var/lib/apt/lists/* \
1114
&& install2.r --error \
@@ -22,11 +25,15 @@ RUN apt-get update \
2225
conflicted \
2326
cowplot \
2427
DataExplorer \
28+
DT \
2529
directlabels \
2630
evir \
2731
fitdistrplus \
2832
fs \
2933
furrr \
34+
ggraph \
35+
lobstr \
36+
pryr \
3037
rfm \
3138
rmdformats \
3239
snakecase \
@@ -37,6 +44,7 @@ RUN apt-get update \
3744
tidytext \
3845
timetk
3946

47+
RUN Rscript -e 'BiocManager::install("Rgraphviz")'
4048

4149
COPY build/conffiles.7z /tmp
4250
COPY build/docker_install_rpkgs.R /tmp
@@ -59,4 +67,5 @@ RUN 7z x /tmp/conffiles.7z \
5967
&& cp conffiles/.Rprofile . \
6068
&& cp conffiles/user-settings .rstudio/monitored/user-settings/ \
6169
&& chown -R rstudio:rstudio /home/rstudio \
62-
&& rm -rfv conffiles/
70+
&& rm -rfv conffiles/
71+

exploring_graph_data.html

+37-50
Large diffs are not rendered by default.

exploring_retail_data.Rmd

+37-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ output:
66
rmdformats::readthedown:
77
toc_depth: 3
88
use_bookdown: yes
9+
css: styles.css
910

1011
html_document:
1112
fig_caption: yes
@@ -1424,12 +1425,47 @@ cleaned_data_tbl %>% glimpse()
14241425
```
14251426

14261427

1428+
# Construct Stock Code Lookups
1429+
1430+
Finally, we want to construct a lookup table that provides some free-text
1431+
fields for each `stock_code` value.
1432+
1433+
```{r construct_stock_code_table, echo=TRUE}
1434+
stock_code_lookup_tbl <- cleaned_data_tbl %>%
1435+
filter(exclude == FALSE, quantity > 0, !are_na(description)) %>%
1436+
mutate(
1437+
stock_code_upr = stock_code %>% str_trim() %>% str_to_upper()
1438+
) %>%
1439+
select(stock_code_upr, description) %>%
1440+
drop_na(description) %>%
1441+
distinct() %>%
1442+
group_by(stock_code_upr) %>%
1443+
summarise(
1444+
.groups = "drop",
1445+
1446+
desc = description %>% sort() %>% str_c(collapse = " : ")
1447+
) %>%
1448+
arrange(stock_code_upr)
1449+
1450+
stock_code_lookup_tbl %>% glimpse()
1451+
```
1452+
1453+
We also look at this output using DT
1454+
1455+
```{r view_stock_code_lookup, echo=TRUE}
1456+
stock_code_lookup_tbl %>% datatable()
1457+
```
1458+
1459+
1460+
14271461
# Output Cleaned Data
14281462

14291463
```{r write_to_disk, echo=TRUE}
1464+
stock_code_lookup_tbl %>% write_rds("data/stock_code_lookup_tbl.rds")
1465+
14301466
returns_lookup_tbl %>% write_rds("data/returns_lookup_tbl.rds")
14311467
1432-
cleaned_data_tbl %>% write_rds("data/retail_data_cleaned_tbl.rds")
1468+
cleaned_data_tbl %>% write_rds("data/retail_data_cleaned_tbl.rds")
14331469
```
14341470

14351471

exploring_retail_data.html

+46-12
Large diffs are not rendered by default.

exploring_retail_dataexplorer.html

+2-2
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ <h2><a href="#content">Exploring the Cleaned Online Retail Dataset with DataExpl
205205
</div>
206206
<div id="postamble" data-toggle="wy-nav-shift" class="status">
207207
<p class="author"><span class="glyphicon glyphicon-user"></span> Mick Cooney <a href="mailto:mickcooney@gmail.com" class="email">mickcooney@gmail.com</a></p>
208-
<p class="date"><span class="glyphicon glyphicon-calendar"></span> Last updated: April 26, 2021</p>
208+
<p class="date"><span class="glyphicon glyphicon-calendar"></span> Last updated: May 24, 2021</p>
209209
</div>
210210
</div>
211211

@@ -324,7 +324,7 @@ <h1><span class="header-section-number">4</span> R Environment</h1>
324324
## collate en_US.UTF-8
325325
## ctype en_US.UTF-8
326326
## tz Etc/UTC
327-
## date 2021-04-26
327+
## date 2021-05-24
328328
##
329329
## ─ Packages ───────────────────────────────────────────────────────────────────
330330
## package * version date lib source

initial_arules_models.Rmd

+164-6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ output:
66
rmdformats::readthedown:
77
toc_depth: 3
88
use_bookdown: yes
9+
css: styles.css
910

1011
html_document:
1112
fig_caption: yes
@@ -100,6 +101,16 @@ We now write this data out as a CSV so `arules` can read it in and process it.
100101
tnx_purchase_tbl %>% write_csv("data/tnx_purchase_tbl.csv")
101102
```
102103

104+
We also want to load the free-text description of the various stock items as
105+
this will help will interpretation of the various rules we have found.
106+
107+
```{r load_stock_code_descriptions, echo=TRUE}
108+
stock_code_lookups_tbl <- read_rds("data/stock_code_lookup_tbl.rds")
109+
110+
stock_code_lookups_tbl %>% glimpse()
111+
```
112+
113+
103114

104115

105116
# Basket Analysis with Association Rules
@@ -202,13 +213,13 @@ $$
202213
So, to calculate the confidence for a rule:
203214

204215
$$
205-
\text{Supp}(\text{\{milk, bread\}} \Rightarrow \text{\{butter\}}) = \frac{0.2}{0.4} = 0.5.
216+
\text{Conf}(\text{\{milk, bread\}} \Rightarrow \text{\{butter\}}) = \frac{0.2}{0.4} = 0.5.
206217
$$
207218

208219
To illustrate how rules are not symmetric:
209220

210221
$$
211-
\text{Supp}(\text{\{butter\}} \Rightarrow \text{\{milk, bread\}}) = \frac{0.2}{0.6} = 0.33.
222+
\text{Conf}(\text{\{butter\}} \Rightarrow \text{\{milk, bread\}}) = \frac{0.2}{0.6} = 0.33.
212223
$$
213224

214225

@@ -385,7 +396,7 @@ apriori_rules_igraph <- basket_apriori %>%
385396
) %>%
386397
as("igraph")
387398
388-
apriori_rules_igraph %>% glimpse()
399+
apriori_rules_igraph %>% print()
389400
```
390401

391402
We should first visualise this graph, using the top 30 rules in the dataset,
@@ -434,7 +445,7 @@ product_groups_all_tbl <- apriori_rules_tblgraph %>%
434445
product_count = n()
435446
) %>%
436447
ungroup() %>%
437-
select(product_group_id = component_id, stock_code = label) %>%
448+
select(product_group_id = component_id, product_count, stock_code = label) %>%
438449
arrange(product_group_id, stock_code)
439450
440451
product_groups_all_tbl %>% glimpse()
@@ -488,7 +499,7 @@ product_groups_largest_tbl <- apriori_rules_large_tblgraph %>%
488499
product_count = n()
489500
) %>%
490501
ungroup() %>%
491-
select(product_group_id = sub_id, stock_code = label) %>%
502+
select(product_group_id = sub_id, product_count, stock_code = label) %>%
492503
arrange(product_group_id, stock_code)
493504
494505
product_groups_largest_tbl %>% glimpse()
@@ -573,7 +584,7 @@ group_props_tbl <- arules_groups_tbl %>%
573584
filter(group_size > 1, group_size < 15) %>%
574585
expand_grid(tnx_groups_tbl) %>%
575586
mutate(
576-
comb_data = future_map2(
587+
comb_data = future_map2(
577588
invoice_data, stock_data,
578589
inner_join,
579590
by = "stock_code",
@@ -606,6 +617,21 @@ ggplot(plot_tbl) +
606617
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
607618
```
608619

620+
621+
### Investigate Groups
622+
623+
Now that we have our groups we add on the description fields so that
624+
interpretation of the different groupings is easier.
625+
626+
```{r display_stock_groups, echo=TRUE}
627+
stock_groups_tbl %>%
628+
filter(group_size > 1, group_size < 15) %>%
629+
mutate(stock_code_upr = stock_code %>% str_trim() %>% str_to_upper()) %>%
630+
left_join(stock_code_lookups_tbl, by = "stock_code_upr") %>%
631+
datatable()
632+
```
633+
634+
609635
### Write Product Groups
610636

611637
As this may be useful for later analysis and for later modelling, we output
@@ -616,6 +642,138 @@ stock_groups_tbl %>% write_rds("data/stock_groups_tbl.rds")
616642
```
617643

618644

645+
646+
# Investigate Lower Support Rules
647+
648+
Our previous analysis was all based on rules with a minimum confidence of 0.80
649+
so we now want to repeat our analysis but on this new set of rules.
650+
651+
652+
```{r construct_lower_rules_graph, echo=TRUE}
653+
apriori_lower_rules_igraph <- basket_lower_rules %>%
654+
plot(
655+
measure = "support",
656+
method = "graph",
657+
control = list(max = 5000)
658+
) %>%
659+
as("igraph")
660+
661+
apriori_lower_rules_igraph %>% glimpse()
662+
```
663+
664+
Once again we have a quick look at the top 50 rules.
665+
666+
```{r visual_inspection_rules_igraph, echo=TRUE}
667+
basket_lower_rules %>%
668+
head(n = 50, by = "support") %>%
669+
plot(
670+
measure = "lift",
671+
method = "graph",
672+
engine = "htmlwidget"
673+
)
674+
```
675+
676+
677+
678+
## Determine Distinct Rules Subgraphs
679+
680+
Having converted the association rules to the graph, we then look at the
681+
distinct components of this graph and use these as our first pass at these
682+
clusters.
683+
684+
```{r create_lower_component_labels, echo=TRUE}
685+
apriori_lower_rules_tblgraph <- apriori_lower_rules_igraph %>%
686+
as_tbl_graph() %>%
687+
mutate(
688+
component_id = group_components()
689+
) %>%
690+
group_by(component_id) %>%
691+
mutate(
692+
component_size = n()
693+
) %>%
694+
ungroup()
695+
696+
apriori_lower_rules_tblgraph %>% print()
697+
```
698+
699+
700+
```{r construct_product_groups_all, echo=TRUE}
701+
product_groups_lower_all_tbl <- apriori_lower_rules_tblgraph %>%
702+
activate(nodes) %>%
703+
as_tibble() %>%
704+
filter(are_na(support)) %>%
705+
group_by(component_id) %>%
706+
mutate(
707+
product_count = n()
708+
) %>%
709+
ungroup() %>%
710+
select(product_group_id = component_id, product_count, stock_code = label) %>%
711+
arrange(product_group_id, stock_code)
712+
713+
product_groups_lower_all_tbl %>% glimpse()
714+
```
715+
716+
```{r construct_largest_subgraph_groups, echo=TRUE, cache=TRUE}
717+
apriori_lower_rules_bigcomp_tblgraph <- apriori_lower_rules_tblgraph %>%
718+
to_subgraph(component_size == max(component_size)) %>%
719+
use_series(subgraph) %>%
720+
morph(to_undirected) %>%
721+
mutate(
722+
sub_id = group_edge_betweenness()
723+
) %>%
724+
unmorph()
725+
```
726+
727+
728+
```{r convert_subgraph_groups_tbl}
729+
product_groups_lower_bigcomp_tbl <- apriori_lower_rules_bigcomp_tblgraph %>%
730+
activate(nodes) %>%
731+
as_tibble() %>%
732+
filter(are_na(support)) %>%
733+
group_by(sub_id) %>%
734+
mutate(
735+
product_count = n()
736+
) %>%
737+
ungroup() %>%
738+
select(product_group_id = sub_id, product_count, stock_code = label) %>%
739+
arrange(product_group_id, stock_code)
740+
741+
product_groups_lower_bigcomp_tbl %>% glimpse()
742+
```
743+
744+
745+
```{r construct_combined_data_groups_lower, echo=TRUE}
746+
stock_groups_lower_tbl <- list(
747+
ALL = product_groups_lower_all_tbl,
748+
LRG = product_groups_lower_bigcomp_tbl
749+
) %>%
750+
bind_rows(.id = "type") %>%
751+
mutate(
752+
group_label = sprintf("%s_%02d", type, product_group_id)
753+
) %>%
754+
group_by(group_label) %>%
755+
mutate(
756+
group_size = n()
757+
) %>%
758+
ungroup() %>%
759+
select(group_label, group_size, stock_code)
760+
761+
stock_groups_lower_tbl %>% glimpse()
762+
```
763+
764+
765+
Now that we have our groups we add on the description fields so that
766+
interpretation of the different groupings is easier.
767+
768+
```{r display_lower_stock_groups, echo=TRUE}
769+
stock_groups_lower_tbl %>%
770+
filter(group_size > 1, group_size != max(group_size)) %>%
771+
mutate(stock_code_upr = stock_code %>% str_trim() %>% str_to_upper()) %>%
772+
left_join(stock_code_lookups_tbl, by = "stock_code_upr") %>%
773+
datatable()
774+
```
775+
776+
619777
# R Environment
620778

621779
```{r show_session_info, echo=TRUE, message=TRUE}

0 commit comments

Comments
 (0)