From b9b7d376a4fc683741c2645904db5d18d06cdfed Mon Sep 17 00:00:00 2001
From: Usman Rashid <usman@smme.edu.pk>
Date: Tue, 10 Dec 2024 15:34:55 +1300
Subject: [PATCH] Fixed issues in genepal-report

---
 CHANGELOG.md           |  4 +++-
 bin/genepal_report.Rmd | 29 +++++++++++++++++++++++++----
 conf/base.config       |  3 +++
 3 files changed, 31 insertions(+), 5 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 93c3522..e4d0ca6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v0.6.0 - [6-Dec-2024]
+## v0.6.0 - [10-Dec-2024]
 
 ### 'Added'
 
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 1. Fixed an issue where TSEBRA failed because LIFTOFF lifted non-protein coding genes [#121](https://github.com/Plant-Food-Research-Open/genepal/issues/121)
 2. Switched branch name from `master` to `main` in the GHA CIs
+3. Fixed an issue in `genepal_report.Rmd` which caused the pangene matrix plot to fail when the number of clusters exceeded 65536 [#124](https://github.com/Plant-Food-Research-Open/genepal/issues/124)
+4. Fixed an issue where the `GENEPALREPORT` process failed due to an OOM kill signal from SLURM [#123](https://github.com/Plant-Food-Research-Open/genepal/issues/123)
 
 ### `Dependencies`
 
diff --git a/bin/genepal_report.Rmd b/bin/genepal_report.Rmd
index d85ee12..5d0ac38 100755
--- a/bin/genepal_report.Rmd
+++ b/bin/genepal_report.Rmd
@@ -190,22 +190,43 @@ cat("<br>")
 
 
 ```{r pheatmap, eval=(exists("n0_df") && !is.null(n0_df$heatmap)), results='hide', fig.align='center', fig.cap="Heatmap showing number of proteins present in each orthocluster (clusters where all individuals have 1 copy are excluded). Columns = Orthologue cluster, Row = Individual", fig.width=7, fig.height=7, dpi=150, warning=FALSE}
-pheatmap(n0_df$heatmap,
+# The plot fails when there are more than 65536 clusters (columns); see
+# https://github.com/Plant-Food-Research-Open/genepal/issues/124
+# so only a capped number of columns is plotted.
+max_cols_to_plot <- 5000L
+
+n_cols <- ncol(n0_df$heatmap)
+max_cols_allowed <- min(n_cols, max_cols_to_plot)
+
+# Keep the largest clusters (highest column sums); a random sample() of
+# columns is the alternative. seq_len() never counts down like seq(1, 0).
+col_order <- order(colSums(n0_df$heatmap), decreasing = TRUE)
+selected_cols <- col_order[seq_len(max_cols_allowed)]
+
+# Mention the truncation in the plot title only when columns were dropped.
+prefix_text <- ""
+if (n_cols != max_cols_allowed) {
+  prefix_text <- paste0("Top ", max_cols_allowed, " ")
+}
+
+pheatmap(n0_df$heatmap[, selected_cols, drop = FALSE],
   show_colnames = FALSE,
-  main = "Orthologue clusters containing accessory proteins",
+  main = paste0(prefix_text, "Orthologue clusters"),
   legend = TRUE,
   legend_labels = TRUE,
   border_color = "white"
 )
 
-pheatmap(n0_df$heatmap,
+pheatmap(n0_df$heatmap[, selected_cols, drop = FALSE],
   filename = file.path(outputs_folder, "pangene.matrix.heatmap.pdf"),
   show_colnames = FALSE,
-  main = "Orthologue clusters containing accessory proteins",
+  main = paste0(prefix_text, "Orthologue clusters"),
   legend = TRUE,
   legend_labels = TRUE,
   border_color = "white"
 )
+
+write.csv(x = transform_hogs(n0o), file = file.path(outputs_folder, "pangenome.matrix.csv"), row.names = FALSE)
 ```
 
 
diff --git a/conf/base.config b/conf/base.config
index d0173d1..408a69d 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -74,4 +74,7 @@ process {
         cpus = { 8          * task.attempt  }
         time = { 7.days     * task.attempt  }
     }
+    withName:GENEPALREPORT {
+        memory = { 20.GB    * task.attempt  } // explicit memory request, scaled per retry attempt; added after SLURM OOM kills (issue #123)
+    }
 }