Commit v1.0

jimmyliu1326 committed Oct 13, 2024
1 parent b00e170 commit c5cd4f6
Showing 47 changed files with 1,947 additions and 287 deletions.
2 changes: 1 addition & 1 deletion assets/adaptivecard.json
@@ -17,7 +17,7 @@
"size": "Large",
"weight": "Bolder",
"color": "<% if (success) { %>Good<% } else { %>Attention<%} %>",
- "text": "nf/samntrek v${version} - ${runName}",
+ "text": "SamnTrek v${version} - ${runName}",
"wrap": true
},
{
4 changes: 2 additions & 2 deletions assets/email_template.txt
@@ -1,10 +1,10 @@
Run Name: $runName

<% if (success){
- out << "## nf/samntrek execution completed successfully! ##"
+ out << "## SamnTrek execution completed successfully! ##"
} else {
out << """####################################################
- ## nf/samntrek execution completed unsuccessfully! ##
+ ## SamnTrek execution completed unsuccessfully! ##
####################################################
The exit status of the task that caused the workflow execution to fail was: $exitStatus.
The full error message was:
8 changes: 4 additions & 4 deletions assets/methods_description_template.yml
@@ -1,13 +1,13 @@
- id: "nf-samntrek-methods-description"
+ id: "samntrek-methods-description"
description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication."
- section_name: "nf/samntrek Methods Description"
- section_href: "https://github.com/nf/samntrek"
+ section_name: "jimmyliu1326/SamnTrek Methods Description"
+ section_href: "https://github.com/jimmyliu1326/samntrek"
plot_type: "html"
## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline
## You inject any metadata in the Nextflow '${workflow}' object
data: |
<h4>Methods</h4>
- <p>Data was processed using nf/samntrek v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (<a href="https://doi.org/10.1038/s41587-020-0439-x">Ewels <em>et al.</em>, 2020</a>), utilising reproducible software environments from the Bioconda (<a href="https://doi.org/10.1038/s41592-018-0046-7">Grüning <em>et al.</em>, 2018</a>) and Biocontainers (<a href="https://doi.org/10.1093/bioinformatics/btx192">da Veiga Leprevost <em>et al.</em>, 2017</a>) projects.</p>
+ <p>Data was processed using jimmyliu1326/samntrek v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (<a href="https://doi.org/10.1038/s41587-020-0439-x">Ewels <em>et al.</em>, 2020</a>), utilising reproducible software environments from the Bioconda (<a href="https://doi.org/10.1038/s41592-018-0046-7">Grüning <em>et al.</em>, 2018</a>) and Biocontainers (<a href="https://doi.org/10.1093/bioinformatics/btx192">da Veiga Leprevost <em>et al.</em>, 2017</a>) projects.</p>
<p>The pipeline was executed with Nextflow v${workflow.nextflow.version} (<a href="https://doi.org/10.1038/nbt.3820">Di Tommaso <em>et al.</em>, 2017</a>) with the following command:</p>
<pre><code>${workflow.commandLine}</code></pre>
<p>${tool_citations}</p>
6 changes: 3 additions & 3 deletions assets/multiqc_config.yml
@@ -1,14 +1,14 @@
report_comment: >
- This report has been generated by the <a href="https://github.com/nf/samntrek/releases/tag/0.1" target="_blank">nf/samntrek</a>
+ This report has been generated by the <a href="https://github.com/jimmyliu1326/SamnTrek/releases/tag/0.1" target="_blank">jimmyliu1326/SamnTrek</a>
analysis pipeline.
report_section_order:
- "nf-samntrek-methods-description":
+ "samntrek-methods-description":
order: -1000
software_versions:
order: -1001
- "nf-samntrek-summary":
+ "samntrek-summary":
order: -1002

export_plots: true
Binary file added assets/pipeline-logo.png
6 changes: 3 additions & 3 deletions assets/samplesheet.csv
@@ -1,3 +1,3 @@
- sample,fastq_1,fastq_2
- SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
- SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+ sample,genome
+ GCA_008046965.1,/home/jimmyliu/projects/def-sponsor00/jimmyliu/pipelines/SamnSorter/test/GCA_008046965.1.fa
+ GCA_008046795.1,/home/jimmyliu/projects/def-sponsor00/jimmyliu/pipelines/SamnSorter/test/GCA_008046795.1.fa
17 changes: 5 additions & 12 deletions assets/schema_input.json
@@ -1,7 +1,7 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/nf/samntrek/master/assets/schema_input.json",
- "title": "nf/samntrek pipeline - params.input schema",
+ "title": "SamnTrek - params.input schema",
"description": "Schema for the file provided with params.input",
"type": "array",
"items": {
@@ -13,21 +13,14 @@
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["id"]
},
- "fastq_1": {
+ "genome": {
"type": "string",
"format": "file-path",
"exists": true,
- "pattern": "^\\S+\\.f(ast)?q\\.gz$",
- "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
- },
- "fastq_2": {
- "type": "string",
- "format": "file-path",
- "exists": true,
- "pattern": "^\\S+\\.f(ast)?q\\.gz$",
- "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+ "pattern": "^\\S+\\.f(ast)?a(\\.gz)?$",
+ "errorMessage": "Path to genome assembly file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'"
}
},
- "required": ["sample", "fastq_1"]
+ "required": ["sample", "genome"]
}
}
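Note on the new 'genome' pattern: it also admits gzipped FASTA paths ('.fa.gz' / '.fasta.gz'), even though the error message names only '.fa' or '.fasta'. A quick sanity check of the regex in R (file names below are invented):

    pattern <- "^\\S+\\.f(ast)?a(\\.gz)?$"   # same escaping as the JSON source
    files <- c("GCA_008046965.1.fa", "sample.fasta", "sample.fa.gz", "reads.fastq.gz", "has space.fa")
    grepl(pattern, files)
    # TRUE TRUE TRUE FALSE FALSE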
141 changes: 102 additions & 39 deletions bin/dist2ngbrs.R
@@ -103,25 +103,38 @@ hdbscan_search <- function(
theme_bw(15) +
labs(x = "minPts",
y = "Number of clusters")
- p3 <- ggarrange(p, p2, ncol = 2)
+ p3 <- ggarrange(p, p2, ncol = 1)
if (!is.null(outfile)) { ggsave(outfile, p3) } # save plot
# fit hdbscan using optimal minPts
- opt_minpts <- tryCatch(
- clust_stats_df %>% filter(clusters_n > 2) %>% filter(sil == max(sil)) %>% pull(minpts) %>% min(),
- error = function(e) {
- clust_stats_df$minpts[which.max(clust_stats_df$sil)]
+ min_clusters_n <- 5
+ print(clust_stats_df)
+ if ( nrow(mat) >= 10000 ) {
+ if ( any(clust_stats_df$clusters_n >= min_clusters_n) ) {
+ opt_minpts <- clust_stats_df %>% filter(clusters_n >= min_clusters_n) %>% filter(sil == max(sil, na.rm = T)) %>% pull(minpts) %>% min()
+ } else {
+ opt_minpts <- clust_stats_df$minpts[which.max(clust_stats_df$sil)]
}
- )
+ } else {
+ opt_minpts <- clust_stats_df$minpts[which.max(clust_stats_df$sil)]
+ }
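# Toy illustration (invented values) of the selection rule above for large
# inputs: among fits producing at least min_clusters_n clusters, take the
# smallest minPts attaining the best silhouette score.
#   stats <- data.frame(minpts = c(15, 25, 35), clusters_n = c(6, 8, 3), sil = c(0.41, 0.41, 0.55))
#   stats %>% filter(clusters_n >= 5) %>% filter(sil == max(sil, na.rm = T)) %>% pull(minpts) %>% min()  # -> 15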

cat("Optimal minPts: ", opt_minpts, "\n")
- cat("Fitting HDBSCAN using optimal minPts...\n")
- dbscan.fit <- hdbscan(mat[,c('core', 'accessory')], minPts = opt_minpts)
+ if (length(opt_minpts) == 0) {
+ cat("No valid optimal minPts found...\n")
+ opt_minpts <- NA
+ dbscan.fit <- NA
+ } else {
+ cat("Fitting HDBSCAN using optimal minPts...\n")
+ dbscan.fit <- hdbscan(mat[,c('core', 'accessory')], minPts = opt_minpts)
+ }
# return list obj
return(
list(
'fit' = dbscan.fit,
'opt_minpts' = opt_minpts
)
)

}
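# Runnable aside (invented data): dbscan::hdbscan() labels noise points as
# cluster 0, which is why callers must treat a 0 (or NA) cluster specially.
#   library(dbscan); set.seed(1)
#   fit <- hdbscan(cbind(core = runif(200), accessory = runif(200)), minPts = 10)
#   table(fit$cluster)  # cluster 0 = noise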

distance_search <- function(
@@ -133,10 +146,7 @@ distance_search <- function(
cat("Searching by distance...\n")
mat.filt <- mat %>%
filter(core <= max_core,
- accessory <= -1*max_accessory/max_core+max_accessory) %>%
- mutate(dist = sqrt(core^2+accessory^2)) %>%
- arrange(dist) %>%
- slice(1:if_else(nrow(.) >= top_hits, top_hits, nrow(.)))
+ accessory <= max_accessory)
return(mat.filt)
}
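# Sketch (toy numbers): the filter is now the axis-aligned rectangle
# core <= max_core & accessory <= max_accessory; the distance ranking and
# top_hits truncation moved into main_search().
#   toy <- data.frame(core = c(0.01, 0.04, 0.06), accessory = c(0.009, 0.009, 0.001))
#   dplyr::filter(toy, core <= 0.05, accessory <= 0.01)  # keeps rows 1 and 2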

@@ -147,7 +157,8 @@ main_search <- function(
minPts = seq(5, 15, 1), # range of minPts to search
outdir = ".", # output directory
subsample = 10000, # subsample size
- top_hits = 2000 # maximum number of top hits to return
+ top_hits = 2000, # maximum number of top hits to return
+ id = NULL # sample ID
) {
# initialize output obj
res <- list(
@@ -165,12 +176,33 @@
)
# perform prefiltering
if (length(prefilter_dist) == 2) {
- res[['mat']] <- filter(mat, core <= prefilter_dist[1], accessory <= prefilter_dist[2])
- cat("Number of subject IDs after prefiltering:", nrow(res[['mat']]), "\n")
- # stop if no candidates remain
+ res[['mat']] <- filter(mat, core <= prefilter_dist[1], accessory <= prefilter_dist[2])
+ cat("Number of subject IDs after prefiltering:", nrow(res[['mat']]), "\n")
+ # stop if no candidates remain
if (nrow(res[['mat']]) == 0) {
writeLines(character(0), con = file.path(outdir, "SamnTrek_hits.txt"))
- quit("No rows found after prefiltering. Exiting...\n")
+ data.frame(
+ 'id' = id,
+ 'subjects_n' = nrow(mat),
+ 'top_hits_n' = 0,
+ 'total_hits_n' = 0,
+ "clusters_n" = 0
+ ) %>%
+ write.table(file = file.path(outdir, "SamnTrek_search_stats.tsv"),
+ sep = "\t", row.names = F, col.names = T, quote = F)
+ cat("No rows found after prefiltering. Exiting...\n")
+ quit(save = 'no')
}
}
+ # disable clustering if fewer than min_n subject IDs
+ min_n <- 150
+ if ( nrow(res[['mat']]) < min_n ) {
+ cat("Detected less than", min_n, "subject IDs after prefiltering, insufficient data for reliable unsupervised clustering...\n")
+ if (max_dist == 'auto') {
+ max_dist <- "0.025,0.0005"
+ cat("Setting core distance threshold to: ", unlist(str_split(max_dist, ','))[1], "\n")
+ cat("Setting accessory distance threshold to: ", unlist(str_split(max_dist, ','))[2], "\n")
+ cat("To override this, please specify a value using --distance\n")
+ }
+ }
# subsample size
@@ -182,13 +214,27 @@
# execute search
if (max_dist != 'auto') {
# verify distance format
- if (length(unlist(str_split(args$distance, ','))) != 2) {
+ if (length(unlist(str_split(max_dist, ','))) != 2) {
stop("Invalid distance format. Please specify core and accessory distances separated by a comma e.g. 0.05,0.01\n")
- core_dist <- as.numeric(unlist(str_split(args$distance, ','))[1])
- accessory_dist <- as.numeric(unlist(str_split(args$distance, ','))[2])
}
+ core_dist <- as.numeric(unlist(str_split(max_dist, ','))[1])
+ accessory_dist <- as.numeric(unlist(str_split(max_dist, ','))[2])
m <- distance_search(res[['mat']], core_dist, accessory_dist, top_hits)
- res[['ngbrs_ids']] <- as.character(m$id)
+ if (nrow(m) == 0) {
+ res[['total_hits']] <- NULL
+ } else {
+ res[['total_hits']] <- nrow(m)
+ }
+ # keep top hits
+ m <- m %>%
+ mutate(dist = sqrt(core^2+accessory^2)) %>%
+ arrange(dist) %>%
+ slice(1:if_else(nrow(.) >= top_hits, top_hits, nrow(.)))
+ if (nrow(m) == 0) {
+ res[['ngbrs_ids']] <- NULL
+ } else {
+ res[['ngbrs_ids']] <- as.character(m$id)
+ }
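# Worked toy example (invented values) of the ranking above: hits are ordered
# by Euclidean distance sqrt(core^2 + accessory^2) and truncated to top_hits.
#   toy <- data.frame(id = c("a", "b", "c"), core = c(0.03, 0.01, 0.02), accessory = c(0.004, 0.002, 0.001))
#   toy %>% mutate(dist = sqrt(core^2 + accessory^2)) %>% arrange(dist) %>% slice(1:2) %>% pull(id)  # "b" "c"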
res[['core']] <- as.numeric(core_dist)
res[['accessory']] <- as.numeric(accessory_dist)
} else {
@@ -200,14 +246,26 @@
} else {
res[['hdbscan_mat']] <- res[['mat']]
}
+ # check validity of minPts range
+ if ( max(minPts) > nrow(res[['hdbscan_mat']]) ) {
+ minPts <- minPts %>% subset(. <= nrow(res[['hdbscan_mat']]))
+ }
# optimize hdbscan hyperparameter
dbscan.fit <- hdbscan_search(
res[['hdbscan_mat']],
minPts,
outfile = file.path(outdir, 'hdbscan_cluster_score.png')
)
res[['opt_minpts']] <- dbscan.fit[['opt_minpts']]
- res[['cluster']] <- if_else(dbscan.fit[['fit']]$cluster == 0, NA, dbscan.fit[['fit']]$cluster)
+ # check validity of the most optimal hdbscan fit
+ cat("Validity of fit:", !all(is.na(dbscan.fit[['fit']])), "\n")
+ if (!all(is.na(dbscan.fit[['fit']]))) {
+ res[['cluster']] <- dbscan.fit[['fit']]$cluster
+ } else {
+ res[['cluster']] <- NA
+ }
# find cluster with minimum distance to origin
cat("Calculating cluster centroids...\n")
res[['centroids']] <- res[['hdbscan_mat']] %>%
cbind(cluster = res[['cluster']]) %>%
filter(!is.na(cluster)) %>%
@@ -220,6 +278,7 @@
res[['accessory']] <- 0.025
res[['core']] <- 0.0005
res[['clusters_n']] <- 0
+ res[['centroids']] <- NULL
cat("Setting core distance threshold to: ", res[['core']], "\n")
cat("Setting accessory distance threshold to: ", res[['accessory']], "\n")
cat("To override this, please specify a value using --distance\n")
@@ -330,19 +389,18 @@ if (args$minPts == "auto" & args$distance == "auto") {
min_minpts <- 15
max_minpts <- 155
step_minpts <- 10
- if ( nrow(mat) >= max_minpts) {
- minpts_range <- c(seq(min_minpts, max_minpts, step_minpts))
- } else {
- minpts_range <- seq(min_minpts, nrow(mat), step_minpts)
- }
# disable clustering if fewer than min_n subject IDs
- min_n <- 100
+ min_n <- 150
if ( nrow(mat) < min_n ) {
- cat("Detected less than", min_n, "subject IDs, insufficient data for reliable unsupervised clustering...")
- args$distance <- c(0.025, 0.0005)
- cat("Setting core distance threshold to: ", args$distance[1], "\n")
- cat("Setting accessory distance threshold to: ", args$distance[2], "\n")
+ cat("Detected less than", min_n, "subject IDs, insufficient data for reliable unsupervised clustering...\n")
+ args$distance <- "0.025,0.0005"
+ cat("Setting core distance threshold to: ", as.numeric(unlist(str_split(args$distance, ','))[1]), "\n")
+ cat("Setting accessory distance threshold to: ", as.numeric(unlist(str_split(args$distance, ','))[2]), "\n")
cat("To override this, please specify a value using --distance\n")
+ } else if ( nrow(mat) >= max_minpts) {
+ minpts_range <- seq(min_minpts, max_minpts, step_minpts)
+ } else {
+ minpts_range <- seq(min_minpts, nrow(mat), step_minpts)
+ }
} else if (args$distance == "auto" & args$minPts != "auto") {
max_minpts <- as.numeric(unlist(str_split(args$minPts, ','))[2])
@@ -369,7 +427,8 @@ search_res <- main_search(
minPts = minpts_range, # range of minPts to search
outdir = args$outdir, # output directory
subsample = as.numeric(args$subsample), # subsample size
- top_hits = as.numeric(args$top_hits) # maximum number of top hits to return
+ top_hits = as.numeric(args$top_hits), # maximum number of top hits to return
+ id = sub("\\.[^\\.]*$", "", (basename(args$file)))
)
# plot analytical results
cat("Plotting clustering results...\n")
@@ -384,19 +443,23 @@ plot_dist(
centroids = search_res[['centroids']], # cluster centroids coordinates
clusters = search_res[['cluster']], # cluster assignments
outfile = file.path(args$outdir, 'core_accessory_plot.png'), # output file path
- subsample = search_res[['subsample']] # subsample size
+ subsample = search_res[['subsample']], # subsample size
+ opt_minpts = search_res[['opt_minpts']] # optimal minPt
)
# write neighbour accessions
cat("Writing out neighbour accessions...\n")
- writeLines(as.character(search_res[['ngbrs_ids']]),
- con = file.path(args$outdir, "SamnTrek_hits.txt"))
+ if (is.null(search_res[['ngbrs_ids']])) {
+ writeLines(character(0), con = file.path(args$outdir, "SamnTrek_hits.txt"))
+ } else {
+ writeLines(as.character(search_res[['ngbrs_ids']]), con = file.path(args$outdir, "SamnTrek_hits.txt"))
+ }
cat("Writing out search stats...\n")
data.frame(
'id' = sub("\\.[^\\.]*$", "", (basename(args$file))),
'subjects_n' = nrow(mat),
- 'top_hits_n' = length(search_res[['ngbrs_ids']]),
- 'total_hits_n' = search_res[['total_hits']],
- "clusters_n" = search_res[['clusters_n']]
+ 'top_hits_n' = ifelse(is.null(search_res[['ngbrs_ids']]), 0, length(search_res[['ngbrs_ids']]) ),
+ 'total_hits_n' = ifelse(is.null(search_res[['total_hits']]), 0, search_res[['total_hits']]),
+ "clusters_n" = ifelse(is.null(search_res[['clusters_n']]), 0, search_res[['clusters_n']])
) %>%
write.table(file = file.path(args$outdir, "SamnTrek_search_stats.tsv"),
sep = "\t", row.names = F, col.names = T, quote = F)
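The ifelse(is.null(...), 0, ...) guards above keep the stats table schema stable: NULL arguments silently vanish from data.frame(), so an empty search would otherwise drop columns. A minimal sketch (toy values):

    data.frame(id = "s1", hits = NULL)                          # NULL vanishes; one column
    data.frame(id = "s1", hits = ifelse(is.null(NULL), 0, NA))  # two columns, hits = 0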
28 changes: 28 additions & 0 deletions bin/merge_summary.R
@@ -0,0 +1,28 @@
#!/usr/bin/env Rscript

# load pkgs
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(argparse))

# cli argument parser
parser <- ArgumentParser()
parser$add_argument("files", nargs='+')
parser$add_argument("-o", "--outfile", type="character", default="./SamnTrek_summary.tsv",
help="Output summary file path [%(default)s]")
args <- parser$parse_args()

# read and combine result files
out <- args$files %>%
map(~fread(., header = T)) %>%
purrr::reduce(dplyr::left_join, by = 'id') %>%
select(id, final_clust, subjects_n, top_hits_n, total_hits_n, clusters_n) %>%
rename('cluster_id' = 'final_clust')

# write output
write.table(
out,
file = file.path(args$outfile),
sep = "\t", row.names = F, col.names = T, quote = F
)
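The new merge_summary.R left-joins the per-sample stats tables on 'id' and renames final_clust to cluster_id. A hedged sketch of the merge step with invented frames, plus a hypothetical invocation (file names are placeholders):

    library(dplyr); library(purrr)
    a <- data.frame(id = "s1", final_clust = 2)
    b <- data.frame(id = "s1", subjects_n = 120, top_hits_n = 50, total_hits_n = 75, clusters_n = 3)
    list(a, b) %>% reduce(left_join, by = "id") %>% rename("cluster_id" = "final_clust")
    # shell: ./merge_summary.R sampleA_stats.tsv sampleB_stats.tsv -o SamnTrek_summary.tsv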
