-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathselect_important_anchors.R
177 lines (152 loc) · 7.43 KB
/
select_important_anchors.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# select_important_anchors.R
# Daniel Cotter
# 2024-09-12
# This script selects the most important anchors from the output of SPLASH
# it prioritizes anchors that have a high effect size and also have a large
# number of nonzero samples. It also filters out anchors that have lookup
# table hits to artifcats. The output is a list of anchor sequences that can
# be used in downstream analyses.
## import packages --------
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(Biostrings))
## parse arguments --------
# define command line arguments
option_list <- list(
make_option(c("-i", "--input"), "Input file", type="character"),
make_option(c("-o", "--output_prefix"), "Output prefix.", type="character"),
make_option(c("-n", "--num_anchors"), "Number of anchors to select",
type="integer", default = 150000),
make_option(c("--num_clusters"), "Number of clusters to create",
type="integer", default = 3000),
make_option(c("-e", "--effect_size"), "Effect size threshold",
type="numeric", default=0.7),
make_option(c("-l", "--lookup_table"), "Lookup table file", type="character"),
make_option(c("--splash_bin"), "Path to SPLASH binary folder",
type="character", default="/oak/stanford/groups/horence/dcotter1/splash-2.6.1/"),
make_option(c("--temp_dir"), "Temporary directory to store intermediate files",
type="character")
)
# parse command line arguments
opt <- parse_args(OptionParser(option_list = option_list))
# check that input file exists
if (!file.exists(opt$input) | is.null(opt$output_prefix)) {
stop("Must provide input and output files")
}
if (!file.exists(opt$lookup_table)) {
stop("Must provide path to lookup table")
}
# define output files
anchors_only_out = paste0(opt$output_prefix, "_anchor_list.txt")
anchor_clusters_out = paste0(opt$output_prefix, "_anchor_clusters.tsv")
# create a temporary directory to store intermediate files
if (!is.null(opt$temp_dir)) {
temp_dir <- ifelse(grepl("/$", opt$temp_dir),
opt$temp_dir,
paste0(opt$temp_dir, "/"))
system(paste("mkdir -p", temp_dir))
} else {
temp_dir <- file.path(dirname(opt$output_prefix), "tmp/")
system(paste("mkdir -p", temp_dir))
}
# cat to screen the input files, output files, temp dir and paramaters
cat("\n###################################################################\n")
cat("Running select_important_anchors.R with the following parameters:\n")
cat(paste("Input file:", opt$input))
cat("\n")
cat(paste("Output anchors:", anchors_only_out))
cat("\n")
cat(paste("Output anchor clusters:", anchor_clusters_out))
cat("\n")
cat(paste("Number of anchors to select:", opt$num_anchors))
cat("\n")
cat(paste("Number of clusters to create:", opt$num_clusters))
cat("\n")
cat(paste("Effect size threshold:", opt$effect_size))
cat("\n")
cat(paste("Lookup table file:", opt$lookup_table))
cat("\n")
cat(paste("SPLASH binary folder:", opt$splash_bin))
cat("\n")
cat(paste("Temporary directory:", temp_dir))
cat("\n")
cat("###################################################################\n\n")
## load the data
# read in the headers of the input file to identify the effect_size_bin column
headers <- fread(opt$input, nrows = 1, header=T)
effect_size_bin_col <- grep("effect_size_bin", names(headers))
# load the input file using awk to filter out rows with effect size < 0.7
load_cmd <- paste0("cat ", opt$input, " | awk '{OFS=\"\t\"}{if ($", effect_size_bin_col, " >= ", opt$effect_size, ") print $0}'")
if (grepl(".gz$", opt$input)) {
load_cmd = paste0("z", load_cmd)
}
# only select the first 18 columns
cat(paste("Reading in anchors from", opt$input, "with effect size >=", opt$effect_size))
cat("\n\n")
dt <- fread(cmd=load_cmd, header=TRUE, select = 1:18)
## run lookup table to filter out artifacts -----------
# write all anchors to a file
anchors_to_keep <- dt %>% select(anchor) %>% distinct()
anchor_file <- file.path(temp_dir, "all_anchors.txt") %>% gsub("//", "/", .)
anchors_to_keep %>% write.table(anchor_file, row.names=FALSE, col.names=FALSE, quote=FALSE)
# write anchor to a fasta file as well (for lookuptable)
anchors_fasta_file <- file.path(temp_dir, "all_anchors.fasta") %>% gsub("//", "/", .)
anchors_dna <- DNAStringSet(anchors_to_keep$anchor)
names(anchors_dna) <- anchors_to_keep$anchor
writeXStringSet(anchors_dna, anchors_fasta_file)
# run lookup table
out_lookup_stats <- file.path(temp_dir, "lookup_stats.txt") %>% gsub("//", "/", .)
lookup_cmd <- paste0(file.path(opt$splash_bin, "lookup_table"),
" query --kmer_skip 1 --truncate_paths --stats_fmt with_stats ",
anchors_fasta_file, " ", opt$lookup_table, " ", out_lookup_stats)
if (file.exists(out_lookup_stats)) {
cat("Lookup stats already in tmp directory. Delete tmp directory to force lookup table to run again...\n")
cat("Filtering anchors for artifacts...\n")
} else {
cat("Running lookup table...\n")
system(lookup_cmd)
cat("Finished Querying lookup table. Filtering anchors for artifacts...\n\n")
}
# read in the lookup table stats
lookup_stats <- fread(out_lookup_stats, header=F, col.names=c("query", "stats"))
lookup_stats <- lookup_stats %>% mutate(anchor = anchors_to_keep$anchor)
# filter out anchors that have lookup table hits to artifacts
artifact_pattern <- "plas|illum|syn|arp|RF|JUNK|Ral|purge|P,|Univec"
anchors_to_keep <- lookup_stats %>% filter(!grepl(artifact_pattern, query, ignore.case=T)) %>% select(anchor)
cat(paste0("Finished filtering. Kept ", nrow(anchors_to_keep), " anchors out of ", nrow(lookup_stats), " total anchors.\n"))
cat(paste0("Keeping the top ", opt$num_anchors, " by number_nonzero_samples for further analysis...\n\n"))
## select the most important anchors -----------
anchors_to_keep <- anchors_to_keep %>%
left_join(dt %>% select(anchor, number_nonzero_samples), by="anchor") %>%
arrange(desc(number_nonzero_samples)) %>%
head(opt$num_anchors)
# cluster the anchors using kmer distances
cat(paste("Clustering anchors into", opt$num_clusters, "clusters by kmer similarity...\n"))
# time
start_time <- Sys.time()
suppressPackageStartupMessages(library(ape))
suppressPackageStartupMessages(library(kmer))
suppressPackageStartupMessages(library(dendextend))
dna <- anchors_to_keep$anchor %>% DNAStringSet(.)
names(dna) <- anchors_to_keep$anchor
dna <- as.DNAbin(dna)
dna.tree <- kmer::cluster(dna)
saveRDS(dna.tree, file=file.path(temp_dir, "dna.tree.rds"))
anchor_clusters <- dendextend::cutree(dna.tree,
k=opt$num_clusters) %>%
enframe() %>%
dplyr::rename(anchor=name, cluster_id=value)
cat(paste("Finished clustering anchors in", round(Sys.time() - start_time, 2), "seconds.\n\n"))
## format anchor clusters ----------
cat("Arranging clusters by effect size...\n")
anchor_clusters <- anchor_clusters %>% left_join(dt, by="anchor") %>%
arrange(cluster_id, desc(effect_size_bin)) %>% select(cluster_id, anchor)
anchors_only <- anchor_clusters %>% select(anchor)
## write the output -----------
cat(paste("Writing", nrow(anchors_to_keep), "anchors to", anchors_only_out))
cat("\n")
anchors_only %>% write.table(anchors_only_out, row.names=FALSE, col.names=FALSE, quote=FALSE)
cat(paste("Writing", opt$num_clusters, "clusters and their anchors to", anchor_clusters_out))
cat("\n")
anchor_clusters %>% write.table(anchor_clusters_out, row.names=FALSE, col.names=FALSE, quote=FALSE)