# Makefile to run the pipeline
SHELL:=/bin/bash
export NXF_VER:=19.07.0
export NXF_ANSI_LOG=false
# extra params to pass for Nextflow in some recipes
EP:=
TIMESTAMP:=$(shell date +%s)
TIMESTAMP_str:=$(shell date +"%Y-%m-%d-%H-%M-%S")
ABSDIR:=$(shell python -c 'import os; print(os.path.realpath("."))')
DIRNAME:=$(shell python -c 'import os; print(os.path.basename(os.path.realpath(".")))')
REMOTE_ssh:=git@github.com:NYU-Molecular-Pathology/LG-PACT.git
REMOTE_http:=https://github.com/NYU-Molecular-Pathology/LG-PACT.git
.PHONY: annovar_db ref
# no default action
none:
# ~~~~~ SETUP PIPELINE ~~~~~ #
HOSTNAME:=$(shell echo $$HOSTNAME)
USER_HOME=$(shell echo "$$HOME")
USER_DATE:=$(shell date +%s)
# relative locations
# Nextflow "publishDir" directory of files to keep
publishDir:=output
# Nextflow "work" directory
workDir:=work
TRACEFILE:=trace.txt
NXF_FRAMEWORK_DIR:=$(USER_HOME)/.nextflow/framework/$(NXF_VER)
# the user's Nextflow framework dir can get stuck on NFS drives and prevent the install command from finishing
remove-framework:
@if [ -e "$(NXF_FRAMEWORK_DIR)" ]; then \
new_framework="$(NXF_FRAMEWORK_DIR).$(USER_DATE)" ; \
echo ">>> Moving old Nextflow framework dir $(NXF_FRAMEWORK_DIR) to $${new_framework}" ; \
mv "$(NXF_FRAMEWORK_DIR)" "$${new_framework}" ; \
fi
# install Nextflow in the current directory
# - removes user Nextflow framework dir if exists
# - if on 'phoenix' HPC, need to load Java module
./nextflow:
@[ -d "$(NXF_FRAMEWORK_DIR)" ] && $(MAKE) remove-framework || :
@if grep -q 'phoenix' <<<'$(HOSTNAME)'; then module unload java && module load java/1.8; fi ; \
curl -fsSL get.nextflow.io | bash
install: ./nextflow
nextflow-self-update: ./nextflow
./nextflow self-update
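# typical first-time setup (illustrative; uses only the targets defined above):
# `make install` # downloads the Nextflow launcher into this directory
# `make nextflow-self-update` # refreshes an existing ./nextflow in place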
# setup reference data
# NYUMC Big Purple data directory location
REFDIR:=/gpfs/data/molecpathlab/ref
ref: install
@if [ ! -d "$(REFDIR)" ] ; then echo ">>> system ref dir doesnt exist, setting up local ref dir..." ; \
./nextflow run ref.nf -profile ref ; fi
# demo pipeline dataset for testing
NGS580-demo-data:
git clone https://github.com/NYU-Molecular-Pathology/NGS580-demo-data.git
# set up for demo with the 'small' dataset (larger input, more data output)
demo-small: NGS580-demo-data
./generate-samplesheets.py NGS580-demo-data/small/ && \
mv targets.bed "targets.bed.$(TIMESTAMP)" && \
/bin/cp NGS580-demo-data/small/targets.bed .
./update-samplesheets.py --tumor-normal-sheet NGS580-demo-data/small/samples.tumor.normal.csv
# set up for demo with 'tiny' dataset (faster processing, ~1hr on NYU HPC)
demo: NGS580-demo-data
$(MAKE) config RUNID=demo FASTQDIR=NGS580-demo-data/tiny/fastq/ && \
$(MAKE) samplesheet && \
mv targets.bed "targets.bed.$(TIMESTAMP)" && \
/bin/cp NGS580-demo-data/tiny/targets.bed . && \
./update-samplesheets.py --tumor-normal-sheet NGS580-demo-data/tiny/samples.pairs.csv \
--pairs-tumor-colname '#SAMPLE-T' \
--pairs-normal-colname '#SAMPLE-N'
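# illustrative end-to-end demo session (assumes an interactive session with the demo data cloned above):
# `make demo` # write config, samplesheet, targets, and pairs for the 'tiny' dataset
# `make run` # then start the pipeline against it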
# ./generate-samplesheets.py NGS580-demo-data/tiny/fastq/ && \
# mv targets.bed "targets.bed.$(TIMESTAMP)" && \
# /bin/cp NGS580-demo-data/tiny/targets.bed .
# ./update-samplesheets.py --tumor-normal-sheet NGS580-demo-data/tiny/samples.pairs.csv \
# --pairs-tumor-colname '#SAMPLE-T' \
# --pairs-normal-colname '#SAMPLE-N'
#
# setup ANNOVAR reference databases
# NYUMC Big Purple data directory location
ANNOVAR_DB_DIR:=/gpfs/data/molecpathlab/ref/annovar/db
annovar_db: install
if [ ! -d "$(ANNOVAR_DB_DIR)" ] ; then echo ">>> system ANNOVAR db dir does not exist, setting up local dir..." ; ./nextflow run annovar_db.nf -profile annovar_db ; fi
annovar_db_power: install
if [ ! -d "$(ANNOVAR_DB_DIR)" ] ; then echo ">>> system ANNOVAR db dir does not exist, setting up local dir..." ; ./nextflow run annovar_db.nf -profile annovar_db_conda ; fi
annovar_db_bigpurple: install
if [ ! -d "$(ANNOVAR_DB_DIR)" ] ; then echo ">>> system ANNOVAR db dir does not exist, setting up local dir..." ; ./nextflow run annovar_db.nf -profile annovar_db_bigpurple ; fi
# main setup commands to use
setup: install ref annovar_db
setup-power: install ref annovar_db_power
check-runid:
@[ -z "$(RUNID)" ] && printf "ERROR: no RUNID specified\n" && exit 1 || :
check-fastqdir:
@[ -z "$(FASTQDIR)" ] && printf "ERROR: no FASTQDIR specified\n" && exit 1 || :
@[ ! -d "$(FASTQDIR)" ] && printf "ERROR: FASTQDIR is not a valid directory: $(FASTQDIR)\n" && exit 1 || :
# prepares a new directory with a copy of this repo to start analysis
# `make deploy RUNID=180316_NB501073_0036_AH3VFKBGX5 FASTQDIR=/ifs/data/molecpathlab/production/Demultiplexing/180316_NB501073_0036_AH3VFKBGX5/output/Unaligned/NS18-7`
# - check that valid args were passed
# - clone current repo into destination for analysis
# - create symlink to fastq dir for input
# - create samplesheet; assume '--name-mode noLaneSplit' by default
# location of production demultiplexing for deployment
FASTQDIR:=
FASTQDIRS:=
# samplesheet used for sequencing run demultiplexing
DEMUX_SAMPLESHEET:=
DEMUX_SAMPLESHEET_output:=demux-samplesheet.csv
# sequencing run name for deployment
RUNID:=
# location of production deployment for analysis
# NYUMC Big Purple directory location
PRODDIR:=/gpfs/data/molecpathlab/production/NGS607
deploy:
@$(MAKE) check-runid
@$(MAKE) check-fastqdir
@repo_dir="$${PWD}" && \
output_dir="$(PRODDIR)/$(RUNID)" && \
echo ">>> Setting up new repo in location: $${output_dir}" && \
git clone --recursive "$${repo_dir}" "$${output_dir}" && \
cd "$${output_dir}" && \
echo ">>> Linking input directory: $(FASTQDIR)" && \
ln -s "$(FASTQDIR)" input && \
[ -e "$(DEMUX_SAMPLESHEET)" ] && /bin/cp "$(DEMUX_SAMPLESHEET)" "$(DEMUX_SAMPLESHEET_output)" || : \
echo ">>> Creating input fastq sheet" && \
python generate-samplesheets.py --name-mode "$(NAMEMODE)" '$(FASTQDIR)' && \
echo ">>> Creating config file..." && \
$(MAKE) config CONFIG_OUTPUT="$${output_dir}/config.json" && \
[ -e "$(DEMUX_SAMPLESHEET_output)" ] && echo ">>> Adding demux samplesheet to config" && $(MAKE) config DEMUX_SAMPLESHEET="$(DEMUX_SAMPLESHEET_output)" CONFIG_OUTPUT="$${output_dir}/config.json" || : \
[ -e "$(DEMUX_SAMPLESHEET_output)" ] && $(MAKE) pairs PAIRS_SHEET="$(DEMUX_SAMPLESHEET_output)" PAIRS_MODE=demux
printf ">>> NGS607 analysis directory prepared: $${output_dir}\n"
CONFIG_INPUT:=.config.json
CONFIG_OUTPUT:=config.json
$(CONFIG_OUTPUT):
@echo ">>> Creating $(CONFIG_OUTPUT)"
@cp "$(CONFIG_INPUT)" "$(CONFIG_OUTPUT)"
SAMPLESHEET:=
config: $(CONFIG_OUTPUT)
@[ -n "$(RUNID)" ] && echo ">>> Updating runID config" && python config.py --update "$(CONFIG_OUTPUT)" --runID "$(RUNID)" || :
@[ -n "$(SAMPLESHEET)" ] && echo ">>> Updating samplesheet config" && python config.py --update "$(CONFIG_OUTPUT)" --samplesheet "$(SAMPLESHEET)" || :
@[ -n "$(DEMUX_SAMPLESHEET)" ] && echo ">>> Updating demultiplexing samplesheet config" && python config.py --update "$(CONFIG_OUTPUT)" --demux-samplesheet "$(DEMUX_SAMPLESHEET)" || :
@[ -n "$(FASTQDIR)" ] && echo ">>> Updating fastqDirs config" && python config.py --update "$(CONFIG_OUTPUT)" --fastqDirs "$(FASTQDIR)" || :
@[ -n "$(FASTQDIRS)" ] && echo ">>> Adding fastq dirs to config" && python config.py --update "$(CONFIG_OUTPUT)" --fastqDirs $(FASTQDIRS) || :
config-add-fastqdirs:
@if [ ! -z "$(FASTQDIRS)" ]; then \
for fastqdir in $(FASTQDIRS); do \
echo ">>> Adding $$fastqdir to $(CONFIG_OUTPUT)" ; \
$(MAKE) config FASTQDIR="$$fastqdir" ; \
done; else \
echo ">>> ERROR: no FASTQDIRS passed" ; \
fi
# generate a samplesheet for the analysis based on the configs
# use this for .fastq.gz files split by lane: NAMEMODE=LaneSplit
NAMEMODE:=noLaneSplit
SAMPLESHEET_OUTPUT:=samples.analysis.tsv
samplesheet:
@echo ">>> Getting fastqdirs from config file: $(CONFIG_OUTPUT)" && \
fastqdirs="$$(python -c 'import json; fastq_dirs = json.load(open("$(CONFIG_OUTPUT)")).get("fastqDirs", None); print(" " .join(fastq_dirs) if fastq_dirs else "" )')" && \
echo ">>> loaded fastqdirs: $${fastqdirs}" && \
echo ">>> Generating samplesheet '$(SAMPLESHEET_OUTPUT)' for fastqdirs" && \
python generate-samplesheets.py $(EP) --samples-analysis-tsv "$(SAMPLESHEET_OUTPUT)" --name-mode "$(NAMEMODE)" $${fastqdirs}
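# illustrative usage (assumes config.json already lists fastqDirs, e.g. via `make config`):
# `make samplesheet` # default, fastq names not split by lane
# `make samplesheet NAMEMODE=LaneSplit` # for .fastq.gz files split by lane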
# update the samplesheet with the sample pairs information
# requires samplesheet to exist
# default to 'sns' style sample pairs .csv file
PAIRS_SHEET:=samples.pairs.csv
# type of sheet to parse pairs from;
# 'sns': Igor sns pipeline format
# 'demux': Illumina demultiplexing samplesheet with extra 'Paired_Normal' column added to [Data] section
PAIRS_MODE:=sns
pairs:
@if [ ! -e "$(SAMPLESHEET_OUTPUT)" ]; then $(MAKE) samplesheet; fi && \
if [ ! -e "$(PAIRS_SHEET)" ]; then echo ">>> ERROR: PAIRS_SHEET does not exist: $(PAIRS_SHEET)"; exit 1; fi && \
if [ "$(PAIRS_MODE)" == "sns" ]; then \
echo ">>> Updating samplesheet with sample pairs from sns style sheet" ; \
python update-samplesheets.py --tumor-normal-sheet "$(PAIRS_SHEET)" --pairs-tumor-colname '#SAMPLE-T' --pairs-normal-colname '#SAMPLE-N' ; \
elif [ "$(PAIRS_MODE)" == "demux" ]; then \
echo ">>> Updating samplesheet with sample pairs from demultiplexing style sheet" ; \
python bin/demux2tumor_normal_sheet.py "$(PAIRS_SHEET)" samples.tumor.normal.csv && \
python update-samplesheets.py --tumor-normal-sheet samples.tumor.normal.csv ; \
else echo ">>> ERROR: PAIRS_MODE not recognized: $(PAIRS_MODE)"; exit 1; fi
# ~~~~~ UPDATE THIS REPO ~~~~~ #
# commands for bringing this directory's pipeline up to date
update: pull update-submodules update-nextflow
# pull latest version of repo
pull: remote
@echo ">>> Updating repo"
@git pull
# remove local Nextflow install and re-build
update-nextflow:
@if [ -f nextflow ]; then \
echo ">>> Removing old Nextflow" && \
rm -f nextflow && \
echo ">>> Reinstalling Nextflow" && \
$(MAKE) install ; \
else $(MAKE) install ; fi
# pull the latest version of all submodules
update-submodules: remote
@echo ">>> Updating git submodules"
@git submodule update --recursive --remote --init
# update the repo remote for ssh
remote-ssh:
@git remote set-url origin "$(REMOTE_ssh)"
remote:
@echo ">>> Setting git remote origin to $(REMOTE_http)"
@git remote set-url origin "$(REMOTE_http)"
# ~~~~~ RUN PIPELINE ~~~~~ #
# enable 'resume' functionality; set this to '' to disable
RESUME:=-resume
LOGDIR:=logs
LOGDIRABS:=$(shell python -c 'import os; print(os.path.realpath("$(LOGDIR)"))')
LOGID:=$(TIMESTAMP)
LOGFILEBASE:=log.$(LOGID).out
LOGFILE:=$(LOGDIR)/$(LOGFILEBASE)
# git version 2.16.0
GIT_CURRENT_BRANCH:=$(shell git rev-parse --abbrev-ref HEAD)
GIT_CURRENT_COMMIT:=$(shell git rev-parse --short HEAD)
GIT_CURRENT_TAG:=$(shell git describe --tags)
GIT_RECENT_TAG:=$(shell git describe --tags --abbrev=0)
# start run with logging
run: backup
@log_file="$(LOGDIR)/nextflow.$(LOGID).stdout.log" ; \
echo ">>> Running with stdout log file: $(LOGFILE)" ; \
$(MAKE) run-recurse 2>&1 | tee -a "$(LOGFILE)" ; \
echo ">>> Run completed at $$(date +"%Y-%m-%d %H:%M:%S"), stdout log file: $(LOGFILE)" ; \
$(MAKE) record RECDIR="$(publishDir)/logs"
# try to automatically determine which 'run' recipe to use based on hostname
run-recurse:
@if grep -q 'phoenix' <<<'$(HOSTNAME)'; then echo ">>> Running run-phoenix"; $(MAKE) run-phoenix ; \
elif grep -q 'bigpurple' <<<'$(HOSTNAME)'; then echo ">>> Running run-bigpurple"; $(MAKE) run-bigpurple ; \
else $(MAKE) run-local ; fi ;
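# illustrative usage; EP is passed verbatim to the `nextflow run` command line,
# so it can carry extra Nextflow options, e.g.:
# `make run EP='-with-timeline timeline.html'`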
# run on phoenix in current session
run-phoenix: install
@module unload java && module load java/1.8 && \
./nextflow run main.nf -profile phoenix $(RESUME) -with-dag flowchart.dot $(EP)
# run on NYU Big Purple HPC
SLURM_recurse:=
# HPC queue/partition to submit to by default
Q:=cpu_short
# max number of CPUs allowed to be used/requested on the intellispace queue (its total capacity)
Intellispace_queue_max:=80
Intellispace_queue_name:=intellispace
ifneq ($(SLURM_recurse),)
# check the count of intellispace queue cpus used/requested
Intellispace_queue_cpus:=$(shell squeue -p intellispace -o "%C" | tail -n +2 | paste -sd+ | bc)
# try to automatically detect a SLURM queue with idle nodes to submit to
AutoQ_SLURM_idle:=$(shell sinfo -N -O nodelist,partition,statelong | grep 'idle' | grep -v 'data_mover' | grep -v 'dev' | grep -v 'fn_' | grep -v 'cpu_long' | tr -s '[:space:]' | cut -d ' ' -f2 | sort -u | head -1)
# detect which 'mixed' queue has the most open nodes
AutoQ_SLURM_mixed:=$(shell sinfo -N -O nodelist,partition,statelong | grep 'mixed' | grep -v 'data_mover' | grep -v 'dev' | grep -v 'fn_' | grep -v 'cpu_long' | tr -s '[:space:]' | cut -d ' ' -f2 | sort | uniq -c | sort -k 1nr | head -1 | tr -s '[:space:]' | cut -d ' ' -f3)
# first option: use queue with most idle nodes; empty if none exist
ifneq ($(AutoQ_SLURM_idle),)
Q:=$(AutoQ_SLURM_idle)
# second option: use queue with most mixed nodes; empty if none exist
else ifneq ($(AutoQ_SLURM_mixed),)
Q:=$(AutoQ_SLURM_mixed)
# third option: use the intellispace queue if it has fewer than the max number of CPUs already used/requested
else ifeq ($(shell test $(Intellispace_queue_cpus) -lt $(Intellispace_queue_max); echo $$?),0)
Q:=$(Intellispace_queue_name)
endif
endif
test-q:
@$(MAKE) test-q-recurse SLURM_recurse=1
test-q-recurse:
@echo "Q: $(Q), AutoQ_SLURM_idle: $(AutoQ_SLURM_idle), AutoQ_SLURM_mixed: $(AutoQ_SLURM_mixed), Intellispace_queue_cpus: $(Intellispace_queue_cpus)"
run-bigpurple:
$(MAKE) run-bigpurple-recurse SLURM_recurse=1
run-bigpurple-recurse: Q_JSON:=/gpfs/home/kellys04/molecpathlab/pipelines/queue-stats/slurm.json
run-bigpurple-recurse: export NXF_DEBUG=3
run-bigpurple-recurse: install
./nextflow -trace nextflow.executor run main.nf \
-profile bigPurple $(RESUME) \
-with-dag flowchart.dot \
--queue_json "$(Q_JSON)" \
--GIT_CURRENT_BRANCH "$(GIT_CURRENT_BRANCH)" \
--GIT_CURRENT_COMMIT "$(GIT_CURRENT_COMMIT)" \
--GIT_CURRENT_TAG "$(GIT_CURRENT_TAG)" \
--GIT_RECENT_TAG "$(GIT_RECENT_TAG)" \
$(EP) && \
$(MAKE) finalize-work-rm -j $(SUBTHREADS) > work.rm.txt && \
$(MAKE) fix-permissions fix-group
# run locally default settings
run-local: install
./nextflow run main.nf \
-profile local \
$(RESUME) \
-with-dag flowchart.dot \
--GIT_CURRENT_BRANCH "$(GIT_CURRENT_BRANCH)" \
--GIT_CURRENT_COMMIT "$(GIT_CURRENT_COMMIT)" \
--GIT_CURRENT_TAG "$(GIT_CURRENT_TAG)" \
--GIT_RECENT_TAG "$(GIT_RECENT_TAG)" \
$(EP)
# run on Power server
run-power: install
source /shared/miniconda2/bin/activate /shared/biobuilds-2017.11 && \
./nextflow run main.nf -profile power $(RESUME) -with-dag flowchart.dot $(EP)
# submit the parent Nextflow process to phoenix HPC as a cluster job
SUBJOBNAME:=NGS607-$(DIRNAME)
SUBLOG:=$(LOGDIRABS)/slurm-%j.$(LOGFILEBASE)
SUBQ:=intellispace
SUBTIME:=--time=5-00:00:00
SUBTHREADS:=8
SUBMEM:=48G
SUBEP:=
NXF_NODEFILE:=.nextflow.node
NXF_JOBFILE:=.nextflow.jobid
NXF_PIDFILE:=.nextflow.pid
NXF_SUBMIT:=.nextflow.submitted
NXF_SUBMITLOG:=.nextflow.submitted.log
REMOTE:=
PID:=
SUBSCRIPT:=submit.bigpurple.sh
# check for an HPC submission lock file, then try to determine the submission recipe to use
submit:
@if [ -e "$(NXF_SUBMIT)" ]; then echo ">>> ERROR: Workflow locked by $(NXF_SUBMIT); has an instance of the pipeline has already been submitted?"; exit 1 ; \
else \
if grep -q 'phoenix' <<<'$(HOSTNAME)'; then echo ">>> Submission for phoenix not yet configured"; \
elif grep -q 'bigpurple' <<<'$(HOSTNAME)'; then echo ">>> Running submit-bigpurple"; $(MAKE) submit-bigpurple ; \
else echo ">>> ERROR: could not automatically determine 'submit' recipe to use, please consult the Makefile"; exit 1 ; fi ; \
fi
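# illustrative usage (resource values are placeholders; defaults are set below):
# `make submit SUBQ=cpu_medium SUBTIME='--time=2-00:00:00'`
# extra make variables can be forwarded into the job via SUBEP, e.g. SUBEP='EP=-resume'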
# generate a SLURM sbatch script to start the pipeline in a SLURM job
# sets a submission lock file
# NOTE: Nextflow locks itself against concurrent instances, but we also need to lock against multiple 'make submit' invocations
# catches 'scancel' commands and propagates them up to Nextflow for clean shutdown
$(SUBSCRIPT):
@printf '' > $(SUBSCRIPT)
@chmod +x $(SUBSCRIPT)
@echo '#!/bin/bash' >> $(SUBSCRIPT)
@echo '#SBATCH -D $(ABSDIR)' >> $(SUBSCRIPT)
@echo '#SBATCH -o $(SUBLOG)' >> $(SUBSCRIPT)
@echo '#SBATCH -J $(SUBJOBNAME)' >> $(SUBSCRIPT)
@echo '#SBATCH -p $(SUBQ)' >> $(SUBSCRIPT)
@echo '#SBATCH $(SUBTIME)' >> $(SUBSCRIPT)
@echo '#SBATCH --ntasks-per-node=1' >> $(SUBSCRIPT)
@echo '#SBATCH -c $(SUBTHREADS)' >> $(SUBSCRIPT)
@echo '#SBATCH --mem $(SUBMEM)' >> $(SUBSCRIPT)
@echo '#SBATCH --export=HOSTNAME' >> $(SUBSCRIPT)
@echo 'touch $(NXF_SUBMIT)' >> $(SUBSCRIPT)
@echo 'get_pid(){ head -1 $(NXF_PIDFILE); }' >> $(SUBSCRIPT)
@echo 'rm_submit(){ echo ">>> trap: rm_submit" ; [ -e $(NXF_SUBMIT) ] && rm -f $(NXF_SUBMIT) || : ; }' >> $(SUBSCRIPT)
@echo 'wait_pid(){ local pid=$$1 ; while kill -0 $$pid; do echo waiting for process $$pid to end ; sleep 1 ; done ; }' >> $(SUBSCRIPT)
@echo 'nxf_kill(){ rm_submit ; echo ">>> trap: nxf_kill" && pid=$$(get_pid) && kill $$pid && wait_pid $$pid ; }' >> $(SUBSCRIPT)
@echo 'trap nxf_kill HUP' >> $(SUBSCRIPT)
@echo 'trap nxf_kill INT' >> $(SUBSCRIPT)
@echo 'trap nxf_kill EXIT' >> $(SUBSCRIPT)
@echo 'make submit-bigpurple-run TIMESTAMP=$(TIMESTAMP) $(SUBEP)' >> $(SUBSCRIPT)
.PHONY: $(SUBSCRIPT)
# submit on Big Purple using SLURM
submit-bigpurple: $(SUBSCRIPT)
@sbatch $(SUBSCRIPT) | tee >(sed 's|[^[:digit:]]*\([[:digit:]]*\).*|\1|' > '$(NXF_JOBFILE)')
# run inside a SLURM sbatch
# store old pid and node entries in a backup file in case things get messy
# need to manually set the HOSTNAME here because it changes inside SLURM job
# TODO: come up with a better method for this ^^
submit-bigpurple-run:
if [ -e "$(NXF_NODEFILE)" -a -e "$(NXF_PIDFILE)" ]; then paste "$(NXF_NODEFILE)" "$(NXF_PIDFILE)" >> $(NXF_SUBMITLOG); fi ; \
echo "$${SLURMD_NODENAME}" > "$(NXF_NODEFILE)" && \
$(MAKE) run HOSTNAME="bigpurple" LOGID="$(TIMESTAMP)"
# issue an interrupt signal to a process running on a remote server
# e.g. Nextflow running in a SLURM job on a compute node
kill: PID=$(shell head -1 "$(NXF_PIDFILE)")
kill: REMOTE=$(shell head -1 "$(NXF_NODEFILE)")
kill: $(NXF_NODEFILE) $(NXF_PIDFILE)
ssh "$(REMOTE)" 'kill $(PID)'
# location of files for making the HapMap pool
HAPMAP_POOL_SHEET:=samples.hapmap.tsv
HAPMAP_POOL_PROFILE:=hapmap_pool
hapmap-pool: $(HAPMAP_POOL_SHEET)
./nextflow run hapmap-pool.nf \
-profile "$(HAPMAP_POOL_PROFILE)" \
$(RESUME)
# location of files for making the CNV pool
CNV_POOL_SHEET:=samples.cnv.tsv
CNV_POOL_PROFILE:=cnv_pool
cnv-pool: $(CNV_POOL_SHEET)
./nextflow run cnv-pool.nf \
-profile "$(CNV_POOL_PROFILE)" \
$(RESUME)
# save a record of the most recent Nextflow run completion
PRE:=
RECDIR:=recorded-runs/$(PRE)$(DIRNAME)_$(TIMESTAMP_str)
STDOUTLOGPATH:=
STDOUTLOG:=
ALL_LOGS:=
_RECORD:=
# task in the workdir e.g. '64/c111'
TASK:=
TASKDIR:=
ifneq ($(_RECORD),)
STDOUTLOGPATH:=$(shell ls -d -1t $(LOGDIR)/log.*.out | head -1 | python -c 'import sys, os; print(os.path.realpath(sys.stdin.readlines()[0].strip()))' )
STDOUTLOG:=$(shell basename "$(STDOUTLOGPATH)")
ALL_LOGS:=$(shell find "$(LOGDIR)" -type f -name '*$(STDOUTLOG)*')
ifneq ($(TASK),)
TASKDIR:=$(shell find "$(workDir)/" -mindepth 2 -maxdepth 2 -type d -path "*$(TASK)*" | head -1)
endif
endif
record:
$(MAKE) record-recurse _RECORD=1
record-recurse:
@echo ">>> Recording logs to $(RECDIR)" && \
mkdir -p "$(RECDIR)" && \
cp -a *.html trace.txt .nextflow.log main.nf nextflow.config config.json "$(RECDIR)/" ; \
for item in $(ALL_LOGS); do cp -a "$${item}" "$(RECDIR)/"; done ; \
if [ ! -z "$(TASKDIR)" -a -d "$(TASKDIR)" ]; then \
echo ">>> Copying task dir: $(TASKDIR)" && \
mkdir -p "$(RECDIR)/$(TASKDIR)" && \
cp -a "$(TASKDIR)" "$(RECDIR)/$$(dirname $(TASKDIR))" ; \
fi ; \
echo ">>> Copied execution reports and logs to: $(RECDIR)"
# backup the analysis output
BACKUP_DIR:=$(publishDir)-backup/$(TIMESTAMP_str)
backup:
if [ -d "$(publishDir)" ] ; then \
mkdir -p "$(BACKUP_DIR)" ; \
mv "$(publishDir)" "$(BACKUP_DIR)" ; \
fi
# fix group executable permissions
fix-perm:
find . -type f -name "*.py" -o -name "*.R" ! -path "*/work/*" ! -path "*/output/*" -exec chmod -v g+x {} \;
# fix permissions on this directory
# make all executables group executable
# make all dirs full group accessible
# make all files group read/write
fix-permissions:
@find . -type f -executable -exec chmod ug+X {} \;
@find . -type d -exec chmod ug+rwxs {} \;
@find . -type f -exec chmod ug+rw {} \;
USERGROUP:=molecpathlab
fix-group:
@find . ! -group "$(USERGROUP)" -exec chgrp "$(USERGROUP)" {} \;
OLD_UNPAIRED:=none
OLD_PAIRED:=none
NEW_UNPAIRED:=output/annotations.unpaired.tsv
NEW_PAIRED:=output/annotations.paired.tsv
COMPARE_DIR:=compare
COMPARE_OUTPUT:=$(COMPARE_DIR)/$(TIMESTAMP_str)
COMPARE_OUTPUT_DIR:=$(COMPARE_OUTPUT)/output
COMPARE_STDOUTLOG:=$(COMPARE_OUTPUT)/.nextflow.stdout.log
COMPARE_NXF_LOG:=$(COMPARE_OUTPUT)/.nextflow.log
COMPARE_WORKDIR:=$(COMPARE_OUTPUT)/work
COMPARE_TRACE:=$(COMPARE_OUTPUT)/trace.txt
COMPARE_TIMELINE:=$(COMPARE_OUTPUT)/timeline.html
COMPARE_REPORT:=$(COMPARE_OUTPUT)/nextflow.html
$(COMPARE_OUTPUT):
mkdir -p "$(COMPARE_OUTPUT)"
compare: install $(COMPARE_OUTPUT)
./nextflow \
-log "$(COMPARE_NXF_LOG)" \
run compare.nf \
-profile compare_bigpurple \
-with-report "$(COMPARE_REPORT)" \
-with-timeline "$(COMPARE_TIMELINE)" \
-work-dir "$(COMPARE_WORKDIR)" \
-with-trace "$(COMPARE_TRACE)" \
--outputDir "$(COMPARE_OUTPUT_DIR)" \
--old_unpaired_annotations "$(OLD_UNPAIRED)" \
--new_unpaired_annotations "$(NEW_UNPAIRED)" \
--old_paired_annotations "$(OLD_PAIRED)" \
--new_paired_annotations "$(NEW_PAIRED)" \
| tee "$(COMPARE_STDOUTLOG)"
.PHONY: compare
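# illustrative usage (annotation paths are placeholders):
# `make compare OLD_UNPAIRED=/path/to/old.unpaired.tsv OLD_PAIRED=/path/to/old.paired.tsv`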
# ~~~~~ CLEANUP ~~~~~ #
# commands to clean out items in the current directory after running the pipeline
clean-traces:
rm -f trace*.txt.*
clean-logs:
rm -f .nextflow.log.*
clean-reports:
rm -f *.html.*
clean-flowcharts:
rm -f *.dot.*
clean-output:
[ -d output ] && mv output oldoutput && rm -rf oldoutput &
clean-work:
[ -d work ] && mv work oldwork && rm -rf oldwork &
clean-old-stdout-logs:
find . -maxdepth 1 -mindepth 1 -type f -name "nextflow.*.stdout.log" | sort -r | tail -n +2 | xargs rm -f
clean-old-failed-logs:
find . -maxdepth 1 -mindepth 1 -type f -name "failed.*.tsv" | sort -r | tail -n +2 | xargs rm -f
clean-tmp:
rm -f nxf-tmp.*
# clean all files produced by previous pipeline runs
clean: clean-logs clean-traces clean-reports clean-flowcharts clean-old-stdout-logs clean-old-failed-logs clean-tmp
# clean all files produced by all pipeline runs
clean-all: clean clean-output clean-work
[ -d .nextflow ] && mv .nextflow .nextflowold && rm -rf .nextflowold &
rm -f .nextflow.log
rm -f *.png
rm -f trace*.txt*
rm -f *.html*
rm -f flowchart*.dot
rm -f nextflow.*.stdout.log
# compile flow chart
flowchart:
[ -f flowchart.dot ] && dot flowchart.dot -Tpng -o flowchart.png || echo "file flowchart.dot not present"
# ~~~~~ FINALIZE ~~~~~ #
# steps for finalizing the Nextflow pipeline output 'work' directory
# The work dir contains subdirectories of intermediate files created while executing the pipeline,
# along with command execution logs, etc.
# logs should be saved, along with a record of the contents of each subdir
# files in the 'publishDir' are often symlinked to items in the 'work' dir;
# these symlinks should be replaced with copies of the original file
# all files in the 'work' dir should be removed, except for the log files
#
# Makefile configured for parallel processing of files
#
# run with `make finalize -j8`
# remove extraneous work dirs
# resolve publishDir output symlinks
# write work 'ls' files
# create work dir file stubs
finalize:
$(MAKE) finalize-work-rm
$(MAKE) finalize-output
$(MAKE) finalize-work-ls
$(MAKE) finalize-work-stubs
## ~~~ convert all symlinks to their linked items ~~~ ##
# symlinks in the publishDir to convert to files
publishDirLinks:=
FIND_publishDirLinks:=
ifneq ($(FIND_publishDirLinks),)
publishDirLinks:=$(shell find $(publishDir)/ -type l)
endif
finalize-output:
@echo ">>> Converting symlinks in output dir '$(publishDir)' to their targets..."
$(MAKE) finalize-output-recurse FIND_publishDirLinks=1
finalize-output-recurse: $(publishDirLinks)
# convert all symlinks to their linked items
$(publishDirLinks):
@ { \
destination="$@"; \
sourcepath="$$(python -c 'import os; print(os.path.realpath("$@"))')" ; \
if [ ! -e "$${sourcepath}" ]; then echo "ERROR: Source does not exist: $${sourcepath}"; \
elif [ -f "$${sourcepath}" ]; then rsync -va "$$sourcepath" "$$destination" ; \
elif [ -d "$${sourcepath}" ]; then { \
timestamp="$$(date +%s)" ; \
tmpdir="$${destination}.$${timestamp}" ; \
rsync -va "$${sourcepath}/" "$${tmpdir}" && \
rm -f "$${destination}" && \
mv "$${tmpdir}" "$${destination}" ; } ; \
fi ; }
.PHONY: $(publishDirLinks)
## ~~~ write list of files in each subdir to file '.ls.txt' ~~~ ##
# subdirs in the 'work' dir
NXFWORKSUBDIRS:=
FIND_NXFWORKSUBDIRS:=
ifneq ($(FIND_NXFWORKSUBDIRS),)
NXFWORKSUBDIRS:=$(shell find "$(workDir)/" -maxdepth 2 -mindepth 2)
endif
# file to write 'ls' contents of 'work' subdirs to
LSFILE:=.ls.txt
finalize-work-ls:
@echo ">>> Writing list of directory contents for each subdir in Nextflow work directory '$(workDir)'..."
$(MAKE) finalize-work-ls-recurse FIND_NXFWORKSUBDIRS=1
finalize-work-ls-recurse: $(NXFWORKSUBDIRS)
# print the 'ls' contents of each subdir to a file
$(NXFWORKSUBDIRS):
@ls_file="$@/$(LSFILE)" ; \
echo ">>> Writing file list: $${ls_file}" ; \
ls -1 "$@" > "$${ls_file}"
.PHONY: $(NXFWORKSUBDIRS)
## ~~~ replace all files in 'work' dirs with empty file stubs ~~~ ##
NXFWORKFILES:=
FIND_NXFWORKFILES:=
# files in work subdirs to keep
LSFILEREGEX:=\.ls\.txt
NXFWORKFILES:='.command.begin|.command.err|.command.log|.command.out|.command.run|.command.sh|.command.stub|.command.trace|.exitcode|$(LSFILE)'
NXFWORKFILESREGEX:='.*\.command\.begin\|.*\.command\.err\|.*\.command\.log\|.*\.command\.out\|.*\.command\.run\|.*\.command\.sh\|.*\.command\.stub\|.*\.command\.trace\|.*\.exitcode\|.*$(LSFILEREGEX)'
ifneq ($(FIND_NXFWORKFILES),)
NXFWORKFILES:=$(shell find -P "$(workDir)/" -type f ! -regex $(NXFWORKFILESREGEX))
endif
finalize-work-stubs:
$(MAKE) finalize-work-stubs-recurse FIND_NXFWORKFILES=1
finalize-work-stubs-recurse: $(NXFWORKFILES)
$(NXFWORKFILES):
@printf '>>> Creating file stub: $@\n' && rm -f "$@" && touch "$@"
.PHONY: $(NXFWORKFILES)
## ~~~ remove 'work' subdirs that are not in the latest trace file (e.g. most previous run) ~~~ ##
# subdirs in the 'work' dir
TRACE_PATTERN_FILE:=.trace.hash.txt
NXFWORKSUBDIRSRM:=
FIND_NXFWORKSUBDIRSRM:=
# regex from the hashes of tasks in the tracefile to match against work subdirs
HASHPATTERN:=
ifneq ($(FIND_NXFWORKSUBDIRSRM),)
NXFWORKSUBDIRSRM:=$(shell find "$(workDir)/" -maxdepth 2 -mindepth 2 )
# HASHPATTERN:=$(shell python -c 'import csv; reader = csv.DictReader(open("$(TRACEFILE)"), delimiter = "\t"); print("|".join([row["hash"] for row in reader]))')
endif
finalize-work-rm:
@echo ">>> Removing subdirs in Nextflow work directory '$(workDir)' which are not included in Nextflow trace file '$(TRACEFILE)'..."
$(MAKE) $(TRACE_PATTERN_FILE) finalize-work-rm-recurse FIND_NXFWORKSUBDIRSRM=1
# need to write out hashes to file because it gets too long to use as CLI arg
$(TRACE_PATTERN_FILE):
@echo ">>> Making trace hash file: $(TRACE_PATTERN_FILE)"
@python -c 'import csv; \
reader = csv.DictReader(open("$(TRACEFILE)"), delimiter = "\t"); \
fout = open("$(TRACE_PATTERN_FILE)", "w"); \
fout.write("\n".join([row["hash"] for row in reader])) ; fout.close(); \
'
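# e.g. for a trace.txt 'hash' entry like '64/c1112a', .trace.hash.txt gets one line per hash;
# any work/ subdir whose first 9 hash characters match a line survives the rm pass below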
finalize-work-rm-recurse: $(NXFWORKSUBDIRSRM)
# if the first 9 characters of the work subdir's hash are not in the trace file 'hash' column,
# delete it, since that means it was not part of the most recent pipeline run
$(NXFWORKSUBDIRSRM):
@pattern="$$(echo $(@) | sed -e 's|$(workDir)/||g' | cut -c 1-9)" ; \
if [ ! "$$( grep -q $${pattern} '$(TRACE_PATTERN_FILE)'; echo $$?)" -eq 0 ]; then \
echo ">>> Removing subdir: $@" ; \
rm -rf "$@" ; \
fi
# remove the subdir if its not listed in the trace hashes
# $(NXFWORKSUBDIRSRM):
# @if [ ! "$$(echo '$@' | grep -q -E "$(HASHPATTERN)"; echo $$? )" -eq 0 ]; then \
# echo ">>> Removing subdir: $@" ; \
# rm -rf "$@" ; \
# fi
.PHONY: $(NXFWORKSUBDIRSRM) $(TRACE_PATTERN_FILE)