-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmakefile
148 lines (116 loc) · 6.69 KB
/
makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# make -n fastqc
# make -n sorted_bam
.SECONDEXPANSION:
.SECONDARY:
.DELETE_ON_ERROR:
### OPTIONS
SECOND_PASS = 0
### SYSTEM TOOLS
LN = /bin/ln
MV = /bin/mv
CAT = /bin/cat
MKDIR = /bin/mkdir
ECHO = /bin/echo
CP = /bin/cp
CD = cd
### USER TOOLS
STAR = /athena/abc/scratch/paz2005/bin/src/STAR-2.6.0c/bin/Linux_x86_64_static/STAR #2.6.0c
TRIM_GALORE = /athena/abc/scratch/paz2005/bin/src/TrimGalore-0.5.0/trim_galore # v 0.5.0
SAMTOOLS = /athena/abc/scratch/paz2005/bin/src/samtools-1.8/samtools #v 1.8
FEATURE_COUNTS = /athena/abc/scratch/paz2005/bin/src/subread-1.6.2-Linux-x86_64/bin/featureCounts #v1.6.2
JAVA = /athena/abc/scratch/paz2005/bin/src/subread-1.6.2-Linux-x86_64/bin/jdk1.8.0_171/bin/java # v 1.8.0_171
FASTQC = /athena/abc/scratch/paz2005/bin/src/FastQC/fastqc # v0.11.7
QORTS = /athena/abc/scratch/paz2005/bin/src/QoRTs/QoRTs.jar # v1.3.0
RSCRIPT = /home/paz2005/miniconda3/bin/Rscript #v 3.4.3
R = /home/paz2005/miniconda3/bin/R #v 3.4.3
### REFERENCES
REFERENCE = /athena/abc/scratch/paz2005/references/igenomes/Mus_musculus/UCSC/mm9/Sequence/STARindex
REFERENCE_FA = /athena/abc/scratch/paz2005/references/igenomes/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa
ANNOTATION = /athena/abc/scratch/paz2005/references/igenomes/Mus_musculus/UCSC/mm9/Annotation/Genes/genes.gtf
### PARAMETERS
READ_LENGTH =
STAR_OPTIONS = --runThreadN 1 \
--runMode alignReads \
--genomeDir $(REFERENCE) \
--readFilesCommand zcat \
--outSAMstrandField intronMotif \
--outFilterIntronMotifs RemoveNoncanonicalUnannotated \
--outFilterType BySJout \
--outReadsUnmapped None \
--outSAMtype BAM Unsorted \
--chimOutType SeparateSAMold \
--limitSjdbInsertNsj 10000000
lc = $(subst A,a,$(subst B,b,$(subst C,c,$(subst D,d,$(subst E,e,$(subst F,f,$(subst G,g,$(subst H,h,$(subst I,i,$(subst J,j,$(subst K,k,$(subst L,l,$(subst M,m,$(subst N,n,$(subst O,o,$(subst P,p,$(subst Q,q,$(subst R,r,$(subst S,s,$(subst T,t,$(subst U,u,$(subst V,v,$(subst W,w,$(subst X,x,$(subst Y,y,$(subst Z,z,$1))))))))))))))))))))))))))
### DO NOT EDIT BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
FASTQFILES := $(wildcard *_R1*.fastq.gz)
TRIMMED_FASTQFILES := $(patsubst %.fastq.gz,%_val_1.fastq.gz,$(wildcard *_R1_???.fastq.gz))
SAMPLES := $(sort $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a))))
FLOWCELLS := $(sort $(join $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 2, $(subst _, ,$a))))))
BARCODES := $(sort $(join $(join $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 2, $(subst _, ,$a))))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 3, $(subst _, ,$a))))))
LANES := $(sort $(join $(join $(join $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 2, $(subst _, ,$a))))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 3, $(subst _, ,$a))))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 4, $(subst _, ,$a))))))
FASTQFILES_MM := $(shell find *_R1*.fastq.gz | grep mouse)
SAMPLES_MM := $(sort $(foreach a,$(FASTQFILES_MM),$(firstword $(subst _, ,$a))))
all: fastqc merged_bam gene_counts qc_data
fastqc: $(patsubst %.fastq.gz,%_fastqc.zip,$(wildcard *_R1_???.fastq.gz))
trim: $(patsubst %.fastq.gz,%_val_1.fastq.gz,$(wildcard *_R1_???.fastq.gz))
merge_fastq_lanes: $(addsuffix .fastq.gz, $(LANES))
first_pass: $(FASTQFILES:.fastq.gz=.1p.Aligned.out.bam) $(FASTQFILES:.fastq.gz=.1p.SJ.out.tab)
second_pass: $(FASTQFILES:.fastq.gz=.2p.Aligned.out.bam) $(FASTQFILES:.fastq.gz=.2p.SJ.out.tab)
sorted_bam: $(FASTQFILES:.fastq.gz=.sorted.bam)
merged_bam: $(addsuffix .merged.bam, $(SAMPLES))
gene_counts: $(addsuffix .gene.counts, $(SAMPLES)) # gene.counts.txt
qc_data: $(addprefix ./qc/, $(addsuffix /QC.summary.txt, $(SAMPLES)))
### MERGE FASTQ FILES BY SAMPLE NAME
find-fastq-files = $(sort $(filter $1_% , $(FASTQFILES)))
define merge-fastq-files
$1.fastq.gz: $(call find-fastq-files,$1)
$$(if $$(findstring 1, $$(words $$^)),$$(LN) -fs $$^ $$@,$(CAT) $$^ >> $$@)
endef
$(foreach s,$(LANES),$(eval $(call merge-fastq-files,$s)))
### FASTQC
%_fastqc.zip: %.fastq.gz $$(subst R1,R2,%.fastq.gz)
$(FASTQC) $^
%_fastqc.zip: %.fastq.gz
$(FASTQC) $<
### FIRST PASS ALIGNMENT
%.1p.Aligned.out.bam %.1p.SJ.out.tab: %.fastq.gz $$(subst R1,R2,%.fastq.gz)
$(STAR) --genomeLoad LoadAndKeep --readFilesIn $^ --outFileNamePrefix $*.1p. $(STAR_OPTIONS)
%.1p.Aligned.out.bam %.1p.SJ.out.tab: %.fastq.gz
$(STAR) --genomeLoad LoadAndKeep --readFilesIn $< --outFileNamePrefix $*.1p. $(STAR_OPTIONS)
### SECOND PASS ALIGNMENT
%.2p.Aligned.out.bam %.2p.SJ.out.tab: %.fastq.gz $$(subst R1,R2,%.fastq.gz) $(FASTQFILES:.fastq.gz=.1p.SJ.out.tab)
$(STAR) --genomeLoad NoSharedMemory --readFilesIn $< $(word 2,$^) --outFileNamePrefix $*.2p. --sjdbFileChrStartEnd $(TRIMMED_FASTQFILES:.fastq.gz=.1p.SJ.out.tab) $(STAR_OPTIONS)
%.2p.Aligned.out.bam %.2p.SJ.out.tab: %.fastq.gz
$(STAR) --genomeLoad NoSharedMemory --readFilesIn $< --outFileNamePrefix $*.2p. --sjdbFileChrStartEnd $(TRIMMED_FASTQFILES:.fastq.gz=.1p.SJ.out.tab) $(STAR_OPTIONS)
### SORT BAM FILES
ifdef SECOND_PASS
ifeq ($(SECOND_PASS), 1)
%.sorted.bam: %.2p.Aligned.out.bam
$(SAMTOOLS) sort -n -O bam -T $<.tmp $< > $@
else
%.sorted.bam: %.1p.Aligned.out.bam
$(SAMTOOLS) sort -n -O bam -T $<.tmp $< > $@
endif
endif
### MERGE BAM FILES BY SAMPLE NAME
find-bam-files = $(sort $(filter $1_% , $(FASTQFILES:.fastq.gz=.sorted.bam)))
define merge-bam-files
$1.merged.bam: $(call find-bam-files,$1)
$$(if $$(filter 1, $$(words $$^)),$$(LN) -fs $$^ $$@,$(SAMTOOLS) merge -f $$@ $$^)
endef
$(foreach s,$(SAMPLES),$(eval $(call merge-bam-files,$s)))
### COORD SORT
%.coordSorted.bam: %.merged.bam
$(SAMTOOLS) sort -O bam -T $<.tmp $< > $@ ; $(SAMTOOLS) index $@
### COUNT READS PER GENE
%.gene.counts : %.merged.bam
file_arr=(./$**R2*.fastq.gz) && if [ -f $${file_arr[0]} ] ; then $(FEATURE_COUNTS) -s 2 -p -g "gene_name" -a $(ANNOTATION) -o $@ $< ; else $(FEATURE_COUNTS) -g "gene_name" -a $(ANNOTATION) -o $@ $< ; fi
### COUNT READS PER EXON
%.exon.counts : %.merged.bam
file_arr=(./$**R2*.fastq.gz) && if [ -f $${file_arr[0]} ] ; then $(FEATURE_COUNTS) -p -O -f -g "gene_name" -a $(ANNOTATION) -o $@ $< ; else $(FEATURE_COUNTS) -O -f -g "gene_name" -a $(ANNOTATION) -o $@ $< ; fi
### MERGE GENE COUNTS
gene.counts.txt: $(addsuffix .gene.counts, $(SAMPLES))
$(RSCRIPT) $(MERGE_GENE_COUNTS) $@
### QC
./qc/%/QC.summary.txt: %.merged.bam
file_arr=(./$*_*R2*.fastq.gz) && $(MKDIR) -p ./qc/$* && raw=$$(echo $$(zcat $*_*R1*.fastq.gz | wc -l)/4 | bc) && if [ -f $${file_arr[0]} ] ; then $(JAVA) -Xmx120G -jar $(QORTS) QC --seqReadCt $$(echo $$raw) $< $(ANNOTATION) ./qc/$* ; else $(JAVA) -Xmx120G -jar $(QORTS) QC --singleEnded --seqReadCt $$(echo $$raw) $< $(ANNOTATION) ./qc/$* ; fi