-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
137 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
## email: paz2005@med.cornell.edu | ||
## note, the makefile searches for files ending in *_R1_*.fastq.gz | ||
## it then separates the file names by underscore and uses the first position as sample name | ||
## for example, in a fastq file named, "9_S9_R1_001.fastq.gz", "9" is the sample name. if there is additional important information in the fastq file name, then care must be take to ensure all that information is within the first position after splitting by underscore. | ||
## for example, if we had a fastq file named "9_Rep1_S9_R1_001.fastq.gz", then we should rename it to either "9Rep1_S9_R1_001.fastq.gz" or "9-Rep1_S9_R1_001.fastq.gz" | ||
|
||
.SECONDEXPANSION: | ||
.SECONDARY: | ||
.DELETE_ON_ERROR: | ||
|
||
### SYSTEM TOOLS | ||
LN = /bin/ln | ||
MV = /bin/mv | ||
CAT = /bin/cat | ||
MKDIR = /bin/mkdir | ||
ECHO = /bin/echo | ||
CP = /bin/cp | ||
CD = cd | ||
|
||
### USER TOOLS | ||
STAR = /athena/abc/scratch/paz2005/bin/src/STAR-2.6.0c/bin/Linux_x86_64_static/STAR #2.6.0c | ||
TRIM_GALORE = /athena/abc/scratch/paz2005/bin/src/TrimGalore-0.5.0/trim_galore # v 0.5.0 | ||
SAMTOOLS = /athena/abc/scratch/paz2005/bin/src/samtools-1.8/samtools #v 1.8 | ||
FEATURE_COUNTS = /athena/abc/scratch/paz2005/bin/src/subread-1.6.2-Linux-x86_64/bin/featureCounts #v1.6.2 | ||
JAVA = /athena/abc/scratch/paz2005/bin/src/subread-1.6.2-Linux-x86_64/bin/jdk1.8.0_171/bin/java # v 1.8.0_171 | ||
FASTQC = /athena/abc/scratch/paz2005/bin/src/FastQC/fastqc # v0.11.7 | ||
QORTS = /athena/abc/scratch/paz2005/bin/src/QoRTs/QoRTs.jar # v1.3.0 | ||
RSCRIPT = /home/paz2005/miniconda3/bin/Rscript #v 3.4.3 | ||
R = /home/paz2005/miniconda3/bin/R #v 3.4.3 | ||
SALMON = /athena/abc/scratch/paz2005/miniconda3/envs/salmon/bin/salmon #1.2.1 | ||
|
||
### RSCRIPTS | ||
MERGE_GENE_COUNTS = /scratchLocal/paz2005/rna_scripts/mergeGeneCounts.R | ||
|
||
### REFERENCES | ||
REFERENCE = /athena/abc/scratch/paz2005/references/GRCh38.p12 | ||
REFERENCE_FA = /athena/abc/scratch/paz2005/references/GRCh38.p12/GRCh38.primary_assembly.genome.fa | ||
ANNOTATION = /athena/abc/scratch/paz2005/references/GRCh38.p12/gencode.v28.annotation.gtf | ||
|
||
### PARAMETERS | ||
READ_LENGTH = | ||
STAR_OPTIONS = --runThreadN 1 \ | ||
--runMode alignReads \ | ||
--genomeDir $(REFERENCE) \ | ||
--readFilesCommand zcat \ | ||
--outSAMstrandField intronMotif \ | ||
--outFilterIntronMotifs RemoveNoncanonicalUnannotated \ | ||
--outFilterType BySJout \ | ||
--outReadsUnmapped None \ | ||
--outSAMtype BAM Unsorted \ | ||
--chimOutType SeparateSAMold \ | ||
--limitSjdbInsertNsj 10000000 | ||
|
||
lc = $(subst A,a,$(subst B,b,$(subst C,c,$(subst D,d,$(subst E,e,$(subst F,f,$(subst G,g,$(subst H,h,$(subst I,i,$(subst J,j,$(subst K,k,$(subst L,l,$(subst M,m,$(subst N,n,$(subst O,o,$(subst P,p,$(subst Q,q,$(subst R,r,$(subst S,s,$(subst T,t,$(subst U,u,$(subst V,v,$(subst W,w,$(subst X,x,$(subst Y,y,$(subst Z,z,$1)))))))))))))))))))))))))) | ||
|
||
|
||
### DO NOT EDIT BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING | ||
FASTQFILES := $(wildcard *_R1_*.fastq.gz) | ||
SAMPLES := $(sort $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a)))) | ||
FLOWCELLS := $(sort $(join $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 2, $(subst _, ,$a)))))) | ||
BARCODES := $(sort $(join $(join $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 2, $(subst _, ,$a))))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 3, $(subst _, ,$a)))))) | ||
LANES := $(sort $(join $(join $(join $(foreach a,$(FASTQFILES),$(firstword $(subst _, ,$a))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 2, $(subst _, ,$a))))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 3, $(subst _, ,$a))))), $(addprefix _, $(foreach a,$(FASTQFILES),$(word 4, $(subst _, ,$a)))))) | ||
FASTQFILES_MM := $(shell find *_R1*.fastq.gz | grep mouse) | ||
SAMPLES_MM := $(sort $(foreach a,$(FASTQFILES_MM),$(firstword $(subst _, ,$a)))) | ||
|
||
|
||
all: gene_counts gene.counts.txt qc_data | ||
fastqc: $(patsubst %.fastq.gz,%_fastqc.zip,$(wildcard *_R1_???.fastq.gz)) | ||
trim: $(patsubst %.fastq.gz,%_val_1.fastq.gz,$(wildcard *_R1_???.fastq.gz)) | ||
merge_fastq_lanes: $(addsuffix .fastq.gz, $(LANES)) | ||
first_pass: $(FASTQFILES:.fastq.gz=.1p.Aligned.out.bam) $(FASTQFILES:.fastq.gz=.1p.SJ.out.tab) | ||
second_pass: $(FASTQFILES:.fastq.gz=.2p.Aligned.out.bam) $(FASTQFILES:.fastq.gz=.2p.SJ.out.tab) | ||
sorted_bam: $(FASTQFILES:.fastq.gz=.sorted.bam) | ||
merged_bam: $(addsuffix .merged.bam, $(SAMPLES)) | ||
gene_counts: $(addsuffix .gene.counts, $(SAMPLES)) | ||
qc_data: $(addprefix ./qc/, $(addsuffix /QC.summary.txt, $(SAMPLES))) | ||
|
||
### MERGE FASTQ FILES BY SAMPLE NAME | ||
find-fastq-files = $(sort $(filter $1_% , $(FASTQFILES))) | ||
|
||
define merge-fastq-files | ||
$1.fastq.gz: $(call find-fastq-files,$1) | ||
$$(if $$(findstring 1, $$(words $$^)),$$(LN) -fs $$^ $$@,$(CAT) $$^ >> $$@) | ||
endef | ||
|
||
$(foreach s,$(LANES),$(eval $(call merge-fastq-files,$s))) | ||
|
||
|
||
### FASTQC | ||
%_fastqc.zip: %.fastq.gz $$(subst R1,R2,%.fastq.gz) | ||
$(FASTQC) $^ | ||
|
||
%_fastqc.zip: %.fastq.gz | ||
$(FASTQC) $< | ||
|
||
### FIRST PASS ALIGNMENT | ||
%.1p.Aligned.out.bam %.1p.SJ.out.tab: %.fastq.gz $$(subst R1,R2,%.fastq.gz) | ||
$(STAR) --genomeLoad LoadAndKeep --readFilesIn $^ --outFileNamePrefix $*.1p. $(STAR_OPTIONS) | ||
|
||
%.1p.Aligned.out.bam %.1p.SJ.out.tab: %.fastq.gz | ||
$(STAR) --genomeLoad LoadAndKeep --readFilesIn $< --outFileNamePrefix $*.1p. $(STAR_OPTIONS) | ||
|
||
### SORT BAM FILES | ||
ifdef SECOND_PASS | ||
ifeq ($(SECOND_PASS), 1) | ||
%.sorted.bam: %.2p.Aligned.out.bam | ||
$(SAMTOOLS) sort -n -O bam -T $<.tmp $< > $@ | ||
else | ||
%.sorted.bam: %.1p.Aligned.out.bam | ||
$(SAMTOOLS) sort -n -O bam -T $<.tmp $< > $@ | ||
endif | ||
endif | ||
|
||
### MERGE BAM FILES BY SAMPLE NAME | ||
find-bam-files = $(sort $(filter $1_% , $(FASTQFILES:.fastq.gz=.sorted.bam))) | ||
|
||
define merge-bam-files | ||
$1.merged.bam: $(call find-bam-files,$1) | ||
$$(if $$(filter 1, $$(words $$^)),$$(LN) -fs $$^ $$@,$(SAMTOOLS) merge -f $$@ $$^) | ||
endef | ||
|
||
$(foreach s,$(SAMPLES),$(eval $(call merge-bam-files,$s))) | ||
|
||
### COUNT READS PER GENE | ||
%.gene.counts : %.merged.bam | ||
file_arr=(./$**R2*.fastq.gz) && if [ -f $${file_arr[0]} ] ; then $(FEATURE_COUNTS) -p -g "gene_name" -a $(ANNOTATION) -o $@ $< ; else $(FEATURE_COUNTS) -g "gene_name" -a $(ANNOTATION) -o $@ $< ; fi | ||
|
||
exon.counts.txt: $(addsuffix .merged.bam, $(SAMPLES)) | ||
file_arr=(./$**R2*.fastq.gz) && if [ -f $${file_arr[0]} ] ; then $(FEATURE_COUNTS) -T 6 -p -O -f -g "gene_id" -a $(ANNOTATION_FLAT) -o $@ $^ ; else $(FEATURE_COUNTS) -T 6 -O -f -g "gene_id" -a $(ANNOTATION_FLAT) -o $@ $^ ; fi | ||
|
||
### MERGE GENE COUNTS | ||
gene.counts.txt: $(addsuffix .gene.counts, $(SAMPLES)) | ||
$(RSCRIPT) $(MERGE_GENE_COUNTS) $@ | ||
|
||
### QC | ||
./qc/%/QC.summary.txt: %.merged.bam | ||
file_arr=(./$*_*R2*.fastq.gz) && $(MKDIR) -p ./qc/$* && raw=$$(echo $$(zcat $*_*R1*.fastq.gz | wc -l)/4 | bc) && if [ -f $${file_arr[0]} ] ; then $(JAVA) -Xmx160G -jar $(QORTS) QC --seqReadCt $$(echo $$raw) $< $(ANNOTATION) ./qc/$* ; else $(JAVA) -Xmx160G -jar $(QORTS) QC --singleEnded --seqReadCt $$(echo $$raw) $< $(ANNOTATION) ./qc/$* ; fi |