-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprepare.sh
121 lines (78 loc) · 4.36 KB
/
prepare.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash
# bash commands to prepare, clean, and bin the data
####################
# download and format references
# uncompress BED files
gunzip *.bed.gz
# genome FASTA
wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz
tar -xvzf chromFa.tar.gz
cat chr*.fa > hg19.fa
rm chr*.fa
# chrom sizes
wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes
# mappability or uniqueness of reference genome from ENCODE
wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign50mer.bigWig
wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeCrgMapabilityAlign100mer.bigWig
# download GRC issues (GRCh37.p13_issues.gff3)
wget ftp://ftp.ncbi.nlm.nih.gov/pub/grc/human/GRC/Issue_Mapping/GRCh37.p13_issues.gff3
# convert GRC issues to BED
cat GRCh37.p13_issues.gff3 | grep -v "^#" | grep -v "chr=Un" \
| awk -F $'\t' 'BEGIN {OFS=FS} {print $9,$4-1,$5}' \
| perl -pe 's/Name.*chr=(.*?);.*?\t/chr\1\t/' | LC_ALL=C sort -k1,1 -k2,2n \
> GRC_issues.bed
# create a merged GRC issues BED
bedtools merge -i GRC_issues.bed > GRC_issues.merged.bed
# download ENCODE DAC Blacklisted Regions
wget ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDacMapabilityConsensusExcludable.bed.gz
# convert ENCODE DAC Blacklisted Regions to BED
gunzip -c wgEncodeDacMapabilityConsensusExcludable.bed.gz \
| cut -f 1-4 | LC_ALL=C sort -k1,1 -k2,2n \
> ENCODE_DAC_blacklisted.bed
# create a merged ENCODE DAC Blacklisted Regions BED
bedtools merge -i ENCODE_DAC_blacklisted.bed > ENCODE_DAC_blacklisted.merged.bed
# combined GRC and ENCODE issues
cat GRC_issues.bed ENCODE_DAC_blacklisted.bed | cut -f 1-3 | LC_ALL=C sort -k1,1 -k2,2n | bedtools merge > issues.bed
####################
# size for grouping events into windows/bins
bin_size="5000"
bin_bed="hg19.bin.${bin_size}.bed"
# divide genome to $bin_size windows and add interval-style regions name
bedtools makewindows -g hg19.chrom.sizes -w "$bin_size" | grep -v "_" \
| LC_ALL=C sort -k1,1 -k2,2n | awk -F $'\t' 'BEGIN {OFS=FS} {print $0,$1":"$2"-"$3}' \
> "$bin_bed"
####################
# summarize events by bin
# summary files for hg19 5kb bins are about 20 MB each
# bedtools coverage outputs all original BED file columns (4 columns here) and adds:
# 1) The number of features in B that overlapped the A interval.
# 2) The number of bases in A that had non-zero coverage.
# 3) The length of the entry in A.
# 4) The fraction of bases in A that had non-zero coverage.
# GRC known issues (fraction of window with any issues)
echo -e "#BIN\tGRC_issues" > summary.GRC_issues.${bin_size}.txt
bedtools coverage -a "$bin_bed" -b GRC_issues.bed | cut -f 4,8 >> summary.GRC_issues.${bin_size}.txt
# ENCODE DAC Blacklisted Regions (fraction of window with any issues)
echo -e "#BIN\tENCODE_DAC_blacklisted" > summary.ENCODE_DAC_blacklisted.${bin_size}.txt
bedtools coverage -a "$bin_bed" -b ENCODE_DAC_blacklisted.bed | cut -f 4,8 >> summary.ENCODE_DAC_blacklisted.${bin_size}.txt
# GIAB break points
echo -e "#BIN\tevents_GIAB" > summary.GIAB.${bin_size}.txt
bedtools coverage -a "$bin_bed" -b sv.giab.bed | cut -f 4,5 >> summary.GIAB.${bin_size}.txt
# 1KG break points
echo -e "#BIN\tevents_1KG" > summary.1KG.${bin_size}.txt
bedtools coverage -a "$bin_bed" -b sv.1kg.bed | cut -f 4,5 >> summary.1KG.${bin_size}.txt
# average mappability per bin (using bigWigAverageOverBed from UCSC)
bigWigAverageOverBed wgEncodeCrgMapabilityAlign50mer.bigWig $bin_bed wgEncodeCrgMapabilityAlign50mer.${bin_size}.txt
bigWigAverageOverBed wgEncodeCrgMapabilityAlign100mer.bigWig $bin_bed wgEncodeCrgMapabilityAlign100mer.${bin_size}.txt
# average 50bp mappability per bin in comparable format
echo -e "#BIN\tMapability50mer" > summary.map50mer.${bin_size}.txt
cat wgEncodeCrgMapabilityAlign50mer.${bin_size}.txt | cut -f 1,5 \
| tr ':' '\t' | tr '-' '\t' | LC_ALL=C sort -k1,1 -k2,2n \
| awk -F $'\t' 'BEGIN {OFS=FS} {print $1":"$2"-"$3,$4}' >> summary.map50mer.${bin_size}.txt
# average 100bp mappability per bin in comparable format
echo -e "#BIN\tMapability100mer" > summary.map100mer.${bin_size}.txt
cat wgEncodeCrgMapabilityAlign100mer.${bin_size}.txt | cut -f 1,5 \
| tr ':' '\t' | tr '-' '\t' | LC_ALL=C sort -k1,1 -k2,2n \
| awk -F $'\t' 'BEGIN {OFS=FS} {print $1":"$2"-"$3,$4}' >> summary.map100mer.${bin_size}.txt
####################
# end