-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBED12TranslateGenomePos2Tx.sh
112 lines (105 loc) · 4.71 KB
/
BED12TranslateGenomePos2Tx.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env bash
#BED12TranslateGenomePos2Tx.sh
#This is a translator of gemomic locations to transcript positions, based on the BED12 annotation
#The column 4 (names) in the Genes.bed12 must be unique, and match the names in GenomePos.bed4
#The output will be 0-based Tx locations, containing only three columns
#Both of the two input files are 0-based
#Version: Yu H. Sun, 2018-06-10
#Version: Yu H. Sun, 2018-06-12, fix intron location bug (if the genomic pos is in intron), output error
array=( "$@" ) #read all command line variable to "array"
if [ ! -n "$6" ]
then
echo "*********************************************************************************************************"
echo "* Welcome to use BED12TranslateGenomePos2Tx.sh ! *"
echo "*This is a translator of gemomic locations to transcript positions, based on the BED12 annotation *"
echo "*Usage: `basename $0` -i [Genes.bed12] -t [GenomePos.bed4] -o [Tx.bed3|TxName start end]*"
echo "* GenomePos.bed4: Only require the first four columns : chr start end TxName. BED12 input also works *"
echo "*The column 4 (names) in the Genes.bed12 must be unique, and match the names in GenomePos.bed4 *"
echo "*Input and output files are 0-based, and please try to avoid white characters in names *"
echo "*Records in GenomePos.bed4 allows duplications of column 4 *"
echo "*For each record in the GenomePos.bed4, it will generate a corresponding bed12 record in the Tx.bed3 *"
echo "*The output will be 0-based Tx locations, containing only three columns *"
echo "*Example of back-calculating test: gene.cds.bed12 should be same as gene.cds.bed12.validate *"
echo "* BED12Extractor.sh -a cds -i gene.bed12 -o gene.cds.bed12 *"
echo "* BED12TranslateGenomePos2Tx.sh -i gene.bed12 -t gene.cds.bed12 -o gene.cds.tx *"
echo "* BED12TranslateTx2GenomePos.sh -i gene.bed12 -t gene.cds.tx -o gene.cds.bed12.validate *"
echo "*********************************************************************************************************"
else
echo "Starting BED12TranslateGenomePos2Tx.sh pipeline"
echo "1. Getting parameters"
for arg in "$@"
do
if [[ $arg == "-i" ]]
then
Anno=${array[$counter+1]}
echo ' Getting annotation: '$Anno
elif [[ $arg == "-t" ]]
then
Data=${array[$counter+1]}
echo ' Getting GenomePos bed: '$Data
elif [[ $arg == "-o" ]]
then
Output=${array[$counter+1]}
echo ' Getting output name: '$Output
fi
let counter=$counter+1
done
echo "2. Translating positions"
rm -rf $Output
RecordNumber=`wc -l $Data|awk '{print $1}'`
for i in $(eval echo {1..$RecordNumber})
do
#echo $i
Tx=`sed -n ${i}p $Data`
chr=`echo $Tx | awk '{print $1}'`
Name=`echo $Tx | awk '{print $4}'`
GenoStart=`echo $Tx | awk '{print $2}'`
GenoEnd=`echo $Tx | awk '{print $3}'`
CurrAnno=`awk -v name=$Name -v chr=$chr '{if ($4==name && $1==chr) print $0}' $Anno`
#echo $Tx
#echo $CurrAnno
#echo $GenoStart,$GenoEnd
echo $CurrAnno | awk -v GenoStart=$GenoStart -v GenoEnd=$GenoEnd '{OFS="\t";split($11,blockSizes,","); split($12,blockStarts,","); blockCount=$10;
error=0;start=0;end=0;everstart=0;everend=0;
if (GenoStart >= GenoEnd || GenoStart<$2 || GenoEnd>$3 ) {error=1}
for (i=1;i<=$10;i++) {FullLength=FullLength+blockSizes[i]}
if ($6=="+") {
blocksum[0]=0;
for (i=1;i<=$10;i++) {
blocksum[i]=blocksum[i-1]+blockSizes[i];
if (GenoStart>=$2+blockStarts[i] && GenoStart <=$2+blockStarts[i]+blockSizes[i]) {
start=blocksum[i-1]+GenoStart-blockStarts[i]-$2;
everstart=1;
}
if (GenoEnd>=$2+blockStarts[i] && GenoEnd <=$2+blockStarts[i]+blockSizes[i]) {
end=blocksum[i-1]+GenoEnd-blockStarts[i]-$2;
everend=1;
}
}
}
if ($6=="-") {
blocksum[0]=0;
for (i=1;i<=$10;i++) {
blocksum[i]=blocksum[i-1]+blockSizes[$10-i+1];
if (GenoStart>=$2+blockStarts[$10-i+1] && GenoStart <=$2+blockStarts[$10-i+1]+blockSizes[$10-i+1]) {
end=blocksum[i-1]+blockStarts[$10-i+1]+blockSizes[$10-i+1]-(GenoStart-$2);
everend=1;
}
if (GenoEnd>=$2+blockStarts[$10-i+1] && GenoEnd <=$2+blockStarts[$10-i+1]+blockSizes[$10-i+1]) {
start=blocksum[i-1]+blockStarts[$10-i+1]+blockSizes[$10-i+1]-(GenoEnd-$2);
everstart=1;
}
}
}
if (everstart+everend<2) {error=1}
if (error==0) {
print $4,start,end
}else print "Out range error: "$4,GenoStart,GenoEnd | "cat 1>&2" }' >> $Output
done
if [ -s $Output ]
then
echo "3. Done"
else
echo "Error occurred!"
fi
fi