-
Notifications
You must be signed in to change notification settings - Fork 0
/
index
148 lines (132 loc) · 3.76 KB
/
index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env bash
helpstr="INDEX - Extract a subsequence from a FASTA file
Usage: index [OPTIONS] FILE START END_OR_LENGTH
Positional arguments:
1: Input FASTA file
2: Subsequence start position
3: Subsequence end position / length (interpreted as end
position unless -l flag set)
In end (default) mode, a value of \"len\" for argument 3 indicates
to subset from the start position to the end of the sequence.
Options:
-l: Run INDEX in length mode (interpret 3rd positional argument as
length instead of end position)
-x: Return only index sequence, without file header
-s: Sequence in file to index (1)
-n: Subsequence name (at start of header line) in output
(sequence)
-h: Print this help message and exit
By default, the sequence is printed with a FASTA-formatted header line
with the structure
>NAME|SOURCE_FILE:SEQUENCE_NUMBER|CO-ORDINATES|LENGTH
Running INDEX with the -x flag will supress this header line."
unset len seq name seqfile strt end_or_len nohead
while [ $OPTIND -le $# ]; do
# Optional arguments
while getopts "lxs:n:h" opt; do
case "$opt" in
l)
length_mode=true
;;
s)
seq=$OPTARG
;;
n)
name=$OPTARG
;;
x)
nohead="true"
;;
# Else:
h)
echo "$helpstr"
exit
;;
*)
echo "Invalid argument:" $opt >&2
echo "$helpstr"
exit 1
;;
esac
done
# Positional arguments:
if [[ -z "$seqfile" ]]; then
seqfile=${@:$OPTIND:1}
OPTIND=$(expr $OPTIND + 1)
elif [[ -z "$strt" ]]; then
strt=${@:$OPTIND:1}
OPTIND=$(expr $OPTIND + 1)
elif [[ -z "$end_or_len" ]]; then
end_or_len=${@:$OPTIND:1}
OPTIND=$(expr $OPTIND + 1)
fi
done
shift $((OPTIND-1))
# Errors if positional arguments missing:
if [ -z $seqfile ]; then
>&2 echo "Error: Sequence file required."
exit 1
fi
if [ -z $strt ]; then
>&2 echo "Error: start position required."
exit 1
fi
if [ -z $end_or_len ]; then
>&2 echo "Error: end position or subsequence length required."
exit 1
fi
# Defaults for optional arguments:
: ${seq:=1}
: ${name:=sequence}
##########
# SCRIPT #
##########
# Get start and end lines of sequence to be cut
scaffolds=$(grep -n ">" $seqfile)
nscaf=$(echo "$scaffolds" | wc -l)
seqstart=$(echo "$scaffolds" | sed -ne "${seq}p" | cut -d':' -f1)
seqstart=$((seqstart + 1))
if [[ $nscaf -lt $seq ]]; then
>&2 echo "Error: not enough sequences in file."
exit 1
fi
if [ $seq == $nscaf ]; then
seqend=$(wc -l $seqfile | cut -d' ' -f1)
else
nextstart=$(echo "$scaffolds" | sed -ne "$((${seq}+1))p" | cut -d':' -f1)
seqend=$(($nextstart - 1))
fi
# Extract scaffold sequence
sequence=$(sed -ne "${seqstart},${seqend}p" $seqfile | tr "\n" " ")
sequence="${sequence// /}"
# Define end and length according to input mode
if [[ $length_mode == true ]]; then
len=$end_or_len
end=$(expr $strt + $len - 1)
else
end=$end_or_len
if [[ $end == len ]]; then
end=${#sequence}
fi
if [[ $strt == len ]]; then
strt=${#sequence}
fi
if [[ $strt -gt $end ]]; then
len=$(expr $strt - $end + 1)
else
len=$(expr $end - $strt + 1)
fi
fi
# Print subsequence
if [[ $strt -gt $end ]]; then
if [[ -z $nohead ]]; then
echo '>'$name'|'$(basename $seqfile):$seq'|'$strt'-'$end'(antisense)|'${len}bp
fi
echo "$(rc ${sequence:$(($end - 1)):$len})"
else
if [[ -z $nohead ]]; then
echo '>'$name'|'$(basename $seqfile):$seq'|'$strt'-'$end'|'${len}bp
fi
echo "${sequence:$(($strt - 1)):$len}"
fi
exit