-
Notifications
You must be signed in to change notification settings - Fork 0
/
seqcount
91 lines (84 loc) · 2.39 KB
/
seqcount
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/bash
helpstr="SEQCOUNT - Count number of sequences in a FASTA/FASTQ file
Usage: seqcount [option] ... SEQ_FILE
Positional arguments:
1. FASTA or FASTQ file to count
The script will automaticall detect whether the input is a FASTA or FASTQ
file from the header character (> or @).
Options (defaults):
-r: Reference file to calculate percentage
-h: Display this help message and exit
If a reference file is specified, the script will return the number of
sequences in the file in both absolute form and as a percentage of
the number of sequences in the reference file."
unset fastq refq ref_file seq_file
while [ $OPTIND -le $# ]; do
# Optional arguments
while getopts "r:qQh" opt; do
case "$opt" in
q)
fastq="true"
;;
Q)
refq="true"
;;
r)
ref_file=$OPTARG
;;
# Else:
h)
echo "$helpstr"
exit
;;
*)
echo "Invalid argument:" $opt >&2
echo "$helpstr"
exit 1
esac
done
# Positional arguments:
if [[ -z "$seq_file" ]]; then
seq_file=${@:$OPTIND:1}
OPTIND=$(expr $OPTIND + 1)
fi
done
shift $((OPTIND-1))
#Argument errors and warnings:
if [[ -z $seq_file ]]; then
>&2 echo "Error: FASTA or FASTQ input file required."
echo "$helpstr"
exit 1
fi
# Find number of sequences:
headchar=$(awk '/./{print;exit}' "$seq_file" | cut -c1)
if [[ "$headchar" == ">" ]]; then
factor=2
elif [[ "$headchar" == "@" ]]; then
factor=4
else
>&2 echo "Error: Input file must be in FASTA or FASTQ format."
exit 1
fi
nlines=$(egrep -v '^$' "$seq_file" | wc -l) # Excludes blank lines
seqnum=$(expr $nlines / $factor)
if [[ -n "$ref_file" ]]; then
rchar=$(awk '/./{print;exit}' "$ref_file" | cut -c1)
if [[ "$rchar" == ">" ]]; then
rfactor=2
elif [[ "$rchar" == "@" ]]; then
rfactor=4
else
>&2 echo "Error: Reference file must be in FASTA or FASTQ format."
exit 1
fi
rlines=$(egrep -v '^$' "$ref_file" | wc -l) # Excludes blank lines
refnum=$(expr $rlines / $rfactor)
# Print percentage
x=$(expr 100 \* $seqnum)
y=$refnum
perc=$(echo -| awk -v x=$x -v y=$y '{printf "%.2f\n", x/y}')%
echo "$seqnum ($perc)"
else
echo $seqnum
fi
exit