forked from genialis/docker-bio-linux8-resolwe
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathre-import.sh
169 lines (143 loc) · 4.97 KB
/
re-import.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# re-import() is a convenience function that copies (or downloads) the given
# file, extracts it and moves it into place.
#
# INPUT ARGUMENTS
#
# TEMP:        source file location; a local path, or an http(s)/ftp URL
#              which is downloaded with curl first
# FILE:        real file name (derived from TEMP when empty)
# IN_FORMAT:   '|'-separated list of accepted extensions, for example
#   fa|fasta       1.) matches files that end with fa or fasta
#                  2.) matches a combination of:
#                      (fa|fasta).(gz|bz2|zip|rar|7z|tgz|tar.gz|tar.bz2)
#   fa|fasta|zip   1.) matches files that end with fa or fasta
#                  2.) matches a combination of:
#                      (fa|fasta).(gz|bz2|zip|rar|7z|tgz|tar.gz|tar.bz2)
#                  3.) matches if the file ends just with zip;
#                      supported: (gz|bz2|zip|rar|7z)
#              A trailing '|' (empty entry) also accepts files without any
#              of the listed extensions. Matching is case-insensitive.
# OUT_FORMAT:  the desired output format before compression, i.e. fasta
# MAX_PROGRES: maximum progress at the end of transfer (0.0 to 1.0),
#              defaults to 1.0
# COMPRESSION: if "compress" return compressed data
#              if "extract" return extracted data
#              else return both
#
# GLOBALS
#
# The arguments are stored in the same-named global variables. NAME is
# exported and holds FILE with the recognised extension(s) removed, so the
# results are "${NAME}.${OUT_FORMAT}" and/or "${NAME}.${OUT_FORMAT}.gz".
# RC holds the last status inspected by testrc. The nocasematch shell option
# is left enabled after the call.
#
# OUTPUT / FAILURE
#
# JSON fragments ({"proc.progress":..}, {"proc.rc":..}, {"proc.error":..})
# are written to stdout. On any failure the function does not return: it
# prints {"proc.rc":N} and exits the current shell with status N.
re-import() {
    TEMP=$1
    FILE=${2:-}
    IN_FORMAT=$3
    OUT_FORMAT=$4
    MAX_PROGRES=${5:-1.0}
    COMPRESSION=${6:-}

    local -a pipe_rc
    local matched importer
    local archive_ext='bz2|zip|rar|7z|tgz|tar\.gz|tar\.bz2'

    echo "Importing and compressing..."

    # All extension tests below are case-insensitive (.FA is the same as .fa).
    shopt -s nocasematch

    # testrc [CODE]
    # Must be called directly after the command to check, because it reads $?.
    # Without CODE: report and exit on any non-zero status.
    # With CODE:    report and exit only when the status equals CODE.
    function testrc {
        RC=$?
        if [[ -n "${1:-}" ]]; then
            [[ "$RC" -eq "$1" ]] || return 0
        else
            [[ "$RC" -gt 0 ]] || return 0
        fi
        echo "{\"proc.rc\":$RC}"
        exit "$RC"
    }

    # Input is already gzip-compressed: keep it as the .gz result and/or
    # decompress it next to it, depending on COMPRESSION.
    function importGz {
        mv "${TEMP}" "${NAME}.${OUT_FORMAT}.gz"
        testrc

        if [ "$COMPRESSION" != extract ]; then
            gzip -t "${NAME}.${OUT_FORMAT}.gz"
            testrc 1 # RC 2 "trailing garbage ignored" is OK
        fi

        if [ "$COMPRESSION" != compress ]; then
            gzip -dc "${NAME}.${OUT_FORMAT}.gz" > "${NAME}.${OUT_FORMAT}"
            testrc 1 # RC 2 "trailing garbage ignored" is OK

            if [ "$COMPRESSION" = extract ]; then
                rm "${NAME}.${OUT_FORMAT}.gz"
                testrc
            fi
        fi
    }

    # Input is an archive handled by 7z: extract it to the plain output file,
    # then (unless only extraction was requested) gzip that file.
    function import7z {
        # Uncompress original file
        if [[ ".${FILE}" =~ \.(tgz|tar\.gz|tar\.bz2)$ ]]; then
            # NOTE(review): only tar's status is checked here; a failing 7z
            # stage is not detected by itself - confirm whether that matters.
            7z x -y -so "${TEMP}" | tar -xO > "${NAME}.${OUT_FORMAT}"
            testrc
        else
            7z x -y -so "${TEMP}" > "${NAME}.${OUT_FORMAT}"
            testrc
        fi

        # Remove original file
        rm "${TEMP}"
        testrc

        if [ "$COMPRESSION" != extract ]; then
            # Compress uncompressed file
            gzip -c "${NAME}.${OUT_FORMAT}" > "${NAME}.${OUT_FORMAT}.gz"
            testrc

            if [ "$COMPRESSION" = compress ]; then
                # Remove uncompressed file
                rm "${NAME}.${OUT_FORMAT}"
                testrc
            fi
        fi
    }

    # Input is not compressed: gzip it unless only the extracted form is
    # wanted, then either drop the source (compress) or move it into place.
    function importUncompressed {
        if [ "$COMPRESSION" != extract ]; then
            gzip -c "${TEMP}" > "${NAME}.${OUT_FORMAT}.gz"
            testrc
        fi

        if [ "$COMPRESSION" = compress ]; then
            rm "${TEMP}"
            testrc
        else
            mv "${TEMP}" "${NAME}.${OUT_FORMAT}"
            testrc
        fi
    }

    regex='^(https?|ftp)://[-A-Za-z0-9\+&@#/%?=~_|!:,.;]*[-A-Za-z0-9\+&@#/%=~_|]'
    if [[ "$TEMP" =~ $regex ]]; then
        URL=${TEMP}
        FILE=${FILE:-$(basename "${URL%%\?*}")}
        TEMP=download_$(basename "${URL%%\?*}")

        # curl's progress meter goes to stderr with '\r' separators; turn it
        # into one percentage per line for curlprogress.py.
        curl --connect-timeout 10 -a --retry 10 -# -L -o "${TEMP}" "${URL}" 2>&1 \
            | stdbuf -oL tr '\r' '\n' \
            | grep -o '[0-9]*\.[0-9]' \
            | curlprogress.py --scale "$MAX_PROGRES"
        pipe_rc=("${PIPESTATUS[@]}")

        # $? after a pipeline is only the last stage, so check curl itself
        # first and the progress reporter second.
        ( exit "${pipe_rc[0]}" )
        testrc
        ( exit "${pipe_rc[3]}" )
        testrc
    fi

    # Check if a temporary file exists; stop here instead of letting a later
    # step fail after it has already created an empty output file.
    if [ ! -f "${TEMP}" ]; then
        echo "{\"proc.error\":\"File transfer failed: temporary file not found\"}"
        echo "{\"proc.rc\":1}"
        exit 1
    fi

    # Set FILE to extracted filename from TEMP if FILE not set
    FILE=${FILE:-$(basename "${TEMP%%\?*}")}
    # Take basename if FILE not nice
    FILE=$(basename "${FILE%%\?*}")

    # Add an escaped dot to all input formats except for no extension, so the
    # dot is matched literally in the regular expressions below:
    # txt|csv -> \.txt|\.csv, txt|csv| -> \.txt|\.csv|
    IN_FORMAT=$(printf '%s' "$IN_FORMAT" | sed -E 's/[^|]+/\\.&/g')

    # Decide which import to use based on the $FILE extension and the
    # $IN_FORMAT. A dot is prepended to FILE so that a file named exactly like
    # an extension (e.g. "fa") is still accepted. The matched suffix is
    # remembered so it can be cut off FILE regardless of letter case.
    if [[ ".${FILE}" =~ (${IN_FORMAT})\.gz$ ]]; then
        matched=${BASH_REMATCH[0]}
        importer=importGz
    elif [[ ".${FILE}" =~ (${IN_FORMAT})\.(${archive_ext})$ ]]; then
        matched=${BASH_REMATCH[0]}
        importer=import7z
    elif [[ ".${FILE}" =~ (${IN_FORMAT})$ ]]; then
        matched=${BASH_REMATCH[0]}
        importer=importUncompressed
        # The accepted extension may itself be a compression format.
        if [[ ".${FILE}" =~ \.gz$ ]]; then
            matched=${BASH_REMATCH[0]}
            importer=importGz
        elif [[ ".${FILE}" =~ \.(${archive_ext})$ ]]; then
            matched=${BASH_REMATCH[0]}
            importer=import7z
        fi
    else
        echo "{\"proc.rc\":1}"
        exit 1
    fi

    # If the match includes the artificial leading dot it is not a suffix of
    # FILE and nothing is removed.
    export NAME="${FILE%"$matched"}"
    "$importer"

    echo "{\"proc.progress\":$MAX_PROGRES}"
}