-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexport.sh
executable file
·393 lines (347 loc) · 10.6 KB
/
export.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
#!/usr/bin/env bash
###################################
# Logging helpers: colorized status tags printed on stderr.
#   inf  - blue  "[+]" informational message
#   warn - yellow "[!]" warning
#   err  - red   "[!]" error
# All accept printf-%b bodies (escape sequences are interpreted).
function inf {
  printf "\033[34m[+]\033[0m %b\n" "$@" >&2
}
function warn {
  printf "\033[33m[!]\033[0m %b\n" "$@" >&2
}
function err {
  printf "\033[31m[!]\033[0m %b\n" "$@" >&2
}
# err!: report the error, then abort the whole script with status 1.
function err! {
  err "$@" && exit 1
}
# --- exit / interrupt handling ---------------------------------------------
# EXIT_RES carries the script's real exit status across the trap handler.
unset EXIT_RES
# ON_EXIT is a list of cleanup commands; normalize whatever the environment
# provided into a (possibly empty) array.
ON_EXIT=("${ON_EXIT[@]}")
# Trap handler: remember the exit status, run every registered cleanup
# callback (ignoring their failures), then exit with the original status.
function on_exit_fn {
EXIT_RES=$?
for cb in "${ON_EXIT[@]}"; do $cb || true; done
# read might hang on ctrl-c, this is a hack to finish the script for real
clear_exit
exit $EXIT_RES
}
# Register one or more commands to run when the script terminates.
function on_exit {
ON_EXIT+=("$@")
}
# Drop the traps so the final 'exit' inside on_exit_fn does not re-enter it.
function clear_exit {
trap - EXIT SIGINT
}
# Run the cleanup callbacks on normal exit and on ctrl-c.
trap on_exit_fn EXIT SIGINT
# barsh: render a text progress bar using inverse-video escape codes.
#   $1 val - current progress value
#   $2 bas - maximum value (the base of the percentage)
#   $3 txt - optional label; padded or truncated to the bar width
#   $4 wid - optional width; defaults to ${#txt}, or to $bas when no label
# Prints the bar to stdout; returns 1 when val/bas are missing.
function barsh {
  [[ $# -lt 2 ]] && return 1
  local val=$1; local bas=$2; local txt=$3; local wid=$4;
  # derive a width when none was given
  [[ -z $wid ]] && { [[ -z $txt ]] && wid=$bas || wid=${#txt} ; }
  # no label: a blank bar of the requested width
  [[ -z $txt ]] && txt="$(printf '%*s' "$wid" '')"
  # pad a short label up to the width. Fixed: the subtraction was inverted
  # (len - wid, a negative field width) and only produced the right spaces
  # by accident of printf's left-justification of the empty string.
  [[ $wid -gt ${#txt} ]] && txt=$txt$(printf '%*s' $((wid - ${#txt})) '')
  # split point between the "filled" (reverse video) and "empty" segments
  local per=$(( (wid * val) / bas ))
  printf "\033[7m%s\033[27m%s" "${txt:0:$per}" "${txt:$per:$((wid-per))}"
}
###################################
# Configuration: every variable below may be preset in the environment and is
# refined later by parse_args.
ACTION=${ACTION}   # subcommand: quack|duck|parquet|csv|csv.gz|custom|help
EXP=$(basename $0)   # program name, used in the help text
EXP_FROM=${EXP_FROM}   # inclusive interval start (>= YYYY-MM-DD)
EXP_FROM_E=${EXP_FROM_E}   # exclusive interval start (> YYYY-MM-DD)
EXP_TO=${EXP_TO}   # exclusive interval end (< YYYY-MM-DD)
EXP_TO_I=${EXP_TO_I}   # inclusive interval end (<= YYYY-MM-DD)
EXP_ALL=${EXP_ALL}   # non-empty: export each network into its own file
EXP_YEARLY=${EXP_YEARLY:-0}   # set by --yearly (not referenced elsewhere here)
EXP_MONTHLY=${EXP_MONTHLY:-0}   # set by --monthly (not referenced elsewhere here)
EXP_NETWORK=${EXP_NETWORK}   # restrict the export to a single network tag
EXP_OUT=${EXP_OUT:-'/dev/stdout'}   # output file (defaults to stdout)
# anything between 10 and 15 has a good CPU time / filesize ratio
EXP_ZSTD_C_LVL=${EXP_ZSTD_C_LVL:-10}
EXP_FORCE=${EXP_FORCE}   # non-empty: overwrite existing output files
# usage: print the help text (banner, options, commands, examples) to stdout.
# Relies on $EXP (the program name) being set.
function usage {
cat << EOF
. ' ,
_________
_ /_|_____|_\ _
'. \ / .'
'.\ /.'
'.'
$EXP: your bike share import and export specialist
Usage: $EXP action [options...]
Options:
-ge, --from start date interval inclusive (>= YYYY-MM-DD)
-gt, --efrom start date interval exclusive (> YYYY-MM-DD)
-lt, --to end date interval exclusive (< YYYY-MM-DD)
-le, --ito end date interval inclusive (<= YYYY-MM-DD)
--network filter by network tag
--all export each network on a separate file
--monthly do monthly exports on interval
--yearly do yearly exports on interval
--zstd ZSTD compression level
-f, --force force action
-o, --out export output file (defaults to stdout)
-V, --verbose echo every command that gets executed
-h, --help display this help
Commands:
help Show usage
quack cb.db cb.duck SQLite DB to duckdb
parquet cb.duck Export stat data as parquet file
csv cb.duck Export stat data as csv file
custom cb.duck Use custom format (duckdb)
Example:
$ $EXP quack cb.db cb.duck
$ $EXP parquet cb.duck --from 2024-11-01 --to 2024-11-05 > out.parquet
$ $EXP csv cb.duck --from 2024-11-01 --to 2024-11-05 -o out.csv
$ $EXP parquet --all --from 2024-11-01 --to 2024-12-01 -o dump/202411
$ $EXP custom cb.duck --network bicing -o out.parquet -- \\
"FORMAT 'parquet', CODEC 'zstd', COMPRESSION_LEVEL 22"
$ $EXP csv cb.duck --network bicing --from 2024-11-01 --to 2024-12-02 \\
| gzip > bicing.csv.gz
$ $EXP csv.gz --all --from 2024-11-01 --to 2024-12-01 -o dump/202411
EOF
}
# parse_args: digest the command line into globals.
#   - the first non-dash token becomes ACTION
#   - recognized flags update the EXP_* globals (value flags consume $2)
#   - "-" appends the contents of the file named by the next argument to _ARGS
#   - everything after "--" is collected verbatim into _NP_ARGS
#   - any other bare word is collected into _ARGS
function parse_args {
  _ARGS=()
  _NP_ARGS=()
  if ! [[ $1 =~ ^- ]]; then ACTION=$1; shift; fi
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help)    usage; exit 0 ;;
      -V|--verbose) set -x ;;
      --from|-ge)   EXP_FROM=$2; shift ;;
      --efrom|-gt)  EXP_FROM_E=$2; shift ;;
      --to|-lt)     EXP_TO=$2; shift ;;
      --ito|-le)    EXP_TO_I=$2; shift ;;
      --network)    EXP_NETWORK=$2; shift ;;
      --zstd)       EXP_ZSTD_C_LVL=$2; shift ;;
      -o|--out)     EXP_OUT=$2; shift ;;
      --yearly)     EXP_YEARLY=1 ;;
      --monthly)    EXP_MONTHLY=1 ;;
      --all)        EXP_ALL=1 ;;
      -f|--force)   EXP_FORCE=1 ;;
      -)            _ARGS+=("$(cat "$2")"); shift ;;
      --)           shift; _NP_ARGS+=("$@"); break ;;
      *)            _ARGS+=("$1") ;;
    esac
    shift
  done
}
# export_all: fan out one export per network into separate files.
#   $1 - duckdb file   $2 - format/ACTION (parquet|csv|csv.gz|custom)
# Lists the distinct network tags inside the requested time window, then
# re-invokes this script once per network, deriving each output filename
# from EXP_OUT as <dir>/<prefix>-<network>-stats.<extension>.
function export_all {
# Distinct network tags in the interval.
# NOTE(review): relies on word-splitting duckdb's output into an array —
# assumes network tags never contain whitespace.
local networks=($(duckdb $1 -readonly --list --noheader << EOF
SELECT DISTINCT(network_tag) FROM stats
WHERE true
${EXP_FROM:+"AND timestamp >= '$EXP_FROM'"}
${EXP_FROM_E:+"AND timestamp > '$EXP_FROM_E'"}
${EXP_TO:+"AND timestamp < '$EXP_TO'"}
${EXP_TO_I:+"AND timestamp <= '$EXP_TO_I'"}
ORDER BY network_tag ASC
;
EOF
))
# Per-network files need a real path prefix, not a stream.
[[ $EXP_OUT == "/dev/stdout" ]] && err! "Output can't be stdout"
local where=$(dirname $EXP_OUT)
mkdir -p $where
# Split EXP_OUT's basename into name prefix and extension.
local bname=$(basename $EXP_OUT)
local prefix=${bname%.*}
local extension=${bname#*.}
# No dot in the basename: both expansions match, fall back to the format name.
[[ "$prefix" == "$extension" ]] && extension="$2"
# Rebuild the interval arguments to forward to the child invocations.
# NOTE: each element holds "--flag value" as ONE word; the later unquoted
# ${args[@]} expansion deliberately re-splits them.
local args=()
args+=("$2")
args+=("$1")
[[ -n $EXP_FROM ]] && args+=("--from $EXP_FROM")
[[ -n $EXP_TO ]] && args+=("--to $EXP_TO")
[[ -n $EXP_FROM_E ]] && args+=("--efrom $EXP_FROM_E")
[[ -n $EXP_TO_I ]] && args+=("--ito $EXP_TO_I")
local filename
local i=1
local per
for network in ${networks[@]}; do
per=$(( ($i * 100) / ${#networks[@]} ))
filename="$where/$prefix-$network-stats.$extension"
# Refuse to clobber existing exports unless -f/--force was given.
[[ -f $filename ]] && [[ -z $EXP_FORCE ]] && \
err! "File exists, use -f to force overwrite"
# Redraw a single progress-bar line (carriage to column 0, clear, blue).
printf "\033[0G\033[K\033[34m%b\033[0m" \
"$(barsh $i ${#networks[@]} \
"│$i/${#networks[@]}:${per}%│ $network → $filename" $COLUMNS)"
# XXX kind of a hack: csv.gz has no duckdb writer, so re-run ourselves in
# plain csv mode and pipe stdout through gzip instead.
if [[ $2 == "csv.gz" ]]; then
$(realpath $0) ${args[@]} --network $network | gzip -cf > $filename
else
$(realpath $0) ${args[@]} --network $network -o $filename &> /dev/null
fi
((i+=1))
done
}
# main: dispatch on ACTION after parsing flags.
#   quack/duck        - convert a SQLite stats db into a duckdb file
#   parquet/csv/...   - export deduplicated stats from a duckdb file
function main {
parse_args "$@"
# re-set action arguments after parsing.
# access action arguments as $1, $2, ... in order
set -- "${_ARGS[@]}"
case $ACTION in
# Dumps sqlite db into a CSV with json columns and imports them on
# a duckdb. Necessary because duckdb does not support JSONB columns
quack|duck)
[[ -z $1 ]] && err! "Please provide input sqlite db filename"
local sqlfile=$(realpath $1); shift
! [[ -f $sqlfile ]] && err! "$sqlfile not found"
# Output duckdb defaults to the sqlite name with a .duck extension.
local duckfile=${1:-${sqlfile%.*}.duck}
[[ -f $duckfile ]] && [[ -z $EXP_FORCE ]] && \
err! "$duckfile exists, use -f to force" || \
warn "quacking $duckfile"
# Intermediate CSV dump; removed by the EXIT trap via on_exit.
tmpfile=$(mktemp -u cb-export.csv.XXXXX)
inf "Converting to CSV in $tmpfile"
on_exit "rm -f $tmpfile"
sqlite3 --csv $sqlfile << EOF > $tmpfile
SELECT network_tag, json(station), timestamp FROM stats
WHERE true
${EXP_FROM:+"AND timestamp >= '$EXP_FROM'"}
${EXP_FROM_E:+"AND timestamp > '$EXP_FROM_E'"}
${EXP_TO:+"AND timestamp < '$EXP_TO'"}
${EXP_TO_I:+"AND timestamp <= '$EXP_TO_I'"}
${EXP_NETWORK:+"AND network_tag = '$EXP_NETWORK'"}
;
EOF
inf "Populating $duckfile with data"
# Load the CSV and create a helper view that drops consecutive samples
# where neither bikes nor free changed for a station.
duckdb $duckfile << EOF
CREATE TABLE IF NOT EXISTS stats (
network_tag VARCHAR,
station JSON,
timestamp TIMESTAMP
);
COPY stats from '$tmpfile';
-- useful view for manual working with duck db file
CREATE VIEW IF NOT EXISTS _deduped AS (
WITH st_window AS (
SELECT network_tag,
station,
station.bikes AS bikes,
station.free AS free,
lag(station.bikes, 1, NULL) OVER st AS prev_bikes,
lag(station.free, 1, NULL) OVER st AS prev_free,
timestamp
FROM stats
WINDOW st AS(
PARTITION BY network_tag, station.id, station.extra.uid
ORDER BY timestamp ASC
)
ORDER BY network_tag, station.id, station.extra.uid,
timestamp ASC
)
SELECT * FROM st_window
WHERE
bikes <> prev_bikes OR free <> prev_free OR
(prev_bikes IS NULL AND prev_free IS NULL)
);
EOF
;;
parquet|csv|csv.gz|custom)
! [[ -f $1 ]] && err! "Please provide a duckdb"
local duckfile=$1; shift
# --all: delegate to export_all (one file per network) and stop here.
[[ -n $EXP_ALL ]] && {
export_all $duckfile $ACTION
exit $?
}
# Build the duckdb COPY ... WITH (...) option string for the format.
local format
case $ACTION in
parquet)
format="FORMAT 'PARQUET', \
CODEC 'ZSTD', \
COMPRESSION_LEVEL $EXP_ZSTD_C_LVL"
;;
csv)
format="FORMAT 'csv', HEADER"
;;
# NOTE(review): without --all, csv.gz writes plain uncompressed csv to
# EXP_OUT — gzip only happens in export_all. Confirm this is intended.
csv.gz)
format="FORMAT 'csv', HEADER"
;;
custom)
# raw options passed after "--" on the command line
format="${_NP_ARGS[@]}"
;;
*)
err! "Unrecognized format $ACTION"
;;
esac
# Dedupe consecutive identical (bikes, free) samples per station, flatten
# the station JSON into columns, and export in the chosen format.
duckdb -readonly $duckfile << EOF
COPY (
WITH st_window AS (
SELECT network_tag,
station,
station.bikes AS bikes,
station.free AS free,
lag(station.bikes, 1, NULL) OVER st AS prev_bikes,
lag(station.free, 1, NULL) OVER st AS prev_free,
timestamp
FROM stats
WHERE true
${EXP_FROM:+"AND timestamp >= '$EXP_FROM'"}
${EXP_FROM_E:+"AND timestamp > '$EXP_FROM_E'"}
${EXP_TO:+"AND timestamp < '$EXP_TO'"}
${EXP_TO_I:+"AND timestamp <= '$EXP_TO_I'"}
${EXP_NETWORK:+"AND network_tag = '$EXP_NETWORK'"}
WINDOW st AS(
PARTITION BY network_tag, station.id, station.extra.uid
ORDER BY timestamp ASC
)
ORDER BY network_tag, station.id, station.extra.uid,
timestamp ASC
),
deduped AS (
SELECT * FROM st_window
WHERE
bikes <> prev_bikes OR free <> prev_free OR
(prev_bikes IS NULL and prev_free IS NULL)
)
SELECT network_tag AS tag,
json_extract_string(station, '$.id') AS id,
json_extract_string(station, '$.extra.uid') AS nuid,
json_extract_string(station, '$.name') AS name,
json_extract(station, '$.latitude')::double AS latitude,
json_extract(station, '$.longitude')::double AS longitude,
json_extract(station, '$.bikes')::int AS bikes,
json_extract(station, '$.free')::int AS free,
station.extra,
timestamp
FROM deduped
ORDER BY tag, id, nuid, timestamp ASC
) TO '$EXP_OUT' WITH ($format);
EOF
;;
help)
usage
;;
*)
usage
exit 1
;;
esac
}
main "$@"