-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdl_uncompress.sh
49 lines (33 loc) · 1.34 KB
/
dl_uncompress.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Download, decompress and preprocess dump files.
#
# Pipeline:
#   1. ask fetch_file_links.py for the list of file URLs in the date range,
#   2. download them in parallel into FILE_READ_PATH,
#   3. decompress every .zst / .bz2 / .xz archive in place,
#   4. run main.py in `preprocess` mode, reading FILE_READ_PATH and writing
#      to FILE_WRITE_PATH.
#
# Must be started from the collector root (the directory that holds
# fetch_file_links.py and main.py).
# External tools used: python, GNU parallel, wget, unzstd, bzip2, unxz.

# Abort on the first failing command, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Print an error message to stderr and stop the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# GLOBAL arguments. Please change these to match your setup.
SCRAP="comments" # choose between `comments` or `submissions`
FILE_READ_PATH="/path/to/downloaded/files"
FILE_WRITE_PATH="/path/to/written/files"
# Date range handed to fetch_file_links.py; change it to scrape another period.
START_DATE="2022-06"
END_DATE="2022-07"
##################################

COLLECTOR_PATH=$(pwd)
echo ""
echo "Root path is '$COLLECTOR_PATH'"

mkdir -p -- "$FILE_READ_PATH" || die "cannot create '$FILE_READ_PATH'"

# Write the URL list straight into the download directory. Stop here if the
# link fetcher fails, so we never download from an empty or partial list.
python fetch_file_links.py -scrap "$SCRAP" \
  -start_date "$START_DATE" -end_date "$END_DATE" \
  > "$FILE_READ_PATH/url.list" \
  || die "fetch_file_links.py failed"

# Everything below (including the rm calls) assumes we are inside
# FILE_READ_PATH, so a failed cd must be fatal.
cd -- "$FILE_READ_PATH" || die "cannot cd to '$FILE_READ_PATH'"

echo "Starting to download ..."
echo "This may take several minutes..."
# Up to 10 concurrent downloads, one wget per URL.
parallel -j10 wget {} < url.list || die "one or more downloads failed"
rm -f -- url.list

echo "Un-compressing downloaded files..."
# Each tool writes its output next to the input with the compression suffix
# stripped. The former `> {}.json` redirect on the zstd line was parsed by
# the shell (not by xargs) and only created a stray file literally named
# '{}.json', so it has been removed.
find . -name '*.zst' -print0 | xargs -0 -I {} -P 15 unzstd --long=31 {}
find . -name '*.bz2' -print0 | xargs -0 -I {} -P 10 bzip2 -v -d {}
find . -name '*.xz' -print0 | xargs -0 -I {} -P 5 unxz {}

# At this stage the compressed files can be removed to free up disk space.
# If you want to keep them, comment out the following line.
# Note: bzip2 -d and unxz already delete their inputs on success, while
# unzstd keeps them; -f makes rm ignore patterns that match nothing.
rm -f -- *.zst *.bz2 *.xz

# return to the collector root
cd -- "$COLLECTOR_PATH" || die "cannot cd back to '$COLLECTOR_PATH'"

python main.py -mode preprocess \
  -read_dir "$FILE_READ_PATH" \
  -write_dir "$FILE_WRITE_PATH" \
  -tldr_th 4 \
  -lower