-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy path22.download.sh
executable file
·107 lines (97 loc) · 3.46 KB
/
22.download.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash
#
# (C) Copyright 2017 Hojin Choi <hojin.choi@gmail.com>
#
# vim: ts=4 noexpandtab sw=4 sts=4
# Load shared settings. This script later calls curl_post, which is not defined
# in this file -- presumably it is provided by sejongrc (TODO confirm).
. sejongrc

# Positional arguments:
#   $1  SEQ      article sequence number (sent as articleSeq and fNo)
#   $2  ATTIDX   attachment index (sent as attachIdx)
#   $3  FILESEQ  "1" selects the single-file endpoint; any other value is
#                forwarded as fileSeqValues to the zip endpoint
#   $4  PROCNUM  optional label prepended to every log line
# Environment:
#   TRYCOUNT     maximum number of passes of the retry loop (5 when unset)
#
# REFERER, URL, DATA and DESC are not read again in this file; presumably they
# are consumed by curl_post (TODO confirm against sejongrc).
REFERER="https://ithub.korean.go.kr/user/total/database/electronicDicView.do"
SEQ="$1"
ATTIDX="$2"
FILESEQ="$3"
PROCNUM="$4"
TRYCOUNT=${TRYCOUNT-5}

# Choose the endpoint and the local output file.
if test "$FILESEQ" = "1"; then
	ZIP=N
	FILESEQVALUES=""
	URL="https://ithub.korean.go.kr/common/boardFileDownload.do"
	OUTFILE="download/$SEQ.txt"
else
	ZIP=Y
	# The requested value travels in fileSeqValues; fileSeq itself is reset to 1.
	FILESEQVALUES="$FILESEQ"
	FILESEQ=1
	URL="https://ithub.korean.go.kr/common/boardFileZipDownload.do"
	OUTFILE="download/$SEQ.zip"
fi

DESC="Attachment of $SEQ"

# Log prefix: "(SEQ) ", or "[PROCNUM] (SEQ) " when a process label was given.
PREFIX="($SEQ) "
if test -n "$PROCNUM"; then
	PREFIX="[$PROCNUM] $PREFIX"
fi

mkdir -p download corpus

# Pick the stat flag whose output can be eval'ed into $st_size: use -s when
# this platform's stat accepts it, otherwise an explicit -c format string.
# STATOPTION may hold two words and is expanded unquoted by its user.
if stat -s . >/dev/null 2>&1; then
	STATOPTION="-s"
else
	STATOPTION="-c st_size=%s"
fi

LOGFILE="html/attachment-$SEQ.html"

# Form body for the download request. Every parameter must be separated by a
# literal '&' ("&regGb", "&paramClass1Depth", "&paramClass2Depth"); an
# HTML-entity-decoded copy of this line silently turns those into other
# characters and merges the parameters.
DATA="boardSeq=2&boardGb=T&boardType=CORPUS&articleSeq=$SEQ&roleGb=U&userId=0&fNo=$SEQ&thread=A&lan=1&attachIdx=$ATTIDX&fileSeq=$FILESEQ&fileSeqValues=$FILESEQVALUES&dataGb=E&regGb=1&isInsUpd=U&upperLowerGb=T&pageIndex=1&commentPageIndex=1&subListPageIndex=1&paramClass1Depth=11&paramClass2Depth=1157&searchStartDt=&searchEndDt=&searchDataGb=E&searchCondition=&searchKeyword=&beforePage=&searchWsType=&searchClass1Depth=&searchClass2Depth=&searchAnalType=&searchStartPublishYear=&searchEndPublishYear=&searchPublisher=&searchAuthor=&searchCclAll=&searchCclFree=&searchCommercialUseGb=&searchWorkChangeGb=&searchConditionPermit=&searchCclNoLimit=&searchCclLimit=&searchPlaceTop=&corpusBasketList=&searchYn=Y&cclGb=1&commercialUseGb=2&workChangeGb=2&orgFileSeq=1&posFileSeq=2&agreementYn=on&commentSeq=&commentContents="

# Command used to convert cp949 text to UTF-8; macOS uses the utf8-mac target.
# Kept as a plain string because it is expanded unquoted in a pipeline.
if test "$(uname -s)" = "Darwin"; then
	ICONV="iconv -f cp949 -t utf8-mac"
else
	ICONV="iconv -f cp949 -t utf-8"
fi
# Download with retries.
#
# Each pass first inspects what is already on disk (possibly left by an
# earlier run or by the previous pass):
#   * $OUTFILE non-empty and a filename found in the Content-Disposition line
#     of $LOGFILE  -> accept the download and leave the loop;
#   * $OUTFILE empty and $LOGFILE reports "Content-Length" 0 -> nothing to
#     fetch, exit 0;
#   * anything else -> remove both files and call curl_post again.
# curl_post is not defined in this file; presumably it writes $OUTFILE and
# $LOGFILE (TODO confirm against sejongrc).
#
# Success is tracked with an explicit flag: comparing TCOUNT with TRYCOUNT
# after the loop would misreport a download validated on the final pass.
# NOTE: the download made on the final pass is still not validated in this
# run; the script reports an error and a rerun validates the file.
TCOUNT=0
DOWNLOADED=N
while test "$TCOUNT" -lt "$TRYCOUNT"
do
	TCOUNT=$((TCOUNT + 1))
	if test -f "$OUTFILE" && test -f "$LOGFILE"; then
		# Reset first so a failing stat cannot leave a stale size behind.
		st_size=0
		# $STATOPTION is intentionally unquoted: it may contain two words.
		eval "$(stat $STATOPTION "$OUTFILE")"
		if test "$st_size" -gt 0; then
			# Last '='-separated field of the Content-Disposition line, converted
			# from cp949. printf "%s" keeps a '%' in the name from being treated
			# as a format directive. $ICONV is intentionally unquoted.
			FILENAME=$(grep -a Content-Disposition "$LOGFILE" | $ICONV | tr -d ';\r' | awk -F= '{printf "%s", $NF}')
			if test -n "$FILENAME"; then
				echo "${PREFIX}Download log file: $LOGFILE"
				DOWNLOADED=Y
				break
			fi
			echo "Warn: Oops I can't determine the attached filename for $OUTFILE (Try $TCOUNT/$TRYCOUNT)"
		else
			echo "${PREFIX}Download again (0 sized file): $OUTFILE (Try $TCOUNT/$TRYCOUNT)"
			CONTENTLENGTH=$(grep -a '< Content-Length' "$LOGFILE" | tr -d ';\r' | awk '{printf "%s", $NF}')
			# A missing or non-numeric value simply means "retry"; hide test's complaint.
			if test "$CONTENTLENGTH" -eq 0 2>/dev/null; then
				echo "${PREFIX}Skip zero sized file downloading: $OUTFILE (Try $TCOUNT/$TRYCOUNT) (Check $LOGFILE)"
				exit 0
			fi
		fi
		sleep 1
	else
		echo "${PREFIX}Download $OUTFILE (Try $TCOUNT/$TRYCOUNT)"
	fi
	rm -f "$LOGFILE" "$OUTFILE"
	curl_post
done

if test "$DOWNLOADED" != "Y"; then
	echo "ERROR: Try out error to download $OUTFILE from $URL ($TCOUNT/$TRYCOUNT) (Log $LOGFILE)"
	exit 1
fi
# Record the result, then place the payload under corpus/:
#   * zip downloads are extracted into corpus/ once; "$OUTFILE.stamp" marks a
#     finished extraction and is compared against the zip's timestamp;
#   * single-file downloads are copied to corpus/$FILENAME unless an identical
#     copy is already there.
echo "${PREFIX}Download file: $OUTFILE, orginal $FILENAME"
echo "${PREFIX}$OUTFILE [$FILENAME]" >> logs/download.log

if test "$ZIP" = "Y"; then
	if test "$OUTFILE.stamp" -nt "$OUTFILE" 2>/dev/null ; then
		echo "${PREFIX}Skip unzipping $OUTFILE, already done"
	else
		unzip -o "$OUTFILE" -d corpus 2>/dev/null | while read -r log; do echo "${PREFIX} $log"; done
		# Status of unzip itself, not of the while loop at the end of the pipe.
		UNZIPSTATUS=${PIPESTATUS[0]}
		# unzip uses 1 for "completed with warnings"; anything above that is an
		# error, so do not stamp the archive as extracted.
		if test "$UNZIPSTATUS" -gt 1; then
			echo "ERROR: Failed to unzip $OUTFILE (unzip exit status $UNZIPSTATUS)"
			exit 1
		fi
		echo "${PREFIX}Unzipping $OUTFILE"
		touch "$OUTFILE.stamp"
	fi
else
	if test -z "$FILENAME"; then
		echo "Can't determine filename, it should be found in $LOGFILE"
		exit 1
	fi
	# NOTE(review): FILENAME comes from a response header; it is used as-is for
	# the destination name -- confirm it can never contain path separators.
	# Quoted so names containing spaces or glob characters stay one argument.
	if ! cmp "$OUTFILE" "corpus/$FILENAME" 2>/dev/null; then
		cp "$OUTFILE" "corpus/$FILENAME"
		echo "${PREFIX}Copying $OUTFILE to corpus/$FILENAME"
	fi
fi