-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetRelevantCorpusFiles.sh
executable file
·73 lines (55 loc) · 1.27 KB
/
getRelevantCorpusFiles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/bin/sh
#
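# Print the relevant label files of the Kiel Corpus, one path per line:
# take the file with prosodic annotation (.s2) if it exists, otherwise
# take the file without prosodic annotation (.s1).
#
# Where a long/unsplit file (*_l.s? or *_r.s?) exists, take it and skip the
# split files that belong to it.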
#
# Root directory of the Kiel Corpus; empty until a value has been assigned.
KC_ROOT=""

# Write the command-line synopsis of this script to standard output.
usage () {
    printf '%s\n' \
        "$0 [-h|--help] KC_ROOT" \
        "-h|--help print this output" \
        "KC_ROOT the root directory of Kiel Corpus"
}
# Parse the command line: -h/--help prints the usage text and exits
# successfully; every other argument is taken as KC_ROOT (if several are
# given, the last one wins).
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        *)
            KC_ROOT=$1
            ;;
    esac
    shift
done

# Refuse to continue without an existing corpus directory. The diagnostics
# go to standard error so that standard output never carries anything but
# the list of files.
if [ -z "$KC_ROOT" ] || [ ! -d "$KC_ROOT" ]; then
    echo "Error: Please specify the root directory of Kiel Corpus!" >&2
    usage >&2
    exit 1
fi
# select_corpus_files ROOT
#
# Print, one per line and sorted, the label files below ROOT that are to be
# used:
#   * of the files <name>.s1 and <name>.s2 the .s2 file is preferred; the
#     .s1 file is printed only when no .s2 file was found;
#   * for every unsplit file <prefix>_l or <prefix>_r, the split files named
#     <prefix> followed only by alphanumeric characters are dropped.
#
# The selection is done in a single awk pass over the find output. Prefixes
# are compared as literal strings instead of being pasted into a regular
# expression, and ROOT is quoted, so directory names containing blanks or
# characters such as "(" or "+" are handled correctly. Paths that contain a
# newline are not supported.
select_corpus_files () {
    find "$1" -type f -name '*.s[12]' |
    awk '
        {
            # stem: the path without its .s1/.s2 extension
            stem = $0
            sub(/\.s[12]$/, "", stem)
            if (!(stem in known)) {
                known[stem] = 1
                stems[++count] = stem
            }
            if ($0 ~ /\.s2$/)
                annotated[stem] = 1
            # remember the prefix of every unsplit file (stem minus _l/_r)
            if (stem ~ /_[lr]$/)
                unsplit[substr(stem, 1, length(stem) - 2)] = 1
        }
        END {
            for (i = 1; i <= count; i++) {
                stem = stems[i]
                keep = 1
                if (stem !~ /_[lr]$/) {
                    # Strip trailing alphanumeric characters one by one; if
                    # what remains is the prefix of an unsplit file, this
                    # stem is one of its split parts.
                    for (cut = length(stem) - 1; cut >= 1; cut--) {
                        if (substr(stem, cut + 1, 1) !~ /^[[:alnum:]]$/)
                            break
                        head = substr(stem, 1, cut)
                        if (head in unsplit) {
                            keep = 0
                            break
                        }
                    }
                }
                if (keep) {
                    ext = (stem in annotated) ? ".s2" : ".s1"
                    print stem ext
                }
            }
        }
    ' |
    sort -u
}

select_corpus_files "$KC_ROOT"