-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmake.sh
executable file
·21 lines (19 loc) · 940 Bytes
/
make.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
#!/bin/bash
# Author: Saurabh Pathak
#
# Port of the POS tagger's makefile to a bash script.
# It does not use intermediate disk I/O and can deal with both file and
# string input. Sentence boundary detection (SBD) is also improved: it can
# handle abbreviations and big text files.
#
# Usage:
#   make.sh FILE     tag the contents of the regular file FILE
#   make.sh "TEXT"   tag the literal string TEXT (used when no such file exists)
#   make.sh          tag whatever arrives on standard input
#
# Environment:
#   THESISDIR  root directory under which the tagger lives at
#              data/downloaded/hindi-part-of-speech-tagger
#
# Output: the result of the tagging pipeline is written to stdout.

TAGGERDIR="$THESISDIR/data/downloaded/hindi-part-of-speech-tagger"

# Each command is kept as an array so that a TAGGERDIR containing whitespace
# or glob characters is still passed to the tool as a single word.
# The models/... paths are relative and are resolved after the cd below.
# Use option -u1 for speed at a slight cost of precision.
# For more options use ./bin/tnt -h
TAGGER=("$TAGGERDIR/bin/tnt" -v0 -H models/hindi)
LEMMATIZER=("$TAGGERDIR/bin/lemmatiser.py" models/hindi.lemma)
TAG2VERT="$TAGGERDIR/bin/tag2vert.py"
NORMALIZE="$TAGGERDIR/bin/normalize_vert.py"
POSMOD="$TAGGERDIR/bin/modify_pos.py" # defined for reference; not used by this pipeline
TOKENIZER=("$TAGGERDIR/bin/unitok.py" -l hindi -n)

# Resolve the input BEFORE changing directory so that a relative FILE
# argument is looked up from the caller's working directory.
if [[ -z "${1-}" ]]; then
  # No (or empty) argument: read the text from standard input.
  IN=$(cat)
elif [[ -f "$1" ]]; then
  # Argument names an existing regular file: use its contents.
  IN=$(<"$1")
else
  # Anything else is treated as the literal text to tag.
  IN=$1
fi

# The tagger and lemmatiser are given model paths relative to TAGGERDIR, so
# running from anywhere else cannot work; stop instead of continuing blindly.
cd "$TAGGERDIR" || {
  printf '%s: cannot change directory to %s\n' "${0##*/}" "$TAGGERDIR" >&2
  exit 1
}

# Inner pipeline (handed to the tagger as a file via process substitution):
#   1. append " EOL" to every input line (presumably an end-of-line marker
#      used for sentence boundary handling downstream -- confirm),
#   2. tokenize,
#   3. replace the danda character with a period,
#   4. normalize.
# Outer pipeline: collapse runs of tabs into a single tab, lemmatise, and run
# the result through tag2vert.
# printf is used rather than echo so that input such as "-n" or "-e" is
# emitted verbatim instead of being taken as an echo option.
"${TAGGER[@]}" <(
  printf '%s\n' "$IN" \
    | sed 's/$/ EOL/g' \
    | "${TOKENIZER[@]}" \
    | sed -e 's/।/./g' \
    | "$NORMALIZE"
) \
  | sed -e 's/\t\+/\t/g' \
  | "${LEMMATIZER[@]}" \
  | "$TAG2VERT"

# NOTE(review): the status is always 0 once the pipeline has been started,
# regardless of failures inside it; kept so existing callers see no change.
# Consider propagating the pipeline status instead.
exit 0