-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy patheval_sim.sh
executable file
·52 lines (38 loc) · 1.59 KB
/
eval_sim.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Name of the dataset directory (looked up under ./datasets/)
dataset="wiki"
# Name of the corpus text file; it holds one document per line
text_file="text.txt"
# File name the trained word embeddings are written to
out_file="jose.txt"
# Dimensionality of the word embeddings
word_dim="100"
# Size of the local context window
window_size="10"
# Vocabulary cutoff: words occurring fewer than this many times in the corpus are dropped
min_count="100"
# How many passes to make over the corpus
iter="10"
# How many worker threads to run in parallel
threads="20"
# Terminal control sequences used to colour the status messages.
# tput's stderr is discarded and its failure ignored so that, when tput is
# missing or the terminal type is unknown (e.g. cron, CI), both variables are
# simply empty and the messages are printed without colour or error noise.
green=$(tput setaf 2 2>/dev/null || true)
reset=$(tput sgr0 2>/dev/null || true)
# Build the jose binary inside ./src.
# The build runs in a subshell so the script's working directory never
# changes: a failed `cd` can no longer make `make` run in the wrong place or
# leave the script one level above the repository. A failed build aborts the
# script instead of continuing with a missing or stale binary.
if ! (cd ./src && make jose); then
  printf 'Error: failed to build jose in ./src\n' >&2
  exit 1
fi
# Reference point for the running-time report printed after training.
start=$SECONDS
# Download and unpack the Wikipedia dump only when the corpus is not on disk.
# NOTE(review): the check is hard-coded to text.txt rather than ${text_file};
# presumably that is the file the archive extracts to -- confirm.
if [ ! -e "./datasets/${dataset}/text.txt" ]
then
  # Everything runs in a subshell so the caller's working directory is left
  # untouched even when a step fails part-way through.
  (
    cd "./datasets/${dataset}/" || exit 1

    # Private cookie jar instead of a predictable /tmp path; the trap removes
    # it on every exit path of the subshell, including failures.
    cookie_jar=$(mktemp) || exit 1
    trap 'rm -f -- "$cookie_jar"' EXIT

    gdrive_url='https://drive.google.com/a/illinois.edu/uc?export=download'
    file_id=1fT1GxBMXEItf2NtNMjdXhA61HPecPO98

    printf '%s\n' "${green}===Downloading Wikipedia Dump...===${reset}"
    # First request: store the session cookies and extract the confirm token
    # from the returned page.
    confirm_token=$(wget --quiet --save-cookies "$cookie_jar" --keep-session-cookies \
      --no-check-certificate "${gdrive_url}&id=${file_id}" -O- \
      | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')
    # Second request: the actual download, replaying cookies and token.
    wget --load-cookies "$cookie_jar" \
      "${gdrive_url}&confirm=${confirm_token}&id=${file_id}" -O wiki_dump.zip || exit 1

    printf '%s\n' "${green}===Unzipping Wikipedia Dump...===${reset}"
    # The archive is deleted only after it has been extracted successfully.
    unzip wiki_dump.zip && rm wiki_dump.zip
  ) || {
    printf 'Error: could not download or unpack the corpus into ./datasets/%s/\n' "${dataset}" >&2
    exit 1
  }
fi
# Train the embeddings: read ./datasets/<dataset>/<text_file> and write the
# word vectors to ./datasets/<dataset>/<out_file>. Every expansion is quoted
# so values containing spaces or glob characters reach jose as one argument.
train_status=0
./src/jose -train "./datasets/${dataset}/${text_file}" \
  -word-output "./datasets/${dataset}/${out_file}" \
  -size "${word_dim}" -alpha 0.04 -margin 0.15 -window "${window_size}" \
  -negative 2 -sample 1e-3 \
  -min-count "${min_count}" -iter "${iter}" -threads "${threads}" || train_status=$?

# Elapsed wall-clock seconds since `start` was recorded; reported even when
# training failed.
duration=$(( SECONDS - start ))
printf '\nRunning time is %s seconds.\n' "$duration"

# Stop here on failure so nothing downstream consumes a missing or stale
# embedding file.
if [ "$train_status" -ne 0 ]; then
  printf 'Error: ./src/jose exited with status %s\n' "$train_status" >&2
  exit "$train_status"
fi
# Hand the dataset name and the embedding file name (the same file name the
# trainer wrote to) to sim.py.
# NOTE(review): sim.py is presumably the word-similarity evaluation -- confirm.
# Arguments are quoted so names with spaces are passed through intact.
emb_file=${out_file}
python sim.py --dataset "${dataset}" --emb_file "${emb_file}"