-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtag_wordsim.py
executable file
·114 lines (85 loc) · 3.2 KB
/
tag_wordsim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
#
# Author: (c) 2016 Vincent Kriz <kriz@ufal.mff.cuni.cz>
#
# As WordSim353 does not provide the POS tags, we adopt the same
# strategy as was used in the SimLex-999 dataset. For each word form
# we obtain the most frequent POS from the tagged corpora.
import sys
import logging
import argparse
from collections import defaultdict
# Logging.
logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', level=logging.DEBUG)
# Parse command line arguments.
parser = argparse.ArgumentParser()
parser.description = 'Train most frequent POS tags for each word form.'
parser.add_argument('--dataset', required=True, help='a WordSim to be tagged')
parser.add_argument('--lang', required=True, help='a WordSim mutation (cs, en)')
parser.add_argument('--output', required=True, help='a modified WordSim')
args = parser.parse_args()
# Methods.
def pos_frequencies():
return defaultdict(int)
def parse_wordsim_en(line):
fields = line.rstrip().split('\t')
word1 = fields[0]
word2 = fields[1]
return word1, word2
def parse_wordsim_cs(line):
fields = line.rstrip().split('\t')
word1 = fields[6]
word2 = fields[7]
return word1, word2
def tag_word(word):
max_frequency = 0
max_frequent_pos = 0
for pos in dictionary[word]:
if dictionary[word][pos] > max_frequency:
max_frequency = dictionary[word][pos]
max_frequent_pos = pos
logging.info('Tagging %s by POS %s', word, max_frequent_pos)
return max_frequent_pos
# Parse a dataset and obtain a list of words to be tagged.
vocabulary = dict()
logging.info('Loading WordSim.')
with open(args.dataset, 'r') as fdataset:
for (n_line, line) in enumerate(fdataset):
if n_line == 0:
continue
if args.lang == 'en':
word1, word2 = parse_wordsim_en(line)
elif args.lang == 'cs':
word1, word2 = parse_wordsim_cs(line)
else:
raise NotImplementedError('Unsupported WordSim mutation.')
vocabulary[word1] = 1
vocabulary[word2] = 1
logging.info('Loaded %d different word forms to tag.', len(vocabulary))
# Read STDIN and count frequencies
logging.info('Counting frequencies.')
dictionary = defaultdict(pos_frequencies)
for (n_line, line) in enumerate(sys.stdin):
if (n_line % 10000000) == 0:
logging.info('Processed %10d tokens', n_line)
word, pos = line.rstrip().split('\t')
if args.lang == 'en':
word = word.split('_')[0]
if word in vocabulary:
dictionary[word][pos] += 1
# Save results.
with open(args.output, 'w') as fout:
with open(args.dataset, 'r') as fdataset:
for (n_line, line) in enumerate(fdataset):
if n_line == 0:
fout.write('%s\tPOS 1\tPOS 2\n' % line.rstrip())
continue
if args.lang == 'en':
word1, word2 = parse_wordsim_en(line)
elif args.lang == 'cs':
word1, word2 = parse_wordsim_cs(line)
else:
raise NotImplementedError('Unsupported WordSim mutation.')
pos1 = tag_word(word1)
pos2 = tag_word(word2)
fout.write('%s\t%s\t%s\n' % (line.rstrip(), pos1, pos2))