-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsent_morph.py
77 lines (60 loc) · 3.1 KB
/
sent_morph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
from collections import Counter
from typing import Tuple, List

from nltk import ngrams

from .utils import EXTERNAL_DIR
# Reference POS vocabularies, read once at import time from EXTERNAL_DIR.
#
# Every name is pre-bound to an empty list. If a file cannot be read, the
# matching features are then genuinely skipped (iterating an empty vocabulary
# yields no feature values) instead of the name being left undefined and the
# extractors failing later with NameError.
pos_unigrams = []
possible_pos_bigrams = []
possible_pos_trigrams = []

try:
    with open(os.path.join(EXTERNAL_DIR, "pos_labels.txt"), 'r', encoding="utf8") as fin:
        # Whitespace-separated POS labels.
        pos_unigrams = fin.read().split()
except Exception:
    # Deliberately best-effort: a missing resource must not break the import.
    print("Failed to read pos_labels.txt. Skipping pos label features.")

try:
    with open(os.path.join(EXTERNAL_DIR, "common_pos_bigrams.txt"), 'r', encoding="utf8") as fin:
        # One bigram per line, tags separated by whitespace. A blank line
        # (e.g. after a trailing newline) becomes an empty tuple; it is kept
        # so the length of the list, and of the feature vectors, is unchanged.
        possible_pos_bigrams = [tuple(poses.split()) for poses in fin.read().split("\n")]
except Exception:
    print("Failed to read common_pos_bigrams.txt. Skipping pos bigram features.")

try:
    with open(os.path.join(EXTERNAL_DIR, "common_pos_trigrams.txt"), 'r', encoding="utf8") as fin:
        # Same line format and blank-line handling as the bigram file.
        possible_pos_trigrams = [tuple(poses.split()) for poses in fin.read().split("\n")]
except Exception:
    print("Failed to read common_pos_trigrams.txt. Skipping pos trigram features.")
def extract_sent_morphological_features(
        text: str,
        paragraph: List[List[Tuple[str, str, str, str]]],
        feature_names=None):
    """Concatenate the outputs of the selected morphological extractors.

    Args:
        text: Passed unchanged to every extractor.
        paragraph: Sentences, each a list of token tuples. The element at
            index 2 of every tuple is collected as that token's POS tag.
        feature_names: Keys of ``sent_morphological_features`` to run, in the
            given order. ``None`` runs every registered extractor in
            registry order.

    Returns:
        A flat list holding the values produced by each extractor, in order.

    Raises:
        KeyError: If a requested name is not in the registry.
    """
    pos_tags = [token[2] for sent in paragraph for token in sent]

    if feature_names is None:
        extractors = sent_morphological_features.values()
    else:
        # Generator: each name is resolved only when its turn comes.
        extractors = (sent_morphological_features[name] for name in feature_names)

    return [value for extract in extractors for value in extract(text, pos_tags)]
def pos_unigram_freq(text: str, paragraph_pos_unigrams: list):
    """Return a scaled occurrence count for every known POS label.

    Args:
        text: Not used by this extractor.
        paragraph_pos_unigrams: POS tags of the paragraph, in order.

    Returns:
        One float per entry of ``pos_unigrams`` (same order): how often the
        label occurs in ``paragraph_pos_unigrams`` divided by the number of
        known labels.
    """
    # NOTE(review): the divisor is the size of the label vocabulary, not the
    # number of tags in the paragraph -- confirm this scaling is intended.
    label_total = len(pos_unigrams)
    frequencies = []
    for label in pos_unigrams:
        occurrences = paragraph_pos_unigrams.count(label)
        frequencies.append(float(occurrences / label_total))
    return frequencies
def pos_bigram_freq(text: str, paragraph_pos_unigrams: list):
    """Return a scaled occurrence count for every reference POS bigram.

    Args:
        text: Not used by this extractor.
        paragraph_pos_unigrams: POS tags of the paragraph, in order.

    Returns:
        One float per entry of ``possible_pos_bigrams`` (same order): the
        number of adjacent tag pairs equal to that bigram, divided by
        ``len(possible_pos_bigrams) - 1``. When that divisor is not
        positive, every value is ``0.0``.
    """
    # NOTE(review): the divisor comes from the reference list, not from the
    # number of bigrams in the paragraph -- confirm this scaling is intended.
    denominator = len(possible_pos_bigrams) - 1
    if denominator <= 0:
        # Emit scalar zeros so the vector stays flat; a list such as [0.0]
        # as an element would nest the feature vector.
        return [0.0 for _ in possible_pos_bigrams]

    # Count all paragraph bigrams once rather than rescanning per entry.
    bigram_counts = Counter(ngrams(paragraph_pos_unigrams, 2))
    return [float(bigram_counts[pos] / denominator) for pos in possible_pos_bigrams]
def pos_trigram_freq(text: str, paragraph_pos_unigrams: list):
    """Return a scaled occurrence count for every reference POS trigram.

    Args:
        text: Not used by this extractor.
        paragraph_pos_unigrams: POS tags of the paragraph, in order.

    Returns:
        One float per entry of ``possible_pos_trigrams`` (same order): the
        number of consecutive tag triples equal to that trigram, divided by
        ``len(possible_pos_trigrams) - 2``. When that divisor is not
        positive, every value is ``0.0``.
    """
    # NOTE(review): the divisor comes from the reference list, not from the
    # number of trigrams in the paragraph -- confirm this scaling is intended.
    denominator = len(possible_pos_trigrams) - 2
    if denominator <= 0:
        # A zero divisor must give scalar zeros (a list element would nest
        # the vector), and a negative one would give negative frequencies.
        return [0.0 for _ in possible_pos_trigrams]

    # Count all paragraph trigrams once rather than rescanning per entry.
    trigram_counts = Counter(ngrams(paragraph_pos_unigrams, 3))
    return [float(trigram_counts[pos] / denominator) for pos in possible_pos_trigrams]
def pos_bigram_occurrence(text: str, paragraph_pos_unigrams: list):
    """Flag which reference POS bigrams occur in the paragraph.

    Args:
        text: Not used by this extractor.
        paragraph_pos_unigrams: POS tags of the paragraph, in order.

    Returns:
        One float per entry of ``possible_pos_bigrams`` (same order):
        ``1.0`` if that bigram appears among adjacent tag pairs, else ``0.0``.
    """
    # Build the set once so each membership test is a hash lookup instead of
    # a scan over every paragraph bigram.
    seen_bigrams = set(ngrams(paragraph_pos_unigrams, 2))
    return [float(pos in seen_bigrams) for pos in possible_pos_bigrams]
def pos_trigram_occurrence(text: str, paragraph_pos_unigrams: list):
    """Flag which reference POS trigrams occur in the paragraph.

    Args:
        text: Not used by this extractor.
        paragraph_pos_unigrams: POS tags of the paragraph, in order.

    Returns:
        One float per entry of ``possible_pos_trigrams`` (same order):
        ``1.0`` if that trigram appears among consecutive tag triples,
        else ``0.0``.
    """
    # Build the set once so each membership test is a hash lookup instead of
    # a scan over every paragraph trigram.
    seen_trigrams = set(ngrams(paragraph_pos_unigrams, 3))
    return [float(pos in seen_trigrams) for pos in possible_pos_trigrams]
# Registry mapping feature names to their extractor functions. Insertion
# order matters: it is the order in which iterating over the values of this
# dict yields the extractors.
sent_morphological_features = {
    "pos_bigram_occurrence": pos_bigram_occurrence,
    "pos_trigram_occurrence": pos_trigram_occurrence,
    "pos_unigram_frequency": pos_unigram_freq,
    "pos_bigram_frequency": pos_bigram_freq,
    "pos_trigram_frequency": pos_trigram_freq,
}