-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathchar_general.py
70 lines (56 loc) · 2.61 KB
/
char_general.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from typing import Tuple, List
import os
from .utils import EXTERNAL_DIR
# Load the known suffixes, one per line of suffixes.txt. Loading is
# best-effort: on any failure SUFFIXES_LIST falls back to an empty list, so
# code that iterates over it simply sees no suffixes instead of hitting a
# NameError on an unbound module-level name.
try:
    with open(os.path.join(EXTERNAL_DIR, "suffixes.txt"), 'r', encoding="utf8") as fin:
        SUFFIXES_LIST = fin.read().splitlines()
except Exception:
    print("Failed to read suffixes.txt. Skipping suffix features.")
    SUFFIXES_LIST = []
# Load the known prefixes, one per line of prefixes.txt. Loading is
# best-effort: on any failure PREFIXES_LIST falls back to an empty list, so
# code that iterates over it simply sees no prefixes instead of hitting a
# NameError on an unbound module-level name.
try:
    with open(os.path.join(EXTERNAL_DIR, "prefixes.txt"), 'r', encoding="utf8") as fin:
        PREFIXES_LIST = fin.read().splitlines()
except Exception:
    print("Failed to read prefixes.txt. Skipping prefix features.")
    PREFIXES_LIST = []
def extract_char_general_features(text: str, paragraph: List[List[Tuple[str, str, str, str]]], feature_names=None):
    """Concatenate the outputs of the selected character-level extractors.

    When ``feature_names`` is ``None`` every extractor registered in
    ``char_general_features`` is run, in registry order. Otherwise only the
    extractors named in ``feature_names`` are run, in the order given; an
    unknown name raises ``KeyError``.

    Each extractor is called as ``extractor(text, paragraph)`` and its
    returned values are appended to one flat list, which is returned.
    """
    if feature_names is None:
        selected = char_general_features.values()
    else:
        # Generator keeps the lookup lazy: a bad name fails only once reached.
        selected = (char_general_features[name] for name in feature_names)
    return [value for extractor in selected for value in extractor(text, paragraph)]
def suffixes_freq(text: str, paragraph: List[List[Tuple[str, str, str, str]]]) -> List[float]:
    """Return, for each known suffix, the fraction of words ending with it.

    Args:
        text: Not used by this extractor; accepted so it can be called with
            the same ``(text, paragraph)`` arguments as the other extractors.
        paragraph: Sentences, each a sequence of token tuples; only the first
            element of every tuple (the word string) is examined.

    Returns:
        One float per entry of ``SUFFIXES_LIST``, in the same order. If the
        paragraph contains no words, every frequency is ``0.0``.
    """
    paragraph_words = [item[0] for sentence in paragraph for item in sentence]
    total_words = len(paragraph_words)
    if total_words == 0:
        # Nothing to match against: report zeros rather than dividing by zero.
        return [0.0] * len(SUFFIXES_LIST)
    return [
        sum(word.endswith(suffix) for word in paragraph_words) / total_words
        for suffix in SUFFIXES_LIST
    ]
def prefixes_freq(text: str, paragraph: List[List[Tuple[str, str, str, str]]]) -> List[float]:
    """Return, for each known prefix, the fraction of words starting with it.

    Args:
        text: Not used by this extractor; accepted so it can be called with
            the same ``(text, paragraph)`` arguments as the other extractors.
        paragraph: Sentences, each a sequence of token tuples; only the first
            element of every tuple (the word string) is examined.

    Returns:
        One float per entry of ``PREFIXES_LIST``, in the same order. If the
        paragraph contains no words, every frequency is ``0.0``.
    """
    paragraph_words = [item[0] for sentence in paragraph for item in sentence]
    total_words = len(paragraph_words)
    if total_words == 0:
        # Nothing to match against: report zeros rather than dividing by zero.
        return [0.0] * len(PREFIXES_LIST)
    return [
        sum(word.startswith(prefix) for word in paragraph_words) / total_words
        for prefix in PREFIXES_LIST
    ]
def suffixes_occurrence(text: str, paragraph: List[List[Tuple[str, str, str, str]]]):
    """Flag, for each known suffix, whether any word in the paragraph ends with it.

    Only the first element of every token tuple in ``paragraph`` is examined;
    ``text`` is not used. The result holds one value per entry of
    ``SUFFIXES_LIST``, in the same order: ``1.0`` if at least one word ends
    with that suffix, ``0.0`` otherwise.
    """
    words = []
    for sentence in paragraph:
        for token in sentence:
            words.append(token[0])
    return [
        1.0 if any(w.endswith(ending) for w in words) else 0.0
        for ending in SUFFIXES_LIST
    ]
def prefixes_occurrence(text: str, paragraph: List[List[Tuple[str, str, str, str]]]):
    """Flag, for each known prefix, whether any word in the paragraph starts with it.

    Only the first element of every token tuple in ``paragraph`` is examined;
    ``text`` is not used. The result holds one value per entry of
    ``PREFIXES_LIST``, in the same order: ``1.0`` if at least one word starts
    with that prefix, ``0.0`` otherwise.
    """
    words = []
    words.extend(token[0] for sentence in paragraph for token in sentence)
    return [
        1.0 if any(w.startswith(opening) for w in words) else 0.0
        for opening in PREFIXES_LIST
    ]
# Registry mapping feature names to their extractor functions. Insertion
# order matters: it is the order in which extract_char_general_features runs
# the extractors when no explicit feature_names are given.
char_general_features = dict([
    ("suffixes_occurrence", suffixes_occurrence),
    ("prefixes_occurrence", prefixes_occurrence),
    ("suffixes_frequency", suffixes_freq),
    ("prefixes_frequency", prefixes_freq),
])