-
Notifications
You must be signed in to change notification settings - Fork 5
/
feature_extraction_utils.py
65 lines (47 loc) · 1.64 KB
/
feature_extraction_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
import string
def paths(tokens):
    """Return every cumulative prefix of *tokens* as one space-separated string.

    Each prefix is the first ``k`` tokens joined with ``'_'`` for
    ``k = 1 .. len(tokens)``; the prefixes are then joined with single spaces.
    For example ``['a', 'b', 'c']`` yields ``'a a_b a_b_c'``.

    Args:
        tokens: A sliceable sequence of strings.

    Returns:
        The space-separated prefixes, or ``''`` when *tokens* is empty.
    """
    prefixes = []
    for end in range(1, len(tokens) + 1):
        prefixes.append('_'.join(tokens[:end]))
    return ' '.join(prefixes)
def ip_features(ip):
    """Turn an address string into hierarchical prefix features.

    The address is split on ``'.'`` if it contains one, otherwise on ``':'``
    if it contains one, and the pieces are expanded with ``paths`` so that
    every leading group of segments becomes its own token.

    Args:
        ip: The address string; may be empty or ``None``.

    Returns:
        ``''`` for a falsy *ip*, the prefix features for a dotted or
        colon-separated value, or *ip* unchanged when it contains neither
        separator.
    """
    if not ip:
        return ''
    # Order matters: a value containing both separators is split on '.' only.
    for separator in ('.', ':'):
        if separator in ip:
            return paths(ip.split(separator))
    return ip
# Lazily captures the text between a '/*' opener and the next '*/' closer.
re_prop = re.compile('%s(.+?)%s' % (re.escape('/*'), re.escape('*/')))
# Lazily captures the text between '[[' and the next ']]'.
re_links = re.compile('%s(.+?)%s' % (re.escape('[['), re.escape(']]')))
# Character class matching any single character from string.punctuation.
re_punct = re.compile('[%s]' % ''.join(map(re.escape, string.punctuation)))
# One or more consecutive space characters (spaces only, not tabs/newlines).
re_space = re.compile(' +')
def to_unicode(s):
    """Return *s* as a text (unicode) string, decoding byte strings as UTF-8.

    Args:
        s: A text string, or a byte string holding UTF-8 encoded data.

    Returns:
        *s* unchanged when it is already text; otherwise ``s.decode('utf8')``.

    Raises:
        UnicodeDecodeError: If a byte string is not valid UTF-8.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so the check
    # works on both without naming the Python-2-only `unicode` builtin (which
    # raises NameError on Python 3).
    if isinstance(s, type(u'')):
        return s
    return s.decode('utf8')
def extract_structured_comment(comment):
    """Extract command tokens from the ``/* ... */`` sections of *comment*.

    Every section matched by ``re_prop`` is split on ``'|'``. Each non-empty,
    stripped piece has its spaces replaced by underscores and is recorded
    both whole and split on ``':'``. For example
    ``'/* wbsetclaim-create:2||1 */'`` yields
    ``'1 2 wbsetclaim-create wbsetclaim-create:2'``.

    Args:
        comment: The raw comment string.

    Returns:
        The unique tokens, sorted (before lowercasing) and joined with single
        spaces, converted with ``to_unicode`` and lowercased. Empty when the
        comment has no ``/* ... */`` section.
    """
    tokens = set()
    for section in re_prop.findall(comment):
        for raw_part in section.strip().split('|'):
            part = raw_part.strip()
            if part:
                part = part.replace(' ', '_')
                # Keep the full token and each of its colon-separated pieces.
                tokens.add(part)
                tokens.update(part.split(':'))
    joined = ' '.join(sorted(tokens))
    return to_unicode(joined).lower()
def extract_links(comment):
    """Extract the targets of ``[[...]]`` links found in *comment*.

    Args:
        comment: The raw comment string.

    Returns:
        The link targets in order of appearance, with spaces replaced by
        underscores, joined with single spaces, converted with ``to_unicode``
        and lowercased. Empty when the comment contains no ``[[...]]`` link.
    """
    link_tokens = [
        target.replace(' ', '_') for target in re_links.findall(comment)
    ]
    # Lowercase exactly once, after decoding. Lowercasing a byte string first
    # is redundant, and on Python 2 it is locale-dependent, so it could alter
    # bytes inside multi-byte UTF-8 sequences before they are decoded.
    return to_unicode(' '.join(link_tokens)).lower()
def extract_unstructured_text(comment):
    """Return the free-text part of *comment*, normalised for tokenising.

    Steps, in order: remove every ``/* ... */`` section matched by
    ``re_prop``; strip and lowercase; replace each punctuation character with
    a space; delete every occurrence of the substring ``'property'``;
    collapse runs of spaces and strip again.

    Args:
        comment: The raw comment string.

    Returns:
        The cleaned text, converted with ``to_unicode`` and lowercased.
    """
    without_sections = re_prop.sub('', comment)
    lowered = without_sections.strip().lower()
    # Substituting a space for each match is the same as split-then-join.
    depunctuated = re_punct.sub(' ', lowered)
    # Runs after lowercasing, so any casing of "property" is removed.
    cleaned = depunctuated.replace('property', '')
    collapsed = re_space.sub(' ', cleaned).strip()
    return to_unicode(collapsed).lower()