-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtypo_identification.py
76 lines (52 loc) · 1.93 KB
/
typo_identification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Can be run anytime after run_slc_process.py
Computes the percentage of bugs that reflect a typo (Damerau-Levenshtein distance <= 2)
"""
from mapreduce import mapreduce
from fastDamerauLevenshtein import damerauLevenshtein
import json
from code_diff.diff_utils import parse_hunks
def get_match_line(hunk):
    """Return the removed and the added line of a one-line-change hunk.

    The hunk must contain exactly one added line and exactly one removed
    line. The first character of each of the two lines (presumably the
    "+" / "-" diff marker -- confirm against parse_hunks) is replaced by a
    single space so that the two returned strings differ only in content.

    Args:
        hunk: Object exposing ``added_lines`` and ``rm_lines`` (sized
            iterables of keys into ``lines``) and ``lines`` itself.

    Returns:
        A ``(rm_line, add_line)`` tuple of strings.

    Raises:
        AssertionError: If the hunk does not have exactly one added and
            exactly one removed line.
    """
    # Raise explicitly instead of using `assert`: assert statements are
    # removed under `python -O`, which would let malformed hunks through.
    # The exception type stays AssertionError because callers catch it.
    if len(hunk.added_lines) != 1:
        raise AssertionError("hunk must contain exactly one added line")
    if len(hunk.rm_lines) != 1:
        raise AssertionError("hunk must contain exactly one removed line")

    add_key = next(iter(hunk.added_lines))
    rm_key = next(iter(hunk.rm_lines))

    # Drop the leading marker character and pad with a space instead.
    add_line = " " + hunk.lines[add_key][1:]
    rm_line = " " + hunk.lines[rm_key][1:]
    return rm_line, add_line
def text_dist(slc):
    """Sum the textual edit distance over all hunks of a change.

    Args:
        slc: Mapping whose ``"diff"`` entry is handed to ``parse_hunks``.

    Returns:
        A one-element list holding the accumulated Damerau-Levenshtein
        distance between the removed and added line of every hunk, or
        ``[1e9]`` as soon as an ``AssertionError`` is raised while
        processing a hunk (e.g. it is not a one-line replacement).
    """
    total_distance = 0
    for current_hunk in parse_hunks(slc["diff"]):
        try:
            removed, added = get_match_line(current_hunk)
            total_distance = total_distance + damerauLevenshtein(
                removed, added, similarity=False
            )
        except AssertionError:
            # Sentinel distance: far above any typo threshold.
            return [1e9]
    return [total_distance]
def update_dist(slc):
    """Compute the edit distance of a single string "Update" operation.

    Args:
        slc: Mapping whose ``"edit_script"`` entry is a JSON-encoded list
            of edit operations.

    Returns:
        ``[]`` when the script does not consist of exactly one "Update"
        whose target label contains "string"; ``[1e9]`` when the original
        text (label with "string:" removed) is shorter than 3 characters;
        otherwise a one-element list with the Damerau-Levenshtein distance
        between the original text and the replacement.
    """
    script = json.loads(slc["edit_script"])
    if len(script) != 1:
        return []

    operation = script[0]
    if operation[0] != "Update":
        return []

    target_label = operation[1][0]
    if "string" not in target_label:
        return []

    old_text = target_label.replace("string:", "")
    new_text = operation[2]
    if len(old_text) < 3:
        # Too short to judge reliably; report a sentinel distance.
        return [1e9]

    return [damerauLevenshtein(old_text, new_text, similarity=False)]
class TypoCount:
    """Callable accumulator counting how many edit distances look like typos.

    Each call records one observed edit distance; distances less than or
    equal to ``max_distance`` are counted as typos.
    """

    def __init__(self, max_distance=2):
        """Create an empty counter.

        Args:
            max_distance: Largest edit distance still counted as a typo.
                Defaults to 2.
        """
        self.max_distance = max_distance
        self.count = 0  # number of observations counted as typos
        self.total = 0  # number of observations seen overall

    def __call__(self, edit_distance):
        """Record one observation.

        Args:
            edit_distance: Numeric edit distance of a single change.
        """
        if edit_distance <= self.max_distance:
            self.count += 1
        self.total += 1

    def count_info(self):
        """Return a summary string ``"<typos> / <total> (<ratio>)"``.

        The ratio is reported as 0.0 when nothing has been recorded yet,
        instead of failing with a division by zero.
        """
        ratio = self.count / self.total if self.total else 0.0
        return "%d / %d (%f)" % (self.count, self.total, ratio)
if __name__ == "__main__":
    # Pass update_dist instead of text_dist to compute the typo
    # distribution for specific updates.
    counter = TypoCount()
    mapreduce(text_dist, counter)
    summary = counter.count_info()
    print("Total number of typos: %s" % summary)