-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathrun_udiff_deduplication.py
48 lines (33 loc) · 1.14 KB
/
run_udiff_deduplication.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""
Deduplicate all entries: only the first entry seen for each (commit_sha, diff) pair is kept.
Note: This script must be run with --no_parrallel; otherwise, the behavior is undefined.
"""
from collections import defaultdict
from mapreduce import mapreduce
# Deduplication ----------------------------------------------------------------
class DeduplicationIndex:
    """In-memory record of the entries that have already been seen.

    An entry is identified by its ``commit_sha`` together with the built-in
    ``hash()`` of its ``diff``. Only that pair is stored, not the entry.
    """

    def __init__(self):
        # Set of (commit_sha, hash(diff)) tuples, one per distinct entry.
        self._index = set()

    @staticmethod
    def _key(slc):
        """Return the ``(commit_sha, hash(diff))`` tuple identifying ``slc``."""
        return slc["commit_sha"], hash(slc["diff"])

    def __contains__(self, slc):
        """Return True if an entry with the same identity was added before."""
        return self._key(slc) in self._index

    def add(self, slc):
        """Record ``slc`` as seen; adding an already-known entry is a no-op."""
        self._index.add(self._key(slc))

    def info(self):
        """Return a short text summary of the index size.

        The reported number is the count of stored (commit_sha, diff-hash)
        pairs.
        """
        return f"\nNum commits: {len(self._index)}\n"
def group_by_project_commit(slc):
    """Return the ``(project, commit_sha)`` pair of ``slc`` as a grouping key."""
    project = slc["project"]
    commit_sha = slc["commit_sha"]
    return project, commit_sha
if __name__ == "__main__":
    # One index instance is shared by every filter_duplicates call in this run.
    deduplication_index = DeduplicationIndex()

    def filter_duplicates(slc):
        """Return ``[slc]`` the first time an entry is seen, otherwise ``[]``."""
        if slc in deduplication_index:
            return []
        deduplication_index.add(slc)
        return [slc]

    mapreduce(filter_duplicates)
    print(deduplication_index.info())