-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathviolations.py
96 lines (92 loc) · 3.33 KB
/
violations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
from collections import defaultdict
from functools import partial
import re
# Pattern run (via re.finditer) over the normalised ``Violations`` text in
# pull_violations(): a numeric ``code``, then ". ", then a ``desc`` made of
# one or more non-"|" characters that is followed by a space.
# NOTE(review): the trailing "|" is not escaped, so it acts as an alternation
# with an empty branch rather than a literal pipe. The pattern can therefore
# also match the empty string, in which case both groups are None. Confirm
# whether r'\|' was intended before changing it, since that alters matches.
violations_re = r'(?P<code>\d+)\. (?P<desc>[^|]+) |'
# Descriptions that pull_violations() skips. The list is currently empty; the
# candidate entries below are all commented out and have no effect at runtime.
DESC_IGNORE_LIST = []
# "CRITICAL VIOLATION",
# "CRITICAL VIOLATION 7-38-005A.",
# "CRITICAL VIOLATION:7-38-005(A)",
# "(CRITICAL 7-38-005A)",
# "CRITICAL CITATION ISSUED, 7-38-005(A).",
# "CRITICAL VIOLATION 7-38-005(A)",
# "Critical violation 7-38-005(a).",
# "CRITICAL VIOLATION 7-38-005A",
# "CRITICAL VIOLATION: 7-38-005(A)",
# "SERIOUS VIOLATION",
# "(CRITICAL 7-38-005A).",
# "CRITICAL",
# "CITATION ISSUED #7-38-005(A).",
# "CRITICAL CITATION ISSUED 7-38-005 (A).",
# "SERIOUS VIOLATION 7-38-005A.",
# "CRITICAL CITATION ISSUED",
# "Critical citation issued 7-38-005(A).",
# "CRITICAL CITATION ISSUED 7-38-005(A)",
# "CITATION ISSUED.",
# "MUST",
# "MUST PROVIDE.",
# "NO CITATION",
# "CITATION",
# "NO CITATION ISSUED.",
# "MUST CLEAN AND",
# "NO CITATION.",
# "MUST CLEAN AND MAINTAIN",
# "CRITICAL CITATION ISSUED 7-38-005(A).",
# "CRITICAL VIOLATION 7-38-005 (A)",
# "CRITICAL VIOLATION 7-38-005(A).",
# "CRITICAL VIOLATION 7-38-005B.",
# "CRITICAL VIOLATION:7-38-005(A).",
# "VIOLATION CORRECTED.",
# "VIOLATION CORRECTED AND ABATED.",
# "SERIOUS VIOLATION CORRECTED",
# "CRITICAL VIOLATION: 7-38-005(A).",
# "CITATION ISSUED",
# "SERIOUS CITATION ISSUED.",
# "SERIOUS VIOLATION CORRECTED.",
# "CRITICAL CITATION ISSUED 7-38-005.",
# "CRITICAL VIOLATION 7-38-005 (A).",
# "CRTICAL VIOLATION 7-38-005A",
# "CITATION ISSUED #7-38-005(A)",
# "CITATION ISSUED CRITICAL",
# "CORRECTED",
# "COURT DATE 11.18.10, 400 W SUPERIOR, ROOM 112, 10 AM.",
# "CRITICAL 7-38-005 (A) ISSUED.",
# "CRITICAL 7-38-005(A) ISSUED.",
# "CRITICAL 7-38-005(A).",
# "CRITICAL CITATION #7-38-005(A).",
# "CRITICAL CITATION 7-38-005(A)",
# "CRITICAL CITATION ISSUED #7-38-005(A).",
# "CRITICAL CITATION ISSUED 7-38-005(A.",
# "CRITICAL CITATION ISSUED, 7-38-005(A)",
# "CRITICAL CITATION ISSUED, 7-38-005[A].",
# "CRITICAL CITATION ISSUED.",
# "CRITICAL CITATION ISSUED. 7-38-005(A).",
# "CRITICAL VIOLATION 7-38-005",
# "CRITICAL VIOLATION 7-38-005 (B).",
# "CRITICAL VIOLATION 7-38-005.",
# "CRITICAL VIOLATION 7-38-005(A) ISSUED.",
# ]
def pull_violations(violations_dict, row):
    """Extract the numbered violations found in one inspection row.

    The row's ``Violations`` value is converted to text, runs of whitespace
    are collapsed to a single space, and every ``- Comments`` marker is
    replaced with ``|``. The result is scanned with the module-level
    ``violations_re`` pattern; each match that yields both a code and a
    description not listed in ``DESC_IGNORE_LIST`` is recorded.

    Args:
        violations_dict: Mapping from violation code (str) to a collection
            supporting ``.add`` (e.g. ``defaultdict(set)``). It is mutated
            in place: each recorded description is added under its code.
        row: Mapping-like record providing a ``'Violations'`` entry.

    Returns:
        A string holding one newline-terminated ``"code: desc"`` line per
        recorded violation, in match order; ``""`` when nothing is recorded.
    """
    # str() so that non-string cell values are scanned as text instead of
    # making re.sub raise a TypeError.
    text = re.sub(r"\s+", " ", str(row['Violations']))
    text = text.replace("- Comments", "|")

    lines = []
    for match in re.finditer(violations_re, text):
        code = match.group('code')
        desc = match.group('desc')
        # violations_re can also match the empty string, leaving both groups
        # as None; such matches carry no violation.
        if not (code and desc):
            continue
        if desc in DESC_IGNORE_LIST:
            continue
        violations_dict[code].add(desc)
        lines.append(f"{code}: {desc}\n")
    return "".join(lines)
if __name__ == "__main__":
    # Script entry point: read the inspections CSV, add a column with the
    # violations pulled from each row, write the result to a new CSV, and
    # print a tab-separated code/description table to stdout.
    df = pd.read_csv('Food_Inspections.csv')

    # code -> set of distinct descriptions; filled as a side effect of
    # pull_violations() while it is applied to every row.
    violations = defaultdict(set)
    df['pulled_violations'] = df.apply(
        partial(pull_violations, violations), axis=1
    )
    df.to_csv('cleaned_violations.csv')

    print("code\tdescription")
    # Codes are digit strings, so sort them numerically (a plain string sort
    # would put "10" before "2"). Descriptions are sorted as well so the
    # output order does not depend on set iteration order.
    for code in sorted(violations, key=int):
        for desc in sorted(violations[code]):
            print(f"{code}\t{desc}")