-
Notifications
You must be signed in to change notification settings - Fork 1
/
merge_similar_messages.py
94 lines (74 loc) · 2.69 KB
/
merge_similar_messages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
from fuzzywuzzy import fuzz
from datetime import timedelta
def merge(df, days=1, time_field='timestamp', string_similar_threshold=50):
    """Merge similar messages using a time window expressed in days.

    Convenience wrapper around ``merge_messages`` that converts ``days`` to a
    ``timedelta`` and forwards everything else.

    Args:
        df: DataFrame of messages; passed through to ``merge_messages``.
        days: Width of the time window, in days, within which two messages
            may be merged.
        time_field: Name of the column holding the message time.
        string_similar_threshold: Similarity score a pair of messages must
            exceed to be merged. Defaults to 50, the value that used to be
            hard-coded here.

    Returns:
        Whatever ``merge_messages`` returns for these arguments.
    """
    return merge_messages(
        df,
        timedelta(days=days),
        string_similar_threshold,
        time_field,
    )
def merge_messages(df, max_time_diff,
                   string_similar_threshold,
                   time_field='timestamp'):
    """Merge near-duplicate messages separately for every sender.

    Rows of ``df`` are grouped by ``sender_id``. Groups with more than five
    messages are handed to ``merge_messages_by_single_user``; smaller groups
    are passed through unchanged. The per-sender results are concatenated
    into one DataFrame. Progress is reported on stdout.

    Args:
        df: DataFrame with a ``sender_id`` column, plus whatever columns
            ``merge_messages_by_single_user`` reads.
        max_time_diff: Forwarded to ``merge_messages_by_single_user``.
        string_similar_threshold: Forwarded to
            ``merge_messages_by_single_user``.
        time_field: Name of the time column, forwarded to
            ``merge_messages_by_single_user``.

    Returns:
        The concatenation of the per-sender frames. When there is nothing to
        concatenate (for example an empty ``df``) an empty frame with the
        same columns as ``df`` is returned.
    """
    progress_step = 10000
    total = len(df)
    msg_processed = 0
    next_report = progress_step
    merged_msgs = []

    # Group by the column name rather than a one-element list so the group
    # key stays a scalar; the key itself is not needed.
    for _, sub_df in df.groupby('sender_id'):
        msg_processed += len(sub_df)
        if len(sub_df) > 5:
            new_df = merge_messages_by_single_user(
                sub_df,
                max_time_diff,
                string_similar_threshold,
                time_field
            )
            merged_msgs.append(new_df)
            if len(sub_df) > len(new_df):
                print('{} -> {}'.format(len(sub_df), len(new_df)))
        else:
            merged_msgs.append(sub_df)

        # Report whenever a multiple of progress_step has been crossed; an
        # exact "% progress_step == 0" test almost never matches because the
        # counter advances by whole group sizes.
        if msg_processed >= next_report:
            print('{} / {}'.format(msg_processed, total))
            next_report = (msg_processed // progress_step + 1) * progress_step

    if not merged_msgs:
        # pd.concat raises ValueError when given an empty list.
        return df.iloc[0:0].copy()
    return pd.concat(merged_msgs)
def merge_messages_by_single_user(df,
                                  max_time_diff,
                                  string_similar_threshold,
                                  time_field='timestamp'):
    """Collapse similar messages that lie close together in time.

    Messages are visited in ascending ``time_field`` order. For each message
    that has not been absorbed yet, every strictly later message within
    ``max_time_diff`` whose "subject body" text scores above
    ``string_similar_threshold`` with ``fuzz.ratio`` is absorbed into it:
    the later message's ``recipient_ids`` are appended to the earlier one and
    the later message is dropped from the output.

    The input frame and the objects stored in it are left untouched, and a
    message is absorbed by at most one earlier message.

    Args:
        df: DataFrame with ``message_id``, ``subject``, ``body``,
            ``recipient_ids`` and ``time_field`` columns.
            NOTE(review): ``recipient_ids`` values are assumed to support
            ``+`` concatenation (e.g. lists) -- confirm against the caller.
        max_time_diff: Largest allowed difference between two values of
            ``time_field`` for the messages to be merged.
        string_similar_threshold: Score that ``fuzz.ratio`` must exceed.
        time_field: Name of the column used for ordering and windowing.

    Returns:
        DataFrame with one row per surviving message, in time order. An
        empty input yields an empty copy of ``df``.
    """
    def get_text(r):
        return u'{} {}'.format(r.subject, r.body)

    if df.empty:
        return df.copy()

    df = df.sort_values(by=[time_field])
    absorbed_ids = set()
    merged_msgs = []

    for _, msg in df.iterrows():
        if msg['message_id'] in absorbed_ids:
            continue

        msg_text = get_text(msg)
        elapsed = df[time_field] - msg[time_field]
        # Strictly later, inside the window, and not already merged into an
        # earlier message (otherwise its recipients would be counted twice).
        candidates = df[
            (df[time_field] > msg[time_field])
            & (elapsed <= max_time_diff)
            & ~df['message_id'].isin(absorbed_ids)
        ]

        # apply(axis=1) on an empty frame does not return a boolean Series,
        # so only score when there is at least one candidate.
        if not candidates.empty:
            scores = candidates.apply(
                lambda r: fuzz.ratio(get_text(r), msg_text),
                axis=1
            )
            similar_msgs = candidates[scores > string_similar_threshold]
            if not similar_msgs.empty:
                # Build a new value with "+" instead of "+=" so the object
                # shared with the caller's DataFrame is not extended in place.
                recipients = msg['recipient_ids']
                for extra in similar_msgs['recipient_ids']:
                    recipients = recipients + extra
                msg = msg.copy()
                msg['recipient_ids'] = recipients
                absorbed_ids.update(similar_msgs['message_id'].tolist())

        merged_msgs.append(msg)

    return pd.DataFrame(merged_msgs)
def main():
    """Script entry point; currently a placeholder that does nothing."""
    return None
# Invoke main() only when the file is executed as a script, not on import.
if '__main__' == __name__:
    main()