-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotation_extraction.py
80 lines (68 loc) · 3.27 KB
/
annotation_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gzip, pickle, csv, time, datetime, glob, re, os
from time import mktime
from datetime import datetime
'''
Step 1 of the pipeline.
Extract all the gold annotations for the annotated messages.
IN:
- abusetype_all.csv: files generated from the WCC data. Contains 1 line per annotated comment in WCC.
Columns are: rev_id, page_id, comment, year, logged_in, ns, sample, split and all annotations (depending on the abuse type considered).
Annotations are the summary of the 10 judgments in WCC, i.e. the average value for scores and the majority value for binary labels.
- WCC/*.csv: files generated from the WCC data. 1 file per page containing at least 1 annotated comment.
WCC/xx.csv contains all the messages posted on the xx page (in the WCC data).
Attributes are: rev_id, comment, cleaned_comment, timestamp, conv_id, conv_name, author_id, author name, bot, admin
When the author was not logged in, author_id is missing and author name is the author's IP address.
OUT:
- annotated_abusetype.csv: files containing all the available annotated comments from the 3 WCC datasets and their gold annotations.
Attributes are: rev_id, page_id, comment, all gold annotations
'''
def strip_message(message):
    """Remove the newline placeholders from a comment.

    Every occurrence of the literal ``NEWLINE_TOKEN`` is deleted first, then
    every remaining occurrence of the literal ``NEWLINE``. The placeholders
    are removed outright (not replaced by whitespace), so the text on either
    side of a placeholder ends up joined together.

    Args:
        message: Comment text to clean.

    Returns:
        The text with both placeholders removed.
    """
    # Both placeholders are plain literals, so str.replace is enough and no
    # regular expression is needed. The longer placeholder must be removed
    # first: removing 'NEWLINE' first would leave a stray '_TOKEN' behind.
    return message.replace('NEWLINE_TOKEN', '').replace('NEWLINE', '')
def load_annotated(abuse_type):
    """Load the annotated comments of one abuse type, indexed by rev_id.

    Reads ``Data/<abuse_type>_all.csv`` (the first line is used as the header)
    and keeps every row whose ``page_id`` is non-empty. The ``year``,
    ``logged_in``, ``ns``, ``sample`` and ``split`` columns are removed from
    the rows that are kept; all other columns are left as read.

    Args:
        abuse_type: Prefix of the input file name, e.g. ``'attack'``.

    Returns:
        A dict mapping each ``rev_id`` (the string read from the file) to its
        row dict. If a rev_id occurs several times, the last row wins.

    Raises:
        KeyError: If the file has no ``page_id`` or ``rev_id`` column.
    """
    unused_columns = ('year', 'logged_in', 'ns', 'sample', 'split')
    annotated_msg = {}
    # newline='' hands newline handling over to the csv module, as the csv
    # documentation requires; without it, newlines embedded in quoted fields
    # can be misread.
    with open('Data/%s_all.csv' % abuse_type, mode='r', newline='') as csvfile:
        # NOTE(review): the delimiter is a single space as written here;
        # confirm it matches the actual layout of the input files.
        reader = csv.DictReader(csvfile, delimiter=' ')
        for row in reader:
            # Some messages don't have any page_id; we can't use them.
            # DictReader fills the missing columns of a short row with None,
            # so test truthiness rather than len() to skip those rows too.
            if not row['page_id']:
                continue
            # Remove unused attributes (tolerating files that lack them).
            for column in unused_columns:
                row.pop(column, None)
            annotated_msg[row['rev_id']] = row
    return annotated_msg
def create_annotated_file(abuse_type):
    """Write the annotated comments of one abuse type with their annotations.

    Loads the annotated messages through ``load_annotated(abuse_type)``, then
    scans every ``Data/WCC/*.csv`` file. Each WCC row whose ``rev_id`` belongs
    to an annotated message produces one output row in
    ``GeneratedData/annotated_<abuse_type>.csv``:

        rev_id (int), page_id (int), cleaned message text, annotations...

    The text written is the ``message_text`` field of the WCC row passed
    through ``strip_message``, not the comment stored in the annotation file.
    The annotations are every value after the first three columns of the
    annotated row. The header row is the column names of the first annotated
    message.

    Args:
        abuse_type: Name of the abuse type, e.g. ``'attack'``; it selects both
            the annotation file that is read and the output file name.

    Raises:
        ValueError: If no annotated message is available, so the output
            header cannot be determined; also if a ``rev_id`` or ``page_id``
            of a matched message is not an integer literal.
    """
    # Get dict of all annotated messages indexed by their rev_id.
    annotated_msg = load_annotated(abuse_type)
    if not annotated_msg:
        raise ValueError('No annotated message found for abuse type %r' % abuse_type)

    # Names of the columns (rev_id, page_id, comment, all annotations),
    # taken from the first annotated message.
    column_names = list(next(iter(annotated_msg.values())).keys())

    # Field names given to the WCC rows, which are read without a header line.
    wcc_fields = ("rev_id", "message_text", "message", "date", "conv_id",
                  "conv_name", "user_id", "user_text", "bot", "mod")

    # newline='' is required by the csv module for both reading and writing;
    # without it the writer emits blank lines between rows on Windows.
    with open('GeneratedData/annotated_%s.csv' % abuse_type, mode='w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(column_names)
        # All conversation files from WCC. glob returns them in arbitrary
        # order, so sort to make the order of the output rows reproducible.
        for path in sorted(glob.glob("Data/WCC/*.csv")):
            with open(path, mode='r', newline='') as conv_file:
                # NOTE(review): the delimiter is a single space as written
                # here; confirm it matches the actual layout of the WCC files.
                reader = csv.DictReader(conv_file, fieldnames=wcc_fields, delimiter=' ')
                for row in reader:
                    annotated_message = annotated_msg.get(row['rev_id'])
                    # Only messages that have been annotated are exported.
                    if annotated_message is None:
                        continue
                    msg = strip_message(row['message_text'])
                    # for attack: rev_id, conv_id, text, quoting_attack, recipient_attack, third_party_attack, other_attack, attack
                    # for aggression: rev_id, conv_id, text, aggression_score, aggression
                    # for toxicity: rev_id, conv_id, text, toxicity_score, toxicity
                    message = [int(annotated_message['rev_id']), int(annotated_message['page_id']), msg]
                    # Everything after rev_id, page_id and comment is an annotation.
                    message.extend(list(annotated_message.values())[3:])
                    writer.writerow(message)
    # Both files are closed by their `with` blocks; no explicit close() needed.
if __name__ == "__main__":
    # Generate one annotated file per abuse type, in this order.
    for abuse_type in ('attack', 'aggression', 'toxicity'):
        create_annotated_file(abuse_type)