-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_extraction.py
63 lines (60 loc) · 1.76 KB
/
text_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def extract_from_mail(message):
    """Split a text dump of e-mails into a list of message bodies.

    The file is read as UTF-8 text, line by line:

    * A line that equals the 25-dash separator (after stripping trailing
      whitespace) terminates the current message; the text accumulated so
      far is appended to the result and a new message is started.
    * Any other line that contains a ``:`` anywhere is skipped. This drops
      header-style lines, but also any body line that happens to contain a
      colon.
    * All remaining lines are kept verbatim, including their newline.

    Text that follows the last separator is not returned, so every message
    in the file must be terminated by a separator line.

    Args:
        message: Path of the text file to read.

    Returns:
        A list of strings, one per separator line found in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    separator = "-------------------------"
    bodies = []
    current = []  # lines of the message being assembled
    # The context manager closes the file even if decoding fails mid-read.
    with open(message, "r", encoding="utf-8") as mail:
        for line in mail:
            if line.rstrip() == separator:
                bodies.append("".join(current))
                current = []
            elif ":" not in line:
                current.append(line)
    return bodies
def labels_maker(label, n):
    """Return a list containing ``label`` repeated ``n`` times.

    Args:
        label: The value to repeat.
        n: Number of repetitions; zero or a negative count gives ``[]``.

    Returns:
        A new list of length ``max(n, 0)`` whose items are all ``label``.
    """
    return [label for _ in range(n)]
import csv
# Build a labelled dataset from one text dump per category. Each category
# name doubles as the label and as the stem of its source file
# ("<name>.txt"); the order here fixes the row order in the CSV.
categories = ['placements', 'promotions', 'social', 'news', 'acads', 'misc']

emails = []
labels = []
for category in categories:
    extracted = extract_from_mail(category + '.txt')
    emails.extend(extracted)
    # One label per message extracted from this category's file.
    labels.extend(labels_maker(category, len(extracted)))

finalList = [[email, label] for email, label in zip(emails, labels)]

fields = ['Emails', 'Labels']
filename = "dataset.csv"
# newline='' lets the csv module control line endings itself; message
# bodies contain embedded newlines that must survive inside quoted fields.
with open(filename, "w", newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(finalList)

# Read the file back as a round-trip check of what was written.
with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile)
    fields = next(csvreader)  # header row
    finalList = list(csvreader)

emails = [row[0] for row in finalList]
labels = [row[1] for row in finalList]

# Print the first sample and the column lengths as a quick check.
print(emails[0])
print(labels[0])
print(len(emails))
print(len(labels))