-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpora_get.py
43 lines (38 loc) · 1.59 KB
/
corpora_get.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pandas
def read_conversations(file):
    """Group the rows of a dialogue table into per-dialogue lists of turns.

    Rows are scanned in order. A change in ``dialogueID`` starts a new
    conversation, and consecutive rows with the same ``from`` value are merged
    into a single turn.

    Args:
        file: pandas DataFrame with ``dialogueID``, ``from`` and ``text``
            columns.

    Returns:
        A list of conversations, each a list of turn strings. Conversations
        with fewer than two turns are discarded.
    """
    conversations = []
    if len(file) == 0:
        return conversations

    # Positional access: the label 0 is not guaranteed to exist in the index.
    current_id = file['dialogueID'].iloc[0]
    current_conversation = []
    prev_user = ""
    for _, row in file.iterrows():
        if row['dialogueID'] != current_id:
            if len(current_conversation) > 1:
                conversations.append(current_conversation)
            current_conversation = []
            prev_user = ""
            current_id = row['dialogueID']
        # The emptiness check keeps the first row of a dialogue from being
        # treated as a continuation when its speaker equals the "" sentinel.
        if not current_conversation or row['from'] != prev_user:
            current_conversation.append(str(row['text']))
        else:
            # Same speaker as the previous row: extend the current turn.
            # NOTE(review): the texts are joined with no separator, so the
            # last word of one message fuses with the first word of the next
            # -- confirm whether a space is wanted here.
            current_conversation[-1] += str(row['text'])
        prev_user = row['from']

    # The loop only stores a conversation when the *next* dialogue begins, so
    # the final one has to be stored explicitly.
    if len(current_conversation) > 1:
        conversations.append(current_conversation)
    return conversations
def get_tokenized_sequencial_sentences(conversations):
    """Yield (utterance, reply) token-list pairs for every adjacent turn pair.

    Pairs overlap: each turn except the first and last of a conversation
    appears once as a reply and once as an utterance. Turns are tokenized by
    splitting on single spaces.

    Args:
        conversations: iterable of conversations, each a list of turn strings.

    Yields:
        Tuples ``(tokens_of_turn_k, tokens_of_turn_k_plus_1)``.
    """
    for turns in conversations:
        for utterance, reply in zip(turns, turns[1:]):
            yield (utterance.split(" "), reply.split(" "))
def generate_conv_tuple(file):
    """Build parallel tuples of tokenized utterances and replies from ``file``.

    Args:
        file: dialogue table accepted by ``read_conversations``.

    Returns:
        A tuple produced by transposing the (utterance, reply) pairs: the
        first element holds all utterances and the second all replies. It is
        an empty tuple when there are no pairs.
    """
    sentence_pairs = get_tokenized_sequencial_sentences(read_conversations(file))
    # zip(*pairs) transposes the sequence of pairs into two parallel sequences.
    return tuple(zip(*sentence_pairs))
def get_ubuntu_corpus_data(csv_path='Ubuntu-dialogue-corpus/dialogueText_301.csv'):
    """Load an Ubuntu dialogue corpus CSV and return its sentence-pair tuples.

    Args:
        csv_path: path of the CSV file to read. Defaults to the file this
            module has always loaded; another corpus file with the same
            columns (for example 'Ubuntu-dialogue-corpus/dialogueText.csv')
            can be passed instead.

    Returns:
        The result of ``generate_conv_tuple`` for the loaded table.
    """
    file = pandas.read_csv(csv_path)
    return generate_conv_tuple(file)