-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprepare_generator.py
102 lines (84 loc) · 4.59 KB
/
prepare_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# import modules
from nltk import sent_tokenize
from collections import Counter
import pandas as pd
import pickle
from project import phrase_obj
from project import preprocessing
from project import probabilities as prob
if __name__ == '__main__':
    # Script entry point. Parses a transcript, turns every sentence into a
    # normalized template wrapped in a Phrase object with its dialogue-act
    # types, computes a transition matrix over the type sequences, and pickles
    # the phrase table and the matrix so generation can run without
    # re-running this preparation step.

    # parse transcript
    # ADJUST SOURCE PATH!
    transcript_path = '/Users/Alessandra/Desktop/ANLP/SciFiChatter/data/ma6/transcripts/TEC'
    transcript_parsed = preprocessing.parse_transcript(transcript_path)  # is list: [speaker, line]
    print("\ntranscript found and parsed\n")

    # create phrase object collection
    print("processing phrases... producing templates... dialogue act classification...\n")
    phrases = {}  # running index -> Phrase object
    indexer = 0
    phrase_garbage = []  # templates for which categorize_phrase returned nothing
    for line in transcript_parsed:
        speaker = line[0]
        sentences = sent_tokenize(line[1])
        for i, s in enumerate(sentences):
            # initialize new phrase object
            phr = phrase_obj.Phrase(indexer, s, speaker)

            # speaker-change flags: first / last sentence of this transcript line
            start = (i == 0)
            end = (i + 1 == len(sentences))
            speaker_change = [start, end]

            # normalize and get types
            template = preprocessing.normalize_phrase(s)
            type_choices = preprocessing.categorize_phrase(template)
            if not type_choices:
                phrase_garbage.append(template)  # record garbage
            # NOTE(review): unclassified phrases are only recorded above; they
            # are still stored in `phrases` (and therefore in the pickled
            # DataFrame). Only the transition-matrix input below filters them
            # out. Confirm this matches the "will be excluded" message.

            # populate
            type_vector = preprocessing.type2vec(type_choices)
            # positions of the vector that are set to 1
            type_nums = [pos for pos, val in enumerate(type_vector) if val == 1]

            # add variables to phrase object
            phr.setup(template, type_vector, type_nums, speaker_change)

            # complete phrase object to dict
            phrases[indexer] = phr

            # increment indexer
            indexer += 1

    #######################
    # EVAL
    print("evaluating dialogue act classification: ")
    print(str(len(phrase_garbage)) + " sentences could not be classified, will be excluded.")
    print(str(len(phrases)) + " sentences were transformed into templates and classified.")
    cc = [phr for phr in phrases.values() if phr.get_speaker() == 'CC']
    p = [phr for phr in phrases.values() if phr.get_speaker() == 'P']
    # the type-number lists are stringified so they can be used as Counter keys
    cc_types = Counter(str(phr.get_type_nums()) for phr in cc)
    p_types = Counter(str(phr.get_type_nums()) for phr in p)
    print(str(len(cc)) + " phrases assigned to CC (character x).")
    print(cc_types)
    print(str(len(p)) + " phrases assigned to P (character y).")
    print(p_types)
    print('\n')

    #######################
    # create probability matrices
    print("calculating probabilities for speaker changes and dialogue act sequences...\n")
    # phrases are fed in index order; phrases without any type number are skipped
    trans_prob_mat = prob.transmatrix(
        [phrases[i].get_type_nums() for i in sorted(phrases) if len(phrases[i].get_type_nums())])
    # not enabled yet:
    # starttype_prob = prob.typeboolmat([(phrases[i].get_type_nums(), phrases[i].is_starter()) for i in sorted(phrases)])
    # endtype_prob = prob.typeboolmat([(phrases[i].get_type_nums(), phrases[i].is_ending()) for i in sorted(phrases)])

    #######################
    # EVAL
    print("probability distribution for dialogue act sequences (bi-grams):\n")
    print("NOT YET ADDED IN CODE\n\n")

    #######################
    # turn dict of phrases into a pandas data frame to store & load to generate sentences (without re-running this code)
    phrase_dict = {}
    for idx, phr in phrases.items():
        phrase_dict[idx] = [idx, phr.get_speaker(), phr.get_template(), phr.get_types(),
                            phr.get_type_nums(), phr.is_starter(), phr.is_ending()]
    column_names = ['i', 'SPEAKER', 'PHRASE', 'TYPEVEC', 'TYPECATS', 'ISSTART', 'ISEND']
    df = pd.DataFrame.from_dict(phrase_dict, orient='index', columns=column_names)
    print(df.head(10))

    # components saved in directory: model_components
    # ADJUST TARGET PATH!
    df.to_pickle(path='/Users/Alessandra/Desktop/ANLP/SciFiChatter/model_components/phrases.pkl')
    # dump the transition matrix with the highest pickle protocol; the `with`
    # block guarantees the file is flushed and closed even if dumping fails
    with open('/Users/Alessandra/Desktop/ANLP/SciFiChatter/model_components/transition_probabilities.pkl', 'wb') as out_file:
        pickle.dump(trans_prob_mat, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    # not enabled yet (see the commented-out matrices above):
    # pickle.dump(starttype_prob, open('/Users/Alessandra/Desktop/ANLP/SciFiChatter/model_components/is_start_probabilities.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
    # pickle.dump(endtype_prob, open('/Users/Alessandra/Desktop/ANLP/SciFiChatter/model_components/is_end_probabilities.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)