decisionTree.py
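"""Classify articles as liberal or conservative with a decision tree
trained over several bag-of-words feature sets."""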
import pandas as pd
from BiasedWordsNgram import extract_biased_words
from BoW_PCA import toknize_article
from BoW_PCA import vectorize_articles
from gensim import corpora, matutils
import random
import math
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from subprocess import call
from sklearn.metrics import confusion_matrix
import re
import operator
from nltk.corpus import stopwords
# import graphviz
with open('data.json') as data_file:
    data = json.load(data_file)
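# data.json is expected to map 'liberal' and 'conservative' to lists of
# article strings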
"""
Adjustable variables
"""
# divide the data into two ideological categories
lib = data['liberal']
con = data['conservative']
def create_dataframe(ind):
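    """Build train and test bag-of-words DataFrames.

    ind selects the feature set: 0 uses extract_biased_words(2), 1 uses
    extract_biased_words(1) plus frequent prefix words, and -1 uses the
    full token vocabulary of the first 1000 articles from each side.
    """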
    # create the word corpus
    # biased_words = extract_biased_words(1)
    if ind == 0:
        words_corpse = list(set(extract_biased_words(2)))
    elif ind == 1:
        words_corpse = list(set(add_prefix_features(list(set(extract_biased_words(1))))))
    elif ind == -1:
        # tokenize the docs
        lib_tokenized_docs = []
        con_tokenized_docs = []
        for sentence in lib[0:1000]:
            tokens = toknize_article(sentence)
            lib_tokenized_docs.append(tokens)
        for sentence in con[0:1000]:
            tokens = toknize_article(sentence)
            con_tokenized_docs.append(tokens)
        # concat the tokenized_docs lists
        all_tokenized_docs = lib_tokenized_docs + con_tokenized_docs
        # use all_tokenized_docs so that the matrix's number of features matches;
        # list() is needed because Dictionary.values() returns a view in Python 3
        words_corpse = list(corpora.Dictionary(all_tokenized_docs).values())
    # print dict.values()
    # create the count matrix for each category
    columns = ['ideology'] + words_corpse
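    # each row: label in column 0 (1 = liberal, 0 = conservative), then the
    # raw count of each corpus word in the article; note that str.count()
    # matches substrings, not whole word tokens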
    data_train_lib = []
    for i in range(len(lib[0:1000])):
        tmp = []
        for j in range(len(columns)):
            if j == 0:
                tmp.append(1)
            else:
                tmp.append(lib[i].count(columns[j]))
        data_train_lib.append(tmp)
    data_test_lib = []
    for i in range(len(lib[1000:1700])):
        tmp = []
        for j in range(len(columns)):
            if j == 0:
                tmp.append(1)
            else:
                tmp.append(lib[i + 1000].count(columns[j]))
        data_test_lib.append(tmp)
    data_train_con = []
    for i in range(len(con[0:1000])):
        tmp = []
        for j in range(len(columns)):
            if j == 0:
                tmp.append(0)
            else:
                tmp.append(con[i].count(columns[j]))
        data_train_con.append(tmp)
    data_test_con = []
    for i in range(len(con[1000:1700])):
        tmp = []
        for j in range(len(columns)):
            if j == 0:
                tmp.append(0)
            else:
                tmp.append(con[i + 1000].count(columns[j]))
        data_test_con.append(tmp)
    # convert the lists to numpy arrays to build the DataFrames
    numpy_array_train_lib = np.asarray(data_train_lib)
    numpy_array_test_lib = np.asarray(data_test_lib)
    numpy_array_train_con = np.asarray(data_train_con)
    numpy_array_test_con = np.asarray(data_test_con)
    df_train = pd.concat([pd.DataFrame(numpy_array_train_lib, columns=columns), pd.DataFrame(numpy_array_train_con, columns=columns)])
    df_test = pd.concat([pd.DataFrame(numpy_array_test_lib, columns=columns), pd.DataFrame(numpy_array_test_con, columns=columns)])
    return [df_train, df_test]
def pick_subset(anchor_words):
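    """Return the words that appear immediately before anchor_words more
    than 3 times across the first 1000 articles of each side, skipping
    stopwords."""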
    s = set(stopwords.words('english'))
    features = []
    word_freq = {}
    # count the word immediately preceding anchor_words in each article
    for sent in lib[0:1000] + con[0:1000]:
        # keep only letters, whitespace, and apostrophes
        sentence = re.sub(r"[^a-zA-Z\s']", "", sent)
        sent_list = sentence.split()
        if anchor_words in sent_list and sent_list.index(anchor_words) > 0:
            pref = sent_list[sent_list.index(anchor_words) - 1]
            if pref in word_freq:
                word_freq[pref] += 1
            else:
                word_freq[pref] = 1
    word_freq_sorted = sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)
    count = 0
    # keep the non-stopword prefixes that occur more than 3 times
    while count < len(word_freq_sorted) and word_freq_sorted[count][1] > 3:
        if word_freq_sorted[count][0] not in s:
            features.append(word_freq_sorted[count][0])
        count += 1
    if len(features) > 0:
        print("word", anchor_words, "features")
        return features
    else:
        return []
def add_prefix_features(biased_words):
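    """Extend biased_words with the frequent prefix words found by
    pick_subset for each of its first 100 entries."""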
    # for each of the first 100 biased words, pull in the words that
    # frequently precede it as extra features
    features = biased_words
    count = 0
    ind = 0
    featured_words = 0
    while ind < min(100, len(biased_words)):
        new_features = pick_subset(biased_words[ind])
        if len(new_features) > 0:
            featured_words += len(new_features)
            features = features + new_features
            count += 1
        ind += 1
    # print(featured_words)
    return features
def run_decision_tree(ind):
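    """Train a decision tree on the feature set selected by ind, print the
    test accuracy, and export the fitted tree to tree.dot."""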
    df = create_dataframe(ind)
    df_train = df[0]
    df_test = df[1]
    model = tree.DecisionTreeClassifier()
    model.fit(df_train.drop('ideology', axis=1), df_train['ideology'])
    # print(model)
    y = df_test['ideology']
    y_predict = model.predict(df_test.drop('ideology', axis=1))
    print(accuracy_score(y, y_predict))
    # pd.DataFrame(
    #     confusion_matrix(y, y_predict),
    #     columns=['Predicted Not Liberal', 'Predicted Liberal'],
    #     index=['True Not Liberal', 'True Liberal']
    # )
    # export the fitted tree to tree.dot (pass out_file=None instead to get
    # the dot source back for graphviz.Source)
    tree.export_graphviz(model, out_file='tree.dot', feature_names=df_train.drop('ideology', axis=1).columns)
    # call(['dot', '-T', 'png', 'tree.dot', '-o', 'tree.png'])
    # dot_data = tree.export_graphviz(model, out_file=None, feature_names=df_train.drop('ideology', axis=1).columns)
    # graph = graphviz.Source(dot_data)
    # graph
# run_decision_tree(0)  # biased-word features
# run_decision_tree(1)  # biased words plus prefix features
run_decision_tree(-1)  # full bag-of-words features