# Cars Company Decision Tree Coursework
# COMP4035 Computer Science Applications - Artificial Intelligence

# Import necessary modules
import math
import pandas as pd


class Node:
    def __init__(self, attribute=None, value=None, data=None, children=None):
        """
        Constructor for the Node class.
        Note: by default, the constructor needs no arguments (other than self)
        because a node object can represent different kinds of node,
        e.g. root, internal child or leaf.
        :param attribute: attribute name the node splits on (if applicable)
        :param value: attribute value on the branch leading to this node (if applicable)
        :param data: the (sub)dataset held at this node (if applicable)
        :param children: children of the node (if applicable)
        """
        self.attribute = attribute
        self.value = value
        self.data = data
        self.children = children or []

    def add_child(self, child):
        """
        Adds a new child node to this node of the tree.
        :param child: the child Node built from a subset of the dataset
        """
        self.children.append(child)
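
# A minimal sketch of how Node objects compose into a tree (toy values only,
# not taken from the coursework dataset):
#   example_root = Node(attribute='buying')
#   example_leaf = Node(value='low', data=pd.DataFrame({'class': ['acc']}))
#   example_root.add_child(example_leaf)
#   example_root.children[0].value  # 'low'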

def calculate_entropy(data):
    """
    Calculates the entropy of the class column for the given data.
    :param data: the dataset (or subset) passed in
    :return: calculated entropy
    """
    # Find and save the unique classes in the dataset
    unique_values = data['class'].unique()
    # Ensure the default value of entropy for the dataset is 0
    entropy = 0
    # Iterate over every class in the dataset
    for value in unique_values:
        # Count occurrences of 'value' in the class column of the data
        value_count = data[data['class'] == value].shape[0]
        # Calculate the probability of the value occurring
        # out of the total dataset count
        value_prob = value_count / data.shape[0]
        # Guard against log2(0): a zero probability contributes nothing
        # to the entropy, so it is skipped
        if value_prob != 0:
            # Accumulate entropy using the formula -sum(p * log2(p))
            entropy -= value_prob * math.log2(value_prob)
    return entropy
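
# Hand-check of calculate_entropy on an assumed toy frame (illustrative only,
# not part of car.csv): two classes split 50/50 give entropy 1.0, while a
# single-class subset gives entropy 0.0.
#   toy = pd.DataFrame({'class': ['acc', 'acc', 'unacc', 'unacc']})
#   calculate_entropy(toy)                         # 1.0
#   calculate_entropy(toy[toy['class'] == 'acc'])  # 0.0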

def calculate_information_gain(data, attribute):
    """
    Calculates the information gain for a given attribute using entropy.
    :param data: the dataset passed in
    :param attribute: the attribute to calculate information gain for
    :return: information gain for the specified attribute
    """
    # Calculate the original entropy of the dataset
    original_entropy = calculate_entropy(data)
    # Find all subsets of the dataset grouped by the attribute's values
    subsets = data.groupby(attribute)
    weighted_entropy = 0
    # Iterate over each subset
    for subset_name, subset in subsets:
        # Calculate the current subset's weight by dividing the number of
        # instances in the subset by the total number of instances in the dataset
        subset_prob = subset.shape[0] / data.shape[0]
        # Calculate the entropy of the subset
        subset_entropy = calculate_entropy(subset)
        # Accumulate the weighted entropy across subsets
        weighted_entropy += subset_prob * subset_entropy
    # Information gain is the difference between the original and weighted entropy
    information_gain = original_entropy - weighted_entropy
    # Return the information gain
    return information_gain
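
# Sketch of the gain calculation on an assumed toy frame (illustrative only):
# splitting on a column that separates the classes perfectly recovers all of
# the original entropy.
#   toy = pd.DataFrame({'colour': ['red', 'red', 'blue', 'blue'],
#                       'class':  ['acc', 'acc', 'unacc', 'unacc']})
#   calculate_information_gain(toy, 'colour')  # 1.0 (original entropy 1.0, weighted 0.0)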

def build_decision_tree(data, attributes):
    """ Builds a decision tree from the input data frame and attributes
    using an implementation of the ID3 algorithm.
    :param data: DataFrame containing the instances.
    :param attributes: List of attributes to consider.
    :return: Root node of the decision tree
    """
    root = None
    if not data.empty:
        # Check if all instances belong to the same class
        unique_classes = data['class'].unique()
        if len(unique_classes) == 1:
            # Create a leaf node holding the single-class data
            root = Node(data=data)
        else:
            # Calculate the information gain for each attribute
            information_gains = []
            for attr in attributes:
                information_gains.append(calculate_information_gain(data, attr))
            # If there are attributes left to split on
            if information_gains:
                # Select the attribute with the maximum information gain
                # over the remaining data
                best_attribute = attributes[information_gains.index(max(information_gains))]
                # Create the root node for this subtree; keeping the data on the
                # node allows a majority-class fallback during prediction
                root = Node(attribute=best_attribute, data=data)
                # Split the data based on the values of the best attribute
                subsets = data.groupby(best_attribute)
                # Create a child node for each of the subsets
                for subset_name, subset in subsets:
                    filtered_attributes = []
                    for attr in attributes:
                        if attr != best_attribute:
                            filtered_attributes.append(attr)
                    child = build_decision_tree(subset.drop(best_attribute, axis=1), filtered_attributes)
                    child.value = subset_name
                    root.add_child(child)
            else:
                # If there are no further attributes to split on, create a
                # leaf node with the majority class
                class_with_majority = data['class'].mode()[0]
                root = Node(data=data[data['class'] == class_with_majority])
    return root
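
# Minimal usage sketch (assumed toy frame; the real run below loads car.csv):
#   toy = pd.DataFrame({'doors':   ['2', '2', '4', '4'],
#                       'persons': ['2', '4', '2', '4'],
#                       'class':   ['unacc', 'acc', 'unacc', 'acc']})
#   toy_tree = build_decision_tree(toy, ['doors', 'persons'])
#   toy_tree.attribute  # 'persons', the attribute with the highest information gain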

def output_decision_tree(node, level=0):
    """ Prints the decision tree in a formatted manner for a user to view.
    :param node: The node to print from; pass the root to output the entire tree
    :param level: The current depth in the tree, used for indentation
    """
    # Indent using tabs multiplied by the level to show a clear
    # visual difference between the levels of the tree
    indent = "\t" * level
    if node:
        if node.attribute:
            print(indent + "Attribute: " + str(node.attribute))
        else:
            print(indent + "Class: " + str(node.data['class'].unique()[0]))
        for child in node.children:
            print(indent + "Value: " + str(child.value))
            output_decision_tree(child, level + 1)
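
# Expected shape of the printed output for the toy tree sketched above
# (indentation shown with spaces here; the actual output uses tab characters):
#   Attribute: persons
#   Value: 2
#       Class: unacc
#   Value: 4
#       Class: acc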

def predict_instance(tree, instance):
    """ Predicts the class of an instance by walking the tree structure.
    :param tree: The root node of the tree
    :param instance: The input instance (a row) to predict
    :return: The predicted class of the instance
    """
    current_node = tree
    while current_node.children:
        # Reset the matched child at each level so an unseen attribute value
        # cannot reuse a match from a previous level
        child_node = None
        # Look up the instance's value for the attribute this node splits on
        instance_value = instance[current_node.attribute]
        for child in current_node.children:
            if child.value == instance_value:
                child_node = child
                break
        if child_node:
            current_node = child_node
        else:
            # No branch matches the instance's value; stop here and fall back
            # to the majority class of the data held at the current node
            break
    return current_node.data['class'].mode()[0]
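
# Usage sketch with the toy tree from the build_decision_tree sketch above
# (assumed example; the coursework run below predicts rows of car.csv):
#   row = pd.Series({'doors': '2', 'persons': '4'})
#   predict_instance(toy_tree, row)  # 'acc'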

def measuring_accuracy_of_decision_tree(tree, test_data):
    """ Measures the accuracy of the decision tree on a test dataset.
    :param tree: The root node of the tree
    :param test_data: DataFrame of labelled instances to evaluate against
    :return: The fraction of instances predicted correctly
    """
    correct_predictions = 0
    # Predict each instance and count how many predictions match the true class
    for index, instance in test_data.iterrows():
        prediction = predict_instance(tree, instance)
        if prediction == instance['class']:
            correct_predictions += 1
    accuracy = correct_predictions / test_data.shape[0]
    return accuracy
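
# Usage sketch with the toy data from above (illustrative only; the main block
# below evaluates on car.csv):
#   measuring_accuracy_of_decision_tree(toy_tree, toy)  # 1.0 on its own training data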

if __name__ == "__main__":
    # Load dataset
    print("Loading dataset ...")
    cardata = pd.read_csv('car.csv', header=None, names=['buying', 'maint', 'doors', 'persons', 'lugboot', 'class'])
    # Set all possible attributes manually into a list
    attributes = ['buying', 'maint', 'doors', 'persons', 'lugboot']
    print("Building decision tree ...")
    # Build the tree from the root
    root = build_decision_tree(cardata, attributes)

    # --- DISPLAYING TREE ---
    print("\n*** OUTPUT DECISION TREE ***")
    # Graphically display the tree in the terminal
    output_decision_tree(root)

    # --- TESTS, ACCURACY MEASUREMENTS ---
    print("\n\n--- TESTING, ACCURACY MEASUREMENT DATA ---")
    test_cardata_sample = pd.read_csv('car.csv', header=None, names=['buying', 'maint', 'doors', 'persons', 'lugboot', 'class'])
    # Test the decision tree
    accuracy = measuring_accuracy_of_decision_tree(root, test_cardata_sample)
    print("Accuracy of model on the test data: " + str(accuracy))