-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwork4.py
76 lines (61 loc) · 2.16 KB
/
work4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
import matplotlib.pyplot as plt
from math import log
def dealMatrix(matrix):
    """Return the transpose of *matrix* (a numpy array)."""
    return matrix.transpose()
def dealEntropy(dataSet, num):
    """Shannon entropy (base 2) of column *num* over the rows of *dataSet*.

    Each distinct value found in that column is treated as a class label;
    the entropy is -sum(p * log2(p)) over the label probabilities.
    """
    total = len(dataSet)
    freq = {}
    # Tally how many times each value appears in the chosen column.
    for row in dataSet:
        label = row[num]
        freq[label] = freq.get(label, 0) + 1
    H = 0
    for tally in freq.values():
        p = float(tally) / float(total)  # probability of this label
        H -= p * log(p, 2)
    return H
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature at *axis* equals *value*.

    Returns new rows with that feature column removed; *dataSet* itself
    is never modified. Rows are expected to be plain lists.
    """
    subset = []
    for row in dataSet:
        if row[axis] != value:
            continue
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index with the highest information gain (ID3).

    Each row of *dataSet* is assumed to be
    [feature_0, ..., feature_{n-2}, class_label]: the last column is the
    class label and is never considered as a candidate split feature.

    Returns the index of the best feature, or -1 if no split yields a
    positive information gain.
    """
    numberFeatures = len(dataSet[0]) - 1  # last column is the label
    # BUG FIX: the original called the undefined name calcShannonEnt(),
    # which raised NameError at runtime.  dealEntropy(dataSet, -1) is this
    # file's own entropy routine applied to the class-label column.
    baseEntropy = dealEntropy(dataSet, -1)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numberFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        # Entropy of the partition induced by feature i, weighted by the
        # fraction of rows falling into each branch.
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * dealEntropy(subDataSet, -1)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
# --- Script entry: load iris.txt and print the entropy of feature column 0 ---
# BUG FIX: the original leaked the file handle (bare open(), no close) and
# "removed" the class label with chained str.strip(',"Iris-..."') calls —
# str.strip() strips a *character set*, not a suffix, so that parsing only
# worked by accident and crashed on blank/trailing lines.
rows = []
with open('iris.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines (original crashed on float(''))
        # Each line holds numeric feature fields followed by a quoted class
        # label such as "Iris-setosa"; keep only the float-parseable fields.
        nums = []
        for field in line.split(','):
            try:
                nums.append(float(field))
            except ValueError:
                pass  # non-numeric field (the class label) — drop it
        if nums:
            rows.append(nums)

# Samples as rows, features as columns — the same layout the original
# produced by stacking sample columns with np.c_ and then transposing.
D = np.array(rows)
print(dealEntropy(D, 0))