from sklearn.datasets import load_iris
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import random
from matplotlib import cm
from sklearn.model_selection import train_test_split
from sklearn import datasets as ds
import h5py
import pydotplus
import os
fruits = pd.read_table('E:/fruit.txt')
X = fruits[['mass', 'width', 'height', 'color_score']]
Y = fruits['fruit_label']
x = X.values
y = Y.values
# cache the feature matrix and the label vector in two HDF5 files
f1 = h5py.File("myh5py1.hdf5", "w")
d1 = f1.create_dataset("dset1", data=x)
f2 = h5py.File("myh5py2.hdf5", "w")
d2 = f2.create_dataset("dset2", data=y)
for key in f1.keys():
    feature = f1[key][()]   # .value was removed in h5py 3.x; [()] reads the whole dataset
for key in f2.keys():
    target = f2[key][()]
'''
Reading the arrays back from HDF5 gives `feature` (the attribute data) and
`target` (the class labels). Compared with a DataFrame, a dict, or an
ordinary in-memory array, HDF5 stores each value under a key (printed with
whitespace-separated elements), which makes the data extraction later in
the script convenient.
feature is stored as [[192. 8.4 7.3 0.55]
[180. 8. 6.8 0.59]
[176. 7.4 7.2 0.6 ]
[ 86. 6.2 4.7 0.8 ]
[ 84. 6. 4.6 0.79]
[ 80. 5.8 4.3 0.77]
[ 80. 5.9 4.3 0.81]
[ 76. 5.8 4. 0.81]
[178. 7.1 7.8 0.92]
[172. 7.4 7. 0.89]
[166. 6.9 7.3 0.93]
[172. 7.1 7.6 0.92]
[154. 7. 7.1 0.88]
[164. 7.3 7.7 0.7 ]]
target is stored as [1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]
'''
feature_names = ['mass', 'width', 'height', 'color_score']  # attribute (feature) names
class_names = ['apple', 'mandarin', 'lemon', 'orange']      # names of the four fruit classes
# entropy of the fruit class distribution (how mixed the labels are)
def calcEntropy(target):
    label = np.unique(target)   # e.g. [1, 2, 3, 4]
    n = label.size
    count = np.zeros(n)
    for i in range(n):
        count[i] = target[target == label[i]].size  # sample count per class
    # probability of each class
    p_i = np.divide(count, target.size)
    # entropy = -sum_i p_i * log2(p_i)
    entropy = 0
    for i in range(n):
        entropy = entropy - p_i[i] * np.log2(p_i[i])
    return entropy
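# Illustrative check (not in the original post): an even two-class split
# should give exactly 1 bit of entropy.
# print(calcEntropy(np.array([1, 1, 2, 2])))   # -> 1.0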
# conditional entropy of target given a binary split of one feature;
# feature is one attribute column, condition is a predicate that marks the
# samples falling on the "true" side of the split boundary
def calcConditionEntropy(feature, condition, target):
    true_condition = condition(feature)
    false_condition = ~true_condition
    target_true = target[true_condition]
    target_false = target[false_condition]
    # the share of samples on each side of the split is its probability
    p_true = target_true.size / target.size
    p_false = 1 - p_true
    # weight each side's entropy by its probability
    entropy = p_true * calcEntropy(target_true) + p_false * calcEntropy(target_false)
    return entropy
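# Illustrative usage (not in the original post): conditional entropy of a
# split at mass < 100 on the first feature column.
# print(calcConditionEntropy(feature[:, 0], lambda f: f < 100, target))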
"""
生成特征的所有分界点: 先对特征进行排序,然后将 target 有变动的地方作为分界点
:param feature: 一维数组,一个特征的样本数据
:param target: 一维数组,数字或者字符串的分类标签
"""
# 分界点
def generate_feature_points(feature, target):
argsort = feature.argsort()
f1 = feature[argsort]
t1 = target[argsort]
last_value = target[0]
split_value = []
# 找出所有分裂点
for i in range(t1.size):
if last_value != t1[i]:
split_value.append((f1[i] + f1[i - 1]) / 2)
last_value = t1[i]
return np.array(split_value) # 包含所有分界点的一维数组
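# Illustrative check (not in the original post): the labels change once in
# sorted order, so a single split point falls between 2 and 3.
# print(generate_feature_points(np.array([1., 2., 3., 4.]),
#                               np.array([1, 1, 2, 2])))   # -> [2.5]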
def calc_feature_entropy(feature, target):
    """
    Compute the conditional entropy at every candidate split point of one
    feature and return the smallest one (the smaller the conditional
    entropy, the larger the information gain).
    :param feature: 1-D array, sample values of one feature
    :param target: 1-D array, numeric or string class labels
    """
    min_entropy = float('inf')
    min_point = 0
    points = generate_feature_points(feature, target)
    for p in points:
        entropy = calcConditionEntropy(feature, lambda f: f < p, target)
        if entropy < min_entropy:
            min_entropy = entropy
            min_point = p
    # no split point means only one class label is left, so the entropy is 0
    if points.size == 0:
        min_entropy = 0
    return min_point, min_entropy   # best split point and its conditional entropy
#print(calc_feature_entropy(feature[:,0], target))
def select_feature(feature, target):
    """
    Select, among all features, the one with the smallest conditional
    entropy (i.e. the largest information gain).
    :param feature: 2-D array, sample data of all features
    :param target: 1-D array, numeric or string class labels
    :return: feature index, split point, conditional entropy
    """
    min_entropy = float('inf')
    min_point = 0
    num = feature.shape[1]
    index = 0
    for i in range(num):
        point, entropy = calc_feature_entropy(feature[:, i], target)
        if entropy <= min_entropy:
            index = i
            min_point = point
            min_entropy = entropy
    return index, min_point, min_entropy   # feature index, split point, conditional entropy
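# Illustrative usage (not in the original post): which feature gives the
# best first split on the full data set?
# print(select_feature(feature, target))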
class TreeNode:
    idn = 0               # node id, to trace the build order
    feature_index = 0     # index of the feature this node splits on
    feature_point = 0     # split boundary value
    feature_entropy = 0   # conditional entropy at this node
    target_label = ''     # majority class label of the samples at this node
    true_node = None      # child for samples where feature < point
    false_node = None     # child for the remaining samples

    @staticmethod
    def decision(feature, point):
        return feature < point
def build_tree(feature, target, idn):
    """
    Recursively build the decision tree.
    :param feature: 2-D array, sample data of all features
    :param target: 1-D array, numeric or string class labels
    :param idn: node id, useful for following the construction process
    :return: root node of the (sub)tree
    """
    node = TreeNode()
    # pick the feature with the smallest conditional entropy
    index, point, entropy = select_feature(feature, target)
    node.idn = idn
    node.feature_index = index
    node.feature_point = point
    node.feature_entropy = entropy
    # the most frequent label becomes this node's output
    node.target_label = np.argmax(np.bincount(target))
    print('build tree node id %d, index %d, point %f, entropy %f, label %s ' %
          (idn, index, point, entropy, node.target_label))
    # stop splitting once the entropy drops below 0.1, to limit overfitting
    if entropy < 0.1:
        print('too low entropy : ', node.feature_entropy)
        return node
    f_copy = feature.copy()
    t_copy = target.copy()
    f = f_copy[:, index]
    selector = node.decision(f, point)
    # create the two child nodes
    idn = idn + 1
    node.true_node = build_tree(f_copy[selector, :], t_copy[selector], idn)
    idn = node.true_node.idn + 1
    node.false_node = build_tree(f_copy[~selector], t_copy[~selector], idn)
    return node
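# A depth-first printout of the finished tree helps verify the build; a
# minimal sketch (print_tree is a hypothetical helper, not from the
# original post):
def print_tree(node, depth=0):
    # pre-order DFS: print this node, then recurse into its two children
    print('%sid %d, feature %d < %.2f, label %s' %
          ('  ' * depth, node.idn, node.feature_index,
           node.feature_point, node.target_label))
    if node.true_node is not None:
        print_tree(node.true_node, depth + 1)
    if node.false_node is not None:
        print_tree(node.false_node, depth + 1)
# print_tree(fruittree)   # call once fruittree has been built below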
# split the samples into training and test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42)
print("Building the tree:")
fruittree = build_tree(X_train, y_train, 1)
# run the test samples through the tree and collect the outputs in predictions
predictions = []
for i in range(len(X_test)):
    node = fruittree
    # descend until a leaf (entropy below the 0.1 stopping threshold)
    while node.feature_entropy > 0.1:
        # compare the sample's feature value against the node's split point
        # (the original compared it against node.feature_entropy, the bug
        # behind the recorded failure below)
        if node.decision(X_test[i][node.feature_index], node.feature_point):
            node = node.true_node
        else:
            node = node.false_node
    predictions.append(node.target_label)
print("通过树进行测试:")
print(predictions)
corret = 0
for i in range(len(y_test)):
if predictions[i] == y_test[i]:
corret+= 1
print("准确率:",(corret / (len(y_test)) * 100.0),'%')
'''Recorded output of the author's original run, whose traversal compared
node.feature_entropy against the feature value instead of node.feature_point:
Testing with the tree:
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
Accuracy: 41.66666666666667 %
The experimental output is wrong; building the tree failed.
'''
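# Illustrative check (not in the original post): when every sample is
# predicted as class 4, the accuracy is simply class 4's share of the test
# split, which matches the 41.67% recorded above.
# print(np.mean(y_test == 4) * 100, '%')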
print("利用python决策树的包直接求出准确率:")
from sklearn.model_selection import train_test_split
from sklearn import tree
# 把样本分成训练集和测试集两部分, 两者比例为: 7:3
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42)
# 创建决策树分类器
clf = tree.DecisionTreeClassifier()
# 训练决策树
clf.fit(X=X_train, y=y_train)
# 查看特征比重
print("feature weight : ", clf.feature_importances_)
# 查看决策树评分
print("decision tree score : ", clf.score(X=X_test, y=y_test))