Classifying Fruit with a Decision Tree

import numpy as np
import pandas as pd
import h5py
from sklearn.model_selection import train_test_split


fruits = pd.read_table('E:/fruit.txt')  # tab-separated fruit data set
X = fruits[['mass', 'width', 'height', 'color_score']]
Y = fruits['fruit_label']
x = X.values
y = Y.values

# round-trip the arrays through HDF5: one file per dataset, keyed by name
f1 = h5py.File("myh5py1.hdf5", "w")
d1 = f1.create_dataset("dset1", data=x)
f2 = h5py.File("myh5py2.hdf5", "w")
d2 = f2.create_dataset("dset2", data=y)

for key in f1.keys():
    feature = f1[key][()]  # Dataset.value was removed in h5py 3.x; [()] reads the full array

for key in f2.keys():
    target = f2[key][()]

'''
Reading back through HDF5 gives `feature` (the attribute data) and `target`
(the class labels). Unlike a DataFrame or a dict, HDF5 stores each array as a
dataset under a key, and reading a dataset back yields a plain NumPy array,
which makes the slicing and boolean masking used below straightforward.

feature is stored as [[192.     8.4    7.3    0.55]
 [180.     8.     6.8    0.59]
 [176.     7.4    7.2    0.6 ]
 [ 86.     6.2    4.7    0.8 ]
 [ 84.     6.     4.6    0.79]
 [ 80.     5.8    4.3    0.77]
 [ 80.     5.9    4.3    0.81]
 [ 76.     5.8    4.     0.81]
 [178.     7.1    7.8    0.92]
 [172.     7.4    7.     0.89]
 [166.     6.9    7.3    0.93]
 [172.     7.1    7.6    0.92]
 [154.     7.     7.1    0.88]
 [164.     7.3    7.7    0.7 ]]

target is stored as [1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]
'''

feature_names = ['mass', 'width', 'height', 'color_score']  # attribute (feature) names
class_names = ['apple', 'mandarin', 'lemon', 'orange']  # the four fruit class names



# Entropy of the fruit-label distribution: H(T) = -sum_i p_i * log2(p_i)
def calcEntropy(target):
    label = np.unique(target)  # distinct class labels, e.g. [1, 2, 3, 4]
    n = label.size
    count = np.zeros(n)
    for i in range(n):
        count[i] = target[target == label[i]].size  # number of samples per class
    # probability of each class
    p_i = np.divide(count, target.size)

    # accumulate the entropy
    entropy = 0
    for i in range(n):
        entropy = entropy - p_i[i] * np.log2(p_i[i])
    return entropy
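
# A quick sanity check of calcEntropy on hypothetical toy labels (not the fruit
# data): a 50/50 two-class array should give exactly 1 bit of entropy, and a
# pure single-class array should give 0.
print(calcEntropy(np.array([1, 1, 2, 2])))  # -> 1.0
print(calcEntropy(np.array([3, 3, 3])))     # -> 0.0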


# Conditional entropy: `feature` is one attribute column and `condition` is a
# predicate that splits the samples at a boundary value.
def calcConditionEntropy(feature, condition, target):
    true_condition = condition(feature)
    false_condition = ~true_condition
    target_true = target[true_condition]
    target_false = target[false_condition]
    # each branch's probability is its share of the samples
    p_true = target_true.size / target.size
    p_false = 1 - p_true
    # weight each branch's entropy by the branch probability
    entropy = p_true * calcEntropy(target_true) + p_false * calcEntropy(target_false)
    return entropy
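
# Likewise, a toy check of calcConditionEntropy (hypothetical data): a threshold
# at 5 separates the two labels perfectly, so the conditional entropy drops to 0.
toy_feature = np.array([1, 2, 3, 7, 8, 9])
toy_target = np.array([1, 1, 1, 2, 2, 2])
print(calcConditionEntropy(toy_feature, lambda f: f < 5, toy_target))  # -> 0.0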

"""
    生成特征的所有分界点: 先对特征进行排序,然后将 target 有变动的地方作为分界点
    :param feature: 一维数组,一个特征的样本数据
    :param target: 一维数组,数字或者字符串的分类标签
    """

# 分界点
def generate_feature_points(feature, target):

    argsort = feature.argsort()
    f1 = feature[argsort]
    t1 = target[argsort]

    last_value = target[0]
    split_value = []

    # 找出所有分裂点
    for i in range(t1.size):
        if last_value != t1[i]:
            split_value.append((f1[i] + f1[i - 1]) / 2)
            last_value = t1[i]

    return np.array(split_value) # 包含所有分界点的一维数组
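
# Toy check of generate_feature_points (hypothetical data): the label changes
# once along the sorted feature, so the only candidate split is the midpoint
# between 3 and 7.
print(generate_feature_points(np.array([1, 3, 7, 9]), np.array([1, 1, 2, 2])))  # -> [5.]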


def calc_feature_entropy(feature, target):
    """
    Compute the conditional entropy at every candidate split point of one
    feature and return the smallest one (the smaller the conditional entropy,
    the larger the information gain).
    :param feature: 1-D array, sample data for one feature
    :param target: 1-D array, numeric or string class labels
    :return: best split point and its conditional entropy
    """
    min_entropy = float('inf')
    min_point = 0
    points = generate_feature_points(feature, target)
    for p in points:
        entropy = calcConditionEntropy(feature, lambda f: f < p, target)
        if entropy < min_entropy:
            min_entropy = entropy
            min_point = p

    # no split point means only one class label remains, so the entropy is 0
    if points.size == 0:
        min_entropy = 0

    return min_point, min_entropy


#print(calc_feature_entropy(feature[:,0], target))

def select_feature(feature, target):
    """
    Select the feature with the smallest conditional entropy (i.e. the largest
    information gain) among all features.
    :param feature: 2-D array, sample data for all features
    :param target: 1-D array, numeric or string class labels
    :return: feature index, best split point, conditional entropy
    """
    min_entropy = float('inf')
    min_point = 0
    num = feature.shape[1]
    index = 0
    for i in range(num):
        point, entropy = calc_feature_entropy(feature[:, i], target)
        if entropy <= min_entropy:
            index = i
            min_point = point
            min_entropy = entropy

    return index, min_point, min_entropy  # feature index, split point, conditional entropy
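
# Toy check of select_feature (hypothetical data): column 1 separates the two
# classes cleanly while column 0 is noisy, so the selector should pick index 1
# with split point 5.0 and conditional entropy 0.
toy_X = np.array([[5.0, 1.0], [1.0, 2.0], [4.0, 8.0], [2.0, 9.0]])
toy_y = np.array([1, 1, 2, 2])
idx, pt, ent = select_feature(toy_X, toy_y)
print(idx, pt, ent)  # -> 1 5.0 0.0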

class TreeNode:
    idn = 0              # node id, for tracing the build order
    feature_index = 0    # index of the feature this node splits on
    feature_point = 0    # split point for that feature
    feature_entropy = 0  # conditional entropy at this node
    target_label = ''    # majority class label at this node
    true_node = None     # child for samples where feature < point
    false_node = None    # child for samples where feature >= point

    @staticmethod
    def decision(feature, point):
        return feature < point


def build_tree(feature, target, idn):
    """
    Recursively build the decision tree.
    :param feature: 2-D array, sample data for all features
    :param target: 1-D array, numeric or string class labels
    :param idn: node id, useful for tracing how the tree is built
    :return: the root node of the (sub)tree
    """
    node = TreeNode()

    # pick the feature with the smallest conditional entropy
    index, point, entropy = select_feature(feature, target)
    node.idn = idn
    node.feature_index = index
    node.feature_point = point
    node.feature_entropy = entropy
    # the most frequent label becomes this node's output
    node.target_label = target[np.argmax(np.bincount(target))]

    print('build tree node id %d, index %d, point %f, entropy %f, label %s ' %
          (idn, index, point, entropy, node.target_label))

    # stop splitting once the entropy falls below 0.1, to limit overfitting
    if entropy < 0.1:
        print('too low entropy : ', node.feature_entropy)
        return node

    f_copy = feature.copy()
    t_copy = target.copy()
    f = f_copy[:, index]
    selector = node.decision(f, point)

    # create the true/false child nodes
    idn = idn + 1
    node.true_node = build_tree(f_copy[selector, :], t_copy[selector], idn)
    idn = node.true_node.idn + 1
    node.false_node = build_tree(f_copy[~selector], t_copy[~selector], idn)
    return node


# hold out 20% of the samples for testing, then build the tree on the training part
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42)
print("Building the tree")
fruittree = build_tree(X_train, y_train, 1)


# run each test sample through the tree and collect the predicted labels
predictions = []

for i in range(len(X_test)):
    node = fruittree
    while node.feature_entropy > 0.1:
        # branch on this node's split point, mirroring TreeNode.decision
        if X_test[i][node.feature_index] < node.feature_point:
            node = node.true_node
        else:
            node = node.false_node
    predictions.append(node.target_label)

print("通过树进行测试:")
print(predictions)
corret = 0
for i in range(len(y_test)):
    if predictions[i] == y_test[i]:
        corret+= 1
print("准确率:",(corret / (len(y_test)) * 100.0),'%')

'''Testing via the tree (output recorded from the original run):
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
Accuracy: 41.66666666666667 %

The handwritten tree failed on this run: every test sample received the same
label, most likely because the traversal branched on the node entropy instead
of the feature's split point.
'''


print("利用python决策树的包直接求出准确率:")

from sklearn.model_selection import train_test_split
from sklearn import tree
# 把样本分成训练集和测试集两部分, 两者比例为: 7:3
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=42)
# create the decision tree classifier
clf = tree.DecisionTreeClassifier()
# train it on the same training set
clf.fit(X=X_train, y=y_train)
# per-feature importances
print("feature weight : ", clf.feature_importances_)
# mean accuracy on the test set
print("decision tree score : ", clf.score(X=X_test, y=y_test))