#ID3主要用于分类,这里举一个CART回归的例子
from numpy import *
#连续特征切分后返回的切分样本
def binSplitDataSet(dataSet, feature, value):
mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]
mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
return mat0, mat1
def regLeaf(dataSet):#均值,用来计算y
return mean(dataSet[:,-1])
def regErr(dataSet):#平方误差,用方差*样本数
return var(dataSet[:,-1]) * shape(dataSet)[0]
#对每个属性进行切分,计算最佳属性-切分点对,ops预设属性,tolS是容许的误差下降值
#tolN是切分的最小样本数,当小于此值时不再切分,即预剪枝
def chooseBestSplit(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
tolS = ops[0]; tolN = ops[1]
if len(set(dataSet[:,-1].T.tolist()[0])) == 1: #终止条件:y都一致无需划分
return None, leafType(dataSet) #返回特征为None,叶子节点值为leafType(dataSet)
m,n = shape(dataSet)
S = errType(dataSet)
bestS = inf; bestIndex = 0; bestValue = 0 #设了三个初始值,将误差设为无穷大
for featIndex in range(n-1):
for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]):#每个属性的取值,注意matrix类型不能hashable,需要先转换成array
mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)#最后转换成list
newS = errType(mat0) + errType(mat1)
if newS < bestS:
bestIndex = featIndex
bestValue = splitVal
bestS = newS
if (S - bestS) < tolS:
return None, leafType(dataSet)
mat0,mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
if(shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
return None,leafType(dataSet)
return bestIndex, bestValue
#生成树,左分支取值为“是”,右分支取值“否”
def createTree(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
if feat == None:return val
retTree = {}
retTree['spInd'] = feat
retTree['spVal'] = val
lSet, rSet = binSplitDataSet(dataSet, feat, val)
retTree['left'] = createTree(lSet, leafType, errType, ops)
retTree['right'] = createTree(rSet, leafType, errType, ops)
return retTre
#pruning 后剪枝
def isTree(obj):
return (type(obj).__name__ == 'dict')
def getMean(tree):
if isTree(tree['right']):
tree['right'] = getMean(tree['right'])
if isTree(tree['left']):
tree['left'] = getMean(tree['left'])
return (tree['left'] + tree['right']) / 2.0
def prune(tree, testData):
if shape(testData)[0] == 0: return getMean(tree) #对树进行塌陷处理
if (isTree(tree['right']) or isTree(tree['left'])):
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
if isTree(tree['left']):
tree['left'] = prune(tree['left'], lSet)
if isTree(tree['right']):
tree['right'] = prune(tree['right'], rSet)
if not isTree(tree['left']) and not isTree(tree['right']):
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
errorNoMerge = sum(power(lSet[:,-1] - tree['left'], 2)) + \
sum(power(rSet[:,-1] - tree['right'], 2))
treeMean = (tree['left'] + tree['right']) / 2.0
errorMerge = sum(power(testData[:-1] - treeMean, 2)) #power迭代器,每个元素求次方
if errorMerge < errorNoMerge:
print('merging')
return treeMean
else:return tree
else:return tree
#模型树,用线性组合代替树节点判断
def linearSolve(dataSet):
m, n = shape(dataSet)
X = mat(ones((m,n))); Y = mat(ones((m,1)))
X[:,1:n] = dataSet[:,0:n-1]
Y = dataSet[:,-1]
xTx = X.T * X
if linalg.det(xTx) == 0.0:
raise NameError('singular matrix')
ws = xTx.I * (X.T * Y)
return ws, X, Y
def modelLeaf(dataSet):
ws, X ,Y = linearSolve(dataSet)
return ws
def modelErr(dataSet):
ws, X, Y =linearSolve(dataSet)
yHat = X * ws
return sum(power(Y - yHat, 2))
自己造轮子-CART回归和剪枝
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 借着猫姐(简猫)的东风,先来介绍下自己:我是白袍君,90后一枚,半开发半产品,所以我想说我是做互联网的,凡事有自...