1. Model Evaluation and Hyperparameter Tuning
#import libraries
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl
import numpy as np
from matplotlib import pyplot as plt
mpl.rcParams['legend.numpoints'] = 1
#basic modeling workflow
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#build a toy dataset
X, y = make_blobs(random_state=0)
#split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
#initialize the model object and fit it
logreg = LogisticRegression().fit(X_train, y_train)
#model evaluation
logreg.score(X_test, y_test)
1.1 Cross-Validation
from tools import *
plots.plot_cross_validation()
1.1.1 K-Fold Cross-Validation
from sklearn.datasets import load_iris
iris = load_iris()
print(iris.target)
plots.plot_stratified_cross_validation()
1.1.2 Cross-Validation in sklearn
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
iris = load_iris()
logreg = LogisticRegression()
scores = cross_val_score(logreg, iris.data, iris.target)
print("cross-validation scores: ", scores)
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
scores
scores.mean()
1.1.3 Different Data-Splitting Strategies
1.1.3.1 K-Fold Cross-Validation (manually specifying the K-fold splitter)
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)
kfold = KFold(n_splits=3)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
cross_val_score(logreg, iris.data, iris.target, cv=kfold)
1.1.3.2 Leave-One-Out Cross-Validation
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
print("number of cv iterations: ", len(scores))
print("mean accuracy: ", scores.mean())
1.1.3.3 Shuffle-Split Cross-Validation
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size=5, train_size=5, n_splits=10)
cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
1.1.3.4 Group-Wise Cross-Validation (splitting by group labels)
print("KFold that splits according to group labels")
plots.plot_label_kfold()
from sklearn.model_selection import GroupKFold
from tools.datasets import make_blobs
#build a dataset
X, y = make_blobs(n_samples=12, random_state=0)
#fix a set of group labels and run group-wise cross-validation
labels = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
cross_val_score(logreg, X, y, groups=labels, cv=GroupKFold(n_splits=3))
1.2 Grid Search
1.2.1 Manually Looping over Hyperparameters for Parameter Selection (training set + validation set)
We fit the model on the training set and use the held-out set for evaluation and parameter selection.
#naive grid search implementation
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
print("Size of training set: %d size of test set: %d" % (X_train.shape[0], X_test.shape[0]))
best_score = 0
#loop over the candidate parameter values
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        #initialize a model object with this parameter combination
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        #evaluate the svm
        score = svm.score(X_test, y_test)
        #keep the best score and the corresponding parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
print("best score: ", best_score)
print("best parameters: ", best_parameters)
best_score
1.2.2 Training Set + Validation Set + Test Set
- Training set: fit/train the model
- Validation set: evaluate the different parameter combinations, used for hyperparameter tuning
- Test set: final model evaluation
plots.plot_threefold_split()
from sklearn.svm import SVC
#split off the test set from the train+validation data
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)
#split the train+validation data into the actual training set and the validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1)
print("training set size: %d, validation set size: %d, test set size: %d" % (X_train.shape[0], X_valid.shape[0], X_test.shape[0]))
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate on the validation set
        score = svm.score(X_valid, y_valid)
        # keep the best score
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# retrain with the best parameters on the combined train+validation data and evaluate on the test set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("验证集上最高得分: ", best_score)
print("最佳参数: ", best_parameters)
print("验证集选出最好的参数上测试集的得分为: ", test_score)
1.2.3 GridSearchCV = grid search (generates the candidate hyperparameter combinations) + cross-validation (the evaluation scheme)
1.2.4 RandomizedSearchCV
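The original notes give no code under this heading, so the following is only a minimal sketch of how RandomizedSearchCV can be used; the loguniform distributions and n_iter=20 are illustrative assumptions, not taken from the notes. Unlike GridSearchCV, it samples a fixed number of parameter combinations from the given distributions or lists instead of trying every grid point.
#minimal RandomizedSearchCV sketch (illustrative setup, not from the original notes)
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.datasets import load_iris
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
#continuous distributions (here log-uniform) can be used instead of fixed value lists
param_distributions = {'C': loguniform(1e-3, 1e2), 'gamma': loguniform(1e-3, 1e2)}
random_search = RandomizedSearchCV(SVC(), param_distributions, n_iter=20, cv=5, random_state=0)
random_search.fit(X_train, y_train)
print(random_search.best_params_)
print(random_search.score(X_test, y_test))
The grid-search version of the same workflow follows below.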
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
param_grid
#hyperparameter 1: 5 candidate values
#hyperparameter 2: 5 candidate values
#hyperparameter 3: 6 candidate values
#with 5-fold cross-validation, how many model fits are needed for the evaluation?
#5*5*6*5 + 1 (the +1 is the final refit with the best parameters)
#can it be sped up? (parallelization, more compute resources, ...)
#example of a coarse-to-fine search:
#depth: [3, 5, 7, 10]
#min_child: [20, 50, 100]
#lr: [0.01, 0.1, 1, 10]
#suppose the best combination found is depth=7, min_child=50, lr=0.1
#then search around it (neighborhood search)
# [10,20,10]
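To make the count above concrete, here is a small sketch (the three-parameter grid is a made-up placeholder matching the counts listed above, not a grid from the notes) that uses sklearn's ParameterGrid to enumerate the candidate combinations; the n_jobs option of GridSearchCV is one way to parallelize the fits.
#counting the fits for a hypothetical 3-parameter grid (placeholder values)
from sklearn.model_selection import ParameterGrid
demo_grid = {'p1': [1, 2, 3, 4, 5],        #5 candidate values
             'p2': [1, 2, 3, 4, 5],        #5 candidate values
             'p3': [1, 2, 3, 4, 5, 6]}     #6 candidate values
n_candidates = len(ParameterGrid(demo_grid))   #5 * 5 * 6 = 150 combinations
n_fits = n_candidates * 5 + 1                  #5-fold CV per combination, plus one final refit
print(n_candidates, n_fits)
#passing n_jobs=-1 to GridSearchCV runs the candidate fits in parallel across CPU cores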
#param_grid is the dictionary of candidate parameter values
#GridSearchCV is the grid-search cross-validation object; after fit() it has trained and cross-validated every parameter combination in param_grid
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
import warnings
warnings.filterwarnings("ignore")
grid_search.fit(X_train, y_train)
grid_search.score(X_test, y_test)
#best parameters and best score
print(grid_search.best_params_)
print(grid_search.best_score_)
grid_search.best_estimator_
1.2.5 Appendix: Reference Table of Candidate Hyperparameters per Model
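The reference table itself is not included in the original notes; as an illustrative placeholder only (these grids are common starting points, not the missing table), typical search ranges for a few sklearn models might look like this:
#illustrative example only -- not the original reference table
example_param_grids = {
    'SVC':                    {'C': [0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]},
    'LogisticRegression':     {'C': [0.01, 0.1, 1, 10, 100]},
    'RandomForestClassifier': {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7, None]},
}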
1.2.6 Inspecting the Cross-Validation Results
grid_search.cv_results_
import numpy as np
scores = grid_search.cv_results_['mean_test_score']
scores = np.array(scores).reshape(6, 6)
#plot the mean cross-validation scores
tools.heatmap(scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'], yticklabels=param_grid['C'], cmap='viridis')
fig, axes = plt.subplots(1, 3, figsize=(13, 5))
param_grid_linear = {'C': np.linspace(1, 2, 6), 'gamma': np.linspace(1, 2, 6)}
param_grid_one_log = {'C': np.linspace(1, 2, 6), 'gamma': np.logspace(-3, 2, 6)}
param_grid_range = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-7, -2, 6)}
for param_grid, ax in zip([param_grid_linear, param_grid_one_log, param_grid_range], axes):
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    scores = grid_search.cv_results_['mean_test_score']
    scores = np.array(scores).reshape(6, 6)
    #plot the mean cross-validation scores
    scores_image = tools.heatmap(scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'], yticklabels=param_grid['C'], cmap='viridis', ax=ax)
plt.colorbar(scores_image, ax=axes.tolist())
1.2.7 Cross-Validation with Manually Split Data
For some problems we cannot simply split the data at random. For example, in a classification problem with imbalanced classes (far from 1:1), a purely random split is risky; more often we split the data manually and make sure the class proportions are the same in every fold.
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5), iris.data, iris.target, cv=5)
print("交叉验证得分: ", scores)
print("平均交叉验证得分: ", scores.mean())
def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    outer_scores = []
    # outer loop: manually split into training and test folds
    for training_samples, test_samples in outer_cv.split(X, y):
        # storage for the best parameters found on this outer fold
        best_params = {}
        best_score = -np.inf
        # iterate over the candidate parameter combinations
        for parameters in parameter_grid:
            # record the cross-validation scores for this parameter combination
            cv_scores = []
            # inner loop: split the outer training fold into an actual training set and a validation set
            for inner_train, inner_test in inner_cv.split(X[training_samples], y[training_samples]):
                # inner_train / inner_test index into the outer training fold
                clf = Classifier(**parameters)
                clf.fit(X[training_samples][inner_train], y[training_samples][inner_train])
                score = clf.score(X[training_samples][inner_test], y[training_samples][inner_test])
                cv_scores.append(score)
            # mean result of the inner cross-validation
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                best_score = mean_score
                best_params = parameters
        # refit with the best parameters on the full outer training fold and evaluate on the outer test fold
        clf = Classifier(**best_params)
        clf.fit(X[training_samples], y[training_samples])
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return outer_scores
from sklearn.model_selection import ParameterGrid, StratifiedKFold
nested_cv(iris.data, iris.target, StratifiedKFold(5), StratifiedKFold(5), SVC, ParameterGrid(param_grid))