import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
# Load the iris dataset from the project-relative CSV and preview it.
df = pd.read_csv("datas/iris_data.csv")
df.head()

# Split into labels and features: "label" is the class to predict, and the
# feature matrix is every remaining column except "target" and "label".
y = df["label"]
x = df.drop(columns=["target", "label"])

# Baseline model: 3-nearest-neighbours fitted on the raw features.
# NOTE: the score below is computed on the same rows the model was fitted
# on, so it is a training accuracy (author's recorded value: 0.96).
knn = KNeighborsClassifier(n_neighbors=3).fit(x, y)
y_predict = knn.predict(x)
accuracy_score(y_true=y, y_pred=y_predict)
# Standardize every feature column to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
x_stand = StandardScaler().fit_transform(x)

# Pick one feature ("sepal length") and compare its mean / standard deviation
# before and after standardization.
# The column position is looked up by name so the "after" statistics are
# guaranteed to describe the same feature as the "before" statistics, rather
# than assuming "sepal length" is the first column of x.
sepal_length_pos = x.columns.get_loc("sepal length")
sepal_length_stand = x_stand[:, sepal_length_pos]

x1_mean = df["sepal length"].mean()
x1_stand_mean = sepal_length_stand.mean()
# pandas defaults to the sample deviation (ddof=1) while numpy and
# StandardScaler use the population deviation (ddof=0); pass ddof=0 so both
# numbers are computed with the same estimator and are directly comparable.
x1_std = df["sepal length"].std(ddof=0)
x1_stand_std = sepal_length_stand.std()
print(x1_mean, x1_stand_mean, x1_std, x1_stand_std)
from sklearn.decomposition import PCA
# First pass: PCA with 4 components, used only to inspect how the variance
# is distributed across components (the author describes this as keeping
# the same dimensionality as the input — presumably x_stand has 4 columns).
pca = PCA(n_components=4)
x_pca = pca.fit_transform(x_stand)
# Share of the total variance carried by each principal component.
# Author's recorded output:
# array([0.72770452, 0.23030523, 0.03683832, 0.00515193])
var_radio = pca.explained_variance_ratio_
# Second pass: dimensionality reduction, keeping only the two components
# with the largest variance. This rebinds both `pca` and `x_pca`.
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_stand)
# Check how well KNN does on the PCA-reduced data.
# No n_neighbors is passed here, so the classifier uses the library default.
# As above, the score is computed on the same rows used for fitting.
knn = KNeighborsClassifier().fit(x_pca, y)
knn_predict = knn.predict(x_pca)
accuracy_score(y_true=y, y_pred=knn_predict)  # author's recorded value: 0.9466666666666667
``