# Merge the training set and the test set
# 1. Column names for the UCI Adult data files
headers = [
'age',
'workclass',
'fnlwgt',
'education',
'education-num',
'marital-status',
'occupation',
'relation',
'race',
'sex',
'capital-gain',
'capital-loss',
'hours-per-week',
'native-country',
'predclass']
#2. Read the two files
import pandas as pd
# adult.data has no header or banner line, so nothing is skipped here
# (skipping a row would silently drop the first record)
train_raw = pd.read_csv(
    'adult.data',
    header=None,
    names=headers,
    sep=r',\s*',       # comma followed by optional whitespace
    na_values='?',
    engine='python'    # the regex separator requires the python engine
)
# adult.test starts with a '|1x3 Cross validator' banner line, hence skiprows=1
test_raw = pd.read_csv(
    'adult.test',
    header=None,
    names=headers,
    sep=r',\s*',
    na_values='?',
    engine='python',
    skiprows=1
)
print('data loaded')
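# Quick sanity check, a sketch only: the counts assume the standard UCI Adult
# files (32,561 training rows and 16,281 test rows after the banner is skipped)
print(f"train: {train_raw.shape}, test: {test_raw.shape}")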
# 3. Concatenate train and test into one frame, re-indexed from 0
# (DataFrame.append is deprecated; pd.concat is the supported API)
dataset_raw = pd.concat([train_raw, test_raw], ignore_index=True)
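# The merged frame should contain every row from both files; an assert makes
# that assumption explicit (a sketch, not required for the pipeline)
assert len(dataset_raw) == len(train_raw) + len(test_raw)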
data_bin = pd.DataFrame()  # discretised (binned) versions of the features
data_con = pd.DataFrame()  # continuous / raw versions of the features
# Encode the label:
# - '>50K'  -> 1
# - '<=50K' -> 0
# (adult.test writes the labels with a trailing '.', hence the extra cases)
dataset_raw.loc[dataset_raw['predclass'] == '>50K', 'predclass'] = 1
dataset_raw.loc[dataset_raw['predclass'] == '<=50K', 'predclass'] = 0
dataset_raw.loc[dataset_raw['predclass'] == '>50K.', 'predclass'] = 1
dataset_raw.loc[dataset_raw['predclass'] == '<=50K.', 'predclass'] = 0
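# Equivalent one-liner, shown for reference only (do not run it after the
# .loc assignments above, since the labels are already integers by then):
# dataset_raw['predclass'] = (dataset_raw['predclass']
#                             .str.rstrip('.')
#                             .map({'>50K': 1, '<=50K': 0}))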
# Copy the encoded label into both working frames
data_bin['predclass'] = dataset_raw['predclass']
data_con['predclass'] = dataset_raw['predclass']
print("labels done")
# age: 1) keep as-is for the continuous frame; 2) discretise into 10 age bands
data_con['age'] = dataset_raw['age']
data_bin['age'] = pd.cut(dataset_raw['age'], 10)
print("age done")
# Regroup workclass into coarser categories
workclass_map = {
    'Without-pay': 'Not Working',
    'Never-worked': 'Not Working',
    'Federal-gov': 'Fed-gov',
    'State-gov': 'Non-Fed-gov',
    'Local-gov': 'Non-Fed-gov',
    'Self-emp-inc': 'Self-emp',
    'Self-emp-not-inc': 'Self-emp',
}
dataset_raw['workclass'] = dataset_raw['workclass'].replace(workclass_map)
# Copy the regrouped feature into both working frames
data_bin['workclass'] = dataset_raw['workclass']
data_con['workclass'] = dataset_raw['workclass']
print("workclass done")
# Regroup occupation into broader job families
occupation_map = {
    'Adm-clerical': 'Admin',
    'Armed-Forces': 'Military',
    'Protective-serv': 'Military',
    'Craft-repair': 'Manual Labour',
    'Farming-fishing': 'Manual Labour',
    'Handlers-cleaners': 'Manual Labour',
    'Machine-op-inspct': 'Manual Labour',
    'Transport-moving': 'Manual Labour',
    'Exec-managerial': 'Office Labour',
    'Sales': 'Office Labour',
    'Tech-support': 'Office Labour',
    'Other-service': 'Service',
    'Priv-house-serv': 'Service',
    'Prof-specialty': 'Professional',
}
dataset_raw['occupation'] = dataset_raw['occupation'].replace(occupation_map)
data_bin['occupation'] = dataset_raw['occupation']
data_con['occupation'] = dataset_raw['occupation']
print("occupation done")
# Regroup native-country into regional buckets
country_map = {
    'Cambodia': 'SE-Asia', 'Laos': 'SE-Asia', 'Philippines': 'SE-Asia',
    'Thailand': 'SE-Asia', 'Vietnam': 'SE-Asia',
    'Canada': 'British-Commonwealth', 'England': 'British-Commonwealth',
    'India': 'British-Commonwealth', 'Ireland': 'British-Commonwealth',
    'Scotland': 'British-Commonwealth',
    'Hong': 'China', 'Taiwan': 'China',
    'Columbia': 'South-America', 'Cuba': 'South-America',
    'Dominican-Republic': 'South-America', 'Ecuador': 'South-America',
    'El-Salvador': 'South-America', 'Guatemala': 'South-America',
    'Haiti': 'South-America', 'Honduras': 'South-America',
    'Jamaica': 'South-America', 'Mexico': 'South-America',
    'Nicaragua': 'South-America', 'Outlying-US(Guam-USVI-etc)': 'South-America',
    'Peru': 'South-America', 'Puerto-Rico': 'South-America',
    'Trinadad&Tobago': 'South-America',
    'France': 'Euro_Group_1', 'Germany': 'Euro_Group_1',
    'Holand-Netherlands': 'Euro_Group_1', 'Italy': 'Euro_Group_1',
    'Greece': 'Euro_Group_2', 'Hungary': 'Euro_Group_2',
    'Iran': 'Euro_Group_2', 'Poland': 'Euro_Group_2',
    'Portugal': 'Euro_Group_2', 'South': 'Euro_Group_2',
    'Yugoslavia': 'Euro_Group_2',
    'Japan': 'APAC',
    # 'China' and 'United-States' keep their own labels
}
dataset_raw['native-country'] = dataset_raw['native-country'].replace(country_map)
data_bin['native-country'] = dataset_raw['native-country']
data_con['native-country'] = dataset_raw['native-country']
print("native-country done")
# Regroup education levels; note the raw value is 'HS-grad' (lowercase g),
# so matching 'HS-Grad' would silently leave those rows untouched
education_map = {
    '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout',
    '1st-4th': 'Dropout', '5th-6th': 'Dropout', '7th-8th': 'Dropout',
    '9th': 'Dropout', 'Preschool': 'Dropout',
    'Assoc-acdm': 'Associate', 'Assoc-voc': 'Associate',
    'HS-grad': 'HS-Graduate', 'Some-college': 'HS-Graduate',
    'Prof-school': 'Professor',
    # 'Bachelors', 'Masters' and 'Doctorate' keep their own labels
}
dataset_raw['education'] = dataset_raw['education'].replace(education_map)
data_bin['education'] = dataset_raw['education']
data_con['education'] = dataset_raw['education']
print("education done")
# Regroup marital status
marital_map = {
    'Never-married': 'Never-Married',
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Not-Married',
    'Divorced': 'Separated',
    # 'Separated' and 'Widowed' keep their own labels
}
dataset_raw['marital-status'] = dataset_raw['marital-status'].replace(marital_map)
data_bin['marital-status'] = dataset_raw['marital-status']
data_con['marital-status'] = dataset_raw['marital-status']
print("marital-status done")
# fnlwgt ("final weight") is the census sampling weight, not a serial number
data_bin['fnlwgt'] = pd.cut(dataset_raw['fnlwgt'], 10)   # binned copy
data_con['fnlwgt'] = dataset_raw['fnlwgt']               # raw copy
print("fnlwgt done")
# education-num: raw copy and a 10-bin discretised copy
data_bin['education-num'] = pd.cut(dataset_raw['education-num'], 10)
data_con['education-num'] = dataset_raw['education-num']
print("education-num done")
# hours-per-week
data_bin['hours-per-week'] = pd.cut(dataset_raw['hours-per-week'], 10)
data_con['hours-per-week'] = dataset_raw['hours-per-week']
print("hours-per-week done")
# capital-gain
data_bin['capital-gain'] = pd.cut(dataset_raw['capital-gain'], 10)
data_con['capital-gain'] = dataset_raw['capital-gain']
print("capital-gain done")
# capital-loss
data_bin['capital-loss'] = pd.cut(dataset_raw['capital-loss'], 10)
data_con['capital-loss'] = dataset_raw['capital-loss']
print("capital-loss done")
# Copy the remaining categorical features unchanged into both frames
data_bin['race'] = data_con['race'] = dataset_raw['race']
data_bin['sex'] = data_con['sex'] = dataset_raw['sex']
data_bin['relation'] = data_con['relation'] = dataset_raw['relation']
print("race/sex/relation done")
# Interaction feature: age x hours-per-week
data_con['age-hours'] = data_con['age'] * data_con['hours-per-week']
data_bin['age-hours'] = pd.cut(data_con['age-hours'], 10)
print("age-hours done")
# Interaction feature: sex + marital-status (string concatenation)
data_bin['sex-marital'] = data_con['sex-marital'] = data_con['sex'] + data_con['marital-status']
print("sex-marital done")
# Choose the columns to one-hot encode
one_hot_cols = data_bin.columns.tolist()
# The label column must not be encoded
one_hot_cols.remove('predclass')
data_bin_encode = pd.get_dummies(data_bin, columns=one_hot_cols)
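# One-hot encoding expands every category (and every bin interval) into its
# own column, so the encoded frame is much wider than data_bin
print(f"data_bin: {data_bin.shape} -> encoded: {data_bin_encode.shape}")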
# Label-encode data_con: every column (including the numeric ones) is cast
# to string so LabelEncoder can assign an integer code per distinct value
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data_con = data_con.astype('str')
# Note: .apply(encoder.fit_transform) refits the encoder for each column,
# so the integer codes are only consistent within a column
data_con_encode = data_con.apply(encoder.fit_transform)
# Split back into train and test using the original row counts
# (slicing with .loc[0:32560] is inclusive on both ends, which would put
# row 32560 in both halves; .iloc with len(train_raw) avoids the overlap)
n_train = len(train_raw)
train = data_bin_encode.iloc[:n_train, :]
test = data_bin_encode.iloc[n_train:, :]
train = train.dropna(axis=0)
test = test.dropna(axis=0)
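# Sanity check: the two halves should partition the encoded frame exactly
# (the dummies contain no NaN, so dropna should discard nothing here)
assert len(train) + len(test) == len(data_bin_encode)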
from sklearn.svm import SVC
# Separate features and label
X_train = train.drop('predclass', axis=1)
Y_train = train['predclass'].astype('int64')  # cast label to integer
X_test = test.drop('predclass', axis=1)
Y_test = test['predclass'].astype('int64')    # cast label to integer
############################################################################
# 1. Create a scaler for feature standardisation
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
# 2. Fit the scaling parameters on the training set only
std_scaler.fit(X_train)
# 3. Apply the transform to both sets
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)
print('scaling done')
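# After standardisation each training column is ~zero-mean / unit-variance;
# spot-check the first few columns
print(X_train.mean(axis=0)[:5].round(2), X_train.std(axis=0)[:5].round(2))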
# 4. PCA for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
# 5. Fit the projection on the training set (via singular value decomposition)
pca.fit(X_train)
# 6. Project both sets onto the 10 components
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
print('PCA done')
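# How much of the variance the 10 retained components keep
print(f"explained variance kept: {pca.explained_variance_ratio_.sum():.2%}")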
############################################################################
# SVM classifier
classifier = SVC(gamma='auto')
classifier.fit(X_train, Y_train)
train_score = classifier.score(X_train, Y_train)
test_score = classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# Random forest
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, Y_train)
train_score = rf_classifier.score(X_train, Y_train)
test_score = rf_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# Logistic regression
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(solver='lbfgs', C=1.0, max_iter=10000)
lr_classifier.fit(X_train, Y_train)
train_score = lr_classifier.score(X_train, Y_train)
test_score = lr_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# K-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, Y_train)
train_score = knn_classifier.score(X_train, Y_train)
test_score = knn_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# Naive Bayes -- GaussianNB (BernoulliNB/MultinomialNB were imported but unused)
from sklearn.naive_bayes import GaussianNB
gau_classifier = GaussianNB()
gau_classifier.fit(X_train, Y_train)
train_score = gau_classifier.score(X_train, Y_train)
test_score = gau_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
from sklearn.metrics import classification_report
# Compute predictions first, then the per-class precision/recall report
pre_gau = gau_classifier.predict(X_test)
report_gau = classification_report(Y_test, pre_gau)
print(report_gau)
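# A confusion matrix complements the per-class report above
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test, pre_gau))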