# Merge the training set and the test set
# 1. Column names for the UCI Adult data files
headers = [
'age',
'workclass',
'fnlwgt',
'education',
'education-num',
'marital-status',
'occupation',
'relation',
'race',
'sex',
'capital-gain',
'capital-loss',
'hours-per-week',
'native-country',
'predclass']
#2. Read the two files
import pandas as pd
# adult.data has no header or banner line, so nothing is skipped here
# (skipping a row would silently drop the first record)
train_raw = pd.read_csv(
    'adult.data',
    header=None,
    names=headers,
    sep=r',\s*',       # comma followed by optional whitespace
    na_values='?',
    engine='python'    # the regex separator requires the python engine
)
# adult.test starts with a '|1x3 Cross validator' banner line, hence skiprows=1
test_raw = pd.read_csv(
    'adult.test',
    header=None,
    names=headers,
    sep=r',\s*',
    na_values='?',
    engine='python',
    skiprows=1
)
print('data loaded')
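# Quick sanity check, a sketch only: the counts assume the standard UCI Adult
# files (32,561 training rows and 16,281 test rows after the banner is skipped)
print(f"train: {train_raw.shape}, test: {test_raw.shape}")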
# 3. Concatenate train and test into one frame, re-indexed from 0
# (DataFrame.append is deprecated; pd.concat is the supported API)
dataset_raw = pd.concat([train_raw, test_raw], ignore_index=True)
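# The merged frame should contain every row from both files; an assert makes
# that assumption explicit (a sketch, not required for the pipeline)
assert len(dataset_raw) == len(train_raw) + len(test_raw)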
data_bin = pd.DataFrame()  # discretised (binned) versions of the features
data_con = pd.DataFrame()  # continuous / raw versions of the features
# Encode the label:
# - '>50K'  -> 1
# - '<=50K' -> 0
# (adult.test writes the labels with a trailing '.', hence the extra cases)
dataset_raw.loc[dataset_raw['predclass'] == '>50K', 'predclass'] = 1
dataset_raw.loc[dataset_raw['predclass'] == '<=50K', 'predclass'] = 0
dataset_raw.loc[dataset_raw['predclass'] == '>50K.', 'predclass'] = 1
dataset_raw.loc[dataset_raw['predclass'] == '<=50K.', 'predclass'] = 0
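# Equivalent one-liner, shown for reference only (do not run it after the
# .loc assignments above, since the labels are already integers by then):
# dataset_raw['predclass'] = (dataset_raw['predclass']
#                             .str.rstrip('.')
#                             .map({'>50K': 1, '<=50K': 0}))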
# Copy the encoded label into both working frames
data_bin['predclass'] = dataset_raw['predclass']
data_con['predclass'] = dataset_raw['predclass']
print("labels done")
# age: 1) keep as-is for the continuous frame; 2) discretise into 10 age bands
data_con['age'] = dataset_raw['age']
data_bin['age'] = pd.cut(dataset_raw['age'], 10)
print("age done")
# Regroup workclass into coarser categories
workclass_map = {
    'Without-pay': 'Not Working',
    'Never-worked': 'Not Working',
    'Federal-gov': 'Fed-gov',
    'State-gov': 'Non-Fed-gov',
    'Local-gov': 'Non-Fed-gov',
    'Self-emp-inc': 'Self-emp',
    'Self-emp-not-inc': 'Self-emp',
}
dataset_raw['workclass'] = dataset_raw['workclass'].replace(workclass_map)
# Copy the regrouped feature into both working frames
data_bin['workclass'] = dataset_raw['workclass']
data_con['workclass'] = dataset_raw['workclass']
print("workclass done")
# Regroup occupation into broader job families
occupation_map = {
    'Adm-clerical': 'Admin',
    'Armed-Forces': 'Military',
    'Protective-serv': 'Military',
    'Craft-repair': 'Manual Labour',
    'Farming-fishing': 'Manual Labour',
    'Handlers-cleaners': 'Manual Labour',
    'Machine-op-inspct': 'Manual Labour',
    'Transport-moving': 'Manual Labour',
    'Exec-managerial': 'Office Labour',
    'Sales': 'Office Labour',
    'Tech-support': 'Office Labour',
    'Other-service': 'Service',
    'Priv-house-serv': 'Service',
    'Prof-specialty': 'Professional',
}
dataset_raw['occupation'] = dataset_raw['occupation'].replace(occupation_map)
data_bin['occupation'] = dataset_raw['occupation']
data_con['occupation'] = dataset_raw['occupation']
print("occupation done")
# Regroup native-country into regional buckets
country_map = {
    'Cambodia': 'SE-Asia', 'Laos': 'SE-Asia', 'Philippines': 'SE-Asia',
    'Thailand': 'SE-Asia', 'Vietnam': 'SE-Asia',
    'Canada': 'British-Commonwealth', 'England': 'British-Commonwealth',
    'India': 'British-Commonwealth', 'Ireland': 'British-Commonwealth',
    'Scotland': 'British-Commonwealth',
    'Hong': 'China', 'Taiwan': 'China',
    'Columbia': 'South-America', 'Cuba': 'South-America',
    'Dominican-Republic': 'South-America', 'Ecuador': 'South-America',
    'El-Salvador': 'South-America', 'Guatemala': 'South-America',
    'Haiti': 'South-America', 'Honduras': 'South-America',
    'Jamaica': 'South-America', 'Mexico': 'South-America',
    'Nicaragua': 'South-America', 'Outlying-US(Guam-USVI-etc)': 'South-America',
    'Peru': 'South-America', 'Puerto-Rico': 'South-America',
    'Trinadad&Tobago': 'South-America',
    'France': 'Euro_Group_1', 'Germany': 'Euro_Group_1',
    'Holand-Netherlands': 'Euro_Group_1', 'Italy': 'Euro_Group_1',
    'Greece': 'Euro_Group_2', 'Hungary': 'Euro_Group_2',
    'Iran': 'Euro_Group_2', 'Poland': 'Euro_Group_2',
    'Portugal': 'Euro_Group_2', 'South': 'Euro_Group_2',
    'Yugoslavia': 'Euro_Group_2',
    'Japan': 'APAC',
    # 'China' and 'United-States' keep their own labels
}
dataset_raw['native-country'] = dataset_raw['native-country'].replace(country_map)
data_bin['native-country'] = dataset_raw['native-country']
data_con['native-country'] = dataset_raw['native-country']
print("native-country done")
# Regroup education levels; note the raw value is 'HS-grad' (lowercase g),
# so matching 'HS-Grad' would silently leave those rows untouched
education_map = {
    '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout',
    '1st-4th': 'Dropout', '5th-6th': 'Dropout', '7th-8th': 'Dropout',
    '9th': 'Dropout', 'Preschool': 'Dropout',
    'Assoc-acdm': 'Associate', 'Assoc-voc': 'Associate',
    'HS-grad': 'HS-Graduate', 'Some-college': 'HS-Graduate',
    'Prof-school': 'Professor',
    # 'Bachelors', 'Masters' and 'Doctorate' keep their own labels
}
dataset_raw['education'] = dataset_raw['education'].replace(education_map)
data_bin['education'] = dataset_raw['education']
data_con['education'] = dataset_raw['education']
print("education done")
# Regroup marital status
marital_map = {
    'Never-married': 'Never-Married',
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Not-Married',
    'Divorced': 'Separated',
    # 'Separated' and 'Widowed' keep their own labels
}
dataset_raw['marital-status'] = dataset_raw['marital-status'].replace(marital_map)
data_bin['marital-status'] = dataset_raw['marital-status']
data_con['marital-status'] = dataset_raw['marital-status']
print("marital-status done")
# fnlwgt ("final weight") is the census sampling weight, not a serial number
data_bin['fnlwgt'] = pd.cut(dataset_raw['fnlwgt'], 10)   # binned copy
data_con['fnlwgt'] = dataset_raw['fnlwgt']               # raw copy
print("fnlwgt done")
# education-num: raw copy and a 10-bin discretised copy
data_bin['education-num'] = pd.cut(dataset_raw['education-num'], 10)
data_con['education-num'] = dataset_raw['education-num']
print("education-num done")
# hours-per-week
data_bin['hours-per-week'] = pd.cut(dataset_raw['hours-per-week'], 10)
data_con['hours-per-week'] = dataset_raw['hours-per-week']
print("hours-per-week done")
# capital-gain
data_bin['capital-gain'] = pd.cut(dataset_raw['capital-gain'], 10)
data_con['capital-gain'] = dataset_raw['capital-gain']
print("capital-gain done")
# capital-loss
data_bin['capital-loss'] = pd.cut(dataset_raw['capital-loss'], 10)
data_con['capital-loss'] = dataset_raw['capital-loss']
print("capital-loss done")
# Copy the remaining categorical features unchanged into both frames
data_bin['race'] = data_con['race'] = dataset_raw['race']
data_bin['sex'] = data_con['sex'] = dataset_raw['sex']
data_bin['relation'] = data_con['relation'] = dataset_raw['relation']
print("race/sex/relation done")
# Interaction feature: age x hours-per-week
data_con['age-hours'] = data_con['age'] * data_con['hours-per-week']
data_bin['age-hours'] = pd.cut(data_con['age-hours'], 10)
print("age-hours done")
# Interaction feature: sex + marital-status (string concatenation)
data_bin['sex-marital'] = data_con['sex-marital'] = data_con['sex'] + data_con['marital-status']
print("sex-marital done")
# Choose the columns to one-hot encode
one_hot_cols = data_bin.columns.tolist()
# The label column must not be encoded
one_hot_cols.remove('predclass')
data_bin_encode = pd.get_dummies(data_bin, columns=one_hot_cols)
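# One-hot encoding expands every category (and every bin interval) into its
# own column, so the encoded frame is much wider than data_bin
print(f"data_bin: {data_bin.shape} -> encoded: {data_bin_encode.shape}")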
# Label-encode data_con: every column (including the numeric ones) is cast
# to string so LabelEncoder can assign an integer code per distinct value
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data_con = data_con.astype('str')
# Note: .apply(encoder.fit_transform) refits the encoder for each column,
# so the integer codes are only consistent within a column
data_con_encode = data_con.apply(encoder.fit_transform)
# Split back into train and test using the original row counts
# (slicing with .loc[0:32560] is inclusive on both ends, which would put
# row 32560 in both halves; .iloc with len(train_raw) avoids the overlap)
n_train = len(train_raw)
train = data_bin_encode.iloc[:n_train, :]
test = data_bin_encode.iloc[n_train:, :]
train = train.dropna(axis=0)
test = test.dropna(axis=0)
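# Sanity check: the two halves should partition the encoded frame exactly
# (the dummies contain no NaN, so dropna should discard nothing here)
assert len(train) + len(test) == len(data_bin_encode)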
from sklearn.svm import SVC
# Separate features and label
X_train = train.drop('predclass', axis=1)
Y_train = train['predclass'].astype('int64')  # cast label to integer
X_test = test.drop('predclass', axis=1)
Y_test = test['predclass'].astype('int64')    # cast label to integer
############################################################################
# 1. Create a scaler for feature standardisation
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
# 2. Fit the scaling parameters on the training set only
std_scaler.fit(X_train)
# 3. Apply the transform to both sets
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)
print('scaling done')
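# After standardisation each training column is ~zero-mean / unit-variance;
# spot-check the first few columns
print(X_train.mean(axis=0)[:5].round(2), X_train.std(axis=0)[:5].round(2))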
# 4. PCA for dimensionality reduction
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
# 5. Fit the projection on the training set (via singular value decomposition)
pca.fit(X_train)
# 6. Project both sets onto the 10 components
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
print('PCA done')
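# How much of the variance the 10 retained components keep
print(f"explained variance kept: {pca.explained_variance_ratio_.sum():.2%}")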
############################################################################
# SVM classifier
classifier = SVC(gamma='auto')
classifier.fit(X_train, Y_train)
train_score = classifier.score(X_train, Y_train)
test_score = classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# Random forest
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, Y_train)
train_score = rf_classifier.score(X_train, Y_train)
test_score = rf_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# Logistic regression
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(solver='lbfgs', C=1.0, max_iter=10000)
lr_classifier.fit(X_train, Y_train)
train_score = lr_classifier.score(X_train, Y_train)
test_score = lr_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# K-nearest neighbours
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, Y_train)
train_score = knn_classifier.score(X_train, Y_train)
test_score = knn_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
# Naive Bayes -- GaussianNB (BernoulliNB/MultinomialNB were imported but unused)
from sklearn.naive_bayes import GaussianNB
gau_classifier = GaussianNB()
gau_classifier.fit(X_train, Y_train)
train_score = gau_classifier.score(X_train, Y_train)
test_score = gau_classifier.score(X_test, Y_test)
print(f"train accuracy: {train_score*100:8.2f}%, test accuracy: {test_score*100:8.2f}%")
from sklearn.metrics import classification_report
# Compute predictions first, then the per-class precision/recall report
pre_gau = gau_classifier.predict(X_test)
report_gau = classification_report(Y_test, pre_gau)
print(report_gau)
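# A confusion matrix complements the per-class report above
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test, pre_gau))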