# D - dimensionality reduction & discretisation (Adult income dataset)

#合并训练集合与测试集合

#1.读取数据

# Column names for the UCI Adult (census income) data files.
# NOTE(review): 'captial-gain'/'captial-loss' misspell "capital" and
# 'relation' abbreviates "relationship"; kept verbatim because all of the
# downstream feature-engineering code keys on these exact strings.
headers = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relation', 'race', 'sex',
    'captial-gain', 'captial-loss', 'hours-per-week',
    'native-country', 'predclass',
]

#2.合并数据

import pandas as pd

# Load the raw train/test files.  The files carry no header row, fields are
# separated by ", " (hence the regex sep, which requires the python engine),
# and missing values are marked with '?'.
# NOTE(review): skiprows=1 is correct for adult.test (its first line is a
# banner), but adult.data normally has no banner, so this silently drops the
# first real training record — confirm against the actual files.
train_raw = pd.read_csv(
    'adult.data',
    header=None,
    names=headers,
    sep=r',\s',      # raw string: '\s' in a plain literal is an invalid escape
    na_values='?',
    engine='python',
    skiprows=1
)
test_raw = pd.read_csv(
    'adult.test',
    header=None,
    names=headers,
    sep=r',\s',
    na_values='?',
    engine='python',
    skiprows=1
)
print('数据加载完毕')

# Stack train on top of test.  DataFrame.append was deprecated in pandas 1.4
# and removed in 2.0, so use pd.concat; reset_index(drop=True) rebuilds a
# clean 0..n-1 index in one step instead of reset + drop of the 'index' column.
dataset_raw = pd.concat([train_raw, test_raw])
dataset_raw.reset_index(drop=True, inplace=True)

# Buffers for the two engineered feature sets: data_bin holds discretised
# (binned) versions, data_con the continuous/raw versions.
data_bin = pd.DataFrame()
data_con = pd.DataFrame()

# Encode the income label as an integer:
#   '>50K'  / '>50K.'  (test-file spelling) -> 1
#   '<=50K' / '<=50K.'                      -> 0
income_map = {'>50K': 1, '<=50K': 0, '>50K.': 1, '<=50K.': 0}
for raw_label, encoded in income_map.items():
    dataset_raw.loc[dataset_raw['predclass'] == raw_label, 'predclass'] = encoded

# Copy the encoded label into both feature buffers.
data_bin['predclass'] = dataset_raw['predclass']
data_con['predclass'] = dataset_raw['predclass']
print("缓冲完毕")

# Age: raw value for the continuous set, 10 equal-width bands for the
# discrete set.
data_con['age'] = dataset_raw['age']
data_bin['age'] = pd.cut(dataset_raw['age'], 10)
print("age over")

# Collapse the raw workclass categories into coarser groups.
workclass_groups = {
    'Without-pay': 'Not Working',
    'Never-worked': 'Not Working',
    'Federal-gov': 'Fed-gov',
    'State-gov': 'Non-Fed-gov',
    'Local-gov': 'Non-Fed-gov',
    'Self-emp-inc': 'Self-emp',
    'Self-emp-not-inc': 'Self-emp',
}
for raw_value, group in workclass_groups.items():
    dataset_raw.loc[dataset_raw['workclass'] == raw_value, 'workclass'] = group

# Mirror the regrouped column into both feature buffers.
data_bin['workclass'] = dataset_raw['workclass']
data_con['workclass'] = dataset_raw['workclass']
print("workclass over")

# Collapse the raw occupation categories into a handful of broad groups.
occupation_groups = {
    'Adm-clerical': 'Admin',
    'Armed-Forces': 'Military',
    'Craft-repair': 'Manual Labour',
    'Exec-managerial': 'Office Labour',
    'Farming-fishing': 'Manual Labour',
    'Handlers-cleaners': 'Manual Labour',
    'Machine-op-inspct': 'Manual Labour',
    'Other-service': 'Service',
    'Priv-house-serv': 'Service',
    'Prof-specialty': 'Professional',
    'Protective-serv': 'Military',
    'Sales': 'Office Labour',
    'Tech-support': 'Office Labour',
    'Transport-moving': 'Manual Labour',
}
for raw_value, group in occupation_groups.items():
    dataset_raw.loc[dataset_raw['occupation'] == raw_value, 'occupation'] = group

# Mirror the regrouped column into both feature buffers.
data_bin['occupation'] = dataset_raw['occupation']
data_con['occupation'] = dataset_raw['occupation']
print("occupation over")

# Collapse the 41 native-country values into a few geopolitical regions.
# Identity entries (e.g. 'China' -> 'China') are kept to mirror the
# original mapping exactly.
country_groups = {
    'Cambodia': 'SE-Asia',
    'Canada': 'British-Commonwealth',
    'China': 'China',
    'Columbia': 'South-America',
    'Cuba': 'South-America',
    'Dominican-Republic': 'South-America',
    'Ecuador': 'South-America',
    'El-Salvador': 'South-America',
    'England': 'British-Commonwealth',
    'France': 'Euro_Group_1',
    'Germany': 'Euro_Group_1',
    'Greece': 'Euro_Group_2',
    'Guatemala': 'South-America',
    'Haiti': 'South-America',
    'Holand-Netherlands': 'Euro_Group_1',
    'Honduras': 'South-America',
    'Hong': 'China',
    'Hungary': 'Euro_Group_2',
    'India': 'British-Commonwealth',
    'Iran': 'Euro_Group_2',
    'Ireland': 'British-Commonwealth',
    'Italy': 'Euro_Group_1',
    'Jamaica': 'South-America',
    'Japan': 'APAC',
    'Laos': 'SE-Asia',
    'Mexico': 'South-America',
    'Nicaragua': 'South-America',
    'Outlying-US(Guam-USVI-etc)': 'South-America',
    'Peru': 'South-America',
    'Philippines': 'SE-Asia',
    'Poland': 'Euro_Group_2',
    'Portugal': 'Euro_Group_2',
    'Puerto-Rico': 'South-America',
    'Scotland': 'British-Commonwealth',
    'South': 'Euro_Group_2',
    'Taiwan': 'China',
    'Thailand': 'SE-Asia',
    'Trinadad&Tobago': 'South-America',
    'United-States': 'United-States',
    'Vietnam': 'SE-Asia',
    'Yugoslavia': 'Euro_Group_2',
}
for raw_value, region in country_groups.items():
    dataset_raw.loc[dataset_raw['native-country'] == raw_value, 'native-country'] = region

# Mirror the regrouped column into both feature buffers.
data_bin['native-country'] = dataset_raw['native-country']
data_con['native-country'] = dataset_raw['native-country']
print("native-country over")

# Collapse the raw education levels into coarser groups.
# Bug fix: the original keyed on 'HS-Grad', but the UCI Adult files spell the
# value 'HS-grad' (lowercase g), so high-school graduates were never
# regrouped.  Both spellings are mapped here to be safe.
education_groups = {
    '10th': 'Dropout',
    '11th': 'Dropout',
    '12th': 'Dropout',
    '1st-4th': 'Dropout',
    '5th-6th': 'Dropout',
    '7th-8th': 'Dropout',
    '9th': 'Dropout',
    'Assoc-acdm': 'Associate',
    'Assoc-voc': 'Associate',
    'Bachelors': 'Bachelors',
    'Doctorate': 'Doctorate',
    'HS-Grad': 'HS-Graduate',
    'HS-grad': 'HS-Graduate',   # actual spelling in the UCI data files
    'Masters': 'Masters',
    'Preschool': 'Dropout',
    'Prof-school': 'Professor',
    'Some-college': 'HS-Graduate',
}
for raw_value, group in education_groups.items():
    dataset_raw.loc[dataset_raw['education'] == raw_value, 'education'] = group

# Mirror the regrouped column into both feature buffers.
data_bin['education'] = dataset_raw['education']
data_con['education'] = dataset_raw['education']
print("education over")

# Collapse the raw marital-status categories into coarser groups.
# Mapping order matches the original sequence of .loc assignments.
marital_groups = {
    'Never-married': 'Never-Married',
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Not-Married',
    'Separated': 'Separated',
    'Divorced': 'Separated',
    'Widowed': 'Widowed',
}
for raw_value, group in marital_groups.items():
    dataset_raw.loc[dataset_raw['marital-status'] == raw_value, 'marital-status'] = group

# Mirror the regrouped column into both feature buffers.
data_bin['marital-status'] = dataset_raw['marital-status']
data_con['marital-status'] = dataset_raw['marital-status']
print("marital-status over")

# Numeric columns: keep the raw values in data_con and a 10-bucket
# equal-width discretisation in data_bin.  (fnlwgt = "final weight",
# the census sampling weight.)  Column order matters: it fixes the
# one-hot column layout produced later by get_dummies.
numeric_cols = ['fnlwgt', 'education-num', 'hours-per-week',
                'captial-gain', 'captial-loss']
for numeric_col in numeric_cols:
    data_bin[numeric_col] = pd.cut(dataset_raw[numeric_col], 10)
    data_con[numeric_col] = dataset_raw[numeric_col]
    print(numeric_col + " over")

# Categorical columns copied verbatim into both buffers.
for cat_col in ['race', 'sex', 'relation']:
    data_bin[cat_col] = data_con[cat_col] = dataset_raw[cat_col]
print("race/sex/relation over")

# Engineered feature: age x hours-per-week, continuous plus binned.
data_con['age-hours'] = data_con['age'] * data_con['hours-per-week']
data_bin['age-hours'] = pd.cut(data_con['age-hours'], 10)
print("age-hours over")

# Engineered feature: string concatenation of sex and marital-status.
data_bin['sex-marital'] = data_con['sex-marital'] = data_con['sex'] + data_con['marital-status']
print("sex-marital over")

# One-hot encode every binned feature except the label column.
one_hot_cols = [col for col in data_bin.columns if col != 'predclass']
data_bin_encode = pd.get_dummies(data_bin, columns=one_hot_cols)

# Label-encode the continuous frame.  Everything is cast to string first
# (NaN becomes the literal 'nan'), then a LabelEncoder is re-fit per column
# by apply().
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data_con = data_con.astype('str')
data_con_encode = data_con.apply(encoder.fit_transform)

# Split the merged, encoded frame back into train and test partitions.
# Bug fix: the original used label slicing (.loc[0:32560] / .loc[32560:]),
# which is end-INCLUSIVE, so row 32560 landed in BOTH partitions — a one-row
# train/test leak.  iloc is end-exclusive, giving a clean disjoint split.
# NOTE(review): the boundary 32560 assumes the adult.data portion occupies
# rows 0..32559 after the merge (one row was skipped on load) — confirm
# against the actual file sizes.
train = data_bin_encode.iloc[:32560, :]
test = data_bin_encode.iloc[32560:, :]

# Drop any rows that still carry missing values.
train = train.dropna(axis=0)
test = test.dropna(axis=0)

from sklearn.svm import SVC

# Feature matrices and label vectors; the label is cast to int for sklearn.
X_train = train.drop('predclass', axis=1)
Y_train = train['predclass'].astype('int64')
X_test = test.drop('predclass', axis=1)
Y_test = test['predclass'].astype('int64')

############################################################################
# 1. Standardise the features.  The scaler is fit on the training set only,
#    then the same transform is applied to both sets (no test-set leakage).
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)
print('归一化结束')

# 2. Project onto the top 10 principal components (SVD fit on train only).
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
print('降维完成')

############################################################################
# 3. SVM baseline on the reduced features.
# Bug fix: the original scored the test set twice (once into an unused
# `score` variable), doubling the most expensive evaluation; each split is
# now scored exactly once.
classifier = SVC(gamma='auto')
classifier.fit(X_train, Y_train)
train_score = classifier.score(X_train, Y_train)
test_score = classifier.score(X_test, Y_test)
print(F"训练集准确率:{train_score*100:8.2f}% ,测试集准确率:{test_score*100:8.2f}%")

# Random-forest baseline on the same PCA-reduced features.
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X_train, Y_train)

# Accuracy on each split (train first, matching the report format below).
train_score, test_score = (rf_classifier.score(X_train, Y_train),
                           rf_classifier.score(X_test, Y_test))
print(F"训练集准确率:{train_score*100:8.2f}% ,测试集准确率:{test_score*100:8.2f}%")

# Logistic-regression baseline (lbfgs solver; generous iteration budget so
# the optimiser converges on the standardised features).
from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression(solver='lbfgs', C=1.0, max_iter=10000)
lr_classifier.fit(X_train, Y_train)

# Accuracy on each split.
train_score, test_score = (lr_classifier.score(X_train, Y_train),
                           lr_classifier.score(X_test, Y_test))
print(F"训练集准确率:{train_score*100:8.2f}% ,测试集准确率:{test_score*100:8.2f}%")

# k-nearest-neighbours baseline (k = 3) on the PCA-reduced features.
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(X_train, Y_train)

# Accuracy on each split.
train_score, test_score = (knn_classifier.score(X_train, Y_train),
                           knn_classifier.score(X_test, Y_test))
print(F"训练集准确率:{train_score*100:8.2f}% ,测试集准确率:{test_score*100:8.2f}%")

# Gaussian naive-Bayes baseline, plus a per-class classification report.
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

gau_classifier = GaussianNB()
gau_classifier.fit(X_train, Y_train)

# Accuracy on each split.
train_score, test_score = (gau_classifier.score(X_train, Y_train),
                           gau_classifier.score(X_test, Y_test))
print(F"训练集准确率:{train_score*100:8.2f}% ,测试集准确率:{test_score*100:8.2f}%")

from sklearn.metrics import classification_report

# Precision / recall / F1 per class on the held-out set.
pre_gau = gau_classifier.predict(X_test)
report_gau = classification_report(Y_test, pre_gau)
print(report_gau)

©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 219,490评论 6 508
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 93,581评论 3 395
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 165,830评论 0 356
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,957评论 1 295
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,974评论 6 393
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,754评论 1 307
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 40,464评论 3 420
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 39,357评论 0 276
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,847评论 1 317
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,995评论 3 338
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 40,137评论 1 351
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,819评论 5 346
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 41,482评论 3 331
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 32,023评论 0 22
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 33,149评论 1 272
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 48,409评论 3 373
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 45,086评论 2 355

推荐阅读更多精彩内容