LightGBM对categorical feature有专门的处理,但需要显式标明哪些特征是categorical类型;通过config文件运行时,也有对应的参数categorical_feature,详见LightGBM Parameters文档。
如果使用Python API,则通过pandas把相应的列转换为category类型来标明,如下:
import pickle
import datetime
import json
import xgboost as xgb
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import cross_validation
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
def categorize(X, cols):
    """Cast the given columns of a DataFrame to the pandas ``category`` dtype.

    The conversion happens in place: ``X`` itself is modified and that same
    object is handed back to the caller.

    Args:
        X: pandas DataFrame whose columns should be converted.
        cols: iterable of column labels in ``X`` to convert.

    Returns:
        ``X``, with every column named in ``cols`` stored as ``category``.
    """
    for name in cols:
        as_category = X[name].astype("category")
        X[name] = as_category
    return X
def train_lgb(path_X, path_y):
    """Train a binary LightGBM classifier from CSV files and save the model.

    The features are split 80/20 into train and test sets with a fixed
    random seed. The train and test AUC and the generated model file name
    are printed, and the underlying booster is written to
    ``DIR + "model/" + model_name``.

    NOTE(review): ``DIR`` and ``categorize`` are not defined inside this
    function; they are expected to be available at module level.

    Args:
        path_X: path of the CSV file holding the extracted features. It must
            contain the columns ``x1``, ``x2`` and ``pv``.
        path_y: path of the header-less CSV file holding the labels; only
            its first column is used.

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    # Read the pre-extracted features from CSV; if the CSV does not exist,
    # gen_data() has to be run first.
    X = pd.read_csv(path_X)
    cols = ['x1', 'x2']  # example only
    X = categorize(X, cols)  # columns that must be categorical
    # 'pv' was mistakenly extracted as new_resblock_pv, so it is left out for
    # now; remove this line once that has been corrected.
    del X['pv']
    # The label file has no header row; reading one would drop the first label.
    label = pd.read_csv(path_y, header=None)
    label = label[0].tolist()  # plain list; per the original note this avoids a warning
    train_X, test_X, train_y, test_y = train_test_split(
        X, label, test_size=0.2, random_state=2019)
    # Categorical features have to be marked for LightGBM to use them as such.
    # fit() is called without categorical_feature (left at 'auto' per the
    # original note), so the category dtype set above is what marks them.
    clf = lgb.LGBMClassifier(objective='binary', max_depth=4,
                             learning_rate=0.3, n_estimators=300,
                             verbosity=-1, metric='auc')
    clf.fit(train_X, train_y)
    pred = clf.predict_proba(test_X)[:, 1]
    # Compute AUC on the held-out split and on the training split.
    auc = metrics.roc_auc_score(test_y, pred)
    pred_train = clf.predict_proba(train_X)[:, 1]
    auc_train = metrics.roc_auc_score(train_y, pred_train)
    # Single-argument print(...) emits the same text on Python 2 and Python 3;
    # the former print statements were a SyntaxError on Python 3.
    print("train_auc: %s" % (auc_train,))
    print("test_auc: %s" % (auc,))
    # Save the model under a timestamped file name.
    model_name = 'lgb' + datetime.datetime.now().strftime('%Y-%m-%d-%H_%M') + '.model'
    print(model_name)
    clf.booster_.save_model(DIR + "model/" + model_name)
    return clf