#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @Time : 2018/4/1 18:38
# @Author : liujiantao
# @Site : model training
# @File : ljt_model_train.py
# @Software: PyCharm
import traceback
import numpy as np
from bp_network import NN
import xgboost as xgb
import pandas as pd
import time
from cnn_predict import CNNPredict
from config import Config
from data_util import DataUtil
from feature_integrate02 import FeatureIntegrate02
def data_pre_save(df, path_test_out="model/test.csv"):
df.to_csv(path_or_buf=path_test_out, index=False, sep=',', header=True)
config = Config()
f_t_Ig = FeatureIntegrate02()
d_t = DataUtil()
class LjtModelTrain(object):
    '''
    Trains and fuses several models (random forest, XGBoost, LightGBM,
    gradient boosting regression, a BP network and a CNN) and writes the
    final predictions to model/test.csv via data_pre_save().
    '''
    all_features_list, user_Y_list, min_Y, max_Y = f_t_Ig.get_all_features()
def get_feature(self):
f_t_Ig = FeatureIntegrate02()
all_features_list, user_Y_list, pre_features_list, pre_Y, act_class_Y = f_t_Ig.get_all_features()
return all_features_list, user_Y_list, pre_features_list, pre_Y, act_class_Y
def random_forest(self):
target_names = ['label is 0', 'label is 1']
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = d_t.split_data(self.all_features_list, self.act_class_Y,
test_size=0.33)
rf2 = RandomForestClassifier(n_estimators=60, max_depth=13, min_samples_split=120,
min_samples_leaf=20, max_features=7, oob_score=True, random_state=10)
rf2.fit(X_train, y_train)
print(rf2.oob_score_)
f_t_Ig.d_h.evaluate_function(rf2, X_test, y_test, target_names)
return rf2
def xgb_classifier(self):
target_names = ['label is 0', 'label is 1']
dataMat = np.array(self.all_features_list)
labelMat = np.array(self.act_class_Y)
X_train, X_test, y_train, y_test = d_t.split_data(dataMat, labelMat, test_size=0.33)
        # Use a classifier here: the predictions are passed to
        # classification_report below, which expects discrete class labels.
        clf = xgb.XGBClassifier(learning_rate=0.02,
n_estimators=30,
max_depth=6,
min_child_weight=1,
gamma=0.15,
subsample=0.8,
colsample_bytree=0.8,
nthread=4,
scale_pos_weight=1,
seed=27)
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
eval_set=[(X_test, y_test)])
ytestPre = clf.predict(X_test)
from sklearn.metrics import classification_report
f_t_Ig.d_h.print_str += classification_report(y_test, ytestPre, target_names=target_names)
self.feature_importance(clf)
        # print('############################### grid search ###################################')
        # model_gbr_GridSearch = xgb.XGBClassifier()
        # # Parameter grid; see http://www.cnblogs.com/DjangoBlog/p/6201663.html
        # param_grid = {'n_estimators': range(30, 81, 10),
        #               'learning_rate': [0.2, 0.1, 0.05, 0.02, 0.01],
        #               'max_depth': [6, 8, 10],
        #               'min_child_weight': [4, 5, 6],
        #               'gamma': [i / 10.0 for i in range(0, 5)],
        #               'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
        #               'scale_pos_weight': [0.8, 0.5, 0.3, 1]
        #               }
        # # Run the grid search
        # from sklearn.model_selection import GridSearchCV
        # estimator = GridSearchCV(model_gbr_GridSearch, param_grid)
        # estimator.fit(X_train, y_train.ravel())
        # print('best params:', estimator.best_params_)
        # # e.g. {'max_depth': 6, 'n_estimators': 30, 'max_features': 0.8, 'learning_rate': 0.02, 'min_samples_leaf': 3}
        # print('score after tuning:', estimator.score(X_test, y_test.ravel()))
return clf
    # Custom evaluation metric for xgb.train: root mean square percentage error
    @staticmethod
    def rmspe_xg(yhat, y):
        # y is a DMatrix; get_label() returns a 1-D array of (log) targets
        y = np.exp(y.get_label())
        yhat = np.exp(yhat)
        # Percentage error: divide by the true values (the original divided by
        # nothing, which is plain RMSE, not the RMSPE the name promises)
        rmspe = np.sqrt(np.mean(((y - yhat) / y) ** 2))
        return "rmspe", rmspe
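    # A minimal usage sketch for the metric above (hypothetical: it assumes the
    # labels were np.log-transformed before training, matching the np.exp in
    # rmspe_xg, and that params/X_train/y_train/X_test/y_test exist):
    # dtrain = xgb.DMatrix(X_train, label=np.log(y_train))
    # dvalid = xgb.DMatrix(X_test, label=np.log(y_test))
    # booster = xgb.train(params, dtrain, num_boost_round=300,
    #                     evals=[(dvalid, 'valid')], feval=LjtModelTrain.rmspe_xg)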
    def predict_with_XGBoosting(self, test_feature):
        """
        Regression: fit an XGBoost regressor on all features/targets and
        predict on the given test features.
        :param test_feature:
        :return: predictions for test_feature
        """
        # Build the training feature and target arrays
dataMat = np.array(self.all_features_list)
labelMat = np.array(self.user_Y_list)
test_feature = np.array(test_feature)
# X_train, X_test, y_train, y_test = d_t.split_data(dataMat, labelMat, test_size=0.003)
        # Train the XGBoost regressor
model = xgb.XGBRegressor(learning_rate=0.02, n_estimators=1200, max_depth=16,
min_child_weight=8, gamma=0.15, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1,
seed=27)
model.fit(dataMat, labelMat)
self.feature_importance(model)
        # # Plot feature importances
        # from xgboost import plot_importance
        # plot_importance(model)
        # import matplotlib.pyplot as plt
        # plt.show()
        # Predict on the test features
test_probs = model.predict(test_feature)
return test_probs
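    # Alternative sketch (hypothetical): hold out a validation slice for early
    # stopping instead of fitting on every row, mirroring xgb_classifier above:
    # X_tr, X_val, y_tr, y_val = d_t.split_data(dataMat, labelMat, test_size=0.1)
    # model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], eval_metric="rmse",
    #           early_stopping_rounds=20)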
    def lightgbm_train(self):
        """
        Train a LightGBM binary classifier with a held-out evaluation set.
        :return: the fitted classifier
        """
dataMat = np.array(self.all_features_list)
labelMat = np.array(self.act_class_Y)
X_train, X_test, y_train, y_test = d_t.split_data(dataMat, labelMat, test_size=0.33)
import lightgbm
clf = lightgbm.LGBMClassifier(boosting_type='gbdt', num_leaves=8, objective='binary',
max_depth=6, learning_rate=0.2, n_estimators=300,
metric="auc",
reg_alpha=0.02,
)
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
eval_set=[(X_test, y_test)])
self.feature_importance(clf)
        # # print('############################### grid search ###################################')
        # model_gbr_GridSearch = lightgbm.LGBMClassifier()
        # # Parameter grid; see http://www.cnblogs.com/DjangoBlog/p/6201663.html
        # param_grid = {'n_estimators': range(30, 81, 10),
        #               'learning_rate': [0.2, 0.1, 0.05, 0.02, 0.01],
        #               'max_depth': [6, 8, 10],
        #               'num_leaves': [6, 8, 10],
        #               'reg_alpha': [0.02, 0.001, 0.005, 0.01, 0.05]
        #               }
        # # Run the grid search
        # from sklearn.model_selection import GridSearchCV
        # estimator = GridSearchCV(model_gbr_GridSearch, param_grid)
        # estimator.fit(X_train, y_train.ravel())
        # f_t_Ig.d_h.print_str += 'best params:' + str(estimator.best_params_)
return clf
    def feature_importance(self, clf):
        # Pair each feature name with its importance and sort descending
        importances = clf.feature_importances_
        ranked = sorted(zip(config.predictors1, importances), key=lambda t: t[1], reverse=True)
        f_t_Ig.d_h.print_str += str(ranked)
    def bp_nn_predict(self):
        """
        BP neural network
        :return: the trained network
        """
        # X_train, X_test, y_train, y_test = d_t.split_data(self.pre_features_list, self.act_Y, test_size=0.1)
        n = NN(2, 6, 1)
        # Train it on the training patterns
        n.train(self.pre_features_list, self.act_Y, 500, 0.01)
        # count, predict = n.testP(X_test, y_test)
        # print(count)
return n
def sk_GradientBoostingRegressor(self):
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingRegressor
dataMat = np.array(self.pre_features_list)
labelMat = np.array(self.act_Y)
        # Random train/test split
        train_x_disorder, test_x_disorder, train_y_disorder, test_y_disorder = d_t.split_data(dataMat,
                                                                                              labelMat,
                                                                                              test_size=0.33)
        # Standardize features and target; fit the scalers on the training
        # split only and apply the same transform to the test split (the
        # original fit on the full data and scored on unscaled test data).
        ss_x = preprocessing.StandardScaler()
        train_x_disorder = ss_x.fit_transform(train_x_disorder)
        test_x_disorder = ss_x.transform(test_x_disorder)
        ss_y = preprocessing.StandardScaler()
        train_y_disorder = ss_y.fit_transform(train_y_disorder.reshape(-1, 1))
        test_y_disorder = ss_y.transform(test_y_disorder.reshape(-1, 1))
        # Keep the scalers so callers can transform inputs and
        # inverse-transform predictions (see fuse_model)
        self.ss_x = ss_x
        self.ss_y = ss_y
        model_gbr_disorder = GradientBoostingRegressor(max_depth=16, max_features=0.8, n_estimators=30,
                                                       learning_rate=0.02,
                                                       min_samples_leaf=3)
        model_gbr_disorder.fit(train_x_disorder, train_y_disorder.ravel())
        gbr_score_disorder = model_gbr_disorder.score(test_x_disorder, test_y_disorder.ravel())
        f_t_Ig.d_h.print_str += str('sklearn Integrated regression model score = ') + str(
            gbr_score_disorder)  # previously observed around 0.853817723868
        # print('############################### grid search ###################################')
        # model_gbr_GridSearch = GradientBoostingRegressor()
        # # Parameter grid; see http://www.cnblogs.com/DjangoBlog/p/6201663.html
        # param_grid = {'n_estimators': range(20, 81, 10),
        #               'learning_rate': [0.2, 0.1, 0.05, 0.02, 0.01],
        #               'max_depth': [4, 6, 8],
        #               'min_samples_leaf': [3, 5, 9, 14],
        #               'max_features': [0.8, 0.5, 0.3, 0.1]}
        # # Run the grid search
        # from sklearn.model_selection import GridSearchCV
        # estimator = GridSearchCV(model_gbr_GridSearch, param_grid)
        # estimator.fit(train_x_disorder, train_y_disorder.ravel())
        # print('best params:', estimator.best_params_)
        # # {'max_depth': 6, 'n_estimators': 30, 'max_features': 0.8, 'learning_rate': 0.02, 'min_samples_leaf': 3}
        #
        # print('score after tuning:', estimator.score(test_x_disorder, test_y_disorder.ravel()))
        # Predict with the default-parameter model
        # gbr_pridict_disorder = model_gbr_disorder.predict(test_x_disorder)
return model_gbr_disorder
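    # Caller-side sketch (assumes the ss_x/ss_y scalers stored above;
    # raw_feature_rows is a placeholder name):
    # gbr = self.sk_GradientBoostingRegressor()
    # y_scaled = gbr.predict(self.ss_x.transform(raw_feature_rows))
    # y = self.ss_y.inverse_transform(y_scaled.reshape(-1, 1)).ravel()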
def cnn_model(self, test_features):
cnn = CNNPredict(config)
return cnn.fit(self.all_features_list, self.user_Y_list, test_features)
def fuse_model(self):
pre_result = {
"Id": [],
"Pred": [],
"rank_pro": []
}
# r_f = self.random_forest()
r_f = self.xgb_classifier()
# r_f = self.lightgbm_train()
# bp = self.bp_nn_predict()
gbr = self.sk_GradientBoostingRegressor()
userid_list, test_features = f_t_Ig.get_test_features02()
# test_features = [x[1:] for x in test_features_list]
        pre_classifies = r_f.predict_proba(test_features).tolist()
        j = 0
        for i in range(len(userid_list)):
            userid = userid_list[i]
            pre_result['Id'].append(userid)
            rank_pro = pre_classifies[i][1]
            if pre_classifies[i][0] > 0.49:
                pre_result['Pred'].append(0)
            else:
                # y1 = bp.update(test_features[i])[0]
                # Scale the features and invert the target scaling, since the
                # GBR was fit on standardized data (use row i, not a fixed row)
                x_scaled = self.ss_x.transform([test_features[i]])
                y1 = float(self.ss_y.inverse_transform(
                    gbr.predict(x_scaled).reshape(-1, 1)).ravel()[0])
                pre_result['Pred'].append(abs(round(y1, 9)))
                if j < 50:
                    f_t_Ig.d_h.print_str += "Id=" + str(userid) + ",Pred=" + str(y1) + " | "
                    j += 1
            pre_result['rank_pro'].append(float(rank_pro))
df = pd.DataFrame(pre_result).sort_values(by='rank_pro')
data_pre_save(df=df[['Id', 'Pred']])
        f_t_Ig.d_h.print_str += " features len = " + str(len(test_features))
return pre_result
def fuse_cnn(self):
pre_result = {
"Id": [],
"Pred": [],
"rank_pro": []
}
# r_f = self.random_forest()
r_f = self.xgb_classifier()
# r_f = self.lightgbm_train()
# bp = self.bp_nn_predict()
userid_list, test_features = f_t_Ig.get_test_features02()
        pre_classifies = r_f.predict_proba(test_features).tolist()
        prediction_value = self.cnn_model(test_features)
        j = 0
        for i in range(len(userid_list)):
            try:
                userid = userid_list[i]
                rank_pro = pre_classifies[i][1]
                if pre_classifies[i][0] > 0.49:
                    pred = 0
                else:
                    y1 = abs(round(prediction_value[i][0], 9))
                    if j < 50:
                        f_t_Ig.d_h.print_str += "Id=" + str(userid) + ",Pred=" + str(y1) + " | "
                        j += 1
                    if y1 <= 0:
                        y1 = 0
                    pred = y1
                    rank_pro += y1
                # Append all three columns together so the result lists stay
                # the same length even when a row raises an exception
                pre_result['Id'].append(userid)
                pre_result['Pred'].append(pred)
                pre_result['rank_pro'].append(float(rank_pro))
            except Exception:
                traceback.print_exc()
df = pd.DataFrame(pre_result).sort_values(by='rank_pro')
data_pre_save(df=df[['Id', 'Pred']])
        f_t_Ig.d_h.print_str += "userid_list=" + str(len(userid_list))
# f_t_Ig.d_h.my_print(pre_result)
return pre_result
def fuse_cnn02(self):
# r_f = self.random_forest()
# r_f = self.xgb_classifier()
# r_f = self.lightgbm_train()
# bp = self.bp_nn_predict()
userid_list, test_features = f_t_Ig.get_test_features02()
pre_result = {
"Id": userid_list,
"Pred": []
}
# pre_classfies = r_f.predict_proba(test_features).tolist()
start01 = time.time()
prediction_value = self.cnn_model(test_features)
f_t_Ig.d_h.print_str += " cnn_model cost time: " + str(time.time() - start01) + " "
start01 = time.time()
df_Pred = pd.DataFrame(prediction_value, columns=['Pred'])
df_Pred['Id'] = userid_list
        # Clamp: flip negative predictions positive and shrink values over 100
        df_Pred['Pred'] = df_Pred.Pred.apply(lambda x: abs(x) if x < 0 else (abs(x / 100) if x > 100 else x))
df_Pred = df_Pred.sort_values(by='Pred')
data_pre_save(df=df_Pred[['Id', 'Pred']])
        f_t_Ig.d_h.print_str += "userid_list=" + str(len(userid_list))
# f_t_Ig.d_h.my_print(pre_result)
f_t_Ig.d_h.print_str += " write csv cost time: " + str(time.time() - start01) + " "
def fuse_xgboost_pre(self):
# r_f = self.random_forest()
# r_f = self.xgb_classifier()
# r_f = self.lightgbm_train()
# bp = self.bp_nn_predict()
userid_list, test_features = f_t_Ig.get_test_features02()
# pre_result = {
# "Id": userid_list,
# "Pred": []
# }
# pre_classfies = r_f.predict_proba(test_features).tolist()
start01 = time.time()
prediction_value = self.predict_with_XGBoosting(test_features)
f_t_Ig.d_h.print_str += " predict_with_XGBoosting cost time: " + str(time.time() - start01) + " "
start01 = time.time()
df_Pred = pd.DataFrame(prediction_value, columns=['Pred'])
df_Pred['Id'] = userid_list
        # Clamp out-of-range predictions into [min_Y, max_Y] with a little
        # jitter (the original had a misplaced parenthesis that rounded a tuple)
        df_Pred['Pred'] = df_Pred.Pred.apply(
            lambda x: round(abs(self.min_Y + np.random.rand()), 9) if x < self.min_Y
            else (round(self.max_Y - np.random.rand(), 9) if x > self.max_Y else round(x, 9)))
df_Pred = df_Pred.sort_values(by='Pred')
data_pre_save(df=df_Pred[['Id', 'Pred']])
        f_t_Ig.d_h.print_str += "userid_list=" + str(len(userid_list)) + " | self.min_Y = " + str(self.min_Y) \
                                + " self.max_Y=" + str(self.max_Y) + " | "
f_t_Ig.d_h.print_str += " write csv cost time: " + str(time.time() - start01) + " "
if __name__ == '__main__':
print("start")
config.start_time = time.time()
ljtm = LjtModelTrain()
# data = ljtm.f_t_Ig.d_h.get_test_data(config.path_test01)
# userid_list = ljtm.f_t_Ig.d_h.get_userlist(data)
# ljtm.f_t_Ig.d_h.print_str +=" userid_list= " + str(len(userid_list))+" test_features_list= "+str(len(data['TERMINALNO'].tolist()))
# ljtm.bp_nn_predict()
# ljtm.fuse_model()
# res=ljtm.fuse_cnn()
# res=ljtm.fuse_cnn02()
res = ljtm.fuse_xgboost_pre()
# ljtm.sk_GradientBoostingRegressor()
print(str(f_t_Ig.d_h.print_str) + "all cost time: " + str(time.time() - config.start_time))
# data_pre_deal()
# print(f_t_Ig.d_h.gini_normalized(actual=ljtm.all_Y,pred=res))