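# NOTE: the line originally here was a code-hosting page banner ("code pull complete, the page will refresh automatically"); it is not part of the script.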
# 导入所需要的包
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb
from xgboost import plot_importance
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV # 网格搜索
import matplotlib.pyplot as plt # 可视化
import seaborn as sns # 绘图包
# 忽略警告
import warnings
warnings.filterwarnings("ignore")
# Read the prepared feature table from disk
df = pd.read_csv('attribute_use.csv')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
data_train, data_test = train_test_split(df, test_size=0.2, random_state=1234)

# Separate the 'isDefault' label column from the feature columns in each partition
X_train, y_train = data_train.drop(columns='isDefault'), data_train['isDefault']
X_test, y_test = data_test.drop(columns='isDefault'), data_test['isDefault']
# Disabled baseline run (wrapped in a bare string so it never executes):
# an XGBClassifier with default parameters is fitted on the training split
# and used to predict the test split into `y_pred`.
'''
# 无参数模型
model = xgb.XGBClassifier()
# 训练模型
model.fit(X_train, y_train)
# 预测值
y_pred = model.predict(X_test)
'''
# Evaluation metrics
# Disabled report for the baseline predictions `y_pred`: counts of correct and
# incorrect predictions, accuracy, precision, recall, F1, Cohen's kappa and the
# sklearn classification report.
'''
# 求出预测和真实一样的数目
true = np.sum(y_pred == y_test)
print('预测对的结果数目为:', true)
print('预测错的的结果数目为:', y_test.shape[0] - true)
# 评估指标
print('预测数据的准确率为: {:.4}%'.format(accuracy_score(y_test, y_pred) * 100))
print('预测数据的精确率为:{:.4}%'.format(
precision_score(y_test, y_pred) * 100))
print('预测数据的召回率为:{:.4}%'.format(
recall_score(y_test, y_pred) * 100))
# print("训练数据的F1值为:", f1score_train)
print('预测数据的F1值为:',
f1_score(y_test, y_pred))
print('预测数据的Cohen’s Kappa系数为:',
cohen_kappa_score(y_test, y_pred))
# 打印分类报告
print('预测数据的分类报告为:', '\n', classification_report(y_test, y_pred))
'''
# Appears to be the output recorded from an earlier run of the disabled report
# above (the lines follow the same print formats); kept for reference only.
'''
预测对的结果数目为: 111106
预测错的的结果数目为: 26133
预测数据的准确率为: 80.96%
预测数据的精确率为:56.14%
预测数据的召回率为:11.29%
预测数据的F1值为: 0.18803790585676558
预测数据的Cohen’s Kappa系数为: 0.13122475142457302
预测数据的分类报告为:
precision recall f1-score support
0 0.82 0.98 0.89 110444
1 0.56 0.11 0.19 26795
accuracy 0.81 137239
macro avg 0.69 0.55 0.54 137239
weighted avg 0.77 0.81 0.75 137239
'''
# Disabled coarse sweep over n_estimators: trains with n_estimators = 1, 11, ..., 291
# (i + 1 for i in range(0, 300, 10)), scores each model on the test split, prints the
# best score with its n_estimators value and plots score against n_estimators.
# Note: `n_jobs=--4` is a double negation and evaluates to 4.
'''
# 第一次n_estimator参数范围
scorel = []
for i in range(0, 300, 10):
model = xgb.XGBClassifier(n_estimators=i + 1,
n_jobs=--4,
random_state=90).fit(X_train, y_train)
score = model.score(X_test, y_test)
scorel.append(score)
print(max(scorel), (scorel.index(max(scorel)) * 10) + 1) # 作图反映出准确度随着估计器数量的变化,51的附近最好
plt.figure(figsize=[20, 5])
plt.plot(range(1, 300, 10), scorel)
plt.show()
'''
# Appears to be the recorded output of the coarse sweep (best score, n_estimators).
'''
0.8098645428777534 51
'''
# Disabled fine sweep over n_estimators around the coarse optimum: trains with
# n_estimators = i + 1 for i in range(40, 56), scores each model on the test split,
# prints the best score and plots the scores.
# NOTE(review): the models are trained with i + 1 (41..56) but the print reports
# index + 40 and the plot uses range(40, 56), so the reported value is one less than
# the n_estimators actually trained -- confirm which value was intended.
'''
# 第二次n_estimartor参数确定
scorel = []
for i in range(40, 56, 1):
model = xgb.XGBClassifier(n_estimators=i + 1,
n_jobs=--4,
random_state=90).fit(X_train, y_train)
score = model.score(X_test, y_test)
scorel.append(score)
print(max(scorel), (scorel.index(max(scorel)) * 1) + 40) # 作图反映出准确度随着估计器数量的变化,45的附近最好
plt.figure(figsize=[20, 5])
plt.plot(range(40, 56, 1), scorel)
plt.show()
'''
# Appears to be the recorded output of the fine sweep (best score, reported value).
'''
0.8100685665153491 45
'''
# Disabled sweep over max_depth with n_estimators fixed at 45: trains with
# max_depth = i + 1 for i in range(6, 11), scores each model on the test split,
# prints the best score and plots the scores.
# NOTE(review): the models are trained with i + 1 (7..11) but the print reports
# index + 6 and the plot uses range(6, 11), so the reported depth is one less than
# the max_depth actually trained -- confirm which value was intended.
'''
# 初测max_depth
scorel = []
for i in range(6, 11, 1):
model = xgb.XGBClassifier(max_depth=i+1,
n_estimators=45,
n_jobs=--4,
random_state=90).fit(X_train, y_train)
score = model.score(X_test, y_test)
scorel.append(score)
print(max(scorel), (scorel.index(max(scorel)) * 1) + 6)
plt.figure(figsize=[20, 5])
plt.plot(range(6, 11, 1), scorel)
plt.show()
'''
# Appears to be the recorded output of the max_depth sweep (best score, reported value).
'''
0.8102142976850604 6
'''
# Model training
# Fit the final classifier with the hyper-parameters chosen by the disabled sweeps.
# The keywords must be spelled `n_estimators` / `n_jobs`: XGBClassifier accepts
# arbitrary **kwargs, so the earlier misspellings (`n_estimater`, `n_job`) were not
# applied as the tree count / thread count and the library defaults were used instead.
# n_jobs=4 matches the disabled sweeps, whose `n_jobs=--4` evaluates to 4.
# NOTE(review): the disabled fine n_estimators sweep and the max_depth sweep train
# with `i + 1` but report `i`-based values, so the recorded optima (45, 6) may really
# correspond to 46 and 7 -- confirm before relying on these values.
model = xgb.XGBClassifier(n_estimators=45, max_depth=6, n_jobs=4, random_state=90)
model.fit(X_train, y_train)
# Predictions on both partitions, used by the accuracy report further down
train_predict = model.predict(X_train)
y_pre = model.predict(X_test)
# Evaluation metrics
# Disabled report for the tuned model's test predictions `y_pre`: counts of correct
# and incorrect predictions, accuracy, precision, recall, F1, Cohen's kappa and the
# sklearn classification report.
'''
# 求出预测和真实一样的数目
true = np.sum(y_pre == y_test)
print('预测对的结果数目为:', true)
print('预测错的的结果数目为:', y_test.shape[0] - true)
# 评估指标
print('预测数据的准确率为: {:.4}%'.format(accuracy_score(y_test, y_pre) * 100))
print('预测数据的精确率为:{:.4}%'.format(
precision_score(y_test, y_pre) * 100))
print('预测数据的召回率为:{:.4}%'.format(
recall_score(y_test, y_pre) * 100))
# print("训练数据的F1值为:", f1score_train)
print('预测数据的F1值为:',
f1_score(y_test, y_pre))
print('预测数据的Cohen’s Kappa系数为:',
cohen_kappa_score(y_test, y_pre))
# 打印分类报告
print('预测数据的分类报告为:', '\n', classification_report(y_test, y_pre))
'''
# Accuracy is the share of predictions that match the true label; it is reported
# first for the training split and then for the test split.
train_accuracy = accuracy_score(y_train, train_predict)
print('The accuracy of train_set is:', train_accuracy)
test_accuracy = accuracy_score(y_test, y_pre)
print('The accuracy of test_set is:', test_accuracy)
# Disabled feature-importance chart: sets the default figure size to 14 x 8 and
# draws xgboost's plot_importance for the fitted model.
'''
# 重要性表
# plt.figure(figsize=(15,15))
plt.rcParams["figure.figsize"] = (14, 8)
plot_importance(model)
plt.show()
'''
# Disabled tree rendering: converts the fitted model to a graphviz object with
# xgb.to_graphviz, selects PNG as the format and views it as './xgboost_tree'.
'''
# 决策树图
xgboosts = xgb.to_graphviz(model)
xgboosts.format = 'png'
xgboosts.view('./xgboost_tree')
'''
# Disabled ROC/AUC plot: takes the second column of predict_proba on the test split
# as the positive-class score, computes the ROC curve (pos_label=1) and its AUC, and
# plots the curve together with the diagonal reference line.
'''
# ROC曲线、AUC
# 预测正例的概率
y_pre_prob = model.predict_proba(X_test)[:, 1]
# y_pred_prob ,返回两列,第一列代表类别0,第二列代表类别1的概率
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pre_prob, pos_label=1)
# pos_label,代表真阳性标签,就是说是分类里面的好的标签,这个要看你的特征目标标签是0,1,还是1,2
roc_auc = metrics.auc(fpr, tpr) # auc为Roc曲线下的面积
# print(roc_auc)
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.plot(fpr, tpr, 'r', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1.1])
plt.ylim([0, 1.1])
plt.xlabel('False Positive Rate') # 横坐标是fpr
plt.ylabel('True Positive Rate') # 纵坐标是tpr
plt.title('Default Rate Example_ROC figure')
plt.show()
'''
# NOTE: the text originally here was a code-hosting page notice (a content-moderation
# message with appeal instructions); it is not part of the script.