# Source: credit-default-forecast/xgboost_test.py (branch: master)

# 导入所需要的包
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb
from xgboost import plot_importance
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV  # 网格搜索
import matplotlib.pyplot as plt  # 可视化
import seaborn as sns  # 绘图包

# 忽略警告
import warnings
warnings.filterwarnings("ignore")

# Load the feature table from disk.
df = pd.read_csv('attribute_use.csv')

# Hold out 20% of the rows as the test set; the fixed seed makes the split
# repeatable between runs.
data_train, data_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1234,
)

# Separate the 'isDefault' label column from the remaining feature columns
# for each partition.
X_train, y_train = data_train.drop(columns='isDefault'), data_train['isDefault']
X_test, y_test = data_test.drop(columns='isDefault'), data_test['isDefault']

'''
# 无参数模型
model = xgb.XGBClassifier()
# 训练模型
model.fit(X_train, y_train)
# 预测值
y_pred = model.predict(X_test)

'''
# 评估指标
'''
# 求出预测和真实一样的数目
true = np.sum(y_pred == y_test)
print('预测对的结果数目为：', true)
print('预测错的的结果数目为：', y_test.shape[0] - true)
# 评估指标

print('预测数据的准确率为： {:.4}%'.format(accuracy_score(y_test, y_pred) * 100))
print('预测数据的精确率为：{:.4}%'.format(
    precision_score(y_test, y_pred) * 100))
print('预测数据的召回率为：{:.4}%'.format(
    recall_score(y_test, y_pred) * 100))
# print("训练数据的F1值为：", f1score_train)
print('预测数据的F1值为：',
      f1_score(y_test, y_pred))
print('预测数据的Cohen’s Kappa系数为：',
      cohen_kappa_score(y_test, y_pred))
# 打印分类报告
print('预测数据的分类报告为：', '\n', classification_report(y_test, y_pred))
'''
'''
预测对的结果数目为： 111106
预测错的的结果数目为： 26133
预测数据的准确率为： 80.96%
预测数据的精确率为：56.14%
预测数据的召回率为：11.29%
预测数据的F1值为： 0.18803790585676558
预测数据的Cohen’s Kappa系数为： 0.13122475142457302
预测数据的分类报告为：
                precision    recall  f1-score   support

           0       0.82      0.98      0.89    110444
           1       0.56      0.11      0.19     26795

    accuracy                           0.81    137239
   macro avg       0.69      0.55      0.54    137239
weighted avg       0.77      0.81      0.75    137239
'''

'''
# 第一次n_estimators参数范围
scorel = []
for i in range(0, 300, 10):
    model = xgb.XGBClassifier(n_estimators=i + 1,
                              n_jobs=--4,
                              random_state=90).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)

print(max(scorel), (scorel.index(max(scorel)) * 10) + 1)  # 作图反映出准确度随着估计器数量的变化，51的附近最好
plt.figure(figsize=[20, 5])
plt.plot(range(1, 300, 10), scorel)
plt.show()
'''
'''
0.8098645428777534 51
'''

'''
# 第二次n_estimators参数确定
scorel = []
for i in range(40, 56, 1):
    model = xgb.XGBClassifier(n_estimators=i + 1,
                              n_jobs=--4,
                              random_state=90).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)

print(max(scorel), (scorel.index(max(scorel)) * 1) + 40)  # 作图反映出准确度随着估计器数量的变化，45的附近最好
plt.figure(figsize=[20, 5])
plt.plot(range(40, 56, 1), scorel)
plt.show()
'''
'''
0.8100685665153491 45
'''

'''
# 初测max_depth
scorel = []
for i in range(6, 11, 1):
    model = xgb.XGBClassifier(max_depth=i+1,
                              n_estimators=45,
                              n_jobs=--4,
                              random_state=90).fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scorel.append(score)

print(max(scorel), (scorel.index(max(scorel)) * 1) + 6)
plt.figure(figsize=[20, 5])
plt.plot(range(6, 11, 1), scorel)
plt.show()
'''
'''
0.8102142976850604 6
'''

# Train the final model.
# The keyword names must be `n_estimators` and `n_jobs`. The previous
# spellings (`n_estimater`, `n_job`) are not XGBClassifier parameters, so the
# intended tree count and thread setting were not applied to the model.
# NOTE(review): the disabled second n_estimators sweep and the max_depth sweep
# in this file fit with `i + 1` but report `index + start`, so the recorded
# optima (45 trees, depth 6) may actually correspond to 46 / 7 -- confirm
# before relying on these values.
# NOTE(review): the disabled sweeps used `n_jobs=--4` (i.e. 4); confirm that
# -4 is really the intended thread setting here.
model = xgb.XGBClassifier(n_estimators=45, max_depth=6, n_jobs=-4, random_state=90)
model.fit(X_train, y_train)

# Predict on both partitions; these feed the accuracy report below.
train_predict = model.predict(X_train)
y_pre = model.predict(X_test)


# 评估指标
'''
# 求出预测和真实一样的数目
true = np.sum(y_pre == y_test)
print('预测对的结果数目为：', true)
print('预测错的的结果数目为：', y_test.shape[0] - true)


# 评估指标
print('预测数据的准确率为： {:.4}%'.format(accuracy_score(y_test, y_pre) * 100))


print('预测数据的精确率为：{:.4}%'.format(
 precision_score(y_test, y_pre) * 100))
print('预测数据的召回率为：{:.4}%'.format(
 recall_score(y_test, y_pre) * 100))
# print("训练数据的F1值为：", f1score_train)
print('预测数据的F1值为：',
      f1_score(y_test, y_pre))
print('预测数据的Cohen’s Kappa系数为：',
      cohen_kappa_score(y_test, y_pre))
# 打印分类报告
print('预测数据的分类报告为：', '\n', classification_report(y_test, y_pre))
'''

# Evaluate with accuracy: the share of predictions that equal the true label,
# printed first for the training set and then for the test set.
for caption, truth, predicted in (
    ('The accuracy of train_set is:', y_train, train_predict),
    ('The accuracy of test_set is:', y_test, y_pre),
):
    print(caption, metrics.accuracy_score(truth, predicted))

'''
# 重要性表
# plt.figure(figsize=(15,15))
plt.rcParams["figure.figsize"] = (14, 8)
plot_importance(model)
plt.show()
'''
'''
# 决策树图
xgboosts = xgb.to_graphviz(model)
xgboosts.format = 'png'
xgboosts.view('./xgboost_tree')
'''
'''
# ROC曲线、AUC
# 预测正例的概率
y_pre_prob = model.predict_proba(X_test)[:, 1]
# y_pred_prob ,返回两列，第一列代表类别0,第二列代表类别1的概率
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pre_prob, pos_label=1)
# pos_label，代表真阳性标签，就是说是分类里面的好的标签，这个要看你的特征目标标签是0,1，还是1,2
roc_auc = metrics.auc(fpr, tpr)  # auc为Roc曲线下的面积
# print(roc_auc)
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.plot(fpr, tpr, 'r', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
# plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1.1])
plt.ylim([0, 1.1])
plt.xlabel('False Positive Rate')  # 横坐标是fpr
plt.ylabel('True Positive Rate')  # 纵坐标是tpr
plt.title('Default Rate Example_ROC figure')
plt.show()
'''