import pandas as pd
import re
import time
import datetime
import pickle
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegressionCV
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
### Compute the KS statistic
def KSs(df, score, target):
    '''
    :param df: dataset containing the target variable and the predictions
    :param score: column name of the score or predicted probability
    :param target: column name of the target variable (1 = bad, 0 = good)
    :return: KS statistic
    '''
    total = df.groupby([score])[target].count()
    bad = df.groupby([score])[target].sum()
    regroup = pd.DataFrame({'total': total, 'bad': bad})
    regroup['good'] = regroup['total'] - regroup['bad']
    regroup[score] = regroup.index
    regroup = regroup.sort_values(by=score, ascending=False)
    regroup.index = range(len(regroup))
    regroup['badCumRate'] = regroup['bad'].cumsum() / regroup['bad'].sum()
    regroup['goodCumRate'] = regroup['good'].cumsum() / regroup['good'].sum()
    # KS is the maximum gap between the cumulative bad rate and the cumulative good rate
    KS = regroup['badCumRate'] - regroup['goodCumRate']
    return max(KS)
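# Illustrative sanity check for KSs on a tiny made-up DataFrame (a sketch only;
# the column names 'prob' and 'y' below are hypothetical, not taken from test1.csv):
#
#   demo = pd.DataFrame({'prob': [0.9, 0.8, 0.6, 0.4, 0.2, 0.1],
#                        'y':    [1,   1,   0,   1,   0,   0]})
#   print(KSs(demo, 'prob', 'y'))  # maximum gap between cumulative bad and good rates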
# Read the data
trainData = pd.read_csv('test1.csv')
## Only the statistic (feature) variables are used
allFeatures = list(trainData.columns)
allFeatures.remove('Unnamed: 0')
## Result of the previous step: the list of variables kept after multivariate analysis
## (empty here as a placeholder; fill it with the selected variable names)
multi_analysis = []
### (1) Fit the LR model on the variables kept after multivariate analysis
y = trainData['y']  # target column (assumed to be named 'y' in test1.csv)
X = trainData[multi_analysis].copy()
X['intercept'] = [1] * X.shape[0]
LR = sm.Logit(y, X).fit()
summary = LR.summary()
print(summary)
pvals = LR.pvalues
pvals = pvals.to_dict()
print(pvals)
# Some variables are not significant; remove them one by one (backward elimination)
varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
# sorted list of (variable, p-value) tuples, largest p-value first
varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)
while (len(varLargeP) > 0 and len(multi_analysis) > 0):
    # In each iteration, drop the least significant variable, until
    # (1) all remaining variables are significant, or
    # (2) there are no features left to drop
    varMaxP = varLargeP[0][0]
    print(varMaxP)
    if varMaxP == 'intercept':
        print('the intercept is not significant!')
        break
    multi_analysis.remove(varMaxP)
    y = trainData['y']
    X = trainData[multi_analysis].copy()
    X['intercept'] = [1] * X.shape[0]
    LR = sm.Logit(y, X).fit()
    pvals = LR.pvalues
    pvals = pvals.to_dict()
    varLargeP = {k: v for k, v in pvals.items() if v >= 0.1}
    varLargeP = sorted(varLargeP.items(), key=lambda d: d[1], reverse=True)

summary = LR.summary()
trainData['prob'] = LR.predict(X)
ks = KSs(trainData, 'prob', 'y')
auc = roc_auc_score(trainData['y'], trainData['prob'])
print('normalLR: ks {}, auc {}'.format(ks, auc))
# Save the fitted model with pickle
with open('LR_Model_Normal.pkl', 'wb') as saveModel:
    pickle.dump(LR, saveModel)
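# To reuse the saved model later it can be loaded back with pickle, e.g. (a sketch;
# 'newData' is a hypothetical DataFrame that must contain the same columns as X,
# including the 'intercept' column):
#
#   with open('LR_Model_Normal.pkl', 'rb') as f:
#       LR_loaded = pickle.load(f)
#   newData['prob'] = LR_loaded.predict(newData[multi_analysis + ['intercept']])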