Fetch the repository succeeded.
This action will force synchronization from hyesc/machine-learning-stu, which will overwrite any changes that you have made since you forked the repository and cannot be recovered!
The synchronization will run in the background, and the page will refresh when it finishes. Please be patient.
# __author__ = 'heyin'
# __date__ = '2018/11/16 11:15'
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
def randomf():
    """Predict Titanic survival with a random forest tuned via grid search.

    Reads ./titanic.csv, one-hot encodes the pclass/sex/age features with
    DictVectorizer, then cross-validates n_estimators and max_depth with
    GridSearchCV and prints train/test scores plus the best estimator.
    """
    # Load the data.
    t = pd.read_csv('./titanic.csv')
    # BUG FIX: the original called t.fillna(t['age'].mean(), inplace=True),
    # which fills NaN in *every* column with the mean of 'age' — corrupting
    # any non-numeric column that has missing values. Fill only 'age'.
    t['age'] = t['age'].fillna(t['age'].mean())
    # print(t.info())  # info() reveals which columns contain NaN
    x = t.loc[:, ['pclass', 'sex', 'age']]  # feature columns
    y = t.loc[:, 'survived']                # target Series
    # One-hot encoding: convert each DataFrame row into a dict, then
    # vectorize the dicts ('records' => one dict per row).
    x = x.to_dict(orient='records')
    # Split before feature extraction so the vectorizer is fit on the
    # training split only (avoids train/test leakage).
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=1)
    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(x_train)
    x_test = dv.transform(x_test)
    # print(x_train)
    # print(dv.get_feature_names())
    # Hyperparameter tuning via cross-validated grid search.
    rfc = RandomForestClassifier()
    params = {'n_estimators': [80, 100, 120, 200, 300, 500],
              'max_depth': [5, 8, 15, 25, 30]}
    gsc = GridSearchCV(rfc, params, cv=2)
    gsc.fit(x_train, y_train)
    print('测试集效果score', gsc.score(x_test, y_test))
    print('训练集效果score', gsc.score(x_train, y_train))
    print(gsc.best_estimator_)
def r_stock():
    """Grid-search a random forest on the Shanghai stock data.

    Reads ./stockdata/sh.csv, drops the date column, standardizes the
    remaining features, then cross-validates n_estimators/max_depth and
    prints train/test scores plus the best estimator found.
    """
    # Load the CSV and separate features from the target.
    frame = pd.read_csv('./stockdata/sh.csv')
    frame.pop('date')               # the date column is not a usable feature
    target = frame.pop('up_down')   # target: up/down label
    features = frame
    # Split first so that scaling is fit on the training split only.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=1)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    # Cross-validated grid search over the random-forest hyperparameters.
    grid = {'n_estimators': [80, 100, 120, 200, 300, 500],
            'max_depth': [5, 8, 15, 25, 30]}
    searcher = GridSearchCV(RandomForestClassifier(), grid, cv=5)
    searcher.fit(x_train, y_train)
    print('测试集效果score', searcher.score(x_test, y_test))
    print('训练集效果score', searcher.score(x_train, y_train))
    print(searcher.best_estimator_)
def b_stock():
    """Evaluate precision/recall with the best parameters from r_stock.

    Reads ./stockdata/sh.csv, standardizes the features, fits a random
    forest with the previously tuned hyperparameters, and prints the
    predictions, train/test scores, and a classification report.
    """
    # Load the CSV and separate features from the target.
    df = pd.read_csv('./stockdata/sh.csv')
    df.pop('date')            # the date column is not a usable feature
    y = df.pop('up_down')     # target: up/down label
    x = df
    # Split first so that scaling is fit on the training split only.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=1)
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # Best hyperparameters obtained from the earlier grid search.
    rfc = RandomForestClassifier(n_estimators=200, max_depth=5)
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    print(y_pred)
    print('测试集效果score', rfc.score(x_test, y_test))
    print('训练集效果score', rfc.score(x_train, y_train))
    # FIX: reuse the already-computed y_pred instead of calling
    # rfc.predict(x_test) a second time inside classification_report.
    print(classification_report(y_test, y_pred=y_pred, labels=[0, 1],
                                target_names=['跌', '涨']))
if __name__ == '__main__':
    # Entry point: run the fixed-parameter evaluation; the other two
    # experiments (titanic grid search, stock grid search) are kept
    # here commented out for manual switching.
    # randomf()
    b_stock()
    # r_stock()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。