1 Star 0 Fork 3

spider/machine-learning-stu

forked from hyesc/machine-learning-stu 
Create your Gitee Account
Explore and code with more than 12 million developers. Free private repositories! :)
Sign up
文件
This repository doesn't specify a license. Please pay attention to the specific project description and its upstream code dependencies when using it.
Clone or Download
no6随机森林.py 3.47 KB
Copy Edit Raw Blame History
hyesc authored 2018-11-22 11:59 . 逻辑回归,岭回归代码提交
# __author__ = 'heyin'
# __date__ = '2018/11/16 11:15'
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
def randomf():
    """Grid-search a random forest on the Titanic data and print its scores.

    Reads ``./titanic.csv``, takes the ``pclass``, ``sex`` and ``age`` columns
    as features and ``survived`` as the target, vectorizes the feature dicts
    with ``DictVectorizer`` and tunes ``n_estimators`` / ``max_depth`` with a
    2-fold cross-validated grid search. Prints the test score, the training
    score and the best estimator; returns nothing.
    """
    # Load the data
    t = pd.read_csv('./titanic.csv')

    # Impute missing ages with the mean age. Only the age column is filled:
    # a frame-wide fillna would also write the mean age into NaN cells of
    # every other column (including the target), silently corrupting them.
    # print(t.info())  # info() shows which columns contain NaN
    t['age'] = t['age'].fillna(t['age'].mean())

    x = t.loc[:, ['pclass', 'sex', 'age']]  # feature columns
    y = t.loc[:, 'survived']  # target Series

    # DictVectorizer works on dicts; 'records' yields one dict per row
    x = x.to_dict(orient='records')

    # Split into training and test sets before fitting any transformer
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=1)

    # Dict feature extraction (string-valued features become one-hot
    # columns); fit on the training rows only, then reuse for the test rows
    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(x_train)
    x_test = dv.transform(x_test)
    # print(x_train)
    # print(dv.get_feature_names())

    # Hyper-parameter tuning via cross-validated grid search
    rfc = RandomForestClassifier()
    params = {
        'n_estimators': [80, 100, 120, 200, 300, 500],
        'max_depth': [5, 8, 15, 25, 30],
    }
    gsc = GridSearchCV(rfc, params, cv=2)
    gsc.fit(x_train, y_train)

    print('测试集效果score', gsc.score(x_test, y_test))
    print('训练集效果score', gsc.score(x_train, y_train))
    print(gsc.best_estimator_)
def r_stock():
    """Grid-search a random forest on the stock data and print its scores.

    Reads ``./stockdata/sh.csv``, discards the ``date`` column, uses
    ``up_down`` as the target and every remaining column as a feature.
    Features are standardized, then ``n_estimators`` / ``max_depth`` are
    tuned with a 5-fold cross-validated grid search. Prints the test score,
    the training score and the best estimator; returns nothing.
    """
    frame = pd.read_csv('./stockdata/sh.csv')
    del frame['date']  # the date column is not used as a feature
    target = frame.pop('up_down')
    features = frame

    # Split first so that the scaler is fitted on the training rows only
    split = train_test_split(features, target, test_size=0.25, random_state=1)
    train_x, test_x, train_y, test_y = split

    # Standardize the features
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)

    # Random forest tuned by grid search
    search_space = {
        'n_estimators': [80, 100, 120, 200, 300, 500],
        'max_depth': [5, 8, 15, 25, 30],
    }
    search = GridSearchCV(RandomForestClassifier(), search_space, cv=5)
    search.fit(train_x, train_y)

    print('测试集效果score', search.score(test_x, test_y))
    print('训练集效果score', search.score(train_x, train_y))
    print(search.best_estimator_)
def b_stock():
    """Fit a fixed-parameter random forest on the stock data and report it.

    Uses hard-coded hyper-parameters (``n_estimators=200``, ``max_depth=5``)
    to inspect precision and recall. Reads ``./stockdata/sh.csv``, discards
    the ``date`` column, uses ``up_down`` as the target and every remaining
    column as a feature. Prints the test-set predictions, the test and
    training scores, and a classification report; returns nothing.
    """
    # Load the data from the CSV file
    df = pd.read_csv('./stockdata/sh.csv')
    df.pop('date')
    y = df.pop('up_down')
    x = df

    # Split before feature engineering so the scaler only sees training rows
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=1)

    # Standardize the features
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # Random forest with fixed hyper-parameters
    rfc = RandomForestClassifier(n_estimators=200, max_depth=5)
    rfc.fit(x_train, y_train)
    y_pred = rfc.predict(x_test)
    print(y_pred)

    print('测试集效果score', rfc.score(x_test, y_test))
    print('训练集效果score', rfc.score(x_train, y_train))
    # Reuse the predictions computed above instead of predicting a second time
    print(classification_report(y_test, y_pred=y_pred, labels=[0, 1],
                                target_names=['跌', '涨']))
if __name__ == "__main__":
    # Only the fixed-parameter stock evaluation runs by default; the other
    # entry points, randomf() and r_stock(), are left disabled.
    b_stock()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/spiderking/machine-learning-stu.git
git@gitee.com:spiderking/machine-learning-stu.git
spiderking
machine-learning-stu
machine-learning-stu
master

Search