# (removed: Gitee web-page boilerplate accidentally captured during extraction)
# __author__ = 'heyin'
# __date__ = '2018/11/12 9:36'
# knn算法的实现
import numpy as np
import pandas as pd
from pyecharts import Scatter3D, Scatter, Line
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.datasets import load_iris, load_boston
def knn_1():
    """Demonstrate KNN classification on synthetic 3-D integer clusters.

    Generates five clusters of 200 points labeled 0-4, renders them with
    pyecharts, standardizes the features, fits a KNeighborsClassifier,
    and then tunes ``n_neighbors`` with GridSearchCV. Finally predicts
    the class of a handful of hand-picked demo points.
    """
    # Fixed seed so the same synthetic data set is generated on every run.
    np.random.seed(1)
    parts = []
    for i in range(5):
        # 200 integer feature rows per cluster; the label column is the
        # constant i (np.full is clearer than randint(i, i + 1)).
        features = np.random.randint(200 * i - 100, 200 * (i + 1), (200, 3))
        labels = np.full((200, 1), i)
        parts.append(np.hstack((features, labels)))
    # Stack once at the end instead of growing the array inside the loop.
    train = np.vstack(parts)
    # Hand-picked points used at the bottom for an ad-hoc prediction demo.
    ceshidian = [[1000, 111 * (j + 1), 111 * (j + 1)] for j in range(10)]
    # Render all raw points plus the demo points in a 3-D scatter chart.
    range_color = [
        '#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
        '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
    scatter3D = Scatter3D("3D 散点图示例", width=1200, height=600)
    scatter3D.add("", train[:, 0:-1], is_visualmap=True, visual_range_color=range_color)
    scatter3D.add('', ceshidian, is_visualmap=True, visual_range_color=range_color)
    scatter3D.render(path='./echart_html/3dknn自设数据.html')
    # Split before scaling: unknown/test data must be standardized with the
    # parameters learned from the training split only.
    x = train[:, 0:-1].astype(np.float64)
    y = train[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
    # Standardization rather than min-max normalization, because min-max is
    # strongly affected by extreme values.
    sd = StandardScaler()
    x_train = sd.fit_transform(x_train)
    x_test = sd.transform(x_test)
    knc = KNeighborsClassifier(n_neighbors=3)
    knc.fit(x_train, y_train)
    y_pred = knc.predict(x_test)
    print('训练集进行预测score是', knc.score(x_train, y_train))
    # Accuracy on the held-out split.
    print('测试机进行预测score是:', knc.score(x_test, y_test))
    # Precision = share of true positives among predicted positives.
    ret = classification_report(y_test, y_pred)
    print(ret)
    # Hyper-parameter tuning: grid search over n_neighbors with 5-fold CV.
    kn = KNeighborsClassifier()  # no n_neighbors here; the grid supplies it
    params = {'n_neighbors': [3, 5, 7, 9, 11]}
    gscv = GridSearchCV(kn, params, cv=5)
    gscv.fit(x_train, y_train)
    pred = gscv.predict(x_test)
    print('在交叉验证当中最好的结果:', gscv.best_score_)
    print("每个超参数每次交叉验证的结果:", gscv.cv_results_)
    print("选择最好的模型是:", gscv.best_estimator_)
    print(gscv.best_params_)
    print('交叉验证和网格搜索测试集预测score', gscv.score(x_test, y_test))
    print('交叉验证和网格搜索训练集预测score', gscv.score(x_train, y_train))
    print('交叉验证和网格搜索预测精确率和召回率', classification_report(y_test, pred))
    # Classify the demo points. They must be scaled with the scaler fitted on
    # the training data — without this, predictions are wildly wrong.
    ceshidian = sd.transform(ceshidian)
    print(gscv.predict(ceshidian))
def knn_iris():
    """Inspect the built-in iris data set (currently only loads and prints it).

    NOTE: a larger commented-out experiment (class counts, 3-D scatter plot,
    train/test split with standardization, KNeighborsClassifier, and a
    GridSearchCV hyper-parameter sweep over n_neighbors) previously lived
    here as dead code and was removed for readability.
    """
    iris_set = load_iris()
    # Features, integer targets, and the human-readable label/feature names.
    x = iris_set.data
    y = iris_set.target
    target_names = iris_set.target_names
    for item in (x, y, target_names, iris_set.feature_names):
        print(item)
def bubble_sort(nums, nums1):
    """In-place parallel bubble sort.

    Sorts ``nums`` ascending and mirrors every swap onto ``nums1`` so the
    two sequences stay aligned by position. Both inputs are mutated and
    also returned for convenience.
    """
    length = len(nums)
    for done in range(length - 1):  # one pass per element still unsorted
        # Each pass bubbles the largest remaining value toward the end.
        for k in range(length - 1 - done):
            if nums[k] > nums[k + 1]:
                nums[k], nums[k + 1] = nums[k + 1], nums[k]
                nums1[k], nums1[k + 1] = nums1[k + 1], nums1[k]
    return nums, nums1
def knn_huigui():
    """KNN regression on the Boston housing data with two weighting schemes.

    Fits KNeighborsRegressor with 'uniform' and with 'distance' weights,
    prints the mean squared error of each, and renders a sorted
    actual-vs-predicted line chart per scheme.

    NOTE(review): ``load_boston`` was removed from scikit-learn 1.2; this
    function only runs on older scikit-learn versions — confirm the pinned
    version before relying on it.
    """
    boston = load_boston()
    x = boston.data
    y = boston.target
    # Split first, then standardize using training-split statistics only.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=666)
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # The two halves of the original function were identical except for the
    # weighting scheme and output path, so they share one helper.
    _knn_regress_and_plot(x_train, y_train, x_test, y_test, 'uniform',
                          './echart_html/knn回归结果对比折线图平均值.html')
    _knn_regress_and_plot(x_train, y_train, x_test, y_test, 'distance',
                          './echart_html/knn回归结果对比折线图加权值.html')


def _knn_regress_and_plot(x_train, y_train, x_test, y_test, weights, path):
    """Fit a 4-NN regressor with ``weights``, print its MSE, render a chart."""
    model = KNeighborsRegressor(n_neighbors=4, weights=weights)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # MSE alone is hard to interpret; the line chart makes the fit visible.
    print('均方误差为:', mean_squared_error(y_test, y_pred))
    x_axis = list(range(1, y_test.shape[0] + 1))
    line = Line("knn回归算法结果对比图", width=1200)
    # Sort COPIES: bubble_sort mutates its arguments in place, and the
    # original code let the first call sort y_test itself, so the second
    # chart paired an already-sorted y_test with an unsorted y_pred.
    y_test_s, y_pred_s = bubble_sort(y_test.copy(), y_pred.copy())
    line.add("真实值", x_axis, y_test_s, is_smooth=True)
    line.add("预测值", x_axis, y_pred_s, is_smooth=True)
    line.render(path=path)
def astock():
    """KNN up/down classification of Shanghai index data from a local CSV.

    Sweeps n_neighbors from 2 through 10, printing train/test accuracy and
    a precision/recall report for each setting.
    """
    # Load the raw table; drop the date column and pull out the label.
    df = pd.read_csv('./stockdata/sh.csv')
    df.pop('date')
    y = df.pop('up_down')
    x = df
    # Feature engineering happens after the split so test data never leaks
    # into the scaler's statistics.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    for k in range(2, 11):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        print(k, '训练集score', knn.score(x_train, y_train))
        print(k, '测试集score', knn.score(x_test, y_test))
        print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['跌', '涨']))
if __name__ == '__main__':
    # Uncomment exactly one demo to run; only the iris demo is active.
    # knn_1()
    knn_iris()
    # knn_huigui()
    # astock()
# (removed: Gitee content-moderation page notice accidentally captured during extraction)