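# NOTE: stray web-page banner captured with the file (not part of the program):
# "Code pull complete; the page will refresh automatically."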
'''
author: white mai
program: knn demo
date: 2021,9,24
'''
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from numpy import tile
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist
# Data initialisation: fraction of the samples that goes into the training
# part; the remainder forms the test part (passed to train_test_split below).
train_spt = 0.8
# Data loading
def load_data():
    """Load the iris data set and return its samples in random order.

    The labels are attached to the feature matrix as one extra column so that
    a single in-place row shuffle keeps every sample paired with its label.

    Returns:
        A ``(data, target)`` tuple: the first four columns (features) and the
        fifth column (labels) of the shuffled matrix.  Both are slices of the
        same array and therefore share its dtype.
    """
    bunch = load_iris()
    # Glue the labels on as the last column, then shuffle the rows in place.
    samples = np.hstack((bunch.data, bunch.target[:, np.newaxis]))
    np.random.shuffle(samples)
    return samples[:, 0:4], samples[:, 4]
# Train / test split
def train_test_split(datasets, train_spt):
    """Split a sequence into a leading training part and a trailing test part.

    Args:
        datasets: Any sliceable object with a length (list, NumPy array, ...).
        train_spt: Fraction of the samples, between 0 and 1 inclusive, that
            goes into the training part.

    Returns:
        ``(train, test)`` where ``train`` holds the first
        ``floor(len(datasets) * train_spt)`` items and ``test`` the rest.

    Raises:
        ValueError: If ``train_spt`` is not within ``[0, 1]``.
    """
    if not 0 <= train_spt <= 1:
        raise ValueError(
            "train_spt must be between 0 and 1, got {!r}".format(train_spt)
        )
    # Binary floating point can land just below the exact product
    # (100 * 0.57 == 56.99999999999999); the small epsilon keeps the
    # truncation from dropping a whole sample in that case.
    train_size = int(len(datasets) * train_spt + 1e-9)
    return datasets[:train_size], datasets[train_size:]
# kNN algorithm
# Distance computation
def distance(train_data, test_data, k):
    """Return the distance from one query sample to every training sample.

    Args:
        train_data: Array-like of shape ``(n_samples, n_features)``.
        test_data: A single query sample with ``n_features`` values.
        k: Metric selector (NOT the neighbour count):
            1 - Euclidean distance,
            2 - Manhattan distance,
            3 - Minkowski distance whose order equals ``n_features``,
            4 - cosine distance, i.e. ``1 - cosine similarity``.

    Returns:
        1-D float array of length ``n_samples``; smaller means closer for
        every metric.  The cosine distance is ``nan`` for rows where either
        vector has zero length, because the angle is undefined there.

    Raises:
        ValueError: If the shapes do not match or ``k`` is not 1, 2, 3 or 4.
    """
    train = np.asarray(train_data, dtype=float)
    query = np.asarray(test_data, dtype=float).ravel()
    if train.ndim != 2:
        raise ValueError("train_data must be a 2-D array")
    if query.size != train.shape[1]:
        raise ValueError(
            "test_data has {} features, train_data has {}".format(
                query.size, train.shape[1]
            )
        )

    # Broadcasting subtracts the query from every training row at once.
    diff = train - query

    if k == 1:
        return np.sqrt((diff ** 2).sum(axis=1))
    if k == 2:
        # Manhattan distance is the plain sum of absolute differences.
        return np.abs(diff).sum(axis=1)
    if k == 3:
        order = train.shape[1]
        # abs() keeps odd orders from cancelling terms or going negative.
        return (np.abs(diff) ** order).sum(axis=1) ** (1.0 / order)
    if k == 4:
        norms = np.linalg.norm(train, axis=1) * np.linalg.norm(query)
        with np.errstate(divide='ignore', invalid='ignore'):
            cosine = 1.0 - (train @ query) / norms
        # Rounding can push identical directions a hair below zero.
        return np.clip(cosine, 0.0, 2.0)
    raise ValueError("unknown distance metric code: {!r}".format(k))
# Majority vote
def decide(data):
    """Return the most frequent label in ``data``.

    Ties are resolved in favour of the label that appears first in ``data``.
    When the labels arrive sorted by distance, that is the label of the
    nearest neighbour among the tied classes.

    Args:
        data: Iterable of hashable labels.

    Returns:
        The label with the highest count.

    Raises:
        ValueError: If ``data`` is empty.
    """
    from collections import Counter

    votes = Counter(list(data))
    if not votes:
        raise ValueError("decide() needs at least one label")
    # most_common keeps first-seen order among equal counts.
    return votes.most_common(1)[0][0]
def knn(train_data, test_data, train_label, test_label, k, metric=2):
    """Classify every test sample by k-nearest-neighbour vote and score it.

    Args:
        train_data: 2-D NumPy array of training samples.
        test_data: 2-D NumPy array of test samples.
        train_label: 1-D NumPy array with one label per training sample.
        test_label: 1-D NumPy array with one label per test sample.
        k: Number of nearest neighbours that vote (at least 1).
        metric: Metric code forwarded to ``distance``; defaults to 2, the
            value that used to be hard-coded here.

    Returns:
        ``(dist, loss)``.  ``dist`` is the two-column ``[distance, label]``
        matrix of the LAST test sample, sorted by ascending distance.
        ``loss`` is the fraction of misclassified test samples.

    Raises:
        ValueError: If the test set is empty, the number of test labels does
            not match the number of test samples, or ``k`` is below 1.
    """
    n_test = test_data.shape[0]
    if n_test == 0:
        raise ValueError("test_data must contain at least one sample")
    if test_label.shape[0] != n_test:
        raise ValueError("test_data and test_label differ in length")
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    label_column = train_label.reshape(-1, 1)
    errors = 0
    ranked = None
    for i in range(n_test):
        dist = distance(train_data, test_data[i, :], metric)
        pairs = np.concatenate([dist.reshape(-1, 1), label_column], axis=1)
        # A stable sort keeps training order among equal distances, so the
        # result does not depend on the sort implementation.
        ranked = pairs[np.argsort(pairs[:, 0], kind='stable'), :]
        # Vote among the k closest training samples.
        if decide(ranked[0:k, 1]) != test_label[i]:
            errors += 1
    return ranked, errors / n_test
if __name__ == '__main__':
    # Data preparation: features and labels are split at the same ratio so
    # that the rows stay aligned.
    data, target = load_data()
    data_train, data_test = train_test_split(data, train_spt)
    label_train, label_test = train_test_split(target, train_spt)

    # Min-max normalisation: fitted on the training part, applied to both.
    scaler = MinMaxScaler(feature_range=(0, 1))
    data_train_scaled = scaler.fit_transform(data_train)
    data_test_scaled = scaler.transform(data_test)

    # Evaluate the error rate for every neighbour count from 1 to 49.
    loss_log = []
    for k in range(1, 50):
        _, loss = knn(data_train_scaled, data_test_scaled,
                      label_train, label_test, k)
        print(loss)
        loss_log.append(loss)
    losses = np.array(loss_log)
    ks = np.arange(1, 50)

    # Scatter plot of the error rate against k.
    plt.title('the relationship between k and loss')
    plt.xlabel('k')
    plt.ylabel('loss')
    plt.scatter(ks, losses)
    plt.show()
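# NOTE: stray hosting-site moderation notice captured with the file (not part
# of the program): "This section may contain content unsuitable for display and
# is therefore hidden; you can review and edit it yourself."
# "If you confirm the content involves no inappropriate language, pure
# advertising, violence, vulgar or pornographic material, infringement, piracy,
# false or worthless content, or anything violating applicable laws and
# regulations, you may submit an appeal and it will be handled promptly."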