# (Gitee web-page residue accidentally captured with the source; commented
#  out so the file parses — not part of the program:)
# 代码拉取完成,页面将自动刷新
# 同步操作将从 Joker/电影推荐系统 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
# 确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
import numpy as np
import pickle
import redis
import math
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from redis import StrictRedis
import random
file_name = "./movielens/ratings.dat"   # ratings file used by the __main__ script
ratings_file = "./movielens/ratings.dat"  # same path, kept as a second alias
movie_file = "./movielens/movies.dat"   # movie metadata: id::title::genre1|genre2
## connect to redis (db 2 caches every preprocessed structure built below)
pool = redis.ConnectionPool(host='127.0.0.1', db=2)
# NOTE(review): this rebinding shadows the imported `redis` module, so the
# module is unreachable after this line.  decode_responses=True passed here
# presumably has no effect since the pool's connections were created without
# it — which is what the pickle round-trips below need (raw bytes); confirm
# against the redis-py version in use.
redis = redis.StrictRedis(connection_pool=pool, decode_responses=True)
###################
###################
#### 正式开始 ######
###################
###################
## read movie info from "movies.dat"
## in order to find attrs in terms of movie_id
##
## return {movie:[attr1, attr2,]}
## in redis: "movie_info"
def get_movie_info(file_name):
    """Load ``{movie_id: [genre, ...]}`` from the movies file.

    Each line looks like ``id::title::genre1|genre2``; only the id and the
    genre list are kept.  The result is cached in redis under "movie_info".
    """
    if redis.exists("movie_info"):
        return pickle.loads(redis.get("movie_info"))
    info = {}
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split("::")
            info[int(fields[0])] = fields[-1].split("|")
    redis.set("movie_info", pickle.dumps(info))
    return info
## 电影种类对应的下标
def get_refer(movie_info:dict):
    """Map every distinct genre to a unique index, in first-seen order.

    Returns ``{genre: index}`` with indices 0..n-1.
    """
    refer = {}
    for genres in movie_info.values():
        for genre in genres:
            # len(refer) is the next free index; no-op if already assigned
            refer.setdefault(genre, len(refer))
    return refer
## 电影及其特征向量
## return {movie:[1,1,0,...]}
## in redis: "movie_vector_dict"
def get_movie_vec_dict(movie_info:dict, refer:dict):
    """Build a one-hot genre vector per movie: ``{movie_id: [0/1, ...]}``.

    The vector length now equals ``len(refer)`` (previously hard-coded to
    18), so the function works for any genre catalogue while producing the
    same vectors for the MovieLens data.  Cached in redis under
    "movie_vector_dict".
    """
    if redis.exists("movie_vector_dict"):
        return pickle.loads(redis.get("movie_vector_dict"))
    n_genres = len(refer)  # generalizes the old hard-coded 18
    ans = {}
    for movie_id, attrs in movie_info.items():
        vec = [0] * n_genres
        for attr in attrs:
            vec[refer[attr]] = 1
        ans[movie_id] = vec
    redis.set("movie_vector_dict", pickle.dumps(ans))
    return ans
## read record from "ratings.dat"
## split into "train data" & "test data"
##
## records: [[user, movie, score],]
##
## return train_record, test_record
## "train_record" & "test_record" in redis
def get_split_ratings(file_name, ratio=0.98):
    """Randomly split rating lines into train/test records.

    Each record is ``[user, movie, score]`` as strings (the trailing
    timestamp field is dropped).  A line goes to the training list with
    probability ``ratio``.  Both lists are cached in redis under
    "train_record" / "test_record".
    """
    if redis.exists("train_record") and redis.exists("test_record"):
        return (pickle.loads(redis.get("train_record")),
                pickle.loads(redis.get("test_record")))
    train_record, test_record = [], []
    with open(file_name, 'r') as f:
        for line in f:
            fields = line.strip().split("::")[:-1]  # drop the timestamp
            target = train_record if random.random() < ratio else test_record
            target.append(fields)
    # cache both splits
    redis.set("train_record", pickle.dumps(train_record))
    redis.set("test_record", pickle.dumps(test_record))
    return train_record, test_record
## generate user's preference VECTOR for movies
##
## innovation: we can only use the movie whose score is higher than average
##
## records: [[user, movie, score],]
##
## return {user:{attr1:count, attr2:count}}
##
## in redis: "user_preference"
def get_user_preference(movie_vector_dict:dict, train_dict:dict):
    """Sum the genre vectors of every movie each user rated.

    Returns ``{user_id: [count_per_genre]}``.  The vector length is now
    inferred from the movie vectors themselves (previously hard-coded to
    18), keeping it consistent with get_movie_vec_dict.  Cached in redis
    under "user_preference".
    """
    if redis.exists("user_preference"):
        return pickle.loads(redis.get("user_preference"))
    # infer the genre-vector length from any movie vector (fallback: 18,
    # the MovieLens genre count, for an empty movie dict)
    dim = len(next(iter(movie_vector_dict.values()), [0] * 18))
    user_prefer = {}
    for user_id, movies in train_dict.items():
        prefer = user_prefer.setdefault(user_id, [0] * dim)
        for movie_id in movies.keys():
            vec = movie_vector_dict.get(movie_id)
            if vec is None:
                continue  # rating may reference a movie missing from movies.dat
            for i, v in enumerate(vec):
                prefer[i] += v
    redis.set("user_preference", pickle.dumps(user_prefer))
    return user_prefer
## build User-Movie dict
##
## type:"train" or "test" : to indicate what the usage of data
##
## return {user:{movie:score}}
## "train_dict" & "test_dict" in redis
def get_dict(records:list, type:str):
    """Build a user→movie→rating dict with mean-centered ratings.

    records: iterable of ``[user, movie, score]`` string triples.
    type: "train" or "test" (kept despite shadowing the builtin, since
          callers pass it positionally); result cached as "<type>_dict".

    Every user's ratings have that user's mean subtracted, so an average
    movie scores 0 for that user.
    """
    # return the cached copy when available
    if redis.exists(type + '_dict'):
        return pickle.loads(redis.get(type + '_dict'))
    user_movie = {}
    for record in records:
        user_id = int(record[0])
        movie_id = int(record[1])
        rating = int(record[2])
        user_movie.setdefault(user_id, {})[movie_id] = rating
    # mean-center each user's scores (uses builtin sum() instead of the
    # old local `sum` accumulator that shadowed it)
    for movies in user_movie.values():
        mean = sum(movies.values()) / len(movies)
        for movie_id in movies.keys():
            movies[movie_id] -= mean
    redis.set(type + '_dict', pickle.dumps(user_movie))
    return user_movie
## 计算一个 电影 和 用户偏好 的余弦相似度
def predict_preference(user_prefer, user_id, movie_vector_dict, movie_id):
    """Cosine similarity between a user's genre-preference vector and a
    movie's genre vector.  (NaN if either vector is all zeros, as before.)"""
    u = np.asarray(user_prefer[user_id], dtype=float)
    m = np.asarray(movie_vector_dict[movie_id], dtype=float)
    return np.dot(u, m) / (np.linalg.norm(u) * np.linalg.norm(m))
## 构建 Movie-User 倒排表
##
## return {movie:{user1,user2,}} using "set"
## "reverse_dict" in redis
def get_movie_user_dict(records:list)->dict:
    """Build the inverted index movie → set of users who rated it.

    Returns ``{movie_id: {user1, user2, ...}}``; cached in redis under
    "reverse_dict".
    """
    if redis.exists("reverse_dict"):
        return pickle.loads(redis.get('reverse_dict'))
    movie_user = {}
    for record in records:
        uid, mid = int(record[0]), int(record[1])
        movie_user.setdefault(mid, set()).add(uid)
    redis.set('reverse_dict', pickle.dumps(movie_user))
    return movie_user
## 获取训练集
## {user_id:([[0,0,0,1,0,1],
## [1,0,0,1,1,10]],
## [0,1])}
## in redis: "trainning_data"
def get_trainning_data(train_dict:dict, movie_vec_dict:dict):
    """Build per-user training data for the logistic-regression filter.

    Returns ``{user_id: (X, y)}`` where X is a list of genre vectors and y
    the labels: 1 for every movie the user rated (positives) and 0 for an
    equal number of randomly drawn unrated movies (negatives).

    Bug fix: the negative-sample branch previously appended
    ``movie_vec_dict[movie_id]`` — the vector of the LAST POSITIVE movie —
    instead of ``movie_vec_dict[rand_movie_id]``, so every negative sample
    duplicated a positive one and the classifier learned nothing useful.
    (The dead ``limit`` debug counter was also removed.)
    Cached in redis under "trainning_data".
    """
    if redis.exists("trainning_data"):
        return pickle.loads(redis.get("trainning_data"))
    res = {}
    for user_id, movies in train_dict.items():
        train_x = []
        train_y = []
        # positive samples: everything the user actually rated
        for movie_id in movies.keys():
            train_x.append(movie_vec_dict[movie_id])
            train_y.append(1)
        # negative samples: as many random unseen movies as positives
        remaining = len(movies)
        while remaining > 0:
            rand_movie_id = random.randint(1, 3952)  # MovieLens-1M max movie id
            if rand_movie_id in movie_vec_dict and rand_movie_id not in movies:
                remaining -= 1
                train_x.append(movie_vec_dict[rand_movie_id])  # was: movie_vec_dict[movie_id]
                train_y.append(0)
        res[user_id] = (train_x, train_y)
    redis.set("trainning_data", pickle.dumps(res))
    return res
## 根据现有的推荐列表过滤出更好的推荐列表
def filter(movie_vec_dict:dict,
           recommended:list,
           train_x:list,
           train_y:list,
           n:int):
    """Re-rank candidate movies with a per-user logistic-regression model.

    Fits on (train_x, train_y), scores every candidate by its predicted
    probability of class 1 and returns the top-n as ``{movie_id: proba}``.
    (NOTE: the name shadows the builtin ``filter``; kept for callers.)
    """
    model = LogisticRegression(random_state=0).fit(train_x, train_y)
    features = [movie_vec_dict[movie_id] for movie_id in recommended]
    probas = model.predict_proba(features)
    scored = [(movie_id, proba[1]) for movie_id, proba in zip(recommended, probas)]
    scored.sort(key=lambda item: item[1], reverse=True)
    return dict(scored[:n])
## 用户-用户 相似度 dict
## by reverse-dict {movie:{user1,user2}}
## return {user:{simi_user:simi}}
## "user_simi_dict_0" in redis
def get_user_simi_dict_0(reverse_dict:dict, train_dict:dict)->dict:
    """User-user similarity from the movie→users inverted index (UserCF).

    Each co-rated movie contributes 1/log(1 + popularity), damping very
    popular movies; the sum is then normalized by
    sqrt(|I_u| * |I_v|) (cosine-style).  Returns
    ``{user: {other_user: similarity}}``; cached as "user_simi_dict_0".
    """
    if redis.exists("user_simi_dict_0"):
        return pickle.loads(redis.get("user_simi_dict_0"))
    simi = {}
    for users in reverse_dict.values():
        penalty = 1 / math.log(1 + len(users))  # popularity penalty
        for u in users:
            row = simi.setdefault(u, {})
            for v in users:
                if v != u:
                    row[v] = row.get(v, 0) + penalty
    # normalize by the geometric mean of the two users' rating counts
    for u, neighbors in simi.items():
        for v in neighbors.keys():
            neighbors[v] /= math.sqrt(len(train_dict[u]) * len(train_dict[v]))
    redis.set('user_simi_dict_0', pickle.dumps(simi))
    return simi
## 输入用户id, 基于用户的CF推荐
def recommend_by_user_cf(simi_dict:dict,
                         train_dict:dict,
                         user_id:int,
                         k:int,
                         n:int)->dict:
    """Recommend movies for ``user_id`` from its k most similar users.

    A candidate movie (rated by a neighbor, unseen by the user) scores the
    sum over neighbors of ``similarity * 1{neighbor's score != 0}``.
    ``n < 0`` disables truncation; otherwise the top-n movies are returned.
    """
    top_neighbors = sorted(simi_dict[user_id].items(),
                           key=lambda item: item[1],
                           reverse=True)[:k]
    seen = train_dict[user_id]
    rank = {}
    for neighbor, simi in top_neighbors:
        for movie, score in train_dict[neighbor].items():
            if movie in seen:  # never re-recommend an already-rated movie
                continue
            # count the neighbor's rating as 0/1 rather than its magnitude
            rank[movie] = rank.get(movie, 0) + simi * int(bool(score))
    if n < 0:
        return rank
    return dict(sorted(rank.items(), key=lambda item: item[1], reverse=True)[:n])
## 输入用户id, 基于内容的推荐
## n可以调整推荐结果的长度, 如果小于0, 就不限制
def recommend_by_content(movie_vector_dict:dict,
                         user_preference:dict,
                         train_dict:dict,
                         user_id:int,
                         n:int)->dict:
    """Content-based recommendation for ``user_id``.

    Scores every unseen movie by cosine similarity between the user's
    genre-preference vector and the movie's genre vector, keeping only
    scores >= 0.5.  ``n < 0`` returns all hits; otherwise only the top-n.
    """
    seen = train_dict[user_id]
    rank = {}
    for movie_id in movie_vector_dict:
        if movie_id in seen:
            continue
        score = predict_preference(user_preference, user_id,
                                   movie_vector_dict, movie_id)
        if score >= 0.5:  # similarity threshold
            rank[movie_id] = score
    if n < 0:
        return rank
    return dict(sorted(rank.items(), key=lambda item: item[1], reverse=True)[:n])
## 融合推荐
def recommend(simi_dict:dict,
              movie_vec_dict:dict,
              user_prefer:dict,
              train_dict:dict,
              trainning_data:dict,
              user_id:int, k:int, n:int):
    """Hybrid recommendation: UserCF + content-based, re-ranked by ``filter``.

    The two candidate lists are concatenated (duplicates allowed, as
    before) and re-scored with the user's personal logistic-regression
    training data; the top-n result is also appended to task2_0.csv as
    "user:movie,movie,...".
    """
    cf_rank = recommend_by_user_cf(simi_dict, train_dict, user_id, k, n)
    content_rank = recommend_by_content(movie_vec_dict, user_prefer,
                                        train_dict, user_id, n)
    candidates = list(cf_rank.keys()) + list(content_rank.keys())
    train_x, train_y = trainning_data[user_id]
    res = filter(movie_vec_dict, candidates, train_x, train_y, n)
    # log the recommendation list for offline inspection
    with open("task2_0.csv", "a+") as f:
        f.write(str(user_id))
        f.write(":")
        for movie_id in res.keys():
            f.write(str(movie_id))
            f.write(",")
        f.write("\n")
    return res
# recall, precision, coverage, popularity
def evaluate(simi_dict:dict,
             movie_vec_dict:dict,
             user_prefer:dict,
             train_dict:dict,
             test_dict:dict,
             trainning_data:dict,
             reverse_dict:dict,
             k:int,n:int)->float:
    """Evaluate the recommender over every test user.

    Returns (recall, precision, coverage, popularity):
      recall     = hits / total test ratings
      precision  = hits / total recommendations made
      coverage   = |distinct recommended movies| / |movies in test users' train sets|
      popularity = mean log(1 + #raters) over all recommendations
    """
    hit = 0
    all_reality = 0
    all_prediction = 0
    pop = 0
    recommended = set()
    for u, reality in test_dict.items():
        all_reality += len(reality)
        prediction = recommend(simi_dict, movie_vec_dict, user_prefer,
                               train_dict, trainning_data, u, k, n)
        all_prediction += len(prediction)
        for movie in prediction.keys():
            recommended.add(movie)
            # movies absent from the inverted index count as popularity 0
            pop += math.log(1 + len(reverse_dict.get(movie, [])))
            if movie in reality:
                hit += 1
    all_movies = set()
    for u in test_dict.keys():
        all_movies.update(train_dict[u].keys())
    return (hit / all_reality, hit / all_prediction,
            len(recommended) / len(all_movies), pop / all_prediction)
if __name__ == "__main__":
    # Pipeline: load data -> build caches -> evaluate the hybrid recommender.
    user = 2
    k=20 # number of nearest neighbours ("friends") used by user-based CF
    n=10 # recommend n movies; each test user holds ~4-5 movies on average
    movie_info = get_movie_info(movie_file)
    refer = get_refer(movie_info)
    print(refer)
    # one-hot genre vector per movie
    movie_vec_dict = get_movie_vec_dict(movie_info, refer)
    # print(movie_vec_dict)
    train_record, test_record = get_split_ratings(file_name)
    # user-movie matrices (mean-centered scores)
    train_dict = get_dict(train_record, "train")
    test_dict = get_dict(test_record, "test")
    # per-user genre-preference vector
    user_prefer = get_user_preference(movie_vec_dict, train_dict)
    # print(user_prefer)
    # movie-user inverted index
    reverse_dict = get_movie_user_dict(train_record)
    ## training data for the per-user logistic-regression filter
    trainning_data = get_trainning_data(train_dict, movie_vec_dict)
    user_simi_dict = get_user_simi_dict_0(reverse_dict, train_dict)
    ## smoke test of the recommend function
    # res3 = recommend(user_simi_dict,
    #                  movie_vec_dict,
    #                  user_prefer,
    #                  train_dict,
    #                  trainning_data,
    #                  user,k,n)
    ## smoke test of the filter function
    # train_x = [[1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0],
    #            [0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1]]
    #
    # train_y = [1,0]
    #
    # res = filter(movie_vec_dict, [2,3,4,1,5], train_x, train_y, 10)
    # print(res)
    ## compute the recommender's evaluation metrics
    ########################################################################################
    a,b,c,d = evaluate(user_simi_dict,
                       movie_vec_dict,
                       user_prefer,
                       train_dict,
                       test_dict,
                       trainning_data,
                       reverse_dict,
                       k,n)
    print('n:%d' % n)
    print("recall: %f, precision: %f, coverage: %f, popularity: %f" % (a, b, c, d))
    with open("task2_1.csv", "a+") as f:
        # CSV header is a runtime string (Chinese column names); kept as-is
        f.write("召回率,精确率,覆盖率,流行度,新颖性\n")
        # novelty is reported as 1/popularity
        f.write("%f,%f,%f,%f,%f"%(a,b,c,d,1/d))
    ########################################################################################
    # print(redis.keys())
    # print(redis.delete('movie_vec_dict'))
    # print(redis.delete('train_dict'))
    # print(redis.delete('user_simi_dict'))
    # print(redis.delete('test_dict'))
# (Gitee web-page residue accidentally captured with the source; commented
#  out so the file parses — not part of the program:)
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。