3 Star 58 Fork 12

wyz/电影推荐系统

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
user_cf.py 9.06 KB
一键复制 编辑 原始数据 按行查看 历史
wyz 提交于 2020-05-27 21:52 . final
import numpy as np
import pickle
import redis
import math
import matplotlib.pyplot as plt
from pprint import pprint
from redis import StrictRedis
import random
# Path of the ratings file read by get_split_ratings (fields separated by "::").
file_name = "./movielens/ratings.dat"
# Connect to redis (local server, database 2); it is used below as a cache for
# pickled intermediate results.
pool = redis.ConnectionPool(host='127.0.0.1', db=2)
# NOTE(review): this rebinds the name `redis` from the imported module to the
# client instance, so the module itself is no longer reachable under that name
# after this line.
# NOTE(review): the callers below feed redis.get() straight into pickle.loads,
# which needs bytes. That only works if decode_responses=True has no effect
# here; presumably redis-py takes decoding options from the explicit
# connection pool rather than from this argument -- TODO confirm.
redis = redis.StrictRedis(connection_pool=pool, decode_responses=True)
###################
###################
#### 正式开始 ######
###################
###################
##
def get_split_ratings(file_name, ratio=0.98, seed=None):
    """Read rating records and randomly split them into train and test sets.

    Every non-blank line of *file_name* is split on "::" and its last field
    is dropped (presumably the timestamp -- verify against the data file), so
    a record looks like ``[user, movie, score]`` with string fields.

    The split is cached in redis under the keys "train_record" and
    "test_record". When both keys exist the cached split is returned and the
    file, *ratio* and *seed* are all ignored.

    Args:
        file_name: path of the ratings file.
        ratio: probability that a record is put into the train set.
        seed: optional seed for a private random generator, making the split
            reproducible; when None the module-level ``random`` state is used.

    Returns:
        A ``(train_record, test_record)`` tuple of record lists.
    """
    if redis.exists("train_record") and redis.exists("test_record"):
        # NOTE(security): pickle.loads can execute arbitrary code; this is
        # only safe while the redis instance holds trusted data.
        return pickle.loads(redis.get("train_record")), pickle.loads(redis.get("test_record"))

    rng = random if seed is None else random.Random(seed)
    train_record = []
    test_record = []
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise become an empty record that
                # breaks the int(record[0]) conversions downstream.
                continue
            record = line.split("::")[:-1]
            if rng.random() < ratio:
                train_record.append(record)
            else:
                test_record.append(record)

    # Cache the split in redis.
    redis.set("train_record", pickle.dumps(train_record))
    redis.set("test_record", pickle.dumps(test_record))
    return train_record, test_record
def get_dict(records: list, type: str):
    """Build a mean-centred ``{user: {movie: score}}`` dict from records.

    The result is cached in redis under ``type + '_dict'`` (for example
    "train_dict" or "test_dict"). When that key exists the cached dict is
    returned and *records* is ignored.

    Args:
        records: iterable of ``[user, movie, score]`` records whose fields
            are convertible with ``int``.
        type: label for the usage of the data ("train" or "test"); it only
            selects the redis key.

    Returns:
        ``{user_id: {movie_id: score - mean score of that user}}``.
    """
    cache_key = type + '_dict'
    # Use the cached dict if redis already has it.
    if redis.exists(cache_key):
        # NOTE(security): pickle.loads can execute arbitrary code; this is
        # only safe while the redis instance holds trusted data.
        return pickle.loads(redis.get(cache_key))

    user_movie = {}
    for record in records:
        user_id = int(record[0])
        movie_id = int(record[1])
        rating = int(record[2])
        user_movie.setdefault(user_id, {})[movie_id] = rating

    # Centre every user's scores around that user's own mean.
    for movies in user_movie.values():
        mean = sum(movies.values()) / len(movies)
        for movie in movies:
            movies[movie] -= mean

    # Cache the result in redis.
    redis.set(cache_key, pickle.dumps(user_movie))
    return user_movie
## 1
def user_jaccord_similarity(i: dict, j: dict) -> float:
    """Return the Jaccard similarity of two users' movie dicts.

    Only the key sets (which movies were rated) are used; the scores are
    ignored.

    Args:
        i: ``{movie: score}`` dict of the first user.
        j: ``{movie: score}`` dict of the second user.

    Returns:
        ``|i & j| / |i | j|``, or 0.0 when both dicts are empty.
    """
    set_i = set(i)
    set_j = set(j)
    union = set_i | set_j
    if not union:
        return 0.0
    return len(set_i & set_j) / len(union)
def user_cos_similarity(i: dict, j: dict) -> float:
    """Return the cosine similarity of two users' movie dicts.

    Only the key sets (which movies were rated) are used; the scores are
    ignored.

    Args:
        i: ``{movie: score}`` dict of the first user.
        j: ``{movie: score}`` dict of the second user.

    Returns:
        ``|i & j| / sqrt(|i| * |j|)``, or 0.0 when either dict is empty.
    """
    set_i = set(i)
    set_j = set(j)
    if not set_i or not set_j:
        return 0.0
    return len(set_i & set_j) / math.sqrt(len(set_i) * len(set_j))
def user_pearson_similarity(i: dict, j: dict) -> float:
    """Return the Pearson-style similarity of two users' movie dicts.

    The scores are used as given (no centring happens here) and only movies
    present in both dicts contribute.

    Args:
        i: ``{movie: score}`` dict of the first user.
        j: ``{movie: score}`` dict of the second user.

    Returns:
        The dot product of the two score vectors over the common movies,
        divided by the product of their norms over those movies; 0.0 when
        there is no common movie or when either norm is zero.
    """
    common = set(i) & set(j)
    if not common:
        return 0.0

    simi = 0
    dot_i = 0
    dot_j = 0
    for movie_id in common:
        score_i = i[movie_id]
        score_j = j[movie_id]
        simi += score_i * score_j
        dot_i += score_i ** 2
        dot_j += score_j ** 2

    # A zero norm would make the ratio undefined.
    if dot_i == 0 or dot_j == 0:
        return 0.0
    return simi / (math.sqrt(dot_i) * math.sqrt(dot_j))
def get_user_simi_dict(train_dict):
    """Build the user-user similarity dict by brute force over all user pairs.

    The result is cached in redis under "user_simi_dict". When that key
    exists the cached dict is returned and *train_dict* is ignored.

    Args:
        train_dict: ``{user: {movie: score}}``.

    Returns:
        ``{user: {similar_user: similarity}}`` with an entry (possibly empty)
        for every user in *train_dict*. Only strictly positive similarities
        are stored, and each one is stored symmetrically for both users.
    """
    from itertools import combinations

    # Use the cached dict if redis already has it.
    if redis.exists("user_simi_dict"):
        # NOTE(security): pickle.loads can execute arbitrary code; this is
        # only safe while the redis instance holds trusted data.
        return pickle.loads(redis.get("user_simi_dict"))

    # Take the user ids from the data instead of assuming ids 1..6040, so a
    # missing id no longer raises KeyError and larger id ranges are covered.
    users = sorted(train_dict)
    simi_dict = {user: {} for user in users}
    # The similarity is symmetric, so compute each unordered pair once.
    for user_i, user_j in combinations(users, 2):
        simi = user_pearson_similarity(train_dict[user_i], train_dict[user_j])
        if simi > 0:
            simi_dict[user_i][user_j] = simi
            simi_dict[user_j][user_i] = simi

    # Cache the result in redis.
    redis.set("user_simi_dict", pickle.dumps(simi_dict))
    return simi_dict
## 2
def get_movie_user_dict(records: list):
    """Build the movie -> users inverted index.

    The index is cached in redis under "reverse_dict". When that key exists
    the cached index is returned and *records* is ignored.

    Args:
        records: iterable of records whose first two fields are the user id
            and the movie id, both convertible with ``int``.

    Returns:
        ``{movie_id: {user_id, ...}}`` with a ``set`` of users per movie.
    """
    # Use the cached index if redis already has it.
    if redis.exists("reverse_dict"):
        return pickle.loads(redis.get('reverse_dict'))

    inverted = {}
    for rec in records:
        uid, mid = int(rec[0]), int(rec[1])
        inverted.setdefault(mid, set()).add(uid)

    # Cache the index in redis.
    redis.set('reverse_dict', pickle.dumps(inverted))
    return inverted
def get_user_simi_dict_0(reverse_dict: dict, train_dict: dict) -> dict:
    """Build the user-user similarity dict from the movie -> users index.

    Every movie adds ``1 / log(1 + number of its users)`` to each ordered
    pair of distinct users who share it, so movies with many users count
    less. Each accumulated value is then divided by
    ``sqrt(len(train_dict[u]) * len(train_dict[v]))``.

    The result is cached in redis under "user_simi_dict_0". When that key
    exists the cached dict is returned and both arguments are ignored.

    Args:
        reverse_dict: ``{movie: {user1, user2, ...}}``.
        train_dict: ``{user: {movie: score}}``; only the number of movies per
            user is used.

    Returns:
        ``{user: {similar_user: similarity}}``.
    """
    # Use the cached dict if redis already has it.
    if redis.exists("user_simi_dict_0"):
        # NOTE(security): pickle.loads can execute arbitrary code; this is
        # only safe while the redis instance holds trusted data.
        return pickle.loads(redis.get("user_simi_dict_0"))

    user_simi_dict_0 = {}
    for users in reverse_dict.values():
        n_raters = len(users)
        # The weight depends only on the movie, so compute it once per movie
        # instead of once per user pair. An empty user set has no pairs, and
        # log(1) == 0 must not be divided by.
        weight = 1 / math.log(1 + n_raters) if n_raters else 0.0
        for u in users:
            row = user_simi_dict_0.setdefault(u, {})
            for v in users:
                if u != v:
                    row[v] = row.get(v, 0) + weight

    # Normalise by the sizes of both users' movie sets.
    for u, simi_users in user_simi_dict_0.items():
        n_u = len(train_dict[u])
        for v in simi_users:
            simi_users[v] /= math.sqrt(n_u * len(train_dict[v]))

    redis.set('user_simi_dict_0', pickle.dumps(user_simi_dict_0))
    return user_simi_dict_0
def recommend(simi_dict: dict, train_dict: dict, user: int, k: int, n: int) -> dict:
    """Recommend movies to *user* from the k most similar users.

    A candidate movie's score is the sum of the similarities of those
    neighbours (among the top k) whose train data contains it. Movies already
    in the user's own train data are never recommended.

    Args:
        simi_dict: ``{user: {similar_user: similarity}}``.
        train_dict: ``{user: {movie: score}}``.
        user: id of the user to recommend for. A user missing from
            *simi_dict* gets no recommendations; a user missing from
            *train_dict* is treated as having seen nothing.
        k: number of most similar users to consult.
        n: maximum number of recommendations; a negative value means
            unlimited.

    Returns:
        ``{movie: score}``. When ``n >= 0`` it holds at most *n* entries,
        ordered by descending score.
    """
    seen = train_dict.get(user, {})
    neighbours = sorted(
        simi_dict.get(user, {}).items(), key=lambda x: x[1], reverse=True
    )[:k]

    rank = {}
    for simi_user, simi in neighbours:
        for movie in train_dict[simi_user]:  # movies of the similar user
            if movie in seen:  # skip movies the user has already seen
                continue
            # Every movie the neighbour has counts with weight 1. Testing the
            # truthiness of the stored score would drop movies whose
            # mean-centred score happens to be exactly 0.
            rank[movie] = rank.get(movie, 0) + simi

    # No limit on the length of the result.
    if n < 0:
        return rank
    # Keep only the n best-scored movies.
    return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[:n])
def evaluate(simi_dict: dict, reverse_dict: dict, train_dict: dict, test_dict: dict, k: int, n: int) -> tuple:
    """Evaluate the recommender on the test data.

    Args:
        simi_dict: ``{user: {similar_user: similarity}}``.
        reverse_dict: ``{movie: {user, ...}}``; the number of users of a movie
            is used as its popularity.
        train_dict: ``{user: {movie: score}}`` of the train data.
        test_dict: ``{user: {movie: score}}`` of the test data.
        k: number of most similar users consulted per recommendation.
        n: maximum number of recommendations per user.

    Returns:
        A ``(recall, precision, coverage, popularity)`` tuple:
        recall is hits / number of test entries; precision is hits / number
        of recommendations; coverage is the number of distinct recommended
        movies / the number of distinct train movies of the test users;
        popularity is the mean of ``log(1 + number of users)`` over all
        recommendations. A ratio whose denominator is zero is reported as 0.0.
    """
    hit = 0
    all_reality = 0
    all_prediction = 0
    pop = 0
    recommended = set()
    all_movies = set()
    for u, reality in test_dict.items():
        all_reality += len(reality)  # denominator of recall
        prediction = recommend(simi_dict, train_dict, u, k, n)
        all_prediction += len(prediction)  # denominator of precision
        for i in prediction:
            recommended.add(i)  # numerator of coverage
            pop += math.log(1 + len(reverse_dict[i]))  # popularity
            if i in reality:
                hit += 1
        # Denominator of coverage; a test user may be absent from the train
        # data because the split is random.
        all_movies.update(train_dict.get(u, ()))

    recall = hit / all_reality if all_reality else 0.0
    precision = hit / all_prediction if all_prediction else 0.0
    coverage = len(recommended) / len(all_movies) if all_movies else 0.0
    popularity = pop / all_prediction if all_prediction else 0.0
    return recall, precision, coverage, popularity
if __name__ == "__main__":
    # Recommend n movies per user (original author's note: the test data
    # holds about 4-5 movies per user on average).
    n = 10

    train_record, test_record = get_split_ratings(file_name)
    train_dict = get_dict(train_record, "train")
    test_dict = get_dict(test_record, "test")
    reverse_dict = get_movie_user_dict(train_record)

    # Brute-force way:
    # user_simi_dict = get_user_simi_dict(train_dict)
    # Inverted-index way:
    user_simi_dict = get_user_simi_dict_0(reverse_dict, train_dict)

    # k is the number of nearest neighbours to consult. As written, the range
    # yields the single value 30; widen it to sweep over several k.
    for k in range(30, 35, 10):
        a, b, c, d = evaluate(user_simi_dict, reverse_dict, train_dict, test_dict, k, n)
        print('k:%d' % k)
        print('n:%d' % n)
        print("recall: %f, precision: %f, coverage: %f, popularity: %f" % (a, b, c, d))

    # Every intermediate result is cached in redis. To force a recomputation,
    # delete the corresponding keys, for example:
    # print(redis.keys())
    # print(redis.delete('train_dict'))
    # print(redis.delete('user_simi_dict'))
    # print(redis.delete('test_dict'))
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/earth_wyz/movieRS.git
git@gitee.com:earth_wyz/movieRS.git
earth_wyz
movieRS
电影推荐系统
master

搜索帮助

23e8dbc6 1850385 7e0993f3 1850385