快捷搜索: 王者荣耀 脱发

Python做一个最简单的推荐系统

通过各自电影评分的情况,计算用户之间的相似度(欧氏距离、皮尔逊相似度)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
@Time    : 2018/9/15 8:19
@Author  : Negen
@Site    : 
@File    : recommend.py
@Software: PyCharm
"""

from math import sqrt
from colorama import Fore
import numpy as np
# Movie-rating dictionary: user name -> {movie id -> score}.
# Six movies: a, b, c, d, e, f
# Seven users: Cathy, Sophie, Susie, Antonio, Marco, Jack, Leo
# A user who has not rated a movie simply has no entry for it.
critics = {
    "Cathy": {"a": 2.5, "b": 3.5, "c": 3, "d": 3.5, "e": 2.5, "f": 3},
    "Sophie": {"a": 3, "b": 3.5, "c": 1.5, "d": 5, "e": 1.5, "f": 3},
    "Susie": {"a": 2.5, "b": 3, "d": 3.5, "f": 4},
    "Antonio": {"b": 3.5, "c": 3, "d": 4, "e": 2.5, "f": 4.5},
    "Marco": {"a": 3, "b": 4, "c": 2, "d": 3, "e": 2, "f": 3},
    "Jack": {"a": 3, "b": 4, "d": 5, "e": 3.5, "f": 3},
    "Leo": {"b": 4.5, "d": 4, "e": 1.0},
}


def sim_distance(prefs, person1, person2):
    """
    Distance-based similarity score between two people (Euclidean distance).

    Only items rated by both people are compared.

    :param prefs: mapping of person -> {item -> rating}
    :param person1: key of the first person in ``prefs``
    :param person2: key of the second person in ``prefs``
    :return: 0 when the two share no rated items, otherwise
             ``1 / (1 + d)`` where ``d`` is the Euclidean distance between
             their ratings over the shared items
    """
    shared_items = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared_items:
        return 0

    ratings_a = prefs[person1]
    ratings_b = prefs[person2]
    squared_total = 0
    for item in shared_items:
        gap = ratings_a[item] - ratings_b[item]
        squared_total += gap ** 2

    # Adding 1 keeps the denominator non-zero when the ratings are identical.
    return 1 / (1 + sqrt(squared_total))


# Euclidean-distance-based similarity between two users; the user names are
# string keys of ``critics``.
sim_dis = sim_distance(critics, "Cathy", "Antonio")
print(Fore.RED, "欧氏距离:", sim_dis, Fore.RESET)


def sim_pearson(prefs, person1, person2):
    """
    Compute the Pearson correlation coefficient between two people's ratings.

    Only items rated by both people take part in the calculation.

    :param prefs: mapping of person -> {item -> rating}
    :param person1: key of the first person in ``prefs``
    :param person2: key of the second person in ``prefs``
    :return: the Pearson correlation over the shared items, or 0 when the two
             people share no items or when either person's shared ratings
             have no variance (the coefficient is undefined in that case)
    """
    shared_items = [item for item in prefs[person1] if item in prefs[person2]]
    n = len(shared_items)
    # No overlap gives no evidence of similarity; returning 1 here would rank
    # complete strangers as perfect matches.
    if n == 0:
        return 0

    xs = [prefs[person1][item] for item in shared_items]
    ys = [prefs[person2][item] for item in shared_items]

    sum_x = sum(xs)
    sum_y = sum(ys)
    sum_x_sq = sum(x * x for x in xs)
    sum_y_sq = sum(y * y for y in ys)
    product_sum = sum(x * y for x, y in zip(xs, ys))

    # Numerator: n times the covariance of the two rating lists.
    numerator = product_sum - (sum_x * sum_y / n)
    # Product of n-times-variance terms; its square root is the denominator.
    spread = (sum_x_sq - sum_x ** 2 / n) * (sum_y_sq - sum_y ** 2 / n)

    # Floating-point rounding can push a zero variance slightly negative,
    # which would make sqrt() raise ValueError; treat that like zero variance.
    if spread <= 0:
        return 0
    return numerator / sqrt(spread)


# Pearson correlation between two users; the user names are string keys of
# ``critics``.
sim_pear = sim_pearson(critics, "Cathy", "Sophie")
print(Fore.RED, "皮尔逊相关度:", sim_pear, Fore.RESET)

def test():
    """
    Manually cross-check the Pearson correlation from sample statistics.

    Prints, in order: the sample covariance of two hard-coded rating lists,
    the product of their sample standard deviations (``ddof=1``), and the
    ratio of the two, which is the Pearson correlation.

    :return: None
    """
    person_1 = [2.5, 3.5, 3, 3.5, 2.5, 3]
    person_2 = [3, 3.5, 1.5, 5, 1.5, 3]
    # Each list must be centred on its own mean.
    mean_1 = sum(person_1) / len(person_1)
    mean_2 = sum(person_2) / len(person_2)
    # Sample covariance (n - 1 denominator, matching ddof=1 below).
    cov = sum(
        (x - mean_1) * (y - mean_2) for x, y in zip(person_1, person_2)
    ) / (len(person_1) - 1)
    print("cov:", cov)
    # Product of the two sample standard deviations.
    std = np.std(person_1, ddof=1) * np.std(person_2, ddof=1)
    print(std)
    pearson = cov / std
    print(pearson)
# test()


def topmatches(prefs, person, n, similarity = sim_pearson) -> list:
    """
    Rank every other person in ``prefs`` by similarity to ``person``.

    :param prefs: mapping of person -> {item -> rating}
    :param person: key of the person to find matches for
    :param n: maximum number of matches to return
    :param similarity: callable ``(prefs, person, other) -> score``
    :return: up to ``n`` ``(score, other)`` tuples, highest first
    """
    scored = []
    for other in prefs:
        if person != other:
            scored.append((similarity(prefs, person, other), other))
    # Ascending sort followed by a reversed copy gives descending order.
    descending = sorted(scored)[::-1]
    return descending[:n]


# The five users most similar to Susie by the default (Pearson) similarity;
# the user name is a string key of ``critics``.
topList_pear = topmatches(critics, "Susie", 5)

print(topList_pear)
经验分享 程序员 微信小程序 职场和发展