python实现协同过滤算法
实现基于MapReduce协同过滤,需要三个阶段,如下所示
第一个MapReduce:通过ui矩阵得到归一化后的ui矩阵
map阶段:以i为key进行分区排序,key相同(hash值相同)的记录会被分到同一个partition中。
#!/usr/local/bin/python
# Job 1 mapper: re-key the raw ratings by item so that every rating of one
# item reaches the same reducer.
import sys


def rekey_by_item(lines):
    """Turn ``user,item,score`` lines into tab-separated ``item, user, score``.

    Args:
        lines: iterable of text lines, each holding three comma-separated
            fields (user, item, score).

    Yields:
        One string per non-blank input line with the fields reordered to
        item, user, score and joined by tabs (no trailing newline).

    Raises:
        ValueError: if a non-blank line does not have exactly three fields.
    """
    for line in lines:
        line = line.strip()
        if not line:
            # A stray blank line (e.g. at end of file) carries no rating.
            continue
        user, item, score = line.split(',')
        yield "%s\t%s\t%s" % (item, user, score)


def main():
    """Stream stdin through ``rekey_by_item`` to stdout."""
    # sys.stdout.write works on both Python 2 and Python 3.
    for record in rekey_by_item(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
reduce阶段:利用同一个i被所有用户打过的分(score),对其进行归一化操作
#!/usr/local/bin/python
# Job 1 reducer: L2-normalise the score vector of every item.
import math
import sys
from itertools import groupby
from operator import itemgetter


def parse_item_records(lines):
    """Parse tab-separated ``item, user, score`` lines.

    Args:
        lines: iterable of text lines.

    Yields:
        ``(item, user, score)`` tuples with ``score`` converted to float.
        Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or
            the score is not a number.
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        item, user, score = line.split('\t')
        yield item, user, float(score)


def normalize_item_scores(lines):
    """Divide each score by the Euclidean norm of its item's score vector.

    Consecutive lines with the same item form one group, so the input must
    arrive grouped by item (the original loop relied on the same ordering).

    Args:
        lines: iterable of tab-separated ``item, user, score`` lines.

    Yields:
        Tab-separated ``user, item, normalised_score`` strings (no trailing
        newline), in input order.
    """
    for item, group in groupby(parse_item_records(lines), key=itemgetter(0)):
        user_scores = [(user, score) for _, user, score in group]
        # The norm is computed from scratch for every item, including the
        # last one, so no state leaks from the previous group.
        norm = math.sqrt(sum(score * score for _, score in user_scores))
        if norm == 0.0:
            # An all-zero item has no direction to normalise; skip it rather
            # than divide by zero.
            continue
        for user, score in user_scores:
            yield "%s\t%s\t%s" % (user, item, score / norm)


def main():
    """Stream stdin through ``normalize_item_scores`` to stdout."""
    for record in normalize_item_scores(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
第二个MapReduce:
map阶段:为了得到ii 矩阵必须以u为key,得到(u,i,s)
#!/usr/local/bin/python
# Job 2 mapper: pass the normalised ratings through unchanged, keyed by user
# (the first field), so that one user's ratings reach the same reducer.
import sys


def passthrough_by_user(lines):
    """Validate and re-emit tab-separated ``user, item, score`` lines.

    Args:
        lines: iterable of text lines with three tab-separated fields.

    Yields:
        The same three fields joined by tabs (no trailing newline), one
        string per non-blank input line.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields.
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        user, item, score = line.split('\t')
        yield "%s\t%s\t%s" % (user, item, score)


def main():
    """Stream stdin through ``passthrough_by_user`` to stdout."""
    for record in passthrough_by_user(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
reduce阶段:对同一个用户,计算所有打过分的item之间归一化后的分数的乘积,得到 ii 矩阵
#!/usr/local/bin/python
# Job 2 reducer: for every user, multiply the normalised scores of each pair
# of items that user rated; these products are the terms of the item-item
# similarity matrix.
import sys
from itertools import combinations
from itertools import groupby
from operator import itemgetter


def parse_user_records(lines):
    """Parse tab-separated ``user, item, score`` lines.

    Args:
        lines: iterable of text lines.

    Yields:
        ``(user, item, score)`` tuples with ``score`` converted to float.
        Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields or
            the score is not a number.
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        user, item, score = line.split('\t')
        yield user, item, float(score)


def item_pair_products(lines):
    """Emit the score product for every pair of items rated by one user.

    Consecutive lines with the same user form one group, so the input must
    arrive grouped by user (the original loop relied on the same ordering).
    Each unordered pair is emitted in both directions so the resulting
    matrix is symmetric.

    Args:
        lines: iterable of tab-separated ``user, item, score`` lines.

    Yields:
        Tab-separated ``item_a, item_b, product`` strings (no trailing
        newline); a user with fewer than two ratings yields nothing.
    """
    for _, group in groupby(parse_user_records(lines), key=itemgetter(0)):
        item_scores = [(item, score) for _, item, score in group]
        # combinations(..., 2) visits index pairs (i, j) with i < j in the
        # same order as the original nested loops.
        for (item_a, score_a), (item_b, score_b) in combinations(item_scores, 2):
            product = score_a * score_b
            yield "%s\t%s\t%s" % (item_a, item_b, product)
            yield "%s\t%s\t%s" % (item_b, item_a, product)


def main():
    """Stream stdin through ``item_pair_products`` to stdout."""
    for record in item_pair_products(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
第三个MapReduce:
map阶段:以item_a_item_b为key,调用map函数
#!/usr/local/bin/python
# Job 3 mapper: fuse the two item ids into a single key so that all partial
# products of one item pair reach the same reducer.
import sys


def key_by_item_pair(lines):
    """Turn tab-separated ``item_a, item_b, score`` lines into ``pair, score``.

    The key is the two item ids joined by an underscore.
    NOTE(review): the job 3 reducer splits the key on underscores again, so
    item ids that themselves contain an underscore would break it; confirm
    that ids never do.

    Args:
        lines: iterable of text lines with three tab-separated fields.

    Yields:
        ``item_a_item_b`` and the score joined by a tab (no trailing
        newline), one string per non-blank input line.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields.
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        item_a, item_b, score = line.split('\t')
        yield "%s_%s\t%s" % (item_a, item_b, score)


def main():
    """Stream stdin through ``key_by_item_pair`` to stdout."""
    for record in key_by_item_pair(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()
reduce阶段: 对相同的key进行聚合,对value值score进行求和,就得到item与item之间的相似度
#!/usr/local/bin/python
# Job 3 reducer: add up the partial products of every item pair; the total is
# the similarity between the two items.
import sys
from itertools import groupby
from operator import itemgetter


def parse_pair_records(lines):
    """Parse tab-separated ``item_pair, score`` lines.

    Args:
        lines: iterable of text lines.

    Yields:
        ``(item_pair, score)`` tuples with ``score`` converted to float.
        Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields or
            the score is not a number.
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        pair, score = line.split('\t')
        yield pair, float(score)


def sum_pair_similarities(lines):
    """Sum the scores of each item pair.

    Consecutive lines with the same key form one group, so the input must
    arrive grouped by key (the original loop relied on the same ordering).

    Args:
        lines: iterable of ``item_a_item_b`` and score lines, tab-separated.

    Yields:
        Tab-separated ``item_a, item_b, total`` strings (no trailing
        newline), one per distinct consecutive key. Empty input yields
        nothing.

    Raises:
        ValueError: if a key does not split into exactly two parts on
            underscores (e.g. an item id that itself contains one).
    """
    for pair, group in groupby(parse_pair_records(lines), key=itemgetter(0)):
        item_a, item_b = pair.split('_')
        # Emit the accumulated total, not the builtin ``sum`` function.
        total = sum((score for _, score in group), 0.0)
        yield "%s\t%s\t%s" % (item_a, item_b, total)


def main():
    """Stream stdin through ``sum_pair_similarities`` to stdout."""
    for record in sum_pair_similarities(sys.stdin):
        sys.stdout.write(record + "\n")


if __name__ == "__main__":
    main()