Common Evaluation Metrics for Recommender Systems and Their Implementations

Code example references:
https://github.com/samlobel/RaCT_CF/blob/master/utils/evaluation_functions.py
https://github.com/dawenl/cofactor/blob/master/src/rec_eval.py
https://python.hotexamples.com/examples/bottleneck/-/argpartsort/python-argpartsort-function-examples.html

import numpy as np
import bottleneck as bn
import sys, math
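For quick sanity checks of the functions below, a small toy example can be used. The names X_pred_toy and X_true_toy are purely illustrative (predicted scores and held-out ground truth), not part of the original snippets:

# Toy data for the usage examples below (illustrative only): 3 users x 6 items.
X_pred_toy = np.array([[0.9, 0.1, 0.8, 0.3, 0.2, 0.7],
                       [0.2, 0.9, 0.1, 0.8, 0.3, 0.4],
                       [0.5, 0.4, 0.3, 0.2, 0.9, 0.1]])
X_true_toy = np.array([[1, 0, 1, 0, 0, 0],
                       [0, 1, 0, 0, 1, 0],
                       [0, 0, 1, 0, 1, 0]])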

hit rate implementations:
https://medium.com/@rishabhbhatia315/recommendation-system-evaluation-metrics-3f6739288870

def HR_at_k(X_pred, X_true, k=10):
    batch_users = X_pred.shape[0]
    # Indices of the top-k predicted items per user (unordered).
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (X_true > 0)
    # Number of top-k recommendations that are actually relevant, per user.
    hits_num = np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)
    # Average hits per recommended slot (hits / k), averaged over users.
    return np.mean(hits_num / k)

def hit_rate1(X_pred, X_true, topk=5):
    num_users = len(X_pred)
    # Collect each user's held-out (relevant) items.
    actual = [[] for _ in range(num_users)]
    where = np.where(X_true != 0)
    for idx in range(len(where[0])):
        actual[where[0][idx]].append(where[1][idx])
    # Rank all items by predicted score and keep the top-k per user.
    rank = np.argsort(-X_pred)
    predicted = rank[:, :topk]
    # Count how many recommended items are actually relevant.
    hits = 0
    for i in range(num_users):
        act_set = set(actual[i])
        pred_set = set(predicted[i][:topk])
        hits += len(act_set & pred_set)
    # Average hits per recommended slot, averaged over users.
    return hits / topk / num_users
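A quick sanity check on the toy matrices defined above; both functions average the number of hits over k, so they should agree:

# k must be smaller than the number of items for bn.argpartition.
print(HR_at_k(X_pred_toy, X_true_toy, k=2))
print(hit_rate1(X_pred_toy, X_true_toy, topk=2))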

precision & recall:

def precision_recall_at_k(X_pred, X_true, k=10):
    num_users = len(X_pred)
    # Collect each user's held-out (relevant) items.
    actual = [[] for _ in range(num_users)]
    where = np.where(X_true != 0)
    for idx in range(len(where[0])):
        actual[where[0][idx]].append(where[1][idx])
    # Rank all items by predicted score and keep the top-k per user.
    rank = np.argsort(-X_pred)
    predicted = rank[:, :k]
    sum_recall = 0.0
    sum_precision = 0.0
    true_users = 0
    for i in range(num_users):
        act_set = set(actual[i])
        pred_set = set(predicted[i])
        if len(act_set) != 0:
            # precision@k: hits / k; recall@k: hits / number of relevant items.
            sum_precision += len(act_set & pred_set) / float(k)
            sum_recall += len(act_set & pred_set) / float(len(act_set))
            true_users += 1
    # Only users with at least one held-out item are counted.
    return sum_precision / true_users, sum_recall / true_users
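Called on the toy matrices above, this returns precision@k and recall@k averaged over the users that have at least one held-out item:

prec, rec = precision_recall_at_k(X_pred_toy, X_true_toy, k=2)
print(prec, rec)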


# Vectorized precision & recall (the original version produced very small values
# because the top-k mask and the fp/fn terms were incorrect).
def precision_recall(x_pred, x_true, k=10):
    epsilon = 1e-10
    batch_users = x_pred.shape[0]
    # Binary mask of the top-k predicted items per user.
    pred_idx = bn.argpartition(-x_pred, k, axis=1)
    x_pred_binary = np.zeros_like(x_pred)
    x_pred_binary[np.arange(batch_users)[:, np.newaxis], pred_idx[:, :k]] = 1
    x_true_binary = (x_true > 0).astype(int)
    tp = np.sum(x_pred_binary * x_true_binary, axis=1)        # recommended and relevant
    fp = np.sum(x_pred_binary * (1 - x_true_binary), axis=1)  # recommended but not relevant
    fn = np.sum((1 - x_pred_binary) * x_true_binary, axis=1)  # relevant but not recommended
    p = tp / (tp + fp + epsilon)  # epsilon guards against a zero denominator
    r = tp / (tp + fn + epsilon)
    return p, r
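With the mask built this way, the vectorized version should agree (up to the epsilon term) with precision_recall_at_k on the toy matrices above, since every toy user has at least one held-out item:

p, r = precision_recall(X_pred_toy, X_true_toy, k=2)
print(p.mean(), r.mean())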

Another implementation of recall:

def Recall_at_k(X_pred, X_true, k=10):
    batch_users = X_pred.shape[0]
    # Binary mask of the top-k predicted items per user.
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (X_true > 0)
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    # Normalize by min(k, number of held-out items): a "truncated" recall@k.
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return np.nan_to_num(recall)
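Note that Recall_at_k normalizes by min(k, number of held-out items) and returns a per-user array rather than a mean; on the toy matrices above:

recall_per_user = Recall_at_k(X_pred_toy, X_true_toy, k=2)
print(recall_per_user, recall_per_user.mean())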

NDCG
Note: for the code below, the relevance values should be scaled into the 0-1 range (binary relevance).

def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    """
    Normalized discounted cumulative gain@k for binary relevance.
    ASSUMPTIONS: all the 0's in heldout_batch indicate 0 relevance.
    """
    batch_users = X_pred.shape[0]
    # Indices of the (unsorted) top-k predictions per user.
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk_part[:, :k]]
    # Sort the top-k scores so that idx_topk lists items in ranking order;
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted top-k score.
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # Build the discount template: 1/log2(rank+1) for ranks 1..k.
    tp = 1.0 / np.log2(np.arange(2, k + 2))
    # DCG: sum of discounts at the positions of held-out items within the top-k.
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis], idx_topk] * tp).sum(axis=1)
    # Ideal DCG: the first min(n, k) discounts, where n is the number of held-out items.
    IDCG = np.array([(tp[: min(n, k)]).sum() for n in np.sum(heldout_batch != 0, axis=1)])
    return np.mean(DCG / IDCG)
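On the toy matrices above, a user whose top-k list consists entirely of held-out items gets an NDCG of 1, since with binary relevance the DCG then equals the ideal DCG regardless of the order within the top-k:

print(NDCG_binary_at_k_batch(X_pred_toy, X_true_toy, k=2))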

# Preliminary checks show the code below gives the same results as the implementation above.

def NDCG_binary_at_k_batch1(X_pred, heldout_batch, k=10, input_batch=None, normalize=True):
    '''
    Normalized discounted cumulative gain@k for binary relevance.
    ASSUMPTIONS: all the 0's in heldout_batch indicate 0 relevance.
    If normalize is set to False, then we actually return DCG, not NDCG.
    heldout_batch may be a dense array or a scipy sparse matrix.
    '''
    if input_batch is not None:
        # Mask out items the user has already interacted with.
        X_pred[input_batch.nonzero()] = -np.inf
    batch_users = X_pred.shape[0]
    # Get the indexes of the (unsorted) top-k predictions.
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    # Get only the top-k predicted scores.
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    # Sort the top-k scores so that idx_topk lists items in ranking order.
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # Build the discount template: 1/log2(rank+1) for ranks 1..k.
    tp = 1. / np.log2(np.arange(2, k + 2))
    # Relevance of the ranked top-k items (handle a possibly sparse heldout_batch).
    maybe_sparse_top_results = heldout_batch[np.arange(batch_users)[:, np.newaxis], idx_topk]
    try:
        top_results = maybe_sparse_top_results.toarray()
    except AttributeError:
        top_results = maybe_sparse_top_results
    try:
        number_non_zero = heldout_batch.getnnz(axis=1)
    except AttributeError:
        number_non_zero = ((heldout_batch > 0) * 1).sum(axis=1)
    # Add up the discounts at the positions of held-out items within the top-k.
    DCG = (top_results * tp).sum(axis=1)
    # Ideal DCG: the first min(n, k) discounts, where n is the number of held-out items.
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in number_non_zero])
    # Guard against users with no held-out items at all (IDCG would otherwise be 0).
    IDCG = np.maximum(0.1, IDCG)
    if normalize:
        result = DCG / IDCG
    else:
        result = DCG
    return result.astype(np.float32)
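The second variant can additionally mask out items the user already interacted with via input_batch, and it returns the per-user NDCG instead of the mean. A minimal sketch on the toy matrices above, assuming a dense heldout batch (X_input_toy is a hypothetical matrix of training interactions):

X_input_toy = np.zeros_like(X_true_toy)   # no training interactions to mask in this toy case
scores = X_pred_toy.copy()                # copy, since X_pred is modified in place
per_user_ndcg = NDCG_binary_at_k_batch1(scores, X_true_toy, k=2, input_batch=X_input_toy)
print(per_user_ndcg.mean())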

An sklearn-based implementation (the original snippet came out smaller than the implementations above because the top-k mask was built incorrectly and ndcg_score was fed the binarized predictions rather than the raw scores):

from sklearn.metrics import precision_score, recall_score, ndcg_score

def metrics_sklearn(X_pred, X_true, k=10):
    batch_users = X_pred.shape[0]
    # Binary mask of the top-k predicted items per user.
    pred_idx = bn.argpartition(-X_pred, k, axis=1)
    x_pred_binary = np.zeros_like(X_pred)
    x_pred_binary[np.arange(batch_users)[:, np.newaxis], pred_idx[:, :k]] = 1
    p, r = [], []
    for idx in range(batch_users):
        y_true = (X_true[idx] > 0).astype(np.int8)
        p.append(precision_score(y_true, x_pred_binary[idx], zero_division=0))
        r.append(recall_score(y_true, x_pred_binary[idx], zero_division=0))
    # ndcg_score ranks items by the raw scores and truncates the ranking at k.
    return ndcg_score((X_true > 0).astype(np.int8), X_pred, k=k), np.mean(p), np.mean(r)
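On the toy matrices above, the precision and recall should now line up with the hand-rolled versions, and since ndcg_score ranks by the raw scores truncated at k, it should also be close to NDCG_binary_at_k_batch:

ndcg, prec, rec = metrics_sklearn(X_pred_toy, X_true_toy, k=2)
print(ndcg, prec, rec)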