Common Evaluation Metrics for Recommender Systems and Their Implementations

Code example references:
https://github.com/samlobel/RaCT_CF/blob/master/utils/evaluation_functions.py
https://github.com/dawenl/cofactor/blob/master/src/rec_eval.py
https://python.hotexamples.com/examples/bottleneck/-/argpartsort/python-argpartsort-function-examples.html

import numpy as np
import bottleneck as bn
import sys, math
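For quick sanity checks of the functions below, a small toy example can be used. The names X_pred_toy and X_true_toy are purely illustrative (predicted scores and held-out ground truth), not part of the original snippets:

# Toy data for the usage examples below (illustrative only): 3 users x 6 items.
X_pred_toy = np.array([[0.9, 0.1, 0.8, 0.3, 0.2, 0.7],
                       [0.2, 0.9, 0.1, 0.8, 0.3, 0.4],
                       [0.5, 0.4, 0.3, 0.2, 0.9, 0.1]])
X_true_toy = np.array([[1, 0, 1, 0, 0, 0],
                       [0, 1, 0, 0, 1, 0],
                       [0, 0, 1, 0, 1, 0]])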

hit rate implementations:
https://medium.com/@rishabhbhatia315/recommendation-system-evaluation-metrics-3f6739288870

def HR_at_k(X_pred, X_true, k=10):
    batch_users = X_pred.shape[0]
    # Indices of the top-k predicted items per user (unordered).
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (X_true > 0)
    # Number of top-k recommendations that are actually relevant, per user.
    hits_num = np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)
    # Average hits per recommended slot (hits / k), averaged over users.
    return np.mean(hits_num / k)

def hit_rate1(X_pred, X_true, topk=5):
    num_users = len(X_pred)
    # Collect each user's held-out (relevant) items.
    actual = [[] for _ in range(num_users)]
    where = np.where(X_true != 0)
    for idx in range(len(where[0])):
        actual[where[0][idx]].append(where[1][idx])
    # Rank all items by predicted score and keep the top-k per user.
    rank = np.argsort(-X_pred)
    predicted = rank[:, :topk]
    # Count how many recommended items are actually relevant.
    hits = 0
    for i in range(num_users):
        act_set = set(actual[i])
        pred_set = set(predicted[i][:topk])
        hits += len(act_set & pred_set)
    # Average hits per recommended slot, averaged over users.
    return hits / topk / num_users
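A quick sanity check on the toy matrices defined above; both functions average the number of hits over k, so they should agree:

# k must be smaller than the number of items for bn.argpartition.
print(HR_at_k(X_pred_toy, X_true_toy, k=2))
print(hit_rate1(X_pred_toy, X_true_toy, topk=2))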

precision & recall:

def precision_recall_at_k(X_pred, X_true, k=10):
    num_users = len(X_pred)
    # Collect each user's held-out (relevant) items.
    actual = [[] for _ in range(num_users)]
    where = np.where(X_true != 0)
    for idx in range(len(where[0])):
        actual[where[0][idx]].append(where[1][idx])
    # Rank all items by predicted score and keep the top-k per user.
    rank = np.argsort(-X_pred)
    predicted = rank[:, :k]
    sum_recall = 0.0
    sum_precision = 0.0
    true_users = 0
    for i in range(num_users):
        act_set = set(actual[i])
        pred_set = set(predicted[i])
        if len(act_set) != 0:
            # precision@k: hits / k; recall@k: hits / number of relevant items.
            sum_precision += len(act_set & pred_set) / float(k)
            sum_recall += len(act_set & pred_set) / float(len(act_set))
            true_users += 1
    # Only users with at least one held-out item are counted.
    return sum_precision / true_users, sum_recall / true_users
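Called on the toy matrices above, this returns precision@k and recall@k averaged over the users that have at least one held-out item:

prec, rec = precision_recall_at_k(X_pred_toy, X_true_toy, k=2)
print(prec, rec)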


# Vectorized precision & recall (the original version produced very small values
# because the top-k mask and the fp/fn terms were incorrect).
def precision_recall(x_pred, x_true, k=10):
    epsilon = 1e-10
    batch_users = x_pred.shape[0]
    # Binary mask of the top-k predicted items per user.
    pred_idx = bn.argpartition(-x_pred, k, axis=1)
    x_pred_binary = np.zeros_like(x_pred)
    x_pred_binary[np.arange(batch_users)[:, np.newaxis], pred_idx[:, :k]] = 1
    x_true_binary = (x_true > 0).astype(int)
    tp = np.sum(x_pred_binary * x_true_binary, axis=1)        # recommended and relevant
    fp = np.sum(x_pred_binary * (1 - x_true_binary), axis=1)  # recommended but not relevant
    fn = np.sum((1 - x_pred_binary) * x_true_binary, axis=1)  # relevant but not recommended
    p = tp / (tp + fp + epsilon)  # epsilon guards against a zero denominator
    r = tp / (tp + fn + epsilon)
    return p, r
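With the mask built this way, the vectorized version should agree (up to the epsilon term) with precision_recall_at_k on the toy matrices above, since every toy user has at least one held-out item:

p, r = precision_recall(X_pred_toy, X_true_toy, k=2)
print(p.mean(), r.mean())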

Another implementation of recall:

def Recall_at_k(X_pred, X_true, k=10):
    batch_users = X_pred.shape[0]
    # Binary mask of the top-k predicted items per user.
    idx = bn.argpartition(-X_pred, k, axis=1)
    X_pred_binary = np.zeros_like(X_pred, dtype=bool)
    X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    X_true_binary = (X_true > 0)
    tmp = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    # Normalize by min(k, number of held-out items): a "truncated" recall@k.
    recall = tmp / np.minimum(k, X_true_binary.sum(axis=1))
    return np.nan_to_num(recall)
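Note that Recall_at_k normalizes by min(k, number of held-out items) and returns a per-user array rather than a mean; on the toy matrices above:

recall_per_user = Recall_at_k(X_pred_toy, X_true_toy, k=2)
print(recall_per_user, recall_per_user.mean())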

NDCG
Note: for the code below, the relevance values should be scaled into the 0-1 range (binary relevance).

def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=10):
    """
    Normalized discounted cumulative gain@k for binary relevance.
    ASSUMPTIONS: all the 0's in heldout_batch indicate 0 relevance.
    """
    batch_users = X_pred.shape[0]
    # Indices of the (unsorted) top-k predictions per user.
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk_part[:, :k]]
    # Sort the top-k scores so that idx_topk lists items in ranking order;
    # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted top-k score.
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # Build the discount template: 1/log2(rank+1) for ranks 1..k.
    tp = 1.0 / np.log2(np.arange(2, k + 2))
    # DCG: sum of discounts at the positions of held-out items within the top-k.
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis], idx_topk] * tp).sum(axis=1)
    # Ideal DCG: the first min(n, k) discounts, where n is the number of held-out items.
    IDCG = np.array([(tp[: min(n, k)]).sum() for n in np.sum(heldout_batch != 0, axis=1)])
    return np.mean(DCG / IDCG)
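On the toy matrices above, a user whose top-k list consists entirely of held-out items gets an NDCG of 1, since with binary relevance the DCG then equals the ideal DCG regardless of the order within the top-k:

print(NDCG_binary_at_k_batch(X_pred_toy, X_true_toy, k=2))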

# Preliminary checks show the code below gives the same results as the implementation above.

def NDCG_binary_at_k_batch1(X_pred, heldout_batch, k=10, input_batch=None, normalize=True):
    '''
    Normalized discounted cumulative gain@k for binary relevance.
    ASSUMPTIONS: all the 0's in heldout_batch indicate 0 relevance.
    If normalize is set to False, then we actually return DCG, not NDCG.
    heldout_batch may be a dense array or a scipy sparse matrix.
    '''
    if input_batch is not None:
        # Mask out items the user has already interacted with.
        X_pred[input_batch.nonzero()] = -np.inf
    batch_users = X_pred.shape[0]
    # Get the indexes of the (unsorted) top-k predictions.
    idx_topk_part = bn.argpartition(-X_pred, k, axis=1)
    # Get only the top-k predicted scores.
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    # Sort the top-k scores so that idx_topk lists items in ranking order.
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # Build the discount template: 1/log2(rank+1) for ranks 1..k.
    tp = 1. / np.log2(np.arange(2, k + 2))
    # Relevance of the ranked top-k items (handle a possibly sparse heldout_batch).
    maybe_sparse_top_results = heldout_batch[np.arange(batch_users)[:, np.newaxis], idx_topk]
    try:
        top_results = maybe_sparse_top_results.toarray()
    except AttributeError:
        top_results = maybe_sparse_top_results
    try:
        number_non_zero = heldout_batch.getnnz(axis=1)
    except AttributeError:
        number_non_zero = ((heldout_batch > 0) * 1).sum(axis=1)
    # Add up the discounts at the positions of held-out items within the top-k.
    DCG = (top_results * tp).sum(axis=1)
    # Ideal DCG: the first min(n, k) discounts, where n is the number of held-out items.
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in number_non_zero])
    # Guard against users with no held-out items at all (IDCG would otherwise be 0).
    IDCG = np.maximum(0.1, IDCG)
    if normalize:
        result = DCG / IDCG
    else:
        result = DCG
    return result.astype(np.float32)
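The second variant can additionally mask out items the user already interacted with via input_batch, and it returns the per-user NDCG instead of the mean. A minimal sketch on the toy matrices above, assuming a dense heldout batch (X_input_toy is a hypothetical matrix of training interactions):

X_input_toy = np.zeros_like(X_true_toy)   # no training interactions to mask in this toy case
scores = X_pred_toy.copy()                # copy, since X_pred is modified in place
per_user_ndcg = NDCG_binary_at_k_batch1(scores, X_true_toy, k=2, input_batch=X_input_toy)
print(per_user_ndcg.mean())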

An sklearn-based implementation (the original snippet came out smaller than the implementations above because the top-k mask was built incorrectly and ndcg_score was fed the binarized predictions rather than the raw scores):

from sklearn.metrics import precision_score, recall_score, ndcg_score

def metrics_sklearn(X_pred, X_true, k=10):
    batch_users = X_pred.shape[0]
    # Binary mask of the top-k predicted items per user.
    pred_idx = bn.argpartition(-X_pred, k, axis=1)
    x_pred_binary = np.zeros_like(X_pred)
    x_pred_binary[np.arange(batch_users)[:, np.newaxis], pred_idx[:, :k]] = 1
    p, r = [], []
    for idx in range(batch_users):
        y_true = (X_true[idx] > 0).astype(np.int8)
        p.append(precision_score(y_true, x_pred_binary[idx], zero_division=0))
        r.append(recall_score(y_true, x_pred_binary[idx], zero_division=0))
    # ndcg_score ranks items by the raw scores and truncates the ranking at k.
    return ndcg_score((X_true > 0).astype(np.int8), X_pred, k=k), np.mean(p), np.mean(r)
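On the toy matrices above, the precision and recall should now line up with the hand-rolled versions, and since ndcg_score ranks by the raw scores truncated at k, it should also be close to NDCG_binary_at_k_batch:

ndcg, prec, rec = metrics_sklearn(X_pred_toy, X_true_toy, k=2)
print(ndcg, prec, rec)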