介绍

今日从AINLP的公众号上看到一个推送，介绍了一个短文本聚类工具，刚好公司项目用到了短文本聚类，就点进去看了看。项目不复杂，使用了Jaccard相似度度量方法，当然也可以换成其他度量方法。其特点是内存友好，对大批量数据的聚类有帮助。

代码解析

cluster.py

cluster.py文件的main方法是工具的主入口

def main():
    """Entry point of the tool: cluster the lines of ``args.infile`` into files.

    The input file is streamed line by line. Each line is tokenized, compared
    (via ``jaccard``) against lines sampled from existing cluster files, and
    appended either to a matching cluster file or to a newly created
    ``tmp<index>`` file under ``args.output``. Finally the ``tmp*`` files are
    renamed to zero-padded indices ordered by descending line count.

    This is the original upstream revision that the article analyses; the
    ``# ==`` comments are the article author's annotations.
    """
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    # == The two calls above check that the input file exists and make sure
    # == the output directory exists.
    if args.name_len_update:
        # Width of the numeric file name: digit count of the line count, plus one.
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    # == Remove files in the output directory whose names match the cluster
    # == output file-name pattern.
    clean_dir(args.output, args.name_len)
    # end preliminary work

    # word -> list of cluster file names that are looked up through that word
    p_bucket = defaultdict(list)
    save_idx = 0
    # Format template producing a zero-padded integer of width args.name_len.
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    # == Load the stop words (an empty list when the stop-word file is missing).
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
    # load tokenizer
    # == Build the segmenter; per the article author there are two tokenizers,
    # == covering Chinese and English.
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        # Iterate over every line of the input file.
        line = line.rstrip()
        is_match = False
        # Tokenize and remove stop words.
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            # == p_bucket is the word -> clusters lookup table; the lists it
            # == holds contain cluster file names.
            w_bucket = p_bucket[wd]
            # is_match = False
            for bucket in w_bucket:
                # == If a cluster is registered under this word, sample lines
                # == already stored in that cluster's file. Per the article
                # == author: fewer than five lines -> take all, otherwise sample
                # == five (the count passed here is args.sample_number).
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                # == Tokenize the selected samples and remove stop words.
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # == If the Jaccard coefficient between the incoming line and
                # == every sampled line exceeds the threshold, the line joins
                # == this cluster.
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line+'\n')
                    break
            # == This looks like a weakness: because the decision below is made
            # == here, inside the word loop, sentences made of the same words in
            # == a different order cannot be clustered together.
            # NOTE(review): only the first word of an unmatched line is ever
            # registered in p_bucket (the branch below breaks out of the word
            # loop). After a match the word loop keeps running, so the same line
            # may be appended again while later words are processed.

            if not is_match:
                bucket_name = ('tmp' + id_name).format(save_idx)
                w_bucket.append(bucket_name)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line+'\n')
                save_idx += 1
                break

    infile.close()

    # sort and rename file
    # Only the temporary cluster files written above (prefix 'tmp') are considered.
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    # Largest cluster first; the rank becomes the final zero-padded file name.
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')

评价

优点：内存友好

缺点：项目有个bug，无法将由相同单词按不同顺序组成的句子归到同一类中。建议：样本加入已有的聚类簇之后不要直接break，只是不再进行类别判定或新建类别，同时把该类加入该样本每个单词的p_bucket列表；另外需要调整新建类别的逻辑——如果第一个单词匹配不通过，先把该单词放入一个tmp词汇列表，再对下一个单词进行判定，直到所有单词都匹配不通过，才建立新的类别，并把tmp词汇列表中所有单词的p_bucket都加上这个新类别。

项目地址

原始项目地址（已经提了个issue，作者可能会修复这个问题）：

https://github.com/RandyPen/TextCluster

有bug的项目fork了一份：

https://github.com/612yese/TextCluster

追加一

def main():
    """Entry point of the tool (upstream's revised version, "Addendum 1").

    Streams ``args.infile`` line by line, tokenizes each line and compares it
    (via ``jaccard``) with lines sampled from the cluster files registered under
    each of its words. A matching line is appended to that cluster file and the
    cluster is registered under every word of the line; a line that matches no
    cluster starts a new ``tmp<index>`` file, which is registered under all of
    its words. Finally the ``tmp*`` files are renamed to zero-padded indices
    ordered by descending line count.

    The ``addition begin/end`` markers delimit the lines upstream added relative
    to the earlier revision.
    """
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        # Width of the numeric file name: digit count of the line count, plus one.
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    # word -> list of cluster file names registered under that word
    p_bucket = defaultdict(list)
    save_idx = 0
    # Format template producing a zero-padded integer of width args.name_len.
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    # NOTE(review): the file is closed only by the explicit close() after the
    # loop, so it stays open if the loop raises; a with-statement would avoid that.
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            ################# addition begin ####################
            # Stop scanning further words once the line has joined a cluster.
            if is_match:
                break
            ################# addition end ####################
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                # The line joins the cluster only if it exceeds the threshold
                # against every sampled line.
                if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line+'\n')
                    ################# addition begin ####################
                    # Register the matched cluster under every word of this line.
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    ################# addition end ####################
                    break
        ################ changed in the addendum: indentation changed ##################
        # The new-cluster branch now runs after the word loop, i.e. only when no
        # word of the line led to a matching cluster.
        # NOTE(review): when seg_list is empty the word loop never runs, so such
        # a line always starts a new cluster file that is registered under no word.
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line+'\n')
            ################# addition begin ####################
            # Register the new cluster under every word of this line.
            # NOTE(review): unlike the matched branch there is no membership
            # check, so a word repeated in seg_list gets the same cluster
            # appended more than once.
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            ################# addition end ####################
            save_idx += 1

    infile.close()

    # sort and rename file
    # Only the temporary cluster files written above (prefix 'tmp') are considered.
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    # Largest cluster first; the rank becomes the final zero-padded file name.
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')

作者的追加显然可以解决该问题：将新建聚类簇的逻辑放到了单词for循环之外；对于已有聚类簇，匹配成功后也将该类加入到所有单词的bucket中。

一个内存友好的短文本聚类工具

一个内存友好的短文本聚类工具

介绍

代码解析

评价

项目地址

追加一