python实现PDF文档间对比（百度文本识别接口）

一、原理
1、将PDF文档每页转换为图像
2、调用百度通用文本识别页面接口，对图像进行内容识别
3、对图像内容进行对比，并将对比不一致的内容在文档图像上进行标记（红框）
4、将对比结果以表格形式输出为html文件，便于查看差异

二、范围和限制
1、目前仅支持PDF文档之间的对比
2、无法识别图形（盖章和logo）、不清晰字迹
3、需要联网使用（OCR使用的是百度通用文本识别接口，仅限测试使用，暂不限次数）
4、对比存在误差（原因为百度OCR识别无法达到100%准确）

三、安装库
pip install pymupdf
pip install pillow
pip install requests

四、参数
originPDF: PDF文档原件路径
contrastPDF: PDF文档扫描件路径
resultRoot: 输出结果路径（提示：程序运行时会先清空该目录，请不要直接设置为桌面等存有其他文件的目录）
输出 : 标注差异的文档图像、Html文档

五、源码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = '孙思锴'

import os
import shutil
import fitz
import difflib
from datetime import datetime
import base64
from PIL import Image
from PIL import ImageDraw
import requests
from concurrent.futures import ThreadPoolExecutor

# One HTTP session reused for every OCR request.
# requests.Session() is used instead of requests.session(): the lowercase
# function is a legacy alias that requests keeps only for backwards compatibility.
# NOTE(review): getImageInfo uses this session from ThreadPoolExecutor worker
# threads -- confirm that sharing one Session across threads is acceptable.
session = requests.Session()
originDic = {}  # page number -> unmatched text of the original document (filled by imageDiff)
contrastDic = {}  # page number -> unmatched text of the scanned document (filled by imageDiff)
url = 'https://ai.baidu.com/aidemo'  # Baidu OCR demo endpoint that getImageInfo posts to
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36",
}


def initRoot(rootPath):
    """
    Reset a directory to an empty state.

    The path is made absolute; anything already present at that location is
    removed with ``shutil.rmtree`` and an empty directory is created in its place.

    :param rootPath: directory to (re)create; may be relative
    :return: the absolute path of the freshly created directory
    """
    target = os.path.abspath(rootPath)
    needs_wipe = os.path.exists(target)
    if needs_wipe:
        # Drop the previous contents so stale results never mix with new ones
        shutil.rmtree(target)
    os.makedirs(target)
    return target


def conver_img(pdfFilepath, outputPath):
    """
    Render every page of a PDF document to a PNG file.

    The images are written to ``<outputPath>/<PDF file name without extension>/``
    and named ``0.png``, ``1.png``, ... after the zero-based page index. If that
    directory already exists it is deleted and recreated first.

    :param pdfFilepath: path of the PDF document
    :param outputPath: directory under which the per-document image directory is created
    :return: (number of pages, path of the image directory)
    :raises SystemExit: with status 1 when ``pdfFilepath`` does not exist
    """
    pdfFilepath = os.path.abspath(pdfFilepath)
    if not os.path.exists(pdfFilepath):
        print('文件不存在：', pdfFilepath)
        # A missing input is a failure: terminate with a non-zero status.
        # Raising SystemExit directly avoids the site-provided ``exit`` helper.
        raise SystemExit(1)

    # The image directory is named after the PDF file, without its extension
    pdfNamePath, _extension = os.path.splitext(os.path.basename(pdfFilepath))
    ImagePath = os.path.join(outputPath, pdfNamePath)
    if os.path.exists(ImagePath):
        shutil.rmtree(ImagePath)  # discard images left over from an earlier run
    os.makedirs(ImagePath)

    # Scale factor 2 on both axes (four times the pixel count, per the original
    # author's note); the matrix is the same for every page, so build it once.
    trans = fitz.Matrix(2.0, 2.0)

    doc = fitz.open(pdfFilepath)
    try:
        pageCount = len(doc)
        for page_index in range(pageCount):
            # NOTE(review): getPixmap/writePNG are the names the original code
            # used; confirm they exist in the installed PyMuPDF release.
            pm = doc[page_index].getPixmap(matrix=trans, alpha=False)
            pm.writePNG(os.path.join(ImagePath, str(page_index) + '.png'))
    finally:
        # Release the document even when rendering a page fails
        doc.close()
    return pageCount, ImagePath


def getImageInfo(filename):
    """
    Send one image to the OCR endpoint and return the decoded JSON response.

    Steps:
    1. read the image file and encode it as a base64 ``data:image/png`` URI
    2. POST it to ``url`` through the shared ``session``
    3. retry (up to 5 attempts in total) while the response carries errno 102

    :param filename: path of the image file
    :return: the decoded JSON response of the last attempt. It is returned
        whatever its ``errno`` is, so callers must check ``errno`` before
        reading ``data``. Response shapes recorded by the original author:
        {'errno': 102, 'msg': '请求Demo过于频繁', 'data': ''}
        {'errno': 106, 'msg': '文件类型错误', 'data': ''}
        {'errno': 0, 'msg': 'success', 'data': {'log_id': '...', 'words_result_num': 30,
         'words_result': [{'location': {'width': 142, 'top': 87, 'left': 202, 'height': 41},
                           'words': '发银行'}, ...]}}
    """
    import time  # local import: only needed for the retry back-off below

    with open(filename, 'rb') as f:
        base64image = 'data:image/png;base64,' + base64.b64encode(f.read()).decode()
    dic = {
        "image": base64image,
        "image_url": "",
        "type": "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate",
        "detect_direction": "false",
    }

    result = None
    for attempt in range(5):
        if attempt:
            # errno 102 means "too many requests": wait a little longer before
            # each retry instead of hitting the endpoint again immediately
            time.sleep(0.5 * attempt)
        # The timeout keeps a stalled connection from blocking the worker forever
        result = session.post(url=url, headers=headers, data=dic, timeout=60).json()
        if result['errno'] != 102:
            break
    # When every attempt was rejected, the last errno-102 response is returned
    # rather than an implicit None.
    return result


def _ocrWordBlocks(ocrResult, filename):
    """Return the word-block list of an OCR response; raise if the OCR call did not succeed."""
    if not isinstance(ocrResult, dict) or ocrResult.get('errno') != 0:
        raise RuntimeError('OCR识别失败：%s，接口返回：%r' % (filename, ocrResult))
    return ocrResult['data']['words_result']


def _markWordBlocks(image, wordBlocks):
    """Outline every word block on ``image`` in red and return the blocks' text, one block per line."""
    draw = ImageDraw.ImageDraw(image)
    for block in wordBlocks:
        location = block['location']
        left, top = location['left'], location['top']
        right, bottom = left + location['width'], top + location['height']
        draw.rectangle(((left, top), (right, bottom)), outline='red', width=2)
    return ''.join(block['words'] + '\n' for block in wordBlocks)


def imageDiff(resultRoot, originFile, contrastFile, page=1):
    """
    Compare one page image of the original with the same page of the scan.

    Both images are sent to OCR. Each word block of the original is then
    searched for among the scan's blocks whose ``top`` coordinate differs by
    less than 40 pixels (only ``top`` is compared):

    * identical text: the block is removed from both lists;
    * the original's text is contained in the scan's text: the original block
      is removed and the first occurrence of its text is cut out of the scan block.

    Whatever remains in either list counts as a difference. It is outlined in
    red on the corresponding image and its text is stored in ``originDic[page]``
    or ``contrastDic[page]``. Finally the two annotated images are pasted side by
    side (original on the left) and saved as ``第<page>页文档.png`` in ``resultRoot``.

    :param resultRoot: output directory for the side-by-side image
    :param originFile: page image of the original document
    :param contrastFile: page image of the scanned document
    :param page: page number, used as dictionary key and in the output file name
    :return: None
    :raises RuntimeError: when either OCR call did not return ``errno == 0``
    """
    originResult = getImageInfo(filename=originFile)
    contrastResult = getImageInfo(filename=contrastFile)
    # Fail with a clear message instead of an opaque TypeError further down
    originBlocks = _ocrWordBlocks(originResult, originFile)
    contrastBlocks = _ocrWordBlocks(contrastResult, contrastFile)

    offset = 40  # vertical tolerance for position drift between the two images

    # Iterate over copies because matched blocks are removed from the live lists
    for origin_words in list(originBlocks):
        top = origin_words['location']['top']
        for contrast_words in list(contrastBlocks):
            result_top = contrast_words['location']['top']
            if abs(top - result_top) >= offset:
                continue  # not on the same line, cannot be the same block
            if origin_words['words'] == contrast_words['words']:
                contrastBlocks.remove(contrast_words)  # matched: drop the scan block
                originBlocks.remove(origin_words)  # matched: drop the original block
                break
            if origin_words['words'] in contrast_words['words']:
                # The scan block holds the original text plus something else:
                # keep only the surplus text in the scan block
                originBlocks.remove(origin_words)
                contrast_words['words'] = contrast_words['words'].replace(origin_words['words'], '', 1)
                break

    # The context managers close both image files once the merged image is saved
    with Image.open(originFile) as originImage, Image.open(contrastFile) as contrastImage:
        originDic[page] = _markWordBlocks(originImage, originBlocks)
        contrastDic[page] = _markWordBlocks(contrastImage, contrastBlocks)

        # Side-by-side canvas: widths add up, height is the taller of the two
        originWidth, originHeight = originImage.size
        contrastWidth, contrastHeight = contrastImage.size
        new_Image = Image.new('RGB', (originWidth + contrastWidth, max(originHeight, contrastHeight)),
                              "#000000")
        new_Image.paste(originImage, (0, 0))
        new_Image.paste(contrastImage, (originWidth, 0))
        new_Image.save(os.path.join(resultRoot, "第" + str(page) + '页文档.png'))


if __name__ == '__main__':
    # Script entry point: convert both PDFs to page images, compare them page
    # by page in a thread pool, then write the textual differences to HTML.
    startTime = datetime.now()
    # Files to compare
    originPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-扫描件.pdf'  # original document
    contrastPDF = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\测试文档-改字.pdf'  # scanned document
    resultRoot = r'E:\Workspace\PycharmProjects\Python学习\合同文档比对\测试文档\对比结果'  # output directory

    resultRoot = initRoot(resultRoot)  # wipes and recreates the output directory
    originImageNum, originImagePath = conver_img(originPDF, resultRoot)
    contrastImageNum, contrastImagePath = conver_img(contrastPDF, resultRoot)
    if originImageNum != contrastImageNum:
        print('文档页数不一致！请查看', resultRoot)
        # The comparison could not be performed: report failure to the caller
        raise SystemExit(1)
    resultRoot = os.path.join(resultRoot, '对比结果')  # sub-directory for the comparison output
    os.makedirs(resultRoot)

    # The with-block waits for all workers and shuts the pool down
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(imageDiff, resultRoot,
                            os.path.join(originImagePath, str(i) + '.png'),
                            os.path.join(contrastImagePath, str(i) + '.png'),
                            i + 1)
            for i in range(originImageNum)
        ]
    # result() re-raises any exception from a worker; without this a failed page
    # would only show up later as a KeyError on originDic/contrastDic
    for future in futures:
        future.result()

    # Write the per-page text differences to one HTML file.
    # NOTE(review): make_file returns a complete HTML document, so the file
    # contains one document per page -- confirm this is intended.
    diff = difflib.HtmlDiff()
    with open(os.path.join(resultRoot, '结果.html'), 'w', encoding="utf-8") as f:
        for i in range(originImageNum):
            make_content = diff.make_file(fromlines=originDic[i + 1].splitlines(),
                                          tolines=contrastDic[i + 1].splitlines(),
                                          fromdesc='原件第' + str(i + 1) + '页', todesc='扫描件第' + str(i + 1) + '页')
            f.write(make_content)

    session.close()  # release the HTTP session
    endTime = datetime.now()
    print('文档共', originImageNum, '页，执行总时间：', endTime - startTime)
    print('执行成功，请查看输出目录：', resultRoot)

六、执行结果示例：

标注差异的对比照片

Html文档表格

最后编辑于：2020.12.05 22:44:22

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 216,651评论 6赞 501
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 92,468评论 3赞 392
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 162,931评论 0赞 353
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 58,218评论 1赞 292
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 67,234评论 6赞 388
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 51,198评论 1赞 299
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 40,084评论 3赞 418
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 38,926评论 0赞 274
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 45,341评论 1赞 311
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 37,563评论 2赞 333
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 39,731评论 1赞 348
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 35,430评论 5赞 343
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 41,036评论 3赞 326
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 31,676评论 0赞 22
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,829评论 1赞 269
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 47,743评论 2赞 368
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 44,629评论 2赞 354

python实现PDF文档间对比（百度文本识别接口）

推荐阅读更多精彩内容