爬虫小项目之四发票批量录入

今天，心情不好，晚上什么都不想干，那就写个技术贴娱乐一下吧。爬虫NLP做知识图谱等下期更新，今天搞文字识别。前一段时间接了一个小任务，一个师妹拿着一堆发票在那边吐槽，不想干活了。录发票的确是一件让人抓狂的事。不过如果有OCR来辅助就不一样了。今天就用Python的requests模块向百度AI发送POST请求，批量完成发票录入。

在开始这个项目之前，请准备一个手机，把像素调低，然后给每一张发票拍一张照片，放到一个文件夹。每张照片的大小不要超过2M。

image

接下来，读取所有文件

# encoding:utf-8

import requests
import base64
import os
#递归读取文件夹中所有文件
def get_all(cwd,result):
    """Recursively collect every file under ``cwd`` into ``result``.

    Each entry appended to ``result`` is the file's path relative to ``cwd``,
    so ``os.path.join(cwd, entry)`` always points at the file.  Files that sit
    directly in ``cwd`` are therefore recorded by their bare name.

    Args:
        cwd: Directory to scan.
        result: List that receives the relative file paths (mutated in place).
    """
    def _collect(directory, prefix):
        for entry in os.listdir(directory):
            full_path = os.path.join(directory, entry)
            relative = os.path.join(prefix, entry) if prefix else entry
            if os.path.isdir(full_path):
                # Carry the accumulator and the relative prefix down the tree;
                # the original recursive call dropped the ``result`` argument.
                _collect(full_path, relative)
            else:
                result.append(relative)

    _collect(cwd, "")
#

定义读取函数。这里同样需要先在百度AI开放平台申请账号并获取access_token（下面代码中的access_token请替换成自己的），申请流程请自行百度，这里不再赘述。

# 二进制方式打开图片文件
#对文件进行读写并存入
def ReadInvoice(Inputpath,file):
    """Recognise one VAT invoice image and append its line items to ``file``.

    The image at ``Inputpath`` is base64-encoded and POSTed to the
    ``vat_invoice`` OCR endpoint.  One tab-separated line is written per
    commodity, with the columns: invoice number, invoice total, commodity
    name, unit, quantity, specification, amount, tax, seller name and seller
    address.  Rows are written only when every row of the invoice could be
    assembled, so a malformed result never leaves a partial invoice behind.

    Args:
        Inputpath: Path of the invoice image to upload.
        file: Writable text file object that receives the rows.

    Raises:
        KeyError: If the response has no ``words_result`` or lacks one of the
            expected fields.
        ValueError: If a ``row`` value in the response is not an integer.
    """
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice"
    # Read in binary mode; the context manager closes the handle right away.
    with open(Inputpath, 'rb') as f:
        img = base64.b64encode(f.read())

    params = {"image":img}
    # NOTE(review): hard-coded credential, kept byte-for-byte. The surrounding
    # square brackets look like a paste artefact -- confirm, and load the
    # token from configuration instead of source.
    access_token = '[24.bafecd7823e46b4888a1aadd65882468.2592000.1602842119.282335-18705450]'
    request_url = request_url + "?access_token=" + access_token
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=headers)
    payload = response.json()
    if response:
        print (payload)

    if 'words_result' not in payload:
        # Failed requests return an error payload without 'words_result';
        # surface it instead of failing with a bare key name.
        raise KeyError("no 'words_result' in OCR response for %s: %r"
                       % (Inputpath, payload))
    data=payload['words_result']

    # Invoice number
    InvoiceNum=data['InvoiceNum']
    # Invoice total
    InvoicePrice=data['AmountInFiguers']
    # Commodity name fragments
    CommodityName=data['CommodityName']
    # Unit
    CommodityUnit=data['CommodityUnit']
    # Quantity
    CommodityNum=data['CommodityNum']
    # Specification
    CommodityType=data['CommodityType']
    # Amount
    CommodityAmount=data['CommodityAmount']
    # Tax
    CommodityTax=data['CommodityTax']
    # Seller name
    SellerName=data['SellerName']
    # Seller address (written under the supplier-phone column of the output)
    SellerAddress=data['SellerAddress']

    # Each amount entry marks the row on which one commodity starts.
    # SECURITY: the original ran eval() on these values, which come from a
    # network response; int() parses the row number without executing it.
    start_rows=[int(entry['row']) for entry in CommodityAmount]
    if not start_rows:
        print("No commodity rows recognised in %s; nothing written" % Inputpath)
        return

    merged_names=_merge_commodity_names(CommodityName,start_rows)

    # Build every line before writing so a missing field cannot leave a
    # half-written invoice in the output file.
    try:
        lines=[
            '\t'.join([
                InvoiceNum,
                InvoicePrice,
                name,
                CommodityUnit[index]['word'],
                CommodityNum[index]['word'],
                CommodityType[index]['word'],
                CommodityAmount[index]['word'],
                CommodityTax[index]['word'],
                SellerName,
                SellerAddress,
            ])+'\n'
            for index,name in enumerate(merged_names)
        ]
    except (IndexError, KeyError, TypeError) as err:
        # Best effort: report the invoice that could not be assembled rather
        # than dropping it silently.
        print("Skipping %s: incomplete OCR result (%r)" % (Inputpath, err))
        return

    file.writelines(lines)
    file.flush()


def _merge_commodity_names(name_entries,start_rows):
    """Join the name fragments of each commodity into one string per commodity.

    A fragment belongs to the last commodity whose start row is not greater
    than the fragment's own row; fragments above the first start row are
    attached to the first commodity.  ``start_rows`` is assumed to be in
    ascending order -- TODO confirm against the service output.
    """
    if not start_rows:
        return []
    merged=['']*len(start_rows)
    for entry in name_entries:
        row=int(entry['row'])
        owner=0
        for position,start in enumerate(start_rows):
            if start<=row:
                owner=position
        merged[owner]+=entry['word']
    return merged

最后循环读取、解析文件。

# Batch driver: write the header row, then append the rows of every invoice
# image found under ``pth``.  The ``with`` block closes invoice.txt even if
# the scan fails.
with open("invoice.txt",'w') as file:
    # Every column is tab-separated; the original header had a literal "t"
    # between two columns, which shifted the remaining headings.
    file.write("\t".join([
        "发票号",
        "发票总额",
        "商品名称",
        "容量单位",
        "商品数量",
        "规格",
        "金额",
        "税额",
        "供货商全称",
        "供货商电话",
    ])+"\n")
    result=[]
    pth=r"C:\Users\Administrator\Desktop\发票录入\JPEG\JPEG"
    get_all(pth,result)
    for name in result:
        try:
            ReadInvoice(os.path.join(pth,name),file)
        except Exception as err:
            # Keep processing the remaining invoices, but report the failure
            # (and let Ctrl-C through) instead of swallowing everything.
            print("Failed to process %s: %r" % (name, err))

最终将图片中的文字读取到txt文档中

image

总结：道理很简单，批量读取图片地址信息；使用python发送post请求到百度AI服务器，调用服务器中的增值税发票识别服务；最后写入数据到txt文档。当然，有些发票不清晰或者商品名称比较奇葩的会有识别错误。识别的数据还是需要再核实的。如果手上有几千张发票，几个人干一天的活可以一个人几个小时干完，Ok，又有人要失业了~

爬虫小项目之四 发票批量录入

爬虫小项目之四发票批量录入