只支持 docx 格式。根据内容过滤指定表格时需要加上行数条件,例如 len(table.rows) > 1;否则遇到只有单行的表格时,table.cell(1, 0) 可能会触发 list index out of range 异常。
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import docx
from docx import Document
import xlwt;
import xlrd;
import glob
def readdoc(filename, marker="输入项名称"):
    """Extract the matching tables of a .docx file as nested lists.

    A table is kept only when it has more than one row and the stripped
    text of its cell at row 1, column 0 equals ``marker``.

    Args:
        filename: Path of the .docx document to open.
        marker: Text that cell (1, 0) must contain for a table to be kept.
            Defaults to the value that used to be hard-coded.

    Returns:
        A new list with one entry per kept table. Each entry is a list
        whose first element is ``filename`` and whose remaining elements
        are the table's rows, each a list of cell texts.
    """
    doc = Document(filename)
    total = 0
    # Build the result locally instead of appending to a module-level list,
    # so every call returns only the tables of the file it was given.
    parsed = []
    for table in doc.tables:
        total += 1
        # Check the row count before touching cell (1, 0): on a single-row
        # table that lookup can fail with "list index out of range".
        if len(table.rows) <= 1:
            continue
        if table.cell(1, 0).text.strip() != marker:
            continue
        table_temp = [filename]
        table_temp.extend(
            [cell.text for cell in row.cells] for row in table.rows
        )
        parsed.append(table_temp)
    print("File name:" + filename + " tables:" + repr(total)
          + " parsed:" + repr(len(parsed)))
    return parsed
def writeExcel(tables, filename):
    """Write parsed tables to a single-sheet .xls workbook.

    Args:
        tables: Iterable of tables. The first element of each table is
            written as a bold header in column 0; every following element
            is a row whose items are written to consecutive columns.
        filename: Path of the source document. Its last five characters
            (the ".docx" suffix) are replaced by ".xls" to form the
            output path.
    """
    header_style = xlwt.easyxf('font: bold 1')
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('sheet0', cell_overwrite_ok=True)

    # Output rows are numbered continuously across all tables. The counter
    # is incremented before each write, so sheet row 0 stays empty.
    out_row = 0
    for table in tables:
        for position, row in enumerate(table):
            out_row += 1
            if position == 0:
                # First element of a table is its header (the file name).
                worksheet.write(out_row, 0, row, header_style)
                continue
            # Enumerate instead of row.index(cell): index() returns the
            # first match, which puts repeated cell texts (e.g. several
            # empty cells) into the same column.
            for col, cell in enumerate(row):
                worksheet.write(out_row, col, cell)
    workbook.save(filename[:-5] + ".xls")
# Module-level list of parsed tables; it is rebound to readdoc()'s return
# value for every document processed below.
tables = []

# Lazily find every .docx file under the current directory, recursing into
# sub-directories, and export each document's tables to an .xls workbook
# stored next to it.
filenames = glob.iglob("**/*.docx", recursive=True)
for filename in filenames:
    tables = readdoc(filename)
    writeExcel(tables, filename)