1 Star 0 Fork 9

阿荣/Invoice2Excel

forked from zoz/Invoice2Excel 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
Invoice2Excel.py 29.94 KB
一键复制 编辑 原始数据 按行查看 历史
zoz 提交于 2020-07-22 20:17 . update readme file and fix a bug
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729
"""
parse PDF invoice file and extract data to Excel
"""
import getopt
import os
import re
import sys
import pickle
from collections import defaultdict as Dict
from itertools import chain
import logging
import fitz
import pandas as pd
import pdfplumber as pb
# Configure the root logger at import time: only ERROR and above are recorded,
# and records go to a file named after this module ("<__name__>.log").
logging.basicConfig(level=logging.ERROR,
                    filename= __name__ + '.log',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    format='%(asctime)s - %(levelname)s - %(funcName)s - %(processName)s - %(threadName)s - %(message)s')
# Module-level logger used by the extraction code below.
logger = logging.getLogger(__name__ + '_logger')
# Authorship metadata.
__author__ = 'yczha'
__email__ = 'yooongchun@foxmail.com'
class Extractor(object):
    """Pull invoice fields out of the first page of a PDF file.

    ``extract`` is the public entry point.  It reads the words and ruling
    lines of the page, rebuilds the table grid from the lines, assigns every
    word to a grid cell (or to the "outside" group) and turns the cell texts
    into a ``pandas.Series`` of labelled values.

    Methods whose names start with ``_find_text``/``_find_outer``/
    ``_find_inner`` (and their small helpers) are not called by ``extract``;
    they are kept so that existing callers keep working.
    """

    def __init__(self, path):
        """Remember the path of the PDF to parse; nothing is opened here."""
        self.file = path

    def _check_file(self):
        """Return ``True`` for an existing ``.pdf`` path, else an error dict."""
        is_pdf = (isinstance(self.file, str)
                  and os.path.isfile(self.file)
                  # Accept any capitalisation of the suffix, not only .pdf/.PDF.
                  and self.file.lower().endswith('.pdf'))
        if not is_pdf:
            return {'error': 'not a valid pdf file.'}
        return True

    def _load_data(self):
        """Read words and ruling lines from the first page of ``self.file``.

        Returns:
            ``{'words': [...], 'words2': [...], 'lines': [...]}`` on success,
            where ``words`` come from PyMuPDF (y axis mirrored, see below),
            ``words2`` from pdfplumber, and ``lines`` from pdfplumber's
            ``page.lines``.  On any failure ``{'error': exception}`` is
            returned instead of raising.
        """
        doc = None
        pdf = None
        try:
            doc = fitz.open(self.file)
            # Newer PyMuPDF releases only provide the snake_case names, older
            # ones only the camelCase names; support both.
            page = doc.load_page(0) if hasattr(doc, 'load_page') else doc.loadPage(0)
            if hasattr(page, 'get_text'):
                raw_words = page.get_text('words')
            else:
                raw_words = page.getTextWords()
            words = [{'x0': int(round(w[0])), 'y0': int(round(w[1])),
                      'x1': int(round(w[2])), 'y1': int(round(w[3])),
                      'word': w[4]} for w in raw_words]
            words.sort(key=lambda v: (v['y0'], v['x0']))
            # Mirror the y axis around (largest y1 + 30) so that word
            # coordinates grow in the opposite direction.
            # NOTE(review): presumably this aligns them with the pdfplumber
            # line coordinates used for the grid; the constant 30 is taken
            # from the original code -- confirm.
            mirror = max(w['y1'] for w in words) + 30
            for word in words:
                word['y0'] = mirror - word['y0']
                word['y1'] = mirror - word['y1']

            pdf = pb.open(self.file)
            first_page = pdf.pages[0]
            words2 = [{'x0': int(round(w['x0'])), 'y0': int(round(w['top'])),
                       'x1': int(round(w['x1'])), 'y1': int(round(w['bottom'])),
                       'word': w['text']} for w in first_page.extract_words()]
            words2.sort(key=lambda v: (v['y0'], v['x0']))
            lines = [{'x0': round(line['x0']),
                      'y0': round(line['y0']),
                      'x1': round(line['x1']),
                      'y1': round(line['y1']),
                      'width': round(line['width']),
                      'height': round(line['height'])} for line in first_page.lines]
            lines.sort(key=lambda v: (v['y0'], v['x0']))
        except Exception as e:
            return {'error': e}
        finally:
            # Always release both document handles, even on failure.
            for handle in (pdf, doc):
                if handle is None:
                    continue
                try:
                    handle.close()
                except Exception as close_error:
                    logger.error(f'error closing {self.file}: {close_error}')
        return {'words': words, 'words2': words2, 'lines': lines}

    @staticmethod
    def _find_nearest_val(vals, val):
        """Return the element of ``vals`` closest to ``val`` (first on ties).

        Raises:
            ValueError: if ``vals`` is empty.
        """
        return min(vals, key=lambda v: abs(v - val))

    def _fill_line(self, lines):
        """Split ``lines`` into horizontal/vertical ones and close the grid.

        Horizontal lines are those with ``width > 0`` (the two shortest are
        discarded); vertical lines are those with ``height > 0``.  Every end
        point coordinate is snapped to the nearest coordinate found among the
        other end points, and the four sides of the bounding box are added.
        The dicts in ``lines`` are modified in place.

        Returns:
            ``(hlines, vlines)`` with the bounding-box sides as first and last
            element of each list.

        Raises:
            ValueError: if no usable line is left.
        """
        hlines = [line for line in lines if line['width'] > 0]
        # Drop the two shortest horizontal lines.
        hlines = sorted(hlines, key=lambda h: h['width'], reverse=True)[:-2]
        vlines = [line for line in lines if line['height'] > 0]
        segments = hlines + vlines
        if not segments:
            raise ValueError('no table lines found on page')

        # Stretch each segment to the nearest coordinate used by another end
        # point.  The candidate lists are a snapshot taken before snapping.
        ys = [seg['y0'] for seg in segments] + [seg['y1'] for seg in segments]
        xs = [seg['x0'] for seg in segments] + [seg['x1'] for seg in segments]
        for seg in segments:
            for k in ('x0', 'y0', 'x1', 'y1'):
                series = list(xs) if 'x' in k else list(ys)
                series.remove(seg[k])  # do not match the value against itself
                seg[k] = self._find_nearest_val(series, seg[k])

        # Corners of the outer frame.
        max_x = max(int(seg['x1']) for seg in segments)
        min_x = min(int(seg['x0']) for seg in segments)
        min_y = min(int(seg['y0']) for seg in segments)
        max_y = max(int(seg['y1']) for seg in segments)
        hlines.insert(0, {'x0': min_x, 'y0': min_y, 'x1': max_x, 'y1': min_y})
        hlines.append({'x0': min_x, 'y0': max_y, 'x1': max_x, 'y1': max_y})
        vlines.insert(0, {'x0': min_x, 'y0': min_y, 'x1': min_x, 'y1': max_y})
        vlines.append({'x0': max_x, 'y0': min_y, 'x1': max_x, 'y1': max_y})
        return hlines, vlines

    @staticmethod
    def _is_point_in_rect(point, rect):
        """Tell whether ``point`` lies inside ``rect`` (borders included).

        ``rect`` is read positionally: its first, second and third values are
        taken as the corners that bound x (first..second) and y
        (first..third), which matches the ``p0``..``p3`` dicts built by
        ``_find_rects``.
        """
        px, py = point
        first, second, third, _ = rect.values()
        return first[0] <= px <= second[0] and first[1] <= py <= third[1]

    @staticmethod
    def _find_cross_points(hlines, vlines):
        """Return ``(x, y)`` for every vertical/horizontal line crossing.

        A crossing is reported when the vertical line's ``x0`` lies within the
        horizontal line's x range and the horizontal line's ``y0`` lies within
        the vertical line's y range, each widened by one unit.
        """
        tolerance = 1
        points = []
        for vline in vlines:
            vx0, vy0, vy1 = vline['x0'], vline['y0'], vline['y1']
            for hline in hlines:
                hx0, hy0, hx1 = hline['x0'], hline['y0'], hline['x1']
                if (hx0 - tolerance) <= vx0 <= (hx1 + tolerance) \
                        and (vy0 - tolerance) <= hy0 <= (vy1 + tolerance):
                    points.append((int(vx0), int(hy0)))
        return points

    @staticmethod
    def _find_rects(cross_points):
        """Build grid cells from crossing points.

        For each pair of consecutive y values and each x value, the cell
        starts at that x and ends at the first larger x for which all four
        corners are crossing points.

        Returns:
            A list of ``{'p0', 'p1', 'p2', 'p3'}`` dicts holding the corners
            ``(left, y)``, ``(right, y)``, ``(right, next_y)`` and
            ``(left, next_y)``, ordered by y and then by x.
        """
        points = {(int(p[0]), int(p[1])) for p in cross_points}
        xs = sorted({x for x, _ in points})
        ys = sorted({y for _, y in points})
        rects = []
        for y, next_y in zip(ys, ys[1:]):
            for pos, left in enumerate(xs[:-1]):
                if (left, y) not in points or (left, next_y) not in points:
                    continue  # no left edge here, so no cell can start
                for right in xs[pos + 1:]:
                    if (right, y) in points and (right, next_y) in points:
                        rects.append({'p0': (left, y),
                                      'p1': (right, y),
                                      'p2': (right, next_y),
                                      'p3': (left, next_y)})
                        break
        return rects

    def _name_rects(self, rects):
        """Name cells ``r1``, ``r2``, ... by descending ``p0`` y, then by x."""
        ordered = sorted(rects, key=lambda r: (-r['p0'][1], r['p0'][0]))
        return {f'r{idx+1}': rect for idx, rect in enumerate(ordered)}

    def _put_words_into_rect(self, words, rects):
        """Assign each word to the first cell containing its centre.

        Returns:
            ``{'IN': {cell_name: [word, ...]}, 'OUT': {x0: [word, ...]}}``.
            Every cell name appears in ``'IN'`` even when it has no words;
            words outside all cells are grouped by their ``x0``.
        """
        groups = {'IN': Dict(list), 'OUT': Dict(list)}
        for name in rects:
            groups['IN'][name] = []
        for word in words:
            centre = ((word['x0'] + word['x1']) // 2, (word['y0'] + word['y1']) // 2)
            for name, rect in rects.items():
                if self._is_point_in_rect(centre, rect):
                    groups['IN'][name].append(word)
                    break
            else:
                groups['OUT'][word['x0']].append(word)
        return groups

    @staticmethod
    def _find_text_by_same_line(group, delta=1):
        """Concatenate word texts whose ``bottom`` values are within ``delta``.

        Not used by ``extract``.  Words are expected to carry ``x0``,
        ``bottom`` and ``text`` keys.

        Returns:
            ``{bottom: text}`` with texts joined in ascending ``x0`` order.
        """
        merged = {}
        for w in sorted(group, key=lambda item: item['x0']):
            bottom = int(w['bottom'])
            text = w['text']
            below = [bottom - i for i in range(delta)]
            above = [bottom + i for i in range(delta)]
            for candidate in set(below + above):
                if candidate in merged:
                    merged[candidate] = merged[candidate] + text
                    break
            else:
                merged[bottom] = merged.get(bottom, '') + text
        return merged

    def _split_words_into_diff_line(self, groups):
        """Apply ``_find_text_by_same_line`` (delta 3) to every group.

        Not used by ``extract``.
        """
        return {k: self._find_text_by_same_line(g, 3) for k, g in groups.items()}

    @staticmethod
    def _index_of_y(x, rects):
        """Return the index following the entry whose ``r[2][0][0]`` equals ``x``.

        Returns ``None`` when no entry matches or the match is the last one.
        Not used by ``extract``.
        """
        for index, r in enumerate(rects):
            if x == r[2][0][0]:
                return index + 1 if index + 1 < len(rects) else None
        return None

    @staticmethod
    def _find_outer(words):
        """Parse header/footer texts into a one-row ``DataFrame``.

        Not used by ``extract``.  ``words`` maps a position to a line of text.
        """
        df = pd.DataFrame()
        for text in words.values():
            if re.search(r'发票$', text):  # invoice title
                df.loc[0, '发票名称'] = text
            elif re.search(r'发票代码', text):  # invoice code
                df.loc[0, '发票代码'] = ''.join(re.findall(r'[0-9]+', text))
            elif re.search(r'发票号码', text):  # invoice number
                df.loc[0, '发票号码'] = ''.join(re.findall(r'[0-9]+', text))
            elif re.search(r'开票日期', text):  # issue date
                df.loc[0, '开票日期'] = ''.join(re.findall(
                    r'[0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日', text))
            elif '机器编号' in text and '校验码' in text:
                # Both labels on one line: pick the digits that follow each.
                for label in ('校验码', '机器编号'):
                    found = re.search(label + r'[::]\d+', text)
                    if found:  # a missing match used to raise TypeError
                        df.loc[0, label] = ''.join(re.findall(r'[0-9]+', found[0]))
            elif '机器编号' in text:
                df.loc[0, '机器编号'] = ''.join(re.findall(r'[0-9]+', text))
            elif '校验码' in text:
                df.loc[0, '校验码'] = ''.join(re.findall(r'[0-9]+', text))
            elif re.search(r'收款人', text):
                items = re.split(r'收款人[::]|复核[::]|开票人[::]|销售方[::]', text)
                items = [item for item in items if re.sub(r'\s+', '', item) != '']
                items = (items + [''] * 4)[:4]  # missing fields become ''
                df.loc[0, '收款人'] = items[0]
                df.loc[0, '复核'] = items[1]
                df.loc[0, '开票人'] = items[2]
                df.loc[0, '销售方'] = items[3]
        return df

    @staticmethod
    def _find_and_sort_rect_in_same_line(y, groups):
        """Return the keys of ``groups`` with ``k[1] == y``, sorted by ``k[2][0][0]``.

        Not used by ``extract``.
        """
        same_rects_k = [k for k in groups if k[1] == y]
        return sorted(same_rects_k, key=lambda x: x[2][0][0])

    def _right_neighbour_key(self, k, groups):
        """Return the key that follows ``k`` on its row, or ``None``."""
        same_rects_k = self._find_and_sort_rect_in_same_line(k[1], groups)
        target_index = self._index_of_y(k[2][0][0], same_rects_k)
        if target_index is None:
            return None
        return same_rects_k[target_index]

    def _find_inner(self, k, words, groups, groups2, free_zone_flag=False):
        """Parse one labelled cell using the text of the cell next to it.

        Not used by ``extract``.
        NOTE(review): keys are indexed as ``k[1]`` and ``k[2][0][0]``; their
        exact structure is not defined in this file -- confirm with callers.
        The nesting of the final branch was reconstructed from a listing
        without indentation -- confirm against the upstream source.

        Returns:
            ``(DataFrame, free_zone_flag)``; the frame is empty when nothing
            could be parsed (including when there is no neighbouring cell).
        """
        df = pd.DataFrame()
        ordered = sorted(words.items(), key=lambda item: item[0])
        context = ''.join(text for _, text in ordered)
        if '购买方' in context or '销售方' in context:
            target_k = self._right_neighbour_key(k, groups)
            if target_k is not None:
                prefix = '购买方' if '购买方' in context else '销售方'
                for text in groups2[target_k].values():
                    if '名称' in text:
                        df.loc[0, prefix + '名称'] = re.sub(r'名称[::]', '', text)
                    elif '纳税人识别号' in text:
                        df.loc[0, prefix + '纳税人识别号'] = re.sub(r'纳税人识别号[::]', '', text)
                    elif '地址、电话' in text:
                        df.loc[0, prefix + '地址电话'] = re.sub(r'地址、电话[::]', '', text)
                    elif '开户行及账号' in text:
                        df.loc[0, prefix + '开户行及账号'] = re.sub(r'开户行及账号[::]', '', text)
        elif '密码区' in context:
            target_k = self._right_neighbour_key(k, groups)
            if target_k is not None:
                df.loc[0, '密码区'] = ''.join(groups2[target_k].values())
        elif '价税合计' in context:
            target_k = self._right_neighbour_key(k, groups)
            if target_k is not None:
                group_context = ''.join(groups2[target_k].values())
                items = re.split(r'[((]小写[))]', group_context)
                df.loc[0, '价税合计(大写)'] = items[0]
                df.loc[0, '价税合计(小写)'] = items[1] if len(items) > 1 else ''
        elif '备注' in context:
            target_k = self._right_neighbour_key(k, groups)
            if target_k is not None:
                df.loc[0, '备注'] = ''.join(groups2[target_k].values())
            else:
                df.loc[0, '备注'] = ''
        elif not free_zone_flag:
            same_rects_k = self._find_and_sort_rect_in_same_line(k[1], groups)
            if len(same_rects_k) == 8:
                free_zone_flag = True
                for kk in same_rects_k:
                    cell = groups2[kk] or {}
                    texts = [text for _, text in sorted(cell.items(), key=lambda item: item[0])]
                    # First text is the column header, the rest are its values.
                    if texts and texts[0]:
                        df.loc[0, texts[0]] = '\n'.join(texts[1:])
        return df, free_zone_flag

    def _cell_lines(self, words):
        """Return the text of each visual line of ``words``.

        Words are grouped with ``_merge_words_by_line`` and joined without a
        separator in ascending ``x0`` order; lines keep the grouping order.
        """
        texts = []
        for line_words in self._merge_words_by_line(words).values():
            ordered = sorted(line_words, key=lambda w: w['x0'])
            texts.append(''.join(str(w['word']) for w in ordered))
        return texts

    def _search_inner(self, inner_groups):
        """Turn the words of the named grid cells into labelled values.

        Only the cells ``r2, r4, r5, r6, r7, r8, r9, r11, r10, r12, r14, r16,
        r18`` are read, in that order.  A failure in one cell is logged and
        does not stop the others.

        Returns:
            ``pandas.Series`` (object dtype) indexed by field label.
        """
        s = pd.Series(dtype=object)

        def party(name, suffix):
            # "label:value" lines of the buyer / seller cell.
            for text in self._cell_lines(inner_groups[name]):
                # Split on the first colon only so a value containing a colon
                # is kept whole.
                parts = re.split(r'[::]', text, maxsplit=1)
                s[parts[0] + suffix] = parts[1] if len(parts) > 1 else 0

        def password(name):
            s['密码区'] = ''.join(self._cell_lines(inner_groups[name]))

        def column(name, has_total_row, total_key=None):
            # First line is the column header; with has_total_row the last
            # line is not part of the values.
            texts = self._cell_lines(inner_groups[name])
            if len(texts) < (2 if has_total_row else 1):
                logger.error(f'not enough val in {name}: {texts}')
                return
            body = texts[1:-1] if has_total_row else texts[1:]
            s[texts[0]] = '\n'.join(str(v) for v in body)
            if total_key is not None:
                s[total_key] = texts[-1]

        def amount_due(name):
            for text in self._cell_lines(inner_groups[name]):
                parts = re.split(r'[((]小写[))]', text)
                s['价税合计(大写)'] = parts[0]
                s['价税合计(小写)'] = parts[1] if len(parts) >= 2 else ''

        def remark(name):
            # Keep every line of the remark instead of only the last one.
            texts = self._cell_lines(inner_groups[name])
            if texts:
                s['备注'] = '\n'.join(texts)

        steps = [
            ('r2', lambda name: party(name, '(购买方)')),
            ('r4', password),
            ('r5', lambda name: column(name, True)),
            ('r6', lambda name: column(name, False)),
            ('r7', lambda name: column(name, False)),
            ('r8', lambda name: column(name, False)),
            ('r9', lambda name: column(name, False)),
            ('r11', lambda name: column(name, False)),
            ('r10', lambda name: column(name, True, '合计(金额)')),
            ('r12', lambda name: column(name, True, '合计(税额)')),
            ('r14', amount_due),
            ('r16', lambda name: party(name, '(销售方)')),
            ('r18', remark),
        ]
        for name, handler in steps:
            if name not in inner_groups:
                continue
            try:
                handler(name)
            except Exception as e:
                logger.error(f'error in {name}: {e}')
        return s

    def _search_outer(self, outer_groups):
        """Parse the words outside the grid (title, codes, date, signers).

        Returns:
            ``pandas.Series`` (object dtype) indexed by field label.
        """
        s = pd.Series(dtype=object)
        words = [word for gwords in outer_groups.values() for word in gwords]
        for text in self._cell_lines(words):
            title = re.search(r'[\u4e00-\u9fa5]{3,20}发票', text)  # invoice title
            if title:
                s['发票名称'] = title.group(0)
            for key in ['发票代码', '发票号码', '校验码', '机器编号']:
                if key in text:
                    # Digits that follow the label, with optional separators.
                    found = re.search(key + r'[::\s]*(\d+)', text)
                    s[key] = found.group(1) if found else ''
            if re.search(r'开票日期', text):  # issue date
                s['开票日期'] = ''.join(re.findall(r'\d{4}年\d{1,2}月\d{1,2}日', text))
            if re.search(r'收款人', text):
                items = re.split(r'收款人[::]|复核[::]|开票人[::]|销售方[::]', text)
                items = [item for item in items if re.sub(r'\s+', '', item) != '']
                items = (items + [''] * 4)[:4]  # missing fields become ''
                s['收款人'] = items[0]
                s['复核'] = items[1]
                s['开票人'] = items[2]
                s['销售方'] = items[3]
        return s

    @staticmethod
    def _merge_words_by_line(words, delta=2):
        """Group words whose vertical centres are within ``delta`` of a row.

        Each word joins the nearest existing row within ``delta`` (the
        earliest created row on ties); otherwise it starts a new row keyed by
        its rounded vertical centre.

        Returns:
            ``defaultdict(list)`` mapping row key to its words, in creation
            order.
        """
        words_in_line = Dict(list)
        for word in words:
            row_num = round((word['y0'] + word['y1']) / 2)
            near = [row for row in words_in_line if abs(row - row_num) <= delta]
            if near:
                row_num = min(near, key=lambda row: abs(row - row_num))
            words_in_line[row_num].append(word)
        return words_in_line

    def extract(self):
        """Parse the invoice and return its fields.

        Returns:
            ``pandas.Series`` of labelled values on success, or a dict with an
            ``'error'`` entry when the file is not a PDF, cannot be read, or
            fewer than 18 grid cells are found.

        Raises:
            ValueError: from ``_fill_line`` when the page has no usable lines.
        """
        checked = self._check_file()
        if checked is not True:
            return checked
        data = self._load_data()
        if 'error' in data:
            return data
        words = data['words']
        hlines, vlines = self._fill_line(data['lines'])
        cross_points = self._find_cross_points(hlines, vlines)
        rects = self._find_rects(cross_points)
        if len(rects) < 18 and os.path.isfile('rects.pickle'):
            # Fall back to a cell layout stored in the working directory.
            # NOTE(security): unpickling runs arbitrary code; 'rects.pickle'
            # must come from a trusted source.
            with open('rects.pickle', 'rb') as f:
                rects = pickle.load(f)
        if len(rects) < 18:
            return {'error': 'can\'t get rects.'}
        named_rects = self._name_rects(rects)
        words_groups = self._put_words_into_rect(words, named_rects)
        inner = self._search_inner(words_groups['IN'])
        outer = self._search_outer(words_groups['OUT'])
        return pd.concat([inner, outer])
def load_files(directory):
    """Collect the PDF files below ``directory``, grouped by folder name.

    Args:
        directory: root folder to walk recursively.

    Returns:
        A mapping ``{folder name: [pdf path, ...]}`` where the folder name is
        the last component of the folder containing the files.  The mapping
        is empty when ``directory`` is not an existing directory, so callers
        can always iterate ``.items()``.
    """
    path_in_folder = Dict(list)
    if not os.path.isdir(directory):
        return path_in_folder
    for root, _, files in os.walk(directory):
        # normpath drops a trailing separator, which would otherwise give an
        # empty folder name for a root such as "example/".
        folder_name = os.path.basename(os.path.normpath(root))
        for file_ in sorted(files):  # deterministic order
            path = os.path.join(root, file_)
            if os.path.isfile(path) and file_.lower().endswith('.pdf'):
                path_in_folder[folder_name].append(path)
    return path_in_folder
def test():
    """Debug helper: print and plot the parsing stages for 'example/test.pdf'.

    Prints the words found in each grid cell and outside the grid, then shows
    three figures: the detected lines, the named cells, and the word centres.
    Returns an error dict when the file cannot be read or fewer than 18 cells
    are found; otherwise returns ``None`` after the figures are closed.
    """
    import cv2
    import numpy as np
    import matplotlib.pyplot as plt

    extractor = Extractor('example/test.pdf')
    data = extractor._load_data()
    if 'error' in data:
        return data
    page_words = data['words']
    hlines, vlines = extractor._fill_line(data['lines'])
    rects = extractor._find_rects(extractor._find_cross_points(hlines, vlines))
    if len(rects) < 18 and os.path.isfile('rects.pickle'):
        with open('rects.pickle', 'rb') as f:
            rects = pickle.load(f)
    if len(rects) < 18:
        return {'error': 'can\'t get rects.'}
    named_rects = extractor._name_rects(rects)
    words_groups = extractor._put_words_into_rect(page_words, named_rects)

    # Text dump: cells in numeric order, then the outside groups by key.
    inside = sorted(words_groups['IN'].items(), key=lambda x: int(x[0].replace('r', '')))
    outside = sorted(words_groups['OUT'].items(), key=lambda x: x[0])
    for name, members in inside:
        print(name, ': ', ' '.join([str(w['word']) for w in members]))
    for name, members in outside:
        print(name, ': ', ' '.join([str(w['word']) for w in members]))

    # Canvas size derived from the line extents plus a margin.
    delta = 40
    width = max(int(line['x1']) for line in hlines) + min(int(line['x0']) for line in hlines) + delta
    height = max(int(line['y1']) for line in vlines) + min(int(line['y0']) for line in vlines) + delta

    def rect_corners(rect):
        # Opposite corners of a cell with the y axis flipped for drawing.
        return (rect['p0'][0], height - rect['p0'][1]), (rect['p2'][0], height - rect['p2'][1])

    def word_centre(word):
        return (round((word['x0'] + word['x1']) / 2),
                height - round((word['y0'] + word['y1']) / 2))

    # Figure 1: horizontal lines in one colour, vertical lines in another.
    mat1 = np.zeros((height, width, 3))
    for segments, colour in ((hlines, (0, 255, 0)), (vlines, (0, 0, 255))):
        for line in segments:
            start = (line['x0'], height - line['y0'])
            end = (line['x1'], height - line['y1'])
            cv2.line(mat1, start, end, colour, 2)
    plt.figure()
    plt.title('hlines+vlines')
    plt.imshow(mat1)

    # Figure 2: every cell with its name at the centre.
    mat2 = np.zeros((height, width, 3))
    for idx, r in named_rects.items():
        (x0, y0), (x1, y1) = rect_corners(r)
        cv2.rectangle(mat2, (x0, y0), (x1, y1), (0, 255, 0), 2)
        cv2.putText(mat2, idx, ((x0+x1)//2, (y0+y1)//2), cv2.FONT_HERSHEY_TRIPLEX, 0.6, (0, 255, 0), 2)
    plt.figure()
    plt.title('rects')
    plt.imshow(mat2)

    # Figure 3: cells plus the centre of every word, inside or outside.
    mat3 = np.zeros((height, width, 3))
    for name, members in words_groups['IN'].items():
        top_left, bottom_right = rect_corners(named_rects[name])
        cv2.rectangle(mat3, top_left, bottom_right, (0, 255, 0), 2)
        for word in members:
            cv2.circle(mat3, word_centre(word), 2, (255, 0, 0), 2)
    for members in words_groups['OUT'].values():
        for word in members:
            cv2.circle(mat3, word_centre(word), 2, (255, 0, 0), 2)
    plt.figure()
    plt.title('words in rects')
    plt.imshow(mat3)
    plt.show()
def main():
    """Command-line entry point: parse a folder tree of invoices into Excel.

    Options:
        -p, --path DIR   folder to scan for PDF files (default 'example')
        -s, --save FILE  workbook to write (default 'result.xlsx')
        -t, --test       scan the bundled 'example' folder
        --debug          run the plotting helper ``test()`` and exit

    One sheet is written per folder, with one row per successfully parsed
    file.  Files that fail are reported on stdout and skipped.
    """
    in_path = 'example'
    out_path = 'result.xlsx'
    debug = False
    # parse params
    try:
        opts, _ = getopt.getopt(sys.argv[1:], 'p:ts:', ['test', 'path=', 'save=', 'debug'])
    except getopt.GetoptError as err:
        print(f'invalid arguments: {err}')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ['-p', '--path']:
            in_path = arg
        elif opt in ['--test', '-t']:
            in_path = 'example'
        elif opt in ['--save', '-s']:
            out_path = arg
        elif opt == '--debug':
            debug = True
    if debug:
        test()
        sys.exit(0)

    # run programme
    print(
        f'run {"test" if in_path == "example" else "extracting"} mode, load data from directory {in_path}.\n{"*" * 50}')
    # `or {}` keeps this working if load_files returns an empty list.
    files_path = load_files(in_path) or {}
    total = sum(len(paths) for paths in files_path.values())
    print(f'total {len(files_path)} folders, {total} file(s) to parse.\n{"*" * 50}')
    index = 0
    frames = {}
    for folder_name, paths in files_path.items():
        rows = []
        for file_path in paths:
            index += 1
            progress = round(index / total * 100, 4)
            print(f'{"="*int(progress)}>{index}/{total}({progress}%) {os.path.basename(file_path)}', end='\r')
            try:
                s = Extractor(file_path).extract()
                if isinstance(s, dict):
                    # extract() reports failures as {'error': ...}.
                    raise ValueError(s.get('error', s))
                s.name = os.path.basename(file_path)
                rows.append(s)
            except Exception as e:
                print('file error:', file_path, '\n', e)
        # DataFrame.append was removed in pandas 2.0; build the frame from the
        # collected rows instead (row labels come from each Series name).
        frames[folder_name] = pd.DataFrame(rows)
    print(end='\n')
    if not frames:
        # A workbook needs at least one sheet; nothing to write.
        print(f'{"*" * 50}\nno pdf file found in {in_path}, nothing saved.')
        return
    print(f'{"*" * 50}\nfinish parsing, save data to {out_path}')
    if os.path.isfile(out_path):
        os.remove(out_path)
    with pd.ExcelWriter(out_path) as writer:
        for name, df in frames.items():
            df.to_excel(writer, sheet_name=name)
    print(f'{"*" * 50}\nALL DONE. THANK YOU FOR USING MY PROGRAMME. GOODBYE!\n{"*" * 50}')


if __name__ == '__main__':
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/aisino968897/Invoice2Excel.git
git@gitee.com:aisino968897/Invoice2Excel.git
aisino968897
Invoice2Excel
Invoice2Excel
master

搜索帮助