import json
import os

import pdfplumber


class FapiaoShell:
    """Extract the text of electronic-invoice PDFs found in a folder.

    Intended for electronic invoices. Constructing an instance immediately
    processes the folder passed to the constructor (see ``do_load``).
    """

    def __init__(self, path):
        """Process the invoice folder at *path* right away."""
        self.do_load(path)

    def do_load(self, arg):
        """Load an invoice folder and dump the text of its PDFs to JSON.

        Walks *arg* recursively, collects every file whose extension is
        ``.pdf`` (case-insensitive), parses each one with ``_parse_pdfs`` and
        writes the result to ``test.json``.

        Side effects: changes the process working directory to the parent
        of *arg*; ``test.json`` is written there and the collected PDF paths
        are relative to it.

        Args:
            arg: Path of the folder to scan. If it is not an existing
                directory the method returns without doing anything.
        """
        if not os.path.isdir(arg):
            return

        # Resolve to an absolute path BEFORE changing directory. Otherwise a
        # relative ``arg`` would be re-interpreted against the new working
        # directory by os.walk (finding nothing), and a bare folder name
        # would make os.path.dirname return '' which os.chdir rejects.
        folder = os.path.abspath(arg)
        os.chdir(os.path.dirname(folder))

        pdfs = []
        for root, _, files in os.walk(folder):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                # Store paths relative to the (new) working directory.
                pdfs.append(os.path.relpath(os.path.join(root, fn)))
        print(f'pdf文件: {pdfs}')

        pdf_ctxs = self._parse_pdfs(pdfs)

        # Export the extracted text as a JSON file.
        with open('test.json', 'w', encoding='utf-8') as fh:
            json.dump(pdf_ctxs, fh, ensure_ascii=False)
        print('完成!')

    def _parse_pdfs(self, pdfs):
        """Extract the first-page text lines of each PDF.

        Args:
            pdfs: Iterable of PDF file paths.

        Returns:
            A list with one entry per input path, in input order. Each entry
            is the first page's text with all spaces removed, split on
            newlines. A PDF with no pages or no extractable text yields
            ``['']``.
        """
        result = []
        for fpth in pdfs:
            with pdfplumber.open(fpth) as pdf:
                # Guard against documents without pages, and against
                # extract_text() returning None instead of a string.
                text = pdf.pages[0].extract_text() if pdf.pages else None
            # Strip spaces, then convert the text into a list of lines.
            lines = (text or '').replace(' ', '').split('\n')
            result.append(lines)
        return result


if __name__ == '__main__':
    FapiaoShell('./pd')