| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- import json
- import pdfplumber
- import os
class FapiaoShell:
    """Collect the text of invoice PDFs found under a folder.

    Intended for electronic invoices. Constructing an instance immediately
    scans the given folder (see :meth:`do_load`).
    """

    def __init__(self, path):
        """Scan *path* right away by delegating to :meth:`do_load`."""
        self.do_load(path)

    def do_load(self, arg):
        """Load the invoice folder and export the extracted text as JSON.

        Walks *arg* recursively, collects every file whose extension is
        ``.pdf`` (case-insensitive), extracts the text of each one via
        :meth:`_parse_pdfs` and writes the result to ``test.json``.

        Args:
            arg: Path of the folder to scan. If it is not an existing
                directory the method returns silently without doing anything.

        Side effects:
            * Changes the process working directory to the parent of *arg*.
            * Writes ``test.json`` (UTF-8) into that parent directory.
            * Prints the list of PDF paths and a completion message.
        """
        if not os.path.isdir(arg):
            return
        # Resolve to an absolute, normalized path BEFORE changing directory.
        # os.path.dirname() of a bare name ('pd') is '' (which os.chdir
        # rejects) and of a trailing-slash path ('pd/') is the folder itself,
        # which would make a relative walk look in the wrong place.
        folder = os.path.abspath(arg)
        os.chdir(os.path.dirname(folder))
        pdfs = []
        for root, _, files in os.walk(folder):
            for fn in files:
                if os.path.splitext(fn)[1].lower() != '.pdf':
                    continue
                # Store paths relative to the new working directory.
                pdfs.append(os.path.relpath(os.path.join(root, fn)))
        print(f'pdf文件: {pdfs}')
        pdf_ctxs = self._parse_pdfs(pdfs)
        # Export to a JSON file; ensure_ascii=False keeps non-ASCII text
        # readable instead of \u-escaped.
        with open('test.json', 'w', encoding='utf-8') as out:
            json.dump(pdf_ctxs, out, ensure_ascii=False)
        print('完成!')

    def _parse_pdfs(self, pdfs):
        """Extract the first-page text of each PDF as a list of lines.

        Args:
            pdfs: Iterable of PDF file paths.

        Returns:
            A list with one entry per input path, in the same order. Each
            entry is the list of text lines of the PDF's first page with all
            space characters removed. A PDF without pages, or whose first
            page yields no text, contributes ``['']``.
        """
        result = []
        for fpth in pdfs:
            with pdfplumber.open(fpth) as pdf:
                # Guard against page-less PDFs and against extract_text()
                # returning None for a page without a text layer.
                text = pdf.pages[0].extract_text() if pdf.pages else None
            # Remove spaces, then split into a list of lines.
            lines = (text or '').replace(' ', '').split('\n')
            result.append(lines)
        return result
if __name__ == '__main__':
    # Script entry point: scan the ./pd folder (relative to the current
    # working directory); constructing the shell performs the whole run.
    FapiaoShell(
        './pd'
    )
|