test.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. import json
  2. import pdfplumber
  3. import os
  4. class FapiaoShell():
  5. # 适用电子发票
  6. def __init__(self, path):
  7. self.do_load(path)
  8. def do_load(self, arg):
  9. """ 加载发票文件夹"""
  10. if not os.path.isdir(arg):
  11. return
  12. os.chdir(os.path.dirname(arg))
  13. pdfs = []
  14. for root, _, files in os.walk(arg):
  15. for fn in files:
  16. ext = os.path.splitext(fn)[1].lower()
  17. if ext != '.pdf':
  18. continue
  19. fpth = os.path.join(root, fn)
  20. fpth = os.path.relpath(fpth)
  21. pdfs.append(fpth)
  22. print(f'pdf文件: {pdfs}')
  23. pdf_ctxs = self._parse_pdfs(pdfs)
  24. # 导出json文件
  25. with open('test.json', 'w', encoding='utf-8') as F:
  26. json.dump(pdf_ctxs, F, ensure_ascii=False)
  27. print('完成!')
  28. def _parse_pdfs(self, pdfs):
  29. """ 分析 """
  30. result = []
  31. for fpth in pdfs:
  32. with pdfplumber.open(fpth) as pdf:
  33. page = pdf.pages[0]
  34. # 空格替换,转成list
  35. list = ''.join(page.extract_text()).replace(
  36. ' ', '').split('\n')
  37. result.append(list)
  38. return result
  39. if __name__ == '__main__':
  40. FapiaoShell('./pd')