Browse Source

python 识别电子发票
Signed-off-by: Caner

Caner 4 years ago
parent
commit
5ca0a45017
4 changed files with 108 additions and 0 deletions
  1. BIN
      pd/1.pdf
  2. BIN
      pd/2.pdf
  3. 60 0
      test.json
  4. 48 0
      test.py

BIN
pd/1.pdf


BIN
pd/2.pdf


+ 60 - 0
test.json

@@ -0,0 +1,60 @@
+[
+  [
+    "浙江增值税电子普通发票发票代码:033002100211",
+    "国统一发票监发票号码:16764020",
+    "全制",
+    "国家税务总局章开票日期:2021年09月08日",
+    "机器编号:661807633624浙江省税务局校验码:63192048983001503449",
+    "678*8028*69165/0><75*+-4791",
+    "名称:成都金隧自动化工程有限责任公司",
+    "购密",
+    "纳税人识别号:91510100720345007J</05972<71+22*++719>117*86<",
+    "买码",
+    "0+42<986*5->291*6/46/*451/>",
+    "地址、电话:",
+    "方区",
+    "2345/>259>5<71666*-+7/<98-3",
+    "开户行及账号:",
+    "货物或应税劳务、服务名称规格型号单位数量单价金额税率税额",
+    "*配电控制设备*防水盒只258.6980198117.401%1.17",
+    "合计117.401.17",
+    "价税合计(大写)壹佰壹拾捌圆伍角柒分(小写)118.57",
+    "名称:乐清市库勒电气有限公司",
+    "销备",
+    "纳税人识别号:91330382MA2CNRKJ3H",
+    "售",
+    "地址、电话:浙江省温州市乐清市柳市镇朝阳村0577-61715883",
+    "方注",
+    "开户行及账号:中国建设银行股份有限公司乐清柳市支行33050162756400001127",
+    "收款人:张明乐复核:林明云开票人:张苗销售方:(章)"
+  ],
+  [
+    "浙江增值税电子普通发票发票代码:033002100211",
+    "国统一发票监发票号码:16764020",
+    "全制",
+    "国家税务总局章开票日期:2021年09月08日",
+    "机器编号:661807633624浙江省税务局校验码:63192048983001503449",
+    "678*8028*69165/0><75*+-4791",
+    "名称:成都金隧自动化工程有限责任公司",
+    "购密",
+    "纳税人识别号:91510100720345007J</05972<71+22*++719>117*86<",
+    "买码",
+    "0+42<986*5->291*6/46/*451/>",
+    "地址、电话:",
+    "方区",
+    "2345/>259>5<71666*-+7/<98-3",
+    "开户行及账号:",
+    "货物或应税劳务、服务名称规格型号单位数量单价金额税率税额",
+    "*配电控制设备*防水盒只258.6980198117.401%1.17",
+    "合计117.401.17",
+    "价税合计(大写)壹佰壹拾捌圆伍角柒分(小写)118.57",
+    "名称:乐清市库勒电气有限公司",
+    "销备",
+    "纳税人识别号:91330382MA2CNRKJ3H",
+    "售",
+    "地址、电话:浙江省温州市乐清市柳市镇朝阳村0577-61715883",
+    "方注",
+    "开户行及账号:中国建设银行股份有限公司乐清柳市支行33050162756400001127",
+    "收款人:张明乐复核:林明云开票人:张苗销售方:(章)"
+  ]
+]

+ 48 - 0
test.py

@@ -0,0 +1,48 @@
+import json
+import pdfplumber
+import os
+
+
+class FapiaoShell:
+    """Dump the text of electronic-invoice PDFs found in a folder to JSON.
+
+    Intended for electronic invoices. Constructing an instance immediately
+    runs :meth:`do_load` on the given path.
+    """
+
+    def __init__(self, path):
+        """Process the invoice folder *path* right away (see do_load)."""
+        self.do_load(path)
+
+    def do_load(self, arg):
+        """Load an invoice folder and export the text of its PDFs.
+
+        Walks *arg* recursively, collects every file whose extension is
+        ``.pdf`` (case-insensitive), extracts the text of each one with
+        ``_parse_pdfs`` and writes the result to ``test.json`` in the
+        parent directory of *arg*.
+
+        Side effect: the process working directory is changed to the
+        parent directory of *arg*; the collected PDF paths are relative
+        to that directory.
+
+        Args:
+            arg: Path of the folder to scan. If it is not an existing
+                directory a notice is printed and nothing else happens.
+        """
+        if not os.path.isdir(arg):
+            print(f'不是有效的文件夹: {arg}')
+            return
+        # Resolve to an absolute path BEFORE changing directory: a relative
+        # path would otherwise be re-interpreted against the new working
+        # directory, and a bare name such as 'pd' has an empty dirname,
+        # which os.chdir() rejects.
+        folder = os.path.abspath(arg)
+        parent = os.path.dirname(folder)
+        os.chdir(parent)
+        pdfs = []
+        for root, _, files in os.walk(folder):
+            for fn in files:
+                if os.path.splitext(fn)[1].lower() != '.pdf':
+                    continue
+                pdfs.append(os.path.relpath(os.path.join(root, fn), parent))
+
+        print(f'pdf文件: {pdfs}')
+        pdf_ctxs = self._parse_pdfs(pdfs)
+        # Export as JSON; ensure_ascii=False keeps non-ASCII text readable.
+        with open('test.json', 'w', encoding='utf-8') as out:
+            json.dump(pdf_ctxs, out, ensure_ascii=False)
+        print('完成!')
+
+    def _parse_pdfs(self, pdfs):
+        """Extract the first-page text of each PDF as a list of lines.
+
+        Args:
+            pdfs: Iterable of PDF file paths.
+
+        Returns:
+            A list with one entry per input path, in input order. Each
+            entry is the list of text lines of the first page with all
+            space characters removed. A PDF without pages or without
+            extractable text yields ``['']``.
+        """
+        result = []
+        for fpth in pdfs:
+            with pdfplumber.open(fpth) as pdf:
+                # Only the first page is read. Guard against a PDF that
+                # has no pages at all.
+                text = pdf.pages[0].extract_text() if pdf.pages else None
+            # NOTE(review): extract_text() appears to return None on some
+            # pdfplumber versions when a page has no text - confirm;
+            # `or ''` covers both that and the empty-string case.
+            lines = (text or '').replace(' ', '').split('\n')
+            result.append(lines)
+
+        return result
+
+
+if __name__ == '__main__':
+    # Script entry point: process the invoice PDFs in the ./pd folder
+    # (resolved against the current working directory).
+    invoice_dir = './pd'
+    FapiaoShell(invoice_dir)