diff --git a/applications/多模态表单识别.md b/applications/多模态表单识别.md index e64a22e169..d47bbe7704 100644 --- a/applications/多模态表单识别.md +++ b/applications/多模态表单识别.md @@ -16,14 +16,14 @@
图1 多模态表单识别流程图
-注:欢迎再AIStudio领取免费算力体验线上实训,项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3815918)(配备Tesla V100、A100等高级算力资源) +注:欢迎在AIStudio领取免费算力体验线上实训,项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3884375)(配备Tesla V100、A100等高级算力资源) # 2 安装说明 -下载PaddleOCR源码,本项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件),无需下载解压即可,只需安装依赖环境~ +下载PaddleOCR源码:上述AIStudio项目中已经为大家打包好PaddleOCR(配置文件已修改好),无需另行下载解压,只需安装依赖环境~ ```python @@ -33,7 +33,7 @@ ```python # 如仍需安装or安装更新,可以执行以下步骤 -! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph +# ! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph # ! git clone https://gitee.com/PaddlePaddle/PaddleOCR ``` @@ -290,7 +290,7 @@ Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数
图8 文本检测方案2-模型评估
-使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy` +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`,[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/ch_db_mv3-student1600-finetune.zip) ```python @@ -538,7 +538,7 @@ Train.dataset.ratio_list:动态采样
图16 文本识别方案3-模型评估
-使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy` +使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`,[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/rec_mobile_pp-OCRv2-student-realdata.zip) ```python diff --git a/ppstructure/docs/table/recovery.jpg b/ppstructure/docs/table/recovery.jpg new file mode 100644 index 0000000000..bee2e2fb34 Binary files /dev/null and b/ppstructure/docs/table/recovery.jpg differ diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index 7f18fcdf8e..b0ede5f3a1 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -23,6 +23,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) os.environ["FLAGS_allocator_strategy"] = 'auto_growth' import cv2 import json +import numpy as np import time import logging from copy import deepcopy @@ -33,6 +34,7 @@ from ppocr.utils.logging import get_logger from tools.infer.predict_system import TextSystem from ppstructure.table.predict_table import TableSystem, to_excel from ppstructure.utility import parse_args, draw_structure_result +from ppstructure.recovery.docx import convert_info_docx logger = get_logger() @@ -104,7 +106,12 @@ class StructureSystem(object): return_ocr_result_in_table) else: if self.text_system is not None: - filter_boxes, filter_rec_res = self.text_system(roi_img) + if args.recovery: + wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype) + wht_im[y1:y2, x1:x2, :] = roi_img + filter_boxes, filter_rec_res = self.text_system(wht_im) + else: + filter_boxes, filter_rec_res = self.text_system(roi_img) # remove style char style_token = [ '', '', '', '', '', @@ -118,7 +125,8 @@ class StructureSystem(object): for token in style_token: if token in rec_str: rec_str = rec_str.replace(token, '') - box += [x1, y1] + if not args.recovery: + box += [x1, y1] res.append({ 'text': rec_str, 
'confidence': float(rec_conf), @@ -192,6 +200,8 @@ def main(args): # img_save_path = os.path.join(save_folder, img_name + '.jpg') cv2.imwrite(img_save_path, draw_img) logger.info('result save to {}'.format(img_save_path)) + if args.recovery: + convert_info_docx(img, res, save_folder, img_name) elapse = time.time() - starttime logger.info("Predict time : {:.3f}s".format(elapse)) diff --git a/ppstructure/recovery/README.md b/ppstructure/recovery/README.md new file mode 100644 index 0000000000..7e59367063 --- /dev/null +++ b/ppstructure/recovery/README.md @@ -0,0 +1,40 @@ +English | [简体中文](README_ch.md) + +- [Getting Started](#getting-started) + - [1. Introduction](#1) + - [2. Quick Start](#2) + + + +## 1. Introduction + +Layout recovery means that, after OCR recognition, the content keeps the arrangement of the original document image, and the paragraphs are written to a Word document in the same order. + +Layout recovery combines [layout analysis](../layout/README.md) and [table recognition](../table/README.md) to better recover images, tables, titles, etc. +The following figure shows the result: + +
+ +
+ + + +## 2. Quick Start + +```bash +cd PaddleOCR/ppstructure + +# download model +mkdir inference && cd inference +# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar +# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar +# Download the ultra-lightweight English table structure model and unzip it +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +cd .. +# run +python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png +``` + +After running, the docx file of each image is saved in the directory specified by the `--output` argument. \ No newline at end of file diff --git a/ppstructure/recovery/README_ch.md b/ppstructure/recovery/README_ch.md new file mode 100644 index 0000000000..570acc4011 --- /dev/null +++ b/ppstructure/recovery/README_ch.md @@ -0,0 +1,44 @@ +[English](README.md) | 简体中文 + +# 版面恢复使用说明 + +- [1. 简介](#1) +- [2. 使用](#2) + + + + +## 1. 简介 + +版面恢复就是在OCR识别后,内容仍然像原文档图片那样排列,段落不变、顺序不变地输出到Word文档中。 + +版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,下图展示了版面恢复的结果: + +
+ +
+ + + +## 2. 使用 + +恢复给定文档的版面: + +```bash +cd PaddleOCR/ppstructure + +# 下载模型 +mkdir inference && cd inference +# 下载超轻量级英文PP-OCRv3模型的检测模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar +# 下载超轻量级英文PP-OCRv3模型的识别模型并解压 +wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar +# 下载超轻量级英文表格结构模型并解压 +wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar +cd .. +# 执行预测 +python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png +``` + +运行完成后,每张图片的docx文档会保存到`--output`参数指定的目录下 + diff --git a/ppstructure/recovery/docx.py b/ppstructure/recovery/docx.py new file mode 100644 index 0000000000..5278217d5b --- /dev/null +++ b/ppstructure/recovery/docx.py @@ -0,0 +1,160 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import cv2
+import os
+import pypandoc
+from copy import deepcopy
+
+from docx import Document
+from docx import shared
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.enum.section import WD_SECTION
+from docx.oxml.ns import qn
+
+from ppocr.utils.logging import get_logger
+logger = get_logger()
+
+
+def convert_info_docx(img, res, save_folder, img_name):
+    """
+    Write the layout regions in res to <save_folder>/<img_name>.docx
+    """
+    doc = Document()
+    doc.styles['Normal'].font.name = 'Times New Roman'
+    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
+    doc.styles['Normal'].font.size = shared.Pt(6.5)
+    h, w, _ = img.shape
+
+    res = sorted_layout_boxes(res, w)
+    flag = 1  # column count of the current section: 1 = single, 2 = double
+    for i, region in enumerate(res):
+        if flag == 2 and region['layout'] == 'single':
+            section = doc.add_section(WD_SECTION.CONTINUOUS)
+            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
+            flag = 1
+        elif flag == 1 and region['layout'] == 'double':
+            section = doc.add_section(WD_SECTION.CONTINUOUS)
+            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
+            flag = 2
+
+        if region['type'] == 'Figure':
+            img_path = os.path.join(save_folder, img_name,
+                                    '{}.jpg'.format(region['bbox']))
+            paragraph_pic = doc.add_paragraph()
+            paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            run = paragraph_pic.add_run("")
+            if flag == 1:
+                run.add_picture(img_path, width=shared.Inches(5))
+            elif flag == 2:
+                run.add_picture(img_path, width=shared.Inches(2))
+        elif region['type'] == 'Title' and region['res']:  # skip empty titles
+            doc.add_heading(region['res'][0]['text'])
+        elif region['type'] == 'Text':
+            paragraph = doc.add_paragraph()
+            paragraph_format = paragraph.paragraph_format
+            for j, line in enumerate(region['res']):  # j: do not shadow i
+                if j == 0:
+                    paragraph_format.first_line_indent = shared.Inches(0.25)
+                text_run = paragraph.add_run(line['text'] + ' ')
+                text_run.font.size = shared.Pt(9)
+        elif region['type'] == 'Table':
+            pypandoc.convert_text(  # convert() is deprecated in pypandoc
+                source=region['res']['html'], format='html', to='docx',
+                outputfile='tmp.docx')
+            tmp_doc = Document('tmp.docx')
+            paragraph = doc.add_paragraph()
+
+            table = tmp_doc.tables[0]
+            new_table = deepcopy(table)
+            new_table.style = doc.styles['Table Grid']
+            from docx.enum.table import WD_TABLE_ALIGNMENT
+            new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
+            paragraph.add_run().element.addnext(new_table._tbl)
+            os.remove('tmp.docx')
+        else:
+            continue
+
+    # save to docx
+    docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
+    doc.save(docx_path)
+    logger.info('docx save to {}'.format(docx_path))
+
+
+def sorted_layout_boxes(res, w):
+    """
+    Sort layout regions top to bottom, left to right, and tag each one
+    args:
+        res(list):ppstructure results; each item is given a 'layout' key
+        w(int):image width, used to split left/right columns
+    return:
+        sorted results(list), 'layout' is 'single' or 'double'
+    """
+    num_boxes = len(res)
+    if num_boxes == 1:
+        res[0]['layout'] = 'single'
+        return res
+
+    _boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
+
+    new_res = []
+    res_left = []
+    res_right = []
+    i = 0
+
+    while True:
+        if i >= num_boxes:
+            break
+        if i == num_boxes - 1:
+            if _boxes[i]['bbox'][1] > _boxes[i - 1]['bbox'][3] and _boxes[i][
+                    'bbox'][0] < w / 2 and _boxes[i]['bbox'][2] > w / 2:
+                new_res += res_left
+                new_res += res_right
+                _boxes[i]['layout'] = 'single'
+                new_res.append(_boxes[i])
+            else:
+                if _boxes[i]['bbox'][2] > w / 2:
+                    _boxes[i]['layout'] = 'double'
+                    res_right.append(_boxes[i])
+                    new_res += res_left
+                    new_res += res_right
+                elif _boxes[i]['bbox'][0] < w / 2:
+                    _boxes[i]['layout'] = 'double'
+                    res_left.append(_boxes[i])
+                    new_res += res_left
+                    new_res += res_right
+            res_left = []
+            res_right = []
+            break
+        elif _boxes[i]['bbox'][0] < w / 4 and _boxes[i]['bbox'][2] < 3*w / 4:
+            _boxes[i]['layout'] = 'double'
+            res_left.append(_boxes[i])
+            i += 1
+        elif _boxes[i]['bbox'][0] > w / 4 and _boxes[i]['bbox'][2] > w / 2:
+            _boxes[i]['layout'] = 'double'
+            res_right.append(_boxes[i])
+            i += 1
+        else:
+            new_res += res_left
+            new_res += res_right
+            _boxes[i]['layout'] = 'single'
+            new_res.append(_boxes[i])
+            res_left = []
+            res_right = []
+            i += 1
+    if res_left:
+        new_res += res_left
+    if res_right:
+        new_res += res_right
+    return new_res
\ No newline at end of file
diff --git a/ppstructure/utility.py b/ppstructure/utility.py
index 938c12f951..1ad902e7e6 100644
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@@ -61,6 +61,11 @@ def init_args():
         type=str2bool,
         default=True,
         help='In the forward, whether the non-table area is recognition by ocr')
+    parser.add_argument(
+        "--recovery",
+        type=str2bool,  # type=bool would parse "--recovery=False" as True
+        default=False,
+        help='Whether to enable layout of recovery')
     return parser
 
 