add recovery

2025-11-25 06:25:50 +00:00 · 2022-05-07 16:55:20 +08:00 · 2022-05-07 16:55:20 +08:00 · 7e5e95d624
commit 7e5e95d624
parent e10490dd8d
7 changed files with 266 additions and 7 deletions
--- a/applications/多模态表单识别.md
+++ b/applications/多模态表单识别.md
@ -16,14 +16,14 @@
 <center><img src='https://ai-studio-static-online.cdn.bcebos.com/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e'></center>
 <center>图1 多模态表单识别流程图</center>
-注：欢迎再AIStudio领取免费算力体验线上实训，项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3815918)(配备Tesla V100、A100等高级算力资源)
+注：欢迎在AIStudio领取免费算力体验线上实训，项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3884375)(配备Tesla V100、A100等高级算力资源)
 # 2 安装说明
-下载PaddleOCR源码，本项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件)，无需下载解压即可，只需安装依赖环境~
+下载PaddleOCR源码，上述AIStudio项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件)，无需下载解压即可，只需安装依赖环境~
 ```python
@ -33,7 +33,7 @@
 ```python
 # 如仍需安装or安装更新，可以执行以下步骤
-! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
+# ! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
 # ! git clone https://gitee.com/PaddlePaddle/PaddleOCR
 ```
@ -290,7 +290,7 @@ Eval.dataset.transforms.DetResizeForTest：评估尺寸，添加如下参数
 <center><img src="https://ai-studio-static-online.cdn.bcebos.com/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77"></center>
 <center>图8 文本检测方案2-模型评估</center>
-使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`
+使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`，[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/ch_db_mv3-student1600-finetune.zip)
 ```python
@ -538,7 +538,7 @@ Train.dataset.ratio_list：动态采样
 <center>图16 文本识别方案3-模型评估</center>
-使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`
+使用训练好的模型进行评估，更新模型路径`Global.checkpoints`，这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`，[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/rec_mobile_pp-OCRv2-student-realdata.zip)
 ```python
--- a/ppstructure/docs/table/recovery.jpg
+++ b/ppstructure/docs/table/recovery.jpg
--- a/ppstructure/predict_system.py
+++ b/ppstructure/predict_system.py
@ -23,6 +23,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 import cv2
 import json
 import numpy as np
 import time
 import logging
 from copy import deepcopy
@ -33,6 +34,7 @@ from ppocr.utils.logging import get_logger
 from tools.infer.predict_system import TextSystem
 from ppstructure.table.predict_table import TableSystem, to_excel
 from ppstructure.utility import parse_args, draw_structure_result
 from ppstructure.recovery.docx import convert_info_docx
 logger = get_logger()
@ -104,7 +106,12 @@ class StructureSystem(object):
                                                return_ocr_result_in_table)
                else:
                    if self.text_system is not None:
-                        filter_boxes, filter_rec_res = self.text_system(roi_img)
+                        if args.recovery:
                            wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype)
                            wht_im[y1:y2, x1:x2, :] = roi_img
                            filter_boxes, filter_rec_res = self.text_system(wht_im)
                        else:
                            filter_boxes, filter_rec_res = self.text_system(roi_img)
                        # remove style char
                        style_token = [
                            '<strike>', '<strike>', '<sup>', '</sub>', '<b>',
@ -118,7 +125,8 @@ class StructureSystem(object):
                            for token in style_token:
                                if token in rec_str:
                                    rec_str = rec_str.replace(token, '')
-                            box += [x1, y1]
+                            if not args.recovery:
                                box += [x1, y1]
                            res.append({
                                'text': rec_str,
                                'confidence': float(rec_conf),
@ -192,6 +200,8 @@ def main(args):
            # img_save_path = os.path.join(save_folder, img_name + '.jpg')
        cv2.imwrite(img_save_path, draw_img)
        logger.info('result save to {}'.format(img_save_path))
        if args.recovery:
            convert_info_docx(img, res, save_folder, img_name) 
        elapse = time.time() - starttime
        logger.info("Predict time : {:.3f}s".format(elapse))
--- a/ppstructure/recovery/README.md
+++ b/ppstructure/recovery/README.md
@ -0,0 +1,40 @@
 English | [简体中文](README_ch.md)
 - [Getting Started](#getting-started)
  - [1.  Introduction](#1)
  - [2. Quick Start](#2)
 <a name="1"></a>
 ## 1.  Introduction
 Layout recovery means that, after OCR, the recognized content keeps the arrangement of the original document image, and its paragraphs are written to a Word document in the same order.
 Layout recovery combines [layout analysis](../layout/README.md) and [table recognition](../table/README.md) to better recover images, tables, titles, etc.
 The following figure shows the result:
 <div align="center">
 <img src="../docs/table/recovery.jpg"  width = "700" />
 </div>
 <a name="2"></a>
 ## 2. Quick Start
 ```bash
 cd PaddleOCR/ppstructure
 # download model
 mkdir inference && cd inference
 # Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
 # Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
 # Download the ultra-lightweight English table structure model and unzip it
 wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
 cd ..
 # run
 python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
 ```
 After running, the docx file for each image is saved in the directory specified by the `output` argument.
--- a/ppstructure/recovery/README_ch.md
+++ b/ppstructure/recovery/README_ch.md
@ -0,0 +1,44 @@
 [English](README.md) | 简体中文
 # 版面恢复使用说明
 - [1. 简介](#1)
 - [2. 使用](#2)
 <a name="1"></a>
 ## 1.  简介
 版面恢复就是在OCR识别后，内容仍然像原文档图片那样排列着，段落不变、顺序不变地输出到word文档中。
 版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术，从而更好地恢复图片、表格、标题等内容，下图展示了版面恢复的结果：
 <div align="center">
 <img src="../docs/table/recovery.jpg"  width = "700" />
 </div>
 <a name="2"></a>
 ## 2. 使用
 恢复给定文档的版面：
 ```bash
 cd PaddleOCR/ppstructure
 # 下载模型
 mkdir inference && cd inference
 # 下载超轻量级英文PP-OCRv3模型的检测模型并解压
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
 # 下载超轻量级英文PP-OCRv3模型的识别模型并解压
 wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
 # 下载超轻量级英文表格结构模型并解压
 wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
 cd ..
 # 执行预测
 python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
 ```
 运行完成后，每张图片的docx文档会保存到output字段指定的目录下
--- a/ppstructure/recovery/docx.py
+++ b/ppstructure/recovery/docx.py
@ -0,0 +1,160 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import cv2
 import os
 import pypandoc
 from copy import deepcopy
 from docx import Document
 from docx import shared
 from docx.enum.text import WD_ALIGN_PARAGRAPH
 from docx.enum.section import WD_SECTION
 from docx.oxml.ns import qn
 from ppocr.utils.logging import get_logger
 logger = get_logger()
 def convert_info_docx(img, res, save_folder, img_name):
    """Write the layout-analysis result of one page to a Word document.

    The regions are ordered with ``sorted_layout_boxes`` and emitted one by
    one: figures as pictures, titles as headings, text as paragraphs and
    tables through an HTML -> docx conversion done by pandoc. A new
    continuous section is started whenever the layout switches between one
    and two columns. The result is saved as ``<save_folder>/<img_name>.docx``.

    args:
        img: page image array; only its width (``img.shape[1]``) is used
        res(list): ppstructure results, each a dict with ``type``, ``bbox``
            and ``res`` keys
        save_folder(str): directory the docx is written to
        img_name(str): base name of the docx; also the sub-folder of
            ``save_folder`` from which figure crops are read
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    from docx.enum.table import WD_TABLE_ALIGNMENT

    doc = Document()
    doc.styles['Normal'].font.name = 'Times New Roman'
    doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    doc.styles['Normal'].font.size = shared.Pt(6.5)

    # Only the width is needed; indexing also works for 2-D (grayscale) arrays.
    w = img.shape[1]
    res = sorted_layout_boxes(res, w)

    # Column count (1 or 2) of the section currently being written.
    flag = 1
    for region in res:
        if flag == 2 and region['layout'] == 'single':
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
            flag = 1
        elif flag == 1 and region['layout'] == 'double':
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
            flag = 2

        if region['type'] == 'Figure':
            # The crop is looked up under <save_folder>/<img_name>/<bbox>.jpg.
            excel_save_folder = os.path.join(save_folder, img_name)
            img_path = os.path.join(excel_save_folder,
                                    '{}.jpg'.format(region['bbox']))
            paragraph_pic = doc.add_paragraph()
            paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = paragraph_pic.add_run("")
            # Narrower pictures inside a two-column section.
            pic_width = shared.Inches(5) if flag == 1 else shared.Inches(2)
            run.add_picture(img_path, width=pic_width)
        elif region['type'] == 'Title':
            # A title region without recognised text has nothing to write.
            if region['res']:
                doc.add_heading(region['res'][0]['text'])
        elif region['type'] == 'Text':
            paragraph = doc.add_paragraph()
            if region['res']:
                paragraph.paragraph_format.first_line_indent = shared.Inches(
                    0.25)
            for line in region['res']:
                text_run = paragraph.add_run(line['text'] + ' ')
                text_run.font.size = shared.Pt(9)
        elif region['type'] == 'Table':
            # Convert in a private temporary directory so that concurrent runs
            # do not share a 'tmp.docx' in the working directory and nothing
            # is left behind if the conversion fails.
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_path = os.path.join(tmp_dir, 'table.docx')
                # convert_text is the pypandoc entry point for string input;
                # the old generic convert() is deprecated.
                pypandoc.convert_text(
                    region['res']['html'],
                    to='docx',
                    format='html',
                    outputfile=tmp_path)
                tmp_doc = Document(tmp_path)
                if not tmp_doc.tables:
                    # pandoc produced no table for this HTML; skip the region.
                    continue
                paragraph = doc.add_paragraph()
                new_table = deepcopy(tmp_doc.tables[0])
                new_table.style = doc.styles['Table Grid']
                new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
                # Splice the copied table element right after an empty run.
                paragraph.add_run().element.addnext(new_table._tbl)

    # save to docx
    docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
    doc.save(docx_path)
    logger.info('docx save to {}'.format(docx_path))
 def sorted_layout_boxes(res, w):
    """
    Sort layout regions into reading order and tag each with its layout.

    Regions are first sorted top to bottom, then left to right. Consecutive
    regions that sit in the left or right part of the page are collected per
    column and emitted left column first; a region that fits neither column
    test is emitted on its own after the pending columns. Every region dict
    gets a 'layout' key set to 'single' or 'double' (the dicts are modified
    in place).
    args:
        res(list): ppstructure results, each a dict with a 'bbox' entry
            indexed as [x0, y0, x1, y1]
        w(int): width of the page image, used for the column thresholds
    return:
        sorted results(list)
    """
    num_boxes = len(res)
    if num_boxes == 1:
        res[0]['layout'] = 'single'
        return res

    boxes = sorted(
        res, key=lambda region: (region['bbox'][1], region['bbox'][0]))
    half_w = w / 2
    quarter_w = w / 4
    last = num_boxes - 1

    new_res = []
    res_left = []
    res_right = []
    for i, box in enumerate(boxes):
        x0, y0, x1 = box['bbox'][0], box['bbox'][1], box['bbox'][2]
        if i == last:
            below_prev = y0 > boxes[i - 1]['bbox'][3]
            spans_centre = x0 < half_w and x1 > half_w
            if below_prev and spans_centre:
                # Full-width region under the columns: flush them first.
                new_res += res_left
                new_res += res_right
                box['layout'] = 'single'
                new_res.append(box)
            else:
                box['layout'] = 'double'
                if x1 > half_w:
                    res_right.append(box)
                else:
                    # Everything else goes left, including degenerate boxes
                    # (x0 >= w/2 >= x1), so that the last region and the
                    # pending columns are never dropped from the result.
                    res_left.append(box)
                new_res += res_left
                new_res += res_right
        elif x0 < quarter_w and x1 < 3 * w / 4:
            box['layout'] = 'double'
            res_left.append(box)
        elif x0 > quarter_w and x1 > half_w:
            box['layout'] = 'double'
            res_right.append(box)
        else:
            # Neither column test matched: emit the pending columns, then
            # this region on its own.
            new_res += res_left
            new_res += res_right
            box['layout'] = 'single'
            new_res.append(box)
            res_left = []
            res_right = []
    return new_res
--- a/ppstructure/utility.py
+++ b/ppstructure/utility.py
@ -61,6 +61,11 @@ def init_args():
        type=str2bool,
        default=True,
        help='In the forward, whether the non-table area is recognition by ocr')
    parser.add_argument(
        "--recovery",
        # argparse's type=bool converts any non-empty string to True, so
        # "--recovery=False" would still enable recovery. str2bool is the
        # converter the neighbouring boolean option already uses.
        type=str2bool,
        default=False,
        help='Whether to enable layout of recovery')
    return parser