mirror of
https://github.com/PaddlePaddle/PaddleOCR.git
synced 2025-11-25 06:25:50 +00:00
add recovery
This commit is contained in:
parent
e10490dd8d
commit
7e5e95d624
@ -16,14 +16,14 @@
|
|||||||
<center><img src='https://ai-studio-static-online.cdn.bcebos.com/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e'></center>
|
<center><img src='https://ai-studio-static-online.cdn.bcebos.com/9bd844b970f94e5ba0bc0c5799bd819ea9b1861bb306471fabc2d628864d418e'></center>
|
||||||
<center>图1 多模态表单识别流程图</center>
|
<center>图1 多模态表单识别流程图</center>
|
||||||
|
|
||||||
注:欢迎再AIStudio领取免费算力体验线上实训,项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3815918)(配备Tesla V100、A100等高级算力资源)
|
注:欢迎在AIStudio领取免费算力体验线上实训,项目链接: [多模态表单识别](https://aistudio.baidu.com/aistudio/projectdetail/3884375)(配备Tesla V100、A100等高级算力资源)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 2 安装说明
|
# 2 安装说明
|
||||||
|
|
||||||
|
|
||||||
下载PaddleOCR源码,本项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件),无需下载解压即可,只需安装依赖环境~
|
下载PaddleOCR源码,上述AIStudio项目中已经帮大家打包好的PaddleOCR(已经修改好配置文件),无需下载解压即可,只需安装依赖环境~
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@ -33,7 +33,7 @@
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
# 如仍需安装or安装更新,可以执行以下步骤
|
# 如仍需安装or安装更新,可以执行以下步骤
|
||||||
! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
|
# ! git clone https://github.com/PaddlePaddle/PaddleOCR.git -b dygraph
|
||||||
# ! git clone https://gitee.com/PaddlePaddle/PaddleOCR
|
# ! git clone https://gitee.com/PaddlePaddle/PaddleOCR
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -290,7 +290,7 @@ Eval.dataset.transforms.DetResizeForTest:评估尺寸,添加如下参数
|
|||||||
<center><img src="https://ai-studio-static-online.cdn.bcebos.com/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77"></center>
|
<center><img src="https://ai-studio-static-online.cdn.bcebos.com/5a75137c5f924dfeb6956b5818812298cc3dc7992ac84954b4175be9adf83c77"></center>
|
||||||
<center>图8 文本检测方案2-模型评估</center>
|
<center>图8 文本检测方案2-模型评估</center>
|
||||||
|
|
||||||
使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`
|
使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/ch_db_mv3-student1600-finetune/best_accuracy`,[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/ch_db_mv3-student1600-finetune.zip)
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@ -538,7 +538,7 @@ Train.dataset.ratio_list:动态采样
|
|||||||
|
|
||||||
<center>图16 文本识别方案3-模型评估</center>
|
<center>图16 文本识别方案3-模型评估</center>
|
||||||
|
|
||||||
使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`
|
使用训练好的模型进行评估,更新模型路径`Global.checkpoints`,这里为大家提供训练好的模型`./pretrain/rec_mobile_pp-OCRv2-student-readldata/best_accuracy`,[模型下载地址](https://paddleocr.bj.bcebos.com/fanliku/sheet_recognition/rec_mobile_pp-OCRv2-student-realdata.zip)
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|||||||
BIN
ppstructure/docs/table/recovery.jpg
Normal file
BIN
ppstructure/docs/table/recovery.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 762 KiB |
@ -23,6 +23,7 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
|
|||||||
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
|
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
|
||||||
import cv2
|
import cv2
|
||||||
import json
|
import json
|
||||||
|
import numpy as np
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
@ -33,6 +34,7 @@ from ppocr.utils.logging import get_logger
|
|||||||
from tools.infer.predict_system import TextSystem
|
from tools.infer.predict_system import TextSystem
|
||||||
from ppstructure.table.predict_table import TableSystem, to_excel
|
from ppstructure.table.predict_table import TableSystem, to_excel
|
||||||
from ppstructure.utility import parse_args, draw_structure_result
|
from ppstructure.utility import parse_args, draw_structure_result
|
||||||
|
from ppstructure.recovery.docx import convert_info_docx
|
||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
@ -104,7 +106,12 @@ class StructureSystem(object):
|
|||||||
return_ocr_result_in_table)
|
return_ocr_result_in_table)
|
||||||
else:
|
else:
|
||||||
if self.text_system is not None:
|
if self.text_system is not None:
|
||||||
filter_boxes, filter_rec_res = self.text_system(roi_img)
|
if args.recovery:
|
||||||
|
wht_im = np.ones(ori_im.shape, dtype=ori_im.dtype)
|
||||||
|
wht_im[y1:y2, x1:x2, :] = roi_img
|
||||||
|
filter_boxes, filter_rec_res = self.text_system(wht_im)
|
||||||
|
else:
|
||||||
|
filter_boxes, filter_rec_res = self.text_system(roi_img)
|
||||||
# remove style char
|
# remove style char
|
||||||
style_token = [
|
style_token = [
|
||||||
'<strike>', '<strike>', '<sup>', '</sub>', '<b>',
|
'<strike>', '<strike>', '<sup>', '</sub>', '<b>',
|
||||||
@ -118,7 +125,8 @@ class StructureSystem(object):
|
|||||||
for token in style_token:
|
for token in style_token:
|
||||||
if token in rec_str:
|
if token in rec_str:
|
||||||
rec_str = rec_str.replace(token, '')
|
rec_str = rec_str.replace(token, '')
|
||||||
box += [x1, y1]
|
if not args.recovery:
|
||||||
|
box += [x1, y1]
|
||||||
res.append({
|
res.append({
|
||||||
'text': rec_str,
|
'text': rec_str,
|
||||||
'confidence': float(rec_conf),
|
'confidence': float(rec_conf),
|
||||||
@ -192,6 +200,8 @@ def main(args):
|
|||||||
# img_save_path = os.path.join(save_folder, img_name + '.jpg')
|
# img_save_path = os.path.join(save_folder, img_name + '.jpg')
|
||||||
cv2.imwrite(img_save_path, draw_img)
|
cv2.imwrite(img_save_path, draw_img)
|
||||||
logger.info('result save to {}'.format(img_save_path))
|
logger.info('result save to {}'.format(img_save_path))
|
||||||
|
if args.recovery:
|
||||||
|
convert_info_docx(img, res, save_folder, img_name)
|
||||||
elapse = time.time() - starttime
|
elapse = time.time() - starttime
|
||||||
logger.info("Predict time : {:.3f}s".format(elapse))
|
logger.info("Predict time : {:.3f}s".format(elapse))
|
||||||
|
|
||||||
|
|||||||
40
ppstructure/recovery/README.md
Normal file
40
ppstructure/recovery/README.md
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
English | [简体中文](README_ch.md)
|
||||||
|
|
||||||
|
- [Getting Started](#getting-started)
|
||||||
|
- [1. Introduction](#1)
|
||||||
|
- [2. Quick Start](#2)
|
||||||
|
|
||||||
|
<a name="1"></a>
|
||||||
|
|
||||||
|
## 1. Introduction
|
||||||
|
|
||||||
|
Layout recovery means that after OCR recognition, the content is still arranged as in the original document image, and the paragraphs are output to a Word document in the same order.
|
||||||
|
|
||||||
|
Layout recovery combines [layout analysis](../layout/README.md) and [table recognition](../table/README.md) to better recover images, tables, titles, etc.
|
||||||
|
The following figure shows the result:
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="../docs/table/recovery.jpg" width = "700" />
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<a name="2"></a>
|
||||||
|
|
||||||
|
## 2. Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd PaddleOCR/ppstructure
|
||||||
|
|
||||||
|
# download model
|
||||||
|
mkdir inference && cd inference
|
||||||
|
# Download the detection model of the ultra-lightweight English PP-OCRv3 model and unzip it
|
||||||
|
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
|
||||||
|
# Download the recognition model of the ultra-lightweight English PP-OCRv3 model and unzip it
|
||||||
|
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
|
||||||
|
# Download the ultra-lightweight English table structure model and unzip it
|
||||||
|
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
|
||||||
|
cd ..
|
||||||
|
# run
|
||||||
|
python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
|
||||||
|
```
|
||||||
|
|
||||||
|
After running, the docx of each picture will be saved in the directory specified by the output field
|
||||||
44
ppstructure/recovery/README_ch.md
Normal file
44
ppstructure/recovery/README_ch.md
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
[English](README.md) | 简体中文
|
||||||
|
|
||||||
|
# 版面恢复使用说明
|
||||||
|
|
||||||
|
- [1. 简介](#1)
|
||||||
|
- [2. 使用](#2)
|
||||||
|
|
||||||
|
|
||||||
|
<a name="1"></a>
|
||||||
|
|
||||||
|
## 1. 简介
|
||||||
|
|
||||||
|
版面恢复就是在OCR识别后,内容仍然像原文档图片那样排列,并保持段落不变、顺序不变地输出到Word文档中。
|
||||||
|
|
||||||
|
版面恢复结合了[版面分析](../layout/README_ch.md)、[表格识别](../table/README_ch.md)技术,从而更好地恢复图片、表格、标题等内容,下图展示了版面恢复的结果:
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<img src="../docs/table/recovery.jpg" width = "700" />
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<a name="2"></a>
|
||||||
|
|
||||||
|
## 2. 使用
|
||||||
|
|
||||||
|
恢复给定文档的版面:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd PaddleOCR/ppstructure
|
||||||
|
|
||||||
|
# 下载模型
|
||||||
|
mkdir inference && cd inference
|
||||||
|
# 下载超轻量级英文PP-OCRv3模型的检测模型并解压
|
||||||
|
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
|
||||||
|
# 下载英文轻量级PP-OCRv3模型的识别模型并解压
|
||||||
|
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
|
||||||
|
# 下载超轻量级英文表格结构模型并解压
|
||||||
|
wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar
|
||||||
|
cd ..
|
||||||
|
# 执行预测
|
||||||
|
python3 predict_system.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_PP-OCRv3_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --rec_char_dict_path=../ppocr/utils/en_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --output ./output/table --rec_image_shape=3,48,320 --vis_font_path=../doc/fonts/simfang.ttf --recovery=True --image_dir=./docs/table/1.png
|
||||||
|
```
|
||||||
|
|
||||||
|
运行完成后,每张图片的docx文档会保存到output字段指定的目录下
|
||||||
|
|
||||||
160
ppstructure/recovery/docx.py
Normal file
160
ppstructure/recovery/docx.py
Normal file
@ -0,0 +1,160 @@
|
|||||||
|
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import os
|
||||||
|
import pypandoc
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
from docx import Document
|
||||||
|
from docx import shared
|
||||||
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||||
|
from docx.enum.section import WD_SECTION
|
||||||
|
from docx.oxml.ns import qn
|
||||||
|
|
||||||
|
from ppocr.utils.logging import get_logger
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def _html_table_to_docx_table(html):
    """
    Render an HTML string with pandoc and copy out the first table.
    args:
        html(str): HTML markup handed to pandoc
    return:
        a detached copy of the first table of the generated docx, or None
        when the generated document contains no table
    """
    import tempfile

    # A private temporary directory avoids clobbering (or racing on) a fixed
    # 'tmp.docx' in the current working directory, and it is removed even if
    # the conversion raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, 'table.docx')
        # convert_text is pypandoc's entry point for in-memory sources; the
        # ambiguous pypandoc.convert() is deprecated in its favour.
        pypandoc.convert_text(
            source=html, to='docx', format='html', outputfile=tmp_path)
        tables = Document(tmp_path).tables
        if not tables:
            return None
        # deepcopy so the table can be grafted into another document
        return deepcopy(tables[0])


def convert_info_docx(img, res, save_folder, img_name):
    """
    Write the layout regions in res to <save_folder>/<img_name>.docx
    args:
        img: image array; only img.shape[1] (the width) is read
        res(list): region dicts carrying 'type', 'bbox' and 'res'; the
            'layout' key is filled in by sorted_layout_boxes
        save_folder(str): directory that receives the docx file
        img_name(str): base name of the docx file; figure crops are read
            from <save_folder>/<img_name>/<bbox>.jpg (presumably written by
            the caller beforehand - confirm)
    return:
        None
    """
    from docx.enum.table import WD_TABLE_ALIGNMENT

    doc = Document()
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
    normal_style.font.size = shared.Pt(6.5)
    # Index the width directly so 2-D (single channel) images work as well.
    w = img.shape[1]

    res = sorted_layout_boxes(res, w)
    # Column count of the section currently being written (1 or 2).
    flag = 1
    for region in res:
        # Open a new continuous section whenever the column count changes.
        if flag == 2 and region['layout'] == 'single':
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '1')
            flag = 1
        elif flag == 1 and region['layout'] == 'double':
            section = doc.add_section(WD_SECTION.CONTINUOUS)
            section._sectPr.xpath('./w:cols')[0].set(qn('w:num'), '2')
            flag = 2

        region_type = region['type']
        if region_type == 'Figure':
            excel_save_folder = os.path.join(save_folder, img_name)
            img_path = os.path.join(excel_save_folder,
                                    '{}.jpg'.format(region['bbox']))
            paragraph_pic = doc.add_paragraph()
            paragraph_pic.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = paragraph_pic.add_run("")
            # Narrower pictures inside a two-column section.
            picture_width = shared.Inches(5) if flag == 1 else shared.Inches(2)
            run.add_picture(img_path, width=picture_width)
        elif region_type == 'Title':
            lines = region['res']
            # A title region without any recognised text has nothing to add.
            if lines:
                doc.add_heading(lines[0]['text'])
        elif region_type == 'Text':
            paragraph = doc.add_paragraph()
            paragraph.paragraph_format.first_line_indent = shared.Inches(0.25)
            for line in region['res']:
                text_run = paragraph.add_run(line['text'] + ' ')
                text_run.font.size = shared.Pt(9)
        elif region_type == 'Table':
            new_table = _html_table_to_docx_table(region['res']['html'])
            if new_table is None:
                logger.warning('no table produced for region {}'.format(
                    region['bbox']))
                continue
            paragraph = doc.add_paragraph()
            new_table.style = doc.styles['Table Grid']
            new_table.alignment = WD_TABLE_ALIGNMENT.CENTER
            paragraph.add_run().element.addnext(new_table._tbl)
        else:
            continue

    # save to docx
    docx_path = os.path.join(save_folder, '{}.docx'.format(img_name))
    doc.save(docx_path)
    logger.info('docx save to {}'.format(docx_path))
|
||||||
|
|
||||||
|
|
||||||
|
def sorted_layout_boxes(res, w):
    """
    Sort layout regions into reading order and tag each with its layout.

    Regions are first ordered by (top, left). Boxes judged to belong to the
    left or right column are buffered; when a full-width box (or the last
    box) is reached, the buffered left column is emitted before the
    buffered right column. Every returned region gets a 'layout' key set to
    'single' or 'double'.
    args:
        res(list): ppstructure results; each item has a 'bbox' entry indexed
            as [x0, y0, x1, y1]
        w(int): width of the page image
    return:
        sorted results(list)
    """
    num_boxes = len(res)
    if num_boxes == 1:
        res[0]['layout'] = 'single'
        return res

    _boxes = sorted(res, key=lambda x: (x['bbox'][1], x['bbox'][0]))
    half_w = w / 2

    new_res = []
    res_left = []
    res_right = []

    for i, box in enumerate(_boxes):
        x0, y0, x1 = box['bbox'][0], box['bbox'][1], box['bbox'][2]

        if i == num_boxes - 1:
            # Last box: flush whatever is buffered together with it.
            prev_bottom = _boxes[i - 1]['bbox'][3]
            if y0 > prev_bottom and x0 < half_w and x1 > half_w:
                # Starts below the previous box and straddles the middle.
                new_res += res_left
                new_res += res_right
                box['layout'] = 'single'
                new_res.append(box)
            else:
                box['layout'] = 'double'
                if x1 > half_w:
                    res_right.append(box)
                else:
                    # Unconditional fallback: previously a box matching
                    # neither column test was dropped here together with
                    # every buffered column box.
                    res_left.append(box)
                new_res += res_left
                new_res += res_right
            res_left = []
            res_right = []
        elif x0 < w / 4 and x1 < 3 * w / 4:
            box['layout'] = 'double'
            res_left.append(box)
        elif x0 > w / 4 and x1 > half_w:
            box['layout'] = 'double'
            res_right.append(box)
        else:
            # Full-width box: emit the buffered columns first, then the box.
            new_res += res_left
            new_res += res_right
            box['layout'] = 'single'
            new_res.append(box)
            res_left = []
            res_right = []

    return new_res
|
||||||
@ -61,6 +61,11 @@ def init_args():
|
|||||||
type=str2bool,
|
type=str2bool,
|
||||||
default=True,
|
default=True,
|
||||||
help='In the forward, whether the non-table area is recognition by ocr')
|
help='In the forward, whether the non-table area is recognition by ocr')
|
||||||
|
parser.add_argument(
|
||||||
|
"--recovery",
|
||||||
|
type=bool,
|
||||||
|
default=False,
|
||||||
|
help='Whether to enable layout of recovery')
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user