Yuming Long dcd6d0ff67
Refactor: support entire page OCR with ocr_mode and ocr_languages (#1579)
## Summary
Second part of the OCR refactor, which moves OCR from the inference repo to the
unstructured repo. The first part was done in
https://github.com/Unstructured-IO/unstructured-inference/pull/231. This
PR adds the OCR processing logic for entire-page OCR and supports two OCR
modes: "entire_page" or "individual_blocks".

The updated workflow for `Hi_res` partition:
* pass the document as data/filename to inference repo to get
`inferred_layout` (DocumentLayout)
* pass the document as data/filename to the OCR module, which first opens the
document (creating a temp file/dir as needed) and splits the document by
pages (converting PDF pages to image pages for PDF files)
* if ocr mode is `"entire_page"`
    *  OCR the entire image
    * merge the OCR layout with inferred page layout
* if ocr mode is `"individual_blocks"`
    * from the inferred page layout, find elements with no extracted text and crop
    the entire image by the bboxes of those elements
    * replace each empty text element with the text obtained by running OCR on the
    cropped image
* return all merged PageLayouts and form a DocumentLayout for
later processing

This PR also bumps `unstructured-inference==0.7.2` since the branch relies
on the OCR refactor from unstructured-inference.
  
## Test
```
from unstructured.partition.auto import partition

entrie_page_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="entire_page", ocr_languages="eng+kor", strategy="hi_res")
individual_blocks_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="individual_blocks", ocr_languages="eng+kor", strategy="hi_res")
print([el.text for el in entrie_page_ocr_mode_elements])
print([el.text for el in individual_blocks_ocr_mode_elements])
```
latest output:
```
# entrie_page
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'accounts.', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were', 'successfully sent', '4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]', '메 고']
# individual_blocks
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASURES HARUTOM| 2] 팬 입니다. 팬 으로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 habe ERO, 이 머 일 을 적극 저 희 의 ASS 전 달 하여 귀 사 의 진지한 고 2 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were ciiccecefisliy cant', 'VULLESSIULY Set 4. Use the hashtag of Haruto on your tweet to show that you have sent your email']
```

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
2023-10-06 22:54:49 +00:00

527 lines
16 KiB
Python

import os
import pathlib
from unittest import mock
import pytest
from PIL import Image
from pytesseract import TesseractError
from unstructured_inference.inference import layout
from unstructured.chunking.title import chunk_by_title
from unstructured.partition import image, ocr, pdf
from unstructured.partition.json import partition_json
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.staging.base import elements_to_json
# Resolved absolute path of the directory containing this test module; tests
# below join it with relative segments to locate files under "example-docs".
DIRECTORY = pathlib.Path(__file__).parent.resolve()
class MockResponse:
    """Bare-bones response double carrying a status code and a JSON payload."""

    def __init__(self, status_code, response):
        # Store both values verbatim; no validation is performed.
        self.response = response
        self.status_code = status_code

    def json(self):
        """Return the payload object supplied at construction time."""
        payload = self.response
        return payload
def mock_healthy_get(url, **kwargs):
    """Stand-in GET handler: ignores its arguments and answers 200 with an empty body."""
    healthy = MockResponse(status_code=200, response={})
    return healthy
def mock_unhealthy_get(url, **kwargs):
    """Stand-in GET handler: ignores its arguments and answers 500 with an empty body."""
    unhealthy = MockResponse(status_code=500, response={})
    return unhealthy
def mock_unsuccessful_post(url, **kwargs):
    """Stand-in POST handler: ignores its arguments and answers 500 with an empty body."""
    failure = MockResponse(status_code=500, response={})
    return failure
def mock_successful_post(url, **kwargs):
    """Stand-in POST handler: answers 200 with a fixed two-page payload of Title elements."""
    first_page = {
        "number": 0,
        "elements": [
            {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"},
        ],
    }
    second_page = {
        "number": 1,
        "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
    }
    body = {"pages": [first_page, second_page]}
    return MockResponse(status_code=200, response=body)
class MockPageLayout(layout.PageLayout):
    """Page layout double whose ``elements`` is always a single fixed Title element."""

    def __init__(self, number: int, image: Image):
        # The parent initializer is not invoked; only these two attributes are set.
        self.image = image
        self.number = number

    @property
    def elements(self):
        """Return a one-item list holding a Title element with a (0, 0)-(2, 2) box."""
        title_fields = {
            "type": "Title",
            "x1": 0,
            "y1": 0,
            "x2": 2,
            "y2": 2,
            "text": "Charlie Brown and the Great Pumpkin",
        }
        return [layout.LayoutElement(**title_fields)]
class MockDocumentLayout(layout.DocumentLayout):
    """Document layout double whose ``pages`` is always one mocked page."""

    @property
    def pages(self):
        """Return a one-item list with page 0 backed by a 1x1, mode "1" image."""
        tiny_image = Image.new("1", (1, 1))
        only_page = MockPageLayout(number=0, image=tiny_image)
        return [only_page]
@pytest.mark.parametrize(
    ("filename", "file"),
    [
        ("example-docs/example.jpg", None),
        (None, b"0000"),
    ],
)
def test_partition_image_local(monkeypatch, filename, file):
    """Local image partitioning returns the text of the mocked layout.

    Runs once with a filename and once with in-memory bytes. Both the model
    and the OCR entry points are replaced so the result comes solely from
    ``MockDocumentLayout``.
    """
    # Patch the file- AND data-based variant of each step. The original patched
    # ``process_data_with_ocr`` twice and left ``process_file_with_ocr`` unmocked.
    patched_entry_points = (
        (layout, "process_data_with_model"),
        (layout, "process_file_with_model"),
        (ocr, "process_data_with_ocr"),
        (ocr, "process_file_with_ocr"),
    )
    for module, attribute in patched_entry_points:
        monkeypatch.setattr(
            module,
            attribute,
            lambda *args, **kwargs: MockDocumentLayout(),
        )

    partition_image_response = pdf._partition_pdf_or_image_local(
        filename,
        file,
        is_image=True,
    )
    assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin"
@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
def test_partition_image_local_raises_with_no_filename():
    """An empty filename with no file object is expected to raise FileNotFoundError."""
    call_kwargs = {"filename": "", "file": None, "is_image": True}
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(**call_kwargs)
def test_partition_image_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """The "auto" strategy finds the paper title and records a float detection probability."""
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    probed_index = 2

    elements = image.partition_image(filename=filename, strategy="auto")

    # Keep only Title elements that split into more than ten space-separated pieces.
    long_titles = []
    for el in elements:
        if el.category == "Title" and len(el.text.split(" ")) > 10:
            long_titles.append(el)

    assert long_titles[0].text == expected_title
    probability = elements[probed_index].metadata.detection_class_prob
    assert probability is not None
    assert isinstance(probability, float)
def test_partition_image_with_table_extraction(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """With table-structure inference on, exactly one element carries an HTML table."""
    partition_kwargs = {
        "filename": filename,
        "strategy": "hi_res",
        "infer_table_structure": True,
    }
    elements = image.partition_image(**partition_kwargs)

    # Collect every non-empty HTML rendering attached to element metadata.
    html_tables = []
    for el in elements:
        html = el.metadata.text_as_html
        if html:
            html_tables.append(html)

    assert len(html_tables) == 1
    assert "<table><thead><th>" in html_tables[0]
def test_partition_image_with_multipage_tiff(
    filename="example-docs/layout-parser-paper-combined.tiff",
):
    """The last element of the combined TIFF is reported on page 2."""
    elements = image.partition_image(filename=filename, strategy="auto")
    last_element = elements[-1]
    assert last_element.metadata.page_number == 2
def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
    """``ocr_languages`` given to partition_image reaches the file-based OCR call."""
    ocr_stub = mock.MagicMock()
    with mock.patch.object(ocr, "process_file_with_ocr", ocr_stub) as mock_partition:
        image.partition_image(
            filename=filename,
            strategy="hi_res",
            ocr_languages="eng+swe",
        )

    forwarded_kwargs = mock_partition.call_args.kwargs
    assert forwarded_kwargs.get("ocr_languages") == "eng+swe"
def test_partition_image_from_file_with_language_passed(
    filename="example-docs/example.jpg",
):
    """``ocr_languages`` given with a file object reaches the data-based OCR call."""
    ocr_stub = mock.MagicMock()
    patcher = mock.patch.object(ocr, "process_data_with_ocr", ocr_stub)
    with patcher as mock_partition, open(filename, "rb") as f:
        image.partition_image(file=f, strategy="hi_res", ocr_languages="eng+swe")

    forwarded_kwargs = mock_partition.call_args.kwargs
    assert forwarded_kwargs.get("ocr_languages") == "eng+swe"
# NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086
@pytest.mark.skip(reason="Current catching too many tesseract errors")
def test_partition_image_raises_with_invalid_language(
    filename="example-docs/example.jpg",
):
    """An unknown OCR language code is expected to surface a TesseractError."""
    bad_language_kwargs = {
        "filename": filename,
        "strategy": "hi_res",
        "ocr_languages": "fakeroo",
    }
    with pytest.raises(TesseractError):
        image.partition_image(**bad_language_kwargs)
def test_partition_image_with_ocr_detects_korean():
    """OCR-only partitioning with "eng+kor" yields the English heading and Korean text."""
    up_three_levels = ("..", "..", "..")
    filename = os.path.join(DIRECTORY, *up_three_levels, "example-docs", "english-and-korean.png")

    elements = image.partition_image(
        filename=filename,
        ocr_languages="eng+kor",
        strategy="ocr_only",
    )

    heading = elements[0].text
    assert heading == "RULES AND INSTRUCTIONS"
    # Spaces are stripped before the prefix check on the Korean element.
    korean_without_spaces = elements[3].text.replace(" ", "")
    assert korean_without_spaces.startswith("안녕하세요")
def test_partition_image_with_ocr_detects_korean_from_file():
    """Same Korean-detection check as the filename variant, but fed an open binary file."""
    up_three_levels = ("..", "..", "..")
    filename = os.path.join(DIRECTORY, *up_three_levels, "example-docs", "english-and-korean.png")

    ocr_kwargs = {"ocr_languages": "eng+kor", "strategy": "ocr_only"}
    with open(filename, "rb") as f:
        elements = image.partition_image(file=f, **ocr_kwargs)

    heading = elements[0].text
    assert heading == "RULES AND INSTRUCTIONS"
    # Spaces are stripped before the prefix check on the Korean element.
    korean_without_spaces = elements[3].text.replace(" ", "")
    assert korean_without_spaces.startswith("안녕하세요")
def test_partition_image_raises_with_bad_strategy():
    """An unrecognized strategy name makes partition_image raise ValueError."""
    up_three_levels = ("..", "..", "..")
    filename = os.path.join(DIRECTORY, *up_three_levels, "example-docs", "english-and-korean.png")

    with pytest.raises(ValueError):
        image.partition_image(filename=filename, strategy="fakeroo")
def test_partition_image_default_strategy_hi_res():
    """With no strategy given, element 2 is the title and has coordinates and a float probability."""
    expected_title = (
        "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    )
    probed_index = 2
    up_three_levels = ("..", "..", "..")
    filename = os.path.join(
        DIRECTORY,
        *up_three_levels,
        "example-docs",
        "layout-parser-paper-fast.jpg",
    )

    with open(filename, "rb") as f:
        elements = image.partition_image(file=f)

    probed = elements[probed_index]
    assert probed.text == expected_title
    assert probed.metadata.coordinates is not None
    assert probed.metadata.detection_class_prob is not None
    assert isinstance(probed.metadata.detection_class_prob, float)

    # Detection origin is only checked when debug metadata is enabled.
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        origins = set()
        for element in elements:
            origins.add(element.metadata.detection_origin)
        assert origins == {"image"}
def test_partition_image_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """The patched filesystem modification date appears as ``last_modified``."""
    fake_modified = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=fake_modified)

    first_element = image.partition_image(filename=filename)[0]

    assert first_element.metadata.last_modified == fake_modified
def test_partition_image_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """With strategy="hi_res", the patched modification date appears as ``last_modified``."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    # The keyword was previously misspelled "stratefy", so the strategy this
    # test is named for was never explicitly requested.
    elements = image.partition_image(filename=filename, strategy="hi_res")

    assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_image_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """An explicit ``metadata_last_modified`` wins over the patched filesystem date."""
    fake_modified = "2029-07-05T09:24:28"
    caller_supplied = "2009-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date"
    mocker.patch(patch_target, return_value=fake_modified)

    partition_kwargs = {
        "filename": filename,
        "metadata_last_modified": caller_supplied,
    }
    first_element = image.partition_image(**partition_kwargs)[0]

    assert first_element.metadata.last_modified == caller_supplied
def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """With strategy="hi_res", an explicit ``metadata_last_modified`` wins over the patched date."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    # The keyword was previously misspelled "stratefy", so the strategy this
    # test is named for was never explicitly requested.
    elements = image.partition_image(
        filename=filename,
        strategy="hi_res",
        metadata_last_modified=expected_last_modification_date,
    )

    assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_image_from_file_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object, the patched from-file modification date appears as ``last_modified``."""
    fake_modified = "2029-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=fake_modified)

    with open(filename, "rb") as f:
        elements = image.partition_image(file=f)

    first_element = elements[0]
    assert first_element.metadata.last_modified == fake_modified
def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object with strategy="hi_res", the patched date appears as ``last_modified``."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename, "rb") as f:
        # The keyword was previously misspelled "stratefy", so the strategy
        # this test is named for was never explicitly requested.
        elements = image.partition_image(file=f, strategy="hi_res")

    assert elements[0].metadata.last_modified == mocked_last_modification_date
def test_partition_image_from_file_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object, an explicit ``metadata_last_modified`` wins over the patched date."""
    fake_modified = "2029-07-05T09:24:28"
    caller_supplied = "2009-07-05T09:24:28"
    patch_target = "unstructured.partition.pdf.get_last_modified_date_from_file"
    mocker.patch(patch_target, return_value=fake_modified)

    with open(filename, "rb") as f:
        elements = image.partition_image(file=f, metadata_last_modified=caller_supplied)

    first_element = elements[0]
    assert first_element.metadata.last_modified == caller_supplied
def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    """For a file object with strategy="hi_res", an explicit date wins over the patched one."""
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename, "rb") as f:
        # The keyword was previously misspelled "stratefy", so the strategy
        # this test is named for was never explicitly requested.
        elements = image.partition_image(
            file=f,
            metadata_last_modified=expected_last_modification_date,
            strategy="hi_res",
        )

    assert elements[0].metadata.last_modified == expected_last_modification_date
def test_partition_msg_with_json(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """Image elements survive a JSON round trip through elements_to_json / partition_json."""
    elements = image.partition_image(filename=filename, strategy="auto")
    serialized = elements_to_json(elements)
    test_elements = partition_json(text=serialized)

    assert len(elements) == len(test_elements)
    assert elements[0].metadata.page_number == test_elements[0].metadata.page_number
    # Lengths were asserted equal above, so pairing with zip covers every index.
    for original, round_tripped in zip(elements, test_elements):
        assert original == round_tripped
def test_partition_image_with_ocr_has_coordinates_from_filename(
    filename="example-docs/english-and-korean.png",
):
    """OCR-only output attaches the expected integer-truncated box to the first element."""
    expected_corners = [(14, 36), (14, 16), (381, 16), (381, 36)]

    elements = image.partition_image(filename=filename, strategy="ocr_only")

    # Truncate each coordinate to int before comparing.
    int_coordinates = []
    for x, y in elements[0].metadata.coordinates.points:
        int_coordinates.append((int(x), int(y)))

    assert int_coordinates == expected_corners
@pytest.mark.parametrize(
    ("filename"),
    [
        ("example-docs/layout-parser-paper-with-table.jpg"),
        ("example-docs/english-and-korean.png"),
        ("example-docs/layout-parser-paper-fast.jpg"),
    ],
)
def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
    filename,
):
    """No coordinate of any non-empty OCR-only element is NaN."""
    import math

    elements = image.partition_image(filename=filename, strategy="ocr_only")
    for element in elements:
        # TODO (jennings) One or multiple elements is an empty string
        # without coordinates. This should be fixed in a new issue
        if element.text:
            box = element.metadata.coordinates.points
            for point in box:
                # ``x is not math.nan`` compares object identity and is true for
                # any computed NaN, so it could never fail; test the value instead.
                assert not math.isnan(point[0])
                assert not math.isnan(point[1])
def test_partition_image_formats_languages_for_tesseract():
    """``languages=["jpn_vert"]`` reaches the OCR call as ``ocr_languages="jpn_vert"``."""
    filename = "example-docs/jpn-vert.jpeg"
    patch_target = "unstructured.partition.ocr.process_file_with_ocr"

    with mock.patch(patch_target) as mock_process_file_with_ocr:
        image.partition_image(filename=filename, strategy="hi_res", languages=["jpn_vert"])
        # Inspect only the keyword arguments of the first recorded call.
        first_call = mock_process_file_with_ocr.call_args_list[0]
        forwarded_kwargs = first_call[1]
        assert "ocr_languages" in forwarded_kwargs
        assert forwarded_kwargs["ocr_languages"] == "jpn_vert"
def test_partition_image_warns_with_ocr_languages(caplog):
    """Passing ``ocr_languages`` logs the deprecation notice."""
    partition_kwargs = {
        "filename": "example-docs/layout-parser-paper-fast.jpg",
        "strategy": "hi_res",
        "ocr_languages": "eng",
    }
    image.partition_image(**partition_kwargs)

    expected_notice = "The ocr_languages kwarg will be deprecated"
    assert expected_notice in caplog.text
def test_add_chunking_strategy_on_partition_image(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    """``chunking_strategy="by_title"`` matches calling chunk_by_title on the plain output."""
    plain_elements = image.partition_image(filename=filename)
    chunk_elements = image.partition_image(filename, chunking_strategy="by_title")
    expected_chunks = chunk_by_title(plain_elements)

    # Chunked output must differ from the plain output and equal the manual chunking.
    assert chunk_elements != plain_elements
    assert chunk_elements == expected_chunks
def test_add_chunking_strategy_on_partition_image_hi_res(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    """Same by-title chunking check, using hi_res with table-structure inference."""
    shared_kwargs = {"strategy": "hi_res", "infer_table_structure": True}

    plain_elements = image.partition_image(filename=filename, **shared_kwargs)
    chunk_elements = image.partition_image(
        filename,
        chunking_strategy="by_title",
        **shared_kwargs,
    )
    expected_chunks = chunk_by_title(plain_elements)

    # Chunked output must differ from the plain output and equal the manual chunking.
    assert chunk_elements != plain_elements
    assert chunk_elements == expected_chunks
def test_partition_image_uses_model_name():
    """``model_name`` given to partition_image is forwarded to the local partitioner."""
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
    ) as mockpartition:
        image.partition_image("example-docs/layout-parser-paper-fast.jpg", model_name="test")

    # Leftover debug print removed. Compare against the value passed in rather
    # than mere truthiness, so a wrong non-empty model name fails the test.
    forwarded_kwargs = mockpartition.call_args.kwargs
    assert "model_name" in forwarded_kwargs
    assert forwarded_kwargs["model_name"] == "test"
@pytest.mark.parametrize(
    ("ocr_mode", "idx_title_element"),
    [
        ("entire_page", 2),
        ("individual_blocks", 1),
    ],
)
def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
    """Each OCR mode places the paper title at its parametrized element index."""
    first_line = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    partition_kwargs = {
        "filename": "example-docs/layout-parser-paper-fast.jpg",
        "ocr_mode": ocr_mode,
        "strategy": "hi_res",
    }

    elements = image.partition_image(**partition_kwargs)

    # Note(yuming): idx_title_element is different based on xy-cut and ocr mode
    title_element = elements[idx_title_element]
    assert title_element.text == first_line
def test_partition_image_hi_res_invalid_ocr_mode():
    """An unrecognized ``ocr_mode`` makes hi_res partitioning raise ValueError."""
    partition_kwargs = {
        "filename": "example-docs/layout-parser-paper-fast.jpg",
        "ocr_mode": "invalid_ocr_mode",
        "strategy": "hi_res",
    }
    with pytest.raises(ValueError):
        image.partition_image(**partition_kwargs)