enhancement: add ocr_only strategy for partition_image (#540)

* spike for ocr-only strategy for images

* fix for file processing

* extra space

* add korean to ci

* added test for ocr_only strategy

* added docs for ocr_only

* changelog and version

* added test for bad strategy

* skip korean test if in docker

* bump version

* version bump

* document valid strategies

* bump version for release

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
This commit is contained in:
Matt Robinson 2023-05-04 16:23:51 -04:00 committed by GitHub
parent fae5f8fdde
commit 392cccdbf7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 103 additions and 15 deletions

View File

@ -107,7 +107,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version
make test
make check-coverage

View File

@ -1,7 +1,9 @@
## 0.6.3-dev3
## 0.6.3
### Enhancements
* Add an "ocr_only" strategy for `partition_image`.
### Features
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST

View File

@ -430,6 +430,20 @@ Examples:
elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")
The default partitioning strategy for ``partition_image`` is `"hi_res"`, which segments the document using
``detectron2`` and then OCRs the document. You can also choose ``"ocr_only"`` as the partitioning strategy,
which OCRs the document and then runs the output through ``partition_text``. This can be helpful
if ``detectron2`` does not detect a text element in the image. To run the example below, ensure you
have the Korean language pack for Tesseract installed on your system.
.. code:: python
from unstructured.partition.image import partition_image
filename = "example-docs/english-and-korean.png"
elements = partition_image(filename=filename, ocr_languages="eng+kor", strategy="ocr_only")
``partition_email``
---------------------

Binary file not shown.

After

Width:  |  Height:  |  Size: 298 KiB

View File

@ -1,3 +1,5 @@
import os
import pathlib
from unittest import mock
import pytest
@ -5,8 +7,13 @@ import requests
from pytesseract import TesseractError
from unstructured_inference.inference import layout
from unstructured.documents.elements import Title
from unstructured.partition import image, pdf
DIRECTORY = pathlib.Path(__file__).parent.resolve()
is_in_docker = os.path.exists("/.dockerenv")
class MockResponse:
def __init__(self, status_code, response):
@ -178,3 +185,37 @@ def test_partition_image_from_file_with_language_passed(filename="example-docs/e
def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
with pytest.raises(TesseractError):
image.partition_image(filename=filename, ocr_languages="fakeroo")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_partition_image_with_ocr_detects_korean():
    """Partition the English/Korean sample image by filename with ``ocr_only``.

    Checks that the first element is the expected English title and that the
    fourth element's text begins with the Korean greeting.
    """
    relative_parts = ("..", "..", "example-docs", "english-and-korean.png")
    image_path = os.path.join(DIRECTORY, *relative_parts)

    partition_kwargs = {
        "filename": image_path,
        "ocr_languages": "eng+kor",
        "strategy": "ocr_only",
    }
    partitioned = image.partition_image(**partition_kwargs)

    expected_title = Title("RULES AND INSTRUCTIONS")
    assert partitioned[0] == expected_title
    assert partitioned[3].text.startswith("안녕하세요")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_partition_image_with_ocr_detects_korean_from_file():
    """Partition the English/Korean sample image from an open file with ``ocr_only``.

    Same expectations as the filename-based variant: the first element is the
    English title and the fourth element's text begins with the Korean greeting.
    """
    relative_parts = ("..", "..", "example-docs", "english-and-korean.png")
    image_path = os.path.join(DIRECTORY, *relative_parts)

    # The image is handed over as a binary file object rather than a path.
    with open(image_path, "rb") as image_file:
        partitioned = image.partition_image(
            file=image_file, ocr_languages="eng+kor", strategy="ocr_only",
        )

    expected_title = Title("RULES AND INSTRUCTIONS")
    assert partitioned[0] == expected_title
    assert partitioned[3].text.startswith("안녕하세요")
def test_partition_image_raises_with_bad_strategy():
    """An unrecognized ``strategy`` value makes ``partition_image`` raise ValueError."""
    relative_parts = ("..", "..", "example-docs", "english-and-korean.png")
    image_path = os.path.join(DIRECTORY, *relative_parts)

    with pytest.raises(ValueError):
        image.partition_image(filename=image_path, strategy="fakeroo")

View File

@ -1 +1 @@
__version__ = "0.6.3-dev3" # pragma: no cover
__version__ = "0.6.3" # pragma: no cover

View File

@ -1,7 +1,14 @@
from typing import List, Optional
import pytesseract
from PIL import Image
from unstructured.documents.elements import Element
from unstructured.partition.common import exactly_one
from unstructured.partition.pdf import partition_pdf_or_image
from unstructured.partition.text import partition_text
VALID_STRATEGIES = ["hi_res", "ocr_only"]
def partition_image(
@ -12,8 +19,10 @@ def partition_image(
token: Optional[str] = None,
include_page_breaks: bool = False,
ocr_languages: str = "eng",
strategy: str = "hi_res",
) -> List[Element]:
"""Parses an image into a list of interpreted elements.
Parameters
----------
filename
@ -30,16 +39,38 @@ def partition_image(
A string defining the authentication token for a self-host url, if applicable.
ocr_languages
The languages to use for the Tesseract agent. To use a language, you'll first need
to isntall the appropriate Tesseract language pack.
to install the appropriate Tesseract language pack.
strategy
The strategy to use for partitioning the image. Valid strategies are "hi_res" and
"ocr_only". When using the "hi_res" strategy, the function uses a layout detection
model to identify document elements. When using the "ocr_only" strategy,
partition_image simply extracts the text from the document and processes it.
"""
if template is None:
template = "layout/image"
return partition_pdf_or_image(
filename=filename,
file=file,
url=url,
template=template,
token=token,
include_page_breaks=include_page_breaks,
ocr_languages=ocr_languages,
)
exactly_one(filename=filename, file=file)
if strategy == "hi_res":
if template is None:
template = "layout/image"
return partition_pdf_or_image(
filename=filename,
file=file,
url=url,
template=template,
token=token,
include_page_breaks=include_page_breaks,
ocr_languages=ocr_languages,
)
elif strategy == "ocr_only":
if file is not None:
image = Image.open(file)
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
else:
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
return partition_text(text=text)
else:
raise ValueError(
f"{strategy} is not a valid strategy for partition_image. "
f"Choose one of {VALID_STRATEGIES}.",
)