diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 521617694..77288d9a1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -107,7 +107,7 @@ jobs: sudo apt-get update sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 - sudo apt-get install -y tesseract-ocr + sudo apt-get install -y tesseract-ocr tesseract-ocr-kor tesseract --version make test make check-coverage diff --git a/CHANGELOG.md b/CHANGELOG.md index b7b0b6824..e8f8bbdf1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,9 @@ -## 0.6.3-dev3 +## 0.6.3 ### Enhancements +* Add an "ocr_only" strategy for `partition_image`. + ### Features * Added `partition_multiple_via_api` for partitioning multiple documents in a single REST diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 4a40e533d..b652e389b 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -430,6 +430,20 @@ Examples: elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe") +The default partitioning strategy for ``partition_image`` is ``"hi_res"``, which segments the document using +``detectron2`` and then OCRs the document. You can also choose ``"ocr_only"`` as the partitioning strategy, +which OCRs the document and then runs the output through ``partition_text``. This can be helpful +if ``detectron2`` does not detect a text element in the image. To run the example below, ensure you +have the Korean language pack for Tesseract installed on your system. + + +..
code:: python + + from unstructured.partition.image import partition_image + + filename = "example-docs/english-and-korean.png" + elements = partition_image(filename=filename, ocr_languages="eng+kor", strategy="ocr_only") + ``partition_email`` --------------------- diff --git a/example-docs/english-and-korean.png b/example-docs/english-and-korean.png new file mode 100644 index 000000000..4a54d93ff Binary files /dev/null and b/example-docs/english-and-korean.png differ diff --git a/test_unstructured/partition/test_image.py b/test_unstructured/partition/test_image.py index a86590b2d..2f2d833ad 100644 --- a/test_unstructured/partition/test_image.py +++ b/test_unstructured/partition/test_image.py @@ -1,3 +1,5 @@ +import os +import pathlib from unittest import mock import pytest @@ -5,8 +7,13 @@ import requests from pytesseract import TesseractError from unstructured_inference.inference import layout +from unstructured.documents.elements import Title from unstructured.partition import image, pdf +DIRECTORY = pathlib.Path(__file__).parent.resolve() + +is_in_docker = os.path.exists("/.dockerenv") + class MockResponse: def __init__(self, status_code, response): @@ -178,3 +185,37 @@ def test_partition_image_from_file_with_language_passed(filename="example-docs/e def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"): with pytest.raises(TesseractError): image.partition_image(filename=filename, ocr_languages="fakeroo") + + +@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") +def test_partition_image_with_ocr_detects_korean(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png") + elements = image.partition_image( + filename=filename, + ocr_languages="eng+kor", + strategy="ocr_only", + ) + + assert elements[0] == Title("RULES AND INSTRUCTIONS") + assert elements[3].text.startswith("안녕하세요") + + +@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker 
container") +def test_partition_image_with_ocr_detects_korean_from_file(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png") + + with open(filename, "rb") as f: + elements = image.partition_image( + file=f, + ocr_languages="eng+kor", + strategy="ocr_only", + ) + + assert elements[0] == Title("RULES AND INSTRUCTIONS") + assert elements[3].text.startswith("안녕하세요") + + +def test_partition_image_raises_with_bad_strategy(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png") + with pytest.raises(ValueError): + image.partition_image(filename=filename, strategy="fakeroo") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6c44ed07a..1f6c2e4b2 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.3-dev3" # pragma: no cover +__version__ = "0.6.3" # pragma: no cover diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py index 69b029aac..0623b2120 100644 --- a/unstructured/partition/image.py +++ b/unstructured/partition/image.py @@ -1,7 +1,14 @@ from typing import List, Optional +import pytesseract +from PIL import Image + from unstructured.documents.elements import Element +from unstructured.partition.common import exactly_one from unstructured.partition.pdf import partition_pdf_or_image +from unstructured.partition.text import partition_text + +VALID_STRATEGIES = ["hi_res", "ocr_only"] def partition_image( @@ -12,8 +19,10 @@ def partition_image( token: Optional[str] = None, include_page_breaks: bool = False, ocr_languages: str = "eng", + strategy: str = "hi_res", ) -> List[Element]: """Parses an image into a list of interpreted elements. + Parameters ---------- filename @@ -30,16 +39,38 @@ def partition_image( A string defining the authentication token for a self-host url, if applicable. ocr_languages The languages to use for the Tesseract agent. 
To use a language, you'll first need - to isntall the appropriate Tesseract language pack. + to install the appropriate Tesseract language pack. + strategy + The strategy to use for partitioning the image. Valid strategies are "hi_res" and + "ocr_only". When using the "hi_res" strategy, the function uses a layout detection + model to identify document elements. When using the "ocr_only" strategy, + partition_image simply extracts the text from the document and processes it. """ - if template is None: - template = "layout/image" - return partition_pdf_or_image( - filename=filename, - file=file, - url=url, - template=template, - token=token, - include_page_breaks=include_page_breaks, - ocr_languages=ocr_languages, - ) + exactly_one(filename=filename, file=file) + + if strategy == "hi_res": + if template is None: + template = "layout/image" + return partition_pdf_or_image( + filename=filename, + file=file, + url=url, + template=template, + token=token, + include_page_breaks=include_page_breaks, + ocr_languages=ocr_languages, + ) + + elif strategy == "ocr_only": + if file is not None: + image = Image.open(file) + text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'") + else: + text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'") + return partition_text(text=text) + + else: + raise ValueError( + f"{strategy} is not a valid strategy for partition_image. " + f"Choose one of {VALID_STRATEGIES}.", + )