mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 07:03:52 +00:00
enhancement: add ocr_only strategy for partition_image (#540)
* spike for ocr-only strategy for images * fix for file processing * extra space * add korean to ci * added test for ocr_only strategy * added docs for ocr_only * changelog and version * added test for bad strategy * skip korean test if in docker * bump version * version bump * document valid strategies * bump version for release --------- Co-authored-by: qued <64741807+qued@users.noreply.github.com>
This commit is contained in:
parent
fae5f8fdde
commit
392cccdbf7
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -107,7 +107,7 @@ jobs:
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get install -y tesseract-ocr
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make test
|
||||
make check-coverage
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
## 0.6.3-dev3
|
||||
## 0.6.3
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Add an "ocr_only" strategy for `partition_image`.
|
||||
|
||||
### Features
|
||||
|
||||
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
|
||||
|
||||
@ -430,6 +430,20 @@ Examples:
|
||||
elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")
|
||||
|
||||
|
||||
The default partitioning strategy for ``partition_image`` is ``"hi_res"``, which segments the document using
|
||||
``detectron2`` and then OCRs the document. You can also choose ``"ocr_only"`` as the partitioning strategy,
|
||||
which OCRs the document and then runs the output through ``partition_text``. This can be helpful
|
||||
if ``detectron2`` does not detect a text element in the image. To run the example below, ensure you
|
||||
have the Korean language pack for Tesseract installed on your system.
|
||||
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.image import partition_image
|
||||
|
||||
filename = "example-docs/english-and-korean.png"
|
||||
elements = partition_image(filename=filename, ocr_languages="eng+kor", strategy="ocr_only")
|
||||
|
||||
|
||||
``partition_email``
|
||||
---------------------
|
||||
|
||||
BIN
example-docs/english-and-korean.png
Normal file
BIN
example-docs/english-and-korean.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 298 KiB |
@ -1,3 +1,5 @@
|
||||
import os
|
||||
import pathlib
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
@ -5,8 +7,13 @@ import requests
|
||||
from pytesseract import TesseractError
|
||||
from unstructured_inference.inference import layout
|
||||
|
||||
from unstructured.documents.elements import Title
|
||||
from unstructured.partition import image, pdf
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
|
||||
|
||||
class MockResponse:
|
||||
def __init__(self, status_code, response):
|
||||
@ -178,3 +185,37 @@ def test_partition_image_from_file_with_language_passed(filename="example-docs/e
|
||||
def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
|
||||
with pytest.raises(TesseractError):
|
||||
image.partition_image(filename=filename, ocr_languages="fakeroo")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_partition_image_with_ocr_detects_korean():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
|
||||
elements = image.partition_image(
|
||||
filename=filename,
|
||||
ocr_languages="eng+kor",
|
||||
strategy="ocr_only",
|
||||
)
|
||||
|
||||
assert elements[0] == Title("RULES AND INSTRUCTIONS")
|
||||
assert elements[3].text.startswith("안녕하세요")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_partition_image_with_ocr_detects_korean_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
elements = image.partition_image(
|
||||
file=f,
|
||||
ocr_languages="eng+kor",
|
||||
strategy="ocr_only",
|
||||
)
|
||||
|
||||
assert elements[0] == Title("RULES AND INSTRUCTIONS")
|
||||
assert elements[3].text.startswith("안녕하세요")
|
||||
|
||||
|
||||
def test_partition_image_raises_with_bad_strategy():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
|
||||
with pytest.raises(ValueError):
|
||||
image.partition_image(filename=filename, strategy="fakeroo")
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.6.3-dev3" # pragma: no cover
|
||||
__version__ = "0.6.3" # pragma: no cover
|
||||
|
||||
@ -1,7 +1,14 @@
|
||||
from typing import List, Optional
|
||||
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
|
||||
from unstructured.documents.elements import Element
|
||||
from unstructured.partition.common import exactly_one
|
||||
from unstructured.partition.pdf import partition_pdf_or_image
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
VALID_STRATEGIES = ["hi_res", "ocr_only"]
|
||||
|
||||
|
||||
def partition_image(
|
||||
@ -12,8 +19,10 @@ def partition_image(
|
||||
token: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
ocr_languages: str = "eng",
|
||||
strategy: str = "hi_res",
|
||||
) -> List[Element]:
|
||||
"""Parses an image into a list of interpreted elements.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename
|
||||
@ -30,16 +39,38 @@ def partition_image(
|
||||
A string defining the authentication token for a self-host url, if applicable.
|
||||
ocr_languages
|
||||
The languages to use for the Tesseract agent. To use a language, you'll first need
|
||||
to isntall the appropriate Tesseract language pack.
|
||||
to install the appropriate Tesseract language pack.
|
||||
strategy
|
||||
The strategy to use for partitioning the image. Valid strategies are "hi_res" and
|
||||
"ocr_only". When using the "hi_res" strategy, the function uses a layout detection
|
||||
model to identify document elements. When using the "ocr_only" strategy,
|
||||
partition_image simply extracts the text from the document and processes it.
|
||||
"""
|
||||
if template is None:
|
||||
template = "layout/image"
|
||||
return partition_pdf_or_image(
|
||||
filename=filename,
|
||||
file=file,
|
||||
url=url,
|
||||
template=template,
|
||||
token=token,
|
||||
include_page_breaks=include_page_breaks,
|
||||
ocr_languages=ocr_languages,
|
||||
)
|
||||
exactly_one(filename=filename, file=file)
|
||||
|
||||
if strategy == "hi_res":
|
||||
if template is None:
|
||||
template = "layout/image"
|
||||
return partition_pdf_or_image(
|
||||
filename=filename,
|
||||
file=file,
|
||||
url=url,
|
||||
template=template,
|
||||
token=token,
|
||||
include_page_breaks=include_page_breaks,
|
||||
ocr_languages=ocr_languages,
|
||||
)
|
||||
|
||||
elif strategy == "ocr_only":
|
||||
if file is not None:
|
||||
image = Image.open(file)
|
||||
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
|
||||
else:
|
||||
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
|
||||
return partition_text(text=text)
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{strategy} is not a valid strategy for partition_image. "
|
||||
f"Choose one of {VALID_STRATEGIES}.",
|
||||
)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user