feat: replace pytesseract with unstructured.pytesseract fork (#3528)

This PR reverts the `pytesseract` dependency to the `unstructured.pytesseract`
fork because some recent release versions of `pytesseract` are not
available on PyPI.

This PR also addresses an issue encountered while publishing
`unstructured==0.15.4` to PyPI. Publication failed because PyPI does not
allow direct dependencies on version control system URLs (such as
GitHub) in the `install_requires` or `extras_require` sections of
`setup.py`.
This commit is contained in:
Christine Straub 2024-08-16 07:34:22 -07:00 committed by GitHub
parent e64e09507a
commit fc26426310
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 30 additions and 31 deletions

View File

@ -1,4 +1,4 @@
## 0.15.5-dev1 ## 0.15.5
### Enhancements ### Enhancements
@ -6,6 +6,7 @@
### Fixes ### Fixes
* **Revert to using `unstructured.pytesseract` fork**. Due to the unavailability of some recent release versions of `pytesseract` on PyPI, the project now uses the `unstructured.pytesseract` fork to ensure stability and continued support.
* **Bump `libreoffice` version in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs. * **Bump `libreoffice` version in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs.
* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility. * **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.

View File

@ -45,7 +45,7 @@ install-test:
python3 -m pip install -r requirements/test.txt python3 -m pip install -r requirements/test.txt
# NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require # NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require
# pytesseract installation into the virtual env for testing # pytesseract installation into the virtual env for testing
python3 -m pip install pytesseract -c requirements/deps/constraints.txt python3 -m pip install unstructured_pytesseract
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt # python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
# NOTE(robinson) - Installing weaviate-client separately here because the requests # NOTE(robinson) - Installing weaviate-client separately here because the requests
# version conflicts with label_studio_sdk # version conflicts with label_studio_sdk

View File

@ -12,6 +12,4 @@ effdet
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is. # when unstructured library is.
unstructured-inference==0.7.36 unstructured-inference==0.7.36
# NOTE(christine): Pinned to a specific version of pytesseract from the GitHub repository. unstructured.pytesseract>=0.3.12
# Remove this pin and switch to the latest version from PyPI once version 0.3.13 or newer is officially released.
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13

View File

@ -135,8 +135,8 @@ packaging==23.2
# matplotlib # matplotlib
# onnxruntime # onnxruntime
# pikepdf # pikepdf
# pytesseract
# transformers # transformers
# unstructured-pytesseract
pandas==2.2.2 pandas==2.2.2
# via layoutparser # via layoutparser
pdf2image==1.17.0 pdf2image==1.17.0
@ -159,8 +159,8 @@ pillow==10.4.0
# pdfplumber # pdfplumber
# pikepdf # pikepdf
# pillow-heif # pillow-heif
# pytesseract
# torchvision # torchvision
# unstructured-pytesseract
pillow-heif==0.18.0 pillow-heif==0.18.0
# via -r ./extra-pdf-image.in # via -r ./extra-pdf-image.in
portalocker==2.10.1 portalocker==2.10.1
@ -201,8 +201,6 @@ pypdf==4.3.1
# -r ./extra-pdf-image.in # -r ./extra-pdf-image.in
pypdfium2==4.30.0 pypdfium2==4.30.0
# via pdfplumber # via pdfplumber
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
# via -r ./extra-pdf-image.in
python-dateutil==2.9.0.post0 python-dateutil==2.9.0.post0
# via # via
# -c ./base.txt # -c ./base.txt
@ -289,6 +287,8 @@ tzdata==2024.1
# via pandas # via pandas
unstructured-inference==0.7.36 unstructured-inference==0.7.36
# via -r ./extra-pdf-image.in # via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.13
# via -r ./extra-pdf-image.in
urllib3==1.26.19 urllib3==1.26.19
# via # via
# -c ././deps/constraints.txt # -c ././deps/constraints.txt

View File

@ -7,8 +7,8 @@ from unittest import mock
import pytest import pytest
from PIL import Image from PIL import Image
from pytesseract import TesseractError
from unstructured_inference.inference import layout from unstructured_inference.inference import layout
from unstructured_pytesseract import TesseractError
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path

View File

@ -3,8 +3,8 @@ from unittest.mock import patch
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import pytesseract
import pytest import pytest
import unstructured_pytesseract
from pdf2image.exceptions import PDFPageCountError from pdf2image.exceptions import PDFPageCountError
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
def test_get_ocr_layout_from_image_tesseract(monkeypatch): def test_get_ocr_layout_from_image_tesseract(monkeypatch):
monkeypatch.setattr( monkeypatch.setattr(
pytesseract, unstructured_pytesseract,
"image_to_data", "image_to_data",
lambda *args, **kwargs: pd.DataFrame( lambda *args, **kwargs: pd.DataFrame(
{ {
@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch):
def test_get_ocr_text_from_image_tesseract(monkeypatch): def test_get_ocr_text_from_image_tesseract(monkeypatch):
monkeypatch.setattr( monkeypatch.setattr(
pytesseract, unstructured_pytesseract,
"image_to_string", "image_to_string",
lambda *args, **kwargs: "Hello World", lambda *args, **kwargs: "Hello World",
) )
@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000") monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000") monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
monkeypatch.setattr( monkeypatch.setattr(
pytesseract, unstructured_pytesseract,
"image_to_data", "image_to_data",
lambda *args, **kwargs: pd.DataFrame( lambda *args, **kwargs: pd.DataFrame(
{ {

View File

@ -384,7 +384,7 @@ def test_partition_pdf_falls_back_to_fast(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
): ):
def mock_exists(dep): def mock_exists(dep):
return dep not in ["unstructured_inference", "pytesseract"] return dep not in ["unstructured_inference", "unstructured_pytesseract"]
monkeypatch.setattr(strategies, "dependency_exists", mock_exists) monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
@ -406,7 +406,7 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
): ):
def mock_exists(dep): def mock_exists(dep):
return dep not in ["pytesseract"] return dep not in ["unstructured_pytesseract"]
monkeypatch.setattr(strategies, "dependency_exists", mock_exists) monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
@ -432,7 +432,7 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
): ):
def mock_exists(dep): def mock_exists(dep):
return dep not in ["pytesseract"] return dep not in ["unstructured_pytesseract"]
monkeypatch.setattr(strategies, "dependency_exists", mock_exists) monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
@ -584,7 +584,7 @@ def test_partition_pdf_fails_if_pdf_not_processable(
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"), filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
): ):
def mock_exists(dep): def mock_exists(dep):
return dep not in ["unstructured_inference", "pytesseract"] return dep not in ["unstructured_inference", "unstructured_pytesseract"]
monkeypatch.setattr(strategies, "dependency_exists", mock_exists) monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: []) monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None():
[ [
( (
PartitionStrategy.HI_RES, PartitionStrategy.HI_RES,
"pytesseract.image_to_data", "unstructured_pytesseract.image_to_data",
), ),
( (
PartitionStrategy.OCR_ONLY, PartitionStrategy.OCR_ONLY,
"pytesseract.image_to_data", "unstructured_pytesseract.image_to_data",
), ),
( (
PartitionStrategy.OCR_ONLY, PartitionStrategy.OCR_ONLY,
"pytesseract.image_to_string", "unstructured_pytesseract.image_to_string",
), ),
], ],
) )

View File

@ -1 +1 @@
__version__ = "0.15.5-dev1" # pragma: no cover __version__ = "0.15.5" # pragma: no cover

View File

@ -31,7 +31,7 @@ def determine_pdf_or_image_strategy(
): ):
"""Determines what strategy to use for processing PDFs or images, accounting for fallback """Determines what strategy to use for processing PDFs or images, accounting for fallback
logic if some dependencies are not available.""" logic if some dependencies are not available."""
pytesseract_installed = dependency_exists("pytesseract") pytesseract_installed = dependency_exists("unstructured_pytesseract")
unstructured_inference_installed = dependency_exists("unstructured_inference") unstructured_inference_installed = dependency_exists("unstructured_inference")
if strategy == PartitionStrategy.AUTO: if strategy == PartitionStrategy.AUTO:

View File

@ -43,7 +43,7 @@ OCR_AGENT_MODULES_WHITELIST = os.getenv(
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False) UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
# this field is defined by pytesseract # this field is defined by unstructured_pytesseract
TESSERACT_TEXT_HEIGHT = "height" TESSERACT_TEXT_HEIGHT = "height"
TESSERACT_LANGUAGES_SPLITTER = "+" TESSERACT_LANGUAGES_SPLITTER = "+"

View File

@ -6,9 +6,9 @@ from typing import TYPE_CHECKING, List
import cv2 import cv2
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import pytesseract import unstructured_pytesseract
from PIL import Image as PILImage from PIL import Image as PILImage
from pytesseract import Output from unstructured_pytesseract import Output
from unstructured.logger import trace_logger from unstructured.logger import trace_logger
from unstructured.partition.utils.config import env_config from unstructured.partition.utils.config import env_config
@ -40,14 +40,14 @@ class OCRAgentTesseract(OCRAgent):
return True return True
def get_text_from_image(self, image: PILImage.Image) -> str: def get_text_from_image(self, image: PILImage.Image) -> str:
return pytesseract.image_to_string(np.array(image), lang=self.language) return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language)
def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
"""Get the OCR regions from image as a list of text regions with tesseract.""" """Get the OCR regions from image as a list of text regions with tesseract."""
trace_logger.detail("Processing entire page OCR with tesseract...") trace_logger.detail("Processing entire page OCR with tesseract...")
zoom = 1 zoom = 1
ocr_df: pd.DataFrame = pytesseract.image_to_data( ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
np.array(image), np.array(image),
lang=self.language, lang=self.language,
output_type=Output.DATAFRAME, output_type=Output.DATAFRAME,
@ -76,7 +76,7 @@ class OCRAgentTesseract(OCRAgent):
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1), np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
max_zoom, max_zoom,
) )
ocr_df = pytesseract.image_to_data( ocr_df = unstructured_pytesseract.image_to_data(
np.array(zoom_image(image, zoom)), np.array(zoom_image(image, zoom)),
lang=self.language, lang=self.language,
output_type=Output.DATAFRAME, output_type=Output.DATAFRAME,
@ -96,9 +96,9 @@ class OCRAgentTesseract(OCRAgent):
ocr_regions = self.get_layout_from_image(image) ocr_regions = self.get_layout_from_image(image)
# NOTE(christine): For tesseract, the ocr_text returned by # NOTE(christine): For tesseract, the ocr_text returned by
# `pytesseract.image_to_string()` doesn't contain bounding box data but is # `unstructured_pytesseract.image_to_string()` doesn't contain bounding box data but is
# well grouped. Conversely, the ocr_layout returned by parsing # well grouped. Conversely, the ocr_layout returned by parsing
# `pytesseract.image_to_data()` contains bounding box data but is not well # `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well
# grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge # grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge
# the text regions in each group to create a list of layout elements. # the text regions in each group to create a list of layout elements.