mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-13 17:07:29 +00:00
feat: replace pytesseract with unstructured.pytesseract fork (#3528)
This PR reverts the `pytesseract` dependency to the `unstructured.pytesseract` fork because some recent release versions of `pytesseract` are unavailable on PyPI. This PR also addresses an issue encountered while publishing `unstructured==0.15.4` to PyPI. The error occurred because PyPI does not allow direct dependencies on Version Control System URLs (such as GitHub) in the `install_requires` or `extras_require` sections of the `setup.py` file.
This commit is contained in:
parent
e64e09507a
commit
fc26426310
@ -1,4 +1,4 @@
|
|||||||
## 0.15.5-dev1
|
## 0.15.5
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* **Revert to using `unstructured.pytesseract` fork**. Due to the unavailability of some recent release versions of `pytesseract` on PyPI, the project now uses the `unstructured.pytesseract` fork to ensure stability and continued support.
|
||||||
* **Bump `libreoffice` version in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs.
|
* **Bump `libreoffice` version in image.** Bumps the `libreoffice` version to `25.2.5.2` to address CVEs.
|
||||||
* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.
|
* **Downgrade NLTK dependency version for compatibility**. Due to the unavailability of `nltk==3.8.2` on PyPI, the NLTK dependency has been downgraded to `<3.8.2`. This change ensures continued functionality and compatibility.
|
||||||
|
|
||||||
|
|||||||
2
Makefile
2
Makefile
@ -45,7 +45,7 @@ install-test:
|
|||||||
python3 -m pip install -r requirements/test.txt
|
python3 -m pip install -r requirements/test.txt
|
||||||
# NOTE(yao) - CI seems to always install tesseract to test so it would make sense to also require
|
# NOTE(yao) - CI seems to always install tesseract to test so it would make sense to also require
|
||||||
# pytesseract installation into the virtual env for testing
|
# pytesseract installation into the virtual env for testing
|
||||||
python3 -m pip install pytesseract -c requirements/deps/constraints.txt
|
python3 -m pip install unstructured_pytesseract
|
||||||
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
|
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
|
||||||
# NOTE(robinson) - Installing weaviate-client separately here because the requests
|
# NOTE(robinson) - Installing weaviate-client separately here because the requests
|
||||||
# version conflicts with label_studio_sdk
|
# version conflicts with label_studio_sdk
|
||||||
|
|||||||
@ -12,6 +12,4 @@ effdet
|
|||||||
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
|
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
|
||||||
# when unstructured library is.
|
# when unstructured library is.
|
||||||
unstructured-inference==0.7.36
|
unstructured-inference==0.7.36
|
||||||
# NOTE(christine): Pinned to a specific version of pytesseract from the GitHub repository.
|
unstructured.pytesseract>=0.3.12
|
||||||
# Remove this pin and switch to the latest version from PyPI once version 0.3.13 or newer is officially released.
|
|
||||||
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
|
|
||||||
|
|||||||
@ -135,8 +135,8 @@ packaging==23.2
|
|||||||
# matplotlib
|
# matplotlib
|
||||||
# onnxruntime
|
# onnxruntime
|
||||||
# pikepdf
|
# pikepdf
|
||||||
# pytesseract
|
|
||||||
# transformers
|
# transformers
|
||||||
|
# unstructured-pytesseract
|
||||||
pandas==2.2.2
|
pandas==2.2.2
|
||||||
# via layoutparser
|
# via layoutparser
|
||||||
pdf2image==1.17.0
|
pdf2image==1.17.0
|
||||||
@ -159,8 +159,8 @@ pillow==10.4.0
|
|||||||
# pdfplumber
|
# pdfplumber
|
||||||
# pikepdf
|
# pikepdf
|
||||||
# pillow-heif
|
# pillow-heif
|
||||||
# pytesseract
|
|
||||||
# torchvision
|
# torchvision
|
||||||
|
# unstructured-pytesseract
|
||||||
pillow-heif==0.18.0
|
pillow-heif==0.18.0
|
||||||
# via -r ./extra-pdf-image.in
|
# via -r ./extra-pdf-image.in
|
||||||
portalocker==2.10.1
|
portalocker==2.10.1
|
||||||
@ -201,8 +201,6 @@ pypdf==4.3.1
|
|||||||
# -r ./extra-pdf-image.in
|
# -r ./extra-pdf-image.in
|
||||||
pypdfium2==4.30.0
|
pypdfium2==4.30.0
|
||||||
# via pdfplumber
|
# via pdfplumber
|
||||||
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
|
|
||||||
# via -r ./extra-pdf-image.in
|
|
||||||
python-dateutil==2.9.0.post0
|
python-dateutil==2.9.0.post0
|
||||||
# via
|
# via
|
||||||
# -c ./base.txt
|
# -c ./base.txt
|
||||||
@ -289,6 +287,8 @@ tzdata==2024.1
|
|||||||
# via pandas
|
# via pandas
|
||||||
unstructured-inference==0.7.36
|
unstructured-inference==0.7.36
|
||||||
# via -r ./extra-pdf-image.in
|
# via -r ./extra-pdf-image.in
|
||||||
|
unstructured-pytesseract==0.3.13
|
||||||
|
# via -r ./extra-pdf-image.in
|
||||||
urllib3==1.26.19
|
urllib3==1.26.19
|
||||||
# via
|
# via
|
||||||
# -c ././deps/constraints.txt
|
# -c ././deps/constraints.txt
|
||||||
|
|||||||
@ -7,8 +7,8 @@ from unittest import mock
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from pytesseract import TesseractError
|
|
||||||
from unstructured_inference.inference import layout
|
from unstructured_inference.inference import layout
|
||||||
|
from unstructured_pytesseract import TesseractError
|
||||||
|
|
||||||
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
|
||||||
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
|
||||||
|
|||||||
@ -3,8 +3,8 @@ from unittest.mock import patch
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pytesseract
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import unstructured_pytesseract
|
||||||
from pdf2image.exceptions import PDFPageCountError
|
from pdf2image.exceptions import PDFPageCountError
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
|
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
|
||||||
@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
|
|||||||
|
|
||||||
def test_get_ocr_layout_from_image_tesseract(monkeypatch):
|
def test_get_ocr_layout_from_image_tesseract(monkeypatch):
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
pytesseract,
|
unstructured_pytesseract,
|
||||||
"image_to_data",
|
"image_to_data",
|
||||||
lambda *args, **kwargs: pd.DataFrame(
|
lambda *args, **kwargs: pd.DataFrame(
|
||||||
{
|
{
|
||||||
@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch):
|
|||||||
|
|
||||||
def test_get_ocr_text_from_image_tesseract(monkeypatch):
|
def test_get_ocr_text_from_image_tesseract(monkeypatch):
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
pytesseract,
|
unstructured_pytesseract,
|
||||||
"image_to_string",
|
"image_to_string",
|
||||||
lambda *args, **kwargs: "Hello World",
|
lambda *args, **kwargs: "Hello World",
|
||||||
)
|
)
|
||||||
@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
|
|||||||
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
|
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
|
||||||
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
|
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
pytesseract,
|
unstructured_pytesseract,
|
||||||
"image_to_data",
|
"image_to_data",
|
||||||
lambda *args, **kwargs: pd.DataFrame(
|
lambda *args, **kwargs: pd.DataFrame(
|
||||||
{
|
{
|
||||||
|
|||||||
@ -384,7 +384,7 @@ def test_partition_pdf_falls_back_to_fast(
|
|||||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||||
):
|
):
|
||||||
def mock_exists(dep):
|
def mock_exists(dep):
|
||||||
return dep not in ["unstructured_inference", "pytesseract"]
|
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
|
||||||
|
|
||||||
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
||||||
|
|
||||||
@ -406,7 +406,7 @@ def test_partition_pdf_falls_back_to_fast_from_ocr_only(
|
|||||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||||
):
|
):
|
||||||
def mock_exists(dep):
|
def mock_exists(dep):
|
||||||
return dep not in ["pytesseract"]
|
return dep not in ["unstructured_pytesseract"]
|
||||||
|
|
||||||
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
||||||
|
|
||||||
@ -432,7 +432,7 @@ def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
|
|||||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||||
):
|
):
|
||||||
def mock_exists(dep):
|
def mock_exists(dep):
|
||||||
return dep not in ["pytesseract"]
|
return dep not in ["unstructured_pytesseract"]
|
||||||
|
|
||||||
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
||||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||||
@ -584,7 +584,7 @@ def test_partition_pdf_fails_if_pdf_not_processable(
|
|||||||
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
filename=example_doc_path("pdf/layout-parser-paper-fast.pdf"),
|
||||||
):
|
):
|
||||||
def mock_exists(dep):
|
def mock_exists(dep):
|
||||||
return dep not in ["unstructured_inference", "pytesseract"]
|
return dep not in ["unstructured_inference", "unstructured_pytesseract"]
|
||||||
|
|
||||||
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
|
||||||
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
||||||
@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None():
|
|||||||
[
|
[
|
||||||
(
|
(
|
||||||
PartitionStrategy.HI_RES,
|
PartitionStrategy.HI_RES,
|
||||||
"pytesseract.image_to_data",
|
"unstructured_pytesseract.image_to_data",
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
PartitionStrategy.OCR_ONLY,
|
PartitionStrategy.OCR_ONLY,
|
||||||
"pytesseract.image_to_data",
|
"unstructured_pytesseract.image_to_data",
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
PartitionStrategy.OCR_ONLY,
|
PartitionStrategy.OCR_ONLY,
|
||||||
"pytesseract.image_to_string",
|
"unstructured_pytesseract.image_to_string",
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.15.5-dev1" # pragma: no cover
|
__version__ = "0.15.5" # pragma: no cover
|
||||||
|
|||||||
@ -31,7 +31,7 @@ def determine_pdf_or_image_strategy(
|
|||||||
):
|
):
|
||||||
"""Determines what strategy to use for processing PDFs or images, accounting for fallback
|
"""Determines what strategy to use for processing PDFs or images, accounting for fallback
|
||||||
logic if some dependencies are not available."""
|
logic if some dependencies are not available."""
|
||||||
pytesseract_installed = dependency_exists("pytesseract")
|
pytesseract_installed = dependency_exists("unstructured_pytesseract")
|
||||||
unstructured_inference_installed = dependency_exists("unstructured_inference")
|
unstructured_inference_installed = dependency_exists("unstructured_inference")
|
||||||
|
|
||||||
if strategy == PartitionStrategy.AUTO:
|
if strategy == PartitionStrategy.AUTO:
|
||||||
|
|||||||
@ -43,7 +43,7 @@ OCR_AGENT_MODULES_WHITELIST = os.getenv(
|
|||||||
|
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
|
||||||
|
|
||||||
# this field is defined by pytesseract
|
# this field is defined by unstructured_pytesseract
|
||||||
TESSERACT_TEXT_HEIGHT = "height"
|
TESSERACT_TEXT_HEIGHT = "height"
|
||||||
|
|
||||||
TESSERACT_LANGUAGES_SPLITTER = "+"
|
TESSERACT_LANGUAGES_SPLITTER = "+"
|
||||||
|
|||||||
@ -6,9 +6,9 @@ from typing import TYPE_CHECKING, List
|
|||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pytesseract
|
import unstructured_pytesseract
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
from pytesseract import Output
|
from unstructured_pytesseract import Output
|
||||||
|
|
||||||
from unstructured.logger import trace_logger
|
from unstructured.logger import trace_logger
|
||||||
from unstructured.partition.utils.config import env_config
|
from unstructured.partition.utils.config import env_config
|
||||||
@ -40,14 +40,14 @@ class OCRAgentTesseract(OCRAgent):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
def get_text_from_image(self, image: PILImage.Image) -> str:
|
def get_text_from_image(self, image: PILImage.Image) -> str:
|
||||||
return pytesseract.image_to_string(np.array(image), lang=self.language)
|
return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language)
|
||||||
|
|
||||||
def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
|
def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
|
||||||
"""Get the OCR regions from image as a list of text regions with tesseract."""
|
"""Get the OCR regions from image as a list of text regions with tesseract."""
|
||||||
|
|
||||||
trace_logger.detail("Processing entire page OCR with tesseract...")
|
trace_logger.detail("Processing entire page OCR with tesseract...")
|
||||||
zoom = 1
|
zoom = 1
|
||||||
ocr_df: pd.DataFrame = pytesseract.image_to_data(
|
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
|
||||||
np.array(image),
|
np.array(image),
|
||||||
lang=self.language,
|
lang=self.language,
|
||||||
output_type=Output.DATAFRAME,
|
output_type=Output.DATAFRAME,
|
||||||
@ -76,7 +76,7 @@ class OCRAgentTesseract(OCRAgent):
|
|||||||
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
|
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
|
||||||
max_zoom,
|
max_zoom,
|
||||||
)
|
)
|
||||||
ocr_df = pytesseract.image_to_data(
|
ocr_df = unstructured_pytesseract.image_to_data(
|
||||||
np.array(zoom_image(image, zoom)),
|
np.array(zoom_image(image, zoom)),
|
||||||
lang=self.language,
|
lang=self.language,
|
||||||
output_type=Output.DATAFRAME,
|
output_type=Output.DATAFRAME,
|
||||||
@ -96,9 +96,9 @@ class OCRAgentTesseract(OCRAgent):
|
|||||||
ocr_regions = self.get_layout_from_image(image)
|
ocr_regions = self.get_layout_from_image(image)
|
||||||
|
|
||||||
# NOTE(christine): For tesseract, the ocr_text returned by
|
# NOTE(christine): For tesseract, the ocr_text returned by
|
||||||
# `pytesseract.image_to_string()` doesn't contain bounding box data but is
|
# `unstructured_pytesseract.image_to_string()` doesn't contain bounding box data but is
|
||||||
# well grouped. Conversely, the ocr_layout returned by parsing
|
# well grouped. Conversely, the ocr_layout returned by parsing
|
||||||
# `pytesseract.image_to_data()` contains bounding box data but is not well
|
# `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well
|
||||||
# grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge
|
# grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge
|
||||||
# the text regions in each group to create a list of layout elements.
|
# the text regions in each group to create a list of layout elements.
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user