From 051be5aead9559696fbe5a9f55171c09e16820bc Mon Sep 17 00:00:00 2001 From: Jake Zerrer <159055756+jake-normal@users.noreply.github.com> Date: Fri, 9 Aug 2024 00:28:48 -0400 Subject: [PATCH] Remove unstructured.pytesseract fork (#3454) A second attempt at https://github.com/Unstructured-IO/unstructured/pull/3360, this PR removes unstructured's dependency on its own fork of `pytesseract`. (The original reason for the fork, the need for `run_and_get_multiple_output`, no longer applies now that the change is available upstream; see [here](https://github.com/madmaze/pytesseract/releases/tag/v0.3.12).) --------- Co-authored-by: Christine Straub --- CHANGELOG.md | 3 ++- Dockerfile | 2 +- Makefile | 2 +- requirements/deps/constraints.txt | 4 ++-- requirements/extra-pdf-image.in | 5 +---- requirements/extra-pdf-image.txt | 12 ++++-------- requirements/extra-pptx.txt | 2 +- requirements/ingest/embed-aws-bedrock.txt | 2 +- requirements/ingest/embed-huggingface.txt | 2 +- requirements/ingest/embed-openai.txt | 2 +- requirements/ingest/embed-vertexai.txt | 2 +- requirements/ingest/embed-voyageai.txt | 2 +- requirements/ingest/singlestore.txt | 2 +- requirements/test.txt | 4 +--- .../partition/pdf_image/test_analysis.py | 2 +- test_unstructured/partition/pdf_image/test_ocr.py | 8 ++++---- test_unstructured/partition/pdf_image/test_pdf.py | 6 +++--- unstructured/__version__.py | 2 +- unstructured/chunking/base.py | 4 +--- unstructured/ingest/connector/notion/helpers.py | 8 ++------ unstructured/ingest/v2/pipeline/steps/chunk.py | 4 +--- unstructured/ingest/v2/pipeline/steps/embed.py | 4 +--- unstructured/ingest/v2/pipeline/steps/partition.py | 4 +--- unstructured/partition/pdf.py | 2 +- unstructured/partition/utils/constants.py | 2 +- .../partition/utils/ocr_models/tesseract_ocr.py | 14 +++++++------- unstructured/utils.py | 6 ++---- 27 files changed, 45 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c63314d1..e69af84dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.2-dev4 
+## 0.15.2-dev5 ### Enhancements @@ -12,6 +12,7 @@ * **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimeters). * **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive. * **Fixes an issue in Object Detection metrics** The issue was in preprocessing/validating the ground truth and predicted data for object detection metrics. +* **Removes dependency on unstructured.pytesseract** Unstructured forked pytesseract while waiting for code to be upstreamed. Now that the new version has been released, this fork can be removed. ## 0.15.1 diff --git a/Dockerfile b/Dockerfile index bea1e6199..eb7fd5b29 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,7 +10,7 @@ COPY test_unstructured test_unstructured COPY example-docs example-docs RUN chown -R notebook-user:notebook-user /app && \ - apk add font-ubuntu && \ + apk add font-ubuntu git && \ fc-cache -fv && \ ln -s /usr/bin/python3.11 /usr/bin/python3 diff --git a/Makefile b/Makefile index 58583fd56..64c823318 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ install-test: python3 -m pip install -r requirements/test.txt # NOTE(yao) - CI seem to always install tesseract to test so it would make sense to also require # pytesseract installation into the virtual env for testing - python3 -m pip install unstructured.pytesseract -c requirements/deps/constraints.txt + python3 -m pip install pytesseract -c requirements/deps/constraints.txt # python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt # NOTE(robinson) - Installing weaviate-client separately here because the requests # version conflicts with label_studio_sdk diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 
7da1129d1..93828753a 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -22,8 +22,8 @@ Office365-REST-Python-Client<2.4.3 # unstructured-inference to be upgraded when unstructured library is upgraded # https://github.com/Unstructured-IO/unstructured/issues/1458 # unstructured-inference -# use the known compatible version of weaviate and unstructured.pytesseract -unstructured.pytesseract>=0.3.12 +# use the known compatible version of weaviate and pytesseract +pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13 weaviate-client>3.25.0 # Note(yuming) - pining to avoid conflict with paddle install matplotlib==3.7.2 diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 621def12e..8c68a0fcd 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -7,12 +7,9 @@ pdfminer.six pikepdf pillow_heif pypdf -pytesseract google-cloud-vision effdet # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. 
unstructured-inference==0.7.36 -# unstructured fork of pytesseract that provides an interface to allow for multiple output formats -# from one tesseract call -unstructured.pytesseract>=0.3.12 +pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index c57f23bd7..a11667dd7 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -138,7 +138,6 @@ packaging==23.2 # pikepdf # pytesseract # transformers - # unstructured-pytesseract pandas==2.2.2 # via layoutparser pdf2image==1.17.0 @@ -163,7 +162,6 @@ pillow==10.4.0 # pillow-heif # pytesseract # torchvision - # unstructured-pytesseract pillow-heif==0.18.0 # via -r ./extra-pdf-image.in portalocker==2.10.1 @@ -204,8 +202,10 @@ pypdf==4.3.1 # -r ./extra-pdf-image.in pypdfium2==4.30.0 # via pdfplumber -pytesseract==0.3.10 - # via -r ./extra-pdf-image.in +pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13 + # via + # -c ././deps/constraints.txt + # -r ./extra-pdf-image.in python-dateutil==2.9.0.post0 # via # -c ./base.txt @@ -290,10 +290,6 @@ tzdata==2024.1 # via pandas unstructured-inference==0.7.36 # via -r ./extra-pdf-image.in -unstructured-pytesseract==0.3.12 - # via - # -c ././deps/constraints.txt - # -r ./extra-pdf-image.in urllib3==1.26.19 # via # -c ././deps/constraints.txt diff --git a/requirements/extra-pptx.txt b/requirements/extra-pptx.txt index 6b8c80a83..42a06959e 100644 --- a/requirements/extra-pptx.txt +++ b/requirements/extra-pptx.txt @@ -8,7 +8,7 @@ lxml==5.2.2 # via python-pptx pillow==10.4.0 # via python-pptx -python-pptx==1.0.1 +python-pptx==1.0.2 # via -r ./extra-pptx.in typing-extensions==4.12.2 # via python-pptx diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index 2c97a997d..3f3fe1bd0 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -63,7 +63,7 @@ langchain-community==0.2.11 # via # -c 
./ingest/../deps/constraints.txt # -r ./ingest/embed-aws-bedrock.in -langchain-core==0.2.28 +langchain-core==0.2.29 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 4c447f6ef..24a97bd2e 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -45,7 +45,7 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain-core==0.2.28 +langchain-core==0.2.29 # via langchain-huggingface langchain-huggingface==0.0.3 # via -r ./ingest/embed-huggingface.in diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 91beb70f6..4a556148c 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -53,7 +53,7 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain-core==0.2.28 +langchain-core==0.2.29 # via langchain-openai langchain-openai==0.1.20 # via -r ./ingest/embed-openai.in diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index 035d0beb2..6574e8183 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -110,7 +110,7 @@ langchain-community==0.2.11 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-vertexai.in -langchain-core==0.2.28 +langchain-core==0.2.29 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index 6db8f82e0..e4d8ecab9 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -46,7 +46,7 @@ jsonpointer==3.0.0 # via jsonpatch langchain==0.2.12 # via -r ./ingest/embed-voyageai.in -langchain-core==0.2.28 +langchain-core==0.2.29 # via # langchain # langchain-text-splitters diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt index 
b3ea50606..2a6d094a6 100644 --- a/requirements/ingest/singlestore.txt +++ b/requirements/ingest/singlestore.txt @@ -40,7 +40,7 @@ requests==2.32.3 # via # -c ./ingest/../base.txt # singlestoredb -singlestoredb==1.6.1 +singlestoredb==1.6.2 # via -r ./ingest/singlestore.in sqlparams==6.0.1 # via singlestoredb diff --git a/requirements/test.txt b/requirements/test.txt index 56487c873..f9c45d8ab 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -7,9 +7,7 @@ annotated-types==0.7.0 # via pydantic appdirs==1.4.4 - # via - # label-studio-sdk - # label-studio-tools + # via label-studio-tools attrs==24.2.0 # via # jsonschema diff --git a/test_unstructured/partition/pdf_image/test_analysis.py b/test_unstructured/partition/pdf_image/test_analysis.py index 237672a79..b39c0d1f5 100644 --- a/test_unstructured/partition/pdf_image/test_analysis.py +++ b/test_unstructured/partition/pdf_image/test_analysis.py @@ -147,7 +147,7 @@ def test_od_document_layout_dump(): } od_layout_dump = ObjectDetectionLayoutDumper(od_document_layout).dump() - assert {"pages": od_layout_dump.get("pages")} == expected_dump + assert expected_dump == {"pages": od_layout_dump.get("pages")} # check OD model classes are attached but do not depend on a specific model instance assert "object_detection_classes" in od_layout_dump diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py index e07fb23d3..c650db112 100644 --- a/test_unstructured/partition/pdf_image/test_ocr.py +++ b/test_unstructured/partition/pdf_image/test_ocr.py @@ -3,8 +3,8 @@ from unittest.mock import patch import numpy as np import pandas as pd +import pytesseract import pytest -import unstructured_pytesseract from pdf2image.exceptions import PDFPageCountError from PIL import Image, UnidentifiedImageError from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion @@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch): 
def test_get_ocr_layout_from_image_tesseract(monkeypatch): monkeypatch.setattr( - unstructured_pytesseract, + pytesseract, "image_to_data", lambda *args, **kwargs: pd.DataFrame( { @@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch): def test_get_ocr_text_from_image_tesseract(monkeypatch): monkeypatch.setattr( - unstructured_pytesseract, + pytesseract, "image_to_string", lambda *args, **kwargs: "Hello World", ) @@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch): monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000") monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000") monkeypatch.setattr( - unstructured_pytesseract, + pytesseract, "image_to_data", lambda *args, **kwargs: pd.DataFrame( { diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 1dc603605..0ed804e9b 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None(): [ ( PartitionStrategy.HI_RES, - "unstructured_pytesseract.image_to_data", + "pytesseract.image_to_data", ), ( PartitionStrategy.OCR_ONLY, - "unstructured_pytesseract.image_to_data", + "pytesseract.image_to_data", ), ( PartitionStrategy.OCR_ONLY, - "unstructured_pytesseract.image_to_string", + "pytesseract.image_to_string", ), ], ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index fac8b4352..d2ae41a9f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.2-dev4" # pragma: no cover +__version__ = "0.15.2-dev5" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index edf37b3d2..7f28dd3b3 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -881,9 +881,7 @@ class PreChunkBuilder: if self._text_length > self._opts.soft_max: return False # -- 
don't add an element if it would increase total size beyond the hard-max -- - if self._remaining_space < len(element.text): - return False - return True + return not self._remaining_space < len(element.text) @property def _remaining_space(self) -> int: diff --git a/unstructured/ingest/connector/notion/helpers.py b/unstructured/ingest/connector/notion/helpers.py index 91ffe2f28..a09fa083b 100644 --- a/unstructured/ingest/connector/notion/helpers.py +++ b/unstructured/ingest/connector/notion/helpers.py @@ -393,9 +393,7 @@ def is_page_url(client: Client, url: str): if not page_uuid: return False check_resp = client.pages.retrieve_status(page_id=page_uuid) - if check_resp == 200: - return True - return False + return check_resp == 200 def is_database_url(client: Client, url: str): @@ -407,9 +405,7 @@ def is_database_url(client: Client, url: str): if not database_uuid: return False check_resp = client.databases.retrieve_status(database_id=database_uuid) - if check_resp == 200: - return True - return False + return check_resp == 200 @dataclass diff --git a/unstructured/ingest/v2/pipeline/steps/chunk.py b/unstructured/ingest/v2/pipeline/steps/chunk.py index 07eb680d7..b2e5d14c2 100644 --- a/unstructured/ingest/v2/pipeline/steps/chunk.py +++ b/unstructured/ingest/v2/pipeline/steps/chunk.py @@ -39,9 +39,7 @@ class ChunkStep(PipelineStep): def should_chunk(self, filepath: Path, file_data: FileData) -> bool: if self.context.reprocess or file_data.reprocess: return True - if not filepath.exists(): - return True - return False + return not filepath.exists() def get_output_filepath(self, filename: Path) -> Path: hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" diff --git a/unstructured/ingest/v2/pipeline/steps/embed.py b/unstructured/ingest/v2/pipeline/steps/embed.py index d1fbe04bb..94103951c 100644 --- a/unstructured/ingest/v2/pipeline/steps/embed.py +++ b/unstructured/ingest/v2/pipeline/steps/embed.py @@ -39,9 +39,7 @@ class EmbedStep(PipelineStep): def 
should_embed(self, filepath: Path, file_data: FileData) -> bool: if self.context.reprocess or file_data.reprocess: return True - if not filepath.exists(): - return True - return False + return not filepath.exists() def get_output_filepath(self, filename: Path) -> Path: hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" diff --git a/unstructured/ingest/v2/pipeline/steps/partition.py b/unstructured/ingest/v2/pipeline/steps/partition.py index 0b3167847..541d2cae9 100644 --- a/unstructured/ingest/v2/pipeline/steps/partition.py +++ b/unstructured/ingest/v2/pipeline/steps/partition.py @@ -34,9 +34,7 @@ class PartitionStep(PipelineStep): def should_partition(self, filepath: Path, file_data: FileData) -> bool: if self.context.reprocess or file_data.reprocess: return True - if not filepath.exists(): - return True - return False + return not filepath.exists() def get_output_filepath(self, filename: Path) -> Path: hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index 91b04beab..8b733c8e3 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -926,7 +926,7 @@ def _partition_pdf_or_image_with_ocr_from_image( ocr_agent = OCRAgent.get_agent(language=ocr_languages) - # NOTE(christine): `unstructured_pytesseract.image_to_string()` returns sorted text + # NOTE(christine): `pytesseract.image_to_string()` returns sorted text if ocr_agent.is_text_sorted(): sort_mode = SORT_MODE_DONT diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index 9d802080c..03e26eb0c 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -43,7 +43,7 @@ OCR_AGENT_MODULES_WHITELIST = os.getenv( UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False) -# this field is defined by pytesseract/unstructured.pytesseract +# this field is defined 
by pytesseract TESSERACT_TEXT_HEIGHT = "height" TESSERACT_LANGUAGES_SPLITTER = "+" diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py index 46eb8a0cb..bba58c02e 100644 --- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py +++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py @@ -6,9 +6,9 @@ from typing import TYPE_CHECKING, List import cv2 import numpy as np import pandas as pd -import unstructured_pytesseract +import pytesseract from PIL import Image as PILImage -from unstructured_pytesseract import Output +from pytesseract import Output from unstructured.logger import trace_logger from unstructured.partition.utils.config import env_config @@ -40,14 +40,14 @@ class OCRAgentTesseract(OCRAgent): return True def get_text_from_image(self, image: PILImage.Image) -> str: - return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language) + return pytesseract.image_to_string(np.array(image), lang=self.language) def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]: """Get the OCR regions from image as a list of text regions with tesseract.""" trace_logger.detail("Processing entire page OCR with tesseract...") zoom = 1 - ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data( + ocr_df: pd.DataFrame = pytesseract.image_to_data( np.array(image), lang=self.language, output_type=Output.DATAFRAME, @@ -76,7 +76,7 @@ class OCRAgentTesseract(OCRAgent): np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1), max_zoom, ) - ocr_df = unstructured_pytesseract.image_to_data( + ocr_df = pytesseract.image_to_data( np.array(zoom_image(image, zoom)), lang=self.language, output_type=Output.DATAFRAME, @@ -96,9 +96,9 @@ class OCRAgentTesseract(OCRAgent): ocr_regions = self.get_layout_from_image(image) # NOTE(christine): For tesseract, the ocr_text returned by - # `unstructured_pytesseract.image_to_string()` doesn't contain bounding box 
data but is + # `pytesseract.image_to_string()` doesn't contain bounding box data but is # well grouped. Conversely, the ocr_layout returned by parsing - # `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well + # `pytesseract.image_to_data()` contains bounding box data but is not well # grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge # the text regions in each group to create a list of layout elements. diff --git a/unstructured/utils.py b/unstructured/utils.py index 55fecc319..3152d02e7 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -458,12 +458,10 @@ def is_parent_box(parent_target: Box, child_target: Box, add: float = 0.0) -> bo and (child_target[2] <= parent_targets[2] and child_target[3] <= parent_targets[3]) ): return True - if len(child_target) == 2 and ( + return len(child_target) == 2 and ( parent_targets[0] <= child_target[0] <= parent_targets[2] and parent_targets[1] <= child_target[1] <= parent_targets[3] - ): - return True - return False + ) def calculate_overlap_percentage(