mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-02 02:00:29 +00:00
Remove unstructured.pytesseract fork (#3454)
A second attempt at https://github.com/Unstructured-IO/unstructured/pull/3360, this PR removes unstructured's dependency on its own fork of `pytesseract`. (The original reason for the fork, the addition of `run_and_get_multiple_output`, no longer applies now that it has been released upstream [here](https://github.com/madmaze/pytesseract/releases/tag/v0.3.12).) --------- Co-authored-by: Christine Straub <christinemstraub@gmail.com>
This commit is contained in:
parent
2373eaa829
commit
051be5aead
@ -1,4 +1,4 @@
|
||||
## 0.15.2-dev4
|
||||
## 0.15.2-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
|
||||
* **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.
|
||||
* **Fixes an issue in Object Detection metrics** The issue was in preprocessing/validating the ground truth and predicted data for object detection metrics.
|
||||
* **Removes dependency on unstructured.pytesseract** Unstructured forked pytesseract while waiting for code to be upstreamed. Now that the new version has been released, this fork can be removed.
|
||||
|
||||
## 0.15.1
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ COPY test_unstructured test_unstructured
|
||||
COPY example-docs example-docs
|
||||
|
||||
RUN chown -R notebook-user:notebook-user /app && \
|
||||
apk add font-ubuntu && \
|
||||
apk add font-ubuntu git && \
|
||||
fc-cache -fv && \
|
||||
ln -s /usr/bin/python3.11 /usr/bin/python3
|
||||
|
||||
|
||||
2
Makefile
2
Makefile
@ -46,7 +46,7 @@ install-test:
|
||||
python3 -m pip install -r requirements/test.txt
|
||||
# NOTE(yao) - CI seems to always install tesseract to test so it would make sense to also require
|
||||
# pytesseract installation into the virtual env for testing
|
||||
python3 -m pip install unstructured.pytesseract -c requirements/deps/constraints.txt
|
||||
python3 -m pip install pytesseract -c requirements/deps/constraints.txt
|
||||
# python3 -m pip install argilla==1.28.0 -c requirements/deps/constraints.txt
|
||||
# NOTE(robinson) - Installing weaviate-client separately here because the requests
|
||||
# version conflicts with label_studio_sdk
|
||||
|
||||
@ -22,8 +22,8 @@ Office365-REST-Python-Client<2.4.3
|
||||
# unstructured-inference to be upgraded when unstructured library is upgraded
|
||||
# https://github.com/Unstructured-IO/unstructured/issues/1458
|
||||
# unstructured-inference
|
||||
# use the known compatible version of weaviate and unstructured.pytesseract
|
||||
unstructured.pytesseract>=0.3.12
|
||||
# use the known compatible version of weaviate and pytesseract
|
||||
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
|
||||
weaviate-client>3.25.0
|
||||
# Note(yuming) - pinning to avoid conflict with paddle install
|
||||
matplotlib==3.7.2
|
||||
|
||||
@ -7,12 +7,9 @@ pdfminer.six
|
||||
pikepdf
|
||||
pillow_heif
|
||||
pypdf
|
||||
pytesseract
|
||||
google-cloud-vision
|
||||
effdet
|
||||
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
|
||||
# when unstructured library is.
|
||||
unstructured-inference==0.7.36
|
||||
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
|
||||
# from one tesseract call
|
||||
unstructured.pytesseract>=0.3.12
|
||||
pytesseract>=0.3.12
|
||||
|
||||
@ -138,7 +138,6 @@ packaging==23.2
|
||||
# pikepdf
|
||||
# pytesseract
|
||||
# transformers
|
||||
# unstructured-pytesseract
|
||||
pandas==2.2.2
|
||||
# via layoutparser
|
||||
pdf2image==1.17.0
|
||||
@ -163,7 +162,6 @@ pillow==10.4.0
|
||||
# pillow-heif
|
||||
# pytesseract
|
||||
# torchvision
|
||||
# unstructured-pytesseract
|
||||
pillow-heif==0.18.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
portalocker==2.10.1
|
||||
@ -204,8 +202,10 @@ pypdf==4.3.1
|
||||
# -r ./extra-pdf-image.in
|
||||
pypdfium2==4.30.0
|
||||
# via pdfplumber
|
||||
pytesseract==0.3.10
|
||||
# via -r ./extra-pdf-image.in
|
||||
pytesseract @ git+https://github.com/madmaze/pytesseract.git@v0.3.13
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./extra-pdf-image.in
|
||||
python-dateutil==2.9.0.post0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
@ -290,10 +290,6 @@ tzdata==2024.1
|
||||
# via pandas
|
||||
unstructured-inference==0.7.36
|
||||
# via -r ./extra-pdf-image.in
|
||||
unstructured-pytesseract==0.3.12
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
# -r ./extra-pdf-image.in
|
||||
urllib3==1.26.19
|
||||
# via
|
||||
# -c ././deps/constraints.txt
|
||||
|
||||
@ -8,7 +8,7 @@ lxml==5.2.2
|
||||
# via python-pptx
|
||||
pillow==10.4.0
|
||||
# via python-pptx
|
||||
python-pptx==1.0.1
|
||||
python-pptx==1.0.2
|
||||
# via -r ./extra-pptx.in
|
||||
typing-extensions==4.12.2
|
||||
# via python-pptx
|
||||
|
||||
@ -63,7 +63,7 @@ langchain-community==0.2.11
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/embed-aws-bedrock.in
|
||||
langchain-core==0.2.28
|
||||
langchain-core==0.2.29
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
|
||||
@ -45,7 +45,7 @@ jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain-core==0.2.28
|
||||
langchain-core==0.2.29
|
||||
# via langchain-huggingface
|
||||
langchain-huggingface==0.0.3
|
||||
# via -r ./ingest/embed-huggingface.in
|
||||
|
||||
@ -53,7 +53,7 @@ jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain-core==0.2.28
|
||||
langchain-core==0.2.29
|
||||
# via langchain-openai
|
||||
langchain-openai==0.1.20
|
||||
# via -r ./ingest/embed-openai.in
|
||||
|
||||
@ -110,7 +110,7 @@ langchain-community==0.2.11
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/embed-vertexai.in
|
||||
langchain-core==0.2.28
|
||||
langchain-core==0.2.29
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
|
||||
@ -46,7 +46,7 @@ jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.12
|
||||
# via -r ./ingest/embed-voyageai.in
|
||||
langchain-core==0.2.28
|
||||
langchain-core==0.2.29
|
||||
# via
|
||||
# langchain
|
||||
# langchain-text-splitters
|
||||
|
||||
@ -40,7 +40,7 @@ requests==2.32.3
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# singlestoredb
|
||||
singlestoredb==1.6.1
|
||||
singlestoredb==1.6.2
|
||||
# via -r ./ingest/singlestore.in
|
||||
sqlparams==6.0.1
|
||||
# via singlestoredb
|
||||
|
||||
@ -7,9 +7,7 @@
|
||||
annotated-types==0.7.0
|
||||
# via pydantic
|
||||
appdirs==1.4.4
|
||||
# via
|
||||
# label-studio-sdk
|
||||
# label-studio-tools
|
||||
# via label-studio-tools
|
||||
attrs==24.2.0
|
||||
# via
|
||||
# jsonschema
|
||||
|
||||
@ -147,7 +147,7 @@ def test_od_document_layout_dump():
|
||||
}
|
||||
od_layout_dump = ObjectDetectionLayoutDumper(od_document_layout).dump()
|
||||
|
||||
assert {"pages": od_layout_dump.get("pages")} == expected_dump
|
||||
assert expected_dump == {"pages": od_layout_dump.get("pages")}
|
||||
|
||||
# check OD model classes are attached but do not depend on a specific model instance
|
||||
assert "object_detection_classes" in od_layout_dump
|
||||
|
||||
@ -3,8 +3,8 @@ from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytesseract
|
||||
import pytest
|
||||
import unstructured_pytesseract
|
||||
from pdf2image.exceptions import PDFPageCountError
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
from unstructured_inference.inference.elements import EmbeddedTextRegion, TextRegion
|
||||
@ -70,7 +70,7 @@ def test_supplement_page_layout_with_ocr_invalid_ocr(monkeypatch):
|
||||
|
||||
def test_get_ocr_layout_from_image_tesseract(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
unstructured_pytesseract,
|
||||
pytesseract,
|
||||
"image_to_data",
|
||||
lambda *args, **kwargs: pd.DataFrame(
|
||||
{
|
||||
@ -156,7 +156,7 @@ def test_get_ocr_layout_from_image_paddle(monkeypatch):
|
||||
|
||||
def test_get_ocr_text_from_image_tesseract(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
unstructured_pytesseract,
|
||||
pytesseract,
|
||||
"image_to_string",
|
||||
lambda *args, **kwargs: "Hello World",
|
||||
)
|
||||
@ -443,7 +443,7 @@ def test_auto_zoom_not_exceed_tesseract_limit(monkeypatch):
|
||||
monkeypatch.setenv("TESSERACT_MIN_TEXT_HEIGHT", "1000")
|
||||
monkeypatch.setenv("TESSERACT_OPTIMUM_TEXT_HEIGHT", "100000")
|
||||
monkeypatch.setattr(
|
||||
unstructured_pytesseract,
|
||||
pytesseract,
|
||||
"image_to_data",
|
||||
lambda *args, **kwargs: pd.DataFrame(
|
||||
{
|
||||
|
||||
@ -978,15 +978,15 @@ def test_partition_hi_res_model_name_default_to_None():
|
||||
[
|
||||
(
|
||||
PartitionStrategy.HI_RES,
|
||||
"unstructured_pytesseract.image_to_data",
|
||||
"pytesseract.image_to_data",
|
||||
),
|
||||
(
|
||||
PartitionStrategy.OCR_ONLY,
|
||||
"unstructured_pytesseract.image_to_data",
|
||||
"pytesseract.image_to_data",
|
||||
),
|
||||
(
|
||||
PartitionStrategy.OCR_ONLY,
|
||||
"unstructured_pytesseract.image_to_string",
|
||||
"pytesseract.image_to_string",
|
||||
),
|
||||
],
|
||||
)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.15.2-dev4" # pragma: no cover
|
||||
__version__ = "0.15.2-dev5" # pragma: no cover
|
||||
|
||||
@ -881,9 +881,7 @@ class PreChunkBuilder:
|
||||
if self._text_length > self._opts.soft_max:
|
||||
return False
|
||||
# -- don't add an element if it would increase total size beyond the hard-max --
|
||||
if self._remaining_space < len(element.text):
|
||||
return False
|
||||
return True
|
||||
return not self._remaining_space < len(element.text)
|
||||
|
||||
@property
|
||||
def _remaining_space(self) -> int:
|
||||
|
||||
@ -393,9 +393,7 @@ def is_page_url(client: Client, url: str):
|
||||
if not page_uuid:
|
||||
return False
|
||||
check_resp = client.pages.retrieve_status(page_id=page_uuid)
|
||||
if check_resp == 200:
|
||||
return True
|
||||
return False
|
||||
return check_resp == 200
|
||||
|
||||
|
||||
def is_database_url(client: Client, url: str):
|
||||
@ -407,9 +405,7 @@ def is_database_url(client: Client, url: str):
|
||||
if not database_uuid:
|
||||
return False
|
||||
check_resp = client.databases.retrieve_status(database_id=database_uuid)
|
||||
if check_resp == 200:
|
||||
return True
|
||||
return False
|
||||
return check_resp == 200
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
@ -39,9 +39,7 @@ class ChunkStep(PipelineStep):
|
||||
def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
|
||||
if self.context.reprocess or file_data.reprocess:
|
||||
return True
|
||||
if not filepath.exists():
|
||||
return True
|
||||
return False
|
||||
return not filepath.exists()
|
||||
|
||||
def get_output_filepath(self, filename: Path) -> Path:
|
||||
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
|
||||
|
||||
@ -39,9 +39,7 @@ class EmbedStep(PipelineStep):
|
||||
def should_embed(self, filepath: Path, file_data: FileData) -> bool:
|
||||
if self.context.reprocess or file_data.reprocess:
|
||||
return True
|
||||
if not filepath.exists():
|
||||
return True
|
||||
return False
|
||||
return not filepath.exists()
|
||||
|
||||
def get_output_filepath(self, filename: Path) -> Path:
|
||||
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
|
||||
|
||||
@ -34,9 +34,7 @@ class PartitionStep(PipelineStep):
|
||||
def should_partition(self, filepath: Path, file_data: FileData) -> bool:
|
||||
if self.context.reprocess or file_data.reprocess:
|
||||
return True
|
||||
if not filepath.exists():
|
||||
return True
|
||||
return False
|
||||
return not filepath.exists()
|
||||
|
||||
def get_output_filepath(self, filename: Path) -> Path:
|
||||
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
|
||||
|
||||
@ -926,7 +926,7 @@ def _partition_pdf_or_image_with_ocr_from_image(
|
||||
|
||||
ocr_agent = OCRAgent.get_agent(language=ocr_languages)
|
||||
|
||||
# NOTE(christine): `unstructured_pytesseract.image_to_string()` returns sorted text
|
||||
# NOTE(christine): `pytesseract.image_to_string()` returns sorted text
|
||||
if ocr_agent.is_text_sorted():
|
||||
sort_mode = SORT_MODE_DONT
|
||||
|
||||
|
||||
@ -43,7 +43,7 @@ OCR_AGENT_MODULES_WHITELIST = os.getenv(
|
||||
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
|
||||
|
||||
# this field is defined by pytesseract/unstructured.pytesseract
|
||||
# this field is defined by pytesseract
|
||||
TESSERACT_TEXT_HEIGHT = "height"
|
||||
|
||||
TESSERACT_LANGUAGES_SPLITTER = "+"
|
||||
|
||||
@ -6,9 +6,9 @@ from typing import TYPE_CHECKING, List
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import unstructured_pytesseract
|
||||
import pytesseract
|
||||
from PIL import Image as PILImage
|
||||
from unstructured_pytesseract import Output
|
||||
from pytesseract import Output
|
||||
|
||||
from unstructured.logger import trace_logger
|
||||
from unstructured.partition.utils.config import env_config
|
||||
@ -40,14 +40,14 @@ class OCRAgentTesseract(OCRAgent):
|
||||
return True
|
||||
|
||||
def get_text_from_image(self, image: PILImage.Image) -> str:
|
||||
return unstructured_pytesseract.image_to_string(np.array(image), lang=self.language)
|
||||
return pytesseract.image_to_string(np.array(image), lang=self.language)
|
||||
|
||||
def get_layout_from_image(self, image: PILImage.Image) -> List[TextRegion]:
|
||||
"""Get the OCR regions from image as a list of text regions with tesseract."""
|
||||
|
||||
trace_logger.detail("Processing entire page OCR with tesseract...")
|
||||
zoom = 1
|
||||
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
|
||||
ocr_df: pd.DataFrame = pytesseract.image_to_data(
|
||||
np.array(image),
|
||||
lang=self.language,
|
||||
output_type=Output.DATAFRAME,
|
||||
@ -76,7 +76,7 @@ class OCRAgentTesseract(OCRAgent):
|
||||
np.round(env_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1),
|
||||
max_zoom,
|
||||
)
|
||||
ocr_df = unstructured_pytesseract.image_to_data(
|
||||
ocr_df = pytesseract.image_to_data(
|
||||
np.array(zoom_image(image, zoom)),
|
||||
lang=self.language,
|
||||
output_type=Output.DATAFRAME,
|
||||
@ -96,9 +96,9 @@ class OCRAgentTesseract(OCRAgent):
|
||||
ocr_regions = self.get_layout_from_image(image)
|
||||
|
||||
# NOTE(christine): For tesseract, the ocr_text returned by
|
||||
# `unstructured_pytesseract.image_to_string()` doesn't contain bounding box data but is
|
||||
# `pytesseract.image_to_string()` doesn't contain bounding box data but is
|
||||
# well grouped. Conversely, the ocr_layout returned by parsing
|
||||
# `unstructured_pytesseract.image_to_data()` contains bounding box data but is not well
|
||||
# `pytesseract.image_to_data()` contains bounding box data but is not well
|
||||
# grouped. Therefore, we need to first group the `ocr_layout` by `ocr_text` and then merge
|
||||
# the text regions in each group to create a list of layout elements.
|
||||
|
||||
|
||||
@ -458,12 +458,10 @@ def is_parent_box(parent_target: Box, child_target: Box, add: float = 0.0) -> bo
|
||||
and (child_target[2] <= parent_targets[2] and child_target[3] <= parent_targets[3])
|
||||
):
|
||||
return True
|
||||
if len(child_target) == 2 and (
|
||||
return len(child_target) == 2 and (
|
||||
parent_targets[0] <= child_target[0] <= parent_targets[2]
|
||||
and parent_targets[1] <= child_target[1] <= parent_targets[3]
|
||||
):
|
||||
return True
|
||||
return False
|
||||
)
|
||||
|
||||
|
||||
def calculate_overlap_percentage(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user