diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ba04de63..e1a06b91a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.3-dev5
+## 0.13.3-dev6
 
 ### Enhancements
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index aff495f14..7c4a26b8d 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.3-dev5"  # pragma: no cover
+__version__ = "0.13.3-dev6"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
index 84f7b0f07..1883fbe3a 100644
--- a/unstructured/partition/pdf_image/ocr.py
+++ b/unstructured/partition/pdf_image/ocr.py
@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import os
 import tempfile
-from typing import TYPE_CHECKING, BinaryIO, Dict, List, Optional, Union, cast
+from typing import IO, TYPE_CHECKING, Any, List, Optional, cast
 
 import pdf2image
@@ -39,7 +41,7 @@ if "OMP_THREAD_LIMIT" not in os.environ:
 
 
 def process_data_with_ocr(
-    data: Union[bytes, BinaryIO],
+    data: bytes | IO[bytes],
     out_layout: "DocumentLayout",
     extracted_layout: List[List["TextRegion"]],
     is_image: bool = False,
@@ -76,7 +78,8 @@ def process_data_with_ocr(
         DocumentLayout: The merged layout information obtained after OCR processing.
     """
     with tempfile.NamedTemporaryFile() as tmp_file:
-        tmp_file.write(data.read() if hasattr(data, "read") else data)
+        data_bytes = data if isinstance(data, bytes) else data.read()
+        tmp_file.write(data_bytes)
         tmp_file.flush()
         merged_layouts = process_file_with_ocr(
             filename=tmp_file.name,
@@ -131,7 +134,7 @@ def process_file_with_ocr(
     from unstructured_inference.inference.layout import DocumentLayout
 
-    merged_page_layouts = []
+    merged_page_layouts: list[PageLayout] = []
     try:
         if is_image:
             with PILImage.open(filename) as images:
@@ -182,7 +185,7 @@ def process_file_with_ocr(
 @requires_dependencies("unstructured_inference")
 def supplement_page_layout_with_ocr(
     page_layout: "PageLayout",
-    image: PILImage,
+    image: PILImage.Image,
     infer_table_structure: bool = False,
     ocr_languages: str = "eng",
     ocr_mode: str = OCRMode.FULL_PAGE.value,
@@ -254,7 +257,7 @@ def supplement_page_layout_with_ocr(
 
 def supplement_element_with_table_extraction(
     elements: List["LayoutElement"],
-    image: PILImage,
+    image: PILImage.Image,
     tables_agent: "UnstructuredTableTransformerModel",
     ocr_languages: str = "eng",
     ocr_agent: OCRAgent = OCRAgent.get_instance(OCR_AGENT_TESSERACT),
@@ -289,12 +292,12 @@ def supplement_element_with_table_extraction(
 
 
 def get_table_tokens(
-    table_element_image: PILImage,
+    table_element_image: PILImage.Image,
     ocr_languages: str = "eng",
     ocr_agent: OCRAgent = OCRAgent.get_instance(OCR_AGENT_TESSERACT),
     extracted_regions: Optional[List["TextRegion"]] = None,
     table_element: Optional["LayoutElement"] = None,
-) -> List[Dict]:
+) -> List[dict[str, Any]]:
     """Get OCR tokens from either paddleocr or tesseract"""
 
     ocr_layout = ocr_agent.get_layout_from_image(
@@ -417,7 +420,7 @@ def supplement_layout_with_ocr_elements(
         build_layout_elements_from_ocr_regions,
     )
 
-    ocr_regions_to_remove = []
+    ocr_regions_to_remove: list[TextRegion] = []
    for ocr_region in ocr_layout:
         for el in layout:
             ocr_region_is_subregion_of_out_el = ocr_region.bbox.is_almost_subregion_of(
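Note on the `process_data_with_ocr()` hunk above: the old `hasattr(data, "read")` duck-typing is replaced by an explicit `isinstance` check against the new `bytes | IO[bytes]` parameter type, which a type-checker can narrow. A minimal sketch of the pattern (the `_to_bytes` helper name is hypothetical, not part of this diff):

```python
from __future__ import annotations

import tempfile
from typing import IO


def _to_bytes(data: bytes | IO[bytes]) -> bytes:
    """Normalize a bytes-or-file-like argument to plain bytes."""
    # isinstance() lets mypy narrow the union, unlike hasattr(data, "read").
    return data if isinstance(data, bytes) else data.read()


# Usage mirrors process_data_with_ocr(): buffer the payload into a temp file.
with tempfile.NamedTemporaryFile() as tmp_file:
    tmp_file.write(_to_bytes(b"fake file contents"))
    tmp_file.flush()
```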
diff --git a/unstructured/partition/utils/ocr_models/ocr_interface.py b/unstructured/partition/utils/ocr_models/ocr_interface.py
index d6147e544..450c99a0b 100644
--- a/unstructured/partition/utils/ocr_models/ocr_interface.py
+++ b/unstructured/partition/utils/ocr_models/ocr_interface.py
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 import functools
 import importlib
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, List
+from typing import TYPE_CHECKING
 
 from unstructured.partition.utils.constants import OCR_AGENT_MODULES_WHITELIST
 
 if TYPE_CHECKING:
-    from PIL import PILImage
+    from PIL import Image as PILImage
     from unstructured_inference.inference.elements import TextRegion
     from unstructured_inference.inference.layoutelement import (
         LayoutElement,
@@ -14,31 +16,26 @@ if TYPE_CHECKING:
 
 
 class OCRAgent(ABC):
-    def __init__(self):
-        self.agent = self.load_agent()
-
-    @abstractmethod
-    def load_agent(self, language: str) -> Any:
-        pass
+    """Defines the interface for an Optical Character Recognition (OCR) service."""
 
     @abstractmethod
     def is_text_sorted(self) -> bool:
         pass
 
     @abstractmethod
-    def get_text_from_image(self, image: "PILImage", ocr_languages: str = "eng") -> str:
+    def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng") -> str:
         pass
 
     @abstractmethod
     def get_layout_from_image(
-        self, image: "PILImage", ocr_languages: str = "eng"
-    ) -> List["TextRegion"]:
+        self, image: PILImage.Image, ocr_languages: str = "eng"
+    ) -> list[TextRegion]:
         pass
 
     @abstractmethod
     def get_layout_elements_from_image(
-        self, image: "PILImage", ocr_languages: str = "eng"
-    ) -> List["LayoutElement"]:
+        self, image: PILImage.Image, ocr_languages: str = "eng"
+    ) -> list[LayoutElement]:
         pass
 
     @staticmethod
@@ -51,6 +48,6 @@ class OCRAgent(ABC):
             return loaded_class()
         else:
             raise ValueError(
-                f"Environment variable OCR_AGENT module name {module_name}",
-                f" must be set to a whitelisted module part of {OCR_AGENT_MODULES_WHITELIST}.",
+                f"Environment variable OCR_AGENT module name {module_name} must be set to a"
+                f" whitelisted module part of {OCR_AGENT_MODULES_WHITELIST}.",
             )
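The `get_instance()` hunk above shows only the error branch; together with the `importlib` import and `OCR_AGENT_MODULES_WHITELIST`, it implies whitelist-gated dynamic loading. A rough sketch of that mechanism, assuming the argument names and the module/class split (both are guesses, not the library's exact API):

```python
import importlib

# Assumed stand-in; the real whitelist lives in unstructured.partition.utils.constants.
OCR_AGENT_MODULES_WHITELIST = ("unstructured.partition.utils.ocr_models.tesseract_ocr",)


def get_instance(module_name: str, class_name: str):
    """Instantiate an OCR agent class, but only from a whitelisted module."""
    if module_name in OCR_AGENT_MODULES_WHITELIST:
        module = importlib.import_module(module_name)
        loaded_class = getattr(module, class_name)
        return loaded_class()
    raise ValueError(
        f"Environment variable OCR_AGENT module name {module_name} must be set to a"
        f" whitelisted module part of {OCR_AGENT_MODULES_WHITELIST}."
    )
```

The whitelist guards against the OCR_AGENT environment variable being abused to import and execute an arbitrary module.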
diff --git a/unstructured/partition/utils/ocr_models/paddle_ocr.py b/unstructured/partition/utils/ocr_models/paddle_ocr.py
index b366d914e..151c1277f 100644
--- a/unstructured/partition/utils/ocr_models/paddle_ocr.py
+++ b/unstructured/partition/utils/ocr_models/paddle_ocr.py
@@ -1,14 +1,13 @@
-from typing import TYPE_CHECKING, List
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 from PIL import Image as PILImage
 
 from unstructured.documents.elements import ElementType
 from unstructured.logger import logger
-from unstructured.partition.utils.constants import (
-    DEFAULT_PADDLE_LANG,
-    Source,
-)
+from unstructured.partition.utils.constants import DEFAULT_PADDLE_LANG, Source
 from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
 from unstructured.utils import requires_dependencies
 
@@ -18,12 +17,17 @@ if TYPE_CHECKING:
 
 
 class OCRAgentPaddle(OCRAgent):
+    """OCR service implementation for PaddleOCR."""
+
+    def __init__(self):
+        self.agent = self.load_agent()
+
     def load_agent(self, language: str = DEFAULT_PADDLE_LANG):
+        """Loads the PaddleOCR agent as a global variable to ensure that we only load it once."""
+
         import paddle
         from unstructured_paddleocr import PaddleOCR
 
-        """Loads the PaddleOCR agent as a global variable to ensure that we only load it once."""
-
         # Disable signal handlers at C++ level upon failing
         # ref: https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/
         # disable_signal_handler_en.html#disable-signal-handler
@@ -55,7 +59,7 @@ class OCRAgentPaddle(OCRAgent):
         )
         return paddle_ocr
 
-    def get_text_from_image(self, image: PILImage, ocr_languages: str = "eng") -> str:
+    def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng") -> str:
         ocr_regions = self.get_layout_from_image(image)
         return "\n\n".join([r.text for r in ocr_regions])
 
@@ -63,8 +67,8 @@ class OCRAgentPaddle(OCRAgent):
         return False
 
     def get_layout_from_image(
-        self, image: PILImage, ocr_languages: str = "eng"
-    ) -> List["TextRegion"]:
+        self, image: PILImage.Image, ocr_languages: str = "eng"
+    ) -> list[TextRegion]:
         """Get the OCR regions from image as a list of text regions with paddle."""
 
         logger.info("Processing entire page OCR with paddle...")
@@ -79,8 +83,8 @@ class OCRAgentPaddle(OCRAgent):
 
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(
-        self, image: PILImage, ocr_languages: str = "eng"
-    ) -> List["LayoutElement"]:
+        self, image: PILImage.Image, ocr_languages: str = "eng"
+    ) -> list[LayoutElement]:
         from unstructured.partition.pdf_image.inference_utils import build_layout_element
 
         ocr_regions = self.get_layout_from_image(
@@ -102,10 +106,8 @@ class OCRAgentPaddle(OCRAgent):
         ]
 
     @requires_dependencies("unstructured_inference")
-    def parse_data(self, ocr_data: list) -> List["TextRegion"]:
-        """
-        Parse the OCR result data to extract a list of TextRegion objects from
-        paddle.
+    def parse_data(self, ocr_data: list[Any]) -> list[TextRegion]:
+        """Parse the OCR result data to extract a list of TextRegion objects from paddle.
 
         The function processes the OCR result dictionary, looking for bounding
         box information and associated text to create instances of the TextRegion
@@ -115,7 +117,7 @@ class OCRAgentPaddle(OCRAgent):
         - ocr_data (list): A list containing the OCR result data
 
         Returns:
-        - List[TextRegion]: A list of TextRegion objects, each representing a
+        - list[TextRegion]: A list of TextRegion objects, each representing a
           detected text region within the OCR-ed image.
 
         Note:
@@ -125,7 +127,7 @@ class OCRAgentPaddle(OCRAgent):
 
         from unstructured.partition.pdf_image.inference_utils import build_text_region_from_coords
 
-        text_regions = []
+        text_regions: list[TextRegion] = []
         for idx in range(len(ocr_data)):
             res = ocr_data[idx]
             if not res:
@@ -142,12 +144,7 @@ class OCRAgentPaddle(OCRAgent):
                 cleaned_text = text.strip()
                 if cleaned_text:
                     text_region = build_text_region_from_coords(
-                        x1,
-                        y1,
-                        x2,
-                        y2,
-                        text=cleaned_text,
-                        source=Source.OCR_PADDLE,
+                        x1, y1, x2, y2, text=cleaned_text, source=Source.OCR_PADDLE
                     )
                     text_regions.append(text_region)
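For context on `parse_data()` above: PaddleOCR reports each detected line as a quadrilateral of corner points plus a `(text, confidence)` pair, and the method converts each into a `TextRegion` via `build_text_region_from_coords()`. A toy illustration of that conversion (the sample `ocr_data` shape is an assumption based on PaddleOCR's documented result format, and plain tuples stand in for `TextRegion`):

```python
# Hypothetical single-line PaddleOCR result: four corner points, then (text, score).
ocr_data = [
    [[[10, 12], [118, 12], [118, 30], [10, 30]], ("Hello world", 0.98)],
]

text_regions = []
for res in ocr_data:
    if not res:
        continue  # paddle can emit empty entries for blank regions
    points, (text, _score) = res
    cleaned_text = text.strip()
    if not cleaned_text:
        continue
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    # The real code passes these coordinates to build_text_region_from_coords(...).
    text_regions.append((min(xs), min(ys), max(xs), max(ys), cleaned_text))
```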
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
index 9ac09548c..1867ec548 100644
--- a/unstructured/partition/utils/ocr_models/tesseract_ocr.py
+++ b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from typing import TYPE_CHECKING, List
 
 import cv2
@@ -26,21 +28,17 @@ if TYPE_CHECKING:
 
 
 class OCRAgentTesseract(OCRAgent):
-    def load_agent(self):
-        pass
+    """OCR service implementation for Tesseract."""
 
     def is_text_sorted(self):
         return True
 
-    def get_text_from_image(self, image: PILImage, ocr_languages: str = "eng") -> str:
-        return unstructured_pytesseract.image_to_string(
-            np.array(image),
-            lang=ocr_languages,
-        )
+    def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng") -> str:
+        return unstructured_pytesseract.image_to_string(np.array(image), lang=ocr_languages)
 
     def get_layout_from_image(
-        self, image: PILImage, ocr_languages: str = "eng"
-    ) -> List["TextRegion"]:
+        self, image: PILImage.Image, ocr_languages: str = "eng"
+    ) -> List[TextRegion]:
         """Get the OCR regions from image as a list of text regions with tesseract."""
 
         logger.info("Processing entire page OCR with tesseract...")
@@ -58,7 +56,7 @@ class OCRAgentTesseract(OCRAgent):
         # depend on type of characters (font, language, etc); be careful about this
         # functionality
         text_height = ocr_df[TESSERACT_TEXT_HEIGHT].quantile(
-            env_config.TESSERACT_TEXT_HEIGHT_QUANTILE,
+            env_config.TESSERACT_TEXT_HEIGHT_QUANTILE
         )
         if (
             text_height < env_config.TESSERACT_MIN_TEXT_HEIGHT
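The quantile heuristic touched by the hunk above estimates typical text height from the tesseract output data frame; when text is too small for reliable recognition, the page is re-OCR'd at a different zoom (see `zoom_image()` below). A standalone sketch of the gate, with assumed constants in place of the real `env_config` values:

```python
import pandas as pd

# Assumed stand-ins for env_config.TESSERACT_TEXT_HEIGHT_QUANTILE and
# env_config.TESSERACT_MIN_TEXT_HEIGHT.
TEXT_HEIGHT_QUANTILE = 0.5
MIN_TEXT_HEIGHT = 12

# "height" stands in for the TESSERACT_TEXT_HEIGHT column of the OCR data frame.
ocr_df = pd.DataFrame({"height": [8, 9, 10, 11]})

# A quantile is robust to outlier rows (page numbers, specks) that would skew a mean.
text_height = ocr_df["height"].quantile(TEXT_HEIGHT_QUANTILE)
if text_height < MIN_TEXT_HEIGHT:
    print("estimated text height too small for reliable OCR; retry on a zoomed image")
```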
@@ -87,7 +85,7 @@ class OCRAgentTesseract(OCRAgent):
 
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(
-        self, image: PILImage, ocr_languages: str = "eng"
+        self, image: PILImage.Image, ocr_languages: str = "eng"
     ) -> List["LayoutElement"]:
         from unstructured.partition.pdf_image.inference_utils import (
             build_layout_elements_from_ocr_regions,
@@ -118,9 +116,7 @@ class OCRAgentTesseract(OCRAgent):
 
     @requires_dependencies("unstructured_inference")
     def parse_data(self, ocr_data: pd.DataFrame, zoom: float = 1) -> List["TextRegion"]:
-        """
-        Parse the OCR result data to extract a list of TextRegion objects from
-        tesseract.
+        """Parse the OCR result data to extract a list of TextRegion objects from tesseract.
 
         The function processes the OCR result data frame, looking for bounding
         box information and associated text to create instances of the TextRegion
@@ -150,7 +146,7 @@ class OCRAgentTesseract(OCRAgent):
         if zoom <= 0:
             zoom = 1
 
-        text_regions = []
+        text_regions: list[TextRegion] = []
         for idtx in ocr_data.itertuples():
             text = idtx.text
             if not text:
@@ -164,19 +160,14 @@ class OCRAgentTesseract(OCRAgent):
             x2 = (idtx.left + idtx.width) / zoom
             y2 = (idtx.top + idtx.height) / zoom
             text_region = build_text_region_from_coords(
-                x1,
-                y1,
-                x2,
-                y2,
-                text=cleaned_text,
-                source=Source.OCR_TESSERACT,
+                x1, y1, x2, y2, text=cleaned_text, source=Source.OCR_TESSERACT
             )
             text_regions.append(text_region)
 
         return text_regions
 
 
-def zoom_image(image: PILImage, zoom: float = 1) -> PILImage:
+def zoom_image(image: PILImage.Image, zoom: float = 1) -> PILImage.Image:
     """scale an image based on the zoom factor using cv2; the scaled image is post
     processed by dilation then erosion to improve edge sharpness for OCR tasks"""
     if zoom <= 0:
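`zoom_image()` is truncated in the final hunk, but its docstring states the approach: scale with cv2, then dilate and erode to sharpen edges for OCR. A self-contained sketch consistent with that description (kernel size, iteration counts, and interpolation mode are assumptions, not taken from the diff):

```python
import cv2
import numpy as np
from PIL import Image as PILImage


def zoom_image(image: PILImage.Image, zoom: float = 1) -> PILImage.Image:
    """Scale `image` by `zoom`, then dilate/erode to sharpen edges for OCR."""
    if zoom <= 0:
        zoom = 1  # guard against a degenerate scale factor, as in the diff
    # Resize by a relative factor (dsize=None lets fx/fy drive the output size).
    new_image = cv2.resize(
        np.array(image), None, fx=zoom, fy=zoom, interpolation=cv2.INTER_CUBIC
    )
    # Dilation followed by erosion (morphological closing) cleans up stroke edges.
    kernel = np.ones((1, 1), np.uint8)
    new_image = cv2.dilate(new_image, kernel, iterations=1)
    new_image = cv2.erode(new_image, kernel, iterations=1)
    return PILImage.fromarray(new_image)
```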