feat: configure googlevisionapi (#3126)

### Summary Includes changes from #3117. Merged into a feature branch to run the full test suite. Original PR description: The Google Vision API allows [configuration of the API endpoint](https://cloud.google.com/vision/docs/ocr#regionalization) to select whether the data should be sent to the US or the EU. This PR adds an environment variable (`GOOGLEVISION_API_ENDPOINT`) to configure it. --------- Co-authored-by: JIAQIA <jqq1716@gmail.com> Co-authored-by: Dimitri Lozeve <dimitri@lozeve.com>
2025-12-25 14:14:30 +00:00 · 2024-05-31 14:41:04 -04:00 · 2024-05-31 14:41:04 -04:00 · 6005abce79
commit 6005abce79
parent 4a96d54906
3 changed files with 19 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@

 ### Features

+* **Allow configuration of the Google Vision API endpoint** Add the `GOOGLEVISION_API_ENDPOINT` environment variable to select the Google Vision API endpoint (for example, US or EU).
+
 ### Fixes

 * **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized.
--- a/unstructured/partition/utils/config.py
+++ b/unstructured/partition/utils/config.py
@@ -95,11 +95,14 @@ class ENVConfig:
        """optimum text height for tesseract OCR"""
        return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)

+    @property
+    def GOOGLEVISION_API_ENDPOINT(self) -> str:
+        """API endpoint to use for Google Vision"""
+        return self._get_string("GOOGLEVISION_API_ENDPOINT", "")
+
    @property
    def OCR_AGENT(self) -> str:
-        """error margin when comparing if a ocr region is within the table element when preparing
-        table tokens
-        """
+        """OCR Agent to use"""
        return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)

    @property
--- a/unstructured/partition/utils/ocr_models/google_vision_ocr.py
+++ b/unstructured/partition/utils/ocr_models/google_vision_ocr.py
@@ -5,6 +5,8 @@ from typing import TYPE_CHECKING

 from google.cloud.vision import Image, ImageAnnotatorClient, Paragraph, TextAnnotation

+from unstructured.logger import logger, trace_logger
+from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import Source
 from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent

@@ -18,7 +20,14 @@ class OCRAgentGoogleVision(OCRAgent):
    """OCR service implementation for Google Vision API."""

    def __init__(self) -> None:
-        self.client = ImageAnnotatorClient()
+        client_options = {}
+        api_endpoint = env_config.GOOGLEVISION_API_ENDPOINT
+        if api_endpoint:
+            logger.info(f"Using Google Vision OCR with endpoint {api_endpoint}")
+            client_options["api_endpoint"] = api_endpoint
+        else:
+            logger.info("Using Google Vision OCR with default endpoint")
+        self.client = ImageAnnotatorClient(client_options=client_options)

    def is_text_sorted(self) -> bool:
        return True
@@ -34,6 +43,7 @@ class OCRAgentGoogleVision(OCRAgent):
    def get_layout_from_image(
        self, image: PILImage.Image, ocr_languages: str = "eng"
    ) -> list[TextRegion]:
+        trace_logger.detail("Processing entire page OCR with Google Vision API...")
        with BytesIO() as buffer:
            image.save(buffer, format="PNG")
            response = self.client.document_text_detection(image=Image(content=buffer.getvalue()))