feat: configure googlevisionapi (#3126)

### Summary

Includes changes from #3117. Merged into a feature branch to run the
full test suite.

Original PR description:

The Google Vision API allows [configuration of the API
endpoint](https://cloud.google.com/vision/docs/ocr#regionalization) to
select whether the data is sent to the US or the EU. This PR adds an
environment variable (`GOOGLEVISION_API_ENDPOINT`) to configure it.

---------

Co-authored-by: JIAQIA <jqq1716@gmail.com>
Co-authored-by: Dimitri Lozeve <dimitri@lozeve.com>
This commit is contained in:
Matt Robinson 2024-05-31 14:41:04 -04:00 committed by GitHub
parent 4a96d54906
commit 6005abce79
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 19 additions and 4 deletions

View File

@ -8,6 +8,8 @@
### Features
- **Allow configuration of the Google Vision API endpoint** Add the `GOOGLEVISION_API_ENDPOINT` environment variable to select whether the US or the EU Google Vision API endpoint is used.
### Fixes
* **Fix V2 S3 Destination Connector authentication** Fixes bugs with S3 Destination Connector where the connection config was neither registered nor properly deserialized.

View File

@ -95,11 +95,14 @@ class ENVConfig:
"""optimum text height for tesseract OCR"""
return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
@property
def GOOGLEVISION_API_ENDPOINT(self) -> str:
    """API endpoint to use for Google Vision.

    Looks up the ``GOOGLEVISION_API_ENDPOINT`` setting through ``_get_string`` with an
    empty-string default, so a falsy result means no endpoint has been configured.
    """
    endpoint = self._get_string("GOOGLEVISION_API_ENDPOINT", "")
    return endpoint
@property
def OCR_AGENT(self) -> str:
    """OCR agent to use.

    Looks up the ``OCR_AGENT`` setting through ``_get_string``, falling back to
    ``OCR_AGENT_TESSERACT`` when no value is provided.
    """
    return self._get_string("OCR_AGENT", OCR_AGENT_TESSERACT)
@property

View File

@ -5,6 +5,8 @@ from typing import TYPE_CHECKING
from google.cloud.vision import Image, ImageAnnotatorClient, Paragraph, TextAnnotation
from unstructured.logger import logger, trace_logger
from unstructured.partition.utils.config import env_config
from unstructured.partition.utils.constants import Source
from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
@ -18,7 +20,14 @@ class OCRAgentGoogleVision(OCRAgent):
"""OCR service implementation for Google Vision API."""
def __init__(self) -> None:
    """Create the Google Vision client, optionally targeting a custom API endpoint.

    When ``env_config.GOOGLEVISION_API_ENDPOINT`` is non-empty it is passed to the client
    as the ``api_endpoint`` client option; otherwise the client is built with empty client
    options and a message about the default endpoint is logged.
    """
    client_options: dict[str, str] = {}
    api_endpoint = env_config.GOOGLEVISION_API_ENDPOINT
    if api_endpoint:
        logger.info(f"Using Google Vision OCR with endpoint {api_endpoint}")
        client_options["api_endpoint"] = api_endpoint
    else:
        logger.info("Using Google Vision OCR with default endpoint")
    # Build the client exactly once, after the options are known; constructing a default
    # client first and then replacing it would only be discarded work.
    self.client = ImageAnnotatorClient(client_options=client_options)
def is_text_sorted(self) -> bool:
    """Return ``True`` unconditionally.

    NOTE(review): presumably tells callers that this agent's OCR output is already in
    reading order -- confirm against the ``OCRAgent`` interface.
    """
    return True
@ -34,6 +43,7 @@ class OCRAgentGoogleVision(OCRAgent):
def get_layout_from_image(
self, image: PILImage.Image, ocr_languages: str = "eng"
) -> list[TextRegion]:
trace_logger.detail("Processing entire page OCR with Google Vision API...")
with BytesIO() as buffer:
image.save(buffer, format="PNG")
response = self.client.document_text_detection(image=Image(content=buffer.getvalue()))