mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-17 10:09:13 +00:00
* feat!: Use `Secret` for passing authentication secrets to components * Add comment to clarify type ignore
139 lines
6.0 KiB
Python
139 lines
6.0 KiB
Python
from pathlib import Path
|
|
from typing import List, Union, Dict, Any, Optional
|
|
import logging
|
|
|
|
from haystack.lazy_imports import LazyImport
|
|
from haystack import component, Document, default_to_dict, default_from_dict
|
|
from haystack.dataclasses import ByteStream
|
|
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
|
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
with LazyImport(message="Run 'pip install \"azure-ai-formrecognizer>=3.2.0b2\"'") as azure_import:
|
|
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
|
|
from azure.core.credentials import AzureKeyCredential
|
|
|
|
|
|
@component
class AzureOCRDocumentConverter:
    """
    A component for converting files to Documents using Azure's Document Intelligence service.
    Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.

    In order to be able to use this component, you need an active Azure account
    and a Document Intelligence or Cognitive Services resource. Please follow the steps described in the
    [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
    to set up your resource.

    Usage example:
    ```python
    from datetime import datetime

    from haystack.components.converters.azure import AzureOCRDocumentConverter
    from haystack.utils import Secret

    converter = AzureOCRDocumentConverter(endpoint="<url>", api_key=Secret.from_token("<your-api-key>"))
    results = converter.run(sources=["image-based-document.pdf"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the PDF file.'
    ```
    """

    def __init__(
        self, endpoint: str, api_key: Secret = Secret.from_env_var("AZURE_AI_API_KEY"), model_id: str = "prebuilt-read"
    ):
        """
        Create an AzureOCRDocumentConverter component.

        :param endpoint: The endpoint of your Azure resource.
        :param api_key: The key of your Azure resource. By default it is read from the
            `AZURE_AI_API_KEY` environment variable.
        :param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
            for a list of available models. Default: `"prebuilt-read"`.
        """
        # Fail early with an installation hint if the Azure SDK is not available.
        azure_import.check()

        # NOTE(review): the `type: ignore` presumably covers `resolve_value()` being typed as optional
        # while `AzureKeyCredential` expects a plain string - confirm against the `Secret` API.
        self.document_analysis_client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(api_key.resolve_value()))  # type: ignore

        # Keep the constructor arguments around so that `to_dict` can serialize them.
        self.endpoint = endpoint
        self.model_id = model_id
        self.api_key = api_key

    @component.output_types(documents=List[Document], raw_azure_response=List[Dict])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Convert files to Documents using Azure's Document Intelligence service.

        This component creates two outputs: `documents` and `raw_azure_response`. The `documents` output contains
        a list of Documents that were created from the files. The `raw_azure_response` output contains a list of
        the raw responses from Azure's Document Intelligence service.

        Sources that cannot be read are logged with a warning and skipped, so both outputs may contain
        fewer entries than `sources`. Errors raised by the Azure client itself are not caught here.

        :param sources: List of file paths or ByteStream objects.
        :param meta: Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            Defaults to `None`.
        :return: A dictionary containing a list of Document objects under the 'documents' key
            and the raw Azure response under the 'raw_azure_response' key.
        """
        documents = []
        azure_output = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source=source)
            except Exception as e:
                # Deliberately best-effort: one unreadable source must not abort the whole batch.
                logger.warning("Could not read %s. Skipping it. Error: %s", source, e)
                continue

            poller = self.document_analysis_client.begin_analyze_document(
                model_id=self.model_id, document=bytestream.data
            )
            result = poller.result()
            azure_output.append(result.to_dict())

            # The suffix decides how the text is assembled (PDFs are rebuilt page by page).
            file_suffix = None
            if "file_path" in bytestream.meta:
                file_suffix = Path(bytestream.meta["file_path"]).suffix

            document = AzureOCRDocumentConverter._convert_azure_result_to_document(result, file_suffix)
            # User-supplied metadata wins over the ByteStream's own metadata on key collisions.
            document.meta = {**bytestream.meta, **metadata}
            documents.append(document)

        return {"documents": documents, "raw_azure_response": azure_output}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        The API key is stored in its serialized `Secret` form (`Secret.to_dict()`).
        """
        return default_to_dict(self, api_key=self.api_key.to_dict(), endpoint=self.endpoint, model_id=self.model_id)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
        """
        Deserialize this component from a dictionary.

        The `api_key` entry of `data["init_parameters"]` is deserialized in place before
        the component is constructed.
        """
        deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
        return default_from_dict(cls, data)

    @staticmethod
    def _convert_azure_result_to_document(result: "AnalyzeResult", file_suffix: Optional[str] = None) -> Document:
        """
        Convert the result of Azure OCR to a Haystack text Document.

        For PDF files the text is rebuilt from the recognized lines of every page: each line is
        terminated by a newline and each page is terminated by a form feed (`\\f`). For every other
        file type (or when the suffix is unknown) the `content` of the Azure result is used as is.

        :param result: The analysis result returned by the Azure client.
        :param file_suffix: Suffix of the analyzed file including the leading dot (e.g. `".pdf"`),
            matched case-insensitively. `None` if the file path is unknown.
        :return: A Document whose `content` is the extracted text.
        """
        # Compare case-insensitively so that e.g. "scan.PDF" gets the same page-aware layout as "scan.pdf".
        if file_suffix is not None and file_suffix.lower() == ".pdf":
            page_texts = []
            for page in result.pages:
                # A page without recognized lines still contributes its page break.
                page_lines = page.lines if page.lines else []
                page_texts.append("".join(f"{line.content}\n" for line in page_lines) + "\f")
            text = "".join(page_texts)
        else:
            text = result.content

        return Document(content=text)
|