mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-11-01 18:29:32 +00:00
feat: Add AzureOCRDocumentConverter (2.0) (#5855)
* Add AzureOCRDocumentConverter * Add tests * Add release note * Formatting * update docstrings * Apply suggestions from code review Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> * PR feedback * PR feedback * PR feedback * Add secrets as environment variables * Adapt test * Add azure dependency to CI * Add azure dependency to CI --------- Co-authored-by: ZanSara <sara.zanzottera@deepset.ai> Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
This commit is contained in:
parent
c8398eeb6d
commit
80192589b1
7
.github/workflows/tests.yml
vendored
7
.github/workflows/tests.yml
vendored
@ -201,7 +201,7 @@ jobs:
|
||||
python-version: ${{ env.PYTHON_VERSION }}
|
||||
|
||||
- name: Install Haystack
|
||||
run: pip install .[preview,dev] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika
|
||||
run: pip install .[preview,dev] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
|
||||
|
||||
- name: Run
|
||||
run: pytest --cov-report xml:coverage.xml --cov="haystack" -m "unit" test/preview
|
||||
@ -901,6 +901,9 @@ jobs:
|
||||
image: apache/tika:2.9.0.0
|
||||
ports:
|
||||
- 9998:9998
|
||||
env:
|
||||
CORE_AZURE_CS_ENDPOINT: ${{ secrets.CORE_AZURE_CS_ENDPOINT }}
|
||||
CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
@ -915,7 +918,7 @@ jobs:
|
||||
sudo apt install ffmpeg # for local Whisper tests
|
||||
|
||||
- name: Install Haystack
|
||||
run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika
|
||||
run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
from haystack.preview.components.file_converters.txt import TextFileToDocument
|
||||
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
|
||||
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
|
||||
|
||||
__all__ = ["TextFileToDocument", "TikaDocumentConverter"]
|
||||
__all__ = ["TextFileToDocument", "TikaDocumentConverter", "AzureOCRDocumentConverter"]
|
||||
|
||||
115
haystack/preview/components/file_converters/azure.py
Normal file
115
haystack/preview/components/file_converters/azure.py
Normal file
@ -0,0 +1,115 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Union, Optional, Dict, Any
|
||||
|
||||
from haystack.preview.lazy_imports import LazyImport
|
||||
from haystack.preview import component, Document, default_to_dict, default_from_dict
|
||||
|
||||
|
||||
with LazyImport(message="Run 'pip install azure-ai-formrecognizer>=3.2.0b2'") as azure_import:
|
||||
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
|
||||
|
||||
@component
class AzureOCRDocumentConverter:
    """
    A component for converting files to Documents using Azure's Document Intelligence service.
    Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.

    In order to be able to use this component, you need an active Azure account
    and a Document Intelligence or Cognitive Services resource. Please follow the steps described in the
    [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
    to set up your resource.
    """

    def __init__(
        self, endpoint: str, api_key: str, model_id: str = "prebuilt-read", id_hash_keys: Optional[List[str]] = None
    ):
        """
        Create an AzureOCRDocumentConverter component.

        :param endpoint: The endpoint of your Azure resource.
        :param api_key: The key of your Azure resource.
        :param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
            for a list of available models. Default: `"prebuilt-read"`.
        :param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
            attributes. If you want to ensure you don't have duplicate Documents in your Document Store but texts are not
            unique, you can pass the name of the metadata to use when building the document ID (like
            `["text", "category"]`) to this field. In this case, the ID will be generated by using the text and the content of the
            `category` field. Default: `None`.
        """
        # Fails with an actionable message if azure-ai-formrecognizer is not installed.
        azure_import.check()

        self.document_analysis_client = DocumentAnalysisClient(
            endpoint=endpoint, credential=AzureKeyCredential(api_key)
        )
        self.endpoint = endpoint
        self.api_key = api_key
        self.model_id = model_id
        self.id_hash_keys = id_hash_keys or []

    # BUGFIX: the declared output socket was named `azure`, but `run` returns the key
    # "raw_azure_response" (and the docstring documents that name). The declared sockets
    # must match the keys of the returned dict, so the decorator now uses `raw_azure_response`.
    @component.output_types(documents=List[Document], raw_azure_response=List[Dict])
    def run(self, paths: List[Union[str, Path]]):
        """
        Convert files to Documents using Azure's Document Intelligence service.

        This component creates two outputs: `documents` and `raw_azure_response`. The `documents` output contains
        a list of Documents that were created from the files. The `raw_azure_response` output contains a list of
        the raw responses from Azure's Document Intelligence service.

        :param paths: Paths to the files to convert.
        """
        documents = []
        azure_output = []
        for path in paths:
            path = Path(path)
            # Azure's client accepts a binary stream; the poller blocks until analysis completes.
            with open(path, "rb") as file:
                poller = self.document_analysis_client.begin_analyze_document(model_id=self.model_id, document=file)
                result = poller.result()
                azure_output.append(result.to_dict())

            # The file extension decides how the Azure result is flattened into text.
            file_suffix = path.suffix
            document = AzureOCRDocumentConverter._convert_azure_result_to_document(
                result, self.id_hash_keys, file_suffix
            )
            documents.append(document)

        return {"documents": documents, "raw_azure_response": azure_output}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self, endpoint=self.endpoint, api_key=self.api_key, model_id=self.model_id, id_hash_keys=self.id_hash_keys
        )

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
        """
        Deserialize this component from a dictionary.
        """
        return default_from_dict(cls, data)

    @staticmethod
    def _convert_azure_result_to_document(result: "AnalyzeResult", id_hash_keys: List[str], file_suffix: str) -> Document:
        """
        Convert the result of Azure OCR to a Haystack text Document.

        :param result: The `AnalyzeResult` returned by Azure's Document Intelligence service.
        :param id_hash_keys: Attribute names used to build the Document ID; empty list means default ID generation.
        :param file_suffix: The file extension (e.g. ".pdf") of the converted file.
        """
        if file_suffix == ".pdf":
            # For PDFs, rebuild the text page by page so that page breaks are preserved
            # as form-feed characters ("\f"), one per page.
            text = ""
            for page in result.pages:
                lines = page.lines if page.lines else []
                for line in lines:
                    text += f"{line.content}\n"
                text += "\f"
        else:
            # For all other formats, Azure already provides the full plain-text content.
            text = result.content

        if id_hash_keys:
            document = Document(text=text, id_hash_keys=id_hash_keys)
        else:
            document = Document(text=text)

        return document
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
preview:
|
||||
- |
|
||||
Add AzureOCRDocumentConverter to convert files of different types using Azure's Document Intelligence Service.
|
||||
@ -0,0 +1,110 @@
|
||||
import os
|
||||
from unittest.mock import patch, Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
|
||||
|
||||
|
||||
class TestAzureOCRDocumentConverter:
    @pytest.mark.unit
    def test_to_dict(self):
        converter = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
        serialized = converter.to_dict()
        expected = {
            "type": "AzureOCRDocumentConverter",
            "init_parameters": {
                "api_key": "test_credential_key",
                "endpoint": "test_endpoint",
                "id_hash_keys": [],
                "model_id": "prebuilt-read",
            },
        }
        assert serialized == expected

    @pytest.mark.unit
    def test_from_dict(self):
        serialized = {
            "type": "AzureOCRDocumentConverter",
            "init_parameters": {
                "api_key": "test_credential_key",
                "endpoint": "test_endpoint",
                "id_hash_keys": [],
                "model_id": "prebuilt-read",
            },
        }
        converter = AzureOCRDocumentConverter.from_dict(serialized)
        assert converter.endpoint == "test_endpoint"
        assert converter.api_key == "test_credential_key"
        assert converter.id_hash_keys == []
        assert converter.model_id == "prebuilt-read"

    @pytest.mark.unit
    def test_run(self, preview_samples_path):
        # The raw payload Azure is mocked to return, and also what we expect to
        # see passed through verbatim in the component's `raw_azure_response` output.
        raw_response = {
            "api_version": "2023-02-28-preview",
            "model_id": "prebuilt-read",
            "content": "mocked line 1\nmocked line 2\n\f",
            "pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
        }
        with patch("haystack.preview.components.file_converters.azure.DocumentAnalysisClient") as mock_azure_client:
            fake_page = Mock(lines=[Mock(content="mocked line 1"), Mock(content="mocked line 2")])
            fake_result = Mock(pages=[fake_page])
            fake_result.to_dict.return_value = raw_response
            mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = fake_result

            converter = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
            result = converter.run(paths=[preview_samples_path / "pdf" / "sample_pdf_1.pdf"])

            first_doc = result["documents"][0]
            assert first_doc.text == "mocked line 1\nmocked line 2\n\f"
            assert "raw_azure_response" in result
            assert result["raw_azure_response"][0] == raw_response

    @pytest.mark.integration
    @pytest.mark.skipif(
        "CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
        reason="Azure credentials not available",
    )
    def test_run_with_pdf_file(self, preview_samples_path):
        converter = AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
        )
        result = converter.run(paths=[preview_samples_path / "pdf" / "sample_pdf_1.pdf"])
        docs = result["documents"]
        assert len(docs) == 1
        # Spot-check content from several pages of the sample PDF.
        for snippet in ("A sample PDF file", "Page 2 of Sample PDF", "Page 4 of Sample PDF"):
            assert snippet in docs[0].text

    @pytest.mark.integration
    @pytest.mark.skipif(
        "CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
        reason="Azure credentials not available",
    )
    def test_with_image_file(self, preview_samples_path):
        converter = AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
        )
        result = converter.run(paths=[preview_samples_path / "images" / "haystack-logo.png"])
        docs = result["documents"]
        assert len(docs) == 1
        assert "haystack" in docs[0].text
        assert "by deepset" in docs[0].text

    @pytest.mark.integration
    @pytest.mark.skipif(
        "CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
        reason="Azure credentials not available",
    )
    def test_run_with_docx_file(self, preview_samples_path):
        converter = AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
        )
        result = converter.run(paths=[preview_samples_path / "docx" / "sample_docx.docx"])
        docs = result["documents"]
        assert len(docs) == 1
        for snippet in ("Sample Docx File", "Now we are in Page 2", "Page 3 was empty this is page 4"):
            assert snippet in docs[0].text
||||
BIN
test/preview/test_files/images/haystack-logo.png
Normal file
BIN
test/preview/test_files/images/haystack-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
Loading…
x
Reference in New Issue
Block a user