feat: Add AzureOCRDocumentConverter (2.0) (#5855)

* Add AzureOCRDocumentConverter

* Add tests

* Add release note

* Formatting

* update docstrings

* Apply suggestions from code review

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>

* PR feedback

* PR feedback

* PR feedback

* Add secrets as environment variables

* Adapt test

* Add azure dependency to CI

* Add azure dependency to CI

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
bogdankostic 2023-09-26 15:57:55 +02:00 committed by GitHub
parent c8398eeb6d
commit 80192589b1
6 changed files with 236 additions and 3 deletions

View File

@@ -201,7 +201,7 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Haystack
run: pip install .[preview,dev] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika
run: pip install .[preview,dev] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run
run: pytest --cov-report xml:coverage.xml --cov="haystack" -m "unit" test/preview
@@ -901,6 +901,9 @@ jobs:
image: apache/tika:2.9.0.0
ports:
- 9998:9998
env:
CORE_AZURE_CS_ENDPOINT: ${{ secrets.CORE_AZURE_CS_ENDPOINT }}
CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }}
steps:
- uses: actions/checkout@v4
@@ -915,7 +918,7 @@ jobs:
sudo apt install ffmpeg # for local Whisper tests
- name: Install Haystack
run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika
run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run tests
run: |

View File

@@ -1,4 +1,5 @@
from haystack.preview.components.file_converters.txt import TextFileToDocument
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
__all__ = ["TextFileToDocument", "TikaDocumentConverter"]
__all__ = ["TextFileToDocument", "TikaDocumentConverter", "AzureOCRDocumentConverter"]

View File

@@ -0,0 +1,115 @@
from pathlib import Path
from typing import List, Union, Optional, Dict, Any
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import component, Document, default_to_dict, default_from_dict
with LazyImport(message="Run 'pip install azure-ai-formrecognizer>=3.2.0b2'") as azure_import:
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
from azure.core.credentials import AzureKeyCredential
@component
class AzureOCRDocumentConverter:
"""
A component for converting files to Documents using Azure's Document Intelligence service.
Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.
In order to be able to use this component, you need an active Azure account
and a Document Intelligence or Cognitive Services resource. Please follow the steps described in the
[Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
to set up your resource.
"""
def __init__(
self, endpoint: str, api_key: str, model_id: str = "prebuilt-read", id_hash_keys: Optional[List[str]] = None
):
"""
Create an AzureOCRDocumentConverter component.
:param endpoint: The endpoint of your Azure resource.
:param api_key: The key of your Azure resource.
:param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
for a list of available models. Default: `"prebuilt-read"`.
:param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
attributes. If you want to ensure you don't have duplicate Documents in your Document Store but their texts are
not unique, you can pass the names of the fields to use when building the Document ID (for example,
`["text", "category"]`). In this case, the ID is generated from the text and the content of the
`category` field. Default: `None`.
"""
azure_import.check()
self.document_analysis_client = DocumentAnalysisClient(
endpoint=endpoint, credential=AzureKeyCredential(api_key)
)
self.endpoint = endpoint
self.api_key = api_key
self.model_id = model_id
self.id_hash_keys = id_hash_keys or []
@component.output_types(documents=List[Document], raw_azure_response=List[Dict])
def run(self, paths: List[Union[str, Path]]):
"""
Convert files to Documents using Azure's Document Intelligence service.
This component creates two outputs: `documents` and `raw_azure_response`. The `documents` output contains
a list of Documents that were created from the files. The `raw_azure_response` output contains a list of
the raw responses from Azure's Document Intelligence service.
:param paths: Paths to the files to convert.
"""
documents = []
azure_output = []
for path in paths:
path = Path(path)
with open(path, "rb") as file:
poller = self.document_analysis_client.begin_analyze_document(model_id=self.model_id, document=file)
result = poller.result()
azure_output.append(result.to_dict())
file_suffix = path.suffix
document = AzureOCRDocumentConverter._convert_azure_result_to_document(
result, self.id_hash_keys, file_suffix
)
documents.append(document)
return {"documents": documents, "raw_azure_response": azure_output}
def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""
return default_to_dict(
self, endpoint=self.endpoint, api_key=self.api_key, model_id=self.model_id, id_hash_keys=self.id_hash_keys
)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
"""
Deserialize this component from a dictionary.
"""
return default_from_dict(cls, data)
@staticmethod
def _convert_azure_result_to_document(result: AnalyzeResult, id_hash_keys: List[str], file_suffix: str) -> Document:
"""
Convert the result of Azure OCR to a Haystack text Document.
"""
if file_suffix == ".pdf":
text = ""
for page in result.pages:
lines = page.lines if page.lines else []
for line in lines:
text += f"{line.content}\n"
text += "\f"
else:
text = result.content
if id_hash_keys:
document = Document(text=text, id_hash_keys=id_hash_keys)
else:
document = Document(text=text)
return document
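
For reference, a minimal usage sketch of the component added above (assuming `azure-ai-formrecognizer>=3.2.0b2` is installed; the environment variables mirror the ones used by the integration tests in this PR, and `invoice.pdf` is a placeholder path):

import os

from haystack.preview.components.file_converters import AzureOCRDocumentConverter

# Credentials come from the same environment variables the integration tests use.
converter = AzureOCRDocumentConverter(
    endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"],
    api_key=os.environ["CORE_AZURE_CS_API_KEY"],
)

# Any supported file type works; "invoice.pdf" is a placeholder.
result = converter.run(paths=["invoice.pdf"])

for document in result["documents"]:
    print(document.text)  # extracted text; PDF pages are separated by "\f"

raw = result["raw_azure_response"]  # one raw Azure response dict per input file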

View File

@@ -0,0 +1,4 @@
---
preview:
- |
Add AzureOCRDocumentConverter to convert files of different types using Azure's Document Intelligence Service.

View File

@@ -0,0 +1,110 @@
import os
from unittest.mock import patch, Mock
import pytest
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
class TestAzureOCRDocumentConverter:
@pytest.mark.unit
def test_to_dict(self):
component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
data = component.to_dict()
assert data == {
"type": "AzureOCRDocumentConverter",
"init_parameters": {
"api_key": "test_credential_key",
"endpoint": "test_endpoint",
"id_hash_keys": [],
"model_id": "prebuilt-read",
},
}
@pytest.mark.unit
def test_from_dict(self):
data = {
"type": "AzureOCRDocumentConverter",
"init_parameters": {
"api_key": "test_credential_key",
"endpoint": "test_endpoint",
"id_hash_keys": [],
"model_id": "prebuilt-read",
},
}
component = AzureOCRDocumentConverter.from_dict(data)
assert component.endpoint == "test_endpoint"
assert component.api_key == "test_credential_key"
assert component.id_hash_keys == []
assert component.model_id == "prebuilt-read"
@pytest.mark.unit
def test_run(self, preview_samples_path):
with patch("haystack.preview.components.file_converters.azure.DocumentAnalysisClient") as mock_azure_client:
mock_result = Mock(pages=[Mock(lines=[Mock(content="mocked line 1"), Mock(content="mocked line 2")])])
mock_result.to_dict.return_value = {
"api_version": "2023-02-28-preview",
"model_id": "prebuilt-read",
"content": "mocked line 1\nmocked line 2\n\f",
"pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
}
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = mock_result
component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
output = component.run(paths=[preview_samples_path / "pdf" / "sample_pdf_1.pdf"])
document = output["documents"][0]
assert document.text == "mocked line 1\nmocked line 2\n\f"
assert "raw_azure_response" in output
assert output["raw_azure_response"][0] == {
"api_version": "2023-02-28-preview",
"model_id": "prebuilt-read",
"content": "mocked line 1\nmocked line 2\n\f",
"pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
}
@pytest.mark.integration
@pytest.mark.skipif(
"CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
reason="Azure credentials not available",
)
def test_run_with_pdf_file(self, preview_samples_path):
component = AzureOCRDocumentConverter(
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
)
output = component.run(paths=[preview_samples_path / "pdf" / "sample_pdf_1.pdf"])
documents = output["documents"]
assert len(documents) == 1
assert "A sample PDF file" in documents[0].text
assert "Page 2 of Sample PDF" in documents[0].text
assert "Page 4 of Sample PDF" in documents[0].text
@pytest.mark.integration
@pytest.mark.skipif(
"CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
reason="Azure credentials not available",
)
def test_with_image_file(self, preview_samples_path):
component = AzureOCRDocumentConverter(
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
)
output = component.run(paths=[preview_samples_path / "images" / "haystack-logo.png"])
documents = output["documents"]
assert len(documents) == 1
assert "haystack" in documents[0].text
assert "by deepset" in documents[0].text
@pytest.mark.integration
@pytest.mark.skipif(
"CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
reason="Azure credentials not available",
)
def test_run_with_docx_file(self, preview_samples_path):
component = AzureOCRDocumentConverter(
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
)
output = component.run(paths=[preview_samples_path / "docx" / "sample_docx.docx"])
documents = output["documents"]
assert len(documents) == 1
assert "Sample Docx File" in documents[0].text
assert "Now we are in Page 2" in documents[0].text
assert "Page 3 was empty this is page 4" in documents[0].text

Binary file not shown (image added, 30 KiB).