feat: Add AzureOCRDocumentConverter (2.0) (#5855)

* Add AzureOCRDocumentConverter

* Add tests

* Add release note

* Formatting

* update docstrings

* Apply suggestions from code review

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>

* PR feedback

* PR feedback

* PR feedback

* Add secrets as environment variables

* Adapt test

* Add azure dependency to CI

* Add azure dependency to CI

---------

Co-authored-by: ZanSara <sara.zanzottera@deepset.ai>
Co-authored-by: Daria Fokina <daria.fokina@deepset.ai>
bogdankostic 2023-09-26 15:57:55 +02:00 committed by GitHub
parent c8398eeb6d
commit 80192589b1
6 changed files with 236 additions and 3 deletions

View File

@@ -201,7 +201,7 @@ jobs:
python-version: ${{ env.PYTHON_VERSION }}
- name: Install Haystack
run: pip install .[preview,dev] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika
run: pip install .[preview,dev] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run
run: pytest --cov-report xml:coverage.xml --cov="haystack" -m "unit" test/preview
@@ -901,6 +901,9 @@ jobs:
image: apache/tika:2.9.0.0
ports:
- 9998:9998
env:
CORE_AZURE_CS_ENDPOINT: ${{ secrets.CORE_AZURE_CS_ENDPOINT }}
CORE_AZURE_CS_API_KEY: ${{ secrets.CORE_AZURE_CS_API_KEY }}
steps:
- uses: actions/checkout@v4
@@ -915,7 +918,7 @@ jobs:
sudo apt install ffmpeg # for local Whisper tests
- name: Install Haystack
run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 sentence-transformers>=2.2.0 pypdf openai-whisper tika
run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2'
- name: Run tests
run: |

View File

@@ -1,4 +1,5 @@
from haystack.preview.components.file_converters.txt import TextFileToDocument
from haystack.preview.components.file_converters.tika import TikaDocumentConverter
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
__all__ = ["TextFileToDocument", "TikaDocumentConverter"]
__all__ = ["TextFileToDocument", "TikaDocumentConverter", "AzureOCRDocumentConverter"]

View File

@@ -0,0 +1,115 @@
from pathlib import Path
from typing import List, Union, Optional, Dict, Any
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import component, Document, default_to_dict, default_from_dict
with LazyImport(message="Run 'pip install azure-ai-formrecognizer>=3.2.0b2'") as azure_import:
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
from azure.core.credentials import AzureKeyCredential
@component
class AzureOCRDocumentConverter:
"""
A component for converting files to Documents using Azure's Document Intelligence service.
Supported file formats are: PDF, JPEG, PNG, BMP, TIFF, DOCX, XLSX, PPTX, and HTML.
In order to be able to use this component, you need an active Azure account
and a Document Intelligence or Cognitive Services resource. Please follow the steps described in the
[Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api)
to set up your resource.
"""
def __init__(
self, endpoint: str, api_key: str, model_id: str = "prebuilt-read", id_hash_keys: Optional[List[str]] = None
):
"""
Create an AzureOCRDocumentConverter component.
:param endpoint: The endpoint of your Azure resource.
:param api_key: The key of your Azure resource.
:param model_id: The model ID of the model you want to use. Please refer to [Azure documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/choose-model-feature)
for a list of available models. Default: `"prebuilt-read"`.
:param id_hash_keys: Generate the Document ID from a custom list of strings that refer to the Document's
attributes. If you want to ensure you don't have duplicate Documents in your Document Store but their texts are
not unique, you can pass the names of the fields to use when building the Document ID (for example,
`["text", "category"]`). In this case, the ID is generated from the text and the content of the
`category` field. Default: `None`.
"""
azure_import.check()
self.document_analysis_client = DocumentAnalysisClient(
endpoint=endpoint, credential=AzureKeyCredential(api_key)
)
self.endpoint = endpoint
self.api_key = api_key
self.model_id = model_id
self.id_hash_keys = id_hash_keys or []
@component.output_types(documents=List[Document], raw_azure_response=List[Dict])
def run(self, paths: List[Union[str, Path]]):
"""
Convert files to Documents using Azure's Document Intelligence service.
This component creates two outputs: `documents` and `raw_azure_response`. The `documents` output contains
a list of Documents that were created from the files. The `raw_azure_response` output contains a list of
the raw responses from Azure's Document Intelligence service.
:param paths: Paths to the files to convert.
"""
documents = []
azure_output = []
for path in paths:
path = Path(path)
with open(path, "rb") as file:
poller = self.document_analysis_client.begin_analyze_document(model_id=self.model_id, document=file)
result = poller.result()
azure_output.append(result.to_dict())
file_suffix = path.suffix
document = AzureOCRDocumentConverter._convert_azure_result_to_document(
result, self.id_hash_keys, file_suffix
)
documents.append(document)
return {"documents": documents, "raw_azure_response": azure_output}
def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
"""
return default_to_dict(
self, endpoint=self.endpoint, api_key=self.api_key, model_id=self.model_id, id_hash_keys=self.id_hash_keys
)
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "AzureOCRDocumentConverter":
"""
Deserialize this component from a dictionary.
"""
return default_from_dict(cls, data)
@staticmethod
def _convert_azure_result_to_document(result: AnalyzeResult, id_hash_keys: List[str], file_suffix: str) -> Document:
"""
Convert the result of Azure OCR to a Haystack text Document.
"""
if file_suffix == ".pdf":
text = ""
for page in result.pages:
lines = page.lines if page.lines else []
for line in lines:
text += f"{line.content}\n"
text += "\f"
else:
text = result.content
if id_hash_keys:
document = Document(text=text, id_hash_keys=id_hash_keys)
else:
document = Document(text=text)
return document
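
For reference, a minimal usage sketch of the component added above (assuming `azure-ai-formrecognizer>=3.2.0b2` is installed; the environment variables mirror the ones used by the integration tests in this PR, and `invoice.pdf` is a placeholder path):

import os

from haystack.preview.components.file_converters import AzureOCRDocumentConverter

# Credentials come from the same environment variables the integration tests use.
converter = AzureOCRDocumentConverter(
    endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"],
    api_key=os.environ["CORE_AZURE_CS_API_KEY"],
)

# Any supported file type works; "invoice.pdf" is a placeholder.
result = converter.run(paths=["invoice.pdf"])

for document in result["documents"]:
    print(document.text)  # extracted text; PDF pages are separated by "\f"

raw = result["raw_azure_response"]  # one raw Azure response dict per input file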

View File

@@ -0,0 +1,4 @@
---
preview:
- |
Add AzureOCRDocumentConverter to convert files of different types using Azure's Document Intelligence Service.

View File

@@ -0,0 +1,110 @@
import os
from unittest.mock import patch, Mock
import pytest
from haystack.preview.components.file_converters.azure import AzureOCRDocumentConverter
class TestAzureOCRDocumentConverter:
@pytest.mark.unit
def test_to_dict(self):
component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
data = component.to_dict()
assert data == {
"type": "AzureOCRDocumentConverter",
"init_parameters": {
"api_key": "test_credential_key",
"endpoint": "test_endpoint",
"id_hash_keys": [],
"model_id": "prebuilt-read",
},
}
@pytest.mark.unit
def test_from_dict(self):
data = {
"type": "AzureOCRDocumentConverter",
"init_parameters": {
"api_key": "test_credential_key",
"endpoint": "test_endpoint",
"id_hash_keys": [],
"model_id": "prebuilt-read",
},
}
component = AzureOCRDocumentConverter.from_dict(data)
assert component.endpoint == "test_endpoint"
assert component.api_key == "test_credential_key"
assert component.id_hash_keys == []
assert component.model_id == "prebuilt-read"
@pytest.mark.unit
def test_run(self, preview_samples_path):
with patch("haystack.preview.components.file_converters.azure.DocumentAnalysisClient") as mock_azure_client:
mock_result = Mock(pages=[Mock(lines=[Mock(content="mocked line 1"), Mock(content="mocked line 2")])])
mock_result.to_dict.return_value = {
"api_version": "2023-02-28-preview",
"model_id": "prebuilt-read",
"content": "mocked line 1\nmocked line 2\n\f",
"pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
}
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = mock_result
component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
output = component.run(paths=[preview_samples_path / "pdf" / "sample_pdf_1.pdf"])
document = output["documents"][0]
assert document.text == "mocked line 1\nmocked line 2\n\f"
assert "raw_azure_response" in output
assert output["raw_azure_response"][0] == {
"api_version": "2023-02-28-preview",
"model_id": "prebuilt-read",
"content": "mocked line 1\nmocked line 2\n\f",
"pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
}
@pytest.mark.integration
@pytest.mark.skipif(
"CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
reason="Azure credentials not available",
)
def test_run_with_pdf_file(self, preview_samples_path):
component = AzureOCRDocumentConverter(
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
)
output = component.run(paths=[preview_samples_path / "pdf" / "sample_pdf_1.pdf"])
documents = output["documents"]
assert len(documents) == 1
assert "A sample PDF file" in documents[0].text
assert "Page 2 of Sample PDF" in documents[0].text
assert "Page 4 of Sample PDF" in documents[0].text
@pytest.mark.integration
@pytest.mark.skipif(
"CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
reason="Azure credentials not available",
)
def test_with_image_file(self, preview_samples_path):
component = AzureOCRDocumentConverter(
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
)
output = component.run(paths=[preview_samples_path / "images" / "haystack-logo.png"])
documents = output["documents"]
assert len(documents) == 1
assert "haystack" in documents[0].text
assert "by deepset" in documents[0].text
@pytest.mark.integration
@pytest.mark.skipif(
"CORE_AZURE_CS_ENDPOINT" not in os.environ and "CORE_AZURE_CS_API_KEY" not in os.environ,
reason="Azure credentials not available",
)
def test_run_with_docx_file(self, preview_samples_path):
component = AzureOCRDocumentConverter(
endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
)
output = component.run(paths=[preview_samples_path / "docx" / "sample_docx.docx"])
documents = output["documents"]
assert len(documents) == 1
assert "Sample Docx File" in documents[0].text
assert "Now we are in Page 2" in documents[0].text
assert "Page 3 was empty this is page 4" in documents[0].text

Binary file not shown (image added, 30 KiB).