# haystack/test/components/converters/test_azure_ocr_doc_converter.py
# (99 lines, 5.2 KiB, Python)

import os
from unittest.mock import patch, Mock
import pytest
# (repository-viewer blame timestamp: 2023-11-24 14:48:43 +01:00)
from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.dataclasses import ByteStream
class TestAzureOCRDocumentConverter:
    """Unit and integration tests for ``AzureOCRDocumentConverter``.

    The unit tests patch ``DocumentAnalysisClient`` in the converter's module.
    The tests marked ``integration`` use an unpatched converter configured from
    the ``CORE_AZURE_CS_ENDPOINT`` and ``CORE_AZURE_CS_API_KEY`` environment
    variables and are skipped when either one is unset or empty.
    """

    def test_init_fail_wo_api_key(self, monkeypatch):
        """Constructing the converter without an ``api_key`` raises ``ValueError``."""
        # Remove the variable so the outcome does not depend on the caller's environment.
        monkeypatch.delenv("AZURE_AI_API_KEY", raising=False)
        with pytest.raises(ValueError, match="AzureOCRDocumentConverter expects an Azure Credential key"):
            AzureOCRDocumentConverter(endpoint="test_endpoint")

    def test_to_dict(self):
        """``to_dict`` returns the component type path and its init parameters.

        The expected dictionary contains ``endpoint`` and ``model_id`` only; it has
        no entry for the API key that was passed to the constructor.
        """
        component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
        data = component.to_dict()
        assert data == {
            "type": "haystack.components.converters.azure.AzureOCRDocumentConverter",
            "init_parameters": {"endpoint": "test_endpoint", "model_id": "prebuilt-read"},
        }

    def test_run(self, test_files_path):
        """``run`` returns a document and the raw Azure response built from the mocked result."""
        with patch("haystack.components.converters.azure.DocumentAnalysisClient") as mock_azure_client:
            mock_result = Mock(pages=[Mock(lines=[Mock(content="mocked line 1"), Mock(content="mocked line 2")])])
            mock_result.to_dict.return_value = {
                "api_version": "2023-02-28-preview",
                "model_id": "prebuilt-read",
                "content": "mocked line 1\nmocked line 2\n\f",
                "pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
            }
            # Whatever the converter obtains from client.begin_analyze_document(...).result()
            # is the mocked analysis result configured above.
            mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = mock_result

            # Construct and run while the patch is active so the mocked client is
            # used regardless of when the converter instantiates it.
            component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
            output = component.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"])

        document = output["documents"][0]
        assert document.content == "mocked line 1\nmocked line 2\n\f"
        assert "raw_azure_response" in output
        assert output["raw_azure_response"][0] == {
            "api_version": "2023-02-28-preview",
            "model_id": "prebuilt-read",
            "content": "mocked line 1\nmocked line 2\n\f",
            "pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
        }

    def test_run_with_meta(self):
        """Metadata from the ``meta`` argument is merged over the ``ByteStream`` metadata."""
        bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})

        with patch("haystack.components.converters.azure.DocumentAnalysisClient"):
            # Construct and run while the patch is active so the mocked client is
            # used regardless of when the converter instantiates it.
            component = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
            output = component.run(sources=[bytestream], meta=[{"language": "it"}])

        document = output["documents"][0]

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert document.meta == {"author": "test_author", "language": "it"}

    @pytest.mark.integration
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT"), reason="Azure credentials not available")
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY"), reason="Azure credentials not available")
    def test_run_with_pdf_file(self, test_files_path):
        """Converting the sample PDF yields one document containing text from its pages."""
        component = AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
        )
        output = component.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"])
        documents = output["documents"]
        assert len(documents) == 1
        assert "A sample PDF file" in documents[0].content
        assert "Page 2 of Sample PDF" in documents[0].content
        assert "Page 4 of Sample PDF" in documents[0].content

    @pytest.mark.integration
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT"), reason="Azure credentials not available")
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY"), reason="Azure credentials not available")
    def test_with_image_file(self, test_files_path):
        """Converting the Haystack logo image yields one document containing the logo text."""
        component = AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
        )
        output = component.run(sources=[test_files_path / "images" / "haystack-logo.png"])
        documents = output["documents"]
        assert len(documents) == 1
        assert "haystack" in documents[0].content
        assert "by deepset" in documents[0].content

    @pytest.mark.integration
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT"), reason="Azure credentials not available")
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY"), reason="Azure credentials not available")
    def test_run_with_docx_file(self, test_files_path):
        """Converting the sample DOCX yields one document containing text from its pages."""
        component = AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=os.environ["CORE_AZURE_CS_API_KEY"]
        )
        output = component.run(sources=[test_files_path / "docx" / "sample_docx.docx"])
        documents = output["documents"]
        assert len(documents) == 1
        assert "Sample Docx File" in documents[0].content
        assert "Now we are in Page 2" in documents[0].content
        assert "Page 3 was empty this is page 4" in documents[0].content