2023-09-26 15:57:55 +02:00
|
|
|
import os
|
|
|
|
from unittest.mock import patch, Mock
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2023-11-24 14:48:43 +01:00
|
|
|
from haystack.components.converters.azure import AzureOCRDocumentConverter
|
2023-12-15 16:41:35 +01:00
|
|
|
from haystack.dataclasses import ByteStream
|
2023-09-26 15:57:55 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestAzureOCRDocumentConverter:
    """Unit and integration tests for ``AzureOCRDocumentConverter``."""

    def test_init_fail_wo_api_key(self, monkeypatch):
        """Building the converter without any API key must raise ``ValueError``."""
        # Remove the variable so a value from the surrounding environment cannot influence the test.
        monkeypatch.delenv("AZURE_AI_API_KEY", raising=False)
        expected_message = "AzureOCRDocumentConverter expects an Azure Credential key"
        with pytest.raises(ValueError, match=expected_message):
            AzureOCRDocumentConverter(endpoint="test_endpoint")

    def test_to_dict(self):
        """Check the serialized form of a converter built with an explicit key."""
        converter = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
        expected = {
            "type": "haystack.components.converters.azure.AzureOCRDocumentConverter",
            "init_parameters": {"endpoint": "test_endpoint", "model_id": "prebuilt-read"},
        }
        assert converter.to_dict() == expected

    def test_run(self, test_files_path):
        """Run the converter against a mocked Azure client; check the document and raw response."""

        def azure_payload():
            # Return a fresh dict on every call so the mocked return value and the
            # expected value are never the same object.
            return {
                "api_version": "2023-02-28-preview",
                "model_id": "prebuilt-read",
                "content": "mocked line 1\nmocked line 2\n\f",
                "pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
            }

        with patch("haystack.components.converters.azure.DocumentAnalysisClient") as mock_azure_client:
            mocked_lines = [Mock(content="mocked line 1"), Mock(content="mocked line 2")]
            analysis_result = Mock(pages=[Mock(lines=mocked_lines)])
            analysis_result.to_dict.return_value = azure_payload()

            # client instance -> begin_analyze_document() -> poller.result() -> analysis result
            poller = mock_azure_client.return_value.begin_analyze_document.return_value
            poller.result.return_value = analysis_result

            converter = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
            pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
            output = converter.run(sources=[pdf_path])

            first_document = output["documents"][0]
            assert first_document.content == "mocked line 1\nmocked line 2\n\f"
            assert "raw_azure_response" in output
            assert output["raw_azure_response"][0] == azure_payload()

    def test_run_with_meta(self):
        """Metadata passed to ``run`` must be combined with the metadata carried by the source."""
        source_meta = {"author": "test_author", "language": "en"}
        bytestream = ByteStream(data=b"test", meta=source_meta)

        with patch("haystack.components.converters.azure.DocumentAnalysisClient"):
            converter = AzureOCRDocumentConverter(endpoint="test_endpoint", api_key="test_credential_key")
            output = converter.run(sources=[bytestream], meta=[{"language": "it"}])

        resulting_meta = output["documents"][0].meta

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert resulting_meta == {"author": "test_author", "language": "it"}

    @pytest.mark.integration
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT"), reason="Azure credentials not available")
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY"), reason="Azure credentials not available")
    def test_run_with_pdf_file(self, test_files_path):
        """Convert the sample PDF through the real service and look for text from several pages."""
        endpoint = os.environ["CORE_AZURE_CS_ENDPOINT"]
        api_key = os.environ["CORE_AZURE_CS_API_KEY"]
        converter = AzureOCRDocumentConverter(endpoint=endpoint, api_key=api_key)

        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"
        documents = converter.run(sources=[pdf_path])["documents"]

        assert len(documents) == 1
        for snippet in ("A sample PDF file", "Page 2 of Sample PDF", "Page 4 of Sample PDF"):
            assert snippet in documents[0].content

    @pytest.mark.integration
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT"), reason="Azure credentials not available")
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY"), reason="Azure credentials not available")
    def test_with_image_file(self, test_files_path):
        """Convert the logo image through the real service and look for the recognised words."""
        endpoint = os.environ["CORE_AZURE_CS_ENDPOINT"]
        api_key = os.environ["CORE_AZURE_CS_API_KEY"]
        converter = AzureOCRDocumentConverter(endpoint=endpoint, api_key=api_key)

        image_path = test_files_path / "images" / "haystack-logo.png"
        documents = converter.run(sources=[image_path])["documents"]

        assert len(documents) == 1
        for snippet in ("haystack", "by deepset"):
            assert snippet in documents[0].content

    @pytest.mark.integration
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_ENDPOINT"), reason="Azure credentials not available")
    @pytest.mark.skipif(not os.environ.get("CORE_AZURE_CS_API_KEY"), reason="Azure credentials not available")
    def test_run_with_docx_file(self, test_files_path):
        """Convert the sample DOCX through the real service and look for text from several pages."""
        endpoint = os.environ["CORE_AZURE_CS_ENDPOINT"]
        api_key = os.environ["CORE_AZURE_CS_API_KEY"]
        converter = AzureOCRDocumentConverter(endpoint=endpoint, api_key=api_key)

        docx_path = test_files_path / "docx" / "sample_docx.docx"
        documents = converter.run(sources=[docx_path])["documents"]

        assert len(documents) == 1
        for snippet in ("Sample Docx File", "Now we are in Page 2", "Page 3 was empty this is page 4"):
            assert snippet in documents[0].content
|