2023-09-26 15:57:55 +02:00
|
|
|
import os
|
|
|
|
from unittest.mock import patch, Mock
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2023-11-24 14:48:43 +01:00
|
|
|
from haystack.components.converters.azure import AzureOCRDocumentConverter
|
2023-12-15 16:41:35 +01:00
|
|
|
from haystack.dataclasses import ByteStream
|
2024-02-05 13:17:01 +01:00
|
|
|
from haystack.utils import Secret
|
2023-09-26 15:57:55 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TestAzureOCRDocumentConverter:
    """Unit and integration tests for ``AzureOCRDocumentConverter``.

    The unit tests patch ``EnvVarSecret.resolve_value`` and ``DocumentAnalysisClient``.
    The tests marked ``integration`` are skipped unless both ``CORE_AZURE_CS_ENDPOINT``
    and ``CORE_AZURE_CS_API_KEY`` are set to non-empty values in the environment.
    """

    # Shared by the integration tests: skip when either credential variable is missing or empty.
    _requires_azure_credentials = pytest.mark.skipif(
        not (os.environ.get("CORE_AZURE_CS_ENDPOINT") and os.environ.get("CORE_AZURE_CS_API_KEY")),
        reason="Azure credentials not available",
    )

    @staticmethod
    def _mocked_raw_response():
        """Return a fresh copy of the dict the mocked analysis result serializes to."""
        return {
            "api_version": "2023-02-28-preview",
            "model_id": "prebuilt-read",
            "content": "mocked line 1\nmocked line 2\n\f",
            "pages": [{"lines": [{"content": "mocked line 1"}, {"content": "mocked line 2"}]}],
        }

    @staticmethod
    def _live_converter():
        """Build a converter from the ``CORE_AZURE_CS_*`` environment variables."""
        return AzureOCRDocumentConverter(
            endpoint=os.environ["CORE_AZURE_CS_ENDPOINT"], api_key=Secret.from_env_var("CORE_AZURE_CS_API_KEY")
        )

    def test_init_fail_wo_api_key(self, monkeypatch):
        """Constructing the converter with ``AZURE_AI_API_KEY`` unset raises ``ValueError``."""
        # raising=False: the variable may already be absent in the test environment.
        monkeypatch.delenv("AZURE_AI_API_KEY", raising=False)

        with pytest.raises(ValueError):
            AzureOCRDocumentConverter(endpoint="test_endpoint")

    @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
    def test_to_dict(self, mock_resolve_value):
        """``to_dict`` emits the component type and its init parameters."""
        mock_resolve_value.return_value = "test_api_key"
        expected_init_parameters = {
            "api_key": {"env_vars": ["AZURE_AI_API_KEY"], "strict": True, "type": "env_var"},
            "endpoint": "test_endpoint",
            "model_id": "prebuilt-read",
        }

        serialized = AzureOCRDocumentConverter(endpoint="test_endpoint").to_dict()

        assert serialized == {
            "type": "haystack.components.converters.azure.AzureOCRDocumentConverter",
            "init_parameters": expected_init_parameters,
        }

    @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
    def test_run(self, mock_resolve_value, test_files_path):
        """``run`` returns the document content and the raw response of the mocked client."""
        mock_resolve_value.return_value = "test_api_key"
        line_texts = ("mocked line 1", "mocked line 2")

        with patch("haystack.components.converters.azure.DocumentAnalysisClient") as client_cls:
            analysis = Mock(pages=[Mock(lines=[Mock(content=text) for text in line_texts])])
            analysis.to_dict.return_value = self._mocked_raw_response()
            poller = client_cls.return_value.begin_analyze_document.return_value
            poller.result.return_value = analysis

            converter = AzureOCRDocumentConverter(endpoint="test_endpoint")
            output = converter.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"])

        first_document = output["documents"][0]
        assert first_document.content == "mocked line 1\nmocked line 2\n\f"
        assert "raw_azure_response" in output
        # Compared against a freshly built dict, not the object handed to the mock.
        assert output["raw_azure_response"][0] == self._mocked_raw_response()

    @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
    def test_run_with_meta(self, mock_resolve_value, test_files_path):
        """The ``meta`` argument of ``run`` is merged into the metadata of every source."""
        mock_resolve_value.return_value = "test_api_key"
        stream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"})
        pdf_path = test_files_path / "pdf" / "sample_pdf_1.pdf"

        with patch("haystack.components.converters.azure.DocumentAnalysisClient"):
            converter = AzureOCRDocumentConverter(endpoint="test_endpoint")
            output = converter.run(sources=[stream, pdf_path], meta={"language": "it"})

        # check that the metadata from the bytestream is merged with that from the meta parameter
        from_stream, from_path = output["documents"][0], output["documents"][1]
        assert from_stream.meta["author"] == "test_author"
        assert from_stream.meta["language"] == "it"
        assert from_path.meta["language"] == "it"

    @pytest.mark.integration
    @_requires_azure_credentials
    def test_run_with_pdf_file(self, test_files_path):
        """A sample PDF converted through the live service yields one document with the expected text."""
        output = self._live_converter().run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"])

        documents = output["documents"]
        assert len(documents) == 1
        for snippet in ("A sample PDF file", "Page 2 of Sample PDF", "Page 4 of Sample PDF"):
            assert snippet in documents[0].content

    @pytest.mark.integration
    @_requires_azure_credentials
    def test_with_image_file(self, test_files_path):
        """A PNG logo converted through the live service yields one document with the expected text."""
        output = self._live_converter().run(sources=[test_files_path / "images" / "haystack-logo.png"])

        documents = output["documents"]
        assert len(documents) == 1
        for snippet in ("haystack", "by deepset"):
            assert snippet in documents[0].content

    @pytest.mark.integration
    @_requires_azure_credentials
    def test_run_with_docx_file(self, test_files_path):
        """A sample DOCX converted through the live service yields one document with the expected text."""
        output = self._live_converter().run(sources=[test_files_path / "docx" / "sample_docx.docx"])

        documents = output["documents"]
        assert len(documents) == 1
        for snippet in ("Sample Docx File", "Now we are in Page 2", "Page 3 was empty this is page 4"):
            assert snippet in documents[0].content
|