mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-07-17 05:51:22 +00:00

* first functioning DocxFileToDocument * fix lazy import message * add reno * Add license header Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * change DocxFileToDocument to DocxToDocument * Update library install to the maintained version Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * clean try-except to only take non haystack errors into account * Add warning on docstring of component ignoring page breaks, mark test as skip * make warnings lazy evaluations Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * make warnings lazy evaluations Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * Make warnings lazy evaluated Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * Solve f bug * Get more metadata from docx files * add 'python-docx' dependency and docs * Change logging import Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * Fix typo Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * remake metadata extraction for docx * solve bug regarding _get_docx_metadata method * Update haystack/components/converters/docx.py Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * Update haystack/components/converters/docx.py Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com> * Delete unused test --------- Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>
64 lines
2.3 KiB
Python
64 lines
2.3 KiB
Python
import logging
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from haystack.dataclasses import ByteStream
|
|
from haystack.components.converters import DocxToDocument
|
|
|
|
|
|
@pytest.fixture
def docx_converter():
    """Provide a ``DocxToDocument`` instance built with its default arguments."""
    converter = DocxToDocument()
    return converter
|
|
|
|
|
|
class TestDocxToDocument:
    """Tests for the ``DocxToDocument`` converter component."""

    def test_init(self, docx_converter):
        """
        Test that the fixture hands over a ``DocxToDocument`` instance.
        """
        assert isinstance(docx_converter, DocxToDocument)

    @pytest.mark.integration
    def test_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly
        """
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content

    def test_run_with_meta(self, test_files_path, docx_converter):
        """
        Test that the values passed via ``meta`` end up on the produced document.
        """
        # Deliberately no mocking: patching the ``DocxToDocument`` name inside the
        # converter module only rebinds that module attribute and cannot affect the
        # instance the fixture has already built, so the real converter runs anyway.
        output = docx_converter.run(
            sources=[test_files_path / "docx" / "sample_docx_1.docx"],
            meta={"language": "it", "author": "test_author"},
        )

        # check that the entries from the meta parameter are present in the document's metadata
        document = output["documents"][0]
        assert document.meta["author"] == "test_author"
        assert document.meta["language"] == "it"

    def test_run_error_handling(self, docx_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=paths)
            # an unreadable source is expected to be reported through a warning log record
            assert "Could not read non_existing_file.docx" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        docx_path = test_files_path / "docx" / "sample_docx_1.docx"

        # feed the same file twice: once as a path and once as in-memory bytes
        paths = [docx_path]
        with open(docx_path, "rb") as f:
            paths.append(ByteStream(f.read()))

        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content
|