2024-06-12 11:58:36 +02:00
|
|
|
import logging
|
2024-06-19 15:48:31 +02:00
|
|
|
import datetime
|
2024-06-12 11:58:36 +02:00
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from haystack.dataclasses import ByteStream
|
2024-06-19 15:48:31 +02:00
|
|
|
from haystack import Document
|
2024-06-27 08:19:01 +02:00
|
|
|
from haystack.components.converters.docx import DOCXToDocument, DOCXMetadata
|
2024-06-12 11:58:36 +02:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def docx_converter():
    """
    Provide a DOCXToDocument converter constructed with no arguments.
    """
    converter = DOCXToDocument()
    return converter
|
2024-06-12 11:58:36 +02:00
|
|
|
|
|
|
|
|
2024-06-27 08:19:01 +02:00
|
|
|
class TestDOCXToDocument:
    """
    Tests for the DOCXToDocument converter and for DOCXMetadata stored in Document meta.
    """

    @staticmethod
    def _sample_docx_1_metadata():
        """
        Build the DOCXMetadata the tests expect for sample_docx_1.docx.
        """
        return DOCXMetadata(
            author="Microsoft Office User",
            category="",
            comments="",
            content_status="",
            created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
            identifier="",
            keywords="",
            language="",
            last_modified_by="Carlos Fernández Lorán",
            last_printed=None,
            modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
            revision=2,
            subject="",
            title="",
            version="",
        )

    def test_init(self, docx_converter):
        """
        Test that the fixture yields a DOCXToDocument instance.
        """
        assert isinstance(docx_converter, DOCXToDocument)

    def test_run(self, test_files_path, docx_converter):
        """
        Test that a single DOCX file is converted into one document with the expected metadata.
        """
        sources = [test_files_path / "docx" / "sample_docx_1.docx"]
        result = docx_converter.run(sources=sources)
        documents = result["documents"]

        assert len(documents) == 1
        converted = documents[0]
        assert "History" in converted.content
        assert converted.meta.keys() == {"file_path", "docx"}

        expected_meta = {
            "file_path": str(sources[0]),
            "docx": self._sample_docx_1_metadata(),
        }
        assert converted.meta == expected_meta

    def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
        """
        Test that user-supplied meta is added next to the metadata extracted from the file.
        """
        sources = [test_files_path / "docx" / "sample_docx_1.docx"]
        user_meta = {"language": "it", "author": "test_author"}
        result = docx_converter.run(sources=sources, meta=user_meta)
        converted = result["documents"][0]

        # The user-supplied keys sit at the top level of meta; the "docx" entry keeps
        # the values read from the file.
        expected_meta = {
            "file_path": str(sources[0]),
            "docx": self._sample_docx_1_metadata(),
            "language": "it",
            "author": "test_author",
        }
        assert converted.meta == expected_meta

    def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
        """
        Test that a non-DOCX source logs a warning and produces no documents.
        """
        txt_sources = [str(test_files_path / "txt" / "doc_1.txt")]
        with caplog.at_level(logging.WARNING):
            outcome = docx_converter.run(sources=txt_sources)
            assert "doc_1.txt and convert it" in caplog.text
            assert outcome["documents"] == []

    def test_run_error_non_existent_file(self, test_files_path, docx_converter, caplog):
        """
        Test that a missing source file is reported through a logged warning.
        """
        missing_sources = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=missing_sources)
            assert "Could not read non_existing_file.docx" in caplog.text

    def test_run_page_breaks(self, test_files_path, docx_converter):
        """
        Test that page breaks in the source file appear as form feed characters in the content.
        """
        sources = [test_files_path / "docx" / "sample_docx_2_page_breaks.docx"]
        result = docx_converter.run(sources=sources)
        documents = result["documents"]

        assert len(documents) == 1
        assert documents[0].content.count("\f") == 4

    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        Test that a file path and a ByteStream can be converted in the same run.
        """
        sources = [test_files_path / "docx" / "sample_docx_1.docx"]
        with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as docx_file:
            raw_bytes = docx_file.read()
            sources.append(ByteStream(raw_bytes))

        result = docx_converter.run(sources=sources)
        documents = result["documents"]

        assert len(documents) == 2
        for converted in documents:
            assert "History and standardization" in converted.content

    def test_document_with_docx_metadata_to_dict(self):
        """
        Test that DOCXMetadata held in a Document's meta comes out of to_dict as a plain dict.
        """
        # One mapping feeds both the DOCXMetadata constructor and the expected output,
        # since to_dict is expected to return exactly these field/value pairs.
        docx_fields = {
            "author": "Microsoft Office User",
            "category": "category",
            "comments": "comments",
            "content_status": "",
            "created": datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
            "identifier": "",
            "keywords": "",
            "language": "",
            "last_modified_by": "Carlos Fernández Lorán",
            "last_printed": None,
            "modified": datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
            "revision": 2,
            "subject": "",
            "title": "",
            "version": "",
        }
        document = Document(
            content="content",
            meta={"test": 1, "docx": DOCXMetadata(**docx_fields)},
            id="1",
        )

        expected = {
            "blob": None,
            "dataframe": None,
            "content": "content",
            "id": "1",
            "score": None,
            "embedding": None,
            "sparse_embedding": None,
            "meta": {
                "test": 1,
                "docx": docx_fields,
            },
        }
        assert document.to_dict(flatten=False) == expected
|