haystack/test/components/converters/test_docx_file_to_document.py
Stefano Fiorucci 2828d9e4ae
refactor!: DOCXToDocument converter - store DOCX metadata as a dict (#8804)
* DOCXToDocument - store DOCX metadata as a dict

* do not export DOCXMetadata to converters package
2025-02-05 14:43:19 +01:00

401 lines
16 KiB
Python

import json
import os
import logging
import pytest
import csv
from io import StringIO
from haystack import Document, Pipeline
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
from haystack.dataclasses import ByteStream
@pytest.fixture
def docx_converter():
    """Provide a DOCXToDocument converter built with default settings."""
    converter = DOCXToDocument()
    return converter
class TestDOCXToDocument:
    """Tests for the DOCXToDocument converter: init, (de)serialization, and run behavior."""

    # Expected value of the "docx" meta field for the test fixture
    # sample_docx_1.docx; shared by several tests to avoid repeating
    # the same 15-key literal three times.
    SAMPLE_DOCX_1_METADATA = {
        "author": "Microsoft Office User",
        "category": "",
        "comments": "",
        "content_status": "",
        "created": "2024-06-09T21:17:00+00:00",
        "identifier": "",
        "keywords": "",
        "language": "",
        "last_modified_by": "Carlos Fernández Lorán",
        "last_printed": None,
        "modified": "2024-06-09T21:27:00+00:00",
        "revision": 2,
        "subject": "",
        "title": "",
        "version": "",
    }

    @staticmethod
    def _extract_table(content: str) -> str:
        """Return the table text between the table caption and 'Paragraph 2:' in *content*."""
        caption = "Table: AI Use Cases in Different Industries"
        start = content.find(caption)
        end = content.find("Paragraph 2:")
        # +1 skips the newline right after the caption.
        return content[start + len(caption) + 1 : end].strip()

    def test_init(self, docx_converter):
        """The fixture provides a valid converter instance."""
        assert isinstance(docx_converter, DOCXToDocument)

    def test_init_with_string(self):
        """A table_format given as a string is coerced to the DOCXTableFormat enum."""
        converter = DOCXToDocument(table_format="markdown")
        assert isinstance(converter, DOCXToDocument)
        assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_init_with_invalid_string(self):
        """An unknown table_format string raises ValueError."""
        with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
            DOCXToDocument(table_format="invalid_format")

    def test_to_dict(self):
        """Default parameters serialize to the expected component dict."""
        converter = DOCXToDocument()
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"store_full_path": False, "table_format": "csv"},
        }

    def test_to_dict_custom_parameters(self):
        """table_format serializes to its string value whether given as str or enum."""
        cases = [
            ("markdown", "markdown"),
            ("csv", "csv"),
            (DOCXTableFormat.MARKDOWN, "markdown"),
            (DOCXTableFormat.CSV, "csv"),
        ]
        for table_format, expected in cases:
            converter = DOCXToDocument(table_format=table_format)
            data = converter.to_dict()
            assert data == {
                "type": "haystack.components.converters.docx.DOCXToDocument",
                "init_parameters": {"store_full_path": False, "table_format": expected},
            }

    def test_from_dict(self):
        """from_dict restores the table_format enum from its string value."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
        }
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.CSV

    def test_from_dict_custom_parameters(self):
        """from_dict handles non-default table_format values."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "markdown"},
        }
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_from_dict_invalid_table_format(self):
        """from_dict rejects unknown table_format values with ValueError."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "invalid_format"},
        }
        with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
            DOCXToDocument.from_dict(data)

    def test_from_dict_empty_init_parameters(self):
        """from_dict with no init parameters falls back to the CSV default."""
        data = {"type": "haystack.components.converters.docx.DOCXToDocument", "init_parameters": {}}
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.CSV

    def test_pipeline_serde(self):
        """A pipeline containing the converter round-trips through dumps/loads."""
        pipeline = Pipeline()
        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        pipeline.add_component("converter", converter)
        pipeline_str = pipeline.dumps()
        assert "haystack.components.converters.docx.DOCXToDocument" in pipeline_str
        assert "table_format" in pipeline_str
        assert "markdown" in pipeline_str
        new_pipeline = Pipeline.loads(pipeline_str)
        new_converter = new_pipeline.get_component("converter")
        assert isinstance(new_converter, DOCXToDocument)
        assert new_converter.table_format == DOCXTableFormat.MARKDOWN

    def test_run(self, test_files_path, docx_converter):
        """Converting a DOCX file yields one Document with content and docx metadata."""
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": self.SAMPLE_DOCX_1_METADATA,
        }

    def test_run_with_table(self, test_files_path):
        """Tables are rendered in the chosen format and appear in document order."""
        docx_converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        paths = [test_files_path / "docx" / "sample_docx.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "Donald Trump" in docs[0].content  # name appears in the fixture's text
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": {
                "author": "Saha, Anirban",
                "category": "",
                "comments": "",
                "content_status": "",
                "created": "2020-07-14T08:14:00+00:00",
                "identifier": "",
                "keywords": "",
                "language": "",
                "last_modified_by": "Saha, Anirban",
                "last_printed": None,
                "modified": "2020-07-14T08:16:00+00:00",
                "revision": 1,
                "subject": "",
                "title": "",
                "version": "",
            },
        }
        # Verify the table markdown is present and the document's natural order is preserved.
        content_parts = docs[0].content.split("\n\n")
        table_index = next(i for i, part in enumerate(content_parts) if "| This | Is | Just a |" in part)
        assert any("Donald Trump" in part for part in content_parts[:table_index]), "Text before table not found"
        assert any("Now we are in Page 2" in part for part in content_parts[table_index + 1 :]), (
            "Text after table not found"
        )

    def test_run_with_store_full_path_false(self, test_files_path):
        """With store_full_path=False only the file's basename is stored in meta."""
        docx_converter = DOCXToDocument(store_full_path=False)
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": "sample_docx_1.docx",
            "docx": self.SAMPLE_DOCX_1_METADATA,
        }

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_between_two_paragraphs(self, test_files_path, table_format):
        """A table sandwiched between two paragraphs is rendered in the requested format."""
        docx_converter = DOCXToDocument(table_format=table_format)
        paths = [test_files_path / "docx" / "sample_docx_3.docx"]
        output = docx_converter.run(sources=paths)
        table = self._extract_table(output["documents"][0].content)
        if table_format == "markdown":
            split = list(filter(None, table.split("\n")))
            expected_table_header = "| Industry | AI Use Case | Impact |"
            expected_last_row = "| Finance | Fraud detection and prevention | Reduced financial losses |"
            assert split[0] == expected_table_header
            assert split[-1] == expected_last_row
        else:  # CSV format
            csv_reader = csv.reader(StringIO(table))
            rows = list(csv_reader)
            assert len(rows) == 3  # Header + 2 data rows
            assert rows[0] == ["Industry", "AI Use Case", "Impact"]
            assert rows[-1] == ["Finance", "Fraud detection and prevention", "Reduced financial losses"]

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_content_correct_parsing(self, test_files_path, table_format):
        """Every row and cell of the table is parsed correctly in both formats."""
        docx_converter = DOCXToDocument(table_format=table_format)
        paths = [test_files_path / "docx" / "sample_docx_3.docx"]
        output = docx_converter.run(sources=paths)
        table = self._extract_table(output["documents"][0].content)
        if table_format == "markdown":
            split = list(filter(None, table.split("\n")))
            assert len(split) == 4
            expected_table_header = "| Industry | AI Use Case | Impact |"
            expected_table_top_border = "| ---------- | ------------------------------ | ------------------------- |"
            expected_table_row_one = "| Healthcare | Predictive diagnostics | Improved patient outcomes |"
            expected_table_row_two = "| Finance | Fraud detection and prevention | Reduced financial losses |"
            assert split[0] == expected_table_header
            assert split[1] == expected_table_top_border
            assert split[2] == expected_table_row_one
            assert split[3] == expected_table_row_two
        else:  # CSV format
            csv_reader = csv.reader(StringIO(table))
            rows = list(csv_reader)
            assert len(rows) == 3  # Header + 2 data rows
            expected_header = ["Industry", "AI Use Case", "Impact"]
            expected_row_one = ["Healthcare", "Predictive diagnostics", "Improved patient outcomes"]
            expected_row_two = ["Finance", "Fraud detection and prevention", "Reduced financial losses"]
            assert rows[0] == expected_header
            assert rows[1] == expected_row_one
            assert rows[2] == expected_row_two

    def test_run_with_additional_meta(self, test_files_path, docx_converter):
        """User-supplied meta is merged at the top level alongside the docx metadata."""
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
        doc = output["documents"][0]
        assert doc.meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": self.SAMPLE_DOCX_1_METADATA,
            "language": "it",
            "author": "test_author",
        }

    def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
        """A non-DOCX source is skipped with a warning instead of raising."""
        sources = [str(test_files_path / "txt" / "doc_1.txt")]
        with caplog.at_level(logging.WARNING):
            results = docx_converter.run(sources=sources)
            assert "doc_1.txt and convert it" in caplog.text
            assert results["documents"] == []

    def test_run_error_non_existent_file(self, docx_converter, caplog):
        """A missing source file is reported with a warning instead of raising."""
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=paths)
            assert "Could not read non_existing_file.docx" in caplog.text

    def test_run_page_breaks(self, test_files_path, docx_converter):
        """Page breaks in the DOCX are converted to form-feed characters."""
        paths = [test_files_path / "docx" / "sample_docx_2_page_breaks.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content.count("\f") == 4

    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """Path and ByteStream sources can be mixed in a single run call."""
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as f:
            paths.append(ByteStream(f.read()))
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content

    def test_document_with_docx_metadata_to_dict(self):
        """A Document carrying DOCXMetadata serializes to a plain, JSON-safe dict."""
        docx_metadata = DOCXMetadata(
            author="Microsoft Office User",
            category="category",
            comments="comments",
            content_status="",
            created="2024-06-09T21:17:00+00:00",
            identifier="",
            keywords="",
            language="",
            last_modified_by="Carlos Fernández Lorán",
            last_printed=None,
            modified="2024-06-09T21:27:00+00:00",
            revision=2,
            subject="",
            title="",
            version="",
        )
        doc = Document(content="content", meta={"test": 1, "docx": docx_metadata}, id="1")
        assert doc.to_dict(flatten=False) == {
            "blob": None,
            "dataframe": None,
            "content": "content",
            "id": "1",
            "score": None,
            "embedding": None,
            "sparse_embedding": None,
            "meta": {
                "test": 1,
                "docx": {
                    "author": "Microsoft Office User",
                    "category": "category",
                    "comments": "comments",
                    "content_status": "",
                    "created": "2024-06-09T21:17:00+00:00",
                    "identifier": "",
                    "keywords": "",
                    "language": "",
                    "last_modified_by": "Carlos Fernández Lorán",
                    "last_printed": None,
                    "modified": "2024-06-09T21:27:00+00:00",
                    "revision": 2,
                    "subject": "",
                    "title": "",
                    "version": "",
                },
            },
        }
        # check it is JSON serializable
        json_str = json.dumps(doc.to_dict(flatten=False))
        assert json.loads(json_str) == doc.to_dict(flatten=False)