# haystack/test/components/converters/test_docx_file_to_document.py

import csv
import json
import logging
import os
from io import StringIO

import pytest

from haystack import Document, Pipeline
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
from haystack.dataclasses import ByteStream


@pytest.fixture
def docx_converter():
    return DOCXToDocument()


class TestDOCXToDocument:
def test_init(self, docx_converter):
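        """
        Test that the fixture provides a DOCXToDocument instance
        """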
assert isinstance(docx_converter, DOCXToDocument)

    def test_init_with_string(self):
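        """
        Test initialization with the table format given as a string
        """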
converter = DOCXToDocument(table_format="markdown")
assert isinstance(converter, DOCXToDocument)
assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_init_with_invalid_string(self):
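        """
        Test that an unknown table format string raises a ValueError
        """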
with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
DOCXToDocument(table_format="invalid_format")

    def test_to_dict(self):
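        """
        Test serialization of the component with default parameters
        """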
converter = DOCXToDocument()
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "csv"},
}

    def test_to_dict_custom_parameters(self):
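        """
        Test serialization of the component with custom table formats
        """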
converter = DOCXToDocument(table_format="markdown")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
}
converter = DOCXToDocument(table_format="csv")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "csv"},
}
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "markdown"},
}
converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"store_full_path": False, "table_format": "csv"},
}

    def test_from_dict(self):
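        """
        Test deserialization of the component from a dictionary
        """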
data = {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "csv"},
}
converter = DOCXToDocument.from_dict(data)
assert converter.table_format == DOCXTableFormat.CSV

    def test_from_dict_custom_parameters(self):
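        """
        Test deserialization with a custom table format
        """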
data = {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "markdown"},
}
converter = DOCXToDocument.from_dict(data)
assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_from_dict_invalid_table_format(self):
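        """
        Test that deserializing an unknown table format raises a ValueError
        """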
data = {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "invalid_format"},
}
with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
DOCXToDocument.from_dict(data)

    def test_from_dict_empty_init_parameters(self):
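        """
        Test that missing init parameters fall back to the defaults
        """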
data = {"type": "haystack.components.converters.docx.DOCXToDocument", "init_parameters": {}}
converter = DOCXToDocument.from_dict(data)
assert converter.table_format == DOCXTableFormat.CSV

    def test_pipeline_serde(self):
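        """
        Test serializing and deserializing a pipeline that contains the component
        """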
pipeline = Pipeline()
converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
pipeline.add_component("converter", converter)
pipeline_str = pipeline.dumps()
assert "haystack.components.converters.docx.DOCXToDocument" in pipeline_str
assert "table_format" in pipeline_str
assert "markdown" in pipeline_str
new_pipeline = Pipeline.loads(pipeline_str)
new_converter = new_pipeline.get_component("converter")
assert isinstance(new_converter, DOCXToDocument)
assert new_converter.table_format == DOCXTableFormat.MARKDOWN

    def test_run(self, test_files_path, docx_converter):
"""
Test if the component runs correctly
"""
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "History" in docs[0].content
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": os.path.basename(paths[0]),
"docx": {
"author": "Microsoft Office User",
"category": "",
"comments": "",
"content_status": "",
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
}

    def test_run_with_table(self, test_files_path):
"""
Test if the component runs correctly
"""
docx_converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
paths = [test_files_path / "docx" / "sample_docx.docx"]
output = docx_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "Donald Trump" in docs[0].content ## :-)
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": os.path.basename(paths[0]),
"docx": {
"author": "Saha, Anirban",
"category": "",
"comments": "",
"content_status": "",
"created": "2020-07-14T08:14:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Saha, Anirban",
"last_printed": None,
"modified": "2020-07-14T08:16:00+00:00",
"revision": 1,
"subject": "",
"title": "",
"version": "",
},
}
        # now check that the table markdown was correctly added and that the order of elements is preserved
content_parts = docs[0].content.split("\n\n")
table_index = next(i for i, part in enumerate(content_parts) if "| This | Is | Just a |" in part)
# check that natural order of the document is preserved
assert any("Donald Trump" in part for part in content_parts[:table_index]), "Text before table not found"
assert any("Now we are in Page 2" in part for part in content_parts[table_index + 1 :]), (
"Text after table not found"
)

    def test_run_with_store_full_path_false(self, test_files_path):
"""
Test if the component runs correctly with store_full_path=False
"""
docx_converter = DOCXToDocument(store_full_path=False)
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert "History" in docs[0].content
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": "sample_docx_1.docx",
"docx": {
"author": "Microsoft Office User",
"category": "",
"comments": "",
"content_status": "",
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
}

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
def test_table_between_two_paragraphs(self, test_files_path, table_format):
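        """
        Test that a table placed between two paragraphs is extracted in both table formats
        """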
docx_converter = DOCXToDocument(table_format=table_format)
paths = [test_files_path / "docx" / "sample_docx_3.docx"]
output = docx_converter.run(sources=paths)
content = output["documents"][0].content
paragraphs_one = content.find("Table: AI Use Cases in Different Industries")
paragraphs_two = content.find("Paragraph 2:")
table = content[
paragraphs_one + len("Table: AI Use Cases in Different Industries") + 1 : paragraphs_two
].strip()
if table_format == "markdown":
split = list(filter(None, table.split("\n")))
expected_table_header = "| Industry | AI Use Case | Impact |"
expected_last_row = "| Finance | Fraud detection and prevention | Reduced financial losses |"
assert split[0] == expected_table_header
assert split[-1] == expected_last_row
if table_format == "csv": # CSV format
csv_reader = csv.reader(StringIO(table))
rows = list(csv_reader)
assert len(rows) == 3 # Header + 2 data rows
assert rows[0] == ["Industry", "AI Use Case", "Impact"]
assert rows[-1] == ["Finance", "Fraud detection and prevention", "Reduced financial losses"]

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
def test_table_content_correct_parsing(self, test_files_path, table_format):
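        """
        Test that table content is parsed into the expected header and rows in both table formats
        """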
docx_converter = DOCXToDocument(table_format=table_format)
paths = [test_files_path / "docx" / "sample_docx_3.docx"]
output = docx_converter.run(sources=paths)
content = output["documents"][0].content
paragraphs_one = content.find("Table: AI Use Cases in Different Industries")
paragraphs_two = content.find("Paragraph 2:")
table = content[
paragraphs_one + len("Table: AI Use Cases in Different Industries") + 1 : paragraphs_two
].strip()
if table_format == "markdown":
split = list(filter(None, table.split("\n")))
assert len(split) == 4
expected_table_header = "| Industry | AI Use Case | Impact |"
expected_table_top_border = "| ---------- | ------------------------------ | ------------------------- |"
expected_table_row_one = "| Healthcare | Predictive diagnostics | Improved patient outcomes |"
expected_table_row_two = "| Finance | Fraud detection and prevention | Reduced financial losses |"
assert split[0] == expected_table_header
assert split[1] == expected_table_top_border
assert split[2] == expected_table_row_one
assert split[3] == expected_table_row_two
if table_format == "csv": # CSV format
csv_reader = csv.reader(StringIO(table))
rows = list(csv_reader)
assert len(rows) == 3 # Header + 2 data rows
expected_header = ["Industry", "AI Use Case", "Impact"]
expected_row_one = ["Healthcare", "Predictive diagnostics", "Improved patient outcomes"]
expected_row_two = ["Finance", "Fraud detection and prevention", "Reduced financial losses"]
assert rows[0] == expected_header
assert rows[1] == expected_row_one
assert rows[2] == expected_row_two

    def test_run_with_additional_meta(self, test_files_path, docx_converter):
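        """
        Test that additional metadata passed to run is merged into the document metadata
        """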
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
doc = output["documents"][0]
assert doc.meta == {
"file_path": os.path.basename(paths[0]),
"docx": {
"author": "Microsoft Office User",
"category": "",
"comments": "",
"content_status": "",
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
"language": "it",
"author": "test_author",
}

    def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
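        """
        Test that an unsupported file type logs a warning and produces no documents
        """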
sources = [str(test_files_path / "txt" / "doc_1.txt")]
with caplog.at_level(logging.WARNING):
results = docx_converter.run(sources=sources)
assert "doc_1.txt and convert it" in caplog.text
assert results["documents"] == []

    def test_run_error_non_existent_file(self, docx_converter, caplog):
"""
Test if the component correctly handles errors.
"""
paths = ["non_existing_file.docx"]
with caplog.at_level(logging.WARNING):
docx_converter.run(sources=paths)
assert "Could not read non_existing_file.docx" in caplog.text

    def test_run_page_breaks(self, test_files_path, docx_converter):
"""
Test if the component correctly parses page breaks.
"""
paths = [test_files_path / "docx" / "sample_docx_2_page_breaks.docx"]
output = docx_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
assert docs[0].content.count("\f") == 4

    def test_mixed_sources_run(self, test_files_path, docx_converter):
"""
Test if the component runs correctly when mixed sources are provided.
"""
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as f:
paths.append(ByteStream(f.read()))
output = docx_converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 2
assert "History and standardization" in docs[0].content
assert "History and standardization" in docs[1].content

    def test_document_with_docx_metadata_to_dict(self):
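        """
        Test that a Document with DOCXMetadata serializes to a dictionary and is JSON serializable
        """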
docx_metadata = DOCXMetadata(
author="Microsoft Office User",
category="category",
comments="comments",
content_status="",
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
version="",
)
doc = Document(content="content", meta={"test": 1, "docx": docx_metadata}, id="1")
assert doc.to_dict(flatten=False) == {
"blob": None,
"dataframe": None,
"content": "content",
"id": "1",
"score": None,
"embedding": None,
"sparse_embedding": None,
"meta": {
"test": 1,
"docx": {
"author": "Microsoft Office User",
"category": "category",
"comments": "comments",
"content_status": "",
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
"version": "",
},
},
}
# check it is JSON serializable
json_str = json.dumps(doc.to_dict(flatten=False))
assert json.loads(json_str) == doc.to_dict(flatten=False)