Mirror of https://github.com/deepset-ai/haystack.git, synced 2025-07-14 12:31:10 +00:00.

* DOCXToDocument: store DOCX metadata as a dict
* Do not export DOCXMetadata to the converters package
401 lines · 16 KiB · Python
import json
|
|
import os
|
|
import logging
|
|
import pytest
|
|
import csv
|
|
from io import StringIO
|
|
|
|
from haystack import Document, Pipeline
|
|
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
|
|
from haystack.dataclasses import ByteStream
|
|
|
|
|
|
@pytest.fixture
def docx_converter():
    """Provide a DOCXToDocument converter built with default settings."""
    converter = DOCXToDocument()
    return converter
|
|
|
|
|
|
class TestDOCXToDocument:
    """Tests for the DOCXToDocument converter: construction, (de)serialization,
    running on real .docx fixture files, table rendering, and error handling."""

    def test_init(self, docx_converter):
        """The fixture yields a properly constructed converter."""
        assert isinstance(docx_converter, DOCXToDocument)

    def test_init_with_string(self):
        """A table-format string is coerced to the DOCXTableFormat enum."""
        converter = DOCXToDocument(table_format="markdown")
        assert isinstance(converter, DOCXToDocument)
        assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_init_with_invalid_string(self):
        """An unknown table-format string raises ValueError at construction."""
        with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
            DOCXToDocument(table_format="invalid_format")

    def test_to_dict(self):
        """Default serialization records store_full_path=False and the 'csv' table format."""
        converter = DOCXToDocument()
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"store_full_path": False, "table_format": "csv"},
        }

    def test_to_dict_custom_parameters(self):
        """Serialization emits the same string value whether table_format was
        given as a string or as a DOCXTableFormat enum member."""
        converter = DOCXToDocument(table_format="markdown")
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
        }

        converter = DOCXToDocument(table_format="csv")
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"store_full_path": False, "table_format": "csv"},
        }

        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"store_full_path": False, "table_format": "markdown"},
        }

        converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"store_full_path": False, "table_format": "csv"},
        }

    def test_from_dict(self):
        """Deserialization restores the CSV table format."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
        }
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.CSV

    def test_from_dict_custom_parameters(self):
        """Deserialization restores a non-default (markdown) table format."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "markdown"},
        }
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_from_dict_invalid_table_format(self):
        """Deserializing an unknown table format raises ValueError."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "invalid_format"},
        }
        with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
            DOCXToDocument.from_dict(data)

    def test_from_dict_empty_init_parameters(self):
        """Missing init parameters fall back to the CSV default."""
        data = {"type": "haystack.components.converters.docx.DOCXToDocument", "init_parameters": {}}
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.CSV

    def test_pipeline_serde(self):
        """The component round-trips through Pipeline.dumps()/loads() intact."""
        pipeline = Pipeline()
        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        pipeline.add_component("converter", converter)

        pipeline_str = pipeline.dumps()
        assert "haystack.components.converters.docx.DOCXToDocument" in pipeline_str
        assert "table_format" in pipeline_str
        assert "markdown" in pipeline_str

        new_pipeline = Pipeline.loads(pipeline_str)
        new_converter = new_pipeline.get_component("converter")
        assert isinstance(new_converter, DOCXToDocument)
        assert new_converter.table_format == DOCXTableFormat.MARKDOWN

    def test_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly
        """
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        # DOCX core properties are stored as a plain dict under the "docx" meta key
        assert docs[0].meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": {
                "author": "Microsoft Office User",
                "category": "",
                "comments": "",
                "content_status": "",
                "created": "2024-06-09T21:17:00+00:00",
                "identifier": "",
                "keywords": "",
                "language": "",
                "last_modified_by": "Carlos Fernández Lorán",
                "last_printed": None,
                "modified": "2024-06-09T21:27:00+00:00",
                "revision": 2,
                "subject": "",
                "title": "",
                "version": "",
            },
        }

    def test_run_with_table(self, test_files_path):
        """
        Test if the component runs correctly
        """
        docx_converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        paths = [test_files_path / "docx" / "sample_docx.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "Donald Trump" in docs[0].content  ## :-)
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": {
                "author": "Saha, Anirban",
                "category": "",
                "comments": "",
                "content_status": "",
                "created": "2020-07-14T08:14:00+00:00",
                "identifier": "",
                "keywords": "",
                "language": "",
                "last_modified_by": "Saha, Anirban",
                "last_printed": None,
                "modified": "2020-07-14T08:16:00+00:00",
                "revision": 1,
                "subject": "",
                "title": "",
                "version": "",
            },
        }
        # let's now detect that the table markdown is correctly added and that order of elements is correct
        content_parts = docs[0].content.split("\n\n")
        table_index = next(i for i, part in enumerate(content_parts) if "| This | Is | Just a |" in part)
        # check that natural order of the document is preserved
        assert any("Donald Trump" in part for part in content_parts[:table_index]), "Text before table not found"
        assert any("Now we are in Page 2" in part for part in content_parts[table_index + 1 :]), (
            "Text after table not found"
        )

    def test_run_with_store_full_path_false(self, test_files_path):
        """
        Test if the component runs correctly with store_full_path=False
        """
        docx_converter = DOCXToDocument(store_full_path=False)
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        # file_path must be the bare filename, not the full path
        assert docs[0].meta == {
            "file_path": "sample_docx_1.docx",
            "docx": {
                "author": "Microsoft Office User",
                "category": "",
                "comments": "",
                "content_status": "",
                "created": "2024-06-09T21:17:00+00:00",
                "identifier": "",
                "keywords": "",
                "language": "",
                "last_modified_by": "Carlos Fernández Lorán",
                "last_printed": None,
                "modified": "2024-06-09T21:27:00+00:00",
                "revision": 2,
                "subject": "",
                "title": "",
                "version": "",
            },
        }

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_between_two_paragraphs(self, test_files_path, table_format):
        """A table sandwiched between two paragraphs is extracted in both formats."""
        docx_converter = DOCXToDocument(table_format=table_format)
        paths = [test_files_path / "docx" / "sample_docx_3.docx"]
        output = docx_converter.run(sources=paths)

        content = output["documents"][0].content

        # Slice out just the table text between the caption and the next paragraph
        paragraphs_one = content.find("Table: AI Use Cases in Different Industries")
        paragraphs_two = content.find("Paragraph 2:")
        table = content[
            paragraphs_one + len("Table: AI Use Cases in Different Industries") + 1 : paragraphs_two
        ].strip()

        if table_format == "markdown":
            split = list(filter(None, table.split("\n")))
            expected_table_header = "| Industry | AI Use Case | Impact |"
            expected_last_row = "| Finance | Fraud detection and prevention | Reduced financial losses |"

            assert split[0] == expected_table_header
            assert split[-1] == expected_last_row
        if table_format == "csv":  # CSV format
            csv_reader = csv.reader(StringIO(table))
            rows = list(csv_reader)
            assert len(rows) == 3  # Header + 2 data rows
            assert rows[0] == ["Industry", "AI Use Case", "Impact"]
            assert rows[-1] == ["Finance", "Fraud detection and prevention", "Reduced financial losses"]

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_content_correct_parsing(self, test_files_path, table_format):
        """Every row of the extracted table matches the fixture's cell content."""
        docx_converter = DOCXToDocument(table_format=table_format)
        paths = [test_files_path / "docx" / "sample_docx_3.docx"]
        output = docx_converter.run(sources=paths)
        content = output["documents"][0].content

        # Slice out just the table text between the caption and the next paragraph
        paragraphs_one = content.find("Table: AI Use Cases in Different Industries")
        paragraphs_two = content.find("Paragraph 2:")
        table = content[
            paragraphs_one + len("Table: AI Use Cases in Different Industries") + 1 : paragraphs_two
        ].strip()

        if table_format == "markdown":
            split = list(filter(None, table.split("\n")))
            assert len(split) == 4

            expected_table_header = "| Industry | AI Use Case | Impact |"
            expected_table_top_border = "| ---------- | ------------------------------ | ------------------------- |"
            expected_table_row_one = "| Healthcare | Predictive diagnostics | Improved patient outcomes |"
            expected_table_row_two = "| Finance | Fraud detection and prevention | Reduced financial losses |"

            assert split[0] == expected_table_header
            assert split[1] == expected_table_top_border
            assert split[2] == expected_table_row_one
            assert split[3] == expected_table_row_two
        if table_format == "csv":  # CSV format
            csv_reader = csv.reader(StringIO(table))
            rows = list(csv_reader)
            assert len(rows) == 3  # Header + 2 data rows

            expected_header = ["Industry", "AI Use Case", "Impact"]
            expected_row_one = ["Healthcare", "Predictive diagnostics", "Improved patient outcomes"]
            expected_row_two = ["Finance", "Fraud detection and prevention", "Reduced financial losses"]

            assert rows[0] == expected_header
            assert rows[1] == expected_row_one
            assert rows[2] == expected_row_two

    def test_run_with_additional_meta(self, test_files_path, docx_converter):
        """Caller-supplied meta is merged alongside file_path and the docx dict.
        NOTE(review): the top-level 'language'/'author' keys coexist with the
        docx-internal ones — the nested dict is not overwritten."""
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
        doc = output["documents"][0]
        assert doc.meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": {
                "author": "Microsoft Office User",
                "category": "",
                "comments": "",
                "content_status": "",
                "created": "2024-06-09T21:17:00+00:00",
                "identifier": "",
                "keywords": "",
                "language": "",
                "last_modified_by": "Carlos Fernández Lorán",
                "last_printed": None,
                "modified": "2024-06-09T21:27:00+00:00",
                "revision": 2,
                "subject": "",
                "title": "",
                "version": "",
            },
            "language": "it",
            "author": "test_author",
        }

    def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
        """A non-DOCX source is skipped with a warning rather than raising."""
        sources = [str(test_files_path / "txt" / "doc_1.txt")]
        with caplog.at_level(logging.WARNING):
            results = docx_converter.run(sources=sources)
            assert "doc_1.txt and convert it" in caplog.text
            assert results["documents"] == []

    def test_run_error_non_existent_file(self, docx_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=paths)
            assert "Could not read non_existing_file.docx" in caplog.text

    def test_run_page_breaks(self, test_files_path, docx_converter):
        """
        Test if the component correctly parses page breaks.
        """
        paths = [test_files_path / "docx" / "sample_docx_2_page_breaks.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        # page breaks are emitted as form-feed characters
        assert docs[0].content.count("\f") == 4

    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        # one path object and one in-memory ByteStream of the same file
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as f:
            paths.append(ByteStream(f.read()))

        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content

    def test_document_with_docx_metadata_to_dict(self):
        """A Document carrying DOCXMetadata serializes the metadata as a plain
        dict and the result stays JSON-serializable."""
        docx_metadata = DOCXMetadata(
            author="Microsoft Office User",
            category="category",
            comments="comments",
            content_status="",
            created="2024-06-09T21:17:00+00:00",
            identifier="",
            keywords="",
            language="",
            last_modified_by="Carlos Fernández Lorán",
            last_printed=None,
            modified="2024-06-09T21:27:00+00:00",
            revision=2,
            subject="",
            title="",
            version="",
        )
        doc = Document(content="content", meta={"test": 1, "docx": docx_metadata}, id="1")
        assert doc.to_dict(flatten=False) == {
            "blob": None,
            "dataframe": None,
            "content": "content",
            "id": "1",
            "score": None,
            "embedding": None,
            "sparse_embedding": None,
            "meta": {
                "test": 1,
                "docx": {
                    "author": "Microsoft Office User",
                    "category": "category",
                    "comments": "comments",
                    "content_status": "",
                    "created": "2024-06-09T21:17:00+00:00",
                    "identifier": "",
                    "keywords": "",
                    "language": "",
                    "last_modified_by": "Carlos Fernández Lorán",
                    "last_printed": None,
                    "modified": "2024-06-09T21:27:00+00:00",
                    "revision": 2,
                    "subject": "",
                    "title": "",
                    "version": "",
                },
            },
        }

        # check it is JSON serializable
        json_str = json.dumps(doc.to_dict(flatten=False))
        assert json.loads(json_str) == doc.to_dict(flatten=False)
|