haystack/test/components/converters/test_docx_file_to_document.py
Jon Strutz 471f07c8fe
fix: extract page breaks from .docx files (#8232)
* fix: extract page breaks from .docx files

Context: Currently, DOCXToDocument does not extract page breaks from
word documents. This makes it impossible to do things like split by page
or get correct page number metadata after using something like
DocumentSplitter. For example, if you split by word, the 'page_number'
metadata field will be 1 for all documents.

Solution: Added a method to DOCXToDocument that extracts page breaks
from word documents as '\f' characters so that they are recognized by
DocumentSplitter.

Caveat: Due to the way the python-docx library is set up, you can only
accurately determine the location of the first page break for a given
paragraph. In the rare case that a paragraph contains more than one page
break (which means it is an extremely long paragraph spanning multiple
pages), the 2nd, 3rd, etc. page break locations are not known. To sort
of fix this, I just appended the page break characters to the end of
the paragraph text to keep the overall page number values for the
document consistent.

* Apply suggestions from code review

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
2024-08-21 09:48:02 +00:00

166 lines
6.2 KiB
Python

import logging
import datetime
import pytest
from haystack.dataclasses import ByteStream
from haystack import Document
from haystack.components.converters.docx import DOCXToDocument, DOCXMetadata
@pytest.fixture
def docx_converter():
    """
    Provide a DOCXToDocument converter constructed with its default arguments.
    """
    converter = DOCXToDocument()
    return converter
class TestDOCXToDocument:
    """
    Tests for the DOCXToDocument converter and for serializing a Document that carries DOCXMetadata.
    """

    @staticmethod
    def _sample_docx_fields(**overrides):
        """
        Return the DOCXMetadata field values these tests expect for sample_docx_1.docx.

        Keyword arguments replace the matching default field values. A new dict (with new
        datetime objects) is built on every call, so callers may use the result freely.
        """
        utc = datetime.timezone.utc
        fields = {
            "author": "Microsoft Office User",
            "category": "",
            "comments": "",
            "content_status": "",
            "created": datetime.datetime(2024, 6, 9, 21, 17, tzinfo=utc),
            "identifier": "",
            "keywords": "",
            "language": "",
            "last_modified_by": "Carlos Fernández Lorán",
            "last_printed": None,
            "modified": datetime.datetime(2024, 6, 9, 21, 27, tzinfo=utc),
            "revision": 2,
            "subject": "",
            "title": "",
            "version": "",
        }
        fields.update(overrides)
        return fields

    def test_init(self, docx_converter):
        """
        The fixture yields an instance of DOCXToDocument.
        """
        assert isinstance(docx_converter, DOCXToDocument)

    def test_run(self, test_files_path, docx_converter):
        """
        Convert one sample file and check the document count, content and full metadata.
        """
        sample = test_files_path / "docx" / "sample_docx_1.docx"
        result = docx_converter.run(sources=[sample])
        documents = result["documents"]
        assert len(documents) == 1

        first = documents[0]
        assert "History" in first.content
        assert first.meta.keys() == {"file_path", "docx"}

        expected_meta = {
            "file_path": str(sample),
            "docx": DOCXMetadata(**self._sample_docx_fields()),
        }
        assert first.meta == expected_meta

    def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
        """
        Metadata passed to run() appears as top-level keys next to "file_path" and "docx".
        """
        sample = test_files_path / "docx" / "sample_docx_1.docx"
        user_meta = {"language": "it", "author": "test_author"}
        result = docx_converter.run(sources=[sample], meta=user_meta)
        converted = result["documents"][0]

        expected_meta = {
            "file_path": str(sample),
            "docx": DOCXMetadata(**self._sample_docx_fields()),
            "language": "it",
            "author": "test_author",
        }
        assert converted.meta == expected_meta

    def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
        """
        A .txt source yields no documents and logs a message naming the file.
        """
        txt_source = str(test_files_path / "txt" / "doc_1.txt")
        with caplog.at_level(logging.WARNING):
            outcome = docx_converter.run(sources=[txt_source])
            assert "doc_1.txt and convert it" in caplog.text
            assert outcome["documents"] == []

    def test_run_error_non_existent_file(self, test_files_path, docx_converter, caplog):
        """
        A source path that does not exist is reported in the log output.
        """
        missing = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=missing)
            assert "Could not read non_existing_file.docx" in caplog.text

    def test_run_page_breaks(self, test_files_path, docx_converter):
        """
        The page-break sample converts to one document containing four form feed characters.
        """
        sample = test_files_path / "docx" / "sample_docx_2_page_breaks.docx"
        documents = docx_converter.run(sources=[sample])["documents"]
        assert len(documents) == 1
        form_feeds = documents[0].content.count("\f")
        assert form_feeds == 4

    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        A file path and a ByteStream of the same file are both converted in one run.
        """
        sample = test_files_path / "docx" / "sample_docx_1.docx"
        with open(sample, "rb") as handle:
            stream = ByteStream(handle.read())
        sources = [sample, stream]

        documents = docx_converter.run(sources=sources)["documents"]
        assert len(documents) == 2
        # Check the path-based document first, then the ByteStream-based one.
        for position in (0, 1):
            assert "History and standardization" in documents[position].content

    def test_document_with_docx_metadata_to_dict(self):
        """
        Document.to_dict(flatten=False) turns a DOCXMetadata meta value into a plain dict.
        """
        fields = self._sample_docx_fields(category="category", comments="comments")
        doc = Document(
            content="content",
            meta={"test": 1, "docx": DOCXMetadata(**fields)},
            id="1",
        )

        expected = {
            "blob": None,
            "dataframe": None,
            "content": "content",
            "id": "1",
            "score": None,
            "embedding": None,
            "sparse_embedding": None,
            "meta": {
                "test": 1,
                # The nested dict must match the values used to build the metadata above.
                "docx": fields,
            },
        }
        assert doc.to_dict(flatten=False) == expected