haystack/test/components/converters/test_docx_file_to_document.py
Stefano Fiorucci 2828d9e4ae
refactor!: DOCXToDocument converter - store DOCX metadata as a dict (#8804)
* DOCXToDocument - store DOCX metadata as a dict

* do not export DOCXMetadata to converters package
2025-02-05 14:43:19 +01:00

401 lines
16 KiB
Python

import json
import os
import logging
import pytest
import csv
from io import StringIO
from haystack import Document, Pipeline
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument, DOCXTableFormat
from haystack.dataclasses import ByteStream
@pytest.fixture
def docx_converter():
    """Provide a DOCXToDocument converter built with default settings."""
    converter = DOCXToDocument()
    return converter
class TestDOCXToDocument:
    """Tests for the DOCXToDocument converter: init, (de)serialization, and run behavior."""

    # Expected value of the "docx" meta field for the test fixture
    # sample_docx_1.docx; shared by several tests to avoid repeating
    # the same 15-key literal three times.
    SAMPLE_DOCX_1_METADATA = {
        "author": "Microsoft Office User",
        "category": "",
        "comments": "",
        "content_status": "",
        "created": "2024-06-09T21:17:00+00:00",
        "identifier": "",
        "keywords": "",
        "language": "",
        "last_modified_by": "Carlos Fernández Lorán",
        "last_printed": None,
        "modified": "2024-06-09T21:27:00+00:00",
        "revision": 2,
        "subject": "",
        "title": "",
        "version": "",
    }

    @staticmethod
    def _extract_table(content: str) -> str:
        """Return the table text between the table caption and 'Paragraph 2:' in *content*."""
        caption = "Table: AI Use Cases in Different Industries"
        start = content.find(caption)
        end = content.find("Paragraph 2:")
        # +1 skips the newline right after the caption.
        return content[start + len(caption) + 1 : end].strip()

    def test_init(self, docx_converter):
        """The fixture provides a valid converter instance."""
        assert isinstance(docx_converter, DOCXToDocument)

    def test_init_with_string(self):
        """A table_format given as a string is coerced to the DOCXTableFormat enum."""
        converter = DOCXToDocument(table_format="markdown")
        assert isinstance(converter, DOCXToDocument)
        assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_init_with_invalid_string(self):
        """An unknown table_format string raises ValueError."""
        with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
            DOCXToDocument(table_format="invalid_format")

    def test_to_dict(self):
        """Default parameters serialize to the expected component dict."""
        converter = DOCXToDocument()
        data = converter.to_dict()
        assert data == {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"store_full_path": False, "table_format": "csv"},
        }

    def test_to_dict_custom_parameters(self):
        """table_format serializes to its string value whether given as str or enum."""
        cases = [
            ("markdown", "markdown"),
            ("csv", "csv"),
            (DOCXTableFormat.MARKDOWN, "markdown"),
            (DOCXTableFormat.CSV, "csv"),
        ]
        for table_format, expected in cases:
            converter = DOCXToDocument(table_format=table_format)
            data = converter.to_dict()
            assert data == {
                "type": "haystack.components.converters.docx.DOCXToDocument",
                "init_parameters": {"store_full_path": False, "table_format": expected},
            }

    def test_from_dict(self):
        """from_dict restores the table_format enum from its string value."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "csv"},
        }
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.CSV

    def test_from_dict_custom_parameters(self):
        """from_dict handles non-default table_format values."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "markdown"},
        }
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.MARKDOWN

    def test_from_dict_invalid_table_format(self):
        """from_dict rejects unknown table_format values with ValueError."""
        data = {
            "type": "haystack.components.converters.docx.DOCXToDocument",
            "init_parameters": {"table_format": "invalid_format"},
        }
        with pytest.raises(ValueError, match="Unknown table format 'invalid_format'"):
            DOCXToDocument.from_dict(data)

    def test_from_dict_empty_init_parameters(self):
        """from_dict with no init parameters falls back to the CSV default."""
        data = {"type": "haystack.components.converters.docx.DOCXToDocument", "init_parameters": {}}
        converter = DOCXToDocument.from_dict(data)
        assert converter.table_format == DOCXTableFormat.CSV

    def test_pipeline_serde(self):
        """A pipeline containing the converter round-trips through dumps/loads."""
        pipeline = Pipeline()
        converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        pipeline.add_component("converter", converter)
        pipeline_str = pipeline.dumps()
        assert "haystack.components.converters.docx.DOCXToDocument" in pipeline_str
        assert "table_format" in pipeline_str
        assert "markdown" in pipeline_str
        new_pipeline = Pipeline.loads(pipeline_str)
        new_converter = new_pipeline.get_component("converter")
        assert isinstance(new_converter, DOCXToDocument)
        assert new_converter.table_format == DOCXTableFormat.MARKDOWN

    def test_run(self, test_files_path, docx_converter):
        """Converting a DOCX file yields one Document with content and docx metadata."""
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": self.SAMPLE_DOCX_1_METADATA,
        }

    def test_run_with_table(self, test_files_path):
        """Tables are rendered in the chosen format and appear in document order."""
        docx_converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
        paths = [test_files_path / "docx" / "sample_docx.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "Donald Trump" in docs[0].content  # name appears in the fixture's text
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": {
                "author": "Saha, Anirban",
                "category": "",
                "comments": "",
                "content_status": "",
                "created": "2020-07-14T08:14:00+00:00",
                "identifier": "",
                "keywords": "",
                "language": "",
                "last_modified_by": "Saha, Anirban",
                "last_printed": None,
                "modified": "2020-07-14T08:16:00+00:00",
                "revision": 1,
                "subject": "",
                "title": "",
                "version": "",
            },
        }
        # Verify the table markdown is present and the document's natural order is preserved.
        content_parts = docs[0].content.split("\n\n")
        table_index = next(i for i, part in enumerate(content_parts) if "| This | Is | Just a |" in part)
        assert any("Donald Trump" in part for part in content_parts[:table_index]), "Text before table not found"
        assert any("Now we are in Page 2" in part for part in content_parts[table_index + 1 :]), (
            "Text after table not found"
        )

    def test_run_with_store_full_path_false(self, test_files_path):
        """With store_full_path=False only the file's basename is stored in meta."""
        docx_converter = DOCXToDocument(store_full_path=False)
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content
        assert docs[0].meta.keys() == {"file_path", "docx"}
        assert docs[0].meta == {
            "file_path": "sample_docx_1.docx",
            "docx": self.SAMPLE_DOCX_1_METADATA,
        }

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_between_two_paragraphs(self, test_files_path, table_format):
        """A table sandwiched between two paragraphs is rendered in the requested format."""
        docx_converter = DOCXToDocument(table_format=table_format)
        paths = [test_files_path / "docx" / "sample_docx_3.docx"]
        output = docx_converter.run(sources=paths)
        table = self._extract_table(output["documents"][0].content)
        if table_format == "markdown":
            split = list(filter(None, table.split("\n")))
            expected_table_header = "| Industry | AI Use Case | Impact |"
            expected_last_row = "| Finance | Fraud detection and prevention | Reduced financial losses |"
            assert split[0] == expected_table_header
            assert split[-1] == expected_last_row
        else:  # CSV format
            csv_reader = csv.reader(StringIO(table))
            rows = list(csv_reader)
            assert len(rows) == 3  # Header + 2 data rows
            assert rows[0] == ["Industry", "AI Use Case", "Impact"]
            assert rows[-1] == ["Finance", "Fraud detection and prevention", "Reduced financial losses"]

    @pytest.mark.parametrize("table_format", ["markdown", "csv"])
    def test_table_content_correct_parsing(self, test_files_path, table_format):
        """Every row and cell of the table is parsed correctly in both formats."""
        docx_converter = DOCXToDocument(table_format=table_format)
        paths = [test_files_path / "docx" / "sample_docx_3.docx"]
        output = docx_converter.run(sources=paths)
        table = self._extract_table(output["documents"][0].content)
        if table_format == "markdown":
            split = list(filter(None, table.split("\n")))
            assert len(split) == 4
            expected_table_header = "| Industry | AI Use Case | Impact |"
            expected_table_top_border = "| ---------- | ------------------------------ | ------------------------- |"
            expected_table_row_one = "| Healthcare | Predictive diagnostics | Improved patient outcomes |"
            expected_table_row_two = "| Finance | Fraud detection and prevention | Reduced financial losses |"
            assert split[0] == expected_table_header
            assert split[1] == expected_table_top_border
            assert split[2] == expected_table_row_one
            assert split[3] == expected_table_row_two
        else:  # CSV format
            csv_reader = csv.reader(StringIO(table))
            rows = list(csv_reader)
            assert len(rows) == 3  # Header + 2 data rows
            expected_header = ["Industry", "AI Use Case", "Impact"]
            expected_row_one = ["Healthcare", "Predictive diagnostics", "Improved patient outcomes"]
            expected_row_two = ["Finance", "Fraud detection and prevention", "Reduced financial losses"]
            assert rows[0] == expected_header
            assert rows[1] == expected_row_one
            assert rows[2] == expected_row_two

    def test_run_with_additional_meta(self, test_files_path, docx_converter):
        """User-supplied meta is merged at the top level alongside the docx metadata."""
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
        doc = output["documents"][0]
        assert doc.meta == {
            "file_path": os.path.basename(paths[0]),
            "docx": self.SAMPLE_DOCX_1_METADATA,
            "language": "it",
            "author": "test_author",
        }

    def test_run_error_wrong_file_type(self, caplog, test_files_path, docx_converter):
        """A non-DOCX source is skipped with a warning instead of raising."""
        sources = [str(test_files_path / "txt" / "doc_1.txt")]
        with caplog.at_level(logging.WARNING):
            results = docx_converter.run(sources=sources)
            assert "doc_1.txt and convert it" in caplog.text
            assert results["documents"] == []

    def test_run_error_non_existent_file(self, docx_converter, caplog):
        """A missing source file is reported with a warning instead of raising."""
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=paths)
            assert "Could not read non_existing_file.docx" in caplog.text

    def test_run_page_breaks(self, test_files_path, docx_converter):
        """Page breaks in the DOCX are converted to form-feed characters."""
        paths = [test_files_path / "docx" / "sample_docx_2_page_breaks.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert docs[0].content.count("\f") == 4

    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """Path and ByteStream sources can be mixed in a single run call."""
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as f:
            paths.append(ByteStream(f.read()))
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content

    def test_document_with_docx_metadata_to_dict(self):
        """A Document carrying DOCXMetadata serializes to a plain, JSON-safe dict."""
        docx_metadata = DOCXMetadata(
            author="Microsoft Office User",
            category="category",
            comments="comments",
            content_status="",
            created="2024-06-09T21:17:00+00:00",
            identifier="",
            keywords="",
            language="",
            last_modified_by="Carlos Fernández Lorán",
            last_printed=None,
            modified="2024-06-09T21:27:00+00:00",
            revision=2,
            subject="",
            title="",
            version="",
        )
        doc = Document(content="content", meta={"test": 1, "docx": docx_metadata}, id="1")
        assert doc.to_dict(flatten=False) == {
            "blob": None,
            "dataframe": None,
            "content": "content",
            "id": "1",
            "score": None,
            "embedding": None,
            "sparse_embedding": None,
            "meta": {
                "test": 1,
                "docx": {
                    "author": "Microsoft Office User",
                    "category": "category",
                    "comments": "comments",
                    "content_status": "",
                    "created": "2024-06-09T21:17:00+00:00",
                    "identifier": "",
                    "keywords": "",
                    "language": "",
                    "last_modified_by": "Carlos Fernández Lorán",
                    "last_printed": None,
                    "modified": "2024-06-09T21:27:00+00:00",
                    "revision": 2,
                    "subject": "",
                    "title": "",
                    "version": "",
                },
            },
        }
        # check it is JSON serializable
        json_str = json.dumps(doc.to_dict(flatten=False))
        assert json.loads(json_str) == doc.to_dict(flatten=False)