Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-09-23 07:03:45 +00:00)
feat: add DocxToDocument converter (#7838)
* First functioning DocxFileToDocument
* Fix lazy import message
* Add reno
* Add license header
* Rename DocxFileToDocument to DocxToDocument
* Update library install to the maintained version
* Clean up try-except to only take non-Haystack errors into account
* Add warning to the component docstring that page breaks are ignored; mark test as skip
* Make warnings lazily evaluated
* Fix f-string bug
* Get more metadata from docx files
* Add 'python-docx' dependency and docs
* Change logging import
* Fix typo
* Rework metadata extraction for docx
* Fix bug in the _get_docx_metadata method
* Apply review suggestions to haystack/components/converters/docx.py
* Delete unused test

---------

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>
This commit is contained in:
parent
28dd0f5596
commit
c1c339923f
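
For context before the diff: a sketch of how the new component could be used in an indexing pipeline. This is not part of the commit; it assumes Haystack 2.x with `DocumentWriter` and `InMemoryDocumentStore`, and `sample.docx` is a placeholder path.

```python
from haystack import Pipeline
from haystack.components.converters import DocxToDocument
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Convert .docx files and write the resulting Documents into an in-memory store.
document_store = InMemoryDocumentStore()

pipeline = Pipeline()
pipeline.add_component("converter", DocxToDocument())
pipeline.add_component("writer", DocumentWriter(document_store=document_store))
pipeline.connect("converter.documents", "writer.documents")

# "sample.docx" is a placeholder; pass any local .docx paths or ByteStreams.
pipeline.run({"converter": {"sources": ["sample.docx"]}})
```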
@@ -13,6 +13,7 @@ loaders:
        "txt",
        "output_adapter",
        "openapi_functions",
        "docx"
      ]
    ignore_when_discovered: ["__init__"]
processors:
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.docx import DocxToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
@@ -22,4 +23,5 @@ __all__ = [
    "MarkdownToDocument",
    "OpenAPIServiceToFunctions",
    "OutputAdapter",
    "DocxToDocument",
]
haystack/components/converters/docx.py (new file, 144 lines)
@@ -0,0 +1,144 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import io
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install python-docx'") as docx_import:
    import docx
    from docx.document import Document as DocxDocument


@component
class DocxToDocument:
    """
    Converts Docx files to Documents.

    Uses the `python-docx` library to convert each Docx file to a Document.
    This component does not preserve page breaks from the original document.

    Usage example:
    ```python
    from datetime import datetime

    from haystack.components.converters.docx import DocxToDocument

    converter = DocxToDocument()
    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the Docx file.'
    ```
    """

    def __init__(self):
        """
        Create a DocxToDocument component.
        """
        docx_import.check()

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts Docx files to Documents.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            # Load source ByteStream
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            # Load the Docx Document
            try:
                file = docx.Document(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            # Load the metadata
            try:
                docx_meta = self._get_docx_metadata(document=file)
            except Exception as e:
                logger.warning(
                    "Could not load the metadata from {source}, skipping. Error: {error}", source=source, error=e
                )
                docx_meta = {}

            # Load the content
            try:
                paragraphs = [para.text for para in file.paragraphs]
                text = "\n".join(paragraphs)
            except Exception as e:
                logger.warning(
                    "Could not convert {source} to a Document, skipping it. Error: {error}", source=source, error=e
                )
                continue

            merged_metadata = {**bytestream.meta, **docx_meta, **metadata}
            document = Document(content=text, meta=merged_metadata)

            documents.append(document)

        return {"documents": documents}

    def _get_docx_metadata(self, document: DocxDocument) -> Dict[str, Union[str, int, datetime]]:
        """
        Get all relevant data from the 'core_properties' attribute of a Docx Document.

        :param document:
            The Docx Document to extract metadata from.

        :returns:
            A dictionary containing all the relevant fields from the 'core_properties'.
        """
        return {
            "author": document.core_properties.author,
            "category": document.core_properties.category,
            "comments": document.core_properties.comments,
            "content_status": document.core_properties.content_status,
            "created": document.core_properties.created,
            "identifier": document.core_properties.identifier,
            "keywords": document.core_properties.keywords,
            "language": document.core_properties.language,
            "last_modified_by": document.core_properties.last_modified_by,
            "last_printed": document.core_properties.last_printed,
            "modified": document.core_properties.modified,
            "revision": document.core_properties.revision,
            "subject": document.core_properties.subject,
            "title": document.core_properties.title,
            "version": document.core_properties.version,
        }
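
A small sketch of the `meta` handling described in the `run` docstring above: one dictionary per source, merged on top of the ByteStream metadata and the extracted docx core properties. Not part of the commit; the file names are placeholders.

```python
from haystack.components.converters import DocxToDocument

converter = DocxToDocument()

# One meta dict per source; the list length must match the number of sources.
results = converter.run(
    sources=["report.docx", "notes.docx"],  # placeholder paths
    meta=[{"category": "finance"}, {"category": "engineering"}],
)

for doc in results["documents"]:
    # Merge order is bytestream.meta, then docx core properties, then the meta passed here,
    # so the values supplied via `meta` win on key collisions.
    print(doc.meta["category"], doc.meta.get("author"))
```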
@@ -120,7 +120,8 @@ extra-dependencies = [
  "azure-ai-formrecognizer>=3.2.0b2",  # AzureOCRDocumentConverter
  "trafilatura",  # HTMLToDocument
  "python-pptx",  # PPTXToDocument
  "python-docx",  # DocxToDocument

  # OpenAPI
  "jsonref",  # OpenAPIServiceConnector, OpenAPIServiceToFunctions
  "openapi3",
@@ -0,0 +1,6 @@
---
highlights: >
  Adding the `DocxToDocument` component to convert Docx files to Documents.
features:
  - |
    Adding the `DocxToDocument` component to the `converters` category. It uses the `python-docx` library to convert Docx files to Haystack Documents.
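
As a quick illustration of the release note, a sketch of converting a single file and inspecting the core-properties metadata collected by `_get_docx_metadata` above. It assumes `python-docx` is installed; `sample.docx` is a placeholder path.

```python
from haystack.components.converters import DocxToDocument

converter = DocxToDocument()
doc = converter.run(sources=["sample.docx"])["documents"][0]  # placeholder path

# Core properties from the .docx file end up in Document.meta alongside any ByteStream metadata.
print(doc.meta.get("author"), doc.meta.get("title"), doc.meta.get("created"))
print(doc.content[:100])
```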
test/components/converters/test_docx_file_to_document.py (new file, 63 lines)
@@ -0,0 +1,63 @@
import logging
from unittest.mock import patch

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters import DocxToDocument


@pytest.fixture
def docx_converter():
    return DocxToDocument()


class TestDocxToDocument:
    def test_init(self, docx_converter):
        assert isinstance(docx_converter, DocxToDocument)

    @pytest.mark.integration
    def test_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly
        """
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content

    def test_run_with_meta(self, test_files_path, docx_converter):
        with patch("haystack.components.converters.docx.DocxToDocument"):
            output = docx_converter.run(
                sources=[test_files_path / "docx" / "sample_docx_1.docx"],
                meta={"language": "it", "author": "test_author"},
            )

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert output["documents"][0].meta["author"] == "test_author"
        assert output["documents"][0].meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, docx_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=paths)
            assert "Could not read non_existing_file.docx" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as f:
            paths.append(ByteStream(f.read()))

        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content
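
The tests above rely on a `test_files_path` fixture defined elsewhere in the test suite (typically a shared `conftest.py`). A minimal sketch of such a fixture, assuming the test files live under `test/test_files/` relative to that conftest, could look like this; the exact path resolution is hypothetical.

```python
from pathlib import Path

import pytest


@pytest.fixture
def test_files_path() -> Path:
    # Hypothetical: resolve the shared test_files directory relative to this conftest.py,
    # so that test_files_path / "docx" / "sample_docx_1.docx" points at the sample file.
    return Path(__file__).parent / "test_files"
```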
test/test_files/docx/sample_docx_1.docx (new binary file, not shown)