# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

import io
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install python-docx'") as docx_import:
    import docx
    from docx.document import Document as DocxDocument


@component
class DocxToDocument:
    """
    Converts Docx files to Documents.

    Uses `python-docx` library to convert the Docx file to a document.
    The content of each Document is the text of the paragraphs exposed by the parsed file,
    joined with newlines. This component does not preserve page breaks in the original document.

    Usage example:
    ```python
    from datetime import datetime

    from haystack.components.converters.docx import DocxToDocument

    converter = DocxToDocument()
    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the Docx file.'
    ```
    """

    def __init__(self):
        """
        Create a DocxToDocument component.
        """
        # Fail early with an installation hint if `python-docx` is missing.
        docx_import.check()

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts Docx files to Documents.

        Sources that cannot be read or parsed are logged with a warning and skipped, so the
        returned list can be shorter than `sources`. A failure while reading the Docx core
        properties does not skip the source: the Document is still produced, just without
        the Docx-derived metadata.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
            On key collisions the values passed here win over the Docx core properties, which in turn
            win over the ByteStream `meta`.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        documents: List[Document] = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            # Load source ByteStream
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            # Load the Docx Document
            try:
                file = docx.Document(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            # Load the Metadata. This step is best-effort: on failure the source is NOT skipped,
            # the Document is simply created without the Docx core properties.
            try:
                docx_meta = self._get_docx_metadata(document=file)
            except Exception as e:
                logger.warning(
                    "Could not load the metadata from {source}, continuing without it. Error: {error}",
                    source=source,
                    error=e,
                )
                docx_meta = {}

            # Load the content
            try:
                text = "\n".join(para.text for para in file.paragraphs)
            except Exception as e:
                logger.warning(
                    "Could not convert {source} to a Document, skipping it. Error: {error}", source=source, error=e
                )
                continue

            # Later mappings override earlier ones: ByteStream meta < Docx core properties < user meta.
            merged_metadata = {**bytestream.meta, **docx_meta, **metadata}
            documents.append(Document(content=text, meta=merged_metadata))

        return {"documents": documents}

    def _get_docx_metadata(self, document: DocxDocument) -> Dict[str, Union[str, int, datetime]]:
        """
        Get all relevant data from the 'core_properties' attribute from a Docx Document.

        :param document:
            The Docx Document you want to extract metadata from

        :returns:
            A dictionary containing all the relevant fields from the 'core_properties'
        """
        # Resolve the core properties once instead of on every field access.
        props = document.core_properties
        # NOTE(review): per the return annotation some of these values are `datetime` objects
        # (presumably `created`, `last_printed` and `modified`); confirm that downstream
        # serialization of Document.meta can handle them.
        return {
            "author": props.author,
            "category": props.category,
            "comments": props.comments,
            "content_status": props.content_status,
            "created": props.created,
            "identifier": props.identifier,
            "keywords": props.keywords,
            "language": props.language,
            "last_modified_by": props.last_modified_by,
            "last_printed": props.last_printed,
            "modified": props.modified,
            "revision": props.revision,
            "subject": props.subject,
            "title": props.title,
            "version": props.version,
        }
import logging
from unittest.mock import patch

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters import DocxToDocument


@pytest.fixture
def docx_converter():
    """Provide a fresh DocxToDocument instance for each test."""
    return DocxToDocument()


class TestDocxToDocument:
    """Tests for the DocxToDocument converter component."""

    def test_init(self, docx_converter):
        assert isinstance(docx_converter, DocxToDocument)

    @pytest.mark.integration
    def test_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly
        """
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content

    def test_run_with_meta(self, test_files_path, docx_converter):
        """
        Test that the `meta` parameter takes precedence over the Docx core properties.
        """
        # Stub the core-property extraction so the precedence check does not depend on
        # whatever properties happen to be stored in the sample file.
        docx_meta = {"author": "docx_author", "language": "en", "title": "docx_title"}
        with patch.object(DocxToDocument, "_get_docx_metadata", return_value=docx_meta):
            output = docx_converter.run(
                sources=[test_files_path / "docx" / "sample_docx_1.docx"],
                meta={"language": "it", "author": "test_author"},
            )

        doc_meta = output["documents"][0].meta
        # keys present in both places are taken from the meta parameter
        assert doc_meta["author"] == "test_author"
        assert doc_meta["language"] == "it"
        # keys only present in the Docx core properties are preserved
        assert doc_meta["title"] == "docx_title"

    def test_run_error_handling(self, docx_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            output = docx_converter.run(sources=paths)
        assert "Could not read non_existing_file.docx" in caplog.text
        # an unreadable source is skipped instead of producing a Document
        assert output["documents"] == []

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        sample_path = test_files_path / "docx" / "sample_docx_1.docx"
        paths = [sample_path]
        with open(sample_path, "rb") as f:
            paths.append(ByteStream(f.read()))

        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content