feat: add DocxToDocument converter (#7838)

* first functioning DocxFileToDocument

* fix lazy import message

* add reno

* Add license header

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* change DocxFileToDocument to DocxToDocument

* Update library install to the maintained version

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* clean try-except to only take non haystack errors into account

* Add warning to the component docstring about ignoring page breaks, mark test as skip

* make warnings lazy evaluations

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* make warnings lazy evaluations

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* Make warnings lazy evaluated

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* Solve f bug

* Get more metadata from docx files

* add 'python-docx' dependency and docs

* Change logging import

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* Fix typo

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* remake metadata extraction for docx

* solve bug regarding _get_docx_metadata method

* Update haystack/components/converters/docx.py

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* Update haystack/components/converters/docx.py

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>

* Delete unused test

---------

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>
This commit is contained in:
Carlos Fernández 2024-06-12 11:58:36 +02:00 committed by GitHub
parent 28dd0f5596
commit c1c339923f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 218 additions and 1 deletions

View File

@ -13,6 +13,7 @@ loaders:
"txt",
"output_adapter",
"openapi_functions",
"docx"
]
ignore_when_discovered: ["__init__"]
processors:

View File

@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.docx import DocxToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
@ -22,4 +23,5 @@ __all__ = [
"MarkdownToDocument",
"OpenAPIServiceToFunctions",
"OutputAdapter",
"DocxToDocument",
]

View File

@ -0,0 +1,144 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
import io
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
logger = logging.getLogger(__name__)
with LazyImport("Run 'pip install python-docx'") as docx_import:
import docx
from docx.document import Document as DocxDocument
@component
class DocxToDocument:
    """
    Converts Docx files to Documents.

    Uses `python-docx` library to convert the Docx file to a document.
    This component does not preserve page breaks in the original document.

    Usage example:
    ```python
    from datetime import datetime

    from haystack.components.converters.docx import DocxToDocument

    converter = DocxToDocument()
    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the Docx file.'
    ```
    """

    def __init__(self):
        """
        Create a DocxToDocument component.

        Checks that the lazily imported `python-docx` dependency is available.
        """
        docx_import.check()

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts Docx files to Documents.

        Sources that cannot be read, parsed or converted are skipped and a warning is logged for each of them.
        If only the metadata of a file cannot be extracted, the Document is still created without the
        file-derived metadata.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            # Load source ByteStream
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            # Load the Docx Document
            try:
                docx_document = docx.Document(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            # Load the metadata. A failure here is not fatal: the source is still converted,
            # just without the metadata coming from the file itself.
            try:
                docx_meta = self._get_docx_metadata(document=docx_document)
            except Exception as e:
                logger.warning(
                    "Could not load the metadata from {source}. Continuing without file metadata. Error: {error}",
                    source=source,
                    error=e,
                )
                docx_meta = {}

            # Load the content: one line of text per paragraph
            try:
                text = "\n".join(para.text for para in docx_document.paragraphs)
            except Exception as e:
                logger.warning(
                    "Could not convert {source} to a Document, skipping it. Error: {error}", source=source, error=e
                )
                continue

            # Later mappings win on key collisions: user-supplied metadata overrides the metadata
            # read from the file, which in turn overrides the ByteStream metadata.
            merged_metadata = {**bytestream.meta, **docx_meta, **metadata}
            documents.append(Document(content=text, meta=merged_metadata))

        return {"documents": documents}

    def _get_docx_metadata(self, document: DocxDocument) -> Dict[str, Optional[Union[str, int, datetime]]]:
        """
        Get all relevant data from the 'core_properties' attribute from a Docx Document.

        :param document:
            The Docx Document you want to extract metadata from

        :returns:
            A dictionary containing all the relevant fields from the 'core_properties'.
            Values are passed through unchanged from `python-docx`; date-valued properties
            (`created`, `last_printed`, `modified`) may be `None` when they are not set in the file.
        """
        property_names = (
            "author",
            "category",
            "comments",
            "content_status",
            "created",
            "identifier",
            "keywords",
            "language",
            "last_modified_by",
            "last_printed",
            "modified",
            "revision",
            "subject",
            "title",
            "version",
        )
        # Resolve `core_properties` once instead of once per field.
        core_properties = document.core_properties
        return {name: getattr(core_properties, name) for name in property_names}

View File

@ -120,6 +120,7 @@ extra-dependencies = [
"azure-ai-formrecognizer>=3.2.0b2", # AzureOCRDocumentConverter
"trafilatura", # HTMLToDocument
"python-pptx", # PPTXToDocument
"python-docx", # DocxToDocument
# OpenAPI
"jsonref", # OpenAPIServiceConnector, OpenAPIServiceToFunctions

View File

@ -0,0 +1,6 @@
---
highlights: >
Adding the `DocxToDocument` component to convert Docx files to Documents.
features:
- |
Adding the `DocxToDocument` component inside the `converters` category. It uses the `python-docx` library to convert Docx files to haystack Documents.

View File

@ -0,0 +1,63 @@
import logging
from unittest.mock import patch
import pytest
from haystack.dataclasses import ByteStream
from haystack.components.converters import DocxToDocument
@pytest.fixture
def docx_converter():
    """Build the `DocxToDocument` instance used by the tests that request this fixture."""
    converter = DocxToDocument()
    return converter
class TestDocxToDocument:
    """Tests for the `DocxToDocument` converter component."""

    def test_init(self, docx_converter):
        """The fixture provides an instance of `DocxToDocument`."""
        assert isinstance(docx_converter, DocxToDocument)

    @pytest.mark.integration
    def test_run(self, test_files_path, docx_converter):
        """
        A single Docx file path is converted into exactly one Document with the expected text.
        """
        sample_file = test_files_path / "docx" / "sample_docx_1.docx"
        result = docx_converter.run(sources=[sample_file])
        converted = result["documents"]
        assert len(converted) == 1
        assert "History" in converted[0].content

    def test_run_with_meta(self, test_files_path, docx_converter):
        """
        Values passed through the `meta` parameter end up in the metadata of the produced Document.
        """
        sample_file = test_files_path / "docx" / "sample_docx_1.docx"
        user_meta = {"language": "it", "author": "test_author"}
        # NOTE(review): `docx_converter` is instantiated by the fixture before this patch is applied,
        # so the patch does not appear to influence the call below - confirm whether it is needed.
        with patch("haystack.components.converters.docx.DocxToDocument"):
            result = docx_converter.run(sources=[sample_file], meta=user_meta)

        # the user-supplied values must be present in the merged metadata
        first_meta = result["documents"][0].meta
        assert first_meta["author"] == "test_author"
        assert first_meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, docx_converter, caplog):
        """
        A source that cannot be read is reported through a warning log record.
        """
        missing_sources = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=missing_sources)
            assert "Could not read non_existing_file.docx" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        File paths and ByteStream objects can be mixed in the same `sources` list.
        """
        sample_file = test_files_path / "docx" / "sample_docx_1.docx"
        with open(sample_file, "rb") as handle:
            sample_stream = ByteStream(handle.read())

        result = docx_converter.run(sources=[sample_file, sample_stream])
        converted = result["documents"]
        assert len(converted) == 2
        assert "History and standardization" in converted[0].content
        assert "History and standardization" in converted[1].content

Binary file not shown.