Mirror of https://github.com/deepset-ai/haystack.git (synced 2025-09-23 07:03:45 +00:00)
feat: add DocxToDocument converter (#7838)
* First functioning DocxFileToDocument
* Fix lazy import message
* Add reno
* Add license header
* Rename DocxFileToDocument to DocxToDocument
* Update library install to the maintained version
* Clean up try-except to only take non-Haystack errors into account
* Add warning to the component docstring that page breaks are ignored; mark test as skip
* Make warnings lazily evaluated
* Fix f-string bug
* Get more metadata from docx files
* Add 'python-docx' dependency and docs
* Change logging import
* Fix typo
* Rework metadata extraction for docx
* Fix bug in the _get_docx_metadata method
* Apply review suggestions to haystack/components/converters/docx.py
* Delete unused test

---------

Co-authored-by: Sebastian Husch Lee <sjrl@users.noreply.github.com>
This commit is contained in:
parent
28dd0f5596
commit
c1c339923f
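
For context before the diff: a sketch of how the new component could be used in an indexing pipeline. This is not part of the commit; it assumes Haystack 2.x with `DocumentWriter` and `InMemoryDocumentStore`, and `sample.docx` is a placeholder path.

```python
from haystack import Pipeline
from haystack.components.converters import DocxToDocument
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Convert .docx files and write the resulting Documents into an in-memory store.
document_store = InMemoryDocumentStore()

pipeline = Pipeline()
pipeline.add_component("converter", DocxToDocument())
pipeline.add_component("writer", DocumentWriter(document_store=document_store))
pipeline.connect("converter.documents", "writer.documents")

# "sample.docx" is a placeholder; pass any local .docx paths or ByteStreams.
pipeline.run({"converter": {"sources": ["sample.docx"]}})
```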
@@ -13,6 +13,7 @@ loaders:
        "txt",
        "output_adapter",
        "openapi_functions",
        "docx"
      ]
    ignore_when_discovered: ["__init__"]
processors:
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.docx import DocxToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
@@ -22,4 +23,5 @@ __all__ = [
    "MarkdownToDocument",
    "OpenAPIServiceToFunctions",
    "OutputAdapter",
    "DocxToDocument",
]
haystack/components/converters/docx.py (new file, 144 lines)
@@ -0,0 +1,144 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

import io
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install python-docx'") as docx_import:
    import docx
    from docx.document import Document as DocxDocument


@component
class DocxToDocument:
    """
    Converts Docx files to Documents.

    Uses the `python-docx` library to convert each Docx file to a Document.
    This component does not preserve page breaks from the original document.

    Usage example:
    ```python
    from datetime import datetime

    from haystack.components.converters.docx import DocxToDocument

    converter = DocxToDocument()
    results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # 'This is a text from the Docx file.'
    ```
    """

    def __init__(self):
        """
        Create a DocxToDocument component.
        """
        docx_import.check()

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ):
        """
        Converts Docx files to Documents.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the Documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced Documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.

        :returns:
            A dictionary with the following keys:
            - `documents`: Created Documents
        """
        documents = []
        meta_list = normalize_metadata(meta=meta, sources_count=len(sources))

        for source, metadata in zip(sources, meta_list):
            # Load source ByteStream
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            # Load the Docx Document
            try:
                file = docx.Document(io.BytesIO(bytestream.data))
            except Exception as e:
                logger.warning(
                    "Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
                    source=source,
                    error=e,
                )
                continue

            # Load the metadata
            try:
                docx_meta = self._get_docx_metadata(document=file)
            except Exception as e:
                logger.warning(
                    "Could not load the metadata from {source}, skipping. Error: {error}", source=source, error=e
                )
                docx_meta = {}

            # Load the content
            try:
                paragraphs = [para.text for para in file.paragraphs]
                text = "\n".join(paragraphs)
            except Exception as e:
                logger.warning(
                    "Could not convert {source} to a Document, skipping it. Error: {error}", source=source, error=e
                )
                continue

            merged_metadata = {**bytestream.meta, **docx_meta, **metadata}
            document = Document(content=text, meta=merged_metadata)

            documents.append(document)

        return {"documents": documents}

    def _get_docx_metadata(self, document: DocxDocument) -> Dict[str, Union[str, int, datetime]]:
        """
        Get all relevant data from the 'core_properties' attribute of a Docx Document.

        :param document:
            The Docx Document to extract metadata from.

        :returns:
            A dictionary containing all the relevant fields from the 'core_properties'.
        """
        return {
            "author": document.core_properties.author,
            "category": document.core_properties.category,
            "comments": document.core_properties.comments,
            "content_status": document.core_properties.content_status,
            "created": document.core_properties.created,
            "identifier": document.core_properties.identifier,
            "keywords": document.core_properties.keywords,
            "language": document.core_properties.language,
            "last_modified_by": document.core_properties.last_modified_by,
            "last_printed": document.core_properties.last_printed,
            "modified": document.core_properties.modified,
            "revision": document.core_properties.revision,
            "subject": document.core_properties.subject,
            "title": document.core_properties.title,
            "version": document.core_properties.version,
        }
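
A small sketch of the `meta` handling described in the `run` docstring above: one dictionary per source, merged on top of the ByteStream metadata and the extracted docx core properties. Not part of the commit; the file names are placeholders.

```python
from haystack.components.converters import DocxToDocument

converter = DocxToDocument()

# One meta dict per source; the list length must match the number of sources.
results = converter.run(
    sources=["report.docx", "notes.docx"],  # placeholder paths
    meta=[{"category": "finance"}, {"category": "engineering"}],
)

for doc in results["documents"]:
    # Merge order is bytestream.meta, then docx core properties, then the meta passed here,
    # so the values supplied via `meta` win on key collisions.
    print(doc.meta["category"], doc.meta.get("author"))
```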
@@ -120,7 +120,8 @@ extra-dependencies = [
  "azure-ai-formrecognizer>=3.2.0b2",  # AzureOCRDocumentConverter
  "trafilatura",  # HTMLToDocument
  "python-pptx",  # PPTXToDocument
  "python-docx",  # DocxToDocument

  # OpenAPI
  "jsonref",  # OpenAPIServiceConnector, OpenAPIServiceToFunctions
  "openapi3",
@@ -0,0 +1,6 @@
---
highlights: >
  Adding the `DocxToDocument` component to convert Docx files to Documents.
features:
  - |
    Adding the `DocxToDocument` component to the `converters` category. It uses the `python-docx` library to convert Docx files to Haystack Documents.
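
As a quick illustration of the release note, a sketch of converting a single file and inspecting the core-properties metadata collected by `_get_docx_metadata` above. It assumes `python-docx` is installed; `sample.docx` is a placeholder path.

```python
from haystack.components.converters import DocxToDocument

converter = DocxToDocument()
doc = converter.run(sources=["sample.docx"])["documents"][0]  # placeholder path

# Core properties from the .docx file end up in Document.meta alongside any ByteStream metadata.
print(doc.meta.get("author"), doc.meta.get("title"), doc.meta.get("created"))
print(doc.content[:100])
```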
test/components/converters/test_docx_file_to_document.py (new file, 63 lines)
@@ -0,0 +1,63 @@
import logging
from unittest.mock import patch

import pytest

from haystack.dataclasses import ByteStream
from haystack.components.converters import DocxToDocument


@pytest.fixture
def docx_converter():
    return DocxToDocument()


class TestDocxToDocument:
    def test_init(self, docx_converter):
        assert isinstance(docx_converter, DocxToDocument)

    @pytest.mark.integration
    def test_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly
        """
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 1
        assert "History" in docs[0].content

    def test_run_with_meta(self, test_files_path, docx_converter):
        with patch("haystack.components.converters.docx.DocxToDocument"):
            output = docx_converter.run(
                sources=[test_files_path / "docx" / "sample_docx_1.docx"],
                meta={"language": "it", "author": "test_author"},
            )

        # check that the metadata from the bytestream is merged with that from the meta parameter
        assert output["documents"][0].meta["author"] == "test_author"
        assert output["documents"][0].meta["language"] == "it"

    def test_run_error_handling(self, test_files_path, docx_converter, caplog):
        """
        Test if the component correctly handles errors.
        """
        paths = ["non_existing_file.docx"]
        with caplog.at_level(logging.WARNING):
            docx_converter.run(sources=paths)
            assert "Could not read non_existing_file.docx" in caplog.text

    @pytest.mark.integration
    def test_mixed_sources_run(self, test_files_path, docx_converter):
        """
        Test if the component runs correctly when mixed sources are provided.
        """
        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
        with open(test_files_path / "docx" / "sample_docx_1.docx", "rb") as f:
            paths.append(ByteStream(f.read()))

        output = docx_converter.run(sources=paths)
        docs = output["documents"]
        assert len(docs) == 2
        assert "History and standardization" in docs[0].content
        assert "History and standardization" in docs[1].content
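
The tests above rely on a `test_files_path` fixture defined elsewhere in the test suite (typically a shared `conftest.py`). A minimal sketch of such a fixture, assuming the test files live under `test/test_files/` relative to that conftest, could look like this; the exact path resolution is hypothetical.

```python
from pathlib import Path

import pytest


@pytest.fixture
def test_files_path() -> Path:
    # Hypothetical: resolve the shared test_files directory relative to this conftest.py,
    # so that test_files_path / "docx" / "sample_docx_1.docx" points at the sample file.
    return Path(__file__).parent / "test_files"
```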
test/test_files/docx/sample_docx_1.docx (new binary file, not shown)