chore: Capitalize DOCX in DOCXToDocument converter (#7931)

* Capitalize DOCX in DOCXToDocument converter

* Update docstrings

* Update test class name

* Add release notes
This commit is contained in:
Sebastian Husch Lee 2024-06-27 08:19:01 +02:00 committed by GitHub
parent fd1a06d171
commit 6836079686
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 29 additions and 25 deletions

View File

@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
from haystack.components.converters.azure import AzureOCRDocumentConverter
from haystack.components.converters.docx import DocxMetadata, DocxToDocument
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
from haystack.components.converters.html import HTMLToDocument
from haystack.components.converters.markdown import MarkdownToDocument
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
@ -23,6 +23,6 @@ __all__ = [
"MarkdownToDocument",
"OpenAPIServiceToFunctions",
"OutputAdapter",
"DocxToDocument",
"DocxMetadata",
"DOCXToDocument",
"DOCXMetadata",
]

View File

@ -21,7 +21,7 @@ with LazyImport("Run 'pip install python-docx'") as docx_import:
@dataclass
class DocxMetadata:
class DOCXMetadata:
"""
Describes the metadata of Docx file.
@ -60,28 +60,28 @@ class DocxMetadata:
@component
class DocxToDocument:
class DOCXToDocument:
"""
Converts Docx files to Documents.
Converts DOCX files to Documents.
Uses `python-docx` library to convert the Docx file to a document.
Uses `python-docx` library to convert the DOCX file to a document.
This component does not preserve page breaks in the original document.
Usage example:
```python
from haystack.components.converters.docx import DocxToDocument
from haystack.components.converters.docx import DOCXToDocument
converter = DocxToDocument()
converter = DOCXToDocument()
results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the Docx file.'
# 'This is a text from the DOCX file.'
```
"""
def __init__(self):
"""
Create a DocxToDocument component.
Create a DOCXToDocument component.
"""
docx_import.check()
@ -92,7 +92,7 @@ class DocxToDocument:
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
):
"""
Converts Docx files to Documents.
Converts DOCX files to Documents.
:param sources:
List of file paths or ByteStream objects.
@ -123,7 +123,7 @@ class DocxToDocument:
text = "\n".join(paragraphs)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
"Could not read {source} and convert it to a DOCX Document, skipping. Error: {error}",
source=source,
error=e,
)
@ -136,17 +136,17 @@ class DocxToDocument:
return {"documents": documents}
def _get_docx_metadata(self, document: "DocxDocument") -> DocxMetadata:
def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
"""
Get all relevant data from the 'core_properties' attribute from a Docx Document.
Get all relevant data from the 'core_properties' attribute from a DOCX Document.
:param document:
The Docx Document you want to extract metadata from
The DOCX Document you want to extract metadata from
:returns:
A `DocxMetadata` dataclass all the relevant fields from the 'core_properties'
A `DOCXMetadata` dataclass with all the relevant fields from the 'core_properties'
"""
return DocxMetadata(
return DOCXMetadata(
author=document.core_properties.author,
category=document.core_properties.category,
comments=document.core_properties.comments,

View File

@ -0,0 +1,4 @@
---
enhancements:
- |
Renamed component from DocxToDocument to DOCXToDocument to follow the naming convention of other converter components.

View File

@ -5,17 +5,17 @@ import pytest
from haystack.dataclasses import ByteStream
from haystack import Document
from haystack.components.converters.docx import DocxToDocument, DocxMetadata
from haystack.components.converters.docx import DOCXToDocument, DOCXMetadata
@pytest.fixture
def docx_converter():
return DocxToDocument()
return DOCXToDocument()
class TestDocxToDocument:
class TestDOCXToDocument:
def test_init(self, docx_converter):
assert isinstance(docx_converter, DocxToDocument)
assert isinstance(docx_converter, DOCXToDocument)
def test_run(self, test_files_path, docx_converter):
"""
@ -29,7 +29,7 @@ class TestDocxToDocument:
assert docs[0].meta.keys() == {"file_path", "docx"}
assert docs[0].meta == {
"file_path": str(paths[0]),
"docx": DocxMetadata(
"docx": DOCXMetadata(
author="Microsoft Office User",
category="",
comments="",
@ -54,7 +54,7 @@ class TestDocxToDocument:
doc = output["documents"][0]
assert doc.meta == {
"file_path": str(paths[0]),
"docx": DocxMetadata(
"docx": DOCXMetadata(
author="Microsoft Office User",
category="",
comments="",
@ -106,7 +106,7 @@ class TestDocxToDocument:
assert "History and standardization" in docs[1].content
def test_document_with_docx_metadata_to_dict(self):
docx_metadata = DocxMetadata(
docx_metadata = DOCXMetadata(
author="Microsoft Office User",
category="category",
comments="comments",