mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-12-29 07:59:27 +00:00
chore: Capitalize DOCX in DOCXToDocument converter (#7931)
* Capitalize DOCX in DOCXToDocument converter * Update docstrings * Update test class name * Add release notes
This commit is contained in:
parent
fd1a06d171
commit
6836079686
@ -3,7 +3,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
from haystack.components.converters.azure import AzureOCRDocumentConverter
|
||||
from haystack.components.converters.docx import DocxMetadata, DocxToDocument
|
||||
from haystack.components.converters.docx import DOCXMetadata, DOCXToDocument
|
||||
from haystack.components.converters.html import HTMLToDocument
|
||||
from haystack.components.converters.markdown import MarkdownToDocument
|
||||
from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
|
||||
@ -23,6 +23,6 @@ __all__ = [
|
||||
"MarkdownToDocument",
|
||||
"OpenAPIServiceToFunctions",
|
||||
"OutputAdapter",
|
||||
"DocxToDocument",
|
||||
"DocxMetadata",
|
||||
"DOCXToDocument",
|
||||
"DOCXMetadata",
|
||||
]
|
||||
|
||||
@ -21,7 +21,7 @@ with LazyImport("Run 'pip install python-docx'") as docx_import:
|
||||
|
||||
|
||||
@dataclass
|
||||
class DocxMetadata:
|
||||
class DOCXMetadata:
|
||||
"""
|
||||
Describes the metadata of Docx file.
|
||||
|
||||
@ -60,28 +60,28 @@ class DocxMetadata:
|
||||
|
||||
|
||||
@component
|
||||
class DocxToDocument:
|
||||
class DOCXToDocument:
|
||||
"""
|
||||
Converts Docx files to Documents.
|
||||
Converts DOCX files to Documents.
|
||||
|
||||
Uses `python-docx` library to convert the Docx file to a document.
|
||||
Uses `python-docx` library to convert the DOCX file to a document.
|
||||
This component does not preserve page breaks in the original document.
|
||||
|
||||
Usage example:
|
||||
```python
|
||||
from haystack.components.converters.docx import DocxToDocument
|
||||
from haystack.components.converters.docx import DOCXToDocument
|
||||
|
||||
converter = DocxToDocument()
|
||||
converter = DOCXToDocument()
|
||||
results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
|
||||
documents = results["documents"]
|
||||
print(documents[0].content)
|
||||
# 'This is a text from the Docx file.'
|
||||
# 'This is a text from the DOCX file.'
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Create a DocxToDocument component.
|
||||
Create a DOCXToDocument component.
|
||||
"""
|
||||
docx_import.check()
|
||||
|
||||
@ -92,7 +92,7 @@ class DocxToDocument:
|
||||
meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
|
||||
):
|
||||
"""
|
||||
Converts Docx files to Documents.
|
||||
Converts DOCX files to Documents.
|
||||
|
||||
:param sources:
|
||||
List of file paths or ByteStream objects.
|
||||
@ -123,7 +123,7 @@ class DocxToDocument:
|
||||
text = "\n".join(paragraphs)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Could not read {source} and convert it to a Docx Document, skipping. Error: {error}",
|
||||
"Could not read {source} and convert it to a DOCX Document, skipping. Error: {error}",
|
||||
source=source,
|
||||
error=e,
|
||||
)
|
||||
@ -136,17 +136,17 @@ class DocxToDocument:
|
||||
|
||||
return {"documents": documents}
|
||||
|
||||
def _get_docx_metadata(self, document: "DocxDocument") -> DocxMetadata:
|
||||
def _get_docx_metadata(self, document: "DocxDocument") -> DOCXMetadata:
|
||||
"""
|
||||
Get all relevant data from the 'core_properties' attribute from a Docx Document.
|
||||
Get all relevant data from the 'core_properties' attribute from a DOCX Document.
|
||||
|
||||
:param document:
|
||||
The Docx Document you want to extract metadata from
|
||||
The DOCX Document you want to extract metadata from
|
||||
|
||||
:returns:
|
||||
A `DocxMetadata` dataclass all the relevant fields from the 'core_properties'
|
||||
A `DOCXMetadata` dataclass with all the relevant fields from the 'core_properties'
|
||||
"""
|
||||
return DocxMetadata(
|
||||
return DOCXMetadata(
|
||||
author=document.core_properties.author,
|
||||
category=document.core_properties.category,
|
||||
comments=document.core_properties.comments,
|
||||
|
||||
@ -0,0 +1,4 @@
|
||||
---
|
||||
enhancements:
|
||||
- |
|
||||
Renamed component from DocxToDocument to DOCXToDocument to follow the naming convention of other converter components.
|
||||
@ -5,17 +5,17 @@ import pytest
|
||||
|
||||
from haystack.dataclasses import ByteStream
|
||||
from haystack import Document
|
||||
from haystack.components.converters.docx import DocxToDocument, DocxMetadata
|
||||
from haystack.components.converters.docx import DOCXToDocument, DOCXMetadata
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def docx_converter():
|
||||
return DocxToDocument()
|
||||
return DOCXToDocument()
|
||||
|
||||
|
||||
class TestDocxToDocument:
|
||||
class TestDOCXToDocument:
|
||||
def test_init(self, docx_converter):
|
||||
assert isinstance(docx_converter, DocxToDocument)
|
||||
assert isinstance(docx_converter, DOCXToDocument)
|
||||
|
||||
def test_run(self, test_files_path, docx_converter):
|
||||
"""
|
||||
@ -29,7 +29,7 @@ class TestDocxToDocument:
|
||||
assert docs[0].meta.keys() == {"file_path", "docx"}
|
||||
assert docs[0].meta == {
|
||||
"file_path": str(paths[0]),
|
||||
"docx": DocxMetadata(
|
||||
"docx": DOCXMetadata(
|
||||
author="Microsoft Office User",
|
||||
category="",
|
||||
comments="",
|
||||
@ -54,7 +54,7 @@ class TestDocxToDocument:
|
||||
doc = output["documents"][0]
|
||||
assert doc.meta == {
|
||||
"file_path": str(paths[0]),
|
||||
"docx": DocxMetadata(
|
||||
"docx": DOCXMetadata(
|
||||
author="Microsoft Office User",
|
||||
category="",
|
||||
comments="",
|
||||
@ -106,7 +106,7 @@ class TestDocxToDocument:
|
||||
assert "History and standardization" in docs[1].content
|
||||
|
||||
def test_document_with_docx_metadata_to_dict(self):
|
||||
docx_metadata = DocxMetadata(
|
||||
docx_metadata = DOCXMetadata(
|
||||
author="Microsoft Office User",
|
||||
category="category",
|
||||
comments="comments",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user