mirror of
https://github.com/deepset-ai/haystack.git
synced 2026-01-06 03:57:19 +00:00
fix: make meta produced by DOCXToDocument JSON serializable (#8263)
* make meta from DOCXToDocument JSON serializable
* remove unused import
* update docstrings
This commit is contained in:
parent
0a1a64cb0c
commit
2e619f06c8
@ -4,7 +4,6 @@
|
||||
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
@ -30,13 +29,13 @@ class DOCXMetadata:
|
||||
:param category: The category
|
||||
:param comments: The comments
|
||||
:param content_status: The content status
|
||||
:param created: The creation date
|
||||
:param created: The creation date (ISO formatted string)
|
||||
:param identifier: The identifier
|
||||
:param keywords: Available keywords
|
||||
:param language: The language of the document
|
||||
:param last_modified_by: The last modified by user date
|
||||
:param last_printed: The last printed date
|
||||
:param modified: The last modification date
|
||||
:param last_modified_by: User who last modified the document
|
||||
:param last_printed: The last printed date (ISO formatted string)
|
||||
:param modified: The last modification date (ISO formatted string)
|
||||
:param revision: The revision number
|
||||
:param subject: The subject
|
||||
:param title: The title
|
||||
@ -47,13 +46,13 @@ class DOCXMetadata:
|
||||
category: str
|
||||
comments: str
|
||||
content_status: str
|
||||
created: Optional[datetime]
|
||||
created: Optional[str]
|
||||
identifier: str
|
||||
keywords: str
|
||||
language: str
|
||||
last_modified_by: str
|
||||
last_printed: Optional[datetime]
|
||||
modified: Optional[datetime]
|
||||
last_printed: Optional[str]
|
||||
modified: Optional[str]
|
||||
revision: int
|
||||
subject: str
|
||||
title: str
|
||||
@ -192,13 +191,15 @@ class DOCXToDocument:
|
||||
category=document.core_properties.category,
|
||||
comments=document.core_properties.comments,
|
||||
content_status=document.core_properties.content_status,
|
||||
created=document.core_properties.created,
|
||||
created=document.core_properties.created.isoformat() if document.core_properties.created else None,
|
||||
identifier=document.core_properties.identifier,
|
||||
keywords=document.core_properties.keywords,
|
||||
language=document.core_properties.language,
|
||||
last_modified_by=document.core_properties.last_modified_by,
|
||||
last_printed=document.core_properties.last_printed,
|
||||
modified=document.core_properties.modified,
|
||||
last_printed=document.core_properties.last_printed.isoformat()
|
||||
if document.core_properties.last_printed
|
||||
else None,
|
||||
modified=document.core_properties.modified.isoformat() if document.core_properties.modified else None,
|
||||
revision=document.core_properties.revision,
|
||||
subject=document.core_properties.subject,
|
||||
title=document.core_properties.title,
|
||||
|
||||
@ -0,0 +1,6 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
The metadata produced by the `DOCXToDocument` component is now JSON serializable.
|
||||
Previously, it contained `datetime` objects automatically extracted from DOCX files, which are not JSON serializable.
|
||||
Now, the `datetime` objects are converted to ISO 8601 formatted strings.
|
||||
@ -1,5 +1,5 @@
|
||||
import logging
|
||||
import datetime
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
@ -34,13 +34,13 @@ class TestDOCXToDocument:
|
||||
category="",
|
||||
comments="",
|
||||
content_status="",
|
||||
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
|
||||
created="2024-06-09T21:17:00+00:00",
|
||||
identifier="",
|
||||
keywords="",
|
||||
language="",
|
||||
last_modified_by="Carlos Fernández Lorán",
|
||||
last_printed=None,
|
||||
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
|
||||
modified="2024-06-09T21:27:00+00:00",
|
||||
revision=2,
|
||||
subject="",
|
||||
title="",
|
||||
@ -48,7 +48,7 @@ class TestDOCXToDocument:
|
||||
),
|
||||
}
|
||||
|
||||
def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
|
||||
def test_run_with_additional_meta(self, test_files_path, docx_converter):
|
||||
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
|
||||
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
|
||||
doc = output["documents"][0]
|
||||
@ -59,13 +59,13 @@ class TestDOCXToDocument:
|
||||
category="",
|
||||
comments="",
|
||||
content_status="",
|
||||
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
|
||||
created="2024-06-09T21:17:00+00:00",
|
||||
identifier="",
|
||||
keywords="",
|
||||
language="",
|
||||
last_modified_by="Carlos Fernández Lorán",
|
||||
last_printed=None,
|
||||
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
|
||||
modified="2024-06-09T21:27:00+00:00",
|
||||
revision=2,
|
||||
subject="",
|
||||
title="",
|
||||
@ -82,7 +82,7 @@ class TestDOCXToDocument:
|
||||
assert "doc_1.txt and convert it" in caplog.text
|
||||
assert results["documents"] == []
|
||||
|
||||
def test_run_error_non_existent_file(self, test_files_path, docx_converter, caplog):
|
||||
def test_run_error_non_existent_file(self, docx_converter, caplog):
|
||||
"""
|
||||
Test if the component correctly handles errors.
|
||||
"""
|
||||
@ -121,13 +121,13 @@ class TestDOCXToDocument:
|
||||
category="category",
|
||||
comments="comments",
|
||||
content_status="",
|
||||
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
|
||||
created="2024-06-09T21:17:00+00:00",
|
||||
identifier="",
|
||||
keywords="",
|
||||
language="",
|
||||
last_modified_by="Carlos Fernández Lorán",
|
||||
last_printed=None,
|
||||
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
|
||||
modified="2024-06-09T21:27:00+00:00",
|
||||
revision=2,
|
||||
subject="",
|
||||
title="",
|
||||
@ -149,13 +149,13 @@ class TestDOCXToDocument:
|
||||
"category": "category",
|
||||
"comments": "comments",
|
||||
"content_status": "",
|
||||
"created": datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
|
||||
"created": "2024-06-09T21:17:00+00:00",
|
||||
"identifier": "",
|
||||
"keywords": "",
|
||||
"language": "",
|
||||
"last_modified_by": "Carlos Fernández Lorán",
|
||||
"last_printed": None,
|
||||
"modified": datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
|
||||
"modified": "2024-06-09T21:27:00+00:00",
|
||||
"revision": 2,
|
||||
"subject": "",
|
||||
"title": "",
|
||||
@ -163,3 +163,7 @@ class TestDOCXToDocument:
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# check it is JSON serializable
|
||||
json_str = json.dumps(doc.to_dict(flatten=False))
|
||||
assert json.loads(json_str) == doc.to_dict(flatten=False)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user