fix: make meta produced by DOCXToDocument JSON serializable (#8263)

* make meta from DOCXToDocument JSON serializable

* remove unused import

* update docstrings
This commit is contained in:
Stefano Fiorucci 2024-08-22 14:24:32 +02:00 committed by GitHub
parent 0a1a64cb0c
commit 2e619f06c8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 33 additions and 22 deletions

View File

@ -4,7 +4,6 @@
import io
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
@ -30,13 +29,13 @@ class DOCXMetadata:
:param category: The category
:param comments: The comments
:param content_status: The content status
:param created: The creation date
:param created: The creation date (ISO formatted string)
:param identifier: The identifier
:param keywords: Available keywords
:param language: The language of the document
:param last_modified_by: The last modified by user date
:param last_printed: The last printed date
:param modified: The last modification date
:param last_modified_by: User who last modified the document
:param last_printed: The last printed date (ISO formatted string)
:param modified: The last modification date (ISO formatted string)
:param revision: The revision number
:param subject: The subject
:param title: The title
@ -47,13 +46,13 @@ class DOCXMetadata:
category: str
comments: str
content_status: str
created: Optional[datetime]
created: Optional[str]
identifier: str
keywords: str
language: str
last_modified_by: str
last_printed: Optional[datetime]
modified: Optional[datetime]
last_printed: Optional[str]
modified: Optional[str]
revision: int
subject: str
title: str
@ -192,13 +191,15 @@ class DOCXToDocument:
category=document.core_properties.category,
comments=document.core_properties.comments,
content_status=document.core_properties.content_status,
created=document.core_properties.created,
created=document.core_properties.created.isoformat() if document.core_properties.created else None,
identifier=document.core_properties.identifier,
keywords=document.core_properties.keywords,
language=document.core_properties.language,
last_modified_by=document.core_properties.last_modified_by,
last_printed=document.core_properties.last_printed,
modified=document.core_properties.modified,
last_printed=document.core_properties.last_printed.isoformat()
if document.core_properties.last_printed
else None,
modified=document.core_properties.modified.isoformat() if document.core_properties.modified else None,
revision=document.core_properties.revision,
subject=document.core_properties.subject,
title=document.core_properties.title,

View File

@ -0,0 +1,6 @@
---
fixes:
- |
The metadata produced by the `DOCXToDocument` component is now JSON serializable.
Previously, it contained `datetime` objects automatically extracted from DOCX files, which are not JSON serializable.
Now, the `datetime` objects are converted to ISO 8601 formatted strings.

View File

@ -1,5 +1,5 @@
import logging
import datetime
import json
import pytest
@ -34,13 +34,13 @@ class TestDOCXToDocument:
category="",
comments="",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
@ -48,7 +48,7 @@ class TestDOCXToDocument:
),
}
def test_run_with_meta_overwrites(self, test_files_path, docx_converter):
def test_run_with_additional_meta(self, test_files_path, docx_converter):
paths = [test_files_path / "docx" / "sample_docx_1.docx"]
output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"})
doc = output["documents"][0]
@ -59,13 +59,13 @@ class TestDOCXToDocument:
category="",
comments="",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
@ -82,7 +82,7 @@ class TestDOCXToDocument:
assert "doc_1.txt and convert it" in caplog.text
assert results["documents"] == []
def test_run_error_non_existent_file(self, test_files_path, docx_converter, caplog):
def test_run_error_non_existent_file(self, docx_converter, caplog):
"""
Test if the component correctly handles errors.
"""
@ -121,13 +121,13 @@ class TestDOCXToDocument:
category="category",
comments="comments",
content_status="",
created=datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
created="2024-06-09T21:17:00+00:00",
identifier="",
keywords="",
language="",
last_modified_by="Carlos Fernández Lorán",
last_printed=None,
modified=datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
modified="2024-06-09T21:27:00+00:00",
revision=2,
subject="",
title="",
@ -149,13 +149,13 @@ class TestDOCXToDocument:
"category": "category",
"comments": "comments",
"content_status": "",
"created": datetime.datetime(2024, 6, 9, 21, 17, tzinfo=datetime.timezone.utc),
"created": "2024-06-09T21:17:00+00:00",
"identifier": "",
"keywords": "",
"language": "",
"last_modified_by": "Carlos Fernández Lorán",
"last_printed": None,
"modified": datetime.datetime(2024, 6, 9, 21, 27, tzinfo=datetime.timezone.utc),
"modified": "2024-06-09T21:27:00+00:00",
"revision": 2,
"subject": "",
"title": "",
@ -163,3 +163,7 @@ class TestDOCXToDocument:
},
},
}
# check it is JSON serializable
json_str = json.dumps(doc.to_dict(flatten=False))
assert json.loads(json_str) == doc.to_dict(flatten=False)