rfctr: prepare to add orig_elements serde (#2668)
**Summary:** The serialization and deserialization (serde) of `metadata.orig_elements` will be located in `unstructured.staging.base` alongside `elements_to_json()` and the other existing serde functions. This commit improves the typing, readability, and structure of that module before the new serde functions for `metadata.orig_elements` are added.

**Reviewers:** The commits are well-groomed and are probably quicker to review commit-by-commit than as all files changed at once.
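For orientation, here is a minimal sketch of the public serde surface this refactor settles on, using the renamed functions that appear in the diff below (the example elements are illustrative):

```python
from unstructured.documents.elements import NarrativeText, Title
from unstructured.staging.base import (
    elements_from_dicts,
    elements_from_json,
    elements_to_dicts,
    elements_to_json,
)

elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

# dict round-trip -- the legacy convert_to_isd()/isd_to_elements() names
# survive as aliases of these functions
element_dicts = elements_to_dicts(elements)
assert elements_from_dicts(element_dicts) == elements

# JSON round-trip -- elements_to_json() returns the JSON string when no
# filename is given, or writes the file and returns None when one is
json_str = elements_to_json(elements)
assert json_str is not None
assert elements_from_json(text=json_str) == elements
```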
This commit is contained in:
parent 6abfb8b2b3
commit 31bef433ad
@@ -1,4 +1,4 @@
-## 0.12.7-dev7
+## 0.12.7-dev8
 
 ### Enhancements
 
new file (empty): test_unstructured/cleaners/__init__.py
new file (empty): test_unstructured/documents/__init__.py
@@ -10,8 +10,7 @@ from functools import partial
 
 import pytest
 
-from unstructured.cleaners.core import clean_prefix
-from unstructured.cleaners.translate import translate_text
+from unstructured.cleaners.core import clean_bullets, clean_prefix
 from unstructured.documents.coordinates import (
     CoordinateSystem,
     Orientation,
@@ -66,13 +65,10 @@ def test_text_element_apply_cleaners():
 
 
 def test_text_element_apply_multiple_cleaners():
-    cleaners = [
-        partial(clean_prefix, pattern=r"\[\d{1,2}\]"),
-        partial(translate_text, target_lang="ru"),
-    ]
-    text_element = Text(text="[1] A Textbook on Crocodile Habitats")
+    cleaners = [partial(clean_prefix, pattern=r"\[\d{1,2}\]"), partial(clean_bullets)]
+    text_element = Text(text="[1] \u2022 A Textbook on Crocodile Habitats")
     text_element.apply(*cleaners)
-    assert str(text_element) == "Учебник по крокодильным средам обитания"
+    assert str(text_element) == "A Textbook on Crocodile Habitats"
 
 
 def test_apply_raises_if_func_does_not_produce_string():
@@ -82,7 +78,7 @@ def test_apply_raises_if_func_does_not_produce_string():
     text_element = Text(text="[1] A Textbook on Crocodile Habitats")
 
     with pytest.raises(ValueError, match="Cleaner produced a non-string output."):
-        text_element.apply(bad_cleaner)  # pyright: ignore[reportGeneralTypeIssues]
+        text_element.apply(bad_cleaner)  # pyright: ignore[reportArgumentType]
 
 
 @pytest.mark.parametrize(
@@ -241,7 +237,7 @@ class DescribeElementMetadata:
 
     def it_detects_unknown_constructor_args_at_both_development_time_and_runtime(self):
         with pytest.raises(TypeError, match="got an unexpected keyword argument 'file_name'"):
-            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportGeneralTypeIssues]
+            ElementMetadata(file_name="memo.docx")  # pyright: ignore[reportCallIssue]
 
     @pytest.mark.parametrize(
         "file_path",
@@ -289,9 +285,9 @@ class DescribeElementMetadata:
 
     def it_knows_the_types_of_its_known_members_so_type_checking_support_is_available(self):
         ElementMetadata(
-            category_depth="2",  # pyright: ignore[reportGeneralTypeIssues]
-            file_directory=True,  # pyright: ignore[reportGeneralTypeIssues]
-            text_as_html=42,  # pyright: ignore[reportGeneralTypeIssues]
+            category_depth="2",  # pyright: ignore[reportArgumentType]
+            file_directory=True,  # pyright: ignore[reportArgumentType]
+            text_as_html=42,  # pyright: ignore[reportArgumentType]
         )
         # -- it does not check types at runtime however (choosing to avoid validation overhead) --
 
@@ -526,7 +522,7 @@ class DescribeElementMetadata:
     def but_it_raises_on_attempt_to_update_from_a_non_ElementMetadata_object(self):
         meta = ElementMetadata()
         with pytest.raises(ValueError, match=r"ate\(\)' must be an instance of 'ElementMetadata'"):
-            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportGeneralTypeIssues]
+            meta.update({"coefficient": "0.56"})  # pyright: ignore[reportArgumentType]
 
     # -- It knows when it is equal to another instance -------------------------------------------
 
new file (empty): test_unstructured/embed/__init__.py
new file (empty): test_unstructured/file_utils/__init__.py
new file (empty): test_unstructured/metrics/__init__.py
new file (empty): test_unstructured/partition/utils/__init__.py
new file (empty): test_unstructured/staging/__init__.py
@@ -31,14 +31,9 @@ from unstructured.partition.text import partition_text
 from unstructured.staging import base
 
 
-@pytest.fixture()
-def output_csv_file(tmp_path):
-    return os.path.join(tmp_path, "isd_data.csv")
-
-
-def test_convert_to_isd():
+def test_elements_to_dicts():
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
-    isd = base.convert_to_isd(elements)
+    isd = base.elements_to_dicts(elements)
 
     assert isd[0]["text"] == "Title 1"
     assert isd[0]["type"] == ElementType.TITLE
@@ -47,8 +42,8 @@ def test_convert_to_isd():
     assert isd[1]["type"] == "NarrativeText"
 
 
-def test_isd_to_elements():
-    isd = [
+def test_elements_from_dicts():
+    element_dicts = [
         {"text": "Blurb1", "type": "NarrativeText"},
         {"text": "Blurb2", "type": "Title"},
         {"text": "Blurb3", "type": "ListItem"},
@@ -56,7 +51,7 @@ def test_isd_to_elements():
         {"text": "No Type"},
     ]
 
-    elements = base.isd_to_elements(isd)
+    elements = base.elements_from_dicts(element_dicts)
     assert elements == [
         NarrativeText(text="Blurb1"),
         Title(text="Blurb2"),
@@ -65,13 +60,14 @@ def test_isd_to_elements():
     ]
 
 
-def test_convert_to_csv(output_csv_file):
+def test_convert_to_csv(tmp_path: str):
+    output_csv_path = os.path.join(tmp_path, "isd_data.csv")
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
-    with open(output_csv_file, "w+") as csv_file:
+    with open(output_csv_path, "w+") as csv_file:
         isd_csv_string = base.convert_to_csv(elements)
         csv_file.write(isd_csv_string)
 
-    with open(output_csv_file) as csv_file:
+    with open(output_csv_path) as csv_file:
         csv_rows = csv.DictReader(csv_file)
         assert all(set(row.keys()) == set(base.TABLE_FIELDNAMES) for row in csv_rows)
 
@@ -85,15 +81,13 @@ def test_convert_to_dataframe():
             "text": ["Title 1", "Narrative 1"],
         },
     )
-    assert df.type.equals(expected_df.type) is True
-    assert df.text.equals(expected_df.text) is True
+    assert df.type.equals(expected_df.type) is True  # type: ignore
+    assert df.text.equals(expected_df.text) is True  # type: ignore
 
 
-def test_convert_to_dataframe_maintains_fields(
-    filename="example-docs/eml/fake-email-attachment.eml",
-):
+def test_convert_to_dataframe_maintains_fields():
     elements = partition_email(
-        filename=filename,
+        "example-docs/eml/fake-email-attachment.eml",
        process_attachements=True,
        regex_metadata={"hello": r"Hello", "punc": r"[!]"},
     )
@@ -109,10 +103,7 @@ def test_convert_to_dataframe_maintains_fields
 
 
 def test_default_pandas_dtypes():
-    """
-    Make sure that all the values that can exist on an element have a corresponding dtype
-    mapped in the dict returned by get_default_pandas_dtypes()
-    """
+    """Ensure all element fields have a dtype in dict returned by get_default_pandas_dtypes()."""
     full_element = Text(
         text="some text",
         element_id="123",
@@ -165,8 +156,7 @@
     element_as_dict = full_element.to_dict()
     element_as_dict.update(
         base.flatten_dict(
-            element_as_dict.pop("metadata"),
-            keys_to_omit=["data_source_record_locator"],
+            element_as_dict.pop("metadata"), keys_to_omit=["data_source_record_locator"]
         ),
     )
     flattened_element_keys = element_as_dict.keys()
@@ -180,13 +170,13 @@
     platform.system() == "Windows",
     reason="Posix Paths are not available on Windows",
 )
-def test_convert_to_isd_serializes_with_posix_paths():
+def test_elements_to_dicts_serializes_with_posix_paths():
     metadata = ElementMetadata(filename=pathlib.PosixPath("../../fake-file.txt"))
     elements = [
         Title(text="Title 1", metadata=metadata),
         NarrativeText(text="Narrative 1", metadata=metadata),
     ]
-    output = base.convert_to_isd(elements)
+    output = base.elements_to_dicts(elements)
     # NOTE(robinson) - json.dumps should run without raising an exception
     json.dumps(output)
@@ -205,11 +195,11 @@ def test_all_elements_preserved_when_serialized():
         PageBreak(text=""),
     ]
 
-    isd = base.convert_to_isd(elements)
-    assert base.convert_to_isd(base.isd_to_elements(isd)) == isd
+    element_dicts = base.elements_to_dicts(elements)
+    assert base.elements_to_dicts(base.elements_from_dicts(element_dicts)) == element_dicts
 
 
-def test_serialized_deserialize_elements_to_json(tmpdir):
+def test_serialized_deserialize_elements_to_json(tmpdir: str):
     filename = os.path.join(tmpdir, "fake-elements.json")
     metadata = ElementMetadata(filename="fake-file.txt")
     elements = [
@@ -229,63 +219,38 @@ def test_serialized_deserialize_elements_to_json(tmpdir):
     assert elements == new_elements_filename
 
     elements_str = base.elements_to_json(elements)
     assert elements_str is not None
     new_elements_text = base.elements_from_json(text=elements_str)
     assert elements == new_elements_text
 
 
-def test_read_and_write_json_with_encoding(
-    filename="example-docs/fake-text-utf-16-be.txt",
-):
-    elements = partition_text(filename=filename)
+def test_read_and_write_json_with_encoding():
+    elements = partition_text("example-docs/fake-text-utf-16-be.txt")
     with NamedTemporaryFile() as tempfile:
         base.elements_to_json(elements, filename=tempfile.name, encoding="utf-16")
-        new_elements_filename = base.elements_from_json(
-            filename=tempfile.name,
-            encoding="utf-16",
-        )
+        new_elements_filename = base.elements_from_json(filename=tempfile.name, encoding="utf-16")
     assert elements == new_elements_filename
 
 
-def test_filter_element_types_with_include_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_include_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
-    elements = base.filter_element_types(
-        elements=elements,
-        include_element_types=element_types,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
+    elements = base.filter_element_types(elements=elements, include_element_types=element_types)
     for element in elements:
         assert type(element) in element_types
 
 
-def test_filter_element_types_with_exclude_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_exclude_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
-    elements = base.filter_element_types(
-        elements=elements,
-        exclude_element_types=element_types,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
+    elements = base.filter_element_types(elements=elements, exclude_element_types=element_types)
     for element in elements:
         assert type(element) not in element_types
 
 
-def test_filter_element_types_with_exclude_and_include_element_type(
-    filename="example-docs/fake-text.txt",
-):
+def test_filter_element_types_with_exclude_and_include_element_type():
     element_types = [Title]
-    elements = partition_text(
-        filename=filename,
-        include_metadata=False,
-    )
+    elements = partition_text("example-docs/fake-text.txt", include_metadata=False)
     with pytest.raises(ValueError):
         elements = base.filter_element_types(
             elements=elements,
@@ -527,13 +492,9 @@ def test_flatten_dict_flatten_list_omit_keys4():
 
 def test_flatten_empty_dict():
-    """Flattening an empty dictionary"""
-    dictionary = {}
-    expected_result = {}
-    assert base.flatten_dict(dictionary) == expected_result
+    assert base.flatten_dict({}) == {}
 
 
 def test_flatten_dict_empty_lists():
-    """Flattening a dictionary with empty lists"""
-    dictionary = {"a": [], "b": {"c": []}}
-    expected_result = {"a": [], "b_c": []}
-    assert base.flatten_dict(dictionary) == expected_result
+    assert base.flatten_dict({"a": [], "b": {"c": []}}) == {"a": [], "b_c": []}
@@ -14,7 +14,7 @@ from unstructured.ingest.interfaces import (
     ReadConfig,
 )
 from unstructured.partition.auto import partition
-from unstructured.staging.base import convert_to_dict
+from unstructured.staging.base import elements_to_dicts
 
 DIRECTORY = pathlib.Path(__file__).parent.resolve()
 EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "../..", "example-docs")
@@ -108,7 +108,7 @@ def partition_test_results():
 @pytest.fixture()
 def partition_file_test_results(partition_test_results):
     # Reusable partition_file test results, calculated only once
-    return convert_to_dict(partition_test_results)
+    return elements_to_dicts(partition_test_results)
 
 
 def test_partition_file():
@@ -120,9 +120,9 @@ def test_partition_file():
         processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
     )
     test_ingest_doc._date_processed = TEST_DATE_PROCESSSED
-    isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig())
-    isd_elems = convert_to_dict(isd_elems_raw)
-    assert len(isd_elems)
+    elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
+    element_dicts = elements_to_dicts(elements)
+    assert len(element_dicts)
     expected_keys = {
         "element_id",
         "text",
@@ -139,7 +139,7 @@ def test_partition_file():
         "languages",
         "last_modified",
     }
-    for elem in isd_elems:
+    for elem in element_dicts:
         # Parent IDs are non-deterministic - remove them from the test
         elem["metadata"].pop("parent_id", None)
 
@@ -166,11 +166,11 @@ def test_process_file_fields_include_default(mocker, partition_test_results):
         read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR),
         processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR),
     )
-    isd_elems_raw = test_ingest_doc.partition_file(partition_config=PartitionConfig())
-    isd_elems = convert_to_dict(isd_elems_raw)
-    assert len(isd_elems)
+    elements = test_ingest_doc.partition_file(partition_config=PartitionConfig())
+    element_dicts = elements_to_dicts(elements)
+    assert len(element_dicts)
     assert mock_partition.call_count == 1
-    for elem in isd_elems:
+    for elem in element_dicts:
         # Parent IDs are non-deterministic - remove them from the test
         elem["metadata"].pop("parent_id", None)
 
@@ -1 +1 @@
-__version__ = "0.12.7-dev7"  # pragma: no cover
+__version__ = "0.12.7-dev8"  # pragma: no cover
@@ -22,7 +22,7 @@ from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, e
 from unstructured.ingest.enhanced_dataclass.core import _asdict
 from unstructured.ingest.error import PartitionError, SourceConnectionError
 from unstructured.ingest.logger import logger
-from unstructured.staging.base import convert_to_dict, flatten_dict
+from unstructured.staging.base import elements_to_dicts, flatten_dict
 
 A = t.TypeVar("A", bound="DataClassJsonMixin")
 
@@ -586,12 +586,11 @@ class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC):
             return None
         logger.info(f"Processing {self.filename}")
 
-        isd_elems_raw = self.partition_file(partition_config=partition_config, **partition_kwargs)
-        isd_elems = convert_to_dict(isd_elems_raw)
+        elements = self.partition_file(partition_config=partition_config, **partition_kwargs)
+        element_dicts = elements_to_dicts(elements)
 
         self.isd_elems_no_filename: t.List[t.Dict[str, t.Any]] = []
-        for elem in isd_elems:
-            # type: ignore
+        for elem in element_dicts:
             if partition_config.metadata_exclude and partition_config.metadata_include:
                 raise ValueError(
                     "Arguments `--metadata-include` and `--metadata-exclude` are "
@@ -10,7 +10,7 @@ from unstructured.ingest.interfaces import (
 )
 from unstructured.ingest.logger import logger
 from unstructured.ingest.pipeline.interfaces import ReformatNode
-from unstructured.staging.base import convert_to_dict, elements_from_json
+from unstructured.staging.base import elements_from_json, elements_to_dicts
 
 
 @dataclass
@@ -49,10 +49,10 @@ class Chunker(ReformatNode):
             return str(json_path)
         elements = elements_from_json(filename=elements_json)
         chunked_elements = self.chunking_config.chunk(elements=elements)
-        elements_dict = convert_to_dict(chunked_elements)
+        element_dicts = elements_to_dicts(chunked_elements)
         with open(json_path, "w", encoding="utf8") as output_f:
             logger.info(f"writing chunking content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
         return str(json_path)
     except Exception as e:
         if self.pipeline_context.raise_on_error:
@@ -10,7 +10,7 @@ from unstructured.ingest.interfaces import (
 )
 from unstructured.ingest.logger import logger
 from unstructured.ingest.pipeline.interfaces import ReformatNode
-from unstructured.staging.base import convert_to_dict, elements_from_json
+from unstructured.staging.base import elements_from_json, elements_to_dicts
 
 
 @dataclass
@@ -50,10 +50,10 @@ class Embedder(ReformatNode):
         elements = elements_from_json(filename=elements_json)
         embedder = self.embedder_config.get_embedder()
         embedded_elements = embedder.embed_documents(elements=elements)
-        elements_dict = convert_to_dict(embedded_elements)
+        element_dicts = elements_to_dicts(embedded_elements)
         with open(json_path, "w", encoding="utf8") as output_f:
             logger.info(f"writing embeddings content to {json_path}")
-            json.dump(elements_dict, output_f, ensure_ascii=False, indent=2)
+            json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
         return str(json_path)
     except Exception as e:
         if self.pipeline_context.raise_on_error:
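Both the Chunker and Embedder nodes above follow the same read-transform-write serde pattern; a minimal standalone sketch of that pattern (the file paths and the identity "transform" are illustrative stand-ins):

```python
import json

from unstructured.staging.base import elements_from_json, elements_to_dicts

# read elements from the upstream node's JSON output
elements = elements_from_json(filename="in.json")  # hypothetical input path

# stand-in for chunking_config.chunk() or embedder.embed_documents()
transformed = elements

# write the transformed elements back out as element-dicts
with open("out.json", "w", encoding="utf8") as output_f:  # hypothetical path
    json.dump(elements_to_dicts(transformed), output_f, ensure_ascii=False, indent=2)
```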
@@ -13,7 +13,7 @@ from unstructured_client.models import shared
 from unstructured.documents.elements import Element
 from unstructured.logger import logger
 from unstructured.partition.common import exactly_one
-from unstructured.staging.base import dict_to_elements, elements_from_json
+from unstructured.staging.base import elements_from_dicts, elements_from_json
 
 
 def partition_via_api(
@@ -214,7 +214,7 @@ def partition_multiple_via_api(
             response_list = [response_list]
 
         for document in response_list:
-            documents.append(dict_to_elements(document))
+            documents.append(elements_from_dicts(document))
         return documents
     else:
         raise ValueError(
@@ -24,7 +24,7 @@ from unstructured.partition.common import (
     get_last_modified_date,
     get_last_modified_date_from_file,
 )
-from unstructured.staging.base import dict_to_elements
+from unstructured.staging.base import elements_from_dicts
 
 
 @process_metadata()
@@ -86,8 +86,8 @@ def partition_json(
     )
 
     try:
-        dict = json.loads(file_text)
-        elements = dict_to_elements(dict)
+        element_dicts = json.loads(file_text)
+        elements = elements_from_dicts(element_dicts)
     except json.JSONDecodeError:
         raise ValueError("Not a valid json")
 
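The partition_json() change only swaps in the renamed deserializer; behavior is unchanged. A hedged sketch of the round trip it supports (the output path is illustrative, and since partitioning adds metadata of its own, no element-for-element equality is asserted here):

```python
from unstructured.documents.elements import Title
from unstructured.partition.json import partition_json
from unstructured.staging.base import elements_to_json

elements = [Title(text="Title 1")]
elements_to_json(elements, filename="elements.json")  # hypothetical path

# partition_json() now deserializes via elements_from_dicts(); input that is
# not valid JSON raises ValueError("Not a valid json")
rehydrated = partition_json(filename="elements.json")
```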
@@ -1,9 +1,11 @@
+from __future__ import annotations
+
 import csv
 import io
 import json
 from copy import deepcopy
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Iterable, Optional, Sequence, cast
 
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import (
@@ -14,13 +16,128 @@ from unstructured.documents.elements import (
     NoID,
 )
 from unstructured.partition.common import exactly_one
-from unstructured.utils import dependency_exists, requires_dependencies
+from unstructured.utils import Point, dependency_exists, requires_dependencies
 
 if dependency_exists("pandas"):
     import pandas as pd
 
 
-def _get_metadata_table_fieldnames():
+# ================================================================================================
+# SERIALIZATION/DESERIALIZATION (SERDE) RELATED FUNCTIONS
+# ================================================================================================
+# These serde functions will likely relocate to `unstructured.documents.elements` since they are
+# so closely related to elements and this staging "brick" is deprecated.
+# ================================================================================================
+
+# == DESERIALIZERS ===============================
+
+
+def elements_from_dicts(element_dicts: Iterable[dict[str, Any]]) -> list[Element]:
+    """Convert a list of element-dicts to a list of elements."""
+    elements: list[Element] = []
+
+    for item in element_dicts:
+        element_id: str = item.get("element_id", NoID())
+        metadata = (
+            ElementMetadata()
+            if item.get("metadata") is None
+            else ElementMetadata.from_dict(item["metadata"])
+        )
+
+        if item.get("type") in TYPE_TO_TEXT_ELEMENT_MAP:
+            ElementCls = TYPE_TO_TEXT_ELEMENT_MAP[item["type"]]
+            elements.append(ElementCls(text=item["text"], element_id=element_id, metadata=metadata))
+        elif item.get("type") == "CheckBox":
+            elements.append(
+                CheckBox(checked=item["checked"], element_id=element_id, metadata=metadata)
+            )
+
+    return elements
+
+
+# -- legacy aliases for elements_from_dicts() --
+isd_to_elements = elements_from_dicts
+dict_to_elements = elements_from_dicts
+
+
+def elements_from_json(
+    filename: str = "", text: str = "", encoding: str = "utf-8"
+) -> list[Element]:
+    """Loads a list of elements from a JSON file or a string."""
+    exactly_one(filename=filename, text=text)
+
+    if filename:
+        with open(filename, encoding=encoding) as f:
+            element_dicts = json.load(f)
+    else:
+        element_dicts = json.loads(text)
+
+    return elements_from_dicts(element_dicts)
+
+
+# == SERIALIZERS =================================
+
+
+def elements_to_dicts(elements: Iterable[Element]) -> list[dict[str, Any]]:
+    """Convert document elements to element-dicts."""
+    return [e.to_dict() for e in elements]
+
+
+# -- legacy aliases for elements_to_dicts() --
+convert_to_isd = elements_to_dicts
+convert_to_dict = elements_to_dicts
+
+
+def elements_to_json(
+    elements: Iterable[Element],
+    filename: Optional[str] = None,
+    indent: int = 4,
+    encoding: str = "utf-8",
+) -> Optional[str]:
+    """Saves a list of elements to a JSON file if filename is specified.
+
+    Otherwise, return the list of elements as a string.
+    """
+    # -- serialize `elements` as a JSON array (str) --
+    precision_adjusted_elements = _fix_metadata_field_precision(elements)
+    element_dicts = elements_to_dicts(precision_adjusted_elements)
+    json_str = json.dumps(element_dicts, indent=indent, sort_keys=True)
+
+    if filename is not None:
+        with open(filename, "w", encoding=encoding) as f:
+            f.write(json_str)
+        return None
+
+    return json_str
+
+
+def _fix_metadata_field_precision(elements: Iterable[Element]) -> list[Element]:
+    out_elements: list[Element] = []
+    for element in elements:
+        el = deepcopy(element)
+        if el.metadata.coordinates:
+            precision = 1 if isinstance(el.metadata.coordinates.system, PixelSpace) else 2
+            points = el.metadata.coordinates.points
+            assert points is not None
+            rounded_points: list[Point] = []
+            for point in points:
+                x, y = point
+                rounded_point = (round(x, precision), round(y, precision))
+                rounded_points.append(rounded_point)
+            el.metadata.coordinates.points = tuple(rounded_points)
+
+        if el.metadata.detection_class_prob:
+            el.metadata.detection_class_prob = round(el.metadata.detection_class_prob, 5)
+
+        out_elements.append(el)
+
+    return out_elements
+
+
+# ================================================================================================
+
+
+def _get_metadata_table_fieldnames() -> list[str]:
     metadata_fields = list(ElementMetadata.__annotations__.keys())
     metadata_fields.remove("coordinates")
     metadata_fields.extend(
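As a quick check on the compatibility story above: the legacy names are now plain aliases, i.e. bindings to the same function objects rather than wrapper functions (a sketch):

```python
from unstructured.staging.base import (
    convert_to_dict,
    convert_to_isd,
    dict_to_elements,
    elements_from_dicts,
    elements_to_dicts,
    isd_to_elements,
)

# module-level assignments in the hunk above make these identical objects
assert convert_to_isd is elements_to_dicts
assert convert_to_dict is elements_to_dicts
assert isd_to_elements is elements_from_dicts
assert dict_to_elements is elements_from_dicts
```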
@@ -35,27 +152,25 @@ def _get_metadata_table_fieldnames():
     return metadata_fields
 
 
-TABLE_FIELDNAMES: List[str] = [
+TABLE_FIELDNAMES: list[str] = [
     "type",
     "text",
     "element_id",
 ] + _get_metadata_table_fieldnames()
 
 
-def convert_to_text(elements: List[Element]) -> str:
-    """Converts a list of elements into clean, concatenated text."""
+def convert_to_text(elements: Iterable[Element]) -> str:
+    """Convert elements into clean, concatenated text."""
     return "\n".join([e.text for e in elements if hasattr(e, "text") and e.text])
 
 
 def elements_to_text(
-    elements: List[Element],
-    filename: Optional[str] = None,
-    encoding: str = "utf-8",
+    elements: Iterable[Element], filename: Optional[str] = None, encoding: str = "utf-8"
 ) -> Optional[str]:
-    """
-    Convert the text from the list of elements into clean, concatenated text.
-    Saves to a txt file if filename is specified.
-    Otherwise, return the text of the elements as a string.
+    """Convert text from each of `elements` into clean, concatenated text.
+
+    Saves to a txt file if filename is specified. Otherwise, return the text of the elements as a
+    string.
     """
     element_cct = convert_to_text(elements)
     if filename is not None:
@@ -66,130 +181,23 @@ def elements_to_text(
     return element_cct
 
 
-def convert_to_isd(elements: List[Element]) -> List[Dict[str, Any]]:
-    """Represents the document elements as an Initial Structured Document (ISD)."""
-    isd: List[Dict[str, Any]] = []
-    for element in elements:
-        section = element.to_dict()
-        isd.append(section)
-    return isd
-
-
-def convert_to_dict(elements: List[Element]) -> List[Dict[str, Any]]:
-    """Converts a list of elements into a dictionary."""
-    return convert_to_isd(elements)
-
-
-def _fix_metadata_field_precision(elements: List[Element]) -> List[Element]:
-    out_elements = []
-    for element in elements:
-        el = deepcopy(element)
-        if el.metadata.coordinates:
-            precision = 1 if isinstance(el.metadata.coordinates.system, PixelSpace) else 2
-            points = el.metadata.coordinates.points
-            rounded_points = []
-            for point in points:
-                x, y = point
-                rounded_point = (round(x, precision), round(y, precision))
-                rounded_points.append(rounded_point)
-            el.metadata.coordinates.points = tuple(rounded_points)
-
-        if el.metadata.detection_class_prob:
-            el.metadata.detection_class_prob = round(el.metadata.detection_class_prob, 5)
-
-        out_elements.append(el)
-    return out_elements
-
-
-def elements_to_json(
-    elements: List[Element],
-    filename: Optional[str] = None,
-    indent: int = 4,
-    encoding: str = "utf-8",
-) -> Optional[str]:
-    """
-    Saves a list of elements to a JSON file if filename is specified.
-    Otherwise, return the list of elements as a string.
-    """
-
-    pre_processed_elements = _fix_metadata_field_precision(elements)
-    element_dict = convert_to_dict(pre_processed_elements)
-    if filename is not None:
-        with open(filename, "w", encoding=encoding) as f:
-            json.dump(element_dict, f, indent=indent, sort_keys=True)
-            return None
-    else:
-        return json.dumps(element_dict, indent=indent, sort_keys=True)
-
-
-def isd_to_elements(isd: List[Dict[str, Any]]) -> List[Element]:
-    """Converts an Initial Structured Data (ISD) dictionary to a list of elements."""
-    elements: List[Element] = []
-
-    for item in isd:
-        element_id: str = item.get("element_id", NoID())
-        metadata = ElementMetadata()
-        _metadata_dict = item.get("metadata")
-        if _metadata_dict is not None:
-            metadata = ElementMetadata.from_dict(_metadata_dict)
-
-        if item.get("type") in TYPE_TO_TEXT_ELEMENT_MAP:
-            _text_class = TYPE_TO_TEXT_ELEMENT_MAP[item["type"]]
-            elements.append(
-                _text_class(
-                    text=item["text"],
-                    element_id=element_id,
-                    metadata=metadata,
-                ),
-            )
-        elif item.get("type") == "CheckBox":
-            elements.append(
-                CheckBox(
-                    checked=item["checked"],
-                    element_id=element_id,
-                    metadata=metadata,
-                ),
-            )
-
-    return elements
-
-
-def dict_to_elements(element_dict: List[Dict[str, Any]]) -> List[Element]:
-    """Converts a dictionary representation of an element list into List[Element]."""
-    return isd_to_elements(element_dict)
-
-
-def elements_from_json(
-    filename: str = "",
-    text: str = "",
-    encoding: str = "utf-8",
-) -> List[Element]:
-    """Loads a list of elements from a JSON file or a string."""
-    exactly_one(filename=filename, text=text)
-
-    if filename:
-        with open(filename, encoding=encoding) as f:
-            element_dict = json.load(f)
-        return dict_to_elements(element_dict)
-    else:
-        element_dict = json.loads(text)
-        return dict_to_elements(element_dict)
-
-
 def flatten_dict(
-    dictionary,
-    parent_key="",
-    separator="_",
-    flatten_lists=False,
-    remove_none=False,
-    keys_to_omit: List[str] = None,
-):
-    """Flattens a nested dictionary into a single level dictionary. keys_to_omit is a list of keys
-    that don't get flattened. If omitting a nested key, format as {parent_key}{separator}{key}.
-    If flatten_lists is True, then lists and tuples are flattened as well.
-    If remove_none is True, then None keys/values are removed from the flattened dictionary."""
+    dictionary: dict[str, Any],
+    parent_key: str = "",
+    separator: str = "_",
+    flatten_lists: bool = False,
+    remove_none: bool = False,
+    keys_to_omit: Optional[Sequence[str]] = None,
+) -> dict[str, Any]:
+    """Flattens a nested dictionary into a single level dictionary.
+
+    keys_to_omit is a list of keys that don't get flattened. If omitting a nested key, format as
+    {parent_key}{separator}{key}. If flatten_lists is True, then lists and tuples are flattened as
+    well. If remove_none is True, then None keys/values are removed from the flattened
+    dictionary.
+    """
     keys_to_omit = keys_to_omit if keys_to_omit else []
-    flattened_dict = {}
+    flattened_dict: dict[str, Any] = {}
     for key, value in dictionary.items():
         new_key = f"{parent_key}{separator}{key}" if parent_key else key
         if new_key in keys_to_omit:
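A small usage sketch of flatten_dict() as described by the reworked docstring above (the input values are illustrative):

```python
from unstructured.staging.base import flatten_dict

nested = {"a": 1, "b": {"c": 2, "d": [3, 4]}, "e": None}

# nested keys are joined with the separator; list items get per-index keys
# when flatten_lists=True; None values drop out when remove_none=True
assert flatten_dict(nested, flatten_lists=True, remove_none=True) == {
    "a": 1,
    "b_c": 2,
    "b_d_0": 3,
    "b_d_1": 4,
}

# keys_to_omit leaves the named subtree unflattened under its flattened name
assert flatten_dict(nested, keys_to_omit=["b"]) == {
    "a": 1,
    "b": {"c": 2, "d": [3, 4]},
    "e": None,
}
```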
@@ -197,12 +205,14 @@ def flatten_dict(
         elif value is None and remove_none:
             continue
         elif isinstance(value, dict):
+            value = cast("dict[str, Any]", value)
             flattened_dict.update(
                 flatten_dict(
                     value, new_key, separator, flatten_lists, remove_none, keys_to_omit=keys_to_omit
                 ),
             )
         elif isinstance(value, (list, tuple)) and flatten_lists:
+            value = cast("list[Any] | tuple[Any]", value)
             for index, item in enumerate(value):
                 flattened_dict.update(
                     flatten_dict(
|
||||
)
|
||||
else:
|
||||
flattened_dict[new_key] = value
|
||||
|
||||
return flattened_dict
|
||||
|
||||
|
||||
def _get_table_fieldnames(rows):
|
||||
def _get_table_fieldnames(rows: list[dict[str, Any]]):
|
||||
table_fieldnames = list(TABLE_FIELDNAMES)
|
||||
for row in rows:
|
||||
metadata = row["metadata"]
|
||||
@@ -229,12 +240,9 @@ def _get_table_fieldnames(rows):
     return table_fieldnames
 
 
-def convert_to_isd_csv(elements: List[Element]) -> str:
-    """
-    Returns the representation of document elements as an Initial Structured Document (ISD)
-    in CSV Format.
-    """
-    rows: List[Dict[str, Any]] = convert_to_isd(elements)
+def convert_to_csv(elements: Iterable[Element]) -> str:
+    """Convert `elements` to CSV format."""
+    rows: list[dict[str, Any]] = elements_to_dicts(elements)
     table_fieldnames = _get_table_fieldnames(rows)
     # NOTE(robinson) - flatten metadata and add it to the table
     for row in rows:
@@ -255,55 +263,54 @@ def convert_to_isd_csv(elements: List[Element]) -> str:
     return buffer.getvalue()
 
 
-def convert_to_csv(elements: List[Element]) -> str:
-    """Converts a list of elements to a CSV."""
-    return convert_to_isd_csv(elements)
+# -- legacy alias for convert_to_csv --
+convert_to_isd_csv = convert_to_csv
 
 
 @requires_dependencies(["pandas"])
-def get_default_pandas_dtypes() -> dict:
+def get_default_pandas_dtypes() -> dict[str, Any]:
     return {
-        "text": pd.StringDtype(),
-        "type": pd.StringDtype(),
-        "element_id": pd.StringDtype(),
-        "filename": pd.StringDtype(),  # Optional[str]
-        "filetype": pd.StringDtype(),  # Optional[str]
-        "file_directory": pd.StringDtype(),  # Optional[str]
-        "last_modified": pd.StringDtype(),  # Optional[str]
-        "attached_to_filename": pd.StringDtype(),  # Optional[str]
-        "parent_id": pd.StringDtype(),  # Optional[str],
+        "text": pd.StringDtype(),  # type: ignore
+        "type": pd.StringDtype(),  # type: ignore
+        "element_id": pd.StringDtype(),  # type: ignore
+        "filename": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "filetype": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "file_directory": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "last_modified": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "attached_to_filename": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "parent_id": pd.StringDtype(),  # Optional[str],  # type: ignore
         "category_depth": "Int64",  # Optional[int]
-        "image_path": pd.StringDtype(),  # Optional[str]
-        "languages": object,  # Optional[List[str]]
+        "image_path": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "languages": object,  # Optional[list[str]]
         "page_number": "Int64",  # Optional[int]
-        "page_name": pd.StringDtype(),  # Optional[str]
-        "url": pd.StringDtype(),  # Optional[str]
-        "link_urls": pd.StringDtype(),  # Optional[str]
-        "link_texts": object,  # Optional[List[str]]
+        "page_name": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "url": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "link_urls": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "link_texts": object,  # Optional[list[str]]
         "links": object,
-        "sent_from": object,  # Optional[List[str]],
-        "sent_to": object,  # Optional[List[str]]
-        "subject": pd.StringDtype(),  # Optional[str]
-        "section": pd.StringDtype(),  # Optional[str]
-        "header_footer_type": pd.StringDtype(),  # Optional[str]
-        "emphasized_text_contents": object,  # Optional[List[str]]
-        "emphasized_text_tags": object,  # Optional[List[str]]
-        "text_as_html": pd.StringDtype(),  # Optional[str]
+        "sent_from": object,  # Optional[list[str]],
+        "sent_to": object,  # Optional[list[str]]
+        "subject": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "section": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "header_footer_type": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "emphasized_text_contents": object,  # Optional[list[str]]
+        "emphasized_text_tags": object,  # Optional[list[str]]
+        "text_as_html": pd.StringDtype(),  # Optional[str]  # type: ignore
         "regex_metadata": object,
         "max_characters": "Int64",  # Optional[int]
         "is_continuation": "boolean",  # Optional[bool]
         "detection_class_prob": float,  # Optional[float],
-        "sender": pd.StringDtype(),
+        "sender": pd.StringDtype(),  # type: ignore
         "coordinates_points": object,
-        "coordinates_system": pd.StringDtype(),
+        "coordinates_system": pd.StringDtype(),  # type: ignore
         "coordinates_layout_width": float,
         "coordinates_layout_height": float,
-        "data_source_url": pd.StringDtype(),  # Optional[str]
-        "data_source_version": pd.StringDtype(),  # Optional[str]
+        "data_source_url": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "data_source_version": pd.StringDtype(),  # Optional[str]  # type: ignore
         "data_source_record_locator": object,
-        "data_source_date_created": pd.StringDtype(),  # Optional[str]
-        "data_source_date_modified": pd.StringDtype(),  # Optional[str]
-        "data_source_date_processed": pd.StringDtype(),  # Optional[str]
+        "data_source_date_created": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "data_source_date_modified": pd.StringDtype(),  # Optional[str]  # type: ignore
+        "data_source_date_processed": pd.StringDtype(),  # Optional[str]  # type: ignore
        "data_source_permissions_data": object,
        "embeddings": object,
        "regex_metadata_key": object,
@@ -312,44 +319,41 @@ def get_default_pandas_dtypes() -> dict:
 
 @requires_dependencies(["pandas"])
 def convert_to_dataframe(
-    elements: List[Element],
-    drop_empty_cols: bool = True,
-    set_dtypes=False,
+    elements: Iterable[Element], drop_empty_cols: bool = True, set_dtypes: bool = False
 ) -> "pd.DataFrame":
-    """Converts document elements to a pandas DataFrame. The dataframe contains the
-    following columns:
+    """Convert `elements` to a pandas DataFrame.
+
+    The dataframe contains the following columns:
         text: the element text
         type: the text type (NarrativeText, Title, etc)
 
     Output is pd.DataFrame
     """
-    elements_as_dict = convert_to_dict(elements)
-    for d in elements_as_dict:
+    element_dicts = elements_to_dicts(elements)
+    for d in element_dicts:
         if metadata := d.pop("metadata", None):
             d.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
-    df = pd.DataFrame.from_dict(
-        elements_as_dict,
-    )
+    df = pd.DataFrame.from_dict(element_dicts)  # type: ignore
     if set_dtypes:
         dt = {k: v for k, v in get_default_pandas_dtypes().items() if k in df.columns}
-        df = df.astype(dt)
+        df = df.astype(dt)  # type: ignore
     if drop_empty_cols:
-        df.dropna(axis=1, how="all", inplace=True)
+        df.dropna(axis=1, how="all", inplace=True)  # type: ignore
     return df
 
 
 def filter_element_types(
-    elements: List[Element],
-    include_element_types: Optional[List[Element]] = None,
-    exclude_element_types: Optional[List[Element]] = None,
-) -> List[Element]:
+    elements: Iterable[Element],
+    include_element_types: Optional[Sequence[type[Element]]] = None,
+    exclude_element_types: Optional[Sequence[type[Element]]] = None,
+) -> list[Element]:
     """Filters document elements by element type"""
     exactly_one(
         include_element_types=include_element_types,
         exclude_element_types=exclude_element_types,
     )
 
-    filtered_elements: List[Element] = []
+    filtered_elements: list[Element] = []
     if include_element_types:
         for element in elements:
             if type(element) in include_element_types:
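For context, a short sketch of driving the DataFrame conversion typed above (requires the optional pandas dependency; the elements are illustrative):

```python
from unstructured.documents.elements import NarrativeText, Title
from unstructured.staging.base import convert_to_dataframe

elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

# one row per element; metadata is flattened into columns and all-empty
# columns are dropped by default; set_dtypes=True applies the mapping from
# get_default_pandas_dtypes()
df = convert_to_dataframe(elements, set_dtypes=True)
assert list(df["type"]) == ["Title", "NarrativeText"]
assert list(df["text"]) == ["Title 1", "Narrative 1"]
```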
@@ -364,16 +368,18 @@ def filter_element_types(
 
     return filtered_elements
 
-    return elements
+    return list(elements)
 
 
 def convert_to_coco(
-    elements: List[Element],
+    elements: Iterable[Element],
     dataset_description: Optional[str] = None,
     dataset_version: str = "1.0",
-    contributors: Tuple[str] = ("Unstructured Developers",),
-) -> List[Dict[str, Any]]:
-    coco_dataset = {}
+    contributors: tuple[str] = ("Unstructured Developers",),
+) -> dict[str, Any]:
+    from unstructured.documents.elements import TYPE_TO_TEXT_ELEMENT_MAP
+
+    coco_dataset: dict[str, Any] = {}
     # Handle Info
     coco_dataset["info"] = {
         "description": (
@@ -386,7 +392,7 @@ def convert_to_coco(
         "contributors": ",".join(contributors),
         "date_created": datetime.now().date().isoformat(),
     }
-    elements_dict = convert_to_dict(elements)
+    element_dicts = elements_to_dicts(elements)
     # Handle Images
     images = [
         {
@@ -404,7 +410,7 @@ def convert_to_coco(
             "file_name": el["metadata"].get("filename", ""),
             "page_number": el["metadata"].get("page_number", ""),
         }
-        for el in elements_dict
+        for el in element_dicts
     ]
     images = list({tuple(sorted(d.items())): d for d in images}.values())
     for index, d in enumerate(images):
@@ -458,7 +464,7 @@ def convert_to_coco(
                 else None
             ),
         }
-        for el in elements_dict
+        for el in element_dicts
     ]
     coco_dataset["annotations"] = annotations
     return coco_dataset
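Finally, a minimal sketch of the COCO conversion whose typing is tightened above (the element is illustrative; this assumes elements without coordinate metadata are tolerated, which the `.get()` defaults in the hunks above suggest):

```python
from unstructured.documents.elements import Title
from unstructured.staging.base import convert_to_coco

coco = convert_to_coco([Title(text="Title 1")])

# the return value is a single COCO-style dict, not a list of dicts --
# hence the "-> dict[str, Any]" annotation in the signature above
assert {"info", "images", "annotations"} <= set(coco)
```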