Dynamic ElementMetadata implementation (#2043)
### Executive Summary
The structure of element metadata is currently static: only predefined
fields can appear in the metadata. We would like end-users to have the
flexibility, at their own discretion, to define and use additional
metadata fields that make sense for their particular use case.
### Concepts
A key concept for dynamic metadata is the _known field_. A known field is
one explicitly defined on `ElementMetadata`. Each of these has a type and
can be specified when _constructing_ a new `ElementMetadata` instance.
This is in contrast to an _end-user defined_ (or _ad-hoc_) metadata field:
one not known at "compile" time and added at the discretion of an end-user
to suit the purposes of their application. An ad-hoc field can only be
added by _assignment_ on an already-constructed instance.
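To make the distinction concrete (a sketch; `category_depth` is a known field, while `coefficient` stands in for an arbitrary end-user field):
```python
>>> metadata = ElementMetadata(category_depth=1)  # known field -- valid constructor argument
>>> metadata.coefficient = 0.536                  # ad-hoc field -- added by assignment
>>> ElementMetadata(coefficient=0.536)            # raises TypeError: unexpected keyword argument
```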
### End-user ad-hoc metadata field behaviors
An ad-hoc field can be added to an `ElementMetadata` instance by
assignment:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
```
A field added in this way can be accessed by name:
```python
>>> metadata.coefficient
0.536
```
and that field will appear in the JSON/dict for that instance:
```python
>>> metadata = ElementMetadata()
>>> metadata.coefficient = 0.536
>>> metadata.to_dict()
{"coefficient": 0.536}
```
However, accessing a "user-defined" value that has _not_ been assigned
on that instance raises `AttributeError`:
```python
>>> metadata.coeffcient # -- misspelled "coefficient" --
AttributeError: 'ElementMetadata' object has no attribute 'coeffcient'
```
This makes "tagging" a metadata item with a value very convenient, but it
comes with a proviso: if an end-user wants to add a metadata field to
_some_ elements and not others (sparse population), AND they want to
access that field by name on ANY element and receive `None` where it has
not been assigned, they will need to use an expression like this:
```python
coefficient = metadata.coefficient if hasattr(metadata, "coefficient") else None
```
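An equivalent form uses plain-Python `getattr()` with a default (standard Python behavior, nothing specific to `ElementMetadata` is assumed here):
```python
coefficient = getattr(metadata, "coefficient", None)
```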
### Implementation Notes
- **ad-hoc metadata fields** are discarded during consolidation (for
chunking) because we don't have a consolidation strategy defined for
them. We could consider using a default consolidation strategy like
`FIRST` (sketched just after this list), or possibly allow a user to
register a strategy (although that gets hairy in non-private and
multiple-memory-space situations).
- ad-hoc metadata fields **cannot start with an underscore**.
- We have no way to distinguish an ad-hoc field from any "noise" fields
that might appear in a JSON/dict loaded using `.from_dict()`, so unlike
the original implementation (which loaded only known fields), we'll
rehydrate anything we find there.
- No real type safety is possible on ad-hoc fields, but the type-checker
does not complain because the type of all ad-hoc fields is `Any` (which
is the best available behavior in my view).
- We may want to consider whether end-users should be able to add ad-hoc
fields to "sub" metadata objects too, like `DataSourceMetadata` and
conceivably `CoordinatesMetadata` (although I'm not immediately seeing a
use case for the second one).
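For reference, this is roughly what a `FIRST`-style default would mean for a sparsely-populated ad-hoc field. A minimal sketch in plain Python, not the library's `ConsolidationStrategy` API:
```python
from typing import Any, List, Optional


def consolidate_first(values: List[Optional[Any]]) -> Optional[Any]:
    """Keep the first non-None value seen across a chunk's elements, else None."""
    return next((value for value in values if value is not None), None)


# e.g. an ad-hoc "coefficient" assigned on only one of three elements:
assert consolidate_first([None, 0.62, None]) == 0.62
assert consolidate_first([None, None, None]) is None
```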
Parent commit: `d7a280402f`; this commit: `252405c780`.
### Changes in this commit

- `.github/workflows/ci.yml`: the `test_dockerfile` job now runs on `ubuntu-latest-m` instead of `ubuntu-latest`.
- Changelog: the heading is bumped from `## 0.10.31-dev5` to `## 0.11.0-dev1` and a Features entry is added: "**Add ad-hoc fields to ElementMetadata instance.** End-users can now add their own metadata fields simply by assigning to an element-metadata attribute-name of their choice, like `element.metadata.coefficient = 0.58`. These fields will round-trip through JSON and can be accessed with dotted notation." Existing entries cover temporary paddle `DEFAULT_PADDLE_LANG` support and a fix preventing the `fast` strategy from falling back to `ocr_only`.
- Test configuration: a `chipper` pytest marker is registered ("mark a test as running chipper, which tends to be slow and compute-heavy").
- Chunking tests: `test_chunk_by_title` is updated, and a new `Describe_TextSection` test, `but_it_discards_ad_hoc_metadata_fields_during_consolidation`, verifies that ad-hoc fields (`coefficient`, `quotient`) assigned on element metadata do not appear in `_TextSection._all_metadata_values`.
- Tests for `unstructured.documents.elements`: a new `DescribeElementMetadata` suite covers constructor behavior (an unknown keyword argument raises `TypeError`), acceptance of `pathlib.Path` or `str` for `filename` and splitting of `file_directory` from a file path, sparse storage (a known field is stored only when non-`None`; assigning `None` removes it; unpopulated known fields read as `None`), `AttributeError` for unknown attributes, `to_dict()`/`from_dict()` round-tripping including the `CoordinatesMetadata` and `DataSourceMetadata` sub-objects (whose unknown fields are ignored, unlike on `ElementMetadata` itself), ad-hoc field assignment, serialization, and removal, `.update()` semantics for known and ad-hoc fields, equality rules, and the presence of a consolidation strategy for every known field. The old `test_metadata_from_dict_extra_fields` (which asserted unknown top-level fields are dropped) is replaced by `but_unlike_in_ElementMetadata_unknown_fields_in_sub_objects_are_ignored`, reflecting the new rehydrate-everything behavior at the top level, and the module-level consolidation-strategy test becomes a suite method driven by `metadata._known_field_names`.
- `partition_email` tests: calls to `metadata.get_last_modified()` are replaced with the new `parse_optional_datetime(metadata.last_modified)` helper.
- `test_unstructured/unit_utils.py`: `assert_round_trips_through_JSON()` now reports a readable diff on failure via a new `_diff()` helper, and `parse_optional_datetime()` is added.
- `unstructured/__version__.py`: `0.10.31-dev5` → `0.11.0-dev1`.
- `unstructured.chunking.title`: `_TextSection` iterates `metadata.known_fields.items()` rather than `vars(metadata).items()` when gathering metadata values, which is what excludes ad-hoc fields from consolidation.
- `unstructured.documents.elements`: `ElementMetadata` is rewritten from a dataclass into a fully-dynamic class. Known fields are declared as class-level annotations and assigned in an explicit `__init__()`; `__getattr__()` returns `None` for an unpopulated known field and raises `AttributeError` otherwise; `__setattr__()` removes a field when `None` is assigned and silently drops debug fields unless `UNSTRUCTURED_INCLUDE_DEBUG_METADATA` is set; read-only `.fields` and `.known_fields` mappings expose populated fields; `.to_dict()` produces a sparse dict (no `None` values or empty lists, debug fields excluded, sub-objects serialized); `.from_dict()` rehydrates every field it finds, including the sub-objects; `.update()` replaces the old `.merge()` and raises `ValueError` for a non-`ElementMetadata` argument; `__eq__()` compares `.fields` and requires the same type; `DEBUG_FIELD_NAMES` (currently just `detection_origin`) are excluded from serialization and equality. `Element.__init__()` is simplified to assign coordinates metadata directly rather than merging, `DataSourceMetadata.from_dict()` and `CoordinatesMetadata.from_dict()` gain type annotations, and `CoordinatesMetadata.__eq__()` now checks `isinstance` rather than testing for `None`.
- HTML partitioning: `_process_leaf_table_item()` uses `rows = body.findall("tr") if body is not None else []` (explicit `is not None` test on the `tbody` element).
- `_add_element_metadata()` and `ocr_data_to_elements()`: element metadata is combined with `element.metadata.update(...)` instead of `metadata.merge(...)`, and `detection_origin` is set only when provided.
- `unstructured.staging.base`: `elements_to_json()` serializes with `sort_keys=True`.
- `unstructured.utils`: the `Text` import moves under `TYPE_CHECKING` and `catch_overlapping_and_nested_bboxes()` annotates `elements: List["Text"]`.
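The heart of the `ElementMetadata` rewrite is the pair of attribute hooks described above. A minimal sketch of that mechanism, in plain Python and deliberately not the library's exact code:
```python
# Illustrative sketch: known fields are class annotations; anything else
# assigned on an instance becomes an ad-hoc field.
from typing import Any, FrozenSet, Optional


class MetadataSketch:
    category_depth: Optional[int]  # a "known" field

    _KNOWN_FIELD_NAMES: FrozenSet[str] = frozenset(["category_depth"])

    def __getattr__(self, name: str) -> None:
        # Called only when the attribute is not in __dict__: unpopulated known
        # fields read as None; anything else is a genuine AttributeError.
        if name in self._KNOWN_FIELD_NAMES:
            return None
        raise AttributeError(f"'MetadataSketch' object has no attribute '{name}'")

    def __setattr__(self, name: str, value: Any) -> None:
        # Assigning None removes the field so serialization stays sparse.
        if value is None:
            self.__dict__.pop(name, None)
            return
        super().__setattr__(name, value)


meta = MetadataSketch()
assert meta.category_depth is None   # unpopulated known field reads as None
meta.coefficient = 0.536             # ad-hoc field added by assignment
assert meta.coefficient == 0.536
meta.coefficient = None              # assigning None removes it again
```
The real class additionally handles debug fields, sub-object serialization, and a known-field registry derived from its `__annotations__`.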