Hubert Rutkowski b8d894f963
feat/Move the category field to Element (#3056)
It's a pretty basic change: the category field was literally just moved to
the Element class. I can't think of other changes that are needed here,
because I think pretty much everything expected the category to be
directly in the elements list.

For local testing, IDEs and linters should see the difference in that
`category` is now in Element.
2024-05-23 10:43:26 +00:00

1096 lines
39 KiB
Python

from __future__ import annotations
import abc
import copy
import dataclasses as dc
import enum
import functools
import hashlib
import os
import pathlib
import re
import uuid
from itertools import groupby
from types import MappingProxyType
from typing import Any, Callable, FrozenSet, Optional, Sequence, cast
from typing_extensions import ParamSpec, TypeAlias, TypedDict
from unstructured.documents.coordinates import (
TYPE_TO_COORDINATE_SYSTEM_MAP,
CoordinateSystem,
RelativeCoordinateSystem,
)
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
# -- a single (x, y) coordinate pair --
Point: TypeAlias = "tuple[float, float]"
# -- an arbitrary-length sequence of points, as stored in `CoordinatesMetadata.points` --
Points: TypeAlias = "tuple[Point, ...]"
@dc.dataclass
class DataSourceMetadata:
    """Metadata describing the data source a document came from.

    Every field is optional. Fields left as `None` are omitted from the dict form produced by
    `.to_dict()`.
    """

    url: Optional[str] = None
    version: Optional[str] = None
    record_locator: Optional[dict[str, Any]] = None  # Values must be JSON-serializable
    date_created: Optional[str] = None
    date_modified: Optional[str] = None
    date_processed: Optional[str] = None
    permissions_data: Optional[list[dict[str, Any]]] = None

    def to_dict(self):
        """Return the populated (non-None) attributes of this instance as a new dict."""
        populated = {}
        for name, value in vars(self).items():
            if value is not None:
                populated[name] = value
        return populated

    @classmethod
    def from_dict(cls, input_dict: dict[str, Any]):
        """Construct an instance from `input_dict`, ignoring keys that are not declared fields."""
        field_names = {field.name for field in dc.fields(cls)}
        known_items = {key: input_dict[key] for key in input_dict if key in field_names}
        return cls(**known_items)
@dc.dataclass
class CoordinatesMetadata:
    """Location of an element: its points plus the coordinate system they are expressed in."""

    points: Optional[Points]
    system: Optional[CoordinateSystem]

    def __init__(self, points: Optional[Points], system: Optional[CoordinateSystem]):
        """Store `points` and `system`, which must be either both present or both `None`.

        Raises:
            ValueError: when exactly one of `points` and `system` is `None`.
        """
        # -- one is not meaningful without the other --
        if (points is None) != (system is None):
            raise ValueError(
                "Coordinates points should not exist without coordinates system and vice versa.",
            )
        self.points = points
        self.system = system

    def __eq__(self, other: Any) -> bool:
        """True when `other` is a `CoordinatesMetadata` with equal points and equal system."""
        if not isinstance(other, CoordinatesMetadata):
            return False
        same_points = self.points == other.points
        same_system = self.system == other.system
        return bool(same_points and same_system)

    def to_dict(self):
        """Return a dict with keys "points", "system", "layout_width" and "layout_height".

        The system is represented by its class name; all three system-derived values are `None`
        when there is no system.
        """
        coordinate_system = self.system
        if coordinate_system is None:
            system_name = layout_width = layout_height = None
        else:
            system_name = str(coordinate_system.__class__.__name__)
            layout_width = coordinate_system.width
            layout_height = coordinate_system.height
        return {
            "points": self.points,
            "system": system_name,
            "layout_width": layout_width,
            "layout_height": layout_height,
        }

    @classmethod
    def from_dict(cls, input_dict: dict[str, Any]):
        """Construct from a dict shaped like the one produced by `.to_dict()`.

        The system resolves to `None` when its name is missing, or when it is not
        "RelativeCoordinateSystem" and either its name is unknown or a layout dimension is
        missing. The constructor's both-or-neither check still applies to the parsed values.
        """

        def convert_to_points(sequence_of_sequences: Sequence[Sequence[float]]) -> Points:
            # -- `input_dict` may contain a tuple of tuples or a list of lists; a list item is
            # -- converted to a tuple, a tuple item is used as-is and any other item is skipped.
            return tuple(
                cast(Point, tuple(item)) if isinstance(item, list) else cast(Point, item)
                for item in sequence_of_sequences
                if isinstance(item, (list, tuple))
            )

        # -- parse points --
        raw_points = input_dict.get("points")
        points = None if raw_points is None else convert_to_points(raw_points)

        # -- parse system --
        system_name = input_dict.get("system")
        width = input_dict.get("layout_width")
        height = input_dict.get("layout_height")
        if system_name is None:
            system = None
        elif system_name == "RelativeCoordinateSystem":
            system = RelativeCoordinateSystem()
        elif (
            width is not None
            and height is not None
            and system_name in TYPE_TO_COORDINATE_SYSTEM_MAP
        ):
            system = TYPE_TO_COORDINATE_SYSTEM_MAP[system_name](width, height)
        else:
            system = None

        return cls(points=points, system=system)
class RegexMetadata(TypedDict):
    """Metadata that is extracted from a document element via regex."""

    # -- the matched substring of the element text --
    text: str
    # -- start and end offsets of the match within the element text, as given by the
    # -- `span()` of the regex match --
    start: int
    end: int
class Link(TypedDict):
    """Metadata related to extracted links"""

    # -- presumably the visible link text; may be None --
    text: Optional[str]
    url: str
    # -- NOTE(review): presumably the offset of the link within the element text -- confirm --
    start_index: int
class FormKeyOrValue(TypedDict):
    """One side (the key or the value) of a `FormKeyValuePair`."""

    text: str
    layout_element_id: Optional[str]
    # -- optional nested `Text` element; converted to and from its dict form when the
    # -- `key_value_pairs` metadata field is serialized --
    custom_element: Optional[Text]
class FormKeyValuePair(TypedDict):
    """A key paired with its optional value; items of the `key_value_pairs` metadata field."""

    key: FormKeyOrValue
    # -- None when the key has no associated value --
    value: Optional[FormKeyOrValue]
    # -- NOTE(review): presumably the extraction confidence for this pair -- confirm --
    confidence: float
class ElementMetadata:
    """Fully-dynamic replacement for dataclass-based ElementMetadata.

    Only fields holding a non-None value are stored on the instance. Reading a known (declared)
    field that is not populated produces `None`, reading any other missing attribute raises
    `AttributeError`, and assigning `None` to a field removes it. Ad-hoc (undeclared) fields can
    be added by simple assignment.
    """

    # NOTE(scanny): To add a field:
    # - Add the field declaration with type here at the top. This makes it a "known" field and
    #   enables type-checking and completion.
    # - Add a parameter with default for field in __init__() and assign it in __init__() body.
    # - Add a consolidation strategy for the new field in
    #   `ConsolidationStrategy.field_consolidation_strategies()` below. This strategy will be used
    #   to consolidate this new metadata field from each pre-chunk element during chunking.
    # - Add field-name to DEBUG_FIELD_NAMES if it shouldn't appear in dict/JSON or participate in
    #   equality comparison.
    #
    # NOTE: every annotated name in this class body becomes a "known" field (see
    # `_known_field_names`), so class-level constants here must NOT be annotated.

    attached_to_filename: Optional[str]
    category_depth: Optional[int]
    coordinates: Optional[CoordinatesMetadata]
    data_source: Optional[DataSourceMetadata]
    # -- Detection Model Class Probabilities from Unstructured-Inference Hi-Res --
    detection_class_prob: Optional[float]
    # -- DEBUG field, the detection mechanism that emitted this element --
    detection_origin: Optional[str]
    emphasized_text_contents: Optional[list[str]]
    emphasized_text_tags: Optional[list[str]]
    file_directory: Optional[str]
    filename: Optional[str]
    filetype: Optional[str]
    image_path: Optional[str]
    image_base64: Optional[str]
    image_mime_type: Optional[str]
    # -- specific to DOCX which has distinct primary, first-page, and even-page header/footers --
    header_footer_type: Optional[str]
    # -- used in chunks only, when chunk must be split mid-text to fit window --
    is_continuation: Optional[bool]
    key_value_pairs: Optional[list[FormKeyValuePair]]
    languages: Optional[list[str]]
    last_modified: Optional[str]
    link_texts: Optional[list[str]]
    link_urls: Optional[list[str]]
    link_start_indexes: Optional[list[int]]
    links: Optional[list[Link]]
    # -- used in chunks only, allowing access to element(s) chunk was formed from when enabled --
    orig_elements: Optional[list[Element]]
    # -- the worksheet name in XLXS documents --
    page_name: Optional[str]
    # -- page numbers currently supported for DOCX, HTML, PDF, and PPTX documents --
    page_number: Optional[int]
    parent_id: Optional[str]
    # -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
    regex_metadata: Optional[dict[str, list[RegexMetadata]]]

    # -- e-mail specific metadata fields --
    sent_from: Optional[list[str]]
    sent_to: Optional[list[str]]
    subject: Optional[str]
    signature: Optional[str]

    # -- used for Table elements to capture rows/col structure --
    text_as_html: Optional[str]
    table_as_cells: Optional[dict[str, str | int]]
    url: Optional[str]

    # -- debug fields can be assigned and referenced using dotted-notation but are not serialized
    # -- to dict/JSON, do not participate in equality comparison, and are not included in the
    # -- `.fields` dict used by other parts of the library like chunking and weaviate.
    DEBUG_FIELD_NAMES = frozenset(["detection_origin"])

    def __init__(
        self,
        attached_to_filename: Optional[str] = None,
        category_depth: Optional[int] = None,
        coordinates: Optional[CoordinatesMetadata] = None,
        data_source: Optional[DataSourceMetadata] = None,
        detection_class_prob: Optional[float] = None,
        emphasized_text_contents: Optional[list[str]] = None,
        emphasized_text_tags: Optional[list[str]] = None,
        file_directory: Optional[str] = None,
        filename: Optional[str | pathlib.Path] = None,
        filetype: Optional[str] = None,
        header_footer_type: Optional[str] = None,
        image_path: Optional[str] = None,
        is_continuation: Optional[bool] = None,
        languages: Optional[list[str]] = None,
        last_modified: Optional[str] = None,
        link_texts: Optional[list[str]] = None,
        link_urls: Optional[list[str]] = None,
        link_start_indexes: Optional[list[int]] = None,
        links: Optional[list[Link]] = None,
        orig_elements: Optional[list[Element]] = None,
        page_name: Optional[str] = None,
        page_number: Optional[int] = None,
        parent_id: Optional[str] = None,
        regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
        sent_from: Optional[list[str]] = None,
        sent_to: Optional[list[str]] = None,
        signature: Optional[str] = None,
        subject: Optional[str] = None,
        text_as_html: Optional[str] = None,
        table_as_cells: Optional[dict[str, str | int]] = None,
        url: Optional[str] = None,
        # -- declared fields that previously had no constructor parameter; they are placed last
        # -- so the positional order of the parameters above is unchanged for existing callers.
        image_base64: Optional[str] = None,
        image_mime_type: Optional[str] = None,
        key_value_pairs: Optional[list[FormKeyValuePair]] = None,
    ) -> None:
        """Populate the metadata fields given as arguments.

        A `None` argument leaves the corresponding field unpopulated. `filename` may be a `str`
        or `pathlib.Path` and may include a directory part, which is split off and used as
        `file_directory` unless the `file_directory` argument is provided.
        """
        self.attached_to_filename = attached_to_filename
        self.category_depth = category_depth
        self.coordinates = coordinates
        self.data_source = data_source
        self.detection_class_prob = detection_class_prob
        self.emphasized_text_contents = emphasized_text_contents
        self.emphasized_text_tags = emphasized_text_tags

        # -- accommodate pathlib.Path for filename --
        filename = str(filename) if isinstance(filename, pathlib.Path) else filename
        # -- produces "", "" when filename arg is None --
        directory_path, file_name = os.path.split(filename or "")
        # -- prefer `file_directory` arg if specified, otherwise split of file-path passed as
        # -- `filename` arg, or None if `filename` is the empty string.
        self.file_directory = file_directory or directory_path or None
        self.filename = file_name or None

        self.filetype = filetype
        self.header_footer_type = header_footer_type
        self.image_base64 = image_base64
        self.image_mime_type = image_mime_type
        self.image_path = image_path
        self.is_continuation = is_continuation
        self.key_value_pairs = key_value_pairs
        self.languages = languages
        self.last_modified = last_modified
        self.link_texts = link_texts
        self.link_urls = link_urls
        self.link_start_indexes = link_start_indexes
        self.links = links
        self.orig_elements = orig_elements
        self.page_name = page_name
        self.page_number = page_number
        self.parent_id = parent_id
        self.regex_metadata = regex_metadata
        self.sent_from = sent_from
        self.sent_to = sent_to
        self.signature = signature
        self.subject = subject
        self.text_as_html = text_as_html
        self.table_as_cells = table_as_cells
        self.url = url

    def __eq__(self, other: object) -> bool:
        """Implements equivalence, like meta == other_meta.

        All fields at all levels must match. Unpopulated fields are not considered except when
        populated in one and not the other. Debug fields do not participate.
        """
        if not isinstance(other, ElementMetadata):
            return False
        return self.fields == other.fields

    def __getattr__(self, attr_name: str) -> None:
        """Only called when attribute doesn't exist.

        Produces `None` for a known field that is not populated and raises `AttributeError` for
        any other name.
        """
        if attr_name in self._known_field_names:
            return None
        raise AttributeError(f"'ElementMetadata' object has no attribute '{attr_name}'")

    def __setattr__(self, __name: str, __value: Any) -> None:
        """Store a non-None value; assigning `None` removes the field instead.

        Assignment to a debug field is silently ignored unless
        `UNSTRUCTURED_INCLUDE_DEBUG_METADATA` is truthy.
        """
        if __value is None:
            # -- can't use `hasattr()` for this because it calls `__getattr__()` to find out --
            if __name in self.__dict__:
                delattr(self, __name)
            return
        if not UNSTRUCTURED_INCLUDE_DEBUG_METADATA and __name in self.DEBUG_FIELD_NAMES:
            return
        super().__setattr__(__name, __value)

    @classmethod
    def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
        """Construct from a metadata-dict.

        This would generally be a dict formed using the `.to_dict()` method and stored as JSON
        before "rehydrating" it using this method. Sub-object fields ("coordinates",
        "data_source", "orig_elements" and "key_value_pairs") are converted back from their
        serialized form; every other item is assigned as-is, so unknown keys become ad-hoc
        fields.
        """
        from unstructured.staging.base import elements_from_base64_gzipped_json

        # -- avoid unexpected mutation by working on a copy of provided dict --
        meta_dict = copy.deepcopy(meta_dict)

        # -- construct with `cls` rather than a hard-coded class so a subclass gets an instance
        # -- of its own type --
        metadata = cls()
        for field_name, field_value in meta_dict.items():
            if field_name == "coordinates":
                metadata.coordinates = CoordinatesMetadata.from_dict(field_value)
            elif field_name == "data_source":
                metadata.data_source = DataSourceMetadata.from_dict(field_value)
            elif field_name == "orig_elements":
                metadata.orig_elements = elements_from_base64_gzipped_json(field_value)
            elif field_name == "key_value_pairs":
                metadata.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
            else:
                setattr(metadata, field_name, field_value)

        return metadata

    @property
    def fields(self) -> MappingProxyType[str, Any]:
        """Populated metadata fields in this object as a read-only dict.

        Basically `self.__dict__` but it needs a little filtering to remove entries like
        "_known_field_names". Debug fields are excluded too. Note this is a *snapshot* and will
        not reflect later changes.
        """
        return MappingProxyType(
            {
                field_name: field_value
                for field_name, field_value in self.__dict__.items()
                if not field_name.startswith("_") and field_name not in self.DEBUG_FIELD_NAMES
            }
        )

    @property
    def known_fields(self) -> MappingProxyType[str, Any]:
        """Populated non-ad-hoc fields in this object as a read-only dict.

        Only fields declared at the top of this class are included. Ad-hoc fields added to this
        instance by assignment are not. Note this is a *snapshot* and will not reflect changes that
        occur after this call.
        """
        # -- resolve the lazyproperty before iterating `self.__dict__`; its first access adds an
        # -- item to that dict --
        known_field_names = self._known_field_names
        return MappingProxyType(
            {
                field_name: field_value
                for field_name, field_value in self.__dict__.items()
                if (field_name in known_field_names and field_name not in self.DEBUG_FIELD_NAMES)
            }
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert this metadata to dict form, suitable for JSON serialization.

        The returned dict is "sparse" in that no key-value pair appears for a field with value
        `None`. Fields whose value is an empty list or empty dict are omitted as well.
        """
        from unstructured.staging.base import elements_to_base64_gzipped_json

        meta_dict = copy.deepcopy(dict(self.fields))

        # -- remove fields that should not be serialized --
        for field_name in self.DEBUG_FIELD_NAMES:
            meta_dict.pop(field_name, None)

        # -- don't serialize empty lists or empty dicts --
        meta_dict = {
            field_name: value
            for field_name, value in meta_dict.items()
            if value != [] and value != {}
        }

        # -- serialize sub-object types when present --
        if self.coordinates is not None:
            meta_dict["coordinates"] = self.coordinates.to_dict()
        if self.data_source is not None:
            meta_dict["data_source"] = self.data_source.to_dict()
        if self.orig_elements is not None:
            meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
        if self.key_value_pairs is not None:
            meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs)

        return meta_dict

    def update(self, other: ElementMetadata) -> None:
        """Update self with all fields present in `other`.

        Semantics are like those of `dict.update()`.

        - fields present in both `self` and `other` will be updated to the value in `other`.
        - fields present in `other` but not `self` will be added to `self`.
        - fields present in `self` but not `other` are unchanged.
        - `other` is unchanged.
        - both ad-hoc and known fields participate in update with the same semantics.

        Note that fields listed in DEBUG_FIELD_NAMES are skipped in this process. Those can only be
        updated by direct assignment to the instance.

        Raises:
            ValueError: when `other` is not an `ElementMetadata` instance.
        """
        if not isinstance(other, ElementMetadata):  # pyright: ignore[reportUnnecessaryIsInstance]
            raise ValueError("argument to '.update()' must be an instance of 'ElementMetadata'")

        for field_name, field_value in other.fields.items():
            setattr(self, field_name, field_value)

    @lazyproperty
    def _known_field_names(self) -> FrozenSet[str]:
        """field-names for non-user-defined fields, available on all ElementMetadata instances.

        Note that the first call to this lazyproperty adds a `"_known_field_names"` item to the
        `__dict__` of this instance, so this must be called *before* iterating through
        `self.__dict__` to avoid a mid-iteration mutation.
        """
        # -- self.__annotations__ is a dict and iterating it produces its keys, which are the
        # -- field-names we want here.
        return frozenset(self.__annotations__)
class ConsolidationStrategy(enum.Enum):
    """Ways a metadata field can be combined across a collection of elements.

    Each `ElementMetadata` field-name is assigned one of these by
    `.field_consolidation_strategies()`. Metadata consolidation is part of the chunking process
    and may arise elsewhere as well.
    """

    DROP = "drop"
    """Leave this field out of the consolidated metadata object."""

    FIRST = "first"
    """Take the first value encountered; omit the field when no element has it."""

    LIST_CONCATENATE = "LIST_CONCATENATE"
    """Join the list values of the elements end-to-end. Only suitable for `List` fields."""

    LIST_UNIQUE = "list_unique"
    """Union the list values of the elements, keeping order. Only suitable for `List` fields."""

    REGEX = "regex"
    """Merge regex-metadata of elements, adjusting start and end offsets for concatenated text."""

    @classmethod
    def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
        """Mapping of each `ElementMetadata` field-name to its consolidation strategy.

        Note that only _TextSection objects ("pre-chunks" containing only `Text` elements that are
        not `Table`) have their metadata consolidated, so these strategies are only applicable for
        non-Table Text elements.
        """
        drop, first = cls.DROP, cls.FIRST
        concatenate, unique, regex = cls.LIST_CONCATENATE, cls.LIST_UNIQUE, cls.REGEX
        return {
            "attached_to_filename": first,
            "category_depth": drop,
            "coordinates": drop,
            "data_source": first,
            "detection_class_prob": drop,
            "detection_origin": drop,
            "emphasized_text_contents": concatenate,
            "emphasized_text_tags": concatenate,
            "file_directory": first,
            "filename": first,
            "filetype": first,
            "header_footer_type": drop,
            "image_path": drop,
            "image_base64": drop,
            "image_mime_type": drop,
            "is_continuation": drop,  # -- not expected, added by chunking, not before --
            "languages": unique,
            "last_modified": first,
            "link_texts": concatenate,
            "link_urls": concatenate,
            "link_start_indexes": drop,
            "links": drop,  # -- deprecated field --
            "max_characters": drop,  # -- unused, remove from ElementMetadata --
            "orig_elements": drop,  # -- not expected, added by chunking, not before --
            "page_name": first,
            "page_number": first,
            "parent_id": drop,
            "regex_metadata": regex,
            "sent_from": first,
            "sent_to": first,
            "signature": first,
            "subject": first,
            "text_as_html": first,  # -- only occurs in Table --
            "table_as_cells": first,  # -- only occurs in Table --
            "url": first,
            "key_value_pairs": drop,  # -- only occurs in FormKeysValues --
        }
# -- stands for the parameters of a decorated partitioner function so `process_metadata()` can
# -- declare that the wrapped function takes the same parameters --
_P = ParamSpec("_P")
def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
    """Replace each element's `id` with a deterministic hash and re-point `parent_id` values.

    Each element is hashed together with its sequence number within its run of consecutive
    elements sharing the same page number (the count restarts whenever the page number changes).
    Afterward, every truthy `metadata.parent_id` is replaced with the new hash ID of the element
    it referred to.

    Args:
        elements: A list of Element objects to update (modified in place).

    Returns:
        The same list, its elements now carrying hash values for `id` and `parent_id`.

    Raises:
        KeyError: when a `parent_id` does not match the prior ID of any element in `elements`.
    """
    # -- assign hash IDs, remembering which old ID each new one replaces --
    new_id_by_old_id = {}
    for _, same_page_run in groupby(elements, key=lambda e: e.metadata.page_number):
        for seq_on_page, element in enumerate(same_page_run):
            old_id = element.id
            new_id_by_old_id[old_id] = element.id_to_hash(seq_on_page)

    # -- map old parent IDs to new ones --
    for element in elements:
        old_parent_id = element.metadata.parent_id
        if old_parent_id:
            element.metadata.parent_id = new_id_by_old_id[old_parent_id]

    return elements
def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
    """Produce a decorator that post-processes the elements returned by a partitioner.

    At decoration time, documentation for the `metadata_filename` and `include_metadata`
    parameters is appended to the partitioner's docstring when it has one, takes a parameter (or
    uses a local) of that name and doesn't already mention it. At call time, regex-metadata is
    added to the elements when a non-empty `regex_metadata` argument applies, and element IDs are
    converted to deterministic hashes unless the `unique_element_ids` argument is something other
    than `False`.
    """
    # -- (name to look for, text appended to the docstring when that name is undocumented) --
    docstring_additions = (
        (
            "metadata_filename",
            "\nMetadata Parameters:\n\tmetadata_filename:"
            "\n\t\tThe filename to use in element metadata.",
        ),
        (
            "include_metadata",
            "\n\tinclude_metadata:"
            "\n\t\tDetermines whether or not metadata is included in the metadata"
            "\nattribute on the elements in the output.",
        ),
    )

    def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
        if func.__doc__:
            for name, addition in docstring_additions:
                # -- `func.__doc__` is re-read each time so an earlier addition is considered --
                if name in func.__code__.co_varnames and name not in func.__doc__:
                    func.__doc__ += addition

        @functools.wraps(func)
        def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
            elements = func(*args, **kwargs)
            call_args = get_call_args_applying_defaults(func, *args, **kwargs)

            # -- don't write an empty `{}` to metadata.regex_metadata when no regex-metadata was
            # -- requested, otherwise it will serialize (because it's not None) when it has no
            # -- meaning or is even misleading. Also it complicates tests that don't use
            # -- regex-meta.
            requested_regexes: dict[str, str] = call_args.get("regex_metadata", {})
            if requested_regexes:
                elements = _add_regex_metadata(elements, requested_regexes)

            # -- hash IDs are the default; only an explicit non-False value keeps the UUIDs --
            if call_args.get("unique_element_ids", False) is False:
                elements = assign_and_map_hash_ids(elements)

            return elements

        return wrapper

    return decorator
def _add_regex_metadata(
    elements: list[Element],
    regex_metadata: Optional[dict[str, str]] = None,
) -> list[Element]:
    """Adds metadata based on user-provided regular expressions.

    Each `Text` element has its `metadata.regex_metadata` set to a dict mapping a field-name to
    the matches of that field's pattern in the element text. A field with no match in an element
    is left out of that element's dict. Non-`Text` elements are not touched.

    Args:
        elements: The elements to annotate (modified in place).
        regex_metadata: Mapping of field-name to regular-expression pattern. `None` (the
            default) is treated as an empty mapping.

    Returns:
        The same `elements` list.
    """
    # -- `None` rather than a shared mutable `{}` as the default value --
    patterns_by_field = {} if regex_metadata is None else regex_metadata

    for element in elements:
        if isinstance(element, Text):
            _regex_metadata: dict[str, list[RegexMetadata]] = {}
            for field_name, pattern in patterns_by_field.items():
                results: list[RegexMetadata] = []
                for result in re.finditer(pattern, element.text):
                    start, end = result.span()
                    results.append(
                        {
                            "text": element.text[start:end],
                            "start": start,
                            "end": end,
                        },
                    )
                if len(results) > 0:
                    _regex_metadata[field_name] = results

            element.metadata.regex_metadata = _regex_metadata

    return elements
class ElementType:
    """Names of the element types, as string constants."""

    TITLE = "Title"
    TEXT = "Text"
    UNCATEGORIZED_TEXT = "UncategorizedText"
    NARRATIVE_TEXT = "NarrativeText"
    BULLETED_TEXT = "BulletedText"
    PARAGRAPH = "Paragraph"
    ABSTRACT = "Abstract"
    THREADING = "Threading"
    FORM = "Form"
    FIELD_NAME = "Field-Name"
    VALUE = "Value"
    LINK = "Link"
    COMPOSITE_ELEMENT = "CompositeElement"
    IMAGE = "Image"
    PICTURE = "Picture"
    FIGURE_CAPTION = "FigureCaption"
    FIGURE = "Figure"
    CAPTION = "Caption"
    LIST = "List"
    LIST_ITEM = "ListItem"
    LIST_ITEM_OTHER = "List-item"
    CHECKED = "Checked"
    UNCHECKED = "Unchecked"
    CHECK_BOX_CHECKED = "CheckBoxChecked"
    CHECK_BOX_UNCHECKED = "CheckBoxUnchecked"
    RADIO_BUTTON_CHECKED = "RadioButtonChecked"
    RADIO_BUTTON_UNCHECKED = "RadioButtonUnchecked"
    ADDRESS = "Address"
    EMAIL_ADDRESS = "EmailAddress"
    PAGE_BREAK = "PageBreak"
    FORMULA = "Formula"
    TABLE = "Table"
    HEADER = "Header"
    HEADLINE = "Headline"
    SUB_HEADLINE = "Subheadline"
    PAGE_HEADER = "Page-header"  # Title?
    SECTION_HEADER = "Section-header"
    FOOTER = "Footer"
    FOOTNOTE = "Footnote"
    PAGE_FOOTER = "Page-footer"
    PAGE_NUMBER = "PageNumber"
    CODE_SNIPPET = "CodeSnippet"
    FORM_KEYS_VALUES = "FormKeysValues"

    @classmethod
    def to_dict(cls):
        """Collect the constants of this class into a dictionary.

        Returns:
            dict: attribute-name to attribute-value for every attribute reported by `dir()` that
            is not callable and whose name does not start with a double underscore.
        """
        constants = {}
        for name in dir(cls):
            if name.startswith("__"):
                continue
            value = getattr(cls, name)
            if callable(value):
                continue
            constants[name] = value
        return constants
class Element(abc.ABC):
    """An element is a semantically-coherent component of a document, often a paragraph.

    Design principles followed by every element:

    1. It always has an ID, which by default is a random UUID.
    2. Asking for the ID always returns a string, never None.
    3. The ID is lazy; it is generated the first time it is asked for.
    4. When deterministic behavior is needed, the ID can be converted to a hash based on the
       element's text with `element.id_to_hash(position)`.
    5. Even when a subclass does not define the `text` attribute, it defaults to the empty
       string.
    6. Assigning a string ID manually is possible, but is meant to be used only for
       deserialization purposes.
    """

    text: str
    category = "UncategorizedText"

    def __init__(
        self,
        element_id: Optional[str] = None,
        coordinates: Optional[tuple[tuple[float, float], ...]] = None,
        coordinate_system: Optional[CoordinateSystem] = None,
        metadata: Optional[ElementMetadata] = None,
        detection_origin: Optional[str] = None,
    ):
        """Set up ID, metadata and text.

        When `coordinates` or `coordinate_system` is given, both are stored as the metadata's
        coordinates. `detection_origin` is always assigned to the metadata.

        Raises:
            ValueError: when `element_id` is neither a `str` nor `None`.
        """
        id_is_acceptable = element_id is None or isinstance(
            element_id, str
        )  # pyright: ignore[reportUnnecessaryIsInstance]
        if not id_is_acceptable:
            raise ValueError("element_id must be of type str or None.")

        self._element_id = element_id

        if metadata is None:
            self.metadata = ElementMetadata()
        else:
            self.metadata = metadata

        has_location = not (coordinates is None and coordinate_system is None)
        if has_location:
            self.metadata.coordinates = CoordinatesMetadata(
                points=coordinates, system=coordinate_system
            )

        self.metadata.detection_origin = detection_origin

        # -- every `Element` instance gets a `text` attribute; whatever is already there (e.g.
        # -- set by a subclass) is kept, otherwise it becomes the empty string.
        self.text = getattr(self, "text", "")

    def __str__(self):
        return self.text

    def convert_coordinates_to_new_system(
        self, new_system: CoordinateSystem, in_place: bool = True
    ) -> Optional[Points]:
        """Express this element's location points in `new_system`.

        Returns `None` when the element has no coordinates, or when their points or system are
        missing. Otherwise returns the converted points; when `in_place` is true the element's
        coordinates are also updated to the converted points and `new_system`.
        """
        coordinates = self.metadata.coordinates
        if coordinates is None:
            return None
        if coordinates.system is None or coordinates.points is None:
            return None

        current_system = coordinates.system
        converted = tuple(
            current_system.convert_coordinates_to_new_system(new_system=new_system, x=x, y=y)
            for x, y in coordinates.points
        )

        if in_place:
            coordinates.points = converted
            coordinates.system = new_system

        return converted

    def id_to_hash(self, sequence_number: int) -> str:
        """Compute a deterministic hash and make it this element's ID.

        The hash is derived from the element's filename, text, page number and the given
        sequence number, and is truncated to 32 hex characters.

        Args:
            sequence_number: index on page

        Returns: new ID value
        """
        metadata = self.metadata
        data = f"{metadata.filename}{self.text}{metadata.page_number}{sequence_number}"
        digest = hashlib.sha256(data.encode()).hexdigest()
        self._element_id = digest[:32]
        return self.id

    @property
    def id(self):
        """The element ID; a random UUID string is generated and kept on first access if unset."""
        if self._element_id is not None:
            return self._element_id
        self._element_id = str(uuid.uuid4())
        return self._element_id

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict; "type" is `None` here and is filled in by subclasses."""
        serialized: dict[str, Any] = {"type": None}
        serialized["element_id"] = self.id
        serialized["text"] = self.text
        serialized["metadata"] = self.metadata.to_dict()
        return serialized
class CheckBox(Element):
    """A checkbox, carrying a flag that records whether it is checked.

    Primarily used in documents that are forms.
    """

    def __init__(
        self,
        element_id: Optional[str] = None,
        coordinates: Optional[tuple[tuple[float, float], ...]] = None,
        coordinate_system: Optional[CoordinateSystem] = None,
        checked: bool = False,
        metadata: Optional[ElementMetadata] = None,
        detection_origin: Optional[str] = None,
    ):
        super().__init__(
            element_id=element_id,
            coordinates=coordinates,
            coordinate_system=coordinate_system,
            # -- a falsy `metadata` argument is replaced with fresh, empty metadata --
            metadata=metadata or ElementMetadata(),
            detection_origin=detection_origin,
        )
        self.checked: bool = checked

    def __eq__(self, other: object) -> bool:
        """True when `other` is a `CheckBox` with the same checked state and coordinates."""
        if not isinstance(other, CheckBox):
            return False
        same_state = self.checked == other.checked
        same_location = self.metadata.coordinates == other.metadata.coordinates
        return bool(same_state and same_location)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to JSON-compatible (str keys) dict."""
        return {
            **super().to_dict(),
            "type": "CheckBox",
            "checked": self.checked,
            "element_id": self.id,
        }
class Text(Element):
    """Base element for free text captured from within a document."""

    def __init__(
        self,
        text: str,
        element_id: Optional[str] = None,
        coordinates: Optional[tuple[tuple[float, float], ...]] = None,
        coordinate_system: Optional[CoordinateSystem] = None,
        metadata: Optional[ElementMetadata] = None,
        detection_origin: Optional[str] = None,
        embeddings: Optional[list[float]] = None,
    ):
        # -- `text` is assigned before the base initializer runs so the base class finds it
        # -- already present and leaves it alone --
        self.text: str = text
        self.embeddings: Optional[list[float]] = embeddings

        super().__init__(
            element_id=element_id,
            # -- a falsy `metadata` argument is replaced with fresh, empty metadata --
            metadata=metadata or ElementMetadata(),
            coordinates=coordinates,
            coordinate_system=coordinate_system,
            detection_origin=detection_origin,
        )

    def __eq__(self, other: object):
        """True when `other` is a `Text` with equal text, coordinates, category and embeddings."""
        if not isinstance(other, Text):
            return False
        same_text = self.text == other.text
        same_location = self.metadata.coordinates == other.metadata.coordinates
        same_category = self.category == other.category
        same_embeddings = self.embeddings == other.embeddings
        return bool(same_text and same_location and same_category and same_embeddings)

    def __str__(self):
        return self.text

    def apply(self, *cleaners: Callable[[str], str]):
        """Run the element text through each cleaner in order and store the result.

        Each cleaner should take a string as input and produce a string as output.

        Raises:
            ValueError: when the final result is not a string; the text is left unchanged then.
        """
        cleaned_text = functools.reduce(
            lambda text_so_far, cleaner: cleaner(text_so_far), cleaners, self.text
        )

        if not isinstance(cleaned_text, str):  # pyright: ignore[reportUnnecessaryIsInstance]
            raise ValueError("Cleaner produced a non-string output.")

        self.text = cleaned_text

    def to_dict(self) -> dict[str, Any]:
        """Serialize to JSON-compatible (str keys) dict."""
        serialized = super().to_dict()
        serialized.update(element_id=self.id, type=self.category, text=self.text)
        # -- embeddings only appear when present and non-empty --
        if self.embeddings:
            serialized["embeddings"] = self.embeddings
        return serialized
class Formula(Text):
    """A text element for capturing formulas in a document."""

    category = ElementType.FORMULA
class CompositeElement(Text):
    """A chunk formed from text (non-Table) elements.

    Only produced by chunking. An instance may be formed by combining one or more sequential
    elements produced by partitioning. It is also used when text-splitting an "oversized"
    element, a single element that by itself is larger than the requested chunk size.
    """

    category = ElementType.COMPOSITE_ELEMENT
class FigureCaption(Text):
    """A text element for capturing the caption text of a figure."""

    category = ElementType.FIGURE_CAPTION
class NarrativeText(Text):
    """A text element consisting of multiple, well-formulated sentences.

    This excludes elements such as titles, headers, footers, and captions.
    """

    category = ElementType.NARRATIVE_TEXT
class ListItem(Text):
    """A text element that is an item of a list."""

    category = ElementType.LIST_ITEM
class Title(Text):
    """A text element for capturing a title."""

    category = ElementType.TITLE
class Address(Text):
    """A text element for capturing an address."""

    category = ElementType.ADDRESS
class EmailAddress(Text):
    """A text element for capturing an e-mail address."""

    category = ElementType.EMAIL_ADDRESS
class Image(Text):
    """A text element for capturing image metadata."""

    # -- category name comes from the shared `ElementType` constants --
    category = ElementType.IMAGE
class PageBreak(Text):
    """A text element marking a page break."""

    category = ElementType.PAGE_BREAK
class Table(Text):
    """A text element for capturing a table."""

    category = ElementType.TABLE
class TableChunk(Table):
    """A `Table` element for capturing a chunk of a table; it keeps the "Table" category."""

    category = ElementType.TABLE
class Header(Text):
    """A text element for capturing a document header."""

    category = ElementType.HEADER
class Footer(Text):
    """A text element for capturing a document footer."""

    category = ElementType.FOOTER
class CodeSnippet(Text):
    """A text element for capturing a snippet of code."""

    category = ElementType.CODE_SNIPPET
class PageNumber(Text):
    """A text element for capturing a page number."""

    category = ElementType.PAGE_NUMBER
class FormKeysValues(Text):
    """A text element for capturing the key-value pairs of a form."""

    category = ElementType.FORM_KEYS_VALUES
# -- maps an element-type name (an `ElementType` constant) to the `Text` class (or subclass)
# -- used to represent an element of that type; several type names share one class --
TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
    ElementType.TITLE: Title,
    ElementType.SECTION_HEADER: Title,
    ElementType.HEADLINE: Title,
    ElementType.SUB_HEADLINE: Title,
    ElementType.FIELD_NAME: Title,
    ElementType.UNCATEGORIZED_TEXT: Text,
    ElementType.COMPOSITE_ELEMENT: CompositeElement,
    ElementType.TEXT: NarrativeText,
    ElementType.NARRATIVE_TEXT: NarrativeText,
    ElementType.PARAGRAPH: NarrativeText,
    # -- this mapping ensures yolox produces backward-compatible categories --
    ElementType.ABSTRACT: NarrativeText,
    ElementType.THREADING: NarrativeText,
    ElementType.FORM: NarrativeText,
    ElementType.VALUE: NarrativeText,
    ElementType.LINK: NarrativeText,
    ElementType.LIST_ITEM: ListItem,
    ElementType.BULLETED_TEXT: ListItem,
    ElementType.LIST_ITEM_OTHER: ListItem,
    ElementType.HEADER: Header,
    ElementType.PAGE_HEADER: Header,  # Title?
    ElementType.FOOTER: Footer,
    ElementType.PAGE_FOOTER: Footer,
    ElementType.FOOTNOTE: Footer,
    ElementType.FIGURE_CAPTION: FigureCaption,
    ElementType.CAPTION: FigureCaption,
    ElementType.IMAGE: Image,
    ElementType.FIGURE: Image,
    ElementType.PICTURE: Image,
    ElementType.TABLE: Table,
    ElementType.ADDRESS: Address,
    ElementType.EMAIL_ADDRESS: EmailAddress,
    ElementType.FORMULA: Formula,
    ElementType.PAGE_BREAK: PageBreak,
    ElementType.CODE_SNIPPET: CodeSnippet,
    ElementType.PAGE_NUMBER: PageNumber,
    ElementType.FORM_KEYS_VALUES: FormKeysValues,
}
def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
    """Turn the dict-form elements nested in `kv_pairs` back into element objects.

    The key_value_pairs metadata field contains (in the vast majority of cases) nested Text
    elements. Those arrive here as dicts, e.g. when partition_json is used, and must be converted
    into Elements explicitly. The conversion happens in place and the same list is returned.
    """
    from unstructured.staging.base import elements_from_dicts

    def rehydrate(key_or_value: dict) -> None:
        element_dict = key_or_value["custom_element"]
        if element_dict is None:
            return
        # -- exactly one element is expected back for the one dict passed in --
        [key_or_value["custom_element"]] = elements_from_dicts([element_dict])

    # safe to overwrite - deepcopy already happened
    for kv_pair in kv_pairs:
        rehydrate(kv_pair["key"])
        value = kv_pair["value"]
        if value is not None:
            rehydrate(value)

    return kv_pairs
def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
    """Produce a copy of `kv_pairs` in which nested elements are replaced by their dict form.

    The key_value_pairs metadata field contains (in the vast majority of cases) nested Text
    elements. Those need to be turned from Elements to dicts, e.g. when
    FormKeysValues.to_dict() is used. The pairs passed in are not modified.
    """

    def serialize(key_or_value: dict) -> None:
        element = key_or_value["custom_element"]
        if element is not None:
            key_or_value["custom_element"] = element.to_dict()

    serialized_pairs: list[dict] = copy.deepcopy(kv_pairs)
    for kv_pair in serialized_pairs:
        serialize(kv_pair["key"])
        value = kv_pair["value"]
        if value is not None:
            serialize(value)

    return serialized_pairs