Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-06-27 02:30:08 +00:00
rfctr(part): prepare for pluggable auto-partitioners 2 (#3657)
**Summary**

Step 2 in prep for pluggable auto-partitioners: remove the `regex_metadata` field from `ElementMetadata`.

**Additional Context**

- "regex-metadata" was an experimental feature that didn't pan out.
- It's implemented by one of the post-partitioning metadata decorators, so get rid of it as part of the cleanup before consolidating those decorators.
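For reference, a minimal sketch of the feature being removed, reconstructed from the deleted `test_partition_text_extract_regex_metadata` test further down in this diff (`partition_text` accepted an experimental `regex_metadata` mapping of field names to patterns; after this commit the keyword no longer exists):

```python
from unstructured.partition.text import partition_text

# -- deprecated usage: ask the partitioner to tag regex matches in each element --
elements = partition_text(
    text="SPEAKER 1: It is my turn to speak now!",
    regex_metadata={"speaker": r"SPEAKER \d{1,3}"},
)

# -- each match was recorded with its matched text and character offsets --
assert elements[0].metadata.regex_metadata == {
    "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
}
```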
This commit is contained in:
parent 903efb0c6d
commit 086b8d6f8a
@@ -135,9 +135,6 @@
       "type": "text",
       "analyzer": "standard"
     },
-    "regex_metadata": {
-      "type": "object"
-    },
     "detection_class_prob": {
       "type": "float"
     }
@@ -139,9 +139,6 @@
       "type": "text",
       "analyzer": "standard"
     },
-    "regex_metadata": {
-      "type": "object"
-    },
     "detection_class_prob": {
       "type": "float"
     }
@@ -37,7 +37,6 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL,
     is_continuation BOOLEAN,
     orig_elements TEXT,
@@ -38,7 +38,6 @@ CREATE TABLE elements (
     emphasized_text_contents VARCHAR [],
     emphasized_text_tags VARCHAR [],
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );
@@ -36,6 +36,5 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );
@@ -361,15 +361,6 @@
       "name": "text_as_html",
       "tokenization": "word"
     },
-    {
-      "dataType": [
-        "text"
-      ],
-      "indexFilterable": true,
-      "indexSearchable": true,
-      "name": "regex_metadata",
-      "tokenization": "word"
-    },
     {
       "dataType": [
         "number"
@@ -420,4 +411,4 @@
     },
     "vectorIndexType": "hnsw",
     "vectorizer": "none"
   }
 }
@@ -31,7 +31,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     PageBreak,
-    RegexMetadata,
     Table,
     TableChunk,
     Text,
@@ -958,51 +957,6 @@ class DescribeTextPreChunk:
         assert orig_elements[0] is element
         assert orig_elements[1] is element_2
 
-    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
-        """regex_metadata of chunk is combined regex_metadatas of its elements.
-
-        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
-        position in the chunk after element text has been concatenated.
-        """
-        pre_chunk = TextPreChunk(
-            [
-                Title(
-                    "Lorem Ipsum",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-                    ),
-                ),
-                Text(
-                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-                    metadata=ElementMetadata(
-                        regex_metadata={
-                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                        },
-                    ),
-                ),
-                Text(
-                    "In rhoncus ipsum sed lectus porta volutpat.",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-                    ),
-                ),
-            ],
-            overlap_prefix="ficitur.",  # len == 8
-            opts=ChunkingOptions(),
-        )
-
-        regex_metadata = pre_chunk._consolidated_regex_meta
-
-        assert regex_metadata == {
-            "dolor": [RegexMetadata(text="dolor", start=35, end=40)],
-            "ipsum": [
-                RegexMetadata(text="Ipsum", start=16, end=21),
-                RegexMetadata(text="ipsum", start=29, end=34),
-                RegexMetadata(text="ipsum", start=91, end=96),
-            ],
-        }
-
     def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
         """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
 
@@ -1021,7 +975,6 @@ class DescribeTextPreChunk:
                     emphasized_text_contents=["Lorem", "Ipsum"],
                     emphasized_text_tags=["b", "i"],
                     languages=["lat"],
-                    regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                 ),
             ),
             Text(
@@ -1036,11 +989,6 @@ class DescribeTextPreChunk:
                     emphasized_text_tags=["i", "b"],
                     # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                     languages=["eng", "lat"],
-                    # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
-                    regex_metadata={
-                        "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                        "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                    },
                 ),
             ),
         ],
@@ -1055,13 +1003,6 @@ class DescribeTextPreChunk:
             "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
             "emphasized_text_tags": ["b", "i", "i", "b"],
             "languages": ["lat", "eng"],
-            "regex_metadata": {
-                "ipsum": [
-                    RegexMetadata(text="Ipsum", start=6, end=11),
-                    RegexMetadata(text="ipsum", start=19, end=24),
-                ],
-                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-            },
         }
 
     def it_computes_the_original_elements_list_to_help(self):
@@ -19,7 +19,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     ListItem,
-    RegexMetadata,
     Table,
     Text,
     Title,
@@ -111,12 +110,7 @@ def test_chunk_by_title():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -134,9 +128,6 @@ def test_chunk_by_title():
         ),
     ]
     assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
-    assert chunks[3].metadata == ElementMetadata(
-        regex_metadata={"a": [RegexMetadata(text="A", start=11, end=12)]},
-    )
 
 
 def test_chunk_by_title_separates_by_page_number():
@@ -149,12 +140,7 @@ def test_chunk_by_title_separates_by_page_number():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -185,12 +171,7 @@ def test_chuck_by_title_respects_multipage():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -207,90 +188,6 @@ def test_chuck_by_title_respects_multipage():
     ]
 
 
-def test_chunk_by_title_does_not_break_on_regex_metadata_change():
-    """PreChunker is insensitive to regex-metadata changes.
-
-    A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk
-    should not be split based on such a difference.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]},
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-
-    chunks = chunk_by_title(elements)
-
-    assert chunks == [
-        CompositeElement(
-            "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-            " ipsum sed lectus porta volutpat.",
-        ),
-    ]
-
-
-def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
-    """ElementMetadata.regex_metadata of chunk is union of regex_metadatas of its elements.
-
-    The `start` and `end` offsets of each regex-match are adjusted to reflect their new position
-    in the chunk after element text has been concatenated.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={
-                    "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                    "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                },
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-    chunks = chunk_by_title(elements)
-
-    assert len(chunks) == 1
-    chunk = chunks[0]
-    assert chunk == CompositeElement(
-        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-        " ipsum sed lectus porta volutpat.",
-    )
-    assert chunk.metadata.regex_metadata == {
-        "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-        "ipsum": [
-            RegexMetadata(text="Ipsum", start=6, end=11),
-            RegexMetadata(text="ipsum", start=19, end=24),
-            RegexMetadata(text="ipsum", start=81, end=86),
-        ],
-    }
-
-
 def test_chunk_by_title_groups_across_pages():
     elements: list[Element] = [
         Title("A Great Day", metadata=ElementMetadata(page_number=1)),
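The adjusted offsets asserted in the deleted test above follow from simple arithmetic: each element's matches shift by the length of all text that precedes that element in the chunk, assuming the `"\n\n"` separator visible in the expected chunk text. A minimal check of those numbers:

```python
# -- a regex match shifts by the length of preceding element texts plus separators --
title = "Lorem Ipsum"                                              # len == 11
body = "Lorem ipsum dolor sit amet consectetur adipiscing elit."   # len == 55
sep = "\n\n"  # separator placed between concatenated element texts

body_start = len(title) + len(sep)               # 13
assert body_start + 6 == 19                      # "ipsum" 6..11  -> 19..24
assert body_start + 12 == 25                     # "dolor" 12..17 -> 25..30

third_start = body_start + len(body) + len(sep)  # 70
assert third_start + 11 == 81                    # "ipsum" 11..16 -> 81..86
```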
@@ -301,12 +198,7 @@ def test_chunk_by_title_groups_across_pages():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -27,7 +27,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     Points,
-    RegexMetadata,
     Text,
     Title,
     assign_and_map_hash_ids,
@@ -235,24 +234,6 @@ def test_element_to_dict():
     }
 
 
-def test_regex_metadata_round_trips_through_JSON():
-    """metadata.regex_metadata should appear at full depth in JSON."""
-    regex_metadata = {
-        "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
-        "version": [
-            RegexMetadata(text="current=v1.7.2", start=7, end=21),
-            RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
-        ],
-    }
-    metadata = ElementMetadata(regex_metadata=regex_metadata)
-
-    metadata_json = json.dumps(metadata.to_dict())
-    deserialized_metadata = ElementMetadata.from_dict(json.loads(metadata_json))
-    reserialized_metadata_json = json.dumps(deserialized_metadata.to_dict())
-
-    assert reserialized_metadata_json == metadata_json
-
-
 class DescribeElementMetadata:
     """Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""
 
@@ -244,17 +244,6 @@ the fox met a bear."""
         assert element.metadata.filename is None
 
 
-def test_partition_text_extract_regex_metadata():
-    text = "SPEAKER 1: It is my turn to speak now!"
-
-    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
-    assert elements[0].metadata.regex_metadata == {
-        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
-    }
-    for element in elements:
-        assert element.metadata.filename is None
-
-
 def test_partition_text_splits_long_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
     elements = partition_text(filename=filename)
@@ -23,7 +23,6 @@ from unstructured.documents.elements import (
     ListItem,
     NarrativeText,
     PageBreak,
-    RegexMetadata,
     Text,
     Title,
 )
@@ -113,17 +112,12 @@ def test_convert_to_dataframe_maintains_fields():
     elements = partition_email(
         "example-docs/eml/fake-email-attachment.eml",
         process_attachements=True,
-        regex_metadata={"hello": r"Hello", "punc": r"[!]"},
     )
     df = base.convert_to_dataframe(elements)
     for element in elements:
         metadata = element.metadata.to_dict()
         for key in metadata:
-            if not key.startswith("regex_metadata"):
-                assert key in df.columns
-
-    assert "regex_metadata_hello" in df.columns
-    assert "regex_metadata_punc" in df.columns
+            assert key in df.columns
 
 
 def test_default_pandas_dtypes():
@@ -171,7 +165,6 @@ def test_default_pandas_dtypes():
             emphasized_text_contents=["emphasized", "text", "contents"],
            emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),
@@ -328,7 +321,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),
@@ -372,7 +364,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),
@@ -177,10 +177,6 @@
       "name": "text_as_html",
       "type": "Edm.String"
     },
-    {
-      "name": "regex_metadata",
-      "type": "Edm.String"
-    },
     {
       "name": "detection_class_prob",
       "type": "Edm.Double"
@@ -202,4 +198,4 @@
         }
       ]
     }
   }
 }
@@ -55,7 +55,6 @@ TEST_DATA_2 = {
         },
         "last_modified": "2021-01-03T00:00:00",
         "page_number": 10,
-        "regex_metadata": {"pattern": "abc"},
     },
     "embeddings": [0.1, 0.2, 0.3],
 }
@@ -135,7 +134,6 @@ def test_conform_dict_2():
         "links": '{"link1": "https://example.com", "link2": "https://example.org"}',
         "last_modified": datetime.datetime(2021, 1, 3, 0, 0),
         "page_number": "10",
-        "regex_metadata": '{"pattern": "abc"}',
         "date_created": datetime.datetime(2021, 1, 1, 0, 0),
         "date_modified": datetime.datetime(2021, 1, 2, 0, 0),
         "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8),
@@ -15,7 +15,6 @@ from unstructured.documents.elements import (
     ConsolidationStrategy,
     Element,
     ElementMetadata,
-    RegexMetadata,
     Table,
     TableChunk,
     Title,
@@ -739,43 +738,6 @@ class TextPreChunk:
         continuation_metadata.is_continuation = True
         return continuation_metadata
 
-    @lazyproperty
-    def _consolidated_regex_meta(self) -> dict[str, list[RegexMetadata]]:
-        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.
-
-        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
-        offsets of each regex match are also adjusted for their new positions.
-        """
-        chunk_regex_metadata: dict[str, list[RegexMetadata]] = {}
-        separator_len = len(self._opts.text_separator)
-        running_text_len = len(self._overlap_prefix) if self._overlap_prefix else 0
-        start_offset = running_text_len
-
-        for element in self._elements:
-            text_len = len(element.text)
-            # -- skip empty elements like `PageBreak("")` --
-            if not text_len:
-                continue
-            # -- account for blank line between "squashed" elements, but not at start of text --
-            running_text_len += separator_len if running_text_len else 0
-            start_offset = running_text_len
-            running_text_len += text_len
-
-            if not element.metadata.regex_metadata:
-                continue
-
-            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
-            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
-            for regex_name, matches in element_regex_metadata.items():
-                for m in matches:
-                    m["start"] += start_offset
-                    m["end"] += start_offset
-                chunk_matches = chunk_regex_metadata.get(regex_name, [])
-                chunk_matches.extend(matches)
-                chunk_regex_metadata[regex_name] = chunk_matches
-
-        return chunk_regex_metadata
-
     def _iter_text_segments(self) -> Iterator[str]:
         """Generate overlap text and each element text segment in order.
 
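One detail worth noting in the deleted property: it deep-copies each element's `regex_metadata` before shifting offsets. Mutating the match dicts in place would corrupt the offsets stored on the source elements. A small sketch (with hypothetical variable names) of the aliasing hazard the `copy.deepcopy` call avoided:

```python
import copy

element_matches = {"ipsum": [{"text": "ipsum", "start": 6, "end": 11}]}

# -- shift offsets on a deep copy so the element's own metadata stays intact --
shifted = copy.deepcopy(element_matches)
for matches in shifted.values():
    for m in matches:
        m["start"] += 13
        m["end"] += 13

assert element_matches["ipsum"][0]["start"] == 6   # source untouched
assert shifted["ipsum"][0]["start"] == 19          # chunk-relative offset
```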
@@ -812,8 +774,6 @@
                 # -- Python 3.7+ maintains dict insertion order --
                 ordered_unique_keys = {key: None for val_list in values for key in val_list}
                 yield field_name, list(ordered_unique_keys.keys())
-            elif strategy is CS.REGEX:
-                yield field_name, self._consolidated_regex_meta
             elif strategy is CS.DROP:
                 continue
             else:  # pragma: no cover
@@ -8,7 +8,6 @@ import functools
 import hashlib
 import os
 import pathlib
-import re
 import uuid
 from itertools import groupby
 from types import MappingProxyType
@@ -127,14 +126,6 @@ class CoordinatesMetadata:
         return cls(points=points, system=system)
 
 
-class RegexMetadata(TypedDict):
-    """Metadata that is extracted from a document element via regex."""
-
-    text: str
-    start: int
-    end: int
-
-
 class Link(TypedDict):
     """Metadata related to extracted links"""
 
@@ -202,8 +193,6 @@ class ElementMetadata:
     # -- page numbers currently supported for DOCX, HTML, PDF, and PPTX documents --
     page_number: Optional[int]
     parent_id: Optional[str]
-    # -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
-    regex_metadata: Optional[dict[str, list[RegexMetadata]]]
 
     # -- e-mail specific metadata fields --
     bcc_recipient: Optional[list[str]]
@@ -254,7 +243,6 @@
         page_name: Optional[str] = None,
         page_number: Optional[int] = None,
         parent_id: Optional[str] = None,
-        regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
         sent_from: Optional[list[str]] = None,
         sent_to: Optional[list[str]] = None,
         signature: Optional[str] = None,
@@ -299,7 +287,6 @@
         self.page_name = page_name
         self.page_number = page_number
         self.parent_id = parent_id
-        self.regex_metadata = regex_metadata
         self.sent_from = sent_from
         self.sent_to = sent_to
         self.signature = signature
@@ -477,9 +464,6 @@ class ConsolidationStrategy(enum.Enum):
     LIST_UNIQUE = "list_unique"
     """Union list values across elements, preserving order. Only suitable for `List` fields."""
 
-    REGEX = "regex"
-    """Combine regex-metadata of elements, adjust start and stop offsets for concatenated text."""
-
     @classmethod
     def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
         """Mapping from ElementMetadata field-name to its consolidation strategy.
@@ -519,7 +503,6 @@
             "page_name": cls.FIRST,
             "page_number": cls.FIRST,
             "parent_id": cls.DROP,
-            "regex_metadata": cls.REGEX,
             "sent_from": cls.FIRST,
             "sent_to": cls.FIRST,
             "signature": cls.FIRST,
@@ -550,7 +533,7 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
     # -- generate sequence number for each element on a page --
     page_numbers = [e.metadata.page_number for e in elements]
     page_seq_pairs = [
-        seq_on_page for page, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
+        seq_on_page for _, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
     ]
 
     # -- assign hash IDs to elements --
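The one-character change above (`page` to `_`) just renames an unused loop variable; the comprehension itself numbers elements within each run of equal page numbers. A standalone sketch of the same pattern:

```python
from itertools import groupby

page_numbers = [1, 1, 1, 2, 2, 3]

# -- restart the sequence counter at each change of page number --
page_seq_pairs = [
    seq_on_page for _, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
]

assert page_seq_pairs == [0, 1, 2, 0, 1, 0]
```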
@@ -575,7 +558,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
     This decorator adds a post-processing step to a document partitioner.
 
     - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
     - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
 
     """
@@ -605,13 +587,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
         elements = func(*args, **kwargs)
         call_args = get_call_args_applying_defaults(func, *args, **kwargs)
 
-        regex_metadata: dict["str", "str"] = call_args.get("regex_metadata", {})
-        # -- don't write an empty `{}` to metadata.regex_metadata when no regex-metadata was
-        # -- requested, otherwise it will serialize (because it's not None) when it has no
-        # -- meaning or is even misleading. Also it complicates tests that don't use regex-meta.
-        if regex_metadata:
-            elements = _add_regex_metadata(elements, regex_metadata)
-
         unique_element_ids: bool = call_args.get("unique_element_ids", False)
         if unique_element_ids is False:
             elements = assign_and_map_hash_ids(elements)
@@ -623,36 +598,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
     return decorator
 
 
-def _add_regex_metadata(
-    elements: list[Element],
-    regex_metadata: dict[str, str] = {},
-) -> list[Element]:
-    """Adds metadata based on a user provided regular expression.
-
-    The additional metadata will be added to the regex_metadata attrbuted in the element metadata.
-    """
-    for element in elements:
-        if isinstance(element, Text):
-            _regex_metadata: dict["str", list[RegexMetadata]] = {}
-            for field_name, pattern in regex_metadata.items():
-                results: list[RegexMetadata] = []
-                for result in re.finditer(pattern, element.text):
-                    start, end = result.span()
-                    results.append(
-                        {
-                            "text": element.text[start:end],
-                            "start": start,
-                            "end": end,
-                        },
-                    )
-                if len(results) > 0:
-                    _regex_metadata[field_name] = results
-
-            element.metadata.regex_metadata = _regex_metadata
-
-    return elements
-
-
 class ElementType:
     TITLE = "Title"
     TEXT = "Text"
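With the helper gone, callers who still need these spans can compute them outside the library. A minimal, hedged sketch of the same match-extraction logic the deleted `_add_regex_metadata` performed (`extract_matches` is a hypothetical name, not part of the package):

```python
import re

def extract_matches(pattern: str, text: str) -> list[dict[str, object]]:
    """Collect each regex match with its matched text and character offsets."""
    return [
        {"text": m.group(), "start": m.start(), "end": m.end()}
        for m in re.finditer(pattern, text)
    ]

# -- same shape of result the removed partitioner kwarg used to produce --
assert extract_matches(r"SPEAKER \d{1,3}", "SPEAKER 1: It is my turn to speak now!") == [
    {"text": "SPEAKER 1", "start": 0, "end": 9},
]
```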
@@ -738,9 +683,7 @@ class Element(abc.ABC):
         metadata: Optional[ElementMetadata] = None,
         detection_origin: Optional[str] = None,
     ):
-        if element_id is not None and not isinstance(
-            element_id, str
-        ):  # pyright: ignore[reportUnnecessaryIsInstance]
+        if element_id is not None and not isinstance(element_id, str):  # type: ignore
             raise ValueError("element_id must be of type str or None.")
 
         self._element_id = element_id
@@ -1075,7 +1018,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
 }
 
 
-def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
+def _kvform_rehydrate_internal_elements(kv_pairs: list[dict[str, Any]]) -> list[FormKeyValuePair]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from dicts into Elements explicitly,
@@ -1093,17 +1036,17 @@ def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyVal
         (kv_pair["value"]["custom_element"],) = elements_from_dicts(
             [kv_pair["value"]["custom_element"]]
         )
-    return kv_pairs
+    return cast(list[FormKeyValuePair], kv_pairs)
 
 
-def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
+def _kvform_pairs_to_dict(orig_kv_pairs: list[FormKeyValuePair]) -> list[dict[str, Any]]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from Elements to dicts recursively,
     e.g. when FormKeysValues.to_dict() is used.
 
     """
-    kv_pairs: list[dict] = copy.deepcopy(kv_pairs)
+    kv_pairs: list[dict[str, Any]] = copy.deepcopy(orig_kv_pairs)  # type: ignore
     for kv_pair in kv_pairs:
         if kv_pair["key"]["custom_element"] is not None:
             kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict()
@@ -495,7 +495,7 @@ class _OleFileDifferentiator:
     def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
         """True when file has CFBF magic first 8 bytes."""
         with ctx.open() as file:
-            return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
+            return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
 
     @staticmethod
     def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
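The hex-escape change in `_is_ole_file` is purely cosmetic: Python bytes literals compare by value, so upper- and lower-case escapes denote the same bytes.

```python
# -- hex-escape case does not affect the value of a bytes literal --
assert b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
```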
@@ -719,10 +719,11 @@ def add_filetype(
 
     This decorator adds a post-processing step to a document partitioner.
 
     - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
     - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
     - Adds `.metadata.filetype` (source-document MIME-type) metadata value
 
+    This "partial" decorator is present because `partition_image()` does not apply
+    `.metadata.filetype` this way since each image type has its own MIME-type (e.g. `image.jpeg`,
+    `image/png`, etc.).
     """
 
     def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
@@ -107,8 +107,6 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
             data["metadata"]["data_source"]["date_processed"] = parser.parse(
                 date_processed,
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
@@ -159,9 +159,6 @@ class SqlDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
         if data.get("metadata", {}).get("data_source", None):
             data.update(data.get("metadata", {}).pop("data_source", None))
         if data.get("metadata", {}).get("coordinates", None):
@@ -169,9 +169,6 @@ class WeaviateDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
@@ -111,8 +111,6 @@ class AzureCognitiveSearchUploadStager(UploadStager):
                 date_processed
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
         return data
@@ -179,7 +177,6 @@ class AzureCognitiveSearchUploader(Uploader):
         return self.write_dict(elements_dict=elements_dict)
 
     def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
-
         elements_dict = []
         for content in contents:
             with open(content.path) as elements_file:
@@ -103,7 +103,6 @@ _COLUMNS = (
     "emphasized_text_contents",
     "emphasized_text_tags",
     "text_as_html",
-    "regex_metadata",
     "detection_class_prob",
 )
@@ -165,10 +164,7 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(
                 lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
             )
-        for column in filter(
-            lambda x: x in df.columns,
-            ("version", "page_number", "regex_metadata"),
-        ):
+        for column in filter(lambda x: x in df.columns, ("version", "page_number")):
             df[column] = df[column].apply(str)
 
         with output_path.open("w") as output_file:
@@ -126,9 +126,6 @@ class WeaviateUploadStager(UploadStager):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def run(
         self,
         elements_filepath: Path,
@@ -272,13 +272,7 @@ def flatten_dict(
 
 
 def _get_table_fieldnames(rows: list[dict[str, Any]]):
-    table_fieldnames = list(TABLE_FIELDNAMES)
-    for row in rows:
-        metadata = row["metadata"]
-        for key in flatten_dict(metadata):
-            if key.startswith("regex_metadata") and key not in table_fieldnames:
-                table_fieldnames.append(key)
-    return table_fieldnames
+    return list(TABLE_FIELDNAMES)
 
 
 def convert_to_csv(elements: Iterable[Element]) -> str:
@@ -337,7 +331,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
        "emphasized_text_contents": object,  # Optional[list[str]]
         "emphasized_text_tags": object,  # Optional[list[str]]
         "text_as_html": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "regex_metadata": object,
         "max_characters": "Int64",  # Optional[int]
         "is_continuation": "boolean",  # Optional[bool]
         "detection_class_prob": float,  # Optional[float],
@@ -354,7 +347,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
         "data_source_date_processed": pd.StringDtype(),  # Optional[str]  # type: ignore
         "data_source_permissions_data": object,
         "embeddings": object,
-        "regex_metadata_key": object,
     }
@@ -16,7 +16,6 @@ exclude_metadata_keys = (
     "is_continuation",
     "links",
     "orig_elements",
-    "regex_metadata",
     "key_value_pairs",
 )