rfctr(part): prepare for pluggable auto-partitioners 2 (#3657)

**Summary**
Step 2 in preparation for pluggable auto-partitioners: remove the `regex_metadata`
field from `ElementMetadata`.

**Additional Context**
- "regex-metadata" was an experimental feature that didn't pan out.
- It's implemented by one of the post-partitioning metadata decorators,
so get rid of it as part of the cleanup before consolidating those
decorators.
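
For context, the removed feature let a caller pass a `regex_metadata` mapping of field-name to
regex pattern into a partitioner; the `process_metadata()` decorator then recorded each match,
with its text and character offsets, on `element.metadata.regex_metadata`. A minimal sketch of
the old usage, adapted from the deleted `test_partition_text_extract_regex_metadata` test shown
in this diff (only works on versions of the library prior to this commit):

```python
from unstructured.partition.text import partition_text

# Pre-removal behavior: `regex_metadata` was accepted as a keyword argument by partitioners
# (applied via the `process_metadata()` decorator) and each match was stored on the element as
# {"text": ..., "start": ..., "end": ...} under the caller-supplied field name.
elements = partition_text(
    text="SPEAKER 1: It is my turn to speak now!",
    regex_metadata={"speaker": r"SPEAKER \d{1,3}"},
)

print(elements[0].metadata.regex_metadata)
# -> {"speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}]}
# After this commit, both the keyword argument and the metadata field are gone.
```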
Author: Steve Canny, 2024-09-24 10:33:25 -07:00 (committed by GitHub)
parent 903efb0c6d
commit 086b8d6f8a
24 changed files with 20 additions and 373 deletions

View File

@@ -135,9 +135,6 @@
         "type": "text",
         "analyzer": "standard"
       },
-      "regex_metadata": {
-        "type": "object"
-      },
       "detection_class_prob": {
         "type": "float"
       }

View File

@@ -139,9 +139,6 @@
         "type": "text",
         "analyzer": "standard"
       },
-      "regex_metadata": {
-        "type": "object"
-      },
       "detection_class_prob": {
         "type": "float"
       }

View File

@@ -37,7 +37,6 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL,
     is_continuation BOOLEAN,
     orig_elements TEXT,

View File

@@ -38,7 +38,6 @@ CREATE TABLE elements (
     emphasized_text_contents VARCHAR [],
     emphasized_text_tags VARCHAR [],
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );

View File

@@ -36,6 +36,5 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );

View File

@@ -361,15 +361,6 @@
       "name": "text_as_html",
       "tokenization": "word"
     },
-    {
-      "dataType": [
-        "text"
-      ],
-      "indexFilterable": true,
-      "indexSearchable": true,
-      "name": "regex_metadata",
-      "tokenization": "word"
-    },
     {
       "dataType": [
         "number"
@@ -420,4 +411,4 @@
   },
   "vectorIndexType": "hnsw",
   "vectorizer": "none"
-}
+}

View File

@@ -31,7 +31,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     PageBreak,
-    RegexMetadata,
     Table,
     TableChunk,
     Text,
@@ -958,51 +957,6 @@ class DescribeTextPreChunk:
         assert orig_elements[0] is element
         assert orig_elements[1] is element_2
 
-    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
-        """regex_metadata of chunk is combined regex_metadatas of its elements.
-
-        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
-        position in the chunk after element text has been concatenated.
-        """
-        pre_chunk = TextPreChunk(
-            [
-                Title(
-                    "Lorem Ipsum",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-                    ),
-                ),
-                Text(
-                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-                    metadata=ElementMetadata(
-                        regex_metadata={
-                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                        },
-                    ),
-                ),
-                Text(
-                    "In rhoncus ipsum sed lectus porta volutpat.",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-                    ),
-                ),
-            ],
-            overlap_prefix="ficitur.",  # len == 8
-            opts=ChunkingOptions(),
-        )
-
-        regex_metadata = pre_chunk._consolidated_regex_meta
-
-        assert regex_metadata == {
-            "dolor": [RegexMetadata(text="dolor", start=35, end=40)],
-            "ipsum": [
-                RegexMetadata(text="Ipsum", start=16, end=21),
-                RegexMetadata(text="ipsum", start=29, end=34),
-                RegexMetadata(text="ipsum", start=91, end=96),
-            ],
-        }
-
     def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
         """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
@@ -1021,7 +975,6 @@ class DescribeTextPreChunk:
                         emphasized_text_contents=["Lorem", "Ipsum"],
                         emphasized_text_tags=["b", "i"],
                         languages=["lat"],
-                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                     ),
                 ),
                 Text(
@@ -1036,11 +989,6 @@ class DescribeTextPreChunk:
                         emphasized_text_tags=["i", "b"],
                         # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                         languages=["eng", "lat"],
-                        # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
-                        regex_metadata={
-                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                        },
                     ),
                 ),
             ],
@@ -1055,13 +1003,6 @@ class DescribeTextPreChunk:
             "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
             "emphasized_text_tags": ["b", "i", "i", "b"],
             "languages": ["lat", "eng"],
-            "regex_metadata": {
-                "ipsum": [
-                    RegexMetadata(text="Ipsum", start=6, end=11),
-                    RegexMetadata(text="ipsum", start=19, end=24),
-                ],
-                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-            },
         }
 
     def it_computes_the_original_elements_list_to_help(self):

View File

@@ -19,7 +19,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     ListItem,
-    RegexMetadata,
     Table,
     Text,
     Title,
@@ -111,12 +110,7 @@ def test_chunk_by_title():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -134,9 +128,6 @@ def test_chunk_by_title():
         ),
     ]
     assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
-    assert chunks[3].metadata == ElementMetadata(
-        regex_metadata={"a": [RegexMetadata(text="A", start=11, end=12)]},
-    )
 
 
 def test_chunk_by_title_separates_by_page_number():
@@ -149,12 +140,7 @@ def test_chunk_by_title_separates_by_page_number():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -185,12 +171,7 @@ def test_chuck_by_title_respects_multipage():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -207,90 +188,6 @@ def test_chuck_by_title_respects_multipage():
     ]
 
 
-def test_chunk_by_title_does_not_break_on_regex_metadata_change():
-    """PreChunker is insensitive to regex-metadata changes.
-
-    A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk should
-    not be split based on such a difference.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]},
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-
-    chunks = chunk_by_title(elements)
-
-    assert chunks == [
-        CompositeElement(
-            "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-            " ipsum sed lectus porta volutpat.",
-        ),
-    ]
-
-
-def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
-    """ElementMetadata.regex_metadata of chunk is union of regex_metadatas of its elements.
-
-    The `start` and `end` offsets of each regex-match are adjusted to reflect their new position in
-    the chunk after element text has been concatenated.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={
-                    "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                    "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                },
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-
-    chunks = chunk_by_title(elements)
-
-    assert len(chunks) == 1
-    chunk = chunks[0]
-    assert chunk == CompositeElement(
-        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-        " ipsum sed lectus porta volutpat.",
-    )
-    assert chunk.metadata.regex_metadata == {
-        "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-        "ipsum": [
-            RegexMetadata(text="Ipsum", start=6, end=11),
-            RegexMetadata(text="ipsum", start=19, end=24),
-            RegexMetadata(text="ipsum", start=81, end=86),
-        ],
-    }
-
-
 def test_chunk_by_title_groups_across_pages():
     elements: list[Element] = [
         Title("A Great Day", metadata=ElementMetadata(page_number=1)),
@@ -301,12 +198,7 @@ def test_chunk_by_title_groups_across_pages():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]

View File

@@ -27,7 +27,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     Points,
-    RegexMetadata,
     Text,
     Title,
     assign_and_map_hash_ids,
@@ -235,24 +234,6 @@ def test_element_to_dict():
     }
 
 
-def test_regex_metadata_round_trips_through_JSON():
-    """metadata.regex_metadata should appear at full depth in JSON."""
-    regex_metadata = {
-        "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
-        "version": [
-            RegexMetadata(text="current=v1.7.2", start=7, end=21),
-            RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
-        ],
-    }
-    metadata = ElementMetadata(regex_metadata=regex_metadata)
-
-    metadata_json = json.dumps(metadata.to_dict())
-    deserialized_metadata = ElementMetadata.from_dict(json.loads(metadata_json))
-    reserialized_metadata_json = json.dumps(deserialized_metadata.to_dict())
-
-    assert reserialized_metadata_json == metadata_json
-
-
 class DescribeElementMetadata:
     """Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""

View File

@@ -244,17 +244,6 @@ the fox met a bear."""
     assert element.metadata.filename is None
 
 
-def test_partition_text_extract_regex_metadata():
-    text = "SPEAKER 1: It is my turn to speak now!"
-    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
-    assert elements[0].metadata.regex_metadata == {
-        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
-    }
-    for element in elements:
-        assert element.metadata.filename is None
-
-
 def test_partition_text_splits_long_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
     elements = partition_text(filename=filename)

View File

@@ -23,7 +23,6 @@ from unstructured.documents.elements import (
     ListItem,
     NarrativeText,
     PageBreak,
-    RegexMetadata,
     Text,
     Title,
 )
@@ -113,17 +112,12 @@ def test_convert_to_dataframe_maintains_fields():
     elements = partition_email(
         "example-docs/eml/fake-email-attachment.eml",
         process_attachements=True,
-        regex_metadata={"hello": r"Hello", "punc": r"[!]"},
     )
     df = base.convert_to_dataframe(elements)
     for element in elements:
         metadata = element.metadata.to_dict()
         for key in metadata:
-            if not key.startswith("regex_metadata"):
-                assert key in df.columns
-
-    assert "regex_metadata_hello" in df.columns
-    assert "regex_metadata_punc" in df.columns
+            assert key in df.columns
 
 
 def test_default_pandas_dtypes():
@@ -171,7 +165,6 @@ def test_default_pandas_dtypes():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
            is_continuation=True,
            detection_class_prob=0.5,
        ),
@@ -328,7 +321,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
            is_continuation=True,
            detection_class_prob=0.5,
        ),
@@ -372,7 +364,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
            is_continuation=True,
            detection_class_prob=0.5,
        ),

View File

@@ -177,10 +177,6 @@
           "name": "text_as_html",
           "type": "Edm.String"
         },
-        {
-          "name": "regex_metadata",
-          "type": "Edm.String"
-        },
         {
           "name": "detection_class_prob",
           "type": "Edm.Double"
@@ -202,4 +198,4 @@
       }
     ]
   }
-}
+}

View File

@@ -55,7 +55,6 @@ TEST_DATA_2 = {
         },
         "last_modified": "2021-01-03T00:00:00",
         "page_number": 10,
-        "regex_metadata": {"pattern": "abc"},
     },
     "embeddings": [0.1, 0.2, 0.3],
 }
@@ -135,7 +134,6 @@ def test_conform_dict_2():
         "links": '{"link1": "https://example.com", "link2": "https://example.org"}',
         "last_modified": datetime.datetime(2021, 1, 3, 0, 0),
         "page_number": "10",
-        "regex_metadata": '{"pattern": "abc"}',
         "date_created": datetime.datetime(2021, 1, 1, 0, 0),
         "date_modified": datetime.datetime(2021, 1, 2, 0, 0),
         "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8),

View File

@@ -15,7 +15,6 @@ from unstructured.documents.elements import (
     ConsolidationStrategy,
     Element,
     ElementMetadata,
-    RegexMetadata,
     Table,
     TableChunk,
     Title,
@@ -739,43 +738,6 @@ class TextPreChunk:
         continuation_metadata.is_continuation = True
         return continuation_metadata
 
-    @lazyproperty
-    def _consolidated_regex_meta(self) -> dict[str, list[RegexMetadata]]:
-        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.
-
-        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
-        offsets of each regex match are also adjusted for their new positions.
-        """
-        chunk_regex_metadata: dict[str, list[RegexMetadata]] = {}
-        separator_len = len(self._opts.text_separator)
-        running_text_len = len(self._overlap_prefix) if self._overlap_prefix else 0
-        start_offset = running_text_len
-
-        for element in self._elements:
-            text_len = len(element.text)
-            # -- skip empty elements like `PageBreak("")` --
-            if not text_len:
-                continue
-            # -- account for blank line between "squashed" elements, but not at start of text --
-            running_text_len += separator_len if running_text_len else 0
-            start_offset = running_text_len
-            running_text_len += text_len
-
-            if not element.metadata.regex_metadata:
-                continue
-
-            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
-            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
-            for regex_name, matches in element_regex_metadata.items():
-                for m in matches:
-                    m["start"] += start_offset
-                    m["end"] += start_offset
-                chunk_matches = chunk_regex_metadata.get(regex_name, [])
-                chunk_matches.extend(matches)
-                chunk_regex_metadata[regex_name] = chunk_matches
-
-        return chunk_regex_metadata
-
     def _iter_text_segments(self) -> Iterator[str]:
         """Generate overlap text and each element text segment in order.
@@ -812,8 +774,6 @@ class TextPreChunk:
                 # -- Python 3.7+ maintains dict insertion order --
                 ordered_unique_keys = {key: None for val_list in values for key in val_list}
                 yield field_name, list(ordered_unique_keys.keys())
-            elif strategy is CS.REGEX:
-                yield field_name, self._consolidated_regex_meta
             elif strategy is CS.DROP:
                 continue
             else:  # pragma: no cover

View File

@@ -8,7 +8,6 @@ import functools
 import hashlib
 import os
 import pathlib
-import re
 import uuid
 from itertools import groupby
 from types import MappingProxyType
@@ -127,14 +126,6 @@ class CoordinatesMetadata:
         return cls(points=points, system=system)
 
 
-class RegexMetadata(TypedDict):
-    """Metadata that is extracted from a document element via regex."""
-
-    text: str
-    start: int
-    end: int
-
-
 class Link(TypedDict):
     """Metadata related to extracted links"""
@@ -202,8 +193,6 @@ class ElementMetadata:
     # -- page numbers currently supported for DOCX, HTML, PDF, and PPTX documents --
     page_number: Optional[int]
     parent_id: Optional[str]
-    # -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
-    regex_metadata: Optional[dict[str, list[RegexMetadata]]]
 
     # -- e-mail specific metadata fields --
     bcc_recipient: Optional[list[str]]
@@ -254,7 +243,6 @@ class ElementMetadata:
         page_name: Optional[str] = None,
         page_number: Optional[int] = None,
         parent_id: Optional[str] = None,
-        regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
         sent_from: Optional[list[str]] = None,
         sent_to: Optional[list[str]] = None,
         signature: Optional[str] = None,
@@ -299,7 +287,6 @@ class ElementMetadata:
         self.page_name = page_name
         self.page_number = page_number
         self.parent_id = parent_id
-        self.regex_metadata = regex_metadata
         self.sent_from = sent_from
         self.sent_to = sent_to
         self.signature = signature
@@ -477,9 +464,6 @@ class ConsolidationStrategy(enum.Enum):
     LIST_UNIQUE = "list_unique"
     """Union list values across elements, preserving order. Only suitable for `List` fields."""
 
-    REGEX = "regex"
-    """Combine regex-metadata of elements, adjust start and stop offsets for concatenated text."""
-
     @classmethod
     def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
         """Mapping from ElementMetadata field-name to its consolidation strategy.
@@ -519,7 +503,6 @@ class ConsolidationStrategy(enum.Enum):
             "page_name": cls.FIRST,
             "page_number": cls.FIRST,
             "parent_id": cls.DROP,
-            "regex_metadata": cls.REGEX,
             "sent_from": cls.FIRST,
             "sent_to": cls.FIRST,
             "signature": cls.FIRST,
@@ -550,7 +533,7 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
     # -- generate sequence number for each element on a page --
     page_numbers = [e.metadata.page_number for e in elements]
     page_seq_pairs = [
-        seq_on_page for page, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
+        seq_on_page for _, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
     ]
 
     # -- assign hash IDs to elements --
@@ -575,7 +558,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
     This decorator adds a post-processing step to a document partitioner.
 
     - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
     - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
     """
@@ -605,13 +587,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
             elements = func(*args, **kwargs)
             call_args = get_call_args_applying_defaults(func, *args, **kwargs)
 
-            regex_metadata: dict["str", "str"] = call_args.get("regex_metadata", {})
-            # -- don't write an empty `{}` to metadata.regex_metadata when no regex-metadata was
-            # -- requested, otherwise it will serialize (because it's not None) when it has no
-            # -- meaning or is even misleading. Also it complicates tests that don't use regex-meta.
-            if regex_metadata:
-                elements = _add_regex_metadata(elements, regex_metadata)
-
             unique_element_ids: bool = call_args.get("unique_element_ids", False)
             if unique_element_ids is False:
                 elements = assign_and_map_hash_ids(elements)
@@ -623,36 +598,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, list[Element]]]:
     return decorator
 
 
-def _add_regex_metadata(
-    elements: list[Element],
-    regex_metadata: dict[str, str] = {},
-) -> list[Element]:
-    """Adds metadata based on a user provided regular expression.
-
-    The additional metadata will be added to the regex_metadata attrbuted in the element metadata.
-    """
-    for element in elements:
-        if isinstance(element, Text):
-            _regex_metadata: dict["str", list[RegexMetadata]] = {}
-            for field_name, pattern in regex_metadata.items():
-                results: list[RegexMetadata] = []
-                for result in re.finditer(pattern, element.text):
-                    start, end = result.span()
-                    results.append(
-                        {
-                            "text": element.text[start:end],
-                            "start": start,
-                            "end": end,
-                        },
-                    )
-                if len(results) > 0:
-                    _regex_metadata[field_name] = results
-
-            element.metadata.regex_metadata = _regex_metadata
-
-    return elements
-
-
 class ElementType:
     TITLE = "Title"
     TEXT = "Text"
@@ -738,9 +683,7 @@ class Element(abc.ABC):
         metadata: Optional[ElementMetadata] = None,
         detection_origin: Optional[str] = None,
     ):
-        if element_id is not None and not isinstance(
-            element_id, str
-        ):  # pyright: ignore[reportUnnecessaryIsInstance]
+        if element_id is not None and not isinstance(element_id, str):  # type: ignore
             raise ValueError("element_id must be of type str or None.")
 
         self._element_id = element_id
@@ -1075,7 +1018,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
 }
 
 
-def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
+def _kvform_rehydrate_internal_elements(kv_pairs: list[dict[str, Any]]) -> list[FormKeyValuePair]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from dicts into Elements explicitly,
@@ -1093,17 +1036,17 @@ def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
         (kv_pair["value"]["custom_element"],) = elements_from_dicts(
             [kv_pair["value"]["custom_element"]]
         )
-    return kv_pairs
+    return cast(list[FormKeyValuePair], kv_pairs)
 
 
-def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
+def _kvform_pairs_to_dict(orig_kv_pairs: list[FormKeyValuePair]) -> list[dict[str, Any]]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from Elements to dicts recursively,
     e.g. when FormKeysValues.to_dict() is used.
     """
-    kv_pairs: list[dict] = copy.deepcopy(kv_pairs)
+    kv_pairs: list[dict[str, Any]] = copy.deepcopy(orig_kv_pairs)  # type: ignore
     for kv_pair in kv_pairs:
         if kv_pair["key"]["custom_element"] is not None:
             kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict()

View File

@@ -495,7 +495,7 @@ class _OleFileDifferentiator:
     def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
         """True when file has CFBF magic first 8 bytes."""
         with ctx.open() as file:
-            return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
+            return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
 
     @staticmethod
     def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
@@ -719,10 +719,11 @@ def add_filetype(
     This decorator adds a post-processing step to a document partitioner.
 
-    - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
-    - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
+    - Adds `.metadata.filetype` (source-document MIME-type) metadata value
+
+    This "partial" decorator is present because `partition_image()` does not apply
+    `.metadata.filetype` this way since each image type has its own MIME-type (e.g. `image.jpeg`,
+    `image/png`, etc.).
     """
 
     def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:

View File

@@ -107,8 +107,6 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
             data["metadata"]["data_source"]["date_processed"] = parser.parse(
                 date_processed,
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)

View File

@@ -159,9 +159,6 @@ class SqlDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
         if data.get("metadata", {}).get("data_source", None):
             data.update(data.get("metadata", {}).pop("data_source", None))
         if data.get("metadata", {}).get("coordinates", None):

View File

@@ -169,9 +169,6 @@ class WeaviateDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(
             f"writing {len(elements_dict)} objects to destination "

View File

@@ -111,8 +111,6 @@ class AzureCognitiveSearchUploadStager(UploadStager):
                 date_processed
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
         return data
@@ -179,7 +177,6 @@ class AzureCognitiveSearchUploader(Uploader):
         return self.write_dict(elements_dict=elements_dict)
 
     def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
-
         elements_dict = []
         for content in contents:
             with open(content.path) as elements_file:

View File

@@ -103,7 +103,6 @@ _COLUMNS = (
     "emphasized_text_contents",
     "emphasized_text_tags",
     "text_as_html",
-    "regex_metadata",
     "detection_class_prob",
 )
@@ -165,10 +164,7 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(
                 lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
             )
-        for column in filter(
-            lambda x: x in df.columns,
-            ("version", "page_number", "regex_metadata"),
-        ):
+        for column in filter(lambda x: x in df.columns, ("version", "page_number")):
             df[column] = df[column].apply(str)
 
         with output_path.open("w") as output_file:

View File

@@ -126,9 +126,6 @@ class WeaviateUploadStager(UploadStager):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def run(
         self,
         elements_filepath: Path,

View File

@@ -272,13 +272,7 @@ def flatten_dict(
 
 
 def _get_table_fieldnames(rows: list[dict[str, Any]]):
-    table_fieldnames = list(TABLE_FIELDNAMES)
-    for row in rows:
-        metadata = row["metadata"]
-        for key in flatten_dict(metadata):
-            if key.startswith("regex_metadata") and key not in table_fieldnames:
-                table_fieldnames.append(key)
-    return table_fieldnames
+    return list(TABLE_FIELDNAMES)
 
 
 def convert_to_csv(elements: Iterable[Element]) -> str:
@@ -337,7 +331,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
         "emphasized_text_contents": object,  # Optional[list[str]]
         "emphasized_text_tags": object,  # Optional[list[str]]
         "text_as_html": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "regex_metadata": object,
         "max_characters": "Int64",  # Optional[int]
         "is_continuation": "boolean",  # Optional[bool]
         "detection_class_prob": float,  # Optional[float],
@@ -354,7 +347,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
         "data_source_date_processed": pd.StringDtype(),  # Optional[str]  # type: ignore
         "data_source_permissions_data": object,
         "embeddings": object,
-        "regex_metadata_key": object,
     }

View File

@@ -16,7 +16,6 @@ exclude_metadata_keys = (
     "is_continuation",
     "links",
     "orig_elements",
-    "regex_metadata",
     "key_value_pairs",
 )