rfctr(part): prepare for pluggable auto-partitioners 2 (#3657)

**Summary**
Step 2 in preparation for pluggable auto-partitioners: remove the
`regex_metadata` field from `ElementMetadata`.

**Additional Context**
- "regex-metadata" was an experimental feature that didn't pan out.
- It is implemented by one of the post-partitioning metadata decorators,
  so removing it here is part of the cleanup before those decorators are
  consolidated.
Authored by Steve Canny on 2024-09-24 10:33:25 -07:00, committed by GitHub.
parent 903efb0c6d · commit 086b8d6f8a
24 changed files with 20 additions and 373 deletions
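
For reference, the removed feature attached regex matches to element metadata at
partition time. A minimal sketch of the pre-removal behavior, taken directly from
the `test_partition_text_extract_regex_metadata` test deleted below (it no longer
works after this commit):

```python
# Pre-removal behavior: a partitioner accepted a `regex_metadata` mapping of
# field-name -> pattern and recorded each match, with start/end offsets into
# the element's text, on `element.metadata.regex_metadata`.
from unstructured.partition.text import partition_text

elements = partition_text(
    text="SPEAKER 1: It is my turn to speak now!",
    regex_metadata={"speaker": r"SPEAKER \d{1,3}"},
)
assert elements[0].metadata.regex_metadata == {
    "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
}
```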

View File

@@ -135,9 +135,6 @@
       "type": "text",
       "analyzer": "standard"
     },
-    "regex_metadata": {
-      "type": "object"
-    },
     "detection_class_prob": {
       "type": "float"
     }

View File

@@ -139,9 +139,6 @@
       "type": "text",
       "analyzer": "standard"
     },
-    "regex_metadata": {
-      "type": "object"
-    },
     "detection_class_prob": {
       "type": "float"
     }

View File

@@ -37,7 +37,6 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL,
     is_continuation BOOLEAN,
     orig_elements TEXT,

View File

@@ -38,7 +38,6 @@ CREATE TABLE elements (
     emphasized_text_contents VARCHAR [],
     emphasized_text_tags VARCHAR [],
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );

View File

@@ -36,6 +36,5 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );

View File

@@ -361,15 +361,6 @@
       "name": "text_as_html",
       "tokenization": "word"
     },
-    {
-      "dataType": [
-        "text"
-      ],
-      "indexFilterable": true,
-      "indexSearchable": true,
-      "name": "regex_metadata",
-      "tokenization": "word"
-    },
     {
       "dataType": [
         "number"
@@ -420,4 +411,4 @@
   },
   "vectorIndexType": "hnsw",
   "vectorizer": "none"
-}
\ No newline at end of file
+}

View File

@@ -31,7 +31,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     PageBreak,
-    RegexMetadata,
     Table,
     TableChunk,
     Text,
@@ -958,51 +957,6 @@ class DescribeTextPreChunk:
         assert orig_elements[0] is element
         assert orig_elements[1] is element_2
 
-    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
-        """regex_metadata of chunk is combined regex_metadatas of its elements.
-
-        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
-        position in the chunk after element text has been concatenated.
-        """
-        pre_chunk = TextPreChunk(
-            [
-                Title(
-                    "Lorem Ipsum",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-                    ),
-                ),
-                Text(
-                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-                    metadata=ElementMetadata(
-                        regex_metadata={
-                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                        },
-                    ),
-                ),
-                Text(
-                    "In rhoncus ipsum sed lectus porta volutpat.",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-                    ),
-                ),
-            ],
-            overlap_prefix="ficitur.",  # len == 8
-            opts=ChunkingOptions(),
-        )
-
-        regex_metadata = pre_chunk._consolidated_regex_meta
-
-        assert regex_metadata == {
-            "dolor": [RegexMetadata(text="dolor", start=35, end=40)],
-            "ipsum": [
-                RegexMetadata(text="Ipsum", start=16, end=21),
-                RegexMetadata(text="ipsum", start=29, end=34),
-                RegexMetadata(text="ipsum", start=91, end=96),
-            ],
-        }
-
     def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
         """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
@@ -1021,7 +975,6 @@ class DescribeTextPreChunk:
                         emphasized_text_contents=["Lorem", "Ipsum"],
                         emphasized_text_tags=["b", "i"],
                         languages=["lat"],
-                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                     ),
                 ),
                 Text(
@@ -1036,11 +989,6 @@ class DescribeTextPreChunk:
                         emphasized_text_tags=["i", "b"],
                         # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                         languages=["eng", "lat"],
-                        # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
-                        regex_metadata={
-                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                        },
                     ),
                 ),
             ],
@@ -1055,13 +1003,6 @@ class DescribeTextPreChunk:
             "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
             "emphasized_text_tags": ["b", "i", "i", "b"],
             "languages": ["lat", "eng"],
-            "regex_metadata": {
-                "ipsum": [
-                    RegexMetadata(text="Ipsum", start=6, end=11),
-                    RegexMetadata(text="ipsum", start=19, end=24),
-                ],
-                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-            },
         }
 
     def it_computes_the_original_elements_list_to_help(self):

View File

@@ -19,7 +19,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     ListItem,
-    RegexMetadata,
     Table,
     Text,
     Title,
@@ -111,12 +110,7 @@ def test_chunk_by_title():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -134,9 +128,6 @@ def test_chunk_by_title():
         ),
     ]
     assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
-    assert chunks[3].metadata == ElementMetadata(
-        regex_metadata={"a": [RegexMetadata(text="A", start=11, end=12)]},
-    )
 
 
 def test_chunk_by_title_separates_by_page_number():
@@ -149,12 +140,7 @@ def test_chunk_by_title_separates_by_page_number():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -185,12 +171,7 @@ def test_chuck_by_title_respects_multipage():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -207,90 +188,6 @@ def test_chuck_by_title_respects_multipage():
     ]
 
 
-def test_chunk_by_title_does_not_break_on_regex_metadata_change():
-    """PreChunker is insensitive to regex-metadata changes.
-
-    A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk should
-    not be split based on such a difference.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]},
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-
-    chunks = chunk_by_title(elements)
-
-    assert chunks == [
-        CompositeElement(
-            "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-            " ipsum sed lectus porta volutpat.",
-        ),
-    ]
-
-
-def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
-    """ElementMetadata.regex_metadata of chunk is union of regex_metadatas of its elements.
-
-    The `start` and `end` offsets of each regex-match are adjusted to reflect their new position in
-    the chunk after element text has been concatenated.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={
-                    "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                    "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                },
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-
-    chunks = chunk_by_title(elements)
-
-    assert len(chunks) == 1
-    chunk = chunks[0]
-    assert chunk == CompositeElement(
-        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-        " ipsum sed lectus porta volutpat.",
-    )
-    assert chunk.metadata.regex_metadata == {
-        "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-        "ipsum": [
-            RegexMetadata(text="Ipsum", start=6, end=11),
-            RegexMetadata(text="ipsum", start=19, end=24),
-            RegexMetadata(text="ipsum", start=81, end=86),
-        ],
-    }
-
-
 def test_chunk_by_title_groups_across_pages():
     elements: list[Element] = [
         Title("A Great Day", metadata=ElementMetadata(page_number=1)),
@@ -301,12 +198,7 @@ def test_chunk_by_title_groups_across_pages():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
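
The expected offsets in the deleted consolidation tests above follow from simple
concatenation arithmetic. A minimal sketch of that adjustment (an illustration
only, not library code; the actual implementation is the
`_consolidated_regex_meta` property removed from `TextPreChunk` further down):

```python
# A match's offsets shift right by the length of all chunk text that precedes
# its element; element texts are joined with a "\n\n" separator.
texts = [
    "Lorem Ipsum",
    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
    "In rhoncus ipsum sed lectus porta volutpat.",
]
separator = "\n\n"

bases: list[int] = []  # chunk offset at which each element's text begins
offset = 0
for text in texts:
    bases.append(offset)
    offset += len(text) + len(separator)

assert bases == [0, 13, 70]
# "dolor" at 12..17 in the second element lands at 25..30 in the chunk:
assert (bases[1] + 12, bases[1] + 17) == (25, 30)
# "ipsum" at 11..16 in the third element lands at 81..86:
assert (bases[2] + 11, bases[2] + 16) == (81, 86)
```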

View File

@@ -27,7 +27,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     Points,
-    RegexMetadata,
     Text,
     Title,
     assign_and_map_hash_ids,
@@ -235,24 +234,6 @@ def test_element_to_dict():
     }
 
 
-def test_regex_metadata_round_trips_through_JSON():
-    """metadata.regex_metadata should appear at full depth in JSON."""
-    regex_metadata = {
-        "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
-        "version": [
-            RegexMetadata(text="current=v1.7.2", start=7, end=21),
-            RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
-        ],
-    }
-    metadata = ElementMetadata(regex_metadata=regex_metadata)
-
-    metadata_json = json.dumps(metadata.to_dict())
-    deserialized_metadata = ElementMetadata.from_dict(json.loads(metadata_json))
-    reserialized_metadata_json = json.dumps(deserialized_metadata.to_dict())
-
-    assert reserialized_metadata_json == metadata_json
-
-
 class DescribeElementMetadata:
     """Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""

View File

@@ -244,17 +244,6 @@ the fox met a bear."""
     assert element.metadata.filename is None
 
 
-def test_partition_text_extract_regex_metadata():
-    text = "SPEAKER 1: It is my turn to speak now!"
-    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
-
-    assert elements[0].metadata.regex_metadata == {
-        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
-    }
-    for element in elements:
-        assert element.metadata.filename is None
-
-
 def test_partition_text_splits_long_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
     elements = partition_text(filename=filename)

View File

@@ -23,7 +23,6 @@ from unstructured.documents.elements import (
     ListItem,
     NarrativeText,
     PageBreak,
-    RegexMetadata,
     Text,
     Title,
 )
@@ -113,17 +112,12 @@ def test_convert_to_dataframe_maintains_fields():
     elements = partition_email(
         "example-docs/eml/fake-email-attachment.eml",
         process_attachements=True,
-        regex_metadata={"hello": r"Hello", "punc": r"[!]"},
     )
     df = base.convert_to_dataframe(elements)
     for element in elements:
         metadata = element.metadata.to_dict()
         for key in metadata:
-            if not key.startswith("regex_metadata"):
-                assert key in df.columns
-
-        assert "regex_metadata_hello" in df.columns
-        assert "regex_metadata_punc" in df.columns
+            assert key in df.columns
 
 
 def test_default_pandas_dtypes():
@@ -171,7 +165,6 @@ def test_default_pandas_dtypes():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),
@@ -328,7 +321,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),
@@ -372,7 +364,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),

View File

@@ -177,10 +177,6 @@
       "name": "text_as_html",
       "type": "Edm.String"
     },
-    {
-      "name": "regex_metadata",
-      "type": "Edm.String"
-    },
     {
       "name": "detection_class_prob",
       "type": "Edm.Double"
@@ -202,4 +198,4 @@
       }
     ]
   }
-}
\ No newline at end of file
+}

View File

@@ -55,7 +55,6 @@ TEST_DATA_2 = {
         },
         "last_modified": "2021-01-03T00:00:00",
         "page_number": 10,
-        "regex_metadata": {"pattern": "abc"},
     },
     "embeddings": [0.1, 0.2, 0.3],
 }
@@ -135,7 +134,6 @@ def test_conform_dict_2():
         "links": '{"link1": "https://example.com", "link2": "https://example.org"}',
         "last_modified": datetime.datetime(2021, 1, 3, 0, 0),
         "page_number": "10",
-        "regex_metadata": '{"pattern": "abc"}',
         "date_created": datetime.datetime(2021, 1, 1, 0, 0),
         "date_modified": datetime.datetime(2021, 1, 2, 0, 0),
         "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8),

View File

@@ -15,7 +15,6 @@ from unstructured.documents.elements import (
     ConsolidationStrategy,
     Element,
     ElementMetadata,
-    RegexMetadata,
     Table,
     TableChunk,
     Title,
@@ -739,43 +738,6 @@ class TextPreChunk:
         continuation_metadata.is_continuation = True
         return continuation_metadata
 
-    @lazyproperty
-    def _consolidated_regex_meta(self) -> dict[str, list[RegexMetadata]]:
-        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.
-
-        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
-        offsets of each regex match are also adjusted for their new positions.
-        """
-        chunk_regex_metadata: dict[str, list[RegexMetadata]] = {}
-        separator_len = len(self._opts.text_separator)
-        running_text_len = len(self._overlap_prefix) if self._overlap_prefix else 0
-        start_offset = running_text_len
-
-        for element in self._elements:
-            text_len = len(element.text)
-            # -- skip empty elements like `PageBreak("")` --
-            if not text_len:
-                continue
-            # -- account for blank line between "squashed" elements, but not at start of text --
-            running_text_len += separator_len if running_text_len else 0
-            start_offset = running_text_len
-            running_text_len += text_len
-
-            if not element.metadata.regex_metadata:
-                continue
-
-            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
-            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
-            for regex_name, matches in element_regex_metadata.items():
-                for m in matches:
-                    m["start"] += start_offset
-                    m["end"] += start_offset
-                chunk_matches = chunk_regex_metadata.get(regex_name, [])
-                chunk_matches.extend(matches)
-                chunk_regex_metadata[regex_name] = chunk_matches
-
-        return chunk_regex_metadata
-
     def _iter_text_segments(self) -> Iterator[str]:
         """Generate overlap text and each element text segment in order.
@@ -812,8 +774,6 @@ class TextPreChunk:
                 # -- Python 3.7+ maintains dict insertion order --
                 ordered_unique_keys = {key: None for val_list in values for key in val_list}
                 yield field_name, list(ordered_unique_keys.keys())
-            elif strategy is CS.REGEX:
-                yield field_name, self._consolidated_regex_meta
             elif strategy is CS.DROP:
                 continue
             else:  # pragma: no cover

View File

@@ -8,7 +8,6 @@ import functools
 import hashlib
 import os
 import pathlib
-import re
 import uuid
 from itertools import groupby
 from types import MappingProxyType
@@ -127,14 +126,6 @@ class CoordinatesMetadata:
         return cls(points=points, system=system)
 
 
-class RegexMetadata(TypedDict):
-    """Metadata that is extracted from a document element via regex."""
-
-    text: str
-    start: int
-    end: int
-
-
 class Link(TypedDict):
     """Metadata related to extracted links"""
@@ -202,8 +193,6 @@ class ElementMetadata:
     # -- page numbers currently supported for DOCX, HTML, PDF, and PPTX documents --
     page_number: Optional[int]
     parent_id: Optional[str]
-    # -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
-    regex_metadata: Optional[dict[str, list[RegexMetadata]]]
 
     # -- e-mail specific metadata fields --
     bcc_recipient: Optional[list[str]]
@@ -254,7 +243,6 @@ class ElementMetadata:
         page_name: Optional[str] = None,
         page_number: Optional[int] = None,
         parent_id: Optional[str] = None,
-        regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
         sent_from: Optional[list[str]] = None,
         sent_to: Optional[list[str]] = None,
         signature: Optional[str] = None,
@@ -299,7 +287,6 @@ class ElementMetadata:
         self.page_name = page_name
         self.page_number = page_number
         self.parent_id = parent_id
-        self.regex_metadata = regex_metadata
         self.sent_from = sent_from
         self.sent_to = sent_to
         self.signature = signature
@@ -477,9 +464,6 @@ class ConsolidationStrategy(enum.Enum):
     LIST_UNIQUE = "list_unique"
     """Union list values across elements, preserving order. Only suitable for `List` fields."""
 
-    REGEX = "regex"
-    """Combine regex-metadata of elements, adjust start and stop offsets for concatenated text."""
-
     @classmethod
     def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
         """Mapping from ElementMetadata field-name to its consolidation strategy.
@@ -519,7 +503,6 @@ class ConsolidationStrategy(enum.Enum):
             "page_name": cls.FIRST,
             "page_number": cls.FIRST,
             "parent_id": cls.DROP,
-            "regex_metadata": cls.REGEX,
             "sent_from": cls.FIRST,
             "sent_to": cls.FIRST,
             "signature": cls.FIRST,
@@ -550,7 +533,7 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
     # -- generate sequence number for each element on a page --
     page_numbers = [e.metadata.page_number for e in elements]
     page_seq_pairs = [
-        seq_on_page for page, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
+        seq_on_page for _, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
     ]
 
     # -- assign hash IDs to elements --
@@ -575,7 +558,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
     This decorator adds a post-processing step to a document partitioner.
 
     - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
     - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
     """
@@ -605,13 +587,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
             elements = func(*args, **kwargs)
             call_args = get_call_args_applying_defaults(func, *args, **kwargs)
-            regex_metadata: dict["str", "str"] = call_args.get("regex_metadata", {})
-            # -- don't write an empty `{}` to metadata.regex_metadata when no regex-metadata was
-            # -- requested, otherwise it will serialize (because it's not None) when it has no
-            # -- meaning or is even misleading. Also it complicates tests that don't use regex-meta.
-            if regex_metadata:
-                elements = _add_regex_metadata(elements, regex_metadata)
-
             unique_element_ids: bool = call_args.get("unique_element_ids", False)
             if unique_element_ids is False:
                 elements = assign_and_map_hash_ids(elements)
@@ -623,36 +598,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
     return decorator
 
 
-def _add_regex_metadata(
-    elements: list[Element],
-    regex_metadata: dict[str, str] = {},
-) -> list[Element]:
-    """Adds metadata based on a user provided regular expression.
-
-    The additional metadata will be added to the regex_metadata attrbuted in the element metadata.
-    """
-    for element in elements:
-        if isinstance(element, Text):
-            _regex_metadata: dict["str", list[RegexMetadata]] = {}
-            for field_name, pattern in regex_metadata.items():
-                results: list[RegexMetadata] = []
-                for result in re.finditer(pattern, element.text):
-                    start, end = result.span()
-                    results.append(
-                        {
-                            "text": element.text[start:end],
-                            "start": start,
-                            "end": end,
-                        },
-                    )
-                if len(results) > 0:
-                    _regex_metadata[field_name] = results
-
-            element.metadata.regex_metadata = _regex_metadata
-
-    return elements
-
-
 class ElementType:
     TITLE = "Title"
     TEXT = "Text"
@@ -738,9 +683,7 @@ class Element(abc.ABC):
         metadata: Optional[ElementMetadata] = None,
         detection_origin: Optional[str] = None,
     ):
-        if element_id is not None and not isinstance(
-            element_id, str
-        ):  # pyright: ignore[reportUnnecessaryIsInstance]
+        if element_id is not None and not isinstance(element_id, str):  # type: ignore
         raise ValueError("element_id must be of type str or None.")
 
         self._element_id = element_id
@@ -1075,7 +1018,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
 }
 
 
-def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
+def _kvform_rehydrate_internal_elements(kv_pairs: list[dict[str, Any]]) -> list[FormKeyValuePair]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from dicts into Elements explicitly,
@@ -1093,17 +1036,17 @@ def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyVal
         (kv_pair["value"]["custom_element"],) = elements_from_dicts(
             [kv_pair["value"]["custom_element"]]
         )
-    return kv_pairs
+    return cast(list[FormKeyValuePair], kv_pairs)
 
 
-def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
+def _kvform_pairs_to_dict(orig_kv_pairs: list[FormKeyValuePair]) -> list[dict[str, Any]]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from Elements to dicts recursively,
     e.g. when FormKeysValues.to_dict() is used.
     """
-    kv_pairs: list[dict] = copy.deepcopy(kv_pairs)
+    kv_pairs: list[dict[str, Any]] = copy.deepcopy(orig_kv_pairs)  # type: ignore
     for kv_pair in kv_pairs:
         if kv_pair["key"]["custom_element"] is not None:
             kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict()

View File

@@ -495,7 +495,7 @@ class _OleFileDifferentiator:
     def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
         """True when file has CFBF magic first 8 bytes."""
         with ctx.open() as file:
-            return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
+            return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
 
     @staticmethod
     def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
@@ -719,10 +719,11 @@ def add_filetype(
     This decorator adds a post-processing step to a document partitioner.
 
     - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
     - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
     - Adds `.metadata.filetype` (source-document MIME-type) metadata value
 
+    This "partial" decorator is present because `partition_image()` does not apply
+    `.metadata.filetype` this way since each image type has its own MIME-type (e.g. `image.jpeg`,
+    `image/png`, etc.).
     """
 
     def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:

View File

@@ -107,8 +107,6 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
             data["metadata"]["data_source"]["date_processed"] = parser.parse(
                 date_processed,
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)

View File

@@ -159,9 +159,6 @@ class SqlDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
         if data.get("metadata", {}).get("data_source", None):
             data.update(data.get("metadata", {}).pop("data_source", None))
         if data.get("metadata", {}).get("coordinates", None):

View File

@@ -169,9 +169,6 @@ class WeaviateDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
View File

@@ -111,8 +111,6 @@ class AzureCognitiveSearchUploadStager(UploadStager):
                 date_processed
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
         return data
@@ -179,7 +177,6 @@ class AzureCognitiveSearchUploader(Uploader):
         return self.write_dict(elements_dict=elements_dict)
 
     def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
-
         elements_dict = []
         for content in contents:
             with open(content.path) as elements_file:

View File

@@ -103,7 +103,6 @@ _COLUMNS = (
     "emphasized_text_contents",
     "emphasized_text_tags",
     "text_as_html",
-    "regex_metadata",
     "detection_class_prob",
 )
@@ -165,10 +164,7 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(
                 lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
             )
-        for column in filter(
-            lambda x: x in df.columns,
-            ("version", "page_number", "regex_metadata"),
-        ):
+        for column in filter(lambda x: x in df.columns, ("version", "page_number")):
             df[column] = df[column].apply(str)
 
         with output_path.open("w") as output_file:

View File

@@ -126,9 +126,6 @@ class WeaviateUploadStager(UploadStager):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def run(
         self,
         elements_filepath: Path,

View File

@@ -272,13 +272,7 @@ def flatten_dict(
 
 
 def _get_table_fieldnames(rows: list[dict[str, Any]]):
-    table_fieldnames = list(TABLE_FIELDNAMES)
-    for row in rows:
-        metadata = row["metadata"]
-        for key in flatten_dict(metadata):
-            if key.startswith("regex_metadata") and key not in table_fieldnames:
-                table_fieldnames.append(key)
-    return table_fieldnames
+    return list(TABLE_FIELDNAMES)
 
 
 def convert_to_csv(elements: Iterable[Element]) -> str:
@@ -337,7 +331,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
         "emphasized_text_contents": object,  # Optional[list[str]]
         "emphasized_text_tags": object,  # Optional[list[str]]
         "text_as_html": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "regex_metadata": object,
         "max_characters": "Int64",  # Optional[int]
         "is_continuation": "boolean",  # Optional[bool]
         "detection_class_prob": float,  # Optional[float],
@@ -354,7 +347,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
         "data_source_date_processed": pd.StringDtype(),  # Optional[str]  # type: ignore
         "data_source_permissions_data": object,
         "embeddings": object,
-        "regex_metadata_key": object,
     }

View File

@@ -16,7 +16,6 @@ exclude_metadata_keys = (
     "is_continuation",
     "links",
     "orig_elements",
-    "regex_metadata",
     "key_value_pairs",
 )