Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)
rfctr(part): prepare for pluggable auto-partitioners 2 (#3657)
**Summary**
Step 2 in prep for pluggable auto-partitioners: remove the `regex_metadata` field from `ElementMetadata`.

**Additional Context**
- "regex-metadata" was an experimental feature that didn't pan out.
- It's implemented by one of the post-partitioning metadata decorators, so get rid of it as part of the cleanup before consolidating those decorators.
Parent: 903efb0c6d
Commit: 086b8d6f8a
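For context on what is being removed: the snippet below reconstructs the caller-side behavior verbatim from a test deleted later in this diff. It no longer runs after this commit, since the `regex_metadata` keyword and metadata field are gone.

```python
from unstructured.partition.text import partition_text

# Pre-removal behavior: a `regex_metadata` mapping of field-name -> pattern
# caused each match to be recorded on the element with its character offsets.
elements = partition_text(
    text="SPEAKER 1: It is my turn to speak now!",
    regex_metadata={"speaker": r"SPEAKER \d{1,3}"},
)

# Each match was stored as a {"text", "start", "end"} dict (RegexMetadata).
assert elements[0].metadata.regex_metadata == {
    "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
}
```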
Index-mapping schemas (two files carry the same change):

@@ -135,9 +135,6 @@
         "type": "text",
         "analyzer": "standard"
       },
-      "regex_metadata": {
-        "type": "object"
-      },
       "detection_class_prob": {
         "type": "float"
       }
@@ -139,9 +139,6 @@
         "type": "text",
         "analyzer": "standard"
       },
-      "regex_metadata": {
-        "type": "object"
-      },
       "detection_class_prob": {
         "type": "float"
       }
SQL `CREATE TABLE elements` schemas (three files):

@@ -37,7 +37,6 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL,
     is_continuation BOOLEAN,
     orig_elements TEXT,
@@ -38,7 +38,6 @@ CREATE TABLE elements (
     emphasized_text_contents VARCHAR [],
     emphasized_text_tags VARCHAR [],
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );
 
@@ -36,6 +36,5 @@ CREATE TABLE elements (
     emphasized_text_contents TEXT,
     emphasized_text_tags TEXT,
     text_as_html TEXT,
-    regex_metadata TEXT,
     detection_class_prob DECIMAL
 );
Weaviate class schema:

@@ -361,15 +361,6 @@
       "name": "text_as_html",
       "tokenization": "word"
     },
-    {
-      "dataType": [
-        "text"
-      ],
-      "indexFilterable": true,
-      "indexSearchable": true,
-      "name": "regex_metadata",
-      "tokenization": "word"
-    },
     {
       "dataType": [
         "number"
@@ -420,4 +411,4 @@
   },
   "vectorIndexType": "hnsw",
   "vectorizer": "none"
 }
Pre-chunk unit tests (`DescribeTextPreChunk`):

@@ -31,7 +31,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     PageBreak,
-    RegexMetadata,
     Table,
     TableChunk,
     Text,
@@ -958,51 +957,6 @@ class DescribeTextPreChunk:
         assert orig_elements[0] is element
         assert orig_elements[1] is element_2
 
-    def it_consolidates_regex_metadata_in_a_field_specific_way(self):
-        """regex_metadata of chunk is combined regex_metadatas of its elements.
-
-        Also, the `start` and `end` offsets of each regex-match are adjusted to reflect their new
-        position in the chunk after element text has been concatenated.
-        """
-        pre_chunk = TextPreChunk(
-            [
-                Title(
-                    "Lorem Ipsum",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-                    ),
-                ),
-                Text(
-                    "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-                    metadata=ElementMetadata(
-                        regex_metadata={
-                            "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                            "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                        },
-                    ),
-                ),
-                Text(
-                    "In rhoncus ipsum sed lectus porta volutpat.",
-                    metadata=ElementMetadata(
-                        regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-                    ),
-                ),
-            ],
-            overlap_prefix="ficitur.",  # len == 8
-            opts=ChunkingOptions(),
-        )
-
-        regex_metadata = pre_chunk._consolidated_regex_meta
-
-        assert regex_metadata == {
-            "dolor": [RegexMetadata(text="dolor", start=35, end=40)],
-            "ipsum": [
-                RegexMetadata(text="Ipsum", start=16, end=21),
-                RegexMetadata(text="ipsum", start=29, end=34),
-                RegexMetadata(text="ipsum", start=91, end=96),
-            ],
-        }
-
     def it_forms_ElementMetadata_constructor_kwargs_by_applying_consolidation_strategies(self):
         """._meta_kwargs is used like `ElementMetadata(**self._meta_kwargs)` to construct metadata.
 
@@ -1021,7 +975,6 @@ class DescribeTextPreChunk:
                     emphasized_text_contents=["Lorem", "Ipsum"],
                     emphasized_text_tags=["b", "i"],
                     languages=["lat"],
-                    regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
                 ),
             ),
             Text(
@@ -1036,11 +989,6 @@ class DescribeTextPreChunk:
                     emphasized_text_tags=["i", "b"],
                     # -- languages has LIST_UNIQUE strategy, so "lat(in)" appears only once --
                     languages=["eng", "lat"],
-                    # -- regex_metadata has its own dedicated consolidation-strategy (REGEX) --
-                    regex_metadata={
-                        "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                        "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                    },
                 ),
             ),
         ],
@@ -1055,13 +1003,6 @@ class DescribeTextPreChunk:
             "emphasized_text_contents": ["Lorem", "Ipsum", "Lorem", "ipsum"],
             "emphasized_text_tags": ["b", "i", "i", "b"],
             "languages": ["lat", "eng"],
-            "regex_metadata": {
-                "ipsum": [
-                    RegexMetadata(text="Ipsum", start=6, end=11),
-                    RegexMetadata(text="ipsum", start=19, end=24),
-                ],
-                "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-            },
         }
 
     def it_computes_the_original_elements_list_to_help(self):
`chunk_by_title` tests:

@@ -19,7 +19,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     ListItem,
-    RegexMetadata,
     Table,
     Text,
     Title,
@@ -111,12 +110,7 @@ def test_chunk_by_title():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -134,9 +128,6 @@ def test_chunk_by_title():
         ),
     ]
     assert chunks[0].metadata == ElementMetadata(emphasized_text_contents=["Day", "day"])
-    assert chunks[3].metadata == ElementMetadata(
-        regex_metadata={"a": [RegexMetadata(text="A", start=11, end=12)]},
-    )
 
 
 def test_chunk_by_title_separates_by_page_number():
@@ -149,12 +140,7 @@ def test_chunk_by_title_separates_by_page_number():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -185,12 +171,7 @@ def test_chuck_by_title_respects_multipage():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
@@ -207,90 +188,6 @@ def test_chuck_by_title_respects_multipage():
     ]
 
 
-def test_chunk_by_title_does_not_break_on_regex_metadata_change():
-    """PreChunker is insensitive to regex-metadata changes.
-
-    A regex-metadata match in an element does not signify a semantic boundary and a pre-chunk should
-    not be split based on such a difference.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]},
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-
-    chunks = chunk_by_title(elements)
-
-    assert chunks == [
-        CompositeElement(
-            "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-            " ipsum sed lectus porta volutpat.",
-        ),
-    ]
-
-
-def test_chunk_by_title_consolidates_and_adjusts_offsets_of_regex_metadata():
-    """ElementMetadata.regex_metadata of chunk is union of regex_metadatas of its elements.
-
-    The `start` and `end` offsets of each regex-match are adjusted to reflect their new position in
-    the chunk after element text has been concatenated.
-    """
-    elements: list[Element] = [
-        Title(
-            "Lorem Ipsum",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]},
-            ),
-        ),
-        Text(
-            "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
-            metadata=ElementMetadata(
-                regex_metadata={
-                    "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
-                    "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
-                },
-            ),
-        ),
-        Text(
-            "In rhoncus ipsum sed lectus porta volutpat.",
-            metadata=ElementMetadata(
-                regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]},
-            ),
-        ),
-    ]
-    chunks = chunk_by_title(elements)
-
-    assert len(chunks) == 1
-    chunk = chunks[0]
-    assert chunk == CompositeElement(
-        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
-        " ipsum sed lectus porta volutpat.",
-    )
-    assert chunk.metadata.regex_metadata == {
-        "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
-        "ipsum": [
-            RegexMetadata(text="Ipsum", start=6, end=11),
-            RegexMetadata(text="ipsum", start=19, end=24),
-            RegexMetadata(text="ipsum", start=81, end=86),
-        ],
-    }
-
-
 def test_chunk_by_title_groups_across_pages():
     elements: list[Element] = [
         Title("A Great Day", metadata=ElementMetadata(page_number=1)),
@@ -301,12 +198,7 @@ def test_chunk_by_title_groups_across_pages():
         Text("Today is an okay day."),
         Text("It is rainy outside."),
         Title("A Bad Day"),
-        Text(
-            "Today is a bad day.",
-            metadata=ElementMetadata(
-                regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]},
-            ),
-        ),
+        Text("Today is a bad day."),
         Text("It is storming outside."),
         CheckBox(),
     ]
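The deleted `test_chunk_by_title_does_not_break_on_regex_metadata_change` above also records why this removal is safe for chunking: a regex match never created a pre-chunk boundary. A minimal sketch of the behavior that survives, pared down from that deleted test (the regex-metadata kwargs are simply dropped):

```python
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import CompositeElement, Text, Title

# Chunk boundaries come from document structure (titles, pages), not from
# regex matches, so dropping regex_metadata leaves the chunking unchanged.
elements = [
    Title("Lorem Ipsum"),
    Text("Lorem ipsum dolor sit amet consectetur adipiscing elit."),
    Text("In rhoncus ipsum sed lectus porta volutpat."),
]

chunks = chunk_by_title(elements)

assert chunks == [
    CompositeElement(
        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
        " ipsum sed lectus porta volutpat.",
    ),
]
```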
Element serialization tests:

@@ -27,7 +27,6 @@ from unstructured.documents.elements import (
     Element,
     ElementMetadata,
     Points,
-    RegexMetadata,
     Text,
     Title,
     assign_and_map_hash_ids,
@@ -235,24 +234,6 @@ def test_element_to_dict():
     }
 
 
-def test_regex_metadata_round_trips_through_JSON():
-    """metadata.regex_metadata should appear at full depth in JSON."""
-    regex_metadata = {
-        "mail-stop": [RegexMetadata(text="MS-107", start=18, end=24)],
-        "version": [
-            RegexMetadata(text="current=v1.7.2", start=7, end=21),
-            RegexMetadata(text="supersedes=v1.7.2", start=22, end=40),
-        ],
-    }
-    metadata = ElementMetadata(regex_metadata=regex_metadata)
-
-    metadata_json = json.dumps(metadata.to_dict())
-    deserialized_metadata = ElementMetadata.from_dict(json.loads(metadata_json))
-    reserialized_metadata_json = json.dumps(deserialized_metadata.to_dict())
-
-    assert reserialized_metadata_json == metadata_json
-
-
 class DescribeElementMetadata:
     """Unit-test suite for `unstructured.documents.elements.ElementMetadata`."""
 
`partition_text` tests:

@@ -244,17 +244,6 @@ the fox met a bear."""
     assert element.metadata.filename is None
 
 
-def test_partition_text_extract_regex_metadata():
-    text = "SPEAKER 1: It is my turn to speak now!"
-
-    elements = partition_text(text=text, regex_metadata={"speaker": r"SPEAKER \d{1,3}"})
-    assert elements[0].metadata.regex_metadata == {
-        "speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}],
-    }
-    for element in elements:
-        assert element.metadata.filename is None
-
-
 def test_partition_text_splits_long_text():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "norwich-city.txt")
     elements = partition_text(filename=filename)
Staging/conversion tests:

@@ -23,7 +23,6 @@ from unstructured.documents.elements import (
     ListItem,
     NarrativeText,
     PageBreak,
-    RegexMetadata,
     Text,
     Title,
 )
@@ -113,17 +112,12 @@ def test_convert_to_dataframe_maintains_fields():
     elements = partition_email(
         "example-docs/eml/fake-email-attachment.eml",
         process_attachements=True,
-        regex_metadata={"hello": r"Hello", "punc": r"[!]"},
     )
     df = base.convert_to_dataframe(elements)
     for element in elements:
         metadata = element.metadata.to_dict()
         for key in metadata:
-            if not key.startswith("regex_metadata"):
-                assert key in df.columns
-
-    assert "regex_metadata_hello" in df.columns
-    assert "regex_metadata_punc" in df.columns
+            assert key in df.columns
 
 
 def test_default_pandas_dtypes():
@@ -171,7 +165,6 @@ def test_default_pandas_dtypes():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),
@@ -328,7 +321,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
            is_continuation=True,
             detection_class_prob=0.5,
         ),
@@ -372,7 +364,6 @@ def test_convert_to_coco():
             emphasized_text_contents=["emphasized", "text", "contents"],
             emphasized_text_tags=["emphasized", "text", "tags"],
             text_as_html="text_as_html",
-            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
             is_continuation=True,
             detection_class_prob=0.5,
         ),
Azure Cognitive Search index schema (`Edm` field types):

@@ -177,10 +177,6 @@
       "name": "text_as_html",
       "type": "Edm.String"
     },
-    {
-      "name": "regex_metadata",
-      "type": "Edm.String"
-    },
     {
       "name": "detection_class_prob",
       "type": "Edm.Double"
@@ -202,4 +198,4 @@
       }
     ]
   }
 }
Ingest test data:

@@ -55,7 +55,6 @@ TEST_DATA_2 = {
         },
         "last_modified": "2021-01-03T00:00:00",
         "page_number": 10,
-        "regex_metadata": {"pattern": "abc"},
     },
     "embeddings": [0.1, 0.2, 0.3],
 }
@@ -135,7 +134,6 @@ def test_conform_dict_2():
         "links": '{"link1": "https://example.com", "link2": "https://example.org"}',
         "last_modified": datetime.datetime(2021, 1, 3, 0, 0),
         "page_number": "10",
-        "regex_metadata": '{"pattern": "abc"}',
         "date_created": datetime.datetime(2021, 1, 1, 0, 0),
         "date_modified": datetime.datetime(2021, 1, 2, 0, 0),
         "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8),
Chunking internals (`TextPreChunk`):

@@ -15,7 +15,6 @@ from unstructured.documents.elements import (
     ConsolidationStrategy,
     Element,
     ElementMetadata,
-    RegexMetadata,
     Table,
     TableChunk,
     Title,
@@ -739,43 +738,6 @@ class TextPreChunk:
         continuation_metadata.is_continuation = True
         return continuation_metadata
 
-    @lazyproperty
-    def _consolidated_regex_meta(self) -> dict[str, list[RegexMetadata]]:
-        """Consolidate the regex-metadata in `regex_metadata_dicts` into a single dict.
-
-        This consolidated value is suitable for use in the chunk metadata. `start` and `end`
-        offsets of each regex match are also adjusted for their new positions.
-        """
-        chunk_regex_metadata: dict[str, list[RegexMetadata]] = {}
-        separator_len = len(self._opts.text_separator)
-        running_text_len = len(self._overlap_prefix) if self._overlap_prefix else 0
-        start_offset = running_text_len
-
-        for element in self._elements:
-            text_len = len(element.text)
-            # -- skip empty elements like `PageBreak("")` --
-            if not text_len:
-                continue
-            # -- account for blank line between "squashed" elements, but not at start of text --
-            running_text_len += separator_len if running_text_len else 0
-            start_offset = running_text_len
-            running_text_len += text_len
-
-            if not element.metadata.regex_metadata:
-                continue
-
-            # -- consolidate any `regex_metadata` matches, adjusting the match start/end offsets --
-            element_regex_metadata = copy.deepcopy(element.metadata.regex_metadata)
-            for regex_name, matches in element_regex_metadata.items():
-                for m in matches:
-                    m["start"] += start_offset
-                    m["end"] += start_offset
-                chunk_matches = chunk_regex_metadata.get(regex_name, [])
-                chunk_matches.extend(matches)
-                chunk_regex_metadata[regex_name] = chunk_matches
-
-        return chunk_regex_metadata
-
     def _iter_text_segments(self) -> Iterator[str]:
         """Generate overlap text and each element text segment in order.
 
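The offset arithmetic in the removed `_consolidated_regex_meta` can be sanity-checked against the deleted `DescribeTextPreChunk` test above. A small sketch using those exact values; the `"\n\n"` separator is an assumption read off the concatenated chunk text in the tests:

```python
# Values from the deleted test: overlap_prefix="ficitur." and the three Lorem
# elements, joined by the separator before each match's offsets are re-based.
overlap_prefix_len = len("ficitur.")  # 8
separator_len = len("\n\n")           # 2 (assumed separator)

start_of_title = overlap_prefix_len + separator_len  # 10
assert start_of_title + 6 == 16    # "Ipsum" match shifts from 6 to 16

start_of_second = start_of_title + len("Lorem Ipsum") + separator_len  # 23
assert start_of_second + 6 == 29   # "ipsum" shifts from 6 to 29
assert start_of_second + 12 == 35  # "dolor" shifts from 12 to 35

start_of_third = (
    start_of_second
    + len("Lorem ipsum dolor sit amet consectetur adipiscing elit.")
    + separator_len
)  # 80
assert start_of_third + 11 == 91   # "ipsum" shifts from 11 to 91
```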
@@ -812,8 +774,6 @@ class TextPreChunk:
                 # -- Python 3.7+ maintains dict insertion order --
                 ordered_unique_keys = {key: None for val_list in values for key in val_list}
                 yield field_name, list(ordered_unique_keys.keys())
-            elif strategy is CS.REGEX:
-                yield field_name, self._consolidated_regex_meta
             elif strategy is CS.DROP:
                 continue
             else:  # pragma: no cover
Core element model (`unstructured.documents.elements`):

@@ -8,7 +8,6 @@ import functools
 import hashlib
 import os
 import pathlib
-import re
 import uuid
 from itertools import groupby
 from types import MappingProxyType
@@ -127,14 +126,6 @@ class CoordinatesMetadata:
         return cls(points=points, system=system)
 
 
-class RegexMetadata(TypedDict):
-    """Metadata that is extracted from a document element via regex."""
-
-    text: str
-    start: int
-    end: int
-
-
 class Link(TypedDict):
     """Metadata related to extracted links"""
 
@@ -202,8 +193,6 @@ class ElementMetadata:
     # -- page numbers currently supported for DOCX, HTML, PDF, and PPTX documents --
     page_number: Optional[int]
     parent_id: Optional[str]
-    # -- "fields" e.g. status, dept.no, etc. extracted from text via regex --
-    regex_metadata: Optional[dict[str, list[RegexMetadata]]]
 
     # -- e-mail specific metadata fields --
     bcc_recipient: Optional[list[str]]
@@ -254,7 +243,6 @@ class ElementMetadata:
         page_name: Optional[str] = None,
         page_number: Optional[int] = None,
         parent_id: Optional[str] = None,
-        regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None,
         sent_from: Optional[list[str]] = None,
         sent_to: Optional[list[str]] = None,
         signature: Optional[str] = None,
@@ -299,7 +287,6 @@ class ElementMetadata:
         self.page_name = page_name
         self.page_number = page_number
         self.parent_id = parent_id
-        self.regex_metadata = regex_metadata
         self.sent_from = sent_from
         self.sent_to = sent_to
         self.signature = signature
@@ -477,9 +464,6 @@ class ConsolidationStrategy(enum.Enum):
     LIST_UNIQUE = "list_unique"
     """Union list values across elements, preserving order. Only suitable for `List` fields."""
 
-    REGEX = "regex"
-    """Combine regex-metadata of elements, adjust start and stop offsets for concatenated text."""
-
     @classmethod
     def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
         """Mapping from ElementMetadata field-name to its consolidation strategy.
@@ -519,7 +503,6 @@ class ConsolidationStrategy(enum.Enum):
         "page_name": cls.FIRST,
         "page_number": cls.FIRST,
         "parent_id": cls.DROP,
-        "regex_metadata": cls.REGEX,
         "sent_from": cls.FIRST,
         "sent_to": cls.FIRST,
         "signature": cls.FIRST,
@@ -550,7 +533,7 @@ def assign_and_map_hash_ids(elements: list[Element]) -> list[Element]:
     # -- generate sequence number for each element on a page --
     page_numbers = [e.metadata.page_number for e in elements]
     page_seq_pairs = [
-        seq_on_page for page, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
+        seq_on_page for _, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
     ]
 
     # -- assign hash IDs to elements --
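The last hunk above only renames the unused `page` loop variable to `_`; the comprehension still numbers elements within each run of equal page numbers. An illustrative run (the input list here is invented for the example):

```python
from itertools import groupby

# Per-page sequence numbers: each run of equal page numbers restarts at 0.
page_numbers = [1, 1, 1, 2, 2, None]
page_seq_pairs = [
    seq_on_page for _, group in groupby(page_numbers) for seq_on_page, _ in enumerate(group)
]
assert page_seq_pairs == [0, 1, 2, 0, 1, 0]
```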
@@ -575,7 +558,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
     This decorator adds a post-processing step to a document partitioner.
 
     - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
     - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
 
     """
@@ -605,13 +587,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
             elements = func(*args, **kwargs)
             call_args = get_call_args_applying_defaults(func, *args, **kwargs)
 
-            regex_metadata: dict["str", "str"] = call_args.get("regex_metadata", {})
-            # -- don't write an empty `{}` to metadata.regex_metadata when no regex-metadata was
-            # -- requested, otherwise it will serialize (because it's not None) when it has no
-            # -- meaning or is even misleading. Also it complicates tests that don't use regex-meta.
-            if regex_metadata:
-                elements = _add_regex_metadata(elements, regex_metadata)
-
             unique_element_ids: bool = call_args.get("unique_element_ids", False)
             if unique_element_ids is False:
                 elements = assign_and_map_hash_ids(elements)
@@ -623,36 +598,6 @@ def process_metadata() -> Callable[[Callable[_P, list[Element]]], Callable[_P, l
     return decorator
 
 
-def _add_regex_metadata(
-    elements: list[Element],
-    regex_metadata: dict[str, str] = {},
-) -> list[Element]:
-    """Adds metadata based on a user provided regular expression.
-
-    The additional metadata will be added to the regex_metadata attrbuted in the element metadata.
-    """
-    for element in elements:
-        if isinstance(element, Text):
-            _regex_metadata: dict["str", list[RegexMetadata]] = {}
-            for field_name, pattern in regex_metadata.items():
-                results: list[RegexMetadata] = []
-                for result in re.finditer(pattern, element.text):
-                    start, end = result.span()
-                    results.append(
-                        {
-                            "text": element.text[start:end],
-                            "start": start,
-                            "end": end,
-                        },
-                    )
-                if len(results) > 0:
-                    _regex_metadata[field_name] = results
-
-            element.metadata.regex_metadata = _regex_metadata
-
-    return elements
-
-
 class ElementType:
     TITLE = "Title"
     TEXT = "Text"
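The removed `_add_regex_metadata` helper reduces to `re.finditer` plus `Match.span()`. This standalone sketch reproduces the value asserted by the deleted `partition_text` test; the dict-comprehension form is mine and, unlike the helper, it would keep empty match lists:

```python
import re

# What the removed helper computed per Text element: every match of each
# user-supplied pattern, recorded with its span in the element's own text.
text = "SPEAKER 1: It is my turn to speak now!"
patterns = {"speaker": r"SPEAKER \d{1,3}"}

regex_metadata = {
    field_name: [
        {"text": m.group(), "start": m.start(), "end": m.end()}
        for m in re.finditer(pattern, text)
    ]
    for field_name, pattern in patterns.items()
}

assert regex_metadata == {"speaker": [{"text": "SPEAKER 1", "start": 0, "end": 9}]}
```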
@@ -738,9 +683,7 @@ class Element(abc.ABC):
         metadata: Optional[ElementMetadata] = None,
         detection_origin: Optional[str] = None,
     ):
-        if element_id is not None and not isinstance(
-            element_id, str
-        ):  # pyright: ignore[reportUnnecessaryIsInstance]
+        if element_id is not None and not isinstance(element_id, str):  # type: ignore
             raise ValueError("element_id must be of type str or None.")
 
         self._element_id = element_id
@@ -1075,7 +1018,7 @@ TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
 }
 
 
-def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
+def _kvform_rehydrate_internal_elements(kv_pairs: list[dict[str, Any]]) -> list[FormKeyValuePair]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from dicts into Elements explicitly,
@@ -1093,17 +1036,17 @@ def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyVal
         (kv_pair["value"]["custom_element"],) = elements_from_dicts(
             [kv_pair["value"]["custom_element"]]
         )
-    return kv_pairs
+    return cast(list[FormKeyValuePair], kv_pairs)
 
 
-def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
+def _kvform_pairs_to_dict(orig_kv_pairs: list[FormKeyValuePair]) -> list[dict[str, Any]]:
     """
     The key_value_pairs metadata field contains (in the vast majority of cases)
     nested Text elements. Those need to be turned from Elements to dicts recursively,
     e.g. when FormKeysValues.to_dict() is used.
 
     """
-    kv_pairs: list[dict] = copy.deepcopy(kv_pairs)
+    kv_pairs: list[dict[str, Any]] = copy.deepcopy(orig_kv_pairs)  # type: ignore
     for kv_pair in kv_pairs:
         if kv_pair["key"]["custom_element"] is not None:
             kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict()
File-type detection:

@@ -495,7 +495,7 @@ class _OleFileDifferentiator:
     def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
         """True when file has CFBF magic first 8 bytes."""
         with ctx.open() as file:
-            return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
+            return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
 
     @staticmethod
     def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
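The `_is_ole_file` change above is cosmetic only; hex-escape case in a bytes literal does not affect the value:

```python
# Hex-escape case is cosmetic: both literals spell the same 8 CFBF "magic"
# bytes that begin every OLE compound file (legacy DOC/XLS/PPT/MSG).
assert b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
```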
@@ -719,10 +719,11 @@ def add_filetype(
 
     This decorator adds a post-processing step to a document partitioner.
 
-    - Adds `metadata_filename` and `include_metadata` parameters to docstring if not present.
-    - Adds `.metadata.regex-metadata` when `regex_metadata` keyword-argument is provided.
-    - Updates element.id to a UUID when `unique_element_ids` argument is provided and True.
+    - Adds `.metadata.filetype` (source-document MIME-type) metadata value
 
+    This "partial" decorator is present because `partition_image()` does not apply
+    `.metadata.filetype` this way since each image type has its own MIME-type (e.g. `image.jpeg`,
+    `image/png`, etc.).
     """
 
     def decorator(func: Callable[_P, list[Element]]) -> Callable[_P, list[Element]]:
Ingest destination connectors:

@@ -107,8 +107,6 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
             data["metadata"]["data_source"]["date_processed"] = parser.parse(
                 date_processed,
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
@@ -159,9 +159,6 @@ class SqlDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
         if data.get("metadata", {}).get("data_source", None):
             data.update(data.get("metadata", {}).pop("data_source", None))
         if data.get("metadata", {}).get("coordinates", None):
@@ -169,9 +169,6 @@ class WeaviateDestinationConnector(BaseDestinationConnector):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
         logger.info(
             f"writing {len(elements_dict)} objects to destination "
Azure Cognitive Search stager and uploader:

@@ -111,8 +111,6 @@ class AzureCognitiveSearchUploadStager(UploadStager):
                 date_processed
             ).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = json.dumps(regex_metadata)
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
         return data
@@ -179,7 +177,6 @@ class AzureCognitiveSearchUploader(Uploader):
         return self.write_dict(elements_dict=elements_dict)
 
     def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None:
-
         elements_dict = []
         for content in contents:
             with open(content.path) as elements_file:
SQL and Weaviate upload stagers:

@@ -103,7 +103,6 @@ _COLUMNS = (
     "emphasized_text_contents",
     "emphasized_text_tags",
     "text_as_html",
-    "regex_metadata",
     "detection_class_prob",
 )
 
@@ -165,10 +164,7 @@ class SQLUploadStager(UploadStager):
             df[column] = df[column].apply(
                 lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None
             )
-        for column in filter(
-            lambda x: x in df.columns,
-            ("version", "page_number", "regex_metadata"),
-        ):
+        for column in filter(lambda x: x in df.columns, ("version", "page_number")):
             df[column] = df[column].apply(str)
 
         with output_path.open("w") as output_file:
@@ -126,9 +126,6 @@ class WeaviateUploadStager(UploadStager):
         if page_number := data.get("metadata", {}).get("page_number"):
             data["metadata"]["page_number"] = str(page_number)
 
-        if regex_metadata := data.get("metadata", {}).get("regex_metadata"):
-            data["metadata"]["regex_metadata"] = str(json.dumps(regex_metadata))
-
     def run(
         self,
         elements_filepath: Path,
Staging base helpers:

@@ -272,13 +272,7 @@ def flatten_dict(
 
 
 def _get_table_fieldnames(rows: list[dict[str, Any]]):
-    table_fieldnames = list(TABLE_FIELDNAMES)
-    for row in rows:
-        metadata = row["metadata"]
-        for key in flatten_dict(metadata):
-            if key.startswith("regex_metadata") and key not in table_fieldnames:
-                table_fieldnames.append(key)
-    return table_fieldnames
+    return list(TABLE_FIELDNAMES)
 
 
 def convert_to_csv(elements: Iterable[Element]) -> str:
@@ -337,7 +331,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
         "emphasized_text_contents": object,  # Optional[list[str]]
         "emphasized_text_tags": object,  # Optional[list[str]]
         "text_as_html": pd.StringDtype(),  # Optional[str]  # type: ignore
-        "regex_metadata": object,
         "max_characters": "Int64",  # Optional[int]
         "is_continuation": "boolean",  # Optional[bool]
         "detection_class_prob": float,  # Optional[float],
@@ -354,7 +347,6 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
         "data_source_date_processed": pd.StringDtype(),  # Optional[str]  # type: ignore
         "data_source_permissions_data": object,
         "embeddings": object,
-        "regex_metadata_key": object,
     }
 
Metadata exclusion list:

@@ -16,7 +16,6 @@ exclude_metadata_keys = (
     "is_continuation",
     "links",
     "orig_elements",
-    "regex_metadata",
     "key_value_pairs",
 )
 