mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 13:44:05 +00:00
fix: chunking fails with detection_class_prob in metadata (#1637)
This commit is contained in:
parent
0a65fc2134
commit
9960ce5f00
@ -1,4 +1,4 @@
|
||||
## 0.10.19-dev10
|
||||
## 0.10.19-dev11
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
Problem: Under certain circumstances, text immediately after some HTML tags will be missing from the partition result.
|
||||
Fix: Updated code to deal with these cases.
|
||||
Importance: This ensures correctness when partitioning HTML and Markdown documents.
|
||||
* **Fixes chunking when `detection_class_prob` appears in Element metadata** Problem: when `detection_class_prob` appears in Element metadata, Elements will only be combined by `chunk_by_title` if they have the same `detection_class_prob` value (which is rare). This is unlikely to be a case we ever need to support, and it most often results in no chunking. Fix: `detection_class_prob` is added to the list of metadata keys that chunking excludes from the similarity comparison. Importance: This change allows `chunk_by_title` to operate as intended for documents which include `detection_class_prob` metadata in their Elements.
|
||||
|
||||
## 0.10.18
|
||||
|
||||
|
||||
@ -288,6 +288,46 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
|
||||
)
|
||||
|
||||
|
||||
def test_chunk_by_title_drops_detection_class_prob():
|
||||
elements = [
|
||||
Title(
|
||||
"A Great Day",
|
||||
metadata=ElementMetadata(
|
||||
detection_class_prob=0.5,
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"Today is a great day.",
|
||||
metadata=ElementMetadata(
|
||||
detection_class_prob=0.62,
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"It is sunny outside.",
|
||||
metadata=ElementMetadata(
|
||||
detection_class_prob=0.73,
|
||||
),
|
||||
),
|
||||
Title(
|
||||
"An Okay Day",
|
||||
metadata=ElementMetadata(
|
||||
detection_class_prob=0.84,
|
||||
),
|
||||
),
|
||||
Text(
|
||||
"Today is an okay day.",
|
||||
metadata=ElementMetadata(
|
||||
detection_class_prob=0.95,
|
||||
),
|
||||
),
|
||||
]
|
||||
chunks = chunk_by_title(elements, combine_text_under_n_chars=0)
|
||||
assert str(chunks[0]) == str(
|
||||
CompositeElement("A Great Day\n\nToday is a great day.\n\nIt is sunny outside."),
|
||||
)
|
||||
assert str(chunks[1]) == str(CompositeElement("An Okay Day\n\nToday is an okay day."))
|
||||
|
||||
|
||||
def test_chunk_by_title_drops_extra_metadata():
|
||||
elements = [
|
||||
Title(
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.19-dev10" # pragma: no cover
|
||||
__version__ = "0.10.19-dev11" # pragma: no cover
|
||||
|
||||
@ -197,7 +197,14 @@ def _drop_extra_metadata(
|
||||
metadata_dict: Dict[str, Any],
|
||||
include_pages: bool = True,
|
||||
) -> Dict[str, Any]:
|
||||
keys_to_drop = ["element_id", "type", "coordinates", "parent_id", "category_depth"]
|
||||
keys_to_drop = [
|
||||
"element_id",
|
||||
"type",
|
||||
"coordinates",
|
||||
"parent_id",
|
||||
"category_depth",
|
||||
"detection_class_prob",
|
||||
]
|
||||
if not include_pages and "page_number" in metadata_dict:
|
||||
keys_to_drop.append("page_number")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user