fix: chunking fails with detection_class_prob in metadata (#1637)

This commit is contained in:
ryannikolaidis 2023-10-04 15:14:21 -07:00 committed by GitHub
parent 0a65fc2134
commit 9960ce5f00
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 51 additions and 3 deletions

View File

@ -1,4 +1,4 @@
## 0.10.19-dev10
## 0.10.19-dev11
### Enhancements
@ -20,6 +20,7 @@
Problem: Under certain circumstances, text immediately after some HTML tags will be missing from the partition result.
Fix: Updated code to deal with these cases.
Importance: This will ensure the correctness when partitioning HTML and Markdown documents.
* **Fixes chunking when `detection_class_prob` appears in Element metadata** Problem: when `detection_class_prob` appears in Element metadata, Elements will only be combined by `chunk_by_title` if they have the same `detection_class_prob` value (which is rare). This is unlikely to be a case we ever need to support, and it most often results in no chunking. Fix: `detection_class_prob` is now included in the list of metadata keys that chunking excludes from similarity comparison. Importance: This change allows `chunk_by_title` to operate as intended for documents which include `detection_class_prob` metadata in their Elements.
## 0.10.18

View File

@ -288,6 +288,46 @@ def test_add_chunking_strategy_raises_error_for_invalid_n_chars(
)
def test_chunk_by_title_drops_detection_class_prob():
    """Elements whose metadata differs only in `detection_class_prob` are still combined.

    Every element below carries a different `detection_class_prob`, and that is
    the only metadata field set, so any difference between neighbouring elements
    comes from that field alone. The test expects the two title-led sections to
    come back as the first two chunks.
    """
    # (element class, text, detection_class_prob) -- one row per input element.
    element_specs = [
        (Title, "A Great Day", 0.5),
        (Text, "Today is a great day.", 0.62),
        (Text, "It is sunny outside.", 0.73),
        (Title, "An Okay Day", 0.84),
        (Text, "Today is an okay day.", 0.95),
    ]
    elements = [
        element_cls(text, metadata=ElementMetadata(detection_class_prob=prob))
        for element_cls, text, prob in element_specs
    ]

    chunks = chunk_by_title(elements, combine_text_under_n_chars=0)

    # Chunks are compared through `str()`, i.e. by their text only.
    expected_first = CompositeElement(
        "A Great Day\n\nToday is a great day.\n\nIt is sunny outside.",
    )
    expected_second = CompositeElement("An Okay Day\n\nToday is an okay day.")
    assert str(chunks[0]) == str(expected_first)
    assert str(chunks[1]) == str(expected_second)
def test_chunk_by_title_drops_extra_metadata():
elements = [
Title(

View File

@ -1 +1 @@
__version__ = "0.10.19-dev10" # pragma: no cover
__version__ = "0.10.19-dev11" # pragma: no cover

View File

@ -197,7 +197,14 @@ def _drop_extra_metadata(
metadata_dict: Dict[str, Any],
include_pages: bool = True,
) -> Dict[str, Any]:
keys_to_drop = ["element_id", "type", "coordinates", "parent_id", "category_depth"]
keys_to_drop = [
"element_id",
"type",
"coordinates",
"parent_id",
"category_depth",
"detection_class_prob",
]
if not include_pages and "page_number" in metadata_dict:
keys_to_drop.append("page_number")