Fix: partition_html() swallows some paragraphs (#2837)

Closes #2836.

`partition_html()` only considers elements of limited depth when
determining whether an HTML tag (`etree`) element contains text, so that
an element does not end up being the text representation of a giant div.
This PR raises that depth limit from 5 to 15.
This commit is contained in:
Christine Straub 2024-04-02 22:06:37 -07:00 committed by GitHub
parent 8a239b346c
commit e49c35933d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 15 additions and 4 deletions

View File

@ -1,9 +1,13 @@
## 0.13.1-dev0 ## 0.13.1-dev1
### Enhancements ### Enhancements
### Features ### Features
### Fixes ### Fixes
* **Fix `partition_html()` swallowing some paragraphs**. `partition_html()` only considers elements of limited depth, so that an element does not end up being the text representation of a giant div. This fix raises that depth limit from 5 to 15.
## 0.13.0 ## 0.13.0
### Enhancements ### Enhancements

View File

@ -1 +1 @@
__version__ = "0.13.1-dev0" # pragma: no cover __version__ = "0.13.1-dev1" # pragma: no cover

View File

@ -5,6 +5,8 @@ from __future__ import annotations
import sys import sys
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
if sys.version_info < (3, 8): if sys.version_info < (3, 8):
from typing_extensions import Final from typing_extensions import Final
else: else:
@ -568,7 +570,10 @@ def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]:
return unfurled return unfurled
def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool: def _is_text_tag(
tag_elem: etree._Element,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
) -> bool:
"""True when `tag_element` potentially contains narrative text.""" """True when `tag_element` potentially contains narrative text."""
# NOTE(robinson) - Only consider elements with limited depth. Otherwise, # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
# it could be the text representation of a giant div # it could be the text representation of a giant div
@ -594,7 +599,7 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool
def _process_list_item( def _process_list_item(
tag_elem: etree._Element, tag_elem: etree._Element,
max_predecessor_len: int = 5, max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
) -> Tuple[Optional[Element], Optional[etree._Element]]: ) -> Tuple[Optional[Element], Optional[etree._Element]]:
"""Produces an `HTMLListItem` document element from `tag_elem`. """Produces an `HTMLListItem` document element from `tag_elem`.

View File

@ -202,3 +202,5 @@ TESSERACT_MAX_SIZE = 2147483647
# default image colors # default image colors
IMAGE_COLOR_DEPTH = 32 IMAGE_COLOR_DEPTH = 32
HTML_MAX_PREDECESSOR_LEN = 15