Fix: partition_html() swallows some paragraphs (#2837)

Closes #2836.

`partition_html()` only considers elements of limited depth when
determining whether an HTML tag (`etree`) element contains text, so that
a text element does not become the text representation of a giant div.
This PR increases the depth limit from 5 to 15.
This commit is contained in:
Christine Straub 2024-04-02 22:06:37 -07:00 committed by GitHub
parent 8a239b346c
commit e49c35933d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 15 additions and 4 deletions

View File

@ -1,9 +1,13 @@
## 0.13.1-dev0
## 0.13.1-dev1
### Enhancements
### Features
### Fixes
* **Fix `partition_html()` swallowing some paragraphs**. `partition_html()` only considers elements of limited depth, so that a text element does not become the text representation of a giant div. This fix increases the depth limit from 5 to 15.
## 0.13.0
### Enhancements

View File

@ -1 +1 @@
__version__ = "0.13.1-dev0" # pragma: no cover
__version__ = "0.13.1-dev1" # pragma: no cover

View File

@ -5,6 +5,8 @@ from __future__ import annotations
import sys
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
if sys.version_info < (3, 8):
from typing_extensions import Final
else:
@ -568,7 +570,10 @@ def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]:
return unfurled
def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool:
def _is_text_tag(
tag_elem: etree._Element,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
) -> bool:
"""True when `tag_element` potentially contains narrative text."""
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
# it could be the text representation of a giant div
@ -594,7 +599,7 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool
def _process_list_item(
tag_elem: etree._Element,
max_predecessor_len: int = 5,
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
) -> Tuple[Optional[Element], Optional[etree._Element]]:
"""Produces an `HTMLListItem` document element from `tag_elem`.

View File

@ -202,3 +202,5 @@ TESSERACT_MAX_SIZE = 2147483647
# default image colors
IMAGE_COLOR_DEPTH = 32
HTML_MAX_PREDECESSOR_LEN = 15