mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-30 20:39:54 +00:00
Fix: partition_html() swallows some paragraphs (#2837)
Closes #2836. When determining whether an HTML tag element (an `etree` element) contains text, `partition_html()` only considers elements of limited depth, so that a single element does not end up as the text representation of a giant div. This PR increases that depth limit (the default `max_predecessor_len` goes from 5 to 15).
This commit is contained in:
parent
8a239b346c
commit
e49c35933d
@ -1,9 +1,13 @@
|
||||
## 0.13.1-dev0
|
||||
## 0.13.1-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fix `partition_html()` swallowing some paragraphs**. `partition_html()` only treats elements of limited depth as text, so that a single element does not end up as the text representation of a giant div. This fix increases that depth limit.
|
||||
|
||||
## 0.13.0
|
||||
|
||||
### Enhancements
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.13.1-dev0" # pragma: no cover
|
||||
__version__ = "0.13.1-dev1" # pragma: no cover
|
||||
|
@ -5,6 +5,8 @@ from __future__ import annotations
|
||||
import sys
|
||||
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast
|
||||
|
||||
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
else:
|
||||
@ -568,7 +570,10 @@ def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]:
|
||||
return unfurled
|
||||
|
||||
|
||||
def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool:
|
||||
def _is_text_tag(
|
||||
tag_elem: etree._Element,
|
||||
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
|
||||
) -> bool:
|
||||
"""True when `tag_element` potentially contains narrative text."""
|
||||
# NOTE(robinson) - Only consider elements with limited depth. Otherwise,
|
||||
# it could be the text representation of a giant div
|
||||
@ -594,7 +599,7 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool
|
||||
|
||||
def _process_list_item(
|
||||
tag_elem: etree._Element,
|
||||
max_predecessor_len: int = 5,
|
||||
max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
|
||||
) -> Tuple[Optional[Element], Optional[etree._Element]]:
|
||||
"""Produces an `HTMLListItem` document element from `tag_elem`.
|
||||
|
||||
|
@ -202,3 +202,5 @@ TESSERACT_MAX_SIZE = 2147483647
|
||||
|
||||
# default image colors
|
||||
IMAGE_COLOR_DEPTH = 32
|
||||
|
||||
HTML_MAX_PREDECESSOR_LEN = 15
|
||||
|
Loading…
x
Reference in New Issue
Block a user