diff --git a/CHANGELOG.md b/CHANGELOG.md index d9f4ec836..bea4e67c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,13 @@ -## 0.13.1-dev0 +## 0.13.1-dev1 ### Enhancements + ### Features + ### Fixes +* **Fix `partition_html()` swallowing some paragraphs**. `partition_html()` only considers elements with limited depth, so that an element is not the text representation of a giant div. This fix raises the depth limit from 5 to 15. + ## 0.13.0 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index fc7823e13..2f456b1a9 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.1-dev0" # pragma: no cover +__version__ = "0.13.1-dev1" # pragma: no cover diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 2da3c6328..c595f28c1 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -5,6 +5,8 @@ from __future__ import annotations import sys from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast +from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN + if sys.version_info < (3, 8): from typing_extensions import Final else: @@ -568,7 +570,10 @@ def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]: return unfurled -def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool: +def _is_text_tag( + tag_elem: etree._Element, + max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN, +) -> bool: """True when `tag_element` potentially contains narrative text.""" # NOTE(robinson) - Only consider elements with limited depth. 
Otherwise, # it could be the text representation of a giant div @@ -594,7 +599,7 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool def _process_list_item( tag_elem: etree._Element, - max_predecessor_len: int = 5, + max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN, ) -> Tuple[Optional[Element], Optional[etree._Element]]: """Produces an `HTMLListItem` document element from `tag_elem`. diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index 7258e7eca..6645dce07 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -202,3 +202,5 @@ TESSERACT_MAX_SIZE = 2147483647 # default image colors IMAGE_COLOR_DEPTH = 32 + +HTML_MAX_PREDECESSOR_LEN = 15