mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	Fix: partition_html() swallows some paragraphs (#2837)
				
					
				
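			Closes #2836. When deciding whether an HTML tag (`etree`) element contains text, `partition_html()` only considers elements with limited depth, so that a single element does not become the text representation of a giant div. The previous limit was too low and caused some paragraphs to be dropped. This PR raises the limit from 5 to 15 and moves it into the `HTML_MAX_PREDECESSOR_LEN` constant.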
This commit is contained in:
		
							parent
							
								
									8a239b346c
								
							
						
					
					
						commit
						e49c35933d
					
				@ -1,9 +1,13 @@
 | 
			
		||||
## 0.13.1-dev0
 | 
			
		||||
## 0.13.1-dev1
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
### Features
 | 
			
		||||
 | 
			
		||||
### Fixes
 | 
			
		||||
 | 
			
		||||
* **Fix `partition_html()` swallowing some paragraphs**. `partition_html()` only considers elements with limited depth, so that a single element does not become the text representation of a giant div. This fix raises that depth limit from 5 to 15 (`HTML_MAX_PREDECESSOR_LEN`).
 | 
			
		||||
 | 
			
		||||
## 0.13.0
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
 | 
			
		||||
@ -1 +1 @@
 | 
			
		||||
__version__ = "0.13.1-dev0"  # pragma: no cover
 | 
			
		||||
__version__ = "0.13.1-dev1"  # pragma: no cover
 | 
			
		||||
 | 
			
		||||
@ -5,6 +5,8 @@ from __future__ import annotations
 | 
			
		||||
import sys
 | 
			
		||||
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, cast
 | 
			
		||||
 | 
			
		||||
from unstructured.partition.utils.constants import HTML_MAX_PREDECESSOR_LEN
 | 
			
		||||
 | 
			
		||||
if sys.version_info < (3, 8):
 | 
			
		||||
    from typing_extensions import Final
 | 
			
		||||
else:
 | 
			
		||||
@ -568,7 +570,10 @@ def _unfurl_break_tags(tag_elem: etree._Element) -> List[etree._Element]:
 | 
			
		||||
    return unfurled
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool:
 | 
			
		||||
def _is_text_tag(
 | 
			
		||||
    tag_elem: etree._Element,
 | 
			
		||||
    max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
 | 
			
		||||
) -> bool:
 | 
			
		||||
    """True when `tag_element` potentially contains narrative text."""
 | 
			
		||||
    # NOTE(robinson) - Only consider elements with limited depth. Otherwise,
 | 
			
		||||
    # it could be the text representation of a giant div
 | 
			
		||||
@ -594,7 +599,7 @@ def _is_text_tag(tag_elem: etree._Element, max_predecessor_len: int = 5) -> bool
 | 
			
		||||
 | 
			
		||||
def _process_list_item(
 | 
			
		||||
    tag_elem: etree._Element,
 | 
			
		||||
    max_predecessor_len: int = 5,
 | 
			
		||||
    max_predecessor_len: int = HTML_MAX_PREDECESSOR_LEN,
 | 
			
		||||
) -> Tuple[Optional[Element], Optional[etree._Element]]:
 | 
			
		||||
    """Produces an `HTMLListItem` document element from `tag_elem`.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -202,3 +202,5 @@ TESSERACT_MAX_SIZE = 2147483647
 | 
			
		||||
 | 
			
		||||
# default image colors
 | 
			
		||||
IMAGE_COLOR_DEPTH = 32
 | 
			
		||||
 | 
			
		||||
HTML_MAX_PREDECESSOR_LEN = 15
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user