mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-25 15:03:54 +00:00 
			
		
		
		
	 87a88a3c87
			
		
	
	
		87a88a3c87
		
			
		
	
	
	
	
		
			
			This PR implements splitting of `pdfminer` elements (`groups of text chunks`) into smaller bounding boxes (`text lines`). This implementation prevents loss of information from the object detection model and facilitates more effective removal of duplicated `pdfminer` text. This PR also addresses #3430. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
		
			
				
	
	
		
			29 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			29 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from unittest.mock import MagicMock
 | |
| 
 | |
| from pdfminer.layout import LTContainer, LTTextLine
 | |
| 
 | |
| from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects
 | |
| 
 | |
| 
 | |
def test_extract_text_objects_nested_containers():
    """Check that text lines inside nested LTContainers are all returned."""
    # Two stand-in text lines: one sits directly in the outer container, the
    # other is one level deeper, inside the inner container.
    top_level_line = MagicMock(spec=LTTextLine)
    nested_line = MagicMock(spec=LTTextLine)

    # Inner container iterates over the single nested text line.
    inner = MagicMock(spec=LTContainer)
    inner.__iter__.return_value = [nested_line]

    # Outer container iterates over a text line followed by the inner container.
    outer = MagicMock(spec=LTContainer)
    outer.__iter__.return_value = [top_level_line, inner]

    extracted = extract_text_objects(outer)

    # Exactly the two text lines come back, regardless of nesting depth.
    assert len(extracted) == 2
    for expected_line in (top_level_line, nested_line):
        assert expected_line in extracted
 |