mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-30 17:38:13 +00:00 
			
		
		
		
	 05c3cd1be2
			
		
	
	
		05c3cd1be2
		
			
		
	
	
	
	
		
			
			This PR introduces `clean_pdfminer_inner_elements`, which deletes pdfminer elements that lie inside elements from other detection origins such as YoloX or detectron. This function returns the cleaned document. Also, the ingest-test fixtures were updated to reflect the new standard output. The best way to check that this function is working properly is to check the new test `test_clean_pdfminer_inner_elements` in `test_unstructured/partition/utils/test_processing_elements.py` --------- Co-authored-by: Roman Isecke <roman@unstructured.io> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: rbiseck3 <rbiseck3@users.noreply.github.com> Co-authored-by: Roman Isecke <136338424+rbiseck3@users.noreply.github.com>
		
			
				
	
	
		
			84 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			84 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| from unstructured_inference.constants import Source
 | |
| from unstructured_inference.inference.elements import Rectangle
 | |
| from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
 | |
| 
 | |
| from unstructured.partition.utils.processing_elements import clean_pdfminer_inner_elements
 | |
| 
 | |
# Fixture: one "Table" element (no explicit source) followed by two elements
# tagged source=Source.PDFMINER. The parametrized test in this module expects
# a single element to remain on the page after cleaning.
deletable_elements_inside_table = [
    # The table itself.
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
    ),
    # PDFMINER-sourced elements.
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=Source.PDFMINER,
    ),
]
 | |
| 
 | |
# Fixture: a "Table" element plus two further elements, every one of them
# tagged source=Source.YOLOX (nothing here has source=Source.PDFMINER).
# The parametrized test in this module expects all three elements to remain.
no_deletable_elements_inside_table = [
    # The table itself.
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table with inner elements",
        type="Table",
        source=Source.YOLOX,
    ),
    # YOLOX-sourced elements.
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="text1",
        source=Source.YOLOX,
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="text2",
        source=Source.YOLOX,
    ),
]
 | |
# Fixture: two YOLOX "Table" elements mixed with source=Source.PDFMINER
# elements labelled as inside and outside the tables, plus one element with
# no explicit source.
# Note: some of the source=Source.PDFMINER elements are not inside any table.
# The parametrized test in this module expects 5 of these 8 elements to remain.
mix_elements_inside_table = [
    # --- Table 1 and the elements labelled as inside it ---
    LayoutElement(
        bbox=Rectangle(0, 0, 100, 100),
        text="Table1 with inner elements",
        type="Table",
        source=Source.YOLOX,
    ),
    # No source is given for this one.
    LayoutElement(
        bbox=Rectangle(50, 50, 70, 70),
        text="Inside table1",
    ),
    LayoutElement(
        bbox=Rectangle(70, 70, 80, 80),
        text="Inside table1",
        source=Source.PDFMINER,
    ),
    # --- PDFMINER elements labelled as outside every table ---
    LayoutElement(
        bbox=Rectangle(150, 150, 170, 170),
        text="Outside tables",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(180, 180, 200, 200),
        text="Outside tables",
        source=Source.PDFMINER,
    ),
    # --- Table 2 and the elements labelled as inside it ---
    LayoutElement(
        bbox=Rectangle(0, 500, 100, 700),
        text="Table2 with inner elements",
        type="Table",
        source=Source.YOLOX,
    ),
    # NOTE(review): in the next two boxes the fourth value (300 / 400) is
    # smaller than the second (510 / 550). If Rectangle takes
    # (x1, y1, x2, y2), these boxes are inverted -- confirm this is intended.
    LayoutElement(
        bbox=Rectangle(0, 510, 50, 300),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
    LayoutElement(
        bbox=Rectangle(0, 550, 70, 400),
        text="Inside table2",
        source=Source.PDFMINER,
    ),
]
 | |
| 
 | |
| 
 | |
@pytest.mark.parametrize(
    ("elements", "length_extra_info", "expected_document_length"),
    [
        (deletable_elements_inside_table, 1, 1),
        (no_deletable_elements_inside_table, 0, 3),
        (mix_elements_inside_table, 2, 5),
    ],
)
def test_clean_pdfminer_inner_elements(elements, length_extra_info, expected_document_length):
    """Check how many elements are left on the page after cleaning.

    Args:
        elements: Fixture list of layout elements placed on a single page.
        length_extra_info: Supplied by the parametrization but never read in
            this test body.
        expected_document_length: Number of elements the first page should
            hold once ``clean_pdfminer_inner_elements`` has run.
    """
    # Build a one-page document; the same list is passed as the layout and
    # also assigned to the page's ``elements`` attribute.
    single_page = PageLayout(number=1, image=None, layout=elements)
    single_page.elements = elements

    # Run the function under test on that document.
    cleaned_doc = clean_pdfminer_inner_elements(DocumentLayout(pages=[single_page]))

    # Only the count of elements remaining on the first page is verified.
    remaining_elements = cleaned_doc.pages[0].elements
    assert len(remaining_elements) == expected_document_length
 |