mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 01:54:25 +00:00 
			
		
		
		
	 a11ad22609
			
		
	
	
		a11ad22609
		
			
		
	
	
	
	
		
			
			This PR bumps `unstructured-inference` to `0.8.0`, which introduces vectorized data structures for layout elements and text regions. This PR also cleans up a few places in CI that have repeated definitions of env variables or are missing installation of testing dependencies in the cache. A few document ingest results are changed: - two places for `biomed-api` (actually processed locally on the runner) are due to very small changes in the numerical results of the bounding box areas: one results in a duplicated page number/header and the other results in the deduplication of a word in a sentence that starts on a new line (yes, the two cases go in opposite directions). - the layout parser paper now outputs the code lines with the page number inside the code box as list items --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: badGarnet <badGarnet@users.noreply.github.com> Co-authored-by: christinestraub <christinemstraub@gmail.com>
		
			
				
	
	
		
			38 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			38 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from unstructured_inference.inference.elements import TextRegion, TextRegions
 | |
| from unstructured_inference.inference.layoutelement import LayoutElement
 | |
| 
 | |
| from unstructured.documents.elements import ElementType
 | |
| from unstructured.partition.pdf_image.inference_utils import (
 | |
|     build_layout_elements_from_ocr_regions,
 | |
|     merge_text_regions,
 | |
| )
 | |
| 
 | |
| 
 | |
def test_merge_text_regions(mock_embedded_text_regions):
    """Check that merging the fixture's text regions yields the expected region.

    The fixture regions are wrapped with ``TextRegions.from_list`` before being
    passed to ``merge_text_regions``; the result must compare equal to a single
    ``TextRegion`` built from the coordinates and text below.
    """
    bounding_box = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    expected = TextRegion.from_coords(
        **bounding_box,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
    )

    regions = TextRegions.from_list(mock_embedded_text_regions)
    merged = merge_text_regions(regions)

    assert merged == expected
 | |
| 
 | |
| 
 | |
def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
    """Check the layout elements built from the fixture's OCR regions.

    ``build_layout_elements_from_ocr_regions`` is called directly on the
    fixture; the result must compare equal to a one-item list holding a
    ``LayoutElement`` of type ``ElementType.UNCATEGORIZED_TEXT`` with the
    coordinates and text below.
    """
    bounding_box = {
        "x1": 437.83888888888885,
        "y1": 317.319341111111,
        "x2": 1256.334784222222,
        "y2": 406.9837855555556,
    }
    expected_element = LayoutElement.from_coords(
        **bounding_box,
        text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
        type=ElementType.UNCATEGORIZED_TEXT,
    )
    expected = [expected_element]

    built = build_layout_elements_from_ocr_regions(mock_embedded_text_regions)

    assert built == expected
 |