mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 18:14:51 +00:00 
			
		
		
		
	 0eb461acc2
			
		
	
	
		0eb461acc2
		
			
		
	
	
	
	
		
			
			This PR aims to improve the organization and readability of our example documents used in unit tests, specifically focusing on PDF and image files. ### Summary - Created two new subdirectories in the `example-docs` folder: - `pdf/`: for all PDF example files - `img/`: for all image example files - Moved relevant PDF files from `example-docs/` to `example-docs/pdf/` - Moved relevant image files from `example-docs/` to `example-docs/img/` - Updated file paths in affected unit & ingest tests to reflect the new directory structure ### Testing All unit & ingest tests should be updated and verified to work with the new file structure. ## Notes Other file types (e.g., office documents, HTML files) remain in the root of `example-docs/` for now. ## Next Steps Consider similar reorganization for other file types if this structure proves to be beneficial. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
		
			
				
	
	
		
			131 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			131 lines
		
	
	
		
			4.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| 
 | |
| from test_unstructured.unit_utils import example_doc_path
 | |
| from unstructured.documents.elements import Text
 | |
| from unstructured.partition import pdf, strategies
 | |
| from unstructured.partition.utils.constants import PartitionStrategy
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize(
 | |
|     "strategy",
 | |
|     [
 | |
|         PartitionStrategy.AUTO,
 | |
|         PartitionStrategy.FAST,
 | |
|         PartitionStrategy.OCR_ONLY,
 | |
|         PartitionStrategy.HI_RES,
 | |
|     ],
 | |
| )
 | |
| def test_validate_strategy(strategy):
 | |
|     # Nothing should raise for a valid strategy
 | |
|     strategies.validate_strategy(strategy=strategy)
 | |
| 
 | |
| 
 | |
| def test_validate_strategy_raises_for_fast_strategy():
 | |
|     with pytest.raises(ValueError):
 | |
|         strategies.validate_strategy(strategy=PartitionStrategy.FAST, is_image=True)
 | |
| 
 | |
| 
 | |
| def test_validate_strategy_raises_for_bad_strategy():
 | |
|     with pytest.raises(ValueError):
 | |
|         strategies.validate_strategy("totally_guess_the_text")
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize(
 | |
|     ("filename", "from_file", "expected"),
 | |
|     [
 | |
|         ("layout-parser-paper-fast.pdf", True, True),
 | |
|         ("copy-protected.pdf", True, True),
 | |
|         ("loremipsum-flat.pdf", True, False),
 | |
|         ("layout-parser-paper-fast.pdf", False, True),
 | |
|         ("copy-protected.pdf", False, True),
 | |
|         ("loremipsum-flat.pdf", False, False),
 | |
|     ],
 | |
| )
 | |
| def test_is_pdf_text_extractable(filename, from_file, expected):
 | |
|     filename = example_doc_path(f"pdf/{filename}")
 | |
| 
 | |
|     if from_file:
 | |
|         with open(filename, "rb") as f:
 | |
|             extracted_elements = pdf.extractable_elements(file=f)
 | |
|     else:
 | |
|         extracted_elements = pdf.extractable_elements(filename=filename)
 | |
| 
 | |
|     pdf_text_extractable = any(
 | |
|         isinstance(el, Text) and el.text.strip()
 | |
|         for page_elements in extracted_elements
 | |
|         for el in page_elements
 | |
|     )
 | |
| 
 | |
|     assert pdf_text_extractable is expected
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize(
 | |
|     ("pdf_text_extractable", "infer_table_structure"),
 | |
|     [
 | |
|         (True, True),
 | |
|         (False, True),
 | |
|         (True, False),
 | |
|         (False, False),
 | |
|     ],
 | |
| )
 | |
| def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
 | |
|     strategy = strategies.determine_pdf_or_image_strategy(
 | |
|         strategy=PartitionStrategy.FAST,
 | |
|         pdf_text_extractable=pdf_text_extractable,
 | |
|         infer_table_structure=infer_table_structure,
 | |
|     )
 | |
|     assert strategy == PartitionStrategy.FAST
 | |
| 
 | |
| 
 | |
| @pytest.mark.parametrize(
 | |
|     (
 | |
|         "pdf_text_extractable",
 | |
|         "infer_table_structure",
 | |
|         "extract_images_in_pdf",
 | |
|         "extract_image_block_types",
 | |
|         "expected",
 | |
|     ),
 | |
|     [
 | |
|         (True, True, True, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (True, True, True, [], PartitionStrategy.HI_RES),
 | |
|         (True, True, False, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (True, True, False, [], PartitionStrategy.HI_RES),
 | |
|         (True, False, True, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (True, False, True, [], PartitionStrategy.HI_RES),
 | |
|         (True, False, False, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (True, False, False, [], PartitionStrategy.FAST),
 | |
|         (False, True, True, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (False, True, True, [], PartitionStrategy.HI_RES),
 | |
|         (False, True, False, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (False, True, False, [], PartitionStrategy.HI_RES),
 | |
|         (False, False, True, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (False, False, True, [], PartitionStrategy.HI_RES),
 | |
|         (False, False, False, ["Image"], PartitionStrategy.HI_RES),
 | |
|         (False, False, False, [], PartitionStrategy.OCR_ONLY),
 | |
|     ],
 | |
| )
 | |
| def test_determine_pdf_auto_strategy(
 | |
|     pdf_text_extractable,
 | |
|     infer_table_structure,
 | |
|     extract_images_in_pdf,
 | |
|     extract_image_block_types,
 | |
|     expected,
 | |
| ):
 | |
|     strategy = strategies.determine_pdf_or_image_strategy(
 | |
|         strategy=PartitionStrategy.AUTO,
 | |
|         is_image=False,
 | |
|         pdf_text_extractable=pdf_text_extractable,
 | |
|         infer_table_structure=infer_table_structure,
 | |
|         extract_images_in_pdf=extract_images_in_pdf,
 | |
|         extract_image_block_types=extract_image_block_types,
 | |
|     )
 | |
|     assert strategy == expected
 | |
| 
 | |
| 
 | |
| def test_determine_image_auto_strategy():
 | |
|     strategy = strategies.determine_pdf_or_image_strategy(
 | |
|         strategy=PartitionStrategy.AUTO,
 | |
|         is_image=True,
 | |
|     )
 | |
|     assert strategy == PartitionStrategy.HI_RES
 |