2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								import pytest
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-18 15:21:32 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from test_unstructured.unit_utils import example_doc_path
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-15 23:02:56 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from unstructured.documents.elements import Text
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-15 14:29:58 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from unstructured.partition import pdf, strategies
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-15 21:41:02 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								from unstructured.partition.utils.constants import PartitionStrategy
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-15 21:41:02 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								@pytest.mark.parametrize(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    "strategy",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        PartitionStrategy.AUTO,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        PartitionStrategy.FAST,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        PartitionStrategy.OCR_ONLY,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        PartitionStrategy.HI_RES,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								)
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def test_validate_strategy(strategy):
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    # Nothing should raise for a valid strategy
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    strategies.validate_strategy(strategy=strategy)
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def test_validate_strategy_raises_for_fast_strategy():
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    with pytest.raises(ValueError):
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-15 21:41:02 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        strategies.validate_strategy(strategy=PartitionStrategy.FAST, is_image=True)
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def test_validate_strategy_raises_for_bad_strategy():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    with pytest.raises(ValueError):
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        strategies.validate_strategy("totally_guess_the_text")
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								@pytest.mark.parametrize(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ("filename", "from_file", "expected"),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    [
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ("layout-parser-paper-fast.pdf", True, True),
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ("copy-protected.pdf", True, True),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ("loremipsum-flat.pdf", True, False),
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ("layout-parser-paper-fast.pdf", False, True),
							 | 
						
					
						
							
								
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        ("copy-protected.pdf", False, True),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        ("loremipsum-flat.pdf", False, False),
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								)
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def test_is_pdf_text_extractable(filename, from_file, expected):
							 | 
						
					
						
							
								
									
										
										
										
											2024-07-18 15:21:32 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    filename = example_doc_path(f"pdf/{filename}")
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    if from_file:
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        with open(filename, "rb") as f:
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-15 23:02:56 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								            extracted_elements = pdf.extractable_elements(file=f)
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    else:
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-15 23:02:56 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        extracted_elements = pdf.extractable_elements(filename=filename)
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 
									 
								 
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2024-05-15 23:02:56 -07:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    pdf_text_extractable = any(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        isinstance(el, Text) and el.text.strip()
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for page_elements in extracted_elements
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        for el in page_elements
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    assert pdf_text_extractable is expected
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								@pytest.mark.parametrize(
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ("pdf_text_extractable", "infer_table_structure"),
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        (True, True),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, True),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, False),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, False),
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								)
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure):
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    strategy = strategies.determine_pdf_or_image_strategy(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        strategy=PartitionStrategy.FAST,
							 | 
						
					
						
							
								
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pdf_text_extractable=pdf_text_extractable,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        infer_table_structure=infer_table_structure,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    )
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    assert strategy == PartitionStrategy.FAST
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-16 10:59:13 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								@pytest.mark.parametrize(
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    (
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "pdf_text_extractable",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "infer_table_structure",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        "extract_images_in_pdf",
							 | 
						
					
						
							
								
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "extract_image_block_types",
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        "expected",
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    ),
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    [
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        (True, True, True, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, True, True, [], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, True, False, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, True, False, [], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, False, True, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, False, True, [], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, False, False, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (True, False, False, [], PartitionStrategy.FAST),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, True, True, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, True, True, [], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, True, False, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, True, False, [], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, False, True, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, False, True, [], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, False, False, ["Image"], PartitionStrategy.HI_RES),
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        (False, False, False, [], PartitionStrategy.OCR_ONLY),
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    ],
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								)
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								def test_determine_pdf_auto_strategy(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    pdf_text_extractable,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    infer_table_structure,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    extract_images_in_pdf,
							 | 
						
					
						
							
								
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    extract_image_block_types,
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    expected,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								):
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-16 10:59:13 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    strategy = strategies.determine_pdf_or_image_strategy(
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        strategy=PartitionStrategy.AUTO,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        is_image=False,
							 | 
						
					
						
							
								
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        pdf_text_extractable=pdf_text_extractable,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        infer_table_structure=infer_table_structure,
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        extract_images_in_pdf=extract_images_in_pdf,
							 | 
						
					
						
							
								
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								        extract_image_block_types=extract_image_block_types,
							 | 
						
					
						
							
								
									
										
										
										
											2023-06-16 10:59:13 -04:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    )
							 | 
						
					
						
							
								
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 
									 
								 
							 | 
							
								
									
										
									
								
							 | 
							
								
							 | 
							
							
								    assert strategy == expected
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								def test_determine_image_auto_strategy():
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    strategy = strategies.determine_pdf_or_image_strategy(
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        strategy=PartitionStrategy.AUTO,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								        is_image=True,
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    )
							 | 
						
					
						
							| 
								
							 | 
							
								
							 | 
							
								
							 | 
							
							
								    assert strategy == PartitionStrategy.HI_RES
							 |