| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | import os | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import pytest | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-15 14:29:58 -08:00
										 |  |  | from unstructured.partition import pdf, strategies | 
					
						
							| 
									
										
										
										
											2023-11-15 21:41:02 -08:00
										 |  |  | from unstructured.partition.utils.constants import PartitionStrategy | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-15 21:41:02 -08:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     "strategy", | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         PartitionStrategy.AUTO, | 
					
						
							|  |  |  |         PartitionStrategy.FAST, | 
					
						
							|  |  |  |         PartitionStrategy.OCR_ONLY, | 
					
						
							|  |  |  |         PartitionStrategy.HI_RES, | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  | def test_validate_strategy(strategy): | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |     # Nothing should raise for a valid strategy | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  |     strategies.validate_strategy(strategy=strategy) | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  | def test_validate_strategy_raises_for_fast_strategy(): | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |     with pytest.raises(ValueError): | 
					
						
							| 
									
										
										
										
											2023-11-15 21:41:02 -08:00
										 |  |  |         strategies.validate_strategy(strategy=PartitionStrategy.FAST, is_image=True) | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_validate_strategy_raises_for_bad_strategy(): | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  |         strategies.validate_strategy("totally_guess_the_text") | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("filename", "from_file", "expected"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("layout-parser-paper-fast.pdf", True, True), | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |         ("copy-protected.pdf", True, True), | 
					
						
							|  |  |  |         ("loremipsum-flat.pdf", True, False), | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |         ("layout-parser-paper-fast.pdf", False, True), | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |         ("copy-protected.pdf", False, True), | 
					
						
							|  |  |  |         ("loremipsum-flat.pdf", False, False), | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_is_pdf_text_extractable(filename, from_file, expected): | 
					
						
							|  |  |  |     filename = os.path.join("example-docs", filename) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if from_file: | 
					
						
							|  |  |  |         with open(filename, "rb") as f: | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |             extractable = pdf.extractable_elements(file=f) | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |         extractable = pdf.extractable_elements(filename=filename) | 
					
						
							| 
									
										
										
										
											2023-05-08 13:21:24 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-07-07 23:41:37 -05:00
										 |  |  |     assert bool(extractable) is expected | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |     ("pdf_text_extractable", "infer_table_structure"), | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  |     [ | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |         (True, True), | 
					
						
							|  |  |  |         (False, True), | 
					
						
							|  |  |  |         (True, False), | 
					
						
							|  |  |  |         (False, False), | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  | def test_determine_pdf_or_image_fast_strategy(pdf_text_extractable, infer_table_structure): | 
					
						
							|  |  |  |     strategy = strategies.determine_pdf_or_image_strategy( | 
					
						
							|  |  |  |         strategy=PartitionStrategy.FAST, | 
					
						
							| 
									
										
										
										
											2023-05-12 13:45:08 -04:00
										 |  |  |         pdf_text_extractable=pdf_text_extractable, | 
					
						
							|  |  |  |         infer_table_structure=infer_table_structure, | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |     assert strategy == PartitionStrategy.FAST | 
					
						
							| 
									
										
										
										
											2023-06-16 10:59:13 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |     ( | 
					
						
							|  |  |  |         "pdf_text_extractable", | 
					
						
							|  |  |  |         "infer_table_structure", | 
					
						
							|  |  |  |         "extract_images_in_pdf", | 
					
						
							| 
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 |  |  |         "extract_image_block_types", | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |         "expected", | 
					
						
							|  |  |  |     ), | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  |     [ | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |         (True, True, True, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (True, True, True, [], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (True, True, False, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (True, True, False, [], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (True, False, True, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (True, False, True, [], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (True, False, False, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (True, False, False, [], PartitionStrategy.FAST), | 
					
						
							|  |  |  |         (False, True, True, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (False, True, True, [], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (False, True, False, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (False, True, False, [], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (False, False, True, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (False, False, True, [], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (False, False, False, ["Image"], PartitionStrategy.HI_RES), | 
					
						
							|  |  |  |         (False, False, False, [], PartitionStrategy.OCR_ONLY), | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  | def test_determine_pdf_auto_strategy( | 
					
						
							|  |  |  |     pdf_text_extractable, | 
					
						
							|  |  |  |     infer_table_structure, | 
					
						
							|  |  |  |     extract_images_in_pdf, | 
					
						
							| 
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 |  |  |     extract_image_block_types, | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |     expected, | 
					
						
							|  |  |  | ): | 
					
						
							| 
									
										
										
										
											2023-06-16 10:59:13 -04:00
										 |  |  |     strategy = strategies.determine_pdf_or_image_strategy( | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |         strategy=PartitionStrategy.AUTO, | 
					
						
							|  |  |  |         is_image=False, | 
					
						
							| 
									
										
										
										
											2023-11-14 10:46:41 -08:00
										 |  |  |         pdf_text_extractable=pdf_text_extractable, | 
					
						
							|  |  |  |         infer_table_structure=infer_table_structure, | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |         extract_images_in_pdf=extract_images_in_pdf, | 
					
						
							| 
									
										
										
										
											2024-01-04 09:52:00 -08:00
										 |  |  |         extract_image_block_types=extract_image_block_types, | 
					
						
							| 
									
										
										
										
											2023-06-16 10:59:13 -04:00
										 |  |  |     ) | 
					
						
							| 
									
										
										
										
											2023-12-28 14:25:30 -08:00
										 |  |  |     assert strategy == expected | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_determine_image_auto_strategy(): | 
					
						
							|  |  |  |     strategy = strategies.determine_pdf_or_image_strategy( | 
					
						
							|  |  |  |         strategy=PartitionStrategy.AUTO, | 
					
						
							|  |  |  |         is_image=True, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert strategy == PartitionStrategy.HI_RES |