| 
									
										
										
										
											2023-11-07 11:54:22 -08:00
										 |  |  | import os | 
					
						
							|  |  |  | import pathlib | 
					
						
							| 
									
										
										
										
											2023-12-20 11:50:12 -06:00
										 |  |  | import shutil | 
					
						
							| 
									
										
										
										
											2023-11-07 11:54:22 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-22 16:30:16 -05:00
										 |  |  | import pandas as pd | 
					
						
							| 
									
										
										
										
											2023-11-07 11:54:22 -08:00
										 |  |  | import pytest | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from unstructured.metrics.evaluate import ( | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  |     measure_element_type_accuracy, | 
					
						
							|  |  |  |     measure_text_extraction_accuracy, | 
					
						
							| 
									
										
										
										
											2023-11-07 11:54:22 -08:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | is_in_docker = os.path.exists("/.dockerenv") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | EXAMPLE_DOCS_DIRECTORY = os.path.join( | 
					
						
							|  |  |  |     pathlib.Path(__file__).parent.resolve(), "..", "..", "example-docs" | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output" | 
					
						
							|  |  |  | GOLD_CCT_DIRNAME = "gold_standard_cct" | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  | GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type" | 
					
						
							| 
									
										
										
										
											2024-01-05 18:34:53 -05:00
										 |  |  | UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct" | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-20 11:50:12 -06:00
										 |  |  | @pytest.fixture() | 
					
						
							|  |  |  | def _cleanup_after_test(): | 
					
						
							|  |  |  |     # This is where the test runs | 
					
						
							|  |  |  |     yield | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME) | 
					
						
							|  |  |  |     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Cleanup the directory and file | 
					
						
							|  |  |  |     if os.path.exists(export_dir): | 
					
						
							|  |  |  |         shutil.rmtree(export_dir) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							| 
									
										
										
										
											2023-12-20 11:50:12 -06:00
										 |  |  | @pytest.mark.usefixtures("_cleanup_after_test") | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  | def test_text_extraction_evaluation(): | 
					
						
							|  |  |  |     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME) | 
					
						
							|  |  |  |     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME) | 
					
						
							|  |  |  |     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct") | 
					
						
							|  |  |  |     measure_text_extraction_accuracy( | 
					
						
							|  |  |  |         output_dir=output_dir, source_dir=source_dir, export_dir=export_dir | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv")) | 
					
						
							|  |  |  |     df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t") | 
					
						
							|  |  |  |     assert len(df) == 3 | 
					
						
							|  |  |  |     assert len(df.columns) == 5 | 
					
						
							|  |  |  |     assert df.iloc[0].filename == "Bank Good Credit Loan.pptx" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-01-05 18:34:53 -05:00
										 |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							|  |  |  | @pytest.mark.usefixtures("_cleanup_after_test") | 
					
						
							|  |  |  | def test_text_extraction_evaluation_type_txt(): | 
					
						
							|  |  |  |     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_CCT_DIRNAME) | 
					
						
							|  |  |  |     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME) | 
					
						
							|  |  |  |     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct_txt") | 
					
						
							|  |  |  |     measure_text_extraction_accuracy( | 
					
						
							|  |  |  |         output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, output_type="txt" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv")) | 
					
						
							|  |  |  |     df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t") | 
					
						
							|  |  |  |     assert len(df) == 3 | 
					
						
							|  |  |  |     assert len(df.columns) == 5 | 
					
						
							|  |  |  |     assert df.iloc[0].filename == "Bank Good Credit Loan.pptx" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							| 
									
										
										
										
											2023-12-20 11:50:12 -06:00
										 |  |  | @pytest.mark.usefixtures("_cleanup_after_test") | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  | def test_element_type_evaluation(): | 
					
						
							|  |  |  |     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME) | 
					
						
							|  |  |  |     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME) | 
					
						
							|  |  |  |     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct") | 
					
						
							|  |  |  |     measure_element_type_accuracy( | 
					
						
							|  |  |  |         output_dir=output_dir, source_dir=source_dir, export_dir=export_dir | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv")) | 
					
						
							|  |  |  |     df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t") | 
					
						
							|  |  |  |     assert len(df) == 1 | 
					
						
							|  |  |  |     assert len(df.columns) == 4 | 
					
						
							|  |  |  |     assert df.iloc[0].filename == "IRS-form-1987.pdf" | 
					
						
							| 
									
										
										
										
											2023-11-07 11:54:22 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							| 
									
										
										
										
											2023-12-20 11:50:12 -06:00
										 |  |  | @pytest.mark.usefixtures("_cleanup_after_test") | 
					
						
							| 
									
										
										
										
											2023-11-07 11:54:22 -08:00
										 |  |  | def test_text_extraction_takes_list(): | 
					
						
							|  |  |  |     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME) | 
					
						
							|  |  |  |     output_list = ["currency.csv.json"] | 
					
						
							|  |  |  |     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME) | 
					
						
							|  |  |  |     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct") | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  |     measure_text_extraction_accuracy( | 
					
						
							| 
									
										
										
										
											2023-11-07 11:54:22 -08:00
										 |  |  |         output_dir=output_dir, | 
					
						
							|  |  |  |         source_dir=source_dir, | 
					
						
							|  |  |  |         output_list=output_list, | 
					
						
							|  |  |  |         export_dir=export_dir, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     # check that only the listed files are included | 
					
						
							| 
									
										
										
										
											2024-01-05 18:34:53 -05:00
										 |  |  |     assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv")) | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  |     df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t") | 
					
						
							|  |  |  |     assert len(df) == len(output_list) | 
					
						
							| 
									
										
										
										
											2023-11-22 16:30:16 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							| 
									
										
										
										
											2023-12-20 11:50:12 -06:00
										 |  |  | @pytest.mark.usefixtures("_cleanup_after_test") | 
					
						
							| 
									
										
										
										
											2023-11-22 16:30:16 -05:00
										 |  |  | def test_text_extraction_grouping(): | 
					
						
							|  |  |  |     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME) | 
					
						
							|  |  |  |     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME) | 
					
						
							|  |  |  |     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct") | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  |     measure_text_extraction_accuracy( | 
					
						
							| 
									
										
										
										
											2023-11-22 16:30:16 -05:00
										 |  |  |         output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype" | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t") | 
					
						
							| 
									
										
										
										
											2023-11-28 20:05:55 -05:00
										 |  |  |     assert len(df) == 4  # metrics row and doctype rows | 
					
						
							| 
									
										
										
										
											2024-01-05 18:34:53 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") | 
					
						
							|  |  |  | def test_text_extraction_wrong_type(): | 
					
						
							|  |  |  |     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME) | 
					
						
							|  |  |  |     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME) | 
					
						
							|  |  |  |     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct") | 
					
						
							|  |  |  |     with pytest.raises(ValueError): | 
					
						
							|  |  |  |         measure_text_extraction_accuracy( | 
					
						
							|  |  |  |             output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, output_type="wrong" | 
					
						
							|  |  |  |         ) |