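"""Tests for unstructured.metrics.evaluate: text extraction (CCT), element type, and table
structure accuracy measurement, plus get_mean_grouping aggregation."""
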
import os
import pathlib
import shutil

import pandas as pd
import pytest

from unstructured.metrics.evaluate import (
    get_mean_grouping,
    measure_element_type_accuracy,
    measure_table_structure_accuracy,
    measure_text_extraction_accuracy,
)

is_in_docker = os.path.exists("/.dockerenv")

EXAMPLE_DOCS_DIRECTORY = os.path.join(
    pathlib.Path(__file__).parent.resolve(), "..", "..", "example-docs"
)
TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")

UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct"
GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"
GOLD_TABLE_STRUCTURE_DIRNAME = "gold_standard_table_structure"
UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct"
UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME = "unstructured_output_table_structure"
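
# In-memory stand-ins for the per-document metric tables exported by the evaluation helpers;
# they let the get_mean_grouping tests below run without a full evaluation pass.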
DUMMY_DF_CCT = pd.DataFrame(
    {
        "filename": [
            "Bank Good Credit Loan.pptx",
            "Performance-Audit-Discussion.pdf",
            "currency.csv",
        ],
        "doctype": ["pptx", "pdf", "csv"],
        "connector": ["connector1", "connector1", "connector2"],
        "cct-accuracy": [0.812, 0.994, 0.887],
        "cct-%missing": [0.001, 0.002, 0.041],
    }
)

DUMMY_DF_ELEMENT_TYPE = pd.DataFrame(
    {
        "filename": [
            "Bank Good Credit Loan.pptx",
            "Performance-Audit-Discussion.pdf",
            "currency.csv",
        ],
        "doctype": ["pptx", "pdf", "csv"],
        "connector": ["connector1", "connector1", "connector2"],
        "element-type-accuracy": [0.812, 0.994, 0.887],
    }
)
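

# Removes the exported results directory after each test that opts in via this fixture.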
@pytest.fixture()
def _cleanup_after_test():
    # This is where the test runs
    yield

    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    # Cleanup the directory and file
    if os.path.exists(export_dir):
        shutil.rmtree(export_dir)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
    )
    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == 3
    assert len(df.columns) == 5
    assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation_type_txt():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_CCT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, output_type="txt"
    )
    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == 3
    assert len(df.columns) == 5
    assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_element_type_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_element_type_accuracy(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
    )
    assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t")
    assert len(df) == 1
    assert len(df.columns) == 4
    assert df.iloc[0].filename == "IRS-form-1987.pdf"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_table_structure_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_TABLE_STRUCTURE_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_table_structure")
    measure_table_structure_accuracy(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
    )
    assert os.path.isfile(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"))
    assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
    assert len(df) == 1
    assert len(df.columns) == 9
    assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_takes_list():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    output_list = ["currency.csv.json"]
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=output_dir,
        source_dir=source_dir,
        output_list=output_list,
        export_dir=export_dir,
    )
    # check that only the listed files are included
    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == len(output_list)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_with_grouping():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
    )
    df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert len(df) == 4  # metrics row and doctype rows


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_wrong_type():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    with pytest.raises(ValueError):
        measure_text_extraction_accuracy(
            output_dir=output_dir,
            source_dir=source_dir,
            export_dir=export_dir,
            output_type="wrong",
        )
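

# The remaining tests exercise get_mean_grouping, which aggregates per-document metric rows
# (supplied as a DataFrame or an exported TSV path) by doctype or connector.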
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
@pytest.mark.parametrize(("grouping", "count_row"), [("doctype", 3), ("connector", 2)])
def test_get_mean_grouping_df_input(grouping, count_row):
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    get_mean_grouping(
        grouping=grouping,
        data_input=DUMMY_DF_CCT,
        export_dir=export_dir,
        metric_strategy="text_extraction",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, f"all-{grouping}-agg-cct.tsv"), sep="\t")
    assert grouped_df[grouping].dropna().nunique() == count_row


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_tsv_input():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
    )
    filename = os.path.join(export_dir, "all-docs-cct.tsv")
    get_mean_grouping(
        grouping="doctype",
        data_input=filename,
        export_dir=export_dir,
        metric_strategy="text_extraction",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert grouped_df["doctype"].dropna().nunique() == 3


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_invalid_group():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
    )
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    with pytest.raises(ValueError):
        get_mean_grouping(
            grouping="invalid",
            data_input=df,
            export_dir=export_dir,
            metric_strategy="text_extraction",
        )
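

# Edge cases: get_mean_grouping exits when the input has no rows, no grouping column,
# or only null values in the grouping column.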
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_grouping_empty_df():
    empty_df = pd.DataFrame()
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", empty_df, "some_dir", metric_strategy="text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_mean_grouping_missing_grouping_column():
    df_with_no_grouping = pd.DataFrame({"some_column": [1, 2, 3]})
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", df_with_no_grouping, "some_dir", "text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_mean_grouping_all_null_grouping_column():
    df_with_null_grouping = pd.DataFrame({"doctype": [None, None, None]})
    with pytest.raises(SystemExit):
        get_mean_grouping(
            "doctype", df_with_null_grouping, "some_dir", metric_strategy="text_extraction"
        )


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_get_mean_grouping_invalid_metric_strategy():
    with pytest.raises(ValueError):
        get_mean_grouping("doctype", DUMMY_DF_ELEMENT_TYPE, "some_dir", metric_strategy="invalid")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
@pytest.mark.parametrize(("grouping", "count_row"), [("doctype", 3), ("connector", 2)])
def test_get_mean_grouping_element_type(grouping, count_row):
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_element_type")
    get_mean_grouping(
        grouping=grouping,
        data_input=DUMMY_DF_ELEMENT_TYPE,
        export_dir=export_dir,
        metric_strategy="element_type",
    )
    grouped_df = pd.read_csv(
        os.path.join(export_dir, f"all-{grouping}-agg-element-type.tsv"), sep="\t"
    )
    assert grouped_df[grouping].dropna().nunique() == count_row