Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-06-27 02:30:08 +00:00)

- Per [ticket](https://unstructured-ai.atlassian.net/browse/ML-551), there is a bug in the `unstructured` library under `metrics/evaluate.py`: the file extension is retrieved incorrectly from paths like `*.pdf.txt` before the conversion to a CCT file (see the screenshot below; the top example shows the current behavior, the bottom example the correct one).
- In addition, the doctype values returned are inconsistent: some include the leading dot (`.pdf`) while others omit it (`pdf`).
- This change aligns them all to the dotted form (`.pdf`); a short sketch of the intended extraction follows this list.
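As a minimal sketch of the intended behavior (an illustration only, not the exact code in `metrics/evaluate.py`; `doctype_from_output_path` is a hypothetical helper name): strip the single conversion suffix the pipeline appends (`.txt` or `.json`), then read the suffix that remains.

```python
from pathlib import Path

def doctype_from_output_path(path: Path) -> str:
    """Return the source document's extension, in dotted form, from an
    output path such as 'report.pdf.txt' or 'report.pdf.json'."""
    # Drop the conversion suffix first; the suffix that remains belongs
    # to the original document.
    return path.with_suffix("").suffix

assert doctype_from_output_path(Path("Performance-Audit-Discussion.pdf.txt")) == ".pdf"
# Interior dots in the stem do not confuse the extraction:
assert doctype_from_output_path(Path("IRS-form.1987.pdf.json")) == ".pdf"
```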
599 lines · 24 KiB · Python
import os
import pathlib
import shutil
from pathlib import Path
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
import pytest

from unstructured.metrics.evaluate import (
    ElementTypeMetricsCalculator,
    TableStructureMetricsCalculator,
    TextExtractionMetricsCalculator,
    filter_metrics,
    get_mean_grouping,
)

is_in_docker = os.path.exists("/.dockerenv")

EXAMPLE_DOCS_DIRECTORY = os.path.join(
    pathlib.Path(__file__).parent.resolve(), "..", "..", "example-docs"
)
TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")

UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct"
GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"
GOLD_TABLE_STRUCTURE_DIRNAME = "gold_standard_table_structure"
UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct"
UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME = "unstructured_output_table_structure"

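# Dummy metric frames consumed by the get_mean_grouping and filter_metrics tests below.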
DUMMY_DF_CCT = pd.DataFrame(
    {
        "filename": [
            "Bank Good Credit Loan.pptx",
            "Performance-Audit-Discussion.pdf",
            "currency.csv",
        ],
        "doctype": ["pptx", "pdf", "csv"],
        "connector": ["connector1", "connector1", "connector2"],
        "cct-accuracy": [0.812, 0.994, 0.887],
        "cct-%missing": [0.001, 0.002, 0.041],
    }
)

DUMMY_DF_ELEMENT_TYPE = pd.DataFrame(
    {
        "filename": [
            "Bank Good Credit Loan.pptx",
            "Performance-Audit-Discussion.pdf",
            "currency.csv",
        ],
        "doctype": ["pptx", "pdf", "csv"],
        "connector": ["connector1", "connector1", "connector2"],
        "element-type-accuracy": [0.812, 0.994, 0.887],
    }
)


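# Patches the metric helpers and filesystem checks the calculators depend on, so the
# _process_document tests below run without reading real example files.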
@pytest.fixture
def mock_dependencies():
    with patch(
        "unstructured.metrics.evaluate.calculate_accuracy"
    ) as mock_calculate_accuracy, patch(
        "unstructured.metrics.evaluate.calculate_percent_missing_text"
    ) as mock_calculate_percent_missing_text, patch.object(
        TextExtractionMetricsCalculator, "_get_ccts"
    ) as mock_get_ccts, patch(
        "unstructured.metrics.evaluate.get_element_type_frequency"
    ) as mock_get_element_type_frequency, patch(
        "unstructured.metrics.evaluate.calculate_element_type_percent_match"
    ) as mock_calculate_element_type_percent_match, patch(
        "unstructured.metrics.evaluate._read_text_file"
    ) as mock_read_text_file, patch.object(
        Path, "exists"
    ) as mock_path_exists, patch(
        "unstructured.metrics.evaluate.TableEvalProcessor.from_json_files"
    ) as mock_table_eval_processor_from_json_files, patch.object(
        TableStructureMetricsCalculator, "supported_metric_names"
    ) as mock_supported_metric_names:
        mocks = {
            "mock_calculate_accuracy": mock_calculate_accuracy,
            "mock_calculate_percent_missing_text": mock_calculate_percent_missing_text,
            "mock_get_ccts": mock_get_ccts,
            "mock_get_element_type_frequency": mock_get_element_type_frequency,
            "mock_read_text_file": mock_read_text_file,
            "mock_calculate_element_type_percent_match": mock_calculate_element_type_percent_match,
            "mock_table_eval_processor_from_json_files": mock_table_eval_processor_from_json_files,
            "mock_supported_metric_names": mock_supported_metric_names,
            "mock_path_exists": mock_path_exists,
        }

        # setup mocks
        mocks["mock_calculate_accuracy"].return_value = 0.5
        mocks["mock_calculate_percent_missing_text"].return_value = 0.5
        mocks["mock_get_ccts"].return_value = ["output_cct", "source_cct"]
        mocks["mock_get_element_type_frequency"].side_effect = [{"ele1": 1}, {"ele2": 3}]
        mocks["mock_calculate_element_type_percent_match"].return_value = 0.5
        mocks["mock_supported_metric_names"].return_value = ["table_level_acc"]
        mocks["mock_path_exists"].return_value = True
        mocks["mock_read_text_file"].side_effect = ["output_text", "source_text"]

        yield mocks


@pytest.fixture()
def _cleanup_after_test():
    """Fixture for removing side-effects of running tests in this file."""

    def remove_generated_directories():
        """Remove directories created from running tests."""
        # Directories to be removed:
        target_dir_names = [
            "test_evaluate_results_cct",
            "test_evaluate_results_cct_txt",
            "test_evaluate_results_element_type",
            "test_evaluate_result_table_structure",
        ]
        subdirs = (d for d in os.scandir(TESTING_FILE_DIR) if d.is_dir())
        for d in subdirs:
            if d.name in target_dir_names:
                shutil.rmtree(d.path)

    # Run test as normal
    yield
    remove_generated_directories()


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    TextExtractionMetricsCalculator(
        documents_dir=output_dir, ground_truths_dir=source_dir
    ).calculate(export_dir=export_dir, visualize_progress=False, display_agg_df=False)

    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == 3
    assert len(df.columns) == 5
    assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"


@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "expected_length", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("Bank Good Credit Loan.pptx.txt"),
            5,
            {"document_type": "txt"},
        ),
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("IRS-2023-Form-1095-A.pdf.json"),
            14,
            {},
        ),
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form-1987.pdf.json"),
            4,
            {},
        ),
    ],
)
def test_process_document_returns_the_correct_amount_of_values(
    calculator_class, output_dirname, source_dirname, path, expected_length, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname

    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    output_list = calculator._process_document(path)
    assert len(output_list) == expected_length


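# The three *_process_document_returns_the_correct_doctype tests below are regression
# tests for ML-551: the doctype must be derived from the source extension embedded in
# the output path (the '.pdf' in '*.pdf.txt' or '*.pdf.json') and returned in dotted
# form, even when the file stem itself contains dots.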
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("2310.03502text_to_image_synthesis1-7.pdf.txt"),
            {"document_type": "txt"},
        ),
    ],
)
def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    mock_calculate_accuracy = mock_dependencies["mock_calculate_accuracy"]
    mock_calculate_percent_missing_text = mock_dependencies["mock_calculate_percent_missing_text"]
    mock_get_ccts = mock_dependencies["mock_get_ccts"]
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_calculate_accuracy.call_count == 1
    assert mock_calculate_percent_missing_text.call_count == 1
    assert mock_get_ccts.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("tablib-627mTABLES-2310.07875-p7.pdf.json"),
            {},
        ),
        # (
        #     ElementTypeMetricsCalculator,
        #     UNSTRUCTURED_OUTPUT_DIRNAME,
        #     GOLD_ELEMENT_TYPE_DIRNAME,
        #     Path("IRS-form.1987.pdf.json"),
        #     {},
        # ),
    ],
)
def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    calculator._ground_truths_dir = source_dir
    calculator._documents_dir = output_dir
    calculator._ground_truth_paths = [source_dir / path]
    mock_report = MagicMock()
    mock_report.total_predicted_tables = 3
    mock_report.table_level_acc = 0.83
    mock_table_eval_processor_from_json_files = mock_dependencies[
        "mock_table_eval_processor_from_json_files"
    ]
    mock_table_eval_processor_from_json_files.return_value.process_file.return_value = mock_report

    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_table_eval_processor_from_json_files.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form.1987.pdf.json"),
            {},
        ),
    ],
)
def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    mock_element_type_frequency = mock_dependencies["mock_get_element_type_frequency"]
    mock_read_text_file = mock_dependencies["mock_read_text_file"]
    mock_calculate_element_type_percent_match = mock_dependencies[
        "mock_calculate_element_type_percent_match"
    ]
    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_read_text_file.call_count == 2
    assert mock_element_type_frequency.call_count == 2
    assert mock_calculate_element_type_percent_match.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation_type_txt():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_CCT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    TextExtractionMetricsCalculator(
        documents_dir=output_dir, ground_truths_dir=source_dir, document_type="txt"
    ).calculate(export_dir=export_dir)

    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == 3
    assert len(df.columns) == 5
    assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_element_type_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    ElementTypeMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir, visualize_progress=False)

    assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t")
    assert len(df) == 1
    assert len(df.columns) == 4
    assert df.iloc[0].filename == "IRS-form-1987.pdf"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_table_structure_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_TABLE_STRUCTURE_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_result_table_structure")

    TableStructureMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir, visualize_progress=False)

    assert os.path.isfile(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"))
    assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
    agg_df = pd.read_csv(
        os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"), sep="\t"
    ).set_index("metric")
    assert len(df) == 2
    assert len(df.columns) == 15
    assert df.iloc[1].filename == "IRS-2023-Form-1095-A.pdf"
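    # The aggregate "average" for table_level_acc should equal the per-file accuracies
    # weighted by each file's table count.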
    assert (
        np.round(np.average(df["table_level_acc"], weights=df["total_tables"]), 3)
        == agg_df.loc["table_level_acc", "average"]
    )


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_takes_list():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    output_list = ["currency.csv.json"]
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).on_files(document_paths=output_list).calculate(export_dir=export_dir)

    # check that only the listed files are included
    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == len(output_list)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_with_grouping():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
        group_by="doctype",
    ).calculate(export_dir=export_dir)

    df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert len(df) == 4  # metrics row and doctype rows


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_wrong_type():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    with pytest.raises(ValueError):
        TextExtractionMetricsCalculator(
            documents_dir=output_dir, ground_truths_dir=source_dir, document_type="invalid type"
        ).calculate(export_dir=export_dir)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
@pytest.mark.parametrize(("grouping", "count_row"), [("doctype", 3), ("connector", 2)])
def test_get_mean_grouping_df_input(grouping: str, count_row: int):
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    get_mean_grouping(
        group_by=grouping,
        data_input=DUMMY_DF_CCT,
        export_dir=export_dir,
        eval_name="text_extraction",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, f"all-{grouping}-agg-cct.tsv"), sep="\t")
    assert grouped_df[grouping].dropna().nunique() == count_row


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_tsv_input():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir)

    filename = os.path.join(export_dir, "all-docs-cct.tsv")
    get_mean_grouping(
        group_by="doctype",
        data_input=filename,
        export_dir=export_dir,
        eval_name="text_extraction",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert grouped_df["doctype"].dropna().nunique() == 3


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_invalid_group():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir)

    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    with pytest.raises(ValueError):
        get_mean_grouping(
            group_by="invalid",
            data_input=df,
            export_dir=export_dir,
            eval_name="text_extraction",
        )


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_grouping_empty_df():
    empty_df = pd.DataFrame()
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", empty_df, "some_dir", eval_name="text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_missing_grouping_column():
    df_with_no_grouping = pd.DataFrame({"some_column": [1, 2, 3]})
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", df_with_no_grouping, "some_dir", "text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_all_null_grouping_column():
    df_with_null_grouping = pd.DataFrame({"doctype": [None, None, None]})
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", df_with_null_grouping, "some_dir", eval_name="text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_invalid_eval_name():
    with pytest.raises(ValueError):
        get_mean_grouping("doctype", DUMMY_DF_ELEMENT_TYPE, "some_dir", eval_name="invalid")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
@pytest.mark.parametrize(("group_by", "count_row"), [("doctype", 3), ("connector", 2)])
def test_get_mean_grouping_element_type(group_by: str, count_row: int):
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_element_type")
    get_mean_grouping(
        group_by=group_by,
        data_input=DUMMY_DF_ELEMENT_TYPE,
        export_dir=export_dir,
        eval_name="element_type",
    )
    grouped_df = pd.read_csv(
        os.path.join(export_dir, f"all-{group_by}-agg-element-type.tsv"), sep="\t"
    )
    assert grouped_df[group_by].dropna().nunique() == count_row


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_filter_metrics():
    with open(os.path.join(TESTING_FILE_DIR, "filter_list.txt"), "w") as file:
        file.write("Bank Good Credit Loan.pptx\n")
        file.write("Performance-Audit-Discussion.pdf\n")
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    filter_metrics(
        data_input=DUMMY_DF_CCT,
        filter_list=os.path.join(TESTING_FILE_DIR, "filter_list.txt"),
        filter_by="filename",
        export_filename="filtered_metrics.tsv",
        export_dir=export_dir,
        return_type="file",
    )
    filtered_df = pd.read_csv(os.path.join(export_dir, "filtered_metrics.tsv"), sep="\t")
    assert len(filtered_df) == 2
    assert filtered_df["filename"].iloc[0] == "Bank Good Credit Loan.pptx"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_all_file():
    with open(os.path.join(TESTING_FILE_DIR, "filter_list.txt"), "w") as file:
        file.write("Bank Good Credit Loan.pptx\n")
        file.write("Performance-Audit-Discussion.pdf\n")
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    filter_metrics(
        data_input=DUMMY_DF_CCT,
        filter_list=["Bank Good Credit Loan.pptx", "Performance-Audit-Discussion.pdf"],
        filter_by="filename",
        export_filename="filtered_metrics.tsv",
        export_dir=export_dir,
        return_type="file",
    )
    filtered_df = pd.read_csv(os.path.join(export_dir, "filtered_metrics.tsv"), sep="\t")

    get_mean_grouping(
        group_by="all",
        data_input=filtered_df,
        export_dir=export_dir,
        eval_name="text_extraction",
        export_filename="two-filename-agg-cct.tsv",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")

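    # The three aggregate columns appear to be the mean, sample sd, and population sd of
    # cct-accuracy over [0.812, 0.994]: mean = 0.903, sample sd ≈ 0.129 (rounded to three
    # decimals), population sd = 0.091.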
    assert np.isclose(float(grouped_df.iloc[1, 0]), 0.903)
    assert np.isclose(float(grouped_df.iloc[1, 1]), 0.129)
    assert np.isclose(float(grouped_df.iloc[1, 2]), 0.091)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_all_file_txt():
    with open(os.path.join(TESTING_FILE_DIR, "filter_list.txt"), "w") as file:
        file.write("Bank Good Credit Loan.pptx\n")
        file.write("Performance-Audit-Discussion.pdf\n")
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    filter_metrics(
        data_input=DUMMY_DF_CCT,
        filter_list=os.path.join(TESTING_FILE_DIR, "filter_list.txt"),
        filter_by="filename",
        export_filename="filtered_metrics.tsv",
        export_dir=export_dir,
        return_type="file",
    )
    filtered_df = pd.read_csv(os.path.join(export_dir, "filtered_metrics.tsv"), sep="\t")

    get_mean_grouping(
        group_by="all",
        data_input=filtered_df,
        export_dir=export_dir,
        eval_name="text_extraction",
        export_filename="two-filename-agg-cct.tsv",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")

    assert np.isclose(float(grouped_df.iloc[1, 0]), 0.903)
    assert np.isclose(float(grouped_df.iloc[1, 1]), 0.129)
    assert np.isclose(float(grouped_df.iloc[1, 2]), 0.091)