Tracy Shen 8c58bc57db
fix doctype parsing error (#3811)
- Per [ticket](https://unstructured-ai.atlassian.net/browse/ML-551), there is a bug in the `unstructured` lib under `metrics/evaluate.py`: for output paths of converted CCT files like `*.pdf.txt`, the file extension is read from the final conversion suffix (`.txt`) instead of the source document's extension (`.pdf`). (See the screenshot below.)
    - The current, buggy behavior is shown in the top example.
    - The corrected behavior is shown in the bottom example.
   

![image](https://github.com/user-attachments/assets/6d82de85-3b54-4e77-a637-28a27fcb279d)

- In addition, I also observed that the returned doctypes were inconsistent: some were returned in the dotted form (`'.*'`) and some without the dot.
- I therefore aligned them all to the same dotted form, `'.*'`; see the sketch below.
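
As a minimal sketch of the intended behavior (`doctype_from_output_path` is a hypothetical helper for illustration, not the actual code in `metrics/evaluate.py`): for a converted output path such as `report.pdf.txt`, the doctype should come from the suffix of the path's stem, kept in dotted form.

```python
from pathlib import Path

def doctype_from_output_path(path: str) -> str:
    """Hypothetical helper: return the source document's extension
    (dotted form) from a converted output path like 'report.pdf.txt'."""
    # Path.stem drops the conversion suffix ('.txt' / '.json'); the
    # suffix that remains belongs to the source document.
    return Path(Path(path).stem).suffix

assert doctype_from_output_path("report.pdf.txt") == ".pdf"
assert doctype_from_output_path("table.pdf.json") == ".pdf"
```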
2024-12-06 23:55:01 +00:00


import os
import pathlib
import shutil
from pathlib import Path
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
import pytest

from unstructured.metrics.evaluate import (
    ElementTypeMetricsCalculator,
    TableStructureMetricsCalculator,
    TextExtractionMetricsCalculator,
    filter_metrics,
    get_mean_grouping,
)

is_in_docker = os.path.exists("/.dockerenv")

EXAMPLE_DOCS_DIRECTORY = os.path.join(
    pathlib.Path(__file__).parent.resolve(), "..", "..", "example-docs"
)
TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")

UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct"
GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"
GOLD_TABLE_STRUCTURE_DIRNAME = "gold_standard_table_structure"
UNSTRUCTURED_CCT_DIRNAME = "unstructured_output_cct"
UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME = "unstructured_output_table_structure"

DUMMY_DF_CCT = pd.DataFrame(
    {
        "filename": [
            "Bank Good Credit Loan.pptx",
            "Performance-Audit-Discussion.pdf",
            "currency.csv",
        ],
        "doctype": ["pptx", "pdf", "csv"],
        "connector": ["connector1", "connector1", "connector2"],
        "cct-accuracy": [0.812, 0.994, 0.887],
        "cct-%missing": [0.001, 0.002, 0.041],
    }
)

DUMMY_DF_ELEMENT_TYPE = pd.DataFrame(
    {
        "filename": [
            "Bank Good Credit Loan.pptx",
            "Performance-Audit-Discussion.pdf",
            "currency.csv",
        ],
        "doctype": ["pptx", "pdf", "csv"],
        "connector": ["connector1", "connector1", "connector2"],
        "element-type-accuracy": [0.812, 0.994, 0.887],
    }
)


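# The fixture below stubs out the metric computations (accuracy, percent
# missing, element-type matching, table evaluation) with canned values so the
# doctype tests can exercise _process_document's path handling in isolation.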
@pytest.fixture
def mock_dependencies():
    with patch(
        "unstructured.metrics.evaluate.calculate_accuracy"
    ) as mock_calculate_accuracy, patch(
        "unstructured.metrics.evaluate.calculate_percent_missing_text"
    ) as mock_calculate_percent_missing_text, patch.object(
        TextExtractionMetricsCalculator, "_get_ccts"
    ) as mock_get_ccts, patch(
        "unstructured.metrics.evaluate.get_element_type_frequency"
    ) as mock_get_element_type_frequency, patch(
        "unstructured.metrics.evaluate.calculate_element_type_percent_match"
    ) as mock_calculate_element_type_percent_match, patch(
        "unstructured.metrics.evaluate._read_text_file"
    ) as mock_read_text_file, patch.object(
        Path, "exists"
    ) as mock_path_exists, patch(
        "unstructured.metrics.evaluate.TableEvalProcessor.from_json_files"
    ) as mock_table_eval_processor_from_json_files, patch.object(
        TableStructureMetricsCalculator, "supported_metric_names"
    ) as mock_supported_metric_names:
        mocks = {
            "mock_calculate_accuracy": mock_calculate_accuracy,
            "mock_calculate_percent_missing_text": mock_calculate_percent_missing_text,
            "mock_get_ccts": mock_get_ccts,
            "mock_get_element_type_frequency": mock_get_element_type_frequency,
            "mock_read_text_file": mock_read_text_file,
            "mock_calculate_element_type_percent_match": mock_calculate_element_type_percent_match,
            "mock_table_eval_processor_from_json_files": mock_table_eval_processor_from_json_files,
            "mock_supported_metric_names": mock_supported_metric_names,
            "mock_path_exists": mock_path_exists,
        }

        # setup mocks
        mocks["mock_calculate_accuracy"].return_value = 0.5
        mocks["mock_calculate_percent_missing_text"].return_value = 0.5
        mocks["mock_get_ccts"].return_value = ["output_cct", "source_cct"]
        mocks["mock_get_element_type_frequency"].side_effect = [{"ele1": 1}, {"ele2": 3}]
        mocks["mock_calculate_element_type_percent_match"].return_value = 0.5
        mocks["mock_supported_metric_names"].return_value = ["table_level_acc"]
        mocks["mock_path_exists"].return_value = True
        mocks["mock_read_text_file"].side_effect = ["output_text", "source_text"]

        yield mocks


@pytest.fixture()
def _cleanup_after_test():
    """Fixture for removing side-effects of running tests in this file."""

    def remove_generated_directories():
        """Remove directories created from running tests."""
        # Directories to be removed:
        target_dir_names = [
            "test_evaluate_results_cct",
            "test_evaluate_results_cct_txt",
            "test_evaluate_results_element_type",
            "test_evaluate_result_table_structure",
        ]
        subdirs = (d for d in os.scandir(TESTING_FILE_DIR) if d.is_dir())
        for d in subdirs:
            if d.name in target_dir_names:
                shutil.rmtree(d.path)

    # Run test as normal
    yield
    remove_generated_directories()


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    TextExtractionMetricsCalculator(
        documents_dir=output_dir, ground_truths_dir=source_dir
    ).calculate(export_dir=export_dir, visualize_progress=False, display_agg_df=False)
    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == 3
    assert len(df.columns) == 5
    assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"


@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "expected_length", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("Bank Good Credit Loan.pptx.txt"),
            5,
            {"document_type": "txt"},
        ),
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("IRS-2023-Form-1095-A.pdf.json"),
            14,
            {},
        ),
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form-1987.pdf.json"),
            4,
            {},
        ),
    ],
)
def test_process_document_returns_the_correct_amount_of_values(
    calculator_class, output_dirname, source_dirname, path, expected_length, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    output_list = calculator._process_document(path)
    assert len(output_list) == expected_length


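# The three tests below guard the doctype fix from this PR: for converted
# output paths such as '*.pdf.txt' or '*.pdf.json', _process_document should
# report the source document's extension with its leading dot ('.pdf').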
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TextExtractionMetricsCalculator,
            UNSTRUCTURED_CCT_DIRNAME,
            GOLD_CCT_DIRNAME,
            Path("2310.03502text_to_image_synthesis1-7.pdf.txt"),
            {"document_type": "txt"},
        ),
    ],
)
def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    mock_calculate_accuracy = mock_dependencies["mock_calculate_accuracy"]
    mock_calculate_percent_missing_text = mock_dependencies["mock_calculate_percent_missing_text"]
    mock_get_ccts = mock_dependencies["mock_get_ccts"]
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_calculate_accuracy.call_count == 1
    assert mock_calculate_percent_missing_text.call_count == 1
    assert mock_get_ccts.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            TableStructureMetricsCalculator,
            UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME,
            GOLD_TABLE_STRUCTURE_DIRNAME,
            Path("tablib-627mTABLES-2310.07875-p7.pdf.json"),
            {},
        ),
    ],
)
def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    calculator._ground_truths_dir = source_dir
    calculator._documents_dir = output_dir
    calculator._ground_truth_paths = [source_dir / path]
    mock_report = MagicMock()
    mock_report.total_predicted_tables = 3
    mock_report.table_level_acc = 0.83
    mock_table_eval_processor_from_json_files = mock_dependencies[
        "mock_table_eval_processor_from_json_files"
    ]
    mock_table_eval_processor_from_json_files.return_value.process_file.return_value = mock_report
    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_table_eval_processor_from_json_files.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test", "mock_dependencies")
@pytest.mark.parametrize(
    ("calculator_class", "output_dirname", "source_dirname", "path", "kwargs"),
    [
        (
            ElementTypeMetricsCalculator,
            UNSTRUCTURED_OUTPUT_DIRNAME,
            GOLD_ELEMENT_TYPE_DIRNAME,
            Path("IRS-form.1987.pdf.json"),
            {},
        ),
    ],
)
def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
    mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
):
    output_dir = Path(TESTING_FILE_DIR) / output_dirname
    source_dir = Path(TESTING_FILE_DIR) / source_dirname
    calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
    mock_element_type_frequency = mock_dependencies["mock_get_element_type_frequency"]
    mock_read_text_file = mock_dependencies["mock_read_text_file"]
    mock_calculate_element_type_percent_match = mock_dependencies[
        "mock_calculate_element_type_percent_match"
    ]
    output_list = calculator._process_document(path)
    assert output_list[1] == ".pdf"
    assert mock_read_text_file.call_count == 2
    assert mock_element_type_frequency.call_count == 2
    assert mock_calculate_element_type_percent_match.call_count == 1


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_evaluation_type_txt():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_CCT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    TextExtractionMetricsCalculator(
        documents_dir=output_dir, ground_truths_dir=source_dir, document_type="txt"
    ).calculate(export_dir=export_dir)
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == 3
    assert len(df.columns) == 5
    assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_element_type_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    ElementTypeMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir, visualize_progress=False)
    assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t")
    assert len(df) == 1
    assert len(df.columns) == 4
    assert df.iloc[0].filename == "IRS-form-1987.pdf"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_table_structure_evaluation():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_TABLE_STRUCTURE_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_TABLE_STRUCTURE_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_result_table_structure")
    TableStructureMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir, visualize_progress=False)
    assert os.path.isfile(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"))
    assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
    agg_df = pd.read_csv(
        os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"), sep="\t"
    ).set_index("metric")
    assert len(df) == 2
    assert len(df.columns) == 15
    assert df.iloc[1].filename == "IRS-2023-Form-1095-A.pdf"
    assert (
        np.round(np.average(df["table_level_acc"], weights=df["total_tables"]), 3)
        == agg_df.loc["table_level_acc", "average"]
    )


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_takes_list():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    output_list = ["currency.csv.json"]
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).on_files(document_paths=output_list).calculate(export_dir=export_dir)
    # check that only the listed files are included
    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    assert len(df) == len(output_list)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_with_grouping():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
        group_by="doctype",
    ).calculate(export_dir=export_dir)
    df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert len(df) == 4  # metrics row and doctype rows


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_wrong_type():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    with pytest.raises(ValueError):
        TextExtractionMetricsCalculator(
            documents_dir=output_dir, ground_truths_dir=source_dir, document_type="invalid type"
        ).calculate(export_dir=export_dir)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
@pytest.mark.parametrize(("grouping", "count_row"), [("doctype", 3), ("connector", 2)])
def test_get_mean_grouping_df_input(grouping: str, count_row: int):
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    get_mean_grouping(
        group_by=grouping,
        data_input=DUMMY_DF_CCT,
        export_dir=export_dir,
        eval_name="text_extraction",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, f"all-{grouping}-agg-cct.tsv"), sep="\t")
    assert grouped_df[grouping].dropna().nunique() == count_row


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_tsv_input():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir)
    filename = os.path.join(export_dir, "all-docs-cct.tsv")
    get_mean_grouping(
        group_by="doctype",
        data_input=filename,
        export_dir=export_dir,
        eval_name="text_extraction",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
    assert grouped_df["doctype"].dropna().nunique() == 3


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_invalid_group():
    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    TextExtractionMetricsCalculator(
        documents_dir=output_dir,
        ground_truths_dir=source_dir,
    ).calculate(export_dir=export_dir)
    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
    with pytest.raises(ValueError):
        get_mean_grouping(
            group_by="invalid",
            data_input=df,
            export_dir=export_dir,
            eval_name="text_extraction",
        )


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_text_extraction_grouping_empty_df():
    empty_df = pd.DataFrame()
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", empty_df, "some_dir", eval_name="text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_missing_grouping_column():
    df_with_no_grouping = pd.DataFrame({"some_column": [1, 2, 3]})
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", df_with_no_grouping, "some_dir", "text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_all_null_grouping_column():
    df_with_null_grouping = pd.DataFrame({"doctype": [None, None, None]})
    with pytest.raises(SystemExit):
        get_mean_grouping("doctype", df_with_null_grouping, "some_dir", eval_name="text_extraction")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_invalid_eval_name():
    with pytest.raises(ValueError):
        get_mean_grouping("doctype", DUMMY_DF_ELEMENT_TYPE, "some_dir", eval_name="invalid")


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
@pytest.mark.parametrize(("group_by", "count_row"), [("doctype", 3), ("connector", 2)])
def test_get_mean_grouping_element_type(group_by: str, count_row: int):
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_element_type")
    get_mean_grouping(
        group_by=group_by,
        data_input=DUMMY_DF_ELEMENT_TYPE,
        export_dir=export_dir,
        eval_name="element_type",
    )
    grouped_df = pd.read_csv(
        os.path.join(export_dir, f"all-{group_by}-agg-element-type.tsv"), sep="\t"
    )
    assert grouped_df[group_by].dropna().nunique() == count_row


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_filter_metrics():
    with open(os.path.join(TESTING_FILE_DIR, "filter_list.txt"), "w") as file:
        file.write("Bank Good Credit Loan.pptx\n")
        file.write("Performance-Audit-Discussion.pdf\n")
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    filter_metrics(
        data_input=DUMMY_DF_CCT,
        filter_list=os.path.join(TESTING_FILE_DIR, "filter_list.txt"),
        filter_by="filename",
        export_filename="filtered_metrics.tsv",
        export_dir=export_dir,
        return_type="file",
    )
    filtered_df = pd.read_csv(os.path.join(export_dir, "filtered_metrics.tsv"), sep="\t")
    assert len(filtered_df) == 2
    assert filtered_df["filename"].iloc[0] == "Bank Good Credit Loan.pptx"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_all_file():
    with open(os.path.join(TESTING_FILE_DIR, "filter_list.txt"), "w") as file:
        file.write("Bank Good Credit Loan.pptx\n")
        file.write("Performance-Audit-Discussion.pdf\n")
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    filter_metrics(
        data_input=DUMMY_DF_CCT,
        filter_list=["Bank Good Credit Loan.pptx", "Performance-Audit-Discussion.pdf"],
        filter_by="filename",
        export_filename="filtered_metrics.tsv",
        export_dir=export_dir,
        return_type="file",
    )
    filtered_df = pd.read_csv(os.path.join(export_dir, "filtered_metrics.tsv"), sep="\t")
    get_mean_grouping(
        group_by="all",
        data_input=filtered_df,
        export_dir=export_dir,
        eval_name="text_extraction",
        export_filename="two-filename-agg-cct.tsv",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")
    assert np.isclose(float(grouped_df.iloc[1, 0]), 0.903)
    assert np.isclose(float(grouped_df.iloc[1, 1]), 0.129)
    assert np.isclose(float(grouped_df.iloc[1, 2]), 0.091)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.usefixtures("_cleanup_after_test")
def test_get_mean_grouping_all_file_txt():
    with open(os.path.join(TESTING_FILE_DIR, "filter_list.txt"), "w") as file:
        file.write("Bank Good Credit Loan.pptx\n")
        file.write("Performance-Audit-Discussion.pdf\n")
    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    filter_metrics(
        data_input=DUMMY_DF_CCT,
        filter_list=os.path.join(TESTING_FILE_DIR, "filter_list.txt"),
        filter_by="filename",
        export_filename="filtered_metrics.tsv",
        export_dir=export_dir,
        return_type="file",
    )
    filtered_df = pd.read_csv(os.path.join(export_dir, "filtered_metrics.tsv"), sep="\t")
    get_mean_grouping(
        group_by="all",
        data_input=filtered_df,
        export_dir=export_dir,
        eval_name="text_extraction",
        export_filename="two-filename-agg-cct.tsv",
    )
    grouped_df = pd.read_csv(os.path.join(export_dir, "two-filename-agg-cct.tsv"), sep="\t")
    assert np.isclose(float(grouped_df.iloc[1, 0]), 0.903)
    assert np.isclose(float(grouped_df.iloc[1, 1]), 0.129)
    assert np.isclose(float(grouped_df.iloc[1, 2]), 0.091)