# unstructured/test_unstructured/metrics/test_evaluate.py

import os
import pathlib

import pandas as pd
import pytest

from unstructured.metrics.evaluate import (
    measure_element_type_accuracy,
    measure_text_extraction_accuracy,
)

# True when the marker file "/.dockerenv" exists; the tests below use this flag
# in their `skipif` decorators so they do not run inside a Docker container.
is_in_docker = os.path.exists("/.dockerenv")

# Path to "example-docs", located two directory levels above the directory
# that contains this test module (the ".." segments are left un-normalized).
EXAMPLE_DOCS_DIRECTORY = os.path.join(
    pathlib.Path(__file__).parent.resolve(), "..", "..", "example-docs"
)
# Root of the fixture tree used by every test in this module.
TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")

# Sub-directory names under TESTING_FILE_DIR: the outputs being evaluated and
# the gold-standard references they are compared against.
UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct"
GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_evaluation():
    """Check the per-document TSV exported by `measure_text_extraction_accuracy`.

    The export is expected to exist, to hold 3 rows and 5 columns, and to list
    "Bank Good Credit Loan.pptx" as the filename of its first row.
    """
    results_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME),
        source_dir=os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME),
        export_dir=results_dir,
    )

    # The report must be on disk before we try to parse it.
    report_path = os.path.join(results_dir, "all-docs-cct.tsv")
    assert os.path.isfile(report_path)

    report = pd.read_csv(report_path, sep="\t")
    row_count, column_count = report.shape
    assert row_count == 3
    assert column_count == 5
    assert report.iloc[0].filename == "Bank Good Credit Loan.pptx"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_element_type_evaluation():
    """Check the element-type frequency TSV exported by `measure_element_type_accuracy`.

    The export is expected to exist, to hold 1 row and 4 columns, and to list
    "IRS-form-1987.pdf" as the filename of its first row.
    """
    results_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_element_type_accuracy(
        output_dir=os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME),
        source_dir=os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME),
        export_dir=results_dir,
    )

    # The report must be on disk before we try to parse it.
    report_path = os.path.join(results_dir, "all-docs-element-type-frequency.tsv")
    assert os.path.isfile(report_path)

    report = pd.read_csv(report_path, sep="\t")
    row_count, column_count = report.shape
    assert row_count == 1
    assert column_count == 4
    assert report.iloc[0].filename == "IRS-form-1987.pdf"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_takes_list():
    """Check that passing `output_list` limits the exported rows to the listed files."""
    selected_outputs = ["currency.csv.json"]
    results_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME),
        source_dir=os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME),
        output_list=selected_outputs,
        export_dir=results_dir,
    )

    # One exported row is expected per requested output file, and no more.
    report_path = os.path.join(results_dir, "all-docs-cct.tsv")
    report = pd.read_csv(report_path, sep="\t")
    assert len(report) == len(selected_outputs)


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_grouping():
    """Check the aggregated TSV exported when evaluating with `grouping="doctype"`."""
    results_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    measure_text_extraction_accuracy(
        output_dir=os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME),
        source_dir=os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME),
        export_dir=results_dir,
        grouping="doctype",
    )

    aggregate_path = os.path.join(results_dir, "all-doctype-agg-cct.tsv")
    aggregate = pd.read_csv(aggregate_path, sep="\t")
    # Expected rows: the metrics row plus the doctype rows.
    assert len(aggregate) == 4
refactor: separate click wrappers from core evaluation functionality (#1981) ### Summary Click decorated functions cannot (properly) be called outside of the click interface. This makes it difficult to reuse the setup functionality in measure_text_edit_distance or measure_element_type_accuracy. This PR removes the click decoration and separates it into a wrapper function purely to execute the command. ### Technical Details - Changed as suggested in [this StackOverflow post](https://stackoverflow.com/questions/40091347/call-another-click-command-from-a-click-command) response - The locations of these now distinct functions are separate: the `_command` click-decorated functions stay in ingest/evaluate.py, and the core functions measure_text_edit_distance and measure_element_type_accuracy are moved into the unstructured/metrics/ folder (which is a more logical location for them). - Initial test added for measure_text_edit_distance ### Test `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` functionality is unchanged. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid <shreyanid@users.noreply.github.com> Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com> 2023-11-07 11:54:22 -08:00			`import os`
			`import pathlib`

refactor: measure_text_edit_distance function for aggregation (#2108) - Refactor `metrics/evaluation.py` to accept `grouping` as a parameter. - Switch to `DataFrame` for easier analysis and aggregation. 2023-11-22 16:30:16 -05:00			`import pandas as pd`
refactor: separate click wrappers from core evaluation functionality (#1981) ### Summary Click decorated functions cannot (properly) be called outside of the click interface. This makes it difficult to reuse the setup functionality in measure_text_edit_distance or measure_element_type_accuracy. This PR removes the click decoration and separates it into a wrapper function purely to execute the command. ### Technical Details - Changed as suggested in [this StackOverflow post](https://stackoverflow.com/questions/40091347/call-another-click-command-from-a-click-command) response - The locations of these now distinct functions are separate: the `_command` click-decorated functions stay in ingest/evaluate.py, and the core functions measure_text_edit_distance and measure_element_type_accuracy are moved into the unstructured/metrics/ folder (which is a more logical location for them). - Initial test added for measure_text_edit_distance ### Test `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` functionality is unchanged. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid <shreyanid@users.noreply.github.com> Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com> 2023-11-07 11:54:22 -08:00			`import pytest`

			`from unstructured.metrics.evaluate import (`
feat: add visualize param to command and add test (#2178) - Add `visualize` parameter to the click command -- now callable using `--visualize` flag to show the progress bar. - Refactor the name. 2023-11-28 20:05:55 -05:00			`measure_element_type_accuracy,`
			`measure_text_extraction_accuracy,`
refactor: separate click wrappers from core evaluation functionality (#1981) ### Summary Click decorated functions cannot (properly) be called outside of the click interface. This makes it difficult to reuse the setup functionality in measure_text_edit_distance or measure_element_type_accuracy. This PR removes the click decoration and separates it into a wrapper function purely to execute the command. ### Technical Details - Changed as suggested in [this StackOverflow post](https://stackoverflow.com/questions/40091347/call-another-click-command-from-a-click-command) response - The locations of these now distinct functions are separate: the `_command` click-decorated functions stay in ingest/evaluate.py, and the core functions measure_text_edit_distance and measure_element_type_accuracy are moved into the unstructured/metrics/ folder (which is a more logical location for them). - Initial test added for measure_text_edit_distance ### Test `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` functionality is unchanged. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid <shreyanid@users.noreply.github.com> Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com> 2023-11-07 11:54:22 -08:00			`)`

			`is_in_docker = os.path.exists("/.dockerenv")`

			`EXAMPLE_DOCS_DIRECTORY = os.path.join(`
			`pathlib.Path(__file__).parent.resolve(), "..", "..", "example-docs"`
			`)`
			`TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")`

			`UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"`
			`GOLD_CCT_DIRNAME = "gold_standard_cct"`
feat: add visualize param to command and add test (#2178) - Add `visualize` parameter to the click command -- now callable using `--visualize` flag to show the progress bar. - Refactor the name. 2023-11-28 20:05:55 -05:00			`GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"`


			`@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")`
			`def test_text_extraction_evaluation():`
			`output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)`
			`source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)`
			`export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")`
			`measure_text_extraction_accuracy(`
			`output_dir=output_dir, source_dir=source_dir, export_dir=export_dir`
			`)`
			`assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))`
			`df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")`
			`assert len(df) == 3`
			`assert len(df.columns) == 5`
			`assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"`


			`@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")`
			`def test_element_type_evaluation():`
			`output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)`
			`source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME)`
			`export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")`
			`measure_element_type_accuracy(`
			`output_dir=output_dir, source_dir=source_dir, export_dir=export_dir`
			`)`
			`assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"))`
			`df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t")`
			`assert len(df) == 1`
			`assert len(df.columns) == 4`
			`assert df.iloc[0].filename == "IRS-form-1987.pdf"`
refactor: separate click wrappers from core evaluation functionality (#1981) ### Summary Click decorated functions cannot (properly) be called outside of the click interface. This makes it difficult to reuse the setup functionality in measure_text_edit_distance or measure_element_type_accuracy. This PR removes the click decoration and separates it into a wrapper function purely to execute the command. ### Technical Details - Changed as suggested in [this StackOverflow post](https://stackoverflow.com/questions/40091347/call-another-click-command-from-a-click-command) response - The locations of these now distinct functions are separate: the `_command` click-decorated functions stay in ingest/evaluate.py, and the core functions measure_text_edit_distance and measure_element_type_accuracy are moved into the unstructured/metrics/ folder (which is a more logical location for them). - Initial test added for measure_text_edit_distance ### Test `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` functionality is unchanged. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid <shreyanid@users.noreply.github.com> Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com> 2023-11-07 11:54:22 -08:00

			`@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")`
			`def test_text_extraction_takes_list():`
			`output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)`
			`output_list = ["currency.csv.json"]`
			`source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)`
			`export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")`
feat: add visualize param to command and add test (#2178) - Add `visualize` parameter to the click command -- now callable using `--visualize` flag to show the progress bar. - Refactor the name. 2023-11-28 20:05:55 -05:00			`measure_text_extraction_accuracy(`
refactor: separate click wrappers from core evaluation functionality (#1981) ### Summary Click decorated functions cannot (properly) be called outside of the click interface. This makes it difficult to reuse the setup functionality in measure_text_edit_distance or measure_element_type_accuracy. This PR removes the click decoration and separates it into a wrapper function purely to execute the command. ### Technical Details - Changed as suggested in [this StackOverflow post](https://stackoverflow.com/questions/40091347/call-another-click-command-from-a-click-command) response - The locations of these now distinct functions are separate: the `_command` click-decorated functions stay in ingest/evaluate.py, and the core functions measure_text_edit_distance and measure_element_type_accuracy are moved into the unstructured/metrics/ folder (which is a more logical location for them). - Initial test added for measure_text_edit_distance ### Test `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` functionality is unchanged. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid <shreyanid@users.noreply.github.com> Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com> 2023-11-07 11:54:22 -08:00			`output_dir=output_dir,`
			`source_dir=source_dir,`
			`output_list=output_list,`
			`export_dir=export_dir,`
			`)`
			`# check that only the listed files are included`
feat: add visualize param to command and add test (#2178) - Add `visualize` parameter to the click command -- now callable using `--visualize` flag to show the progress bar. - Refactor the name. 2023-11-28 20:05:55 -05:00			`df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")`
			`assert len(df) == len(output_list)`
refactor: measure_text_edit_distance function for aggregation (#2108) - Refactor `metrics/evaluation.py` to accept `grouping` as a parameter. - Switch to `DataFrame` for easier analysis and aggregation. 2023-11-22 16:30:16 -05:00

			`@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")`
			`def test_text_extraction_grouping():`
			`output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)`
			`source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)`
			`export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")`
feat: add visualize param to command and add test (#2178) - Add `visualize` parameter to the click command -- now callable using `--visualize` flag to show the progress bar. - Refactor the name. 2023-11-28 20:05:55 -05:00			`measure_text_extraction_accuracy(`
refactor: measure_text_edit_distance function for aggregation (#2108) - Refactor `metrics/evaluation.py` to accept `grouping` as a parameter. - Switch to `DataFrame` for easier analysis and aggregation. 2023-11-22 16:30:16 -05:00			`output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"`
			`)`
			`df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")`
feat: add visualize param to command and add test (#2178) - Add `visualize` parameter to the click command -- now callable using `--visualize` flag to show the progress bar. - Refactor the name. 2023-11-28 20:05:55 -05:00			`assert len(df) == 4 # metrics row and doctype rows`