unstructured/test_unstructured/metrics/test_evaluate.py

import os
import pathlib

import pandas as pd
import pytest

from unstructured.metrics.evaluate import (
    measure_text_edit_distance,
)

# The presence of /.dockerenv is the signal used to skip these tests inside Docker.
is_in_docker = os.path.exists("/.dockerenv")

# Fixture locations, expressed relative to the directory that holds this module:
# two levels up, then into "example-docs".
EXAMPLE_DOCS_DIRECTORY = os.path.join(
    pathlib.Path(__file__).parent.resolve(),
    os.pardir,
    os.pardir,
    "example-docs",
)
TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")

# Names of the sub-directories that get joined onto TESTING_FILE_DIR.
UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct"


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_takes_list():
    """Passing `output_list` yields a per-document report with one row per listed file."""
    selected_files = ["currency.csv.json"]
    results_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")

    measure_text_edit_distance(
        output_dir=os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME),
        source_dir=os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME),
        output_list=selected_files,
        export_dir=results_dir,
    )

    # The report must contain a header line plus exactly one line for each
    # file named in `selected_files` -- nothing else from the output directory.
    report_path = pathlib.Path(results_dir, "all-docs-cct.tsv")
    row_count = len(report_path.read_text().splitlines())
    expected_rows = 1 + len(selected_files)  # header + listed files
    assert row_count == expected_rows


@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_grouping():
    """Evaluating with `grouping="doctype"` writes an aggregate report with four data rows."""
    results_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
    evaluation_kwargs = {
        "output_dir": os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME),
        "source_dir": os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME),
        "export_dir": results_dir,
        "grouping": "doctype",
    }
    measure_text_edit_distance(**evaluation_kwargs)

    # The aggregate report is tab-separated; its first line is consumed as the header,
    # so the row count below covers data rows only.
    aggregate_path = os.path.join(results_dir, "all-doctype-agg-cct.tsv")
    aggregate = pd.read_csv(aggregate_path, sep="\t")
    assert aggregate.shape[0] == 4
refactor: separate click wrappers from core evaluation functionality (#1981) ### Summary Click decorated functions cannot (properly) be called outside of the click interface. This makes it difficult to reuse the setup functionality in measure_text_edit_distance or measure_element_type_accuracy. This PR removes the click decoration and separates it into a wrapper function purely to execute the command. ### Technical Details - Changed as suggested in [this StackOverflow post](https://stackoverflow.com/questions/40091347/call-another-click-command-from-a-click-command) response - The locations of these now distinct functions are separate: the `_command` click-decorated functions stay in ingest/evaluate.py, and the core functions measure_text_edit_distance and measure_element_type_accuracy are moved into the unstructured/metrics/ folder (which is a more logical location for them). - Initial test added for measure_text_edit_distance ### Test `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` functionality is unchanged. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid <shreyanid@users.noreply.github.com> Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com> 2023-11-07 11:54:22 -08:00			`import os`
			`import pathlib`

refactor: measure_text_edit_distance function for aggregation (#2108) - Refactor `metrics/evaluation.py` to accept `grouping` as a parameter. - Switch to `DataFrame` for easier analysis and aggregation. 2023-11-22 16:30:16 -05:00			`import pandas as pd`
refactor: separate click wrappers from core evaluation functionality (#1981) ### Summary Click decorated functions cannot (properly) be called outside of the click interface. This makes it difficult to reuse the setup functionality in measure_text_edit_distance or measure_element_type_accuracy. This PR removes the click decoration and separates it into a wrapper function purely to execute the command. ### Technical Details - Changed as suggested in [this StackOverflow post](https://stackoverflow.com/questions/40091347/call-another-click-command-from-a-click-command) response - The locations of these now distinct functions are separate: the `_command` click-decorated functions stay in ingest/evaluate.py, and the core functions measure_text_edit_distance and measure_element_type_accuracy are moved into the unstructured/metrics/ folder (which is a more logical location for them). - Initial test added for measure_text_edit_distance ### Test `sh ./test_unstructured_ingest/evaluation-metrics.sh text-extraction` functionality is unchanged. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: shreyanid <shreyanid@users.noreply.github.com> Co-authored-by: Trevor Bossert <37596773+tabossert@users.noreply.github.com> 2023-11-07 11:54:22 -08:00			`import pytest`

			`from unstructured.metrics.evaluate import (`
			`measure_text_edit_distance,`
			`)`

			`is_in_docker = os.path.exists("/.dockerenv")`

			`EXAMPLE_DOCS_DIRECTORY = os.path.join(`
			`pathlib.Path(__file__).parent.resolve(), "..", "..", "example-docs"`
			`)`
			`TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")`

			`UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"`
			`GOLD_CCT_DIRNAME = "gold_standard_cct"`


			`@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")`
			`def test_text_extraction_takes_list():`
			`output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)`
			`output_list = ["currency.csv.json"]`
			`source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)`
			`export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")`
			`measure_text_edit_distance(`
			`output_dir=output_dir,`
			`source_dir=source_dir,`
			`output_list=output_list,`
			`export_dir=export_dir,`
			`)`
			`# check that only the listed files are included`
			`with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:`
			`lines = f.read().splitlines()`
			`assert len(lines) == len(output_list) + 1 # includes header`
refactor: measure_text_edit_distance function for aggregation (#2108) - Refactor `metrics/evaluation.py` to accept `grouping` as a parameter. - Switch to `DataFrame` for easier analysis and aggregation. 2023-11-22 16:30:16 -05:00

			`@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")`
			`def test_text_extraction_grouping():`
			`output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)`
			`source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)`
			`export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")`
			`measure_text_edit_distance(`
			`output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"`
			`)`
			`df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")`
			`assert len(df) == 4`