feat: add visualize param to command and add test (#2178)

- Add a `visualize` parameter to the click evaluation commands -- pass the `--visualize` flag to show a progress bar while documents are processed.
- Rename `measure_text_edit_distance` to `measure_text_extraction_accuracy` (along with the corresponding CLI command and shell-script references) to better reflect what is measured.
Klaijan 2023-11-28 20:05:55 -05:00 committed by GitHub
parent 50b1431c9e
commit 0aae1faa54
8 changed files with 6418 additions and 97 deletions
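
The `--visualize` option described in the commit message above is an ordinary click boolean flag that, when set, turns on a progress bar while documents are evaluated. Below is a minimal, self-contained sketch of that pattern; the `demo` command, the tqdm-based bar, and the sleep placeholder are illustrative assumptions, not the project's actual evaluation code.

```python
import time

import click
from tqdm import tqdm


@click.command()
@click.option(
    "--visualize",
    is_flag=True,
    show_default=True,
    default=False,
    help="Add the flag to show progress bar.",
)
def demo(visualize: bool) -> None:
    """Toy command: iterate over fake documents, optionally with a progress bar."""
    docs = [f"doc-{i}" for i in range(5)]
    # Wrap the iterable in tqdm only when --visualize was passed.
    iterator = tqdm(docs, desc="evaluating") if visualize else docs
    for _ in iterator:
        time.sleep(0.1)  # stand-in for per-document evaluation work


if __name__ == "__main__":
    demo()
```

Running the script with `--visualize` shows the bar; omitting the flag keeps the output quiet.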


@@ -13,7 +13,7 @@
* **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. Do not consider text node of a `<style>` element as textual content.
* **Fix DOCX merged table cell repeats cell text.** Only include text for a merged cell, not for each underlying cell spanned by the merge.
* **Fix tables not extracted from DOCX header/footers.** Headers and footers in DOCX documents skip tables defined in the header and commonly used for layout/alignment purposes. Extract text from tables as a string and include in the `Header` and `Footer` document elements.
* **Fix output filepath for fsspec-based source connectors** Previously the base directory was being included in the output filepath unnecessarily.
* **Fix output filepath for fsspec-based source connectors.** Previously the base directory was being included in the output filepath unnecessarily.
## 0.11.0

File diff suppressed because it is too large.


@@ -5,7 +5,8 @@ import pandas as pd
import pytest
from unstructured.metrics.evaluate import (
measure_text_edit_distance,
measure_element_type_accuracy,
measure_text_extraction_accuracy,
)
is_in_docker = os.path.exists("/.dockerenv")
@@ -17,6 +18,37 @@ TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")
UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
GOLD_CCT_DIRNAME = "gold_standard_cct"
GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_text_extraction_evaluation():
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_text_extraction_accuracy(
output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
)
assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
assert len(df) == 3
assert len(df.columns) == 5
assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_element_type_evaluation():
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_element_type_accuracy(
output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
)
assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"))
df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t")
assert len(df) == 1
assert len(df.columns) == 4
assert df.iloc[0].filename == "IRS-form-1987.pdf"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@@ -25,16 +57,15 @@ def test_text_extraction_takes_list():
output_list = ["currency.csv.json"]
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_text_edit_distance(
measure_text_extraction_accuracy(
output_dir=output_dir,
source_dir=source_dir,
output_list=output_list,
export_dir=export_dir,
)
# check that only the listed files are included
with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
lines = f.read().splitlines()
assert len(lines) == len(output_list) + 1 # includes header
df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
assert len(df) == len(output_list)
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@@ -42,8 +73,8 @@ def test_text_extraction_grouping():
output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
measure_text_edit_distance(
measure_text_extraction_accuracy(
output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
)
df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
assert len(df) == 4
assert len(df) == 4 # metrics row and doctype rows


@@ -13,7 +13,7 @@ mkdir -p "$OUTPUT_DIR"
EVAL_NAME="$1"
if [ "$EVAL_NAME" == "text-extraction" ]; then
METRIC_STRATEGY="measure-text-edit-distance-command"
METRIC_STRATEGY="measure-text-extraction-accuracy-command"
elif [ "$EVAL_NAME" == "element-type" ]; then
METRIC_STRATEGY="measure-element-type-accuracy-command"
else
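
The `METRIC_STRATEGY` value chosen here has to match the click command name exactly. Since click 7.0, a command's default name is the decorated function's name with underscores replaced by dashes, so renaming the function to `measure_text_extraction_accuracy_command` in the CLI module is what forces this script's string to change as well. A small sketch of that naming behavior (the `main` group mirrors the CLI module; the echo body is a placeholder):

```python
import click


@click.group()
def main():
    pass


# Registered under the CLI name "measure-text-extraction-accuracy-command":
# click derives it from the function name, converting underscores to dashes.
@main.command()
def measure_text_extraction_accuracy_command():
    click.echo("running text extraction accuracy evaluation")


if __name__ == "__main__":
    main()
```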


@@ -4,7 +4,10 @@ from typing import List, Optional, Tuple
import click
from unstructured.metrics.evaluate import measure_element_type_accuracy, measure_text_edit_distance
from unstructured.metrics.evaluate import (
measure_element_type_accuracy,
measure_text_extraction_accuracy,
)
@click.group()
@@ -42,10 +45,17 @@ def main():
type=(int, int, int),
default=(2, 1, 1),
show_default=True,
help="A tuple of weights to the Levenshtein distance calculation. \
help="A list of weights to the Levenshtein distance calculation. Takes input as --weights 2 2 2\
See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_text_edit_distance_command(
@click.option(
"--visualize",
is_flag=True,
show_default=True,
default=False,
help="Add the flag to show progress bar.",
)
def measure_text_extraction_accuracy_command(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]],
@@ -53,9 +63,10 @@ def measure_text_edit_distance_command(
export_dir: str,
grouping: Optional[str],
weights: Tuple[int, int, int],
visualize: bool,
):
return measure_text_edit_distance(
output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
return measure_text_extraction_accuracy(
output_dir, source_dir, output_list, source_list, export_dir, grouping, weights, visualize
)
@@ -83,15 +94,23 @@ def measure_text_edit_distance_command(
help="Directory to save the output evaluation metrics to. Default to \
your/working/dir/metrics/",
)
@click.option(
"--visualize",
is_flag=True,
show_default=True,
default=False,
help="Add the flag to show progress bar.",
)
def measure_element_type_accuracy_command(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]],
source_list: Optional[List[str]],
export_dir: str,
visualize: bool,
):
return measure_element_type_accuracy(
output_dir, source_dir, output_list, source_list, export_dir
output_dir, source_dir, output_list, source_list, export_dir, visualize
)
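
A hedged example of exercising the renamed command together with the new flag via click's test runner follows. The command name, option names, and `--visualize` flag come from the diffs above; the import path `unstructured.ingest.evaluate` and the directory values are assumptions and must point at real output/gold directories for a clean run.

```python
from click.testing import CliRunner

# Assumed location of the CLI group shown in the diff above.
from unstructured.ingest.evaluate import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "measure-text-extraction-accuracy-command",
        "--output_dir", "unstructured_output",  # placeholder directories
        "--source_dir", "gold_standard_cct",
        "--export_dir", "metrics_results",
        "--visualize",  # new flag added by this commit
    ],
)
print(result.exit_code, result.output)
```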


@@ -1,81 +0,0 @@
import os
from typing import List, Optional, Tuple
import click
import pandas as pd
from unstructured.metrics.evaluate import measure_text_edit_distance
@click.group()
def main():
pass
def aggregate_cct_data_by_doctype(results_dir: str):
# load tsv into dataframe
df = pd.read_csv(os.path.join(results_dir, "all-docs-cct.tsv"), sep="\t", header=0)
# group by doctype and calculate stats
agg_df = df.groupby("doctype").agg(
{"cct-accuracy": ["mean", "std", "count"], "cct-%missing": ["mean", "std", "count"]}
)
# write results to same export results folder
agg_df.to_csv(os.path.join(results_dir, "all-doctypes-agg-cct.tsv"))
@main.command()
@click.option("--output_dir", type=str, help="Directory to structured output.")
@click.option("--source_dir", type=str, help="Directory to source.")
@click.option(
"--output_list",
type=str,
multiple=True,
help="Optional: list of selected structured output file names under the \
directory to be evaluate. If none, all files under directory will be use.",
)
@click.option(
"--source_list",
type=str,
multiple=True,
help="Optional: list of selected source file names under the directory \
to be evaluate. If none, all files under directory will be use.",
)
@click.option(
"--export_dir",
type=str,
default="metrics_results",
help="Directory to save the output evaluation metrics to. Default to \
[your_working_dir]/metrics_results/",
)
@click.option(
"--weights",
type=(int, int, int),
default=(2, 1, 1),
show_default=True,
help="A tuple of weights to the Levenshtein distance calculation. \
See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_holistic_eval_cct(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]],
source_list: Optional[List[str]],
export_dir: str,
weights: Tuple[int, int, int],
) -> None:
export_dir = "result_doctype_aggregate"
measure_text_edit_distance(
output_dir=output_dir,
source_dir=source_dir,
output_list=output_list,
source_list=source_list,
export_dir=export_dir,
weights=weights,
)
aggregate_cct_data_by_doctype(export_dir)
if __name__ == "__main__":
main()
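
The deleted module above wrapped `measure_text_edit_distance` and then aggregated the per-document CCT scores by doctype with pandas. After this commit a comparable doctype aggregation is reachable directly through the `grouping` parameter of the renamed function, as the new `test_text_extraction_grouping` test shows; a hedged equivalent call (the directory values are placeholders):

```python
from unstructured.metrics.evaluate import measure_text_extraction_accuracy

# grouping="doctype" writes all-doctype-agg-cct.tsv under export_dir.
measure_text_extraction_accuracy(
    output_dir="unstructured_output",  # placeholder paths
    source_dir="gold_standard_cct",
    export_dir="metrics_results",
    grouping="doctype",
)
```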


@@ -33,7 +33,7 @@ logger.setLevel(logging.DEBUG)
agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]
def measure_text_edit_distance(
def measure_text_extraction_accuracy(
output_dir: str,
source_dir: str,
output_list: Optional[List[str]] = None,