mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: add visualize param to command and add test (#2178)
- Add a `visualize` parameter to the click commands -- now callable with the `--visualize` flag to show a progress bar (a minimal sketch of the wiring follows the commit metadata below).
- Rename `measure_text_edit_distance` to `measure_text_extraction_accuracy`, along with its CLI command and the tests that call it.
This commit is contained in:
parent 50b1431c9e
commit 0aae1faa54
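The change is standard click wiring: `is_flag=True` turns `--visualize` into a boolean switch that defaults to `False` and flips to `True` when the flag is passed. A minimal self-contained sketch of the same pattern (the tqdm progress bar is an assumption for illustration; the diff below only shows `visualize` being threaded through to `measure_text_extraction_accuracy`):

```python
import click
from tqdm import tqdm  # assumed for illustration; the diff doesn't show which progress bar is used


@click.group()
def main():
    pass


@main.command()
@click.option(
    "--visualize",
    is_flag=True,  # boolean switch: flag present -> True, absent -> False
    show_default=True,
    default=False,
    help="Add the flag to show a progress bar.",
)
def measure_text_extraction_accuracy_command(visualize: bool):
    docs = ["doc-a.json", "doc-b.json", "doc-c.json"]  # stand-in inputs
    for doc in tqdm(docs, disable=not visualize):  # bar only renders with --visualize
        pass  # score each document here


if __name__ == "__main__":
    main()
```

Click derives the CLI name from the function name, so this is invoked as `measure-text-extraction-accuracy-command --visualize` -- exactly the `METRIC_STRATEGY` string the shell-script hunk below switches on.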
@@ -13,7 +13,7 @@
 * **Do not extract text of `<style>` tags in HTML.** `<style>` tags containing CSS in invalid positions previously contributed to element text. Do not consider the text node of a `<style>` element as textual content.
 * **Fix DOCX merged table cell repeats cell text.** Only include text for a merged cell once, not for each underlying cell spanned by the merge (one way to do this is sketched after this hunk).
 * **Fix tables not extracted from DOCX header/footers.** Headers and footers in DOCX documents skip tables defined in the header and commonly used for layout/alignment purposes. Extract text from tables as a string and include it in the `Header` and `Footer` document elements.
-* **Fix output filepath for fsspec-based source connectors** Previously the base directory was being included in the output filepath unnecessarily.
+* **Fix output filepath for fsspec-based source connectors.** Previously the base directory was being included in the output filepath unnecessarily.
 
 ## 0.11.0
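The merged-cell fix in the changelog comes down to deduplication: python-docx resolves every grid position spanned by a merge to the same underlying `<w:tc>` element, so emitting each element only once yields the merged text a single time. A hedged sketch of the idea, not necessarily the library's exact code:

```python
from docx import Document  # python-docx


def iter_table_texts(table):
    """Yield each cell's text once, even when a merge spans several grid positions."""
    seen = set()
    for row in table.rows:
        for cell in row.cells:
            # Spanned positions in a merge all return the same underlying
            # <w:tc> element (cell._tc is python-docx private API), so its
            # id() works as an "already emitted?" key.
            if id(cell._tc) in seen:
                continue
            seen.add(id(cell._tc))
            yield cell.text


document = Document("example.docx")  # placeholder filename
for table in document.tables:
    print(" ".join(iter_table_texts(table)))
```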
Two file diffs suppressed because they are too large.
@@ -5,7 +5,8 @@ import pandas as pd
 import pytest
 
 from unstructured.metrics.evaluate import (
-    measure_text_edit_distance,
     measure_element_type_accuracy,
+    measure_text_extraction_accuracy,
 )
 
 is_in_docker = os.path.exists("/.dockerenv")
@@ -17,6 +18,37 @@ TESTING_FILE_DIR = os.path.join(EXAMPLE_DOCS_DIRECTORY, "test_evaluate_files")
 
 UNSTRUCTURED_OUTPUT_DIRNAME = "unstructured_output"
 GOLD_CCT_DIRNAME = "gold_standard_cct"
+GOLD_ELEMENT_TYPE_DIRNAME = "gold_standard_element_type"
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_text_extraction_evaluation():
+    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
+    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
+    measure_text_extraction_accuracy(
+        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
+    )
+    assert os.path.isfile(os.path.join(export_dir, "all-docs-cct.tsv"))
+    df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
+    assert len(df) == 3
+    assert len(df.columns) == 5
+    assert df.iloc[0].filename == "Bank Good Credit Loan.pptx"
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+def test_element_type_evaluation():
+    output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
+    source_dir = os.path.join(TESTING_FILE_DIR, GOLD_ELEMENT_TYPE_DIRNAME)
+    export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
+    measure_element_type_accuracy(
+        output_dir=output_dir, source_dir=source_dir, export_dir=export_dir
+    )
+    assert os.path.isfile(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"))
+    df = pd.read_csv(os.path.join(export_dir, "all-docs-element-type-frequency.tsv"), sep="\t")
+    assert len(df) == 1
+    assert len(df.columns) == 4
+    assert df.iloc[0].filename == "IRS-form-1987.pdf"
+
+
 @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@@ -25,16 +57,15 @@ def test_text_extraction_takes_list():
     output_list = ["currency.csv.json"]
     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
-    measure_text_edit_distance(
+    measure_text_extraction_accuracy(
         output_dir=output_dir,
         source_dir=source_dir,
         output_list=output_list,
         export_dir=export_dir,
     )
     # check that only the listed files are included
     with open(os.path.join(export_dir, "all-docs-cct.tsv")) as f:
         lines = f.read().splitlines()
     assert len(lines) == len(output_list) + 1  # includes header
     df = pd.read_csv(os.path.join(export_dir, "all-docs-cct.tsv"), sep="\t")
     assert len(df) == len(output_list)
 
 
 @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@@ -42,8 +73,8 @@ def test_text_extraction_grouping():
     output_dir = os.path.join(TESTING_FILE_DIR, UNSTRUCTURED_OUTPUT_DIRNAME)
     source_dir = os.path.join(TESTING_FILE_DIR, GOLD_CCT_DIRNAME)
     export_dir = os.path.join(TESTING_FILE_DIR, "test_evaluate_results_cct")
-    measure_text_edit_distance(
+    measure_text_extraction_accuracy(
         output_dir=output_dir, source_dir=source_dir, export_dir=export_dir, grouping="doctype"
     )
     df = pd.read_csv(os.path.join(export_dir, "all-doctype-agg-cct.tsv"), sep="\t")
-    assert len(df) == 4
+    assert len(df) == 4  # metrics row and doctype rows
@@ -13,7 +13,7 @@ mkdir -p "$OUTPUT_DIR"
 EVAL_NAME="$1"
 
 if [ "$EVAL_NAME" == "text-extraction" ]; then
-    METRIC_STRATEGY="measure-text-edit-distance-command"
+    METRIC_STRATEGY="measure-text-extraction-accuracy-command"
 elif [ "$EVAL_NAME" == "element-type" ]; then
     METRIC_STRATEGY="measure-element-type-accuracy-command"
 else
@@ -4,7 +4,10 @@ from typing import List, Optional, Tuple
 
 import click
 
-from unstructured.metrics.evaluate import measure_element_type_accuracy, measure_text_edit_distance
+from unstructured.metrics.evaluate import (
+    measure_element_type_accuracy,
+    measure_text_extraction_accuracy,
+)
 
 
 @click.group()
@@ -42,10 +45,17 @@ def main():
     type=(int, int, int),
     default=(2, 1, 1),
     show_default=True,
-    help="A tuple of weights to the Levenshtein distance calculation. \
+    help="A list of weights to the Levenshtein distance calculation. Takes input as --weights 2 2 2 \
         See text_extraction.py/calculate_edit_distance for more details.",
 )
-def measure_text_edit_distance_command(
+@click.option(
+    "--visualize",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help="Add the flag to show a progress bar.",
+)
+def measure_text_extraction_accuracy_command(
     output_dir: str,
     source_dir: str,
     output_list: Optional[List[str]],
@@ -53,9 +63,10 @@ def measure_text_edit_distance_command(
     export_dir: str,
     grouping: Optional[str],
     weights: Tuple[int, int, int],
+    visualize: bool,
 ):
-    return measure_text_edit_distance(
-        output_dir, source_dir, output_list, source_list, export_dir, grouping, weights
+    return measure_text_extraction_accuracy(
+        output_dir, source_dir, output_list, source_list, export_dir, grouping, weights, visualize
     )
 
 
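On `--weights`: the help text above describes a three-integer weight tuple fed into the Levenshtein calculation in `text_extraction.py/calculate_edit_distance`. A minimal sketch of a weighted edit distance, assuming the conventional (insertion, deletion, substitution) ordering for the default `(2, 1, 1)` -- an illustration, not the library's implementation:

```python
def weighted_edit_distance(source: str, target: str, weights=(2, 1, 1)) -> int:
    """Wagner-Fischer edit distance with per-operation costs (hypothetical helper)."""
    ins_w, del_w, sub_w = weights  # assumed (insertion, deletion, substitution) order
    m, n = len(source), len(target)
    # d[i][j] = cheapest way to turn source[:i] into target[:j]
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        d[i][0] = i * del_w  # delete all of source[:i]
    for j in range(1, n + 1):
        d[0][j] = j * ins_w  # insert all of target[:j]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sub_cost = 0 if source[i - 1] == target[j - 1] else sub_w
            d[i][j] = min(
                d[i - 1][j] + del_w,         # delete source[i-1]
                d[i][j - 1] + ins_w,         # insert target[j-1]
                d[i - 1][j - 1] + sub_cost,  # substitute (or free match)
            )
    return d[m][n]
```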
@@ -83,15 +94,23 @@ def measure_text_edit_distance_command(
     help="Directory to save the output evaluation metrics to. Default to \
         your/working/dir/metrics/",
 )
+@click.option(
+    "--visualize",
+    is_flag=True,
+    show_default=True,
+    default=False,
+    help="Add the flag to show a progress bar.",
+)
 def measure_element_type_accuracy_command(
     output_dir: str,
     source_dir: str,
     output_list: Optional[List[str]],
     source_list: Optional[List[str]],
     export_dir: str,
+    visualize: bool,
 ):
     return measure_element_type_accuracy(
-        output_dir, source_dir, output_list, source_list, export_dir
+        output_dir, source_dir, output_list, source_list, export_dir, visualize
     )
 
 
@@ -1,81 +0,0 @@
-import os
-from typing import List, Optional, Tuple
-
-import click
-import pandas as pd
-
-from unstructured.metrics.evaluate import measure_text_edit_distance
-
-
-@click.group()
-def main():
-    pass
-
-
-def aggregate_cct_data_by_doctype(results_dir: str):
-    # load tsv into dataframe
-    df = pd.read_csv(os.path.join(results_dir, "all-docs-cct.tsv"), sep="\t", header=0)
-
-    # group by doctype and calculate stats
-    agg_df = df.groupby("doctype").agg(
-        {"cct-accuracy": ["mean", "std", "count"], "cct-%missing": ["mean", "std", "count"]}
-    )
-
-    # write results to same export results folder
-    agg_df.to_csv(os.path.join(results_dir, "all-doctypes-agg-cct.tsv"))
-
-
-@main.command()
-@click.option("--output_dir", type=str, help="Directory to structured output.")
-@click.option("--source_dir", type=str, help="Directory to source.")
-@click.option(
-    "--output_list",
-    type=str,
-    multiple=True,
-    help="Optional: list of selected structured output file names under the \
-        directory to be evaluate. If none, all files under directory will be use.",
-)
-@click.option(
-    "--source_list",
-    type=str,
-    multiple=True,
-    help="Optional: list of selected source file names under the directory \
-        to be evaluate. If none, all files under directory will be use.",
-)
-@click.option(
-    "--export_dir",
-    type=str,
-    default="metrics_results",
-    help="Directory to save the output evaluation metrics to. Default to \
-        [your_working_dir]/metrics_results/",
-)
-@click.option(
-    "--weights",
-    type=(int, int, int),
-    default=(2, 1, 1),
-    show_default=True,
-    help="A tuple of weights to the Levenshtein distance calculation. \
-        See text_extraction.py/calculate_edit_distance for more details.",
-)
-def measure_holistic_eval_cct(
-    output_dir: str,
-    source_dir: str,
-    output_list: Optional[List[str]],
-    source_list: Optional[List[str]],
-    export_dir: str,
-    weights: Tuple[int, int, int],
-) -> None:
-    export_dir = "result_doctype_aggregate"
-    measure_text_edit_distance(
-        output_dir=output_dir,
-        source_dir=source_dir,
-        output_list=output_list,
-        source_list=source_list,
-        export_dir=export_dir,
-        weights=weights,
-    )
-    aggregate_cct_data_by_doctype(export_dir)
-
-
-if __name__ == "__main__":
-    main()
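The deleted script above is superseded by the `grouping` parameter of the renamed function: the `test_text_extraction_grouping` test earlier in this diff calls it with `grouping="doctype"` and reads `all-doctype-agg-cct.tsv` from the export directory. A minimal equivalent of what the old `aggregate_cct_data_by_doctype` flow did (the paths are placeholders):

```python
from unstructured.metrics.evaluate import measure_text_extraction_accuracy

# One call now produces the per-doctype aggregate TSV that the deleted
# aggregate_cct_data_by_doctype() helper used to write in a second step.
measure_text_extraction_accuracy(
    output_dir="structured_output",  # placeholder: directory of structured outputs
    source_dir="gold_standard_cct",  # placeholder: directory of gold-standard CCT files
    export_dir="metrics_results",    # placeholder: where the TSVs are written
    grouping="doctype",              # writes all-doctype-agg-cct.tsv
)
```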
|
@@ -33,7 +33,7 @@ logger.setLevel(logging.DEBUG)
 agg_headers = ["metric", "average", "sample_sd", "population_sd", "count"]
 
 
-def measure_text_edit_distance(
+def measure_text_extraction_accuracy(
     output_dir: str,
     source_dir: str,
     output_list: Optional[List[str]] = None,
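The `agg_headers` context line above shows that each aggregate row carries both a sample and a population standard deviation; in pandas the difference is only the `ddof` argument. A quick illustration with made-up per-document accuracies:

```python
import pandas as pd

scores = pd.Series([0.92, 0.85, 0.99])  # made-up per-document accuracies
row = {
    "metric": "cct-accuracy",
    "average": scores.mean(),
    "sample_sd": scores.std(ddof=1),      # pandas default: divide by n - 1
    "population_sd": scores.std(ddof=0),  # divide by n
    "count": len(scores),
}
print(row)
```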