build: element type frequency evaluation metrics workflow in ci (#1862)

**Executive Summary**
Measures element type frequency accuracy by comparing the output of the
current version of the code against the expected output. The results are
reported as tsv files under `metrics`.

**Technical Details**
- The evaluation measures element type frequencies from
`structured-output-eval` against `expected-structured-output`
- `evaluate.py` has been edited to expose each evaluation as a subcommand
using `click.group()` and `command()`
- `evaluation-ingest-cp.sh` is now added to all the `test-ingest-xx.sh`
scripts

**Outputs**
2 tsv files are saved

![image](https://github.com/Unstructured-IO/unstructured/assets/2177850/b4458094-a9fc-48f9-a0bd-2ccd6985440a)

![image](https://github.com/Unstructured-IO/unstructured/assets/2177850/6d785736-bcaf-4275-bf2d-ab511cdfb3f4)
9-0e05-41d4-b69f-841a2aa131ec)
and the aggregated score is displayed.

![image](https://github.com/Unstructured-IO/unstructured/assets/2177850/9d42bd0c-a0dd-41c2-a2e5-b675a40f35cc)

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Klaijan <Klaijan@users.noreply.github.com>
Co-authored-by: Yao You <theyaoyou@gmail.com>
This commit is contained in:
Klaijan 2023-10-27 00:36:36 -04:00 committed by GitHub
parent f273a7cb83
commit 466255eec3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
39 changed files with 192 additions and 19 deletions

View File

@ -2,6 +2,8 @@
### Enhancements
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
### Features
### Fixes

View File

@ -17,7 +17,7 @@ selected_outputs=$(cat "$SCRIPT_DIR/metrics/metrics-json-manifest.txt")
# If structured output file in this connector's outputs match the
# selected outputs in the txt file, copy to the destination
for file in "${structured_outputs[@]}"; do
if [[ "${selected_outputs[*]}" =~ $(basename "$file") ]] ; then
if [[ -f "$file" && "${selected_outputs[*]}" =~ $(basename "$file") ]] ; then
echo "--- Copying $file to $CP_DIR ---"
cp "$file" "$CP_DIR"
fi

View File

@ -9,23 +9,36 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_DIR=$SCRIPT_DIR/structured-output-eval
mkdir -p "$OUTPUT_DIR"
EVAL_NAME="$1"
# Download cct test from s3
BUCKET_NAME=utic-dev-tech-fixtures
FOLDER_NAME=small-cct
CCT_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME
mkdir -p "$CCT_DIR"
aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$CCT_DIR" --recursive --no-sign-request --region us-east-2
FOLDER_NAME=small-eval-"$EVAL_NAME"
LOCAL_EVAL_SOURCE_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME
mkdir -p "$LOCAL_EVAL_SOURCE_DIR"
aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$LOCAL_EVAL_SOURCE_DIR" --recursive --no-sign-request --region us-east-2
EXPORT_DIR="$SCRIPT_DIR"/metrics
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$CCT_DIR"
cleanup_dir "$LOCAL_EVAL_SOURCE_DIR"
}
trap cleanup EXIT
EXPORT_DIR="$SCRIPT_DIR"/metrics
if [ "$EVAL_NAME" == "text-extraction" ]; then
STRATEGY="measure-text-edit-distance"
elif [ "$EVAL_NAME" == "element-type" ]; then
STRATEGY="measure-element-type-accuracy"
else
echo "Wrong evaluation strategy given. Got [ $EVAL_NAME ]."
exit 1
fi
PYTHONPATH=. ./unstructured/ingest/evaluate.py \
$STRATEGY \
--output_dir "$OUTPUT_DIR" \
--source_dir "$CCT_DIR" \
--export_dir "$EXPORT_DIR"
--source_dir "$LOCAL_EVAL_SOURCE_DIR" \
--export_dir "$EXPORT_DIR"

View File

@ -1,3 +1,3 @@
strategy average sample_sd population_sd count
cct-accuracy 0.774 0.124 0.087 2
cct-%missing 0.065 0.035 0.025 2
cct-accuracy 0.777 0.088 0.072 3
cct-%missing 0.087 0.045 0.037 3

1 strategy average sample_sd population_sd count
2 cct-accuracy 0.774 0.777 0.124 0.088 0.087 0.072 2 3
3 cct-%missing 0.065 0.087 0.035 0.045 0.025 0.037 2 3

View File

@ -0,0 +1,2 @@
strategy average sample_sd population_sd count
element-type-accuracy 0
1 strategy average sample_sd population_sd count
2 element-type-accuracy 0

View File

@ -1,3 +1,4 @@
filename connector cct-accuracy cct-%missing
IRS-form-1987.pdf azure 0.783 0.13
example-10k.html local 0.686 0.04
science-exploration-1p.pptx box 0.861 0.09

1 filename connector cct-accuracy cct-%missing
2 IRS-form-1987.pdf azure 0.783 0.13
3 example-10k.html local 0.686 0.04
4 science-exploration-1p.pptx box 0.861 0.09

View File

@ -0,0 +1 @@
filename connector element-type-accuracy
1 filename connector element-type-accuracy

View File

@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -48,3 +48,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -39,3 +39,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -45,3 +45,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -49,3 +49,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -41,3 +41,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -48,3 +48,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -34,3 +34,5 @@ set +e
# once we have an alternative encoder that is deterministic, we test the diff here
# until then just validating the file was created
"$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -47,3 +47,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -52,3 +52,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
$ACCESS_TOKEN_FLAGS
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -38,3 +38,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -48,3 +48,5 @@ PYTHONPATH=. unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -68,3 +68,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -31,3 +31,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -32,3 +32,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -42,3 +42,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -42,3 +42,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -35,3 +35,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--uncompress
"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -47,3 +47,5 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key PYTHONPATH=. ./u
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -58,3 +58,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -54,3 +54,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -44,3 +44,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -35,3 +35,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@ -97,6 +97,13 @@ for test in "${all_tests[@]}"; do
fi
done
echo "--------- RUNNING SCRIPT evaluation-metrics.sh ---------"
./test_unstructured_ingest/evaluation-metrics.sh
echo "--------- FINISHED SCRIPT evaluation-metrics.sh ---------"
all_eval=(
'text-extraction'
'element-type'
)
for eval in "${all_eval[@]}"; do
CURRENT_TEST="$eval"
echo "--------- RUNNING SCRIPT $eval ---------"
./test_unstructured_ingest/evaluation-metrics.sh "$eval"
echo "--------- FINISHED SCRIPT $eval ---------"
done

View File

@ -8,6 +8,10 @@ from typing import Any, List, Optional, Tuple
import click
from unstructured.metrics.element_type import (
calculate_element_type_percent_match,
get_element_type_frequency,
)
from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
from unstructured.staging.base import elements_from_json, elements_to_text
@ -24,7 +28,12 @@ if "ingest_log_handler" not in [h.name for h in logger.handlers]:
logger.setLevel(logging.DEBUG)
@click.command()
@click.group()
def main():
    """Evaluation entry point; each metric is registered as a subcommand of this group."""
@main.command()
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
@click.option(
"--output_list",
@ -56,7 +65,7 @@ logger.setLevel(logging.DEBUG)
help="A tuple of weights to the Levenshtein distance calculation. \
See text_extraction.py/calculate_edit_distance for more details.",
)
def measure_edit_distance(
def measure_text_edit_distance(
output_dir: str,
output_list: Optional[List[str]],
source_dir: str,
@ -123,6 +132,74 @@ def measure_edit_distance(
_display(agg_rows, headers)
@main.command()
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
@click.option(
    "--output_list",
    type=click.STRING,
    multiple=True,
    help="Optional: list of selected structured output file names under the "
    "directory to be evaluate. If none, all files under directory will be use.",
)
@click.option("--source_dir", type=click.STRING, help="Directory to a structured source.")
@click.option(
    "--source_list",
    type=click.STRING,
    multiple=True,
    help="Optional: list of selected structured source file names under the directory "
    "to be evaluate. If none, all files under directory will be use.",
)
@click.option(
    "--export_dir",
    type=click.STRING,
    default="metrics",
    help="Directory to save the output evaluation metrics to. Default to "
    "[your_working_dir]/metrics/",
)
def measure_element_type_accuracy(
    output_dir: str,
    output_list: Optional[List[str]],
    source_dir: str,
    source_list: Optional[List[str]],
    export_dir: str,
):
    """Compare element type frequencies of structured outputs against their sources.

    For every entry in `output_list` whose `<name>.json` also appears in
    `source_list`, the element type frequency of the output file is matched
    against that of the source file and the rounded percent match is recorded.
    Entries without a matching source name are ignored.

    Per-document rows are handed to `_write_to_file` as
    "all-docs-element-type.tsv" and the aggregate row (mean, sample and
    population standard deviation, count) as
    "aggregate-scores-element-type.tsv"; the aggregate row is also passed to
    `_display`.

    When `output_list` / `source_list` are empty they default to the result of
    `_listdir_recursive` on the respective directory.
    """
    if not output_list:
        output_list = _listdir_recursive(output_dir)
    if not source_list:
        source_list = _listdir_recursive(source_dir)

    # Build the lookup once: the membership test below runs for every output document.
    source_names = set(source_list)  # type: ignore

    rows = []
    accuracy_scores: List[float] = []
    for doc in output_list:  # type: ignore
        # `doc` is split on "/": the first segment is reported as the connector,
        # the last one (with the ".json" part dropped) as the document name.
        fn = (doc.split("/")[-1]).split(".json")[0]
        fn_json = fn + ".json"
        connector = doc.split("/")[0]

        if fn_json not in source_names:
            continue

        output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
        source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json)))
        accuracy = round(calculate_element_type_percent_match(output, source), 3)
        rows.append([fn, connector, accuracy])
        accuracy_scores.append(accuracy)

    headers = ["filename", "connector", "element-type-accuracy"]
    _write_to_file(export_dir, "all-docs-element-type.tsv", rows, headers)

    headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
    agg_rows = [
        [
            "element-type-accuracy",
            _mean(accuracy_scores),
            _stdev(accuracy_scores),
            _pstdev(accuracy_scores),
            len(accuracy_scores),
        ],
    ]
    _write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, headers)
    _display(agg_rows, headers)
def _listdir_recursive(dir: str):
listdir = []
for dirpath, _, filenames in os.walk(dir):
@ -164,9 +241,15 @@ def _display(rows, headers):
def _mean(scores: List[float], rounding: Optional[int] = 3):
    """Return the arithmetic mean of `scores`, or None when `scores` is empty.

    Args:
        scores: values to average.
        rounding: number of decimal places to round to. A falsy value
            (None or 0) returns the mean unrounded.
    """
    if len(scores) < 1:
        return None
    elif len(scores) == 1:
        # A single score is its own mean; return it as-is.
        mean = scores[0]
    else:
        mean = statistics.mean(scores)
    if not rounding:
        return mean
    return round(mean, rounding)
def _stdev(scores: List[float], rounding: Optional[int] = 3):
@ -185,5 +268,11 @@ def _pstdev(scores: List[float], rounding: Optional[int] = 3):
return round(statistics.pstdev(scores), rounding)
def _read_text(path):
    """Return the full contents of the file at `path` as a string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()
if __name__ == "__main__":
measure_edit_distance()
main()