mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
build: element type frequency evaluation metrics workflow in ci (#1862)
**Executive Summary** Measures element type frequency accuracy by comparing the output of the current version of the code against the expected output. The performance is reported as TSV files under `metrics`. **Technical Details** - The evaluation measures element type frequencies from `structured-output-eval` against `expected-structured-output` - `evaluation.py` has been edited to support function calling using `click.group()` and `command()` - `evaluation-ingest-cp.sh` is now added to all the `test-ingest-xx.sh` scripts **Outputs** Two TSV files are saved and the aggregated score is displayed.  --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: Klaijan <Klaijan@users.noreply.github.com> Co-authored-by: Yao You <theyaoyou@gmail.com>
This commit is contained in:
parent
f273a7cb83
commit
466255eec3
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* **Add element type CI evaluation workflow** Adds element type frequency evaluation metrics to the current ingest workflow to measure the performance of each file extracted as well as aggregated-level performance.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
@ -17,7 +17,7 @@ selected_outputs=$(cat "$SCRIPT_DIR/metrics/metrics-json-manifest.txt")
|
|||||||
# If structured output file in this connector's outputs match the
|
# If structured output file in this connector's outputs match the
|
||||||
# selected outputs in the txt file, copy to the destination
|
# selected outputs in the txt file, copy to the destination
|
||||||
for file in "${structured_outputs[@]}"; do
|
for file in "${structured_outputs[@]}"; do
|
||||||
if [[ "${selected_outputs[*]}" =~ $(basename "$file") ]] ; then
|
if [[ -f "$file" && "${selected_outputs[*]}" =~ $(basename "$file") ]] ; then
|
||||||
echo "--- Copying $file to $CP_DIR ---"
|
echo "--- Copying $file to $CP_DIR ---"
|
||||||
cp "$file" "$CP_DIR"
|
cp "$file" "$CP_DIR"
|
||||||
fi
|
fi
|
||||||
|
@ -9,23 +9,36 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output-eval
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output-eval
|
||||||
mkdir -p "$OUTPUT_DIR"
|
mkdir -p "$OUTPUT_DIR"
|
||||||
|
|
||||||
|
EVAL_NAME="$1"
|
||||||
|
|
||||||
# Download cct test from s3
|
# Download cct test from s3
|
||||||
BUCKET_NAME=utic-dev-tech-fixtures
|
BUCKET_NAME=utic-dev-tech-fixtures
|
||||||
FOLDER_NAME=small-cct
|
FOLDER_NAME=small-eval-"$EVAL_NAME"
|
||||||
CCT_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME
|
LOCAL_EVAL_SOURCE_DIR=$SCRIPT_DIR/gold-standard/$FOLDER_NAME
|
||||||
mkdir -p "$CCT_DIR"
|
mkdir -p "$LOCAL_EVAL_SOURCE_DIR"
|
||||||
aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$CCT_DIR" --recursive --no-sign-request --region us-east-2
|
aws s3 cp "s3://$BUCKET_NAME/$FOLDER_NAME" "$LOCAL_EVAL_SOURCE_DIR" --recursive --no-sign-request --region us-east-2
|
||||||
|
|
||||||
|
EXPORT_DIR="$SCRIPT_DIR"/metrics
|
||||||
|
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source "$SCRIPT_DIR"/cleanup.sh
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
function cleanup() {
|
function cleanup() {
|
||||||
cleanup_dir "$OUTPUT_DIR"
|
cleanup_dir "$OUTPUT_DIR"
|
||||||
cleanup_dir "$CCT_DIR"
|
cleanup_dir "$LOCAL_EVAL_SOURCE_DIR"
|
||||||
}
|
}
|
||||||
trap cleanup EXIT
|
trap cleanup EXIT
|
||||||
|
|
||||||
EXPORT_DIR="$SCRIPT_DIR"/metrics
|
if [ "$EVAL_NAME" == "text-extraction" ]; then
|
||||||
|
STRATEGY="measure-text-edit-distance"
|
||||||
|
elif [ "$EVAL_NAME" == "element-type" ]; then
|
||||||
|
STRATEGY="measure-element-type-accuracy"
|
||||||
|
else
|
||||||
|
echo "Wrong evaluation strategy given. Got [ $EVAL_NAME ]."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/evaluate.py \
|
PYTHONPATH=. ./unstructured/ingest/evaluate.py \
|
||||||
|
$STRATEGY \
|
||||||
--output_dir "$OUTPUT_DIR" \
|
--output_dir "$OUTPUT_DIR" \
|
||||||
--source_dir "$CCT_DIR" \
|
--source_dir "$LOCAL_EVAL_SOURCE_DIR" \
|
||||||
--export_dir "$EXPORT_DIR"
|
--export_dir "$EXPORT_DIR"
|
@ -1,3 +1,3 @@
|
|||||||
strategy average sample_sd population_sd count
|
strategy average sample_sd population_sd count
|
||||||
cct-accuracy 0.774 0.124 0.087 2
|
cct-accuracy 0.777 0.088 0.072 3
|
||||||
cct-%missing 0.065 0.035 0.025 2
|
cct-%missing 0.087 0.045 0.037 3
|
||||||
|
|
@ -0,0 +1,2 @@
|
|||||||
|
strategy average sample_sd population_sd count
|
||||||
|
element-type-accuracy 0
|
|
@ -1,3 +1,4 @@
|
|||||||
filename connector cct-accuracy cct-%missing
|
filename connector cct-accuracy cct-%missing
|
||||||
|
IRS-form-1987.pdf azure 0.783 0.13
|
||||||
example-10k.html local 0.686 0.04
|
example-10k.html local 0.686 0.04
|
||||||
science-exploration-1p.pptx box 0.861 0.09
|
science-exploration-1p.pptx box 0.861 0.09
|
||||||
|
|
@ -0,0 +1 @@
|
|||||||
|
filename connector element-type-accuracy
|
|
@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -48,3 +48,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--verbose
|
--verbose
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -39,3 +39,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -45,3 +45,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -49,3 +49,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
|
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -41,3 +41,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -48,3 +48,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -34,3 +34,5 @@ set +e
|
|||||||
# once we have an alternative encoder that is deterministic, we test the diff here
|
# once we have an alternative encoder that is deterministic, we test the diff here
|
||||||
# until then just validating the file was created
|
# until then just validating the file was created
|
||||||
"$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
|
"$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -47,3 +47,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -52,3 +52,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
$ACCESS_TOKEN_FLAGS
|
$ACCESS_TOKEN_FLAGS
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -38,3 +38,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -48,3 +48,5 @@ PYTHONPATH=. unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -68,3 +68,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -31,3 +31,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
set +e
|
set +e
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -33,3 +33,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
set +e
|
set +e
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -32,3 +32,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
set +e
|
set +e
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -42,3 +42,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -46,3 +46,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -42,3 +42,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -35,3 +35,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--uncompress
|
--uncompress
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -47,3 +47,5 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key PYTHONPATH=. ./u
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -37,3 +37,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -58,3 +58,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -54,3 +54,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -44,3 +44,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -35,3 +35,5 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--work-dir "$WORK_DIR"
|
--work-dir "$WORK_DIR"
|
||||||
|
|
||||||
"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||||
|
@ -97,6 +97,13 @@ for test in "${all_tests[@]}"; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "--------- RUNNING SCRIPT evaluation-metrics.sh ---------"
|
all_eval=(
|
||||||
./test_unstructured_ingest/evaluation-metrics.sh
|
'text-extraction'
|
||||||
echo "--------- FINISHED SCRIPT evaluation-metrics.sh ---------"
|
'element-type'
|
||||||
|
)
|
||||||
|
for eval in "${all_eval[@]}"; do
|
||||||
|
CURRENT_TEST="$eval"
|
||||||
|
echo "--------- RUNNING SCRIPT $eval ---------"
|
||||||
|
./test_unstructured_ingest/evaluation-metrics.sh "$eval"
|
||||||
|
echo "--------- FINISHED SCRIPT $eval ---------"
|
||||||
|
done
|
@ -8,6 +8,10 @@ from typing import Any, List, Optional, Tuple
|
|||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
||||||
|
from unstructured.metrics.element_type import (
|
||||||
|
calculate_element_type_percent_match,
|
||||||
|
get_element_type_frequency,
|
||||||
|
)
|
||||||
from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
|
from unstructured.metrics.text_extraction import calculate_accuracy, calculate_percent_missing_text
|
||||||
from unstructured.staging.base import elements_from_json, elements_to_text
|
from unstructured.staging.base import elements_from_json, elements_to_text
|
||||||
|
|
||||||
@ -24,7 +28,12 @@ if "ingest_log_handler" not in [h.name for h in logger.handlers]:
|
|||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.group()
|
||||||
|
def main():
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
|
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
|
||||||
@click.option(
|
@click.option(
|
||||||
"--output_list",
|
"--output_list",
|
||||||
@ -56,7 +65,7 @@ logger.setLevel(logging.DEBUG)
|
|||||||
help="A tuple of weights to the Levenshtein distance calculation. \
|
help="A tuple of weights to the Levenshtein distance calculation. \
|
||||||
See text_extraction.py/calculate_edit_distance for more details.",
|
See text_extraction.py/calculate_edit_distance for more details.",
|
||||||
)
|
)
|
||||||
def measure_edit_distance(
|
def measure_text_edit_distance(
|
||||||
output_dir: str,
|
output_dir: str,
|
||||||
output_list: Optional[List[str]],
|
output_list: Optional[List[str]],
|
||||||
source_dir: str,
|
source_dir: str,
|
||||||
@ -123,6 +132,74 @@ def measure_edit_distance(
|
|||||||
_display(agg_rows, headers)
|
_display(agg_rows, headers)
|
||||||
|
|
||||||
|
|
||||||
|
@main.command()
|
||||||
|
@click.option("--output_dir", type=click.STRING, help="Directory to a structured output.")
|
||||||
|
@click.option(
|
||||||
|
"--output_list",
|
||||||
|
type=click.STRING,
|
||||||
|
multiple=True,
|
||||||
|
help="Optional: list of selected structured output file names under the \
|
||||||
|
directory to be evaluate. If none, all files under directory will be use.",
|
||||||
|
)
|
||||||
|
@click.option("--source_dir", type=click.STRING, help="Directory to a structured source.")
|
||||||
|
@click.option(
|
||||||
|
"--source_list",
|
||||||
|
type=click.STRING,
|
||||||
|
multiple=True,
|
||||||
|
help="Optional: list of selected structured source file names under the directory \
|
||||||
|
to be evaluate. If none, all files under directory will be use.",
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--export_dir",
|
||||||
|
type=click.STRING,
|
||||||
|
default="metrics",
|
||||||
|
help="Directory to save the output evaluation metrics to. Default to \
|
||||||
|
[your_working_dir]/metrics/",
|
||||||
|
)
|
||||||
|
def measure_element_type_accuracy(
|
||||||
|
output_dir: str,
|
||||||
|
output_list: Optional[List[str]],
|
||||||
|
source_dir: str,
|
||||||
|
source_list: Optional[List[str]],
|
||||||
|
export_dir: str,
|
||||||
|
):
|
||||||
|
if not output_list:
|
||||||
|
output_list = _listdir_recursive(output_dir)
|
||||||
|
if not source_list:
|
||||||
|
source_list = _listdir_recursive(source_dir)
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
accuracy_scores: List[float] = []
|
||||||
|
|
||||||
|
for doc in output_list: # type: ignore
|
||||||
|
fn = (doc.split("/")[-1]).split(".json")[0]
|
||||||
|
fn_json = fn + ".json"
|
||||||
|
connector = doc.split("/")[0]
|
||||||
|
if fn_json in source_list: # type: ignore
|
||||||
|
output = get_element_type_frequency(_read_text(os.path.join(output_dir, doc)))
|
||||||
|
source = get_element_type_frequency(_read_text(os.path.join(source_dir, fn_json)))
|
||||||
|
accuracy = round(calculate_element_type_percent_match(output, source), 3)
|
||||||
|
rows.append([fn, connector, accuracy])
|
||||||
|
accuracy_scores.append(accuracy)
|
||||||
|
|
||||||
|
headers = ["filename", "connector", "element-type-accuracy"]
|
||||||
|
_write_to_file(export_dir, "all-docs-element-type.tsv", rows, headers)
|
||||||
|
|
||||||
|
headers = ["strategy", "average", "sample_sd", "population_sd", "count"]
|
||||||
|
agg_rows = []
|
||||||
|
agg_rows.append(
|
||||||
|
[
|
||||||
|
"element-type-accuracy",
|
||||||
|
_mean(accuracy_scores),
|
||||||
|
_stdev(accuracy_scores),
|
||||||
|
_pstdev(accuracy_scores),
|
||||||
|
len(accuracy_scores),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
_write_to_file(export_dir, "aggregate-scores-element-type.tsv", agg_rows, headers)
|
||||||
|
_display(agg_rows, headers)
|
||||||
|
|
||||||
|
|
||||||
def _listdir_recursive(dir: str):
|
def _listdir_recursive(dir: str):
|
||||||
listdir = []
|
listdir = []
|
||||||
for dirpath, _, filenames in os.walk(dir):
|
for dirpath, _, filenames in os.walk(dir):
|
||||||
@ -164,9 +241,15 @@ def _display(rows, headers):
|
|||||||
|
|
||||||
|
|
||||||
def _mean(scores: List[float], rounding: Optional[int] = 3):
|
def _mean(scores: List[float], rounding: Optional[int] = 3):
|
||||||
|
if len(scores) < 1:
|
||||||
|
return None
|
||||||
|
elif len(scores) == 1:
|
||||||
|
mean = scores[0]
|
||||||
|
else:
|
||||||
|
mean = statistics.mean(scores)
|
||||||
if not rounding:
|
if not rounding:
|
||||||
return statistics.mean(scores)
|
return mean
|
||||||
return round(statistics.mean(scores), rounding)
|
return round(mean, rounding)
|
||||||
|
|
||||||
|
|
||||||
def _stdev(scores: List[float], rounding: Optional[int] = 3):
|
def _stdev(scores: List[float], rounding: Optional[int] = 3):
|
||||||
@ -185,5 +268,11 @@ def _pstdev(scores: List[float], rounding: Optional[int] = 3):
|
|||||||
return round(statistics.pstdev(scores), rounding)
|
return round(statistics.pstdev(scores), rounding)
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text(path):
|
||||||
|
with open(path) as f:
|
||||||
|
text = f.read()
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
measure_edit_distance()
|
main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user