Added plain-text comparison for tests (#1180)

This PR adds a comparison, during the ingest tests, of the content of the
files as plain text (i.e., without the JSON formatting).
This commit is contained in:
Benjamin Torres 2023-08-29 17:23:14 -06:00 committed by GitHub
parent 675a10ea69
commit 5052e6cb3b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 42 additions and 1 deletions

View File

@ -297,6 +297,7 @@ jobs:
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
sudo apt-get install -y tesseract-ocr-kor
# -y keeps this install non-interactive, like the other installs in this step;
# without it apt-get may stop at a confirmation prompt the CI job cannot answer.
sudo apt-get install -y diffstat
tesseract --version
make install-ingest-s3
make install-ingest-airtable

4
.gitignore vendored
View File

@ -187,7 +187,9 @@ tags
# Ruff cache
.ruff_cache/
unstructured-inference/sample-docs/*
.ppm
unstructured-inference/
example-docs/*_images
examples/**/output/
examples/**/output/

View File

@ -1,2 +1,4 @@
structured-output
download
test_unstructured_ingest/expected-text-output/**
test_unstructured_ingest/text-output/**

View File

@ -16,7 +16,9 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")")
# Keep the caller's OVERWRITE_FIXTURES value; fall back to "false" when it is
# unset or empty.
OVERWRITE_FIXTURES="${OVERWRITE_FIXTURES:-false}"
# The first positional argument names the per-run sub-folder.
OUTPUT_FOLDER_NAME="$1"
# Locations of the generated outputs: structured JSON and its text rendition.
OUTPUT_DIR="${SCRIPT_DIR}/structured-output/${OUTPUT_FOLDER_NAME}"
OUTPUT_DIR_TEXT="${SCRIPT_DIR}/text-output/${OUTPUT_FOLDER_NAME}"
# Locations of the expected (checked-in) counterparts, same two flavours.
EXPECTED_OUTPUT_DIR="${SCRIPT_DIR}/expected-structured-output/${OUTPUT_FOLDER_NAME}"
EXPECTED_OUTPUT_DIR_TEXT="${SCRIPT_DIR}/expected-text-output/${OUTPUT_FOLDER_NAME}"
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [ "$OVERWRITE_FIXTURES" != "false" ]; then
@ -27,6 +29,11 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
mkdir -p "$EXPECTED_OUTPUT_DIR"
cp -rf "$OUTPUT_DIR" "$SCRIPT_DIR/expected-structured-output"
elif ! diff -ru "$EXPECTED_OUTPUT_DIR" "$OUTPUT_DIR" ; then
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$EXPECTED_OUTPUT_DIR" "$EXPECTED_OUTPUT_DIR_TEXT"
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$OUTPUT_DIR" "$OUTPUT_DIR_TEXT"
diff -ru "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"> outputdiff.txt
cat outputdiff.txt
diffstat -c outputdiff.txt
echo
echo "There are differences from the previously checked-in structured outputs."
echo

View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Description:
#   Run json-to-text.sh (expected to live next to this script) on every regular
#   file found recursively under an input folder. The sub-folder layout of the
#   input is reproduced under the output folder, so files that share a name in
#   different sub-folders do not overwrite each other.
#
# Arguments:
# $1 folder with json files to process
# $2 folder to place the text field for all entries, for all files at $1
#
# Exit status:
#   0 when every file was converted, 1 when the input folder is missing or at
#   least one conversion failed, 2 when arguments are missing.

# errexit stays off on purpose: one file that fails to convert must not stop
# the remaining files from being processed.
set +e

SCRIPT_DIR=$(dirname "$(realpath "$0")")

# Convert every file below $1 into a text file below $2 (see header).
clean_json_folder() {
  if (( $# < 2 )); then
    printf 'usage: %s <input-folder> <output-folder>\n' "${0##*/}" >&2
    return 2
  fi

  local input_dir=$1
  local output_dir=$2
  local rel_path dest_dir
  local status=0

  echo "CLEANING FOLDER: $input_dir"
  echo "INTO: $output_dir"
  mkdir -p -- "$output_dir" || return 1

  if [[ ! -d "$input_dir" ]]; then
    printf 'error: input folder not found: %s\n' "$input_dir" >&2
    return 1
  fi

  # Paths are read NUL-delimited and passed as real arguments. The previous
  # `xargs -IX` form substituted every "X" on the command line, which corrupted
  # the script path or output folder whenever either contained that letter.
  while IFS= read -r -d '' rel_path; do
    rel_path=${rel_path#./}
    dest_dir=$output_dir
    if [[ "$rel_path" == */* ]]; then
      dest_dir=$output_dir/${rel_path%/*}
    fi
    # stdin is detached so the converter can never consume the file list.
    if ! mkdir -p -- "$dest_dir" ||
      ! "$SCRIPT_DIR/json-to-text.sh" "$input_dir/$rel_path" "$dest_dir" </dev/null; then
      printf 'warning: could not convert %s\n' "$input_dir/$rel_path" >&2
      status=1
    fi
  # CDPATH is cleared and cd's stdout discarded so nothing but find's output
  # reaches the loop.
  done < <(CDPATH='' cd -- "$input_dir" >/dev/null && find . -type f -print0)

  return "$status"
}

clean_json_folder "$@"

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Clean the content of json file generated by unstructured library, storing just
# text elements. The resulting file will be stored at the $2 folder with the same
# name as the original file appending .txt as suffix.
# Arguments:
# - $1 path to the file to clean
# - $2 path to folder to store the result
#
# The input is read as a JSON array; the "text" value of each entry is written
# exactly as jq prints it by default, then wrapped at 80 columns (breaking at
# blanks where possible).
#
# Exit status:
#   0 on success, 1 when the input is unreadable or jq/fold fails, 2 when
#   arguments are missing.

# Extract the text of every entry in JSON file $1 into "$2/<basename of $1>.txt".
json_to_text() {
  if (( $# < 2 )); then
    printf 'usage: %s <json-file> <output-folder>\n' "${0##*/}" >&2
    return 2
  fi

  local src=$1
  local dest_dir=$2
  local dest
  local -a stage_status

  if [[ ! -r "$src" ]]; then
    printf 'error: cannot read %s\n' "$src" >&2
    return 1
  fi

  # Create the destination folder so the script also works when called directly.
  mkdir -p -- "$dest_dir" || return 1
  dest=$dest_dir/$(basename -- "$src").txt

  jq '.[].text' <"$src" | fold -w 80 -s >"$dest"
  # A pipeline normally reports only fold's status; inspect every stage so a jq
  # failure (invalid JSON, jq not installed, ...) is not silently ignored.
  stage_status=("${PIPESTATUS[@]}")
  if (( stage_status[0] != 0 || stage_status[1] != 0 )); then
    printf 'error: failed to extract text from %s\n' "$src" >&2
    return 1
  fi
}

json_to_text "$@"