mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-26 06:36:06 +00:00
Added plain-text comparison for tests (#1180)
This PR adds a step to the ingest tests that compares the content of the output files as plain text (i.e., without the JSON formatting).
This commit is contained in:
parent
675a10ea69
commit
5052e6cb3b
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -297,6 +297,7 @@ jobs:
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get install -y tesseract-ocr
|
||||
sudo apt-get install -y tesseract-ocr-kor
|
||||
sudo apt-get install diffstat
|
||||
tesseract --version
|
||||
make install-ingest-s3
|
||||
make install-ingest-airtable
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@ -187,7 +187,9 @@ tags
|
||||
# Ruff cache
|
||||
.ruff_cache/
|
||||
|
||||
unstructured-inference/sample-docs/*
|
||||
.ppm
|
||||
unstructured-inference/
|
||||
|
||||
example-docs/*_images
|
||||
examples/**/output/
|
||||
examples/**/output/
|
||||
|
||||
2
test_unstructured_ingest/.gitignore
vendored
2
test_unstructured_ingest/.gitignore
vendored
@ -1,2 +1,4 @@
|
||||
structured-output
|
||||
download
|
||||
test_unstructured_ingest/expected-text-output/**
|
||||
test_unstructured_ingest/text-output/**
|
||||
|
||||
@ -16,7 +16,9 @@ SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
OUTPUT_FOLDER_NAME=$1
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [ "$OVERWRITE_FIXTURES" != "false" ]; then
|
||||
@ -27,6 +29,11 @@ if [ "$OVERWRITE_FIXTURES" != "false" ]; then
|
||||
mkdir -p "$EXPECTED_OUTPUT_DIR"
|
||||
cp -rf "$OUTPUT_DIR" "$SCRIPT_DIR/expected-structured-output"
|
||||
elif ! diff -ru "$EXPECTED_OUTPUT_DIR" "$OUTPUT_DIR" ; then
|
||||
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$EXPECTED_OUTPUT_DIR" "$EXPECTED_OUTPUT_DIR_TEXT"
|
||||
"$SCRIPT_DIR"/json-to-clean-text-folder.sh "$OUTPUT_DIR" "$OUTPUT_DIR_TEXT"
|
||||
diff -ru "$EXPECTED_OUTPUT_DIR_TEXT" "$OUTPUT_DIR_TEXT"> outputdiff.txt
|
||||
cat outputdiff.txt
|
||||
diffstat -c outputdiff.txt
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
|
||||
16
test_unstructured_ingest/json-to-clean-text-folder.sh
Executable file
16
test_unstructured_ingest/json-to-clean-text-folder.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash

# Description:
#   Runs json-to-text.sh (expected next to this script) once for every regular
#   file found recursively under $1, writing the resulting text files into $2.
#   The output folder is created if it does not exist.
#
# Arguments:
#   $1 folder with json files to process
#   $2 folder to place the text field for all entries, for all files at $1
#
# Exit status:
#   0 when every file was converted, 2 on bad usage, 1 when the input folder is
#   missing, the output folder cannot be created, or any conversion failed.

# Keep going when a single file fails to convert; failures are collected in
# $status and reported through the exit code at the end.
set +e

if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  echo "usage: $(basename "$0") <input-folder> <output-folder>" >&2
  exit 2
fi

SCRIPT_DIR=$(dirname "$(realpath "$0")")
INPUT_FOLDER_NAME=$1
OUTPUT_DIR_TEXT=$2
echo "CLEANING FOLDER: $INPUT_FOLDER_NAME"
echo "INTO: $OUTPUT_DIR_TEXT"
mkdir -p "$OUTPUT_DIR_TEXT" || exit 1

if [ ! -e "$INPUT_FOLDER_NAME" ]; then
  echo "input folder not found: $INPUT_FOLDER_NAME" >&2
  exit 1
fi

# Each path is passed as a real argument instead of being spliced in through an
# xargs replace string, so no character in the script path or output path can
# be mistaken for a placeholder. NUL delimiters keep odd file names intact.
status=0
while IFS= read -r -d '' json_file; do
  "$SCRIPT_DIR"/json-to-text.sh "$json_file" "$OUTPUT_DIR_TEXT" || status=1
done < <(find "$INPUT_FOLDER_NAME" -type f -print0)

exit "$status"
|
||||
13
test_unstructured_ingest/json-to-text.sh
Executable file
13
test_unstructured_ingest/json-to-text.sh
Executable file
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash

# Clean the content of json file generated by unstructured library, storing just
# text elements. The resulting file will be stored at the $2 folder with the same
# name as the original file appending .txt as suffix.
# The input is read as a JSON array; the "text" field of each entry is printed
# by jq (as a JSON string, since jq is not run in raw mode) and the output is
# wrapped at 80 columns, breaking at spaces.
# Arguments:
#  - $1 path to the file to clean
#  - $2 path to folder to store the result
# Exit status:
#  - 0 on success, 2 on bad usage, otherwise the status of the failing
#    pipeline stage (jq or fold).
#

# Without pipefail the pipeline reports only fold's status, so a jq failure
# (invalid JSON, unreadable input) would look like a successful conversion.
set -o pipefail

if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  echo "usage: $(basename "$0") <json-file> <output-folder>" >&2
  exit 2
fi

# "--" keeps a file name starting with "-" from being parsed as an option.
BASE=$(basename -- "$1")
DEST=$2/$BASE.txt
jq '.[].text' <"$1" | fold -w 80 -s >"$DEST"
|
||||
Loading…
x
Reference in New Issue
Block a user