mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-09-06 15:23:37 +00:00

- Updates CI to install tesseract version 5.3.0 (better than 4.x in various ways, including performance). - Adds Azure expected-output fixtures as more useful reference points and as a repro for issue #346, "Some PDF's with scanned images return empty elements". - Adds a script that regenerates the ingest test fixtures; it runs in an Ubuntu Docker container (like CI) with the same version of tesseract. See the comments in scripts/ingest-test-fixtures-update.sh for details. - Updates expected outputs with the above script. - Updates the individual test-ingest scripts to overwrite the expected .json output if OVERWRITE_FIXTURES=true.
47 lines
1.6 KiB
Bash
Executable File
47 lines
1.6 KiB
Bash
Executable File
#!/usr/bin/env bash

# Ingest test for the GitHub connector.
#
# Runs unstructured/ingest/main.py against the dcneiner/Downloadify repository,
# writes structured output to github-downloadify-output, and compares it with the
# checked-in fixtures in
# test_unstructured_ingest/expected-structured-output/github-downloadify.
#
# Environment:
#   CI                  when "true", roughly 9 runs in 10 exit early with status 0.
#   OVERWRITE_FIXTURES  any value other than "false" copies the fresh output over
#                       the fixtures instead of diffing (default: false).
#
# Exit status: 0 on success or skip; 1 if the ingest run fails or the output
# differs from the fixtures.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Every relative path below is resolved from the parent of this script's directory.
cd "$SCRIPT_DIR"/.. || exit 1

if [[ "$CI" == "true" ]]; then
    # RANDOM % 10 yields 0..9. Skipping on 1..9 skips about 90% of CI runs, which
    # is what the message below announces; only a remainder of 0 runs the test.
    if (( RANDOM % 10 >= 1 )); then
        # NOTE(crag): proper fix is being tracked here: https://github.com/Unstructured-IO/unstructured/issues/306
        echo "Skipping ingest 90% of github ingest tests to avoid rate limiting issue."
        exit 0
    fi
fi

# Stop here if the ingest run itself fails: otherwise a partial or stale output
# directory would be diffed against (or copied over) the fixtures.
if ! PYTHONPATH=. ./unstructured/ingest/main.py \
    --metadata-exclude filename \
    --github-url dcneiner/Downloadify \
    --git-file-glob '*.html,*.txt' \
    --structured-output-dir github-downloadify-output \
    --reprocess \
    --preserve-downloads \
    --verbose
then
    echo "Ingest run failed; not comparing or overwriting fixtures." >&2
    exit 1
fi

OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then

    # Replace the checked-in fixtures with the output that was just generated.
    cp github-downloadify-output/* test_unstructured_ingest/expected-structured-output/github-downloadify/

elif ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
    # diff has already printed the differences; explain how to accept them.
    echo
    echo "There are differences from the previously checked-in structured outputs."
    echo
    echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
    echo
    echo " export OVERWRITE_FIXTURES=true"
    echo
    echo "and then rerun this script."
    echo
    echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
    echo "to update fixtures for CI."
    echo
    exit 1

fi