mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
- Creates ABC's for ingest connectors - Updates the s3_connector classes to inherit from ABC's - Moves s3 test script to it's own file to establish pattern for additional connectors - Rewrites the Ingest.md doc, including instructions how how to add a connector - Updates the example s3 ingest script to use the new location for main.py Note that there were no logic changes, this is essentially a refactoring PR. Test instructions: Run ./test_unstructured_ingest/test-ingest.sh and ./examples/ingest/s3-small-batch/ingest.sh.
25 lines
1.1 KiB
Bash
Executable File
25 lines
1.1 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
|
cd "$SCRIPT_DIR"/.. || exit 1
|
|
|
|
if [[ "$(find test_unstructured_ingest/expected-structured-output/s3-small-batch/ -type f -size +20k | wc -l)" != 3 ]]; then
|
|
echo "The test fixtures in test_unstructured_ingest/expected-structured-output/ look suspicious. At least one of the files is too small."
|
|
echo "Did you overwrite test fixtures with bad outputs?"
|
|
exit 1
|
|
fi
|
|
|
|
PYTHONPATH=. ./unstructured/ingest/main.py --s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ --s3-anonymous --structured-output-dir s3-small-batch-output
|
|
|
|
if ! diff -ru s3-small-batch-output test_unstructured_ingest/expected-structured-output/s3-small-batch ; then
|
|
echo
|
|
echo "There are differences from the previously checked-in structured outputs."
|
|
echo
|
|
echo "If these differences are acceptable, copy the outputs from"
|
|
echo "s3-small-batch-output/ to test_unstructured_ingest/expected-structured-output/s3-small-batch/ after running"
|
|
echo
|
|
echo " PYTHONPATH=. python examples/ingest/s3-small-batch/main.py --structured-output-dir s3-small-batch-output"
|
|
echo
|
|
exit 1
|
|
fi
|