mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 21:57:58 +00:00

### Description Pivot away from using the retry logic as a decorator, as this posed too many limitations on what can be passed in as a parameter at runtime. The logic was moved to a class-based approach, which can now be instantiated with appropriate loggers, leveraging the `--verbose` flag to set the log level. This also reduces how much new code is forked from the backoff library. The existing notion client that was using the previous decorator has been refactored to use the new class approach, and the airtable connector was updated to support retry logic as well. Default log handlers were introduced, which apply to all instances of the retry handler when it starts, backs off, and gives up. A generic approach to configuring the retry parameters in the CLI was added and included in the running set of common configs shared across all CLI commands. Omitted a CHANGELOG entry as this is mostly just a refactor of the retry code. All other connectors will be updated to support retry in another PR, which helps limit the number of changes to review in this one. ### Extra fixes * Updated the local and salesforce source connectors to set `ingest_doc_cls` in a `__post_init__` method since this variable can't be serialized. ### Testing Both the airtable and notion ingest tests can be run locally. While they might not pass due to text changes (to be expected when running locally), the process can be viewed in the logs to validate. Associated issue: #1488
51 lines
1.7 KiB
Bash
Executable File
51 lines
1.7 KiB
Bash
Executable File
#!/usr/bin/env bash

# Abort on the first command that exits non-zero.
set -e

# Description: This test checks if all the processed content is the same as the expected outputs.
# Also checks if a large table can be ingested properly.

# Resolve the directory containing this script and run from its parent, because
# later commands use paths relative to that directory (./unstructured/ingest/main.py).
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1

# All artifacts of this test live under per-test subfolders next to the script.
OUTPUT_FOLDER_NAME=airtable-diff
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME

# CI defaults to "false" when the caller did not set it.
CI=${CI:-"false"}

# Honour MAX_PROCESSES when provided; otherwise fall back to the CPU count
# reported by Python. Note that ":=" also assigns the fallback to MAX_PROCESSES.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

#######################################
# Remove the directories produced by this test run.
# Globals (read):
#   OUTPUT_DIR, WORK_DIR - always handed to cleanup_dir
#   DOWNLOAD_DIR         - handed to cleanup_dir only when CI is "true"
#   CI
# Depends on:
#   cleanup_dir - not defined in this script; presumably provided by the
#                 sourced cleanup.sh (verify there).
#######################################
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
  # Downloads are only discarded on CI; elsewhere they are left in place.
  if [[ "$CI" == "true" ]]; then
    cleanup_dir "$DOWNLOAD_DIR"
  fi
}

# Run cleanup whenever the script exits, regardless of exit status.
trap cleanup EXIT

# Identifiers handed to the airtable connector through --list-of-paths.
VARIED_DATA_BASE_ID="app5YQxSfp220fWtm"
VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"

# Without an access token the ingest cannot run, so skip the test and report
# success (exit 0) instead of failing.
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
  echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
  exit 0
fi

# Run the airtable ingest for both identifiers (passed as one space-separated
# argument), writing results to OUTPUT_DIR and intermediate state to WORK_DIR.
# Paths are relative, so this relies on the current directory being the parent
# of the script directory.
#
# NOTE(review): the last --metadata-exclude entry is spelled
# "metadatda.languages", which looks like a typo for "metadata.languages".
# It is left untouched here because correcting it would change the generated
# output that is later compared against the expected fixtures; fix it together
# with a fixture refresh.
PYTHONPATH=. ./unstructured/ingest/main.py \
  airtable \
  --download-dir "$DOWNLOAD_DIR" \
  --personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
  --list-of-paths "$VARIED_DATA_BASE_ID $VARIED_DATA_BASE_ID_2" \
  --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.date,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadatda.languages \
  --num-processes "$max_processes" \
  --preserve-downloads \
  --reprocess \
  --output-dir "$OUTPUT_DIR" \
  --work-dir "$WORK_DIR" \
  --max-retry-time 10 \
  --verbose

# Hand the output folder name to the diff helper; as the last command, its exit
# status becomes the script's exit status. The argument is quoted so it is
# always passed as exactly one word.
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"