mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-30 11:46:24 +00:00

### Description In an effort to speed up the ingest tests, bumping the num if processes to the max on the system for each
55 lines
2.4 KiB
Bash
Executable File
55 lines
2.4 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
set -e
|
|
|
|
# Description: This test checks if the number of spaces and documents processed are as expected.
|
|
# Each space shows up as a directory in the output folder, hence check-num-dirs-output.sh
|
|
# Each document shows up as a file in a space directory, hence check-num-files-output.sh
|
|
|
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
|
cd "$SCRIPT_DIR"/.. || exit 1
|
|
|
|
OUTPUT_FOLDER_NAME=confluence-large
|
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
|
max_processes=${MAX_PROCESSES:=$(python -c "import os; print(os.cpu_count())")}
|
|
|
|
# shellcheck disable=SC1091
|
|
source "$SCRIPT_DIR"/cleanup.sh
|
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
|
|
|
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
|
|
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
|
|
exit 0
|
|
fi
|
|
|
|
# The test checks the scenario where --confluence-list-of-spaces and --confluence-num-of-spaces
|
|
# are being provided at the same time, which is a wrong way to use the connector.
|
|
|
|
# We expect the test to ignore --confluence-num-of-spaces and use --confluence-list-of-spaces.
|
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
|
confluence \
|
|
--download-dir "$DOWNLOAD_DIR" \
|
|
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
|
--num-processes "$max_processes" \
|
|
--preserve-downloads \
|
|
--reprocess \
|
|
--output-dir "$OUTPUT_DIR" \
|
|
--verbose \
|
|
--url https://unstructured-ingest-test.atlassian.net \
|
|
--user-email "$CONFLUENCE_USER_EMAIL" \
|
|
--api-token "$CONFLUENCE_API_TOKEN" \
|
|
--max-num-of-spaces 10 \
|
|
--spaces testteamsp1 \
|
|
--max-num-of-docs-from-each-space 250 \
|
|
|
|
OUTPUT_SUBFOLDER_NAME=testteamsp1
|
|
|
|
# We are expecting two directories: one for the space, and one is the output directory itself
|
|
# Example:
|
|
# Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large
|
|
# Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1
|
|
"$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
|
|
|
|
# We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided.
|
|
"$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/
|