Roman Isecke 24a419ece0
separate ingest tests (#1951)
### Description
This splits the source ingest tests from the destination ingest tests
since they follow different patterns:
* src tests pull data from a source and compare the partitioned content
to the expected results
* destination tests leverage the local connector to produce results to
push to a destination, and incur extra overhead to create temporary
locations at those destinations to write to and delete when done.

Only the src tests create partitioned content that needs to be checked
so the update ingest test CI job only needs to run these.
2023-11-01 19:23:44 +00:00

44 lines
1.3 KiB
Bash
Executable File

#!/usr/bin/env bash
# Ingest test for the gitlab connector: path and settings setup.
#
# Environment:
#   MAX_PROCESSES  optional; when unset or empty it is assigned the CPU count
#                  reported by python3's os.cpu_count()
#   CI             optional; treated as "false" when unset or empty

set -e

# SRC_PATH is the directory holding this script; SCRIPT_DIR is its parent.
SRC_PATH="$(dirname "$(realpath "$0")")"
SCRIPT_DIR="$(dirname "${SRC_PATH}")"

# Run from the directory one level above SCRIPT_DIR.
cd "${SCRIPT_DIR}/.." || exit 1

# Per-connector locations, all keyed by the same folder name.
OUTPUT_FOLDER_NAME="gitlab"
OUTPUT_DIR="${SCRIPT_DIR}/structured-output/${OUTPUT_FOLDER_NAME}"
WORK_DIR="${SCRIPT_DIR}/workdir/${OUTPUT_FOLDER_NAME}"
DOWNLOAD_DIR="${SCRIPT_DIR}/download/${OUTPUT_FOLDER_NAME}"

# ':=' also assigns the fallback back to MAX_PROCESSES itself.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
CI="${CI:-false}"

# Presumably provides the cleanup_dir helper used later in this script.
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/cleanup.sh"
#######################################
# Hand this run's artifact directories to cleanup_dir (defined outside
# this function; presumably it removes the given directory).
# Globals:   OUTPUT_DIR, WORK_DIR, DOWNLOAD_DIR, CI (all read)
# Arguments: none
# Returns:   0 when CI is not "true"; otherwise the status of the final
#            cleanup_dir call
#######################################
cleanup() {
  cleanup_dir "${OUTPUT_DIR}"
  cleanup_dir "${WORK_DIR}"

  # The download directory is only passed to cleanup_dir when CI is "true".
  [[ "${CI}" == "true" ]] || return 0
  cleanup_dir "${DOWNLOAD_DIR}"
}
# Invoke cleanup on every exit path, including failures under `set -e`.
trap cleanup EXIT

# Run the ingest CLI's gitlab subcommand against the public docsy-gitlab
# repository at git ref 'v0.0.7', restricted to files matching *.md / *.txt.
# PYTHONPATH=. together with the relative ./unstructured path means this
# must be run from the current working directory set earlier in the script.
PYTHONPATH=. ./unstructured/ingest/main.py \
  gitlab \
  --num-processes "$max_processes" \
  --download-dir "$DOWNLOAD_DIR" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.parent_id,metadata.category_depth \
  --strategy hi_res \
  --preserve-downloads \
  --reprocess \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --git-branch 'v0.0.7' \
  --git-file-glob '*.md,*.txt' \
  --url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab \
  --work-dir "$WORK_DIR"

# Post-run checks. Presumably the first verifies that 2 output files were
# produced for this folder and the second copies the output for evaluation;
# confirm against the helper scripts. Expansions are quoted (SC2086) so both
# calls pass the folder name as a single argument.
"$SCRIPT_DIR"/check-num-files-output.sh 2 "$OUTPUT_FOLDER_NAME"
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"