chore: ingest test file cleanup (#1366)

This commit is contained in:
Roman Isecke 2023-09-21 14:51:08 -04:00 committed by GitHub
parent 9e88929a8c
commit e88f7d9eab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
42 changed files with 275 additions and 182 deletions

View File

@ -1,4 +1,4 @@
## 0.10.17-dev2
## 0.10.17-dev3
### Enhancements

17
test_unstructured_ingest/check-diff-expected-output.sh Normal file → Executable file
View File

@ -1,7 +1,7 @@
#!/usr/bin/env bash
# Description: Compare the structured output files to the expected output files and exit with an error
# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false",
# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false",
# then this script will instead copy the output files to the expected output directory.
#
# Arguments:
@ -14,12 +14,27 @@ set +e
SCRIPT_DIR=$(dirname "$(realpath "$0")")
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true}
OUTPUT_FOLDER_NAME=$1
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
# EXIT-trap handler: removes the temporary text-output directories
# (expected and actual) unless TMP_DIRECTORY_CLEANUP is set to something
# other than "true".
cleanup() {
  if [[ "$TMP_DIRECTORY_CLEANUP" != "true" ]]; then
    echo "skipping tmp directory cleanup"
    return
  fi

  cleanup_dir "$EXPECTED_OUTPUT_DIR_TEXT"
  cleanup_dir "$OUTPUT_DIR_TEXT"
}
trap cleanup EXIT
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [ "$OVERWRITE_FIXTURES" != "false" ]; then
# remove folder if it exists

0
test_unstructured_ingest/check-num-dirs-output.sh Normal file → Executable file
View File

View File

0
test_unstructured_ingest/check-num-files-output.sh Normal file → Executable file
View File

View File

View File

@ -0,0 +1,16 @@
#!/usr/bin/env bash
#######################################
# Recursively delete a directory if it exists, logging each step.
# Arguments:
#   $1 - path of the directory to remove
# Outputs:
#   Progress messages on stdout.
#######################################
function cleanup_dir() {
  local dir_to_cleanup="${1}"
  echo "--- Running cleanup of $dir_to_cleanup ---"
  if [ -d "$dir_to_cleanup" ]; then
    echo "cleaning up directory: $dir_to_cleanup"
    # "--" ends option parsing so a path that starts with "-" is treated
    # as a path and not as an rm flag.
    rm -rf -- "$dir_to_cleanup"
  else
    echo "$dir_to_cleanup does not exist or is not a directory, skipping deletion"
  fi
  echo "--- Cleanup done ---"
}

View File

@ -1,12 +1,17 @@
import click
from deltalake import DeltaTable
def run_check():
@click.command()
@click.option("--table-uri", type=str)
def run_check(table_uri):
print(f"Checking contents of table at {table_uri}")
delta_table = DeltaTable(
table_uri="/tmp/delta-table-dest",
table_uri=table_uri,
)
assert len(delta_table.to_pandas()) == 10
print("table check complete")
if __name__ == "__main__":

View File

@ -1,92 +0,0 @@
[
{
"type": "Title",
"element_id": "0540311f6c077fe8f797080918b8d74b",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "My First Heading"
},
{
"type": "Title",
"element_id": "399af454cb1368b8257ed406b430de84",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "My first paragraph."
},
{
"type": "Title",
"element_id": "b4cf0d13edfa976816649971bd640a66",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 1
},
"text": "Some CP1252-specific characters:"
},
{
"type": "UncategorizedText",
"element_id": "ada7c3084f437d31d297f85da3941a55",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 2
},
"text": "¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯"
},
{
"type": "UncategorizedText",
"element_id": "dda5e8c4d245c1954ecb64e5dfea598d",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 3
},
"text": "°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿"
},
{
"type": "Title",
"element_id": "85df09b375e5813aefa3b5f30c8ddff8",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 4
},
"text": "À\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ"
},
{
"type": "Title",
"element_id": "2726d2569cd7a6cecb79a6e46bb0b2b3",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 5
},
"text": "Ð\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß"
},
{
"type": "Title",
"element_id": "2b01f3e428520f6e47d8513292688cf6",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 6
},
"text": "à\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï"
},
{
"type": "Title",
"element_id": "5ed256e41bfb169af5f50524b9593a16",
"metadata": {
"data_source": {},
"filetype": "text/html",
"page_number": 7
},
"text": "ð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ"
}
]

View File

@ -11,6 +11,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=api-ingest-output
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--api-key "$UNS_API_KEY" \
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--file-glob "*1p.txt" \
--input-path example-docs
sh "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME

View File

@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=airtable-diff
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
VARIED_DATA_BASE_ID="app5YQxSfp220fWtm"
VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--reprocess \
--output-dir "$OUTPUT_DIR"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=airtable-large
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
exit 0
@ -35,16 +39,16 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
# We are expecting fifteen directories: fourteen bases and the parent directory
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME"
"$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME"
# We are expecting 101 files: 100 tables and the parent directory
sh "$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/
"$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/
# Test on ingesting a large number of bases
for i in {1..12}; do
var="LARGE_WORKSPACE_BASE_ID_$i"
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}"
"$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}"
done
# Test on ingesting a table with lots of rows
sh "$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json
"$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=azure
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
azure \
--download-dir "$DOWNLOAD_DIR" \
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--account-name azureunstructured1 \
--remote-url abfs://container1/
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-api
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
"$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k
PYTHONPATH=. ./unstructured/ingest/main.py \
biomed \
@ -28,4 +32,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--max-request-time 30 \
--max-retries 5 \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-path
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
"$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k
PYTHONPATH=. ./unstructured/ingest/main.py \
biomed \
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--max-retries 5 \
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=box
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
exit 0
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--reprocess \
--verbose
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -10,6 +10,10 @@ OUTPUT_FOLDER_NAME=confluence-diff
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
exit 0
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--api-token "$CONFLUENCE_API_TOKEN" \
--spaces testteamsp,MFS
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=confluence-large
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
exit 0
@ -43,7 +47,7 @@ OUTPUT_SUBFOLDER_NAME=testteamsp1
# Example:
# Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large
# Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
"$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
# We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided.
sh "$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/
"$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/

View File

@ -7,25 +7,26 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=delta-table
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
DESTINATION_TABLE=/tmp/delta-table-dest
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set."
exit 0
fi
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
if [ -d "$DESTINATION_TABLE" ]; then
echo "cleaning up tmp directory: $DESTINATION_TABLE"
rm -rf "$DESTINATION_TABLE"
fi
cleanup_dir "$DESTINATION_TABLE"
cleanup_dir "$OUTPUT_DIR"
}
trap cleanup EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
delta-table \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--download-dir "$DOWNLOAD_DIR" \
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
--output-dir "$OUTPUT_DIR" \
@ -34,8 +35,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose \
delta-table \
--write-column json_data \
--table-uri $DESTINATION_TABLE
--table-uri "$DESTINATION_TABLE"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=discord
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$DISCORD_TOKEN" ]; then
echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
exit 0
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--channels 1099442333440802930,1099601456321003600 \
--token "$DISCORD_TOKEN" \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=dropbox
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url "dropbox:// /"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,12 +9,24 @@ OUTPUT_FOLDER_NAME=elasticsearch
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck source=/dev/null
sh scripts/elasticsearch-test-helpers/create-and-check-es.sh
wait
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
# Kill the container so the script can be repeatedly run using the same ports
trap 'echo "Stopping Elasticsearch Docker container"; docker stop es-test' EXIT
# EXIT-trap handler: stops the es-test Docker container if it is running,
# then removes this test's structured-output directory.
function cleanup() {
  # Kill the container so the script can be repeatedly run using the same ports.
  # `docker ps` exits 0 even when no container matches the filter, so test
  # its output (-q prints only matching container IDs) rather than its status.
  if [ -n "$(docker ps -q --filter "name=es-test")" ]; then
    echo "Stopping Elasticsearch Docker container"
    docker stop es-test
  fi

  cleanup_dir "$OUTPUT_DIR"
}
trap cleanup EXIT
# shellcheck source=/dev/null
scripts/elasticsearch-test-helpers/create-and-check-es.sh
wait
PYTHONPATH=. ./unstructured/ingest/main.py \
elasticsearch \
@ -29,5 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--url http://localhost:9200 \
--jq-query '{ethnicity, director, plot}'
echo "SCRIPT_DIR: $SCRIPT_DIR"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gcs
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
exit 0
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url gs://utic-test-ingest-fixtures/
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=github
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none}
ACCESS_TOKEN_FLAGS=""
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--git-file-glob '*.html,*.txt' \
$ACCESS_TOKEN_FLAGS
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gitlab
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
gitlab \
--download-dir "$DOWNLOAD_DIR" \
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--git-file-glob '*.md,*.txt' \
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab
sh "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=google-drive
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
@ -32,4 +36,4 @@ PYTHONPATH=. unstructured/ingest/main.py \
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE"
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=jira-diff
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then
echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set."
exit 0
@ -52,4 +56,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local-single-file-with-encoding
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -18,4 +22,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local-single-file
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
set +e
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=local
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
@ -17,4 +21,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--file-glob "*.html" \
--input-path example-docs
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=notion
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$NOTION_API_KEY" ]; then
echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set."
exit 0
@ -25,4 +29,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=onedrive
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then
echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set."
exit 0
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--path '/utic-test-ingest-fixtures' \
--recursive \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=outlook
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
exit 0
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=pdf-fast-reprocess
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
INPUT_PATH=$SCRIPT_DIR/download
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
echo "REPROCESS INPUT PATH"
ls "$INPUT_PATH"
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -2,13 +2,18 @@
set -e
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=s3
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
"$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k
PYTHONPATH=. ./unstructured/ingest/main.py \
s3 \
@ -23,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--anonymous
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=salesforce
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then
echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set."
exit 0
@ -37,4 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--output-dir "$OUTPUT_DIR" \
--verbose
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=Sharepoint
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 0
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--path "Shared Documents" \
--recursive \
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=slack
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
if [ -z "$SLACK_TOKEN" ]; then
echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set."
exit 0
@ -27,4 +31,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--start-date 2023-04-01 \
--end-date 2023-04-08T12:00:00-08:00
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=wikipedia
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
PYTHONPATH=. ./unstructured/ingest/main.py \
wikipedia \
--download-dir "$DOWNLOAD_DIR" \
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--verbose \
--page-title "Open Source Software"
sh "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME

View File

@ -1,6 +1,6 @@
#!/usr/bin/env bash
set -eux -o pipefail
set -eu -o pipefail
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
@ -8,35 +8,55 @@ cd "$SCRIPT_DIR"/.. || exit 1
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-s3.sh
./test_unstructured_ingest/test-ingest-azure.sh
./test_unstructured_ingest/test-ingest-box.sh
./test_unstructured_ingest/test-ingest-discord.sh
./test_unstructured_ingest/test-ingest-dropbox.sh
./test_unstructured_ingest/test-ingest-github.sh
./test_unstructured_ingest/test-ingest-gitlab.sh
./test_unstructured_ingest/test-ingest-google-drive.sh
./test_unstructured_ingest/test-ingest-wikipedia.sh
./test_unstructured_ingest/test-ingest-biomed-api.sh
./test_unstructured_ingest/test-ingest-biomed-path.sh
./test_unstructured_ingest/test-ingest-local.sh
./test_unstructured_ingest/test-ingest-slack.sh
./test_unstructured_ingest/test-ingest-against-api.sh
./test_unstructured_ingest/test-ingest-gcs.sh
./test_unstructured_ingest/test-ingest-onedrive.sh
./test_unstructured_ingest/test-ingest-outlook.sh
./test_unstructured_ingest/test-ingest-elasticsearch.sh
./test_unstructured_ingest/test-ingest-confluence-diff.sh
./test_unstructured_ingest/test-ingest-confluence-large.sh
./test_unstructured_ingest/test-ingest-airtable-diff.sh
./test_unstructured_ingest/test-ingest-airtable-large.sh
./test_unstructured_ingest/test-ingest-local-single-file.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
./test_unstructured_ingest/test-ingest-notion.sh
./test_unstructured_ingest/test-ingest-delta-table.sh
./test_unstructured_ingest/test-ingest-salesforce.sh
./test_unstructured_ingest/test-ingest-jira.sh
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
./test_unstructured_ingest/test-ingest-sharepoint.sh
scripts=(
'test-ingest-s3.sh'
'test-ingest-azure.sh'
'test-ingest-box.sh'
'test-ingest-discord.sh'
'test-ingest-dropbox.sh'
'test-ingest-github.sh'
'test-ingest-gitlab.sh'
'test-ingest-google-drive.sh'
'test-ingest-wikipedia.sh'
'test-ingest-biomed-api.sh'
'test-ingest-biomed-path.sh'
'test-ingest-local.sh'
'test-ingest-slack.sh'
'test-ingest-against-api.sh'
'test-ingest-gcs.sh'
'test-ingest-onedrive.sh'
'test-ingest-outlook.sh'
'test-ingest-elasticsearch.sh'
'test-ingest-confluence-diff.sh'
'test-ingest-confluence-large.sh'
'test-ingest-airtable-diff.sh'
'test-ingest-airtable-large.sh'
'test-ingest-local-single-file.sh'
'test-ingest-local-single-file-with-encoding.sh'
'test-ingest-local-single-file-with-pdf-infer-table-structure.sh'
'test-ingest-notion.sh'
'test-ingest-delta-table.sh'
'test-ingest-salesforce.sh'
'test-ingest-jira.sh'
## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
'test-ingest-pdf-fast-reprocess.sh'
'test-ingest-sharepoint.sh'
)
CURRENT_SCRIPT="none"
# EXIT-trap handler: reports which test script was most recently started.
# Prints nothing while CURRENT_SCRIPT still holds its "none" placeholder.
print_last_run() {
  case "$CURRENT_SCRIPT" in
    none) ;;
    *) echo "Last ran script: $CURRENT_SCRIPT" ;;
  esac
}
trap print_last_run EXIT
# Run each ingest test script in the order listed in the scripts array.
for script in "${scripts[@]}"; do
# Record the script about to run; print_last_run (the EXIT trap) reads this
# so the last script started is reported when the runner exits.
CURRENT_SCRIPT=$script
echo "--------- RUNNING SCRIPT $script ---------"
echo "Running ./test_unstructured_ingest/$script"
# Paths are relative to the repository root, which the runner cd's into above.
./test_unstructured_ingest/"$script"
echo "--------- FINISHED SCRIPT $script ---------"
done

View File

@ -1 +1 @@
__version__ = "0.10.17-dev2" # pragma: no cover
__version__ = "0.10.17-dev3" # pragma: no cover