mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 23:51:47 +00:00
chore: ingest test file cleanup (#1366)
This commit is contained in:
parent
9e88929a8c
commit
e88f7d9eab
@ -1,4 +1,4 @@
|
||||
## 0.10.17-dev2
|
||||
## 0.10.17-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
17
test_unstructured_ingest/check-diff-expected-output.sh
Normal file → Executable file
17
test_unstructured_ingest/check-diff-expected-output.sh
Normal file → Executable file
@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Description: Compare the structured output files to the expected output files and exit with an error
|
||||
# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false",
|
||||
# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false",
|
||||
# then this script will instead copy the output files to the expected output directory.
|
||||
#
|
||||
# Arguments:
|
||||
@ -14,12 +14,27 @@ set +e
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true}
|
||||
OUTPUT_FOLDER_NAME=$1
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
|
||||
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
|
||||
# Remove the temporary text-output directories used for the diff comparison.
# Globals read: TMP_DIRECTORY_CLEANUP, EXPECTED_OUTPUT_DIR_TEXT, OUTPUT_DIR_TEXT.
# Deletion only happens when TMP_DIRECTORY_CLEANUP is exactly "true"; any other
# value leaves the directories in place and just logs that cleanup was skipped.
function cleanup() {
  case "$TMP_DIRECTORY_CLEANUP" in
    true)
      cleanup_dir "$EXPECTED_OUTPUT_DIR_TEXT"
      cleanup_dir "$OUTPUT_DIR_TEXT"
      ;;
    *)
      echo "skipping tmp directory cleanup"
      ;;
  esac
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [ "$OVERWRITE_FIXTURES" != "false" ]; then
|
||||
# remove folder if it exists
|
||||
|
||||
0
test_unstructured_ingest/check-num-dirs-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-dirs-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-expected-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-expected-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-rows-and-columns-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-rows-and-columns-output.sh
Normal file → Executable file
16
test_unstructured_ingest/cleanup.sh
Normal file
16
test_unstructured_ingest/cleanup.sh
Normal file
@ -0,0 +1,16 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
|
||||
# Recursively delete a directory if it exists, logging what is done.
# Arguments:
#   $1 - path of the directory to remove (may be omitted or empty, in which
#        case nothing is deleted)
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Status of the final echo; a missing directory is not treated as an error.
function cleanup_dir() {
  # ${1:-} keeps the function usable under `set -u` when no argument is given.
  local dir_to_cleanup="${1:-}"
  echo "--- Running cleanup of $dir_to_cleanup ---"

  if [ -d "$dir_to_cleanup" ]; then
    echo "cleaning up directory: $dir_to_cleanup"
    # `--` stops option parsing so a name beginning with '-' is still treated
    # as a path rather than as rm options.
    rm -rf -- "$dir_to_cleanup"
  else
    echo "$dir_to_cleanup does not exist or is not a directory, skipping deletion"
  fi

  echo "--- Cleanup done ---"
}
|
||||
@ -1,12 +1,17 @@
|
||||
import click
|
||||
from deltalake import DeltaTable
|
||||
|
||||
|
||||
@click.command()
@click.option("--table-uri", type=str)
def run_check(table_uri):
    """Check that the Delta table at ``table_uri`` contains exactly 10 rows.

    The table location comes from the ``--table-uri`` command-line option
    instead of a hard-coded path, so the caller decides where the table lives.
    Raises AssertionError when the row count is not 10.
    """
    print(f"Checking contents of table at {table_uri}")
    delta_table = DeltaTable(
        table_uri=table_uri,
    )

    assert len(delta_table.to_pandas()) == 10
    print("table check complete")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1,92 +0,0 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "0540311f6c077fe8f797080918b8d74b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "My First Heading"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "399af454cb1368b8257ed406b430de84",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "My first paragraph."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "b4cf0d13edfa976816649971bd640a66",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Some CP1252-specific characters:"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ada7c3084f437d31d297f85da3941a55",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "dda5e8c4d245c1954ecb64e5dfea598d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "85df09b375e5813aefa3b5f30c8ddff8",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "À\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "2726d2569cd7a6cecb79a6e46bb0b2b3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "Ð\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "2b01f3e428520f6e47d8513292688cf6",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 6
|
||||
},
|
||||
"text": "à\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "5ed256e41bfb169af5f50524b9593a16",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
"page_number": 7
|
||||
},
|
||||
"text": "ð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ"
|
||||
}
|
||||
]
|
||||
@ -11,6 +11,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=api-ingest-output
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--api-key "$UNS_API_KEY" \
|
||||
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--file-glob "*1p.txt" \
|
||||
--input-path example-docs
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=airtable-diff
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
VARIED_DATA_BASE_ID="app5YQxSfp220fWtm"
|
||||
VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"
|
||||
|
||||
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--reprocess \
|
||||
--output-dir "$OUTPUT_DIR"
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=airtable-large
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
|
||||
echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
|
||||
exit 0
|
||||
@ -35,16 +39,16 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
|
||||
# We are expecting fifteen directories: fourteen bases and the parent directory
|
||||
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME"
|
||||
"$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME"
|
||||
|
||||
# We are expecting 101 files: 100 tables and the parent directory
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/
|
||||
|
||||
# Test on ingesting a large number of bases
|
||||
for i in {1..12}; do
|
||||
var="LARGE_WORKSPACE_BASE_ID_$i"
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}"
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}"
|
||||
done
|
||||
|
||||
# Test on ingesting a table with lots of rows
|
||||
sh "$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json
|
||||
"$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=azure
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
azure \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--account-name azureunstructured1 \
|
||||
--remote-url abfs://container1/
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-api
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
"$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
biomed \
|
||||
@ -28,4 +32,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--max-request-time 30 \
|
||||
--max-retries 5 \
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-path
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
"$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
biomed \
|
||||
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--max-retries 5 \
|
||||
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=box
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
|
||||
echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
|
||||
exit 0
|
||||
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--reprocess \
|
||||
--verbose
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -10,6 +10,10 @@ OUTPUT_FOLDER_NAME=confluence-diff
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
|
||||
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
|
||||
exit 0
|
||||
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--api-token "$CONFLUENCE_API_TOKEN" \
|
||||
--spaces testteamsp,MFS
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=confluence-large
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
|
||||
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
|
||||
exit 0
|
||||
@ -43,7 +47,7 @@ OUTPUT_SUBFOLDER_NAME=testteamsp1
|
||||
# Example:
|
||||
# Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large
|
||||
# Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1
|
||||
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
|
||||
"$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
|
||||
|
||||
# We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided.
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/
|
||||
|
||||
@ -7,25 +7,26 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=delta-table
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
DESTINATION_TABLE=/tmp/delta-table-dest
|
||||
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
|
||||
|
||||
# Skip (exit 0) when either AWS credential variable is missing, as the message
# states. Requiring both to be empty would let the test run with only one of
# them set.
if [ -z "$AWS_ACCESS_KEY_ID" ] || [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
  echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set."
  exit 0
fi
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
|
||||
# Remove the directories this test writes: the destination Delta table and the
# structured output directory.
# Globals read: DESTINATION_TABLE, OUTPUT_DIR.
# Both removals go through cleanup_dir (sourced from cleanup.sh), which performs
# the existence check and logging itself, so no separate `rm -rf` is needed here.
function cleanup() {
  cleanup_dir "$DESTINATION_TABLE"
  cleanup_dir "$OUTPUT_DIR"
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
delta-table \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
@ -34,8 +35,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--verbose \
|
||||
delta-table \
|
||||
--write-column json_data \
|
||||
--table-uri $DESTINATION_TABLE
|
||||
--table-uri "$DESTINATION_TABLE"
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py
|
||||
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=discord
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$DISCORD_TOKEN" ]; then
|
||||
echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
|
||||
exit 0
|
||||
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--channels 1099442333440802930,1099601456321003600 \
|
||||
--token "$DISCORD_TOKEN" \
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=dropbox
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
|
||||
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
|
||||
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
|
||||
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--remote-url "dropbox:// /"
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -9,12 +9,24 @@ OUTPUT_FOLDER_NAME=elasticsearch
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck source=/dev/null
|
||||
sh scripts/elasticsearch-test-helpers/create-and-check-es.sh
|
||||
wait
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
|
||||
# Kill the container so the script can be repeatedly run using the same ports
|
||||
trap 'echo "Stopping Elasticsearch Docker container"; docker stop es-test' EXIT
|
||||
# Stop the Elasticsearch test container (if one is running) and remove the
# structured output directory.
# Globals read: OUTPUT_DIR.
# Returns: status of cleanup_dir.
function cleanup() {
  # Kill the container so the script can be repeatedly run using the same ports.
  # `docker ps` exits 0 whether or not anything matches the filter, so decide
  # based on its output: -q prints only the IDs of matching running containers.
  local running_ids
  running_ids=$(docker ps -q --filter "name=es-test") || running_ids=""
  if [ -n "$running_ids" ]; then
    echo "Stopping Elasticsearch Docker container"
    # A failed stop must not prevent the output directory from being removed.
    docker stop es-test || echo "warning: could not stop es-test container" >&2
  fi

  cleanup_dir "$OUTPUT_DIR"
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
# shellcheck source=/dev/null
|
||||
scripts/elasticsearch-test-helpers/create-and-check-es.sh
|
||||
wait
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
elasticsearch \
|
||||
@ -29,5 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--url http://localhost:9200 \
|
||||
--jq-query '{ethnicity, director, plot}'
|
||||
|
||||
echo "SCRIPT_DIR: $SCRIPT_DIR"
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gcs
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
||||
echo "Skipping GCS ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
||||
exit 0
|
||||
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--remote-url gs://utic-test-ingest-fixtures/
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=github
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none}
|
||||
|
||||
ACCESS_TOKEN_FLAGS=""
|
||||
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--git-file-glob '*.html,*.txt' \
|
||||
$ACCESS_TOKEN_FLAGS
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gitlab
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
gitlab \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--git-file-glob '*.md,*.txt' \
|
||||
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=google-drive
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
||||
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
||||
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
|
||||
@ -32,4 +36,4 @@ PYTHONPATH=. unstructured/ingest/main.py \
|
||||
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE"
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=jira-diff
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then
|
||||
echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set."
|
||||
exit 0
|
||||
@ -52,4 +56,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=local-single-file-with-encoding
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
@ -18,4 +22,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
set +e
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
set +e
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=local-single-file
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
set +e
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=local
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
@ -17,4 +21,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--file-glob "*.html" \
|
||||
--input-path example-docs
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=notion
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$NOTION_API_KEY" ]; then
|
||||
echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set."
|
||||
exit 0
|
||||
@ -25,4 +29,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--verbose
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=onedrive
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then
|
||||
echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set."
|
||||
exit 0
|
||||
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--path '/utic-test-ingest-fixtures' \
|
||||
--recursive \
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=outlook
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
|
||||
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
|
||||
exit 0
|
||||
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=pdf-fast-reprocess
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
INPUT_PATH=$SCRIPT_DIR/download
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
echo "REPROCESS INPUT PATH"
|
||||
ls "$INPUT_PATH"
|
||||
|
||||
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -2,13 +2,18 @@
|
||||
|
||||
set -e
|
||||
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=s3
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
"$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
s3 \
|
||||
@ -23,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--anonymous
|
||||
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=salesforce
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then
|
||||
echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set."
|
||||
exit 0
|
||||
@ -37,4 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--verbose
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=Sharepoint
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
|
||||
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
|
||||
exit 0
|
||||
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--path "Shared Documents" \
|
||||
--recursive \
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=slack
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
if [ -z "$SLACK_TOKEN" ]; then
|
||||
echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set."
|
||||
exit 0
|
||||
@ -27,4 +31,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--start-date 2023-04-01 \
|
||||
--end-date 2023-04-08T12:00:00-08:00
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=wikipedia
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
wikipedia \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--verbose \
|
||||
--page-title "Open Source Software"
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -eux -o pipefail
|
||||
set -eu -o pipefail
|
||||
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
@ -8,35 +8,55 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
||||
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
|
||||
export OMP_THREAD_LIMIT=1
|
||||
|
||||
./test_unstructured_ingest/test-ingest-s3.sh
|
||||
./test_unstructured_ingest/test-ingest-azure.sh
|
||||
./test_unstructured_ingest/test-ingest-box.sh
|
||||
./test_unstructured_ingest/test-ingest-discord.sh
|
||||
./test_unstructured_ingest/test-ingest-dropbox.sh
|
||||
./test_unstructured_ingest/test-ingest-github.sh
|
||||
./test_unstructured_ingest/test-ingest-gitlab.sh
|
||||
./test_unstructured_ingest/test-ingest-google-drive.sh
|
||||
./test_unstructured_ingest/test-ingest-wikipedia.sh
|
||||
./test_unstructured_ingest/test-ingest-biomed-api.sh
|
||||
./test_unstructured_ingest/test-ingest-biomed-path.sh
|
||||
./test_unstructured_ingest/test-ingest-local.sh
|
||||
./test_unstructured_ingest/test-ingest-slack.sh
|
||||
./test_unstructured_ingest/test-ingest-against-api.sh
|
||||
./test_unstructured_ingest/test-ingest-gcs.sh
|
||||
./test_unstructured_ingest/test-ingest-onedrive.sh
|
||||
./test_unstructured_ingest/test-ingest-outlook.sh
|
||||
./test_unstructured_ingest/test-ingest-elasticsearch.sh
|
||||
./test_unstructured_ingest/test-ingest-confluence-diff.sh
|
||||
./test_unstructured_ingest/test-ingest-confluence-large.sh
|
||||
./test_unstructured_ingest/test-ingest-airtable-diff.sh
|
||||
./test_unstructured_ingest/test-ingest-airtable-large.sh
|
||||
./test_unstructured_ingest/test-ingest-local-single-file.sh
|
||||
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
|
||||
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
|
||||
./test_unstructured_ingest/test-ingest-notion.sh
|
||||
./test_unstructured_ingest/test-ingest-delta-table.sh
|
||||
./test_unstructured_ingest/test-ingest-salesforce.sh
|
||||
./test_unstructured_ingest/test-ingest-jira.sh
|
||||
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
||||
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
|
||||
./test_unstructured_ingest/test-ingest-sharepoint.sh
|
||||
scripts=(
|
||||
'test-ingest-s3.sh'
|
||||
'test-ingest-azure.sh'
|
||||
'test-ingest-box.sh'
|
||||
'test-ingest-discord.sh'
|
||||
'test-ingest-dropbox.sh'
|
||||
'test-ingest-github.sh'
|
||||
'test-ingest-gitlab.sh'
|
||||
'test-ingest-google-drive.sh'
|
||||
'test-ingest-wikipedia.sh'
|
||||
'test-ingest-biomed-api.sh'
|
||||
'test-ingest-biomed-path.sh'
|
||||
'test-ingest-local.sh'
|
||||
'test-ingest-slack.sh'
|
||||
'test-ingest-against-api.sh'
|
||||
'test-ingest-gcs.sh'
|
||||
'test-ingest-onedrive.sh'
|
||||
'test-ingest-outlook.sh'
|
||||
'test-ingest-elasticsearch.sh'
|
||||
'test-ingest-confluence-diff.sh'
|
||||
'test-ingest-confluence-large.sh'
|
||||
'test-ingest-airtable-diff.sh'
|
||||
'test-ingest-airtable-large.sh'
|
||||
'test-ingest-local-single-file.sh'
|
||||
'test-ingest-local-single-file-with-encoding.sh'
|
||||
'test-ingest-local-single-file-with-pdf-infer-table-structure.sh'
|
||||
'test-ingest-notion.sh'
|
||||
'test-ingest-delta-table.sh'
|
||||
'test-ingest-salesforce.sh'
|
||||
'test-ingest-jira.sh'
|
||||
## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
||||
'test-ingest-pdf-fast-reprocess.sh'
|
||||
'test-ingest-sharepoint.sh'
|
||||
)
|
||||
|
||||
CURRENT_SCRIPT="none"
|
||||
|
||||
function print_last_run() {
|
||||
if [ "$CURRENT_SCRIPT" != "none" ]; then
|
||||
echo "Last ran script: $CURRENT_SCRIPT"
|
||||
fi
|
||||
}
|
||||
|
||||
trap print_last_run EXIT
|
||||
|
||||
for script in "${scripts[@]}"; do
|
||||
CURRENT_SCRIPT=$script
|
||||
echo "--------- RUNNING SCRIPT $script ---------"
|
||||
echo "Running ./test_unstructured_ingest/$script"
|
||||
./test_unstructured_ingest/"$script"
|
||||
echo "--------- FINISHED SCRIPT $script ---------"
|
||||
done
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.17-dev2" # pragma: no cover
|
||||
__version__ = "0.10.17-dev3" # pragma: no cover
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user