mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-17 02:03:34 +00:00
chore: ingest test file cleanup (#1366)
This commit is contained in:
parent
9e88929a8c
commit
e88f7d9eab
@ -1,4 +1,4 @@
|
|||||||
## 0.10.17-dev2
|
## 0.10.17-dev3
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
|||||||
17
test_unstructured_ingest/check-diff-expected-output.sh
Normal file → Executable file
17
test_unstructured_ingest/check-diff-expected-output.sh
Normal file → Executable file
@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
# Description: Compare the structured output files to the expected output files and exit with an error
|
# Description: Compare the structured output files to the expected output files and exit with an error
|
||||||
# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false",
|
# if they are different. If the environment variable OVERWRITE_FIXTURES is not "false",
|
||||||
# then this script will instead copy the output files to the expected output directory.
|
# then this script will instead copy the output files to the expected output directory.
|
||||||
#
|
#
|
||||||
# Arguments:
|
# Arguments:
|
||||||
@ -14,12 +14,27 @@ set +e
|
|||||||
|
|
||||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||||
|
TMP_DIRECTORY_CLEANUP=${TMP_DIRECTORY_CLEANUP:-true}
|
||||||
OUTPUT_FOLDER_NAME=$1
|
OUTPUT_FOLDER_NAME=$1
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR_TEXT=$SCRIPT_DIR/text-output/$OUTPUT_FOLDER_NAME
|
||||||
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
|
EXPECTED_OUTPUT_DIR=$SCRIPT_DIR/expected-structured-output/$OUTPUT_FOLDER_NAME
|
||||||
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME
|
EXPECTED_OUTPUT_DIR_TEXT=$SCRIPT_DIR/expected-text-output/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
|
||||||
|
function cleanup() {
|
||||||
|
if [ "$TMP_DIRECTORY_CLEANUP" == "true" ]; then
|
||||||
|
cleanup_dir "$EXPECTED_OUTPUT_DIR_TEXT"
|
||||||
|
cleanup_dir "$OUTPUT_DIR_TEXT"
|
||||||
|
else
|
||||||
|
echo "skipping tmp directory cleanup"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||||
if [ "$OVERWRITE_FIXTURES" != "false" ]; then
|
if [ "$OVERWRITE_FIXTURES" != "false" ]; then
|
||||||
# remove folder if it exists
|
# remove folder if it exists
|
||||||
|
|||||||
0
test_unstructured_ingest/check-num-dirs-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-dirs-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-expected-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-expected-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-files-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-rows-and-columns-output.sh
Normal file → Executable file
0
test_unstructured_ingest/check-num-rows-and-columns-output.sh
Normal file → Executable file
16
test_unstructured_ingest/cleanup.sh
Normal file
16
test_unstructured_ingest/cleanup.sh
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
|
||||||
|
function cleanup_dir() {
|
||||||
|
local dir_to_cleanup="${1}"
|
||||||
|
echo "--- Running cleanup of $dir_to_cleanup ---"
|
||||||
|
|
||||||
|
if [ -d "$dir_to_cleanup" ]; then
|
||||||
|
echo "cleaning up directory: $dir_to_cleanup"
|
||||||
|
rm -rf "$dir_to_cleanup"
|
||||||
|
else
|
||||||
|
echo "$dir_to_cleanup does not exist or is not a directory, skipping deletion"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "--- Cleanup done ---"
|
||||||
|
}
|
||||||
@ -1,12 +1,17 @@
|
|||||||
|
import click
|
||||||
from deltalake import DeltaTable
|
from deltalake import DeltaTable
|
||||||
|
|
||||||
|
|
||||||
def run_check():
|
@click.command()
|
||||||
|
@click.option("--table-uri", type=str)
|
||||||
|
def run_check(table_uri):
|
||||||
|
print(f"Checking contents of table at {table_uri}")
|
||||||
delta_table = DeltaTable(
|
delta_table = DeltaTable(
|
||||||
table_uri="/tmp/delta-table-dest",
|
table_uri=table_uri,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(delta_table.to_pandas()) == 10
|
assert len(delta_table.to_pandas()) == 10
|
||||||
|
print("table check complete")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -1,92 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"type": "Title",
|
|
||||||
"element_id": "0540311f6c077fe8f797080918b8d74b",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 1
|
|
||||||
},
|
|
||||||
"text": "My First Heading"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "Title",
|
|
||||||
"element_id": "399af454cb1368b8257ed406b430de84",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 1
|
|
||||||
},
|
|
||||||
"text": "My first paragraph."
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "Title",
|
|
||||||
"element_id": "b4cf0d13edfa976816649971bd640a66",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 1
|
|
||||||
},
|
|
||||||
"text": "Some CP1252-specific characters:"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "UncategorizedText",
|
|
||||||
"element_id": "ada7c3084f437d31d297f85da3941a55",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 2
|
|
||||||
},
|
|
||||||
"text": "¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "UncategorizedText",
|
|
||||||
"element_id": "dda5e8c4d245c1954ecb64e5dfea598d",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 3
|
|
||||||
},
|
|
||||||
"text": "°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "Title",
|
|
||||||
"element_id": "85df09b375e5813aefa3b5f30c8ddff8",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 4
|
|
||||||
},
|
|
||||||
"text": "À\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "Title",
|
|
||||||
"element_id": "2726d2569cd7a6cecb79a6e46bb0b2b3",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 5
|
|
||||||
},
|
|
||||||
"text": "Ð\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "Title",
|
|
||||||
"element_id": "2b01f3e428520f6e47d8513292688cf6",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 6
|
|
||||||
},
|
|
||||||
"text": "à\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "Title",
|
|
||||||
"element_id": "5ed256e41bfb169af5f50524b9593a16",
|
|
||||||
"metadata": {
|
|
||||||
"data_source": {},
|
|
||||||
"filetype": "text/html",
|
|
||||||
"page_number": 7
|
|
||||||
},
|
|
||||||
"text": "ð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
@ -11,6 +11,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
OUTPUT_FOLDER_NAME=api-ingest-output
|
OUTPUT_FOLDER_NAME=api-ingest-output
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
local \
|
local \
|
||||||
--api-key "$UNS_API_KEY" \
|
--api-key "$UNS_API_KEY" \
|
||||||
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--file-glob "*1p.txt" \
|
--file-glob "*1p.txt" \
|
||||||
--input-path example-docs
|
--input-path example-docs
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 1 $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=airtable-diff
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
VARIED_DATA_BASE_ID="app5YQxSfp220fWtm"
|
VARIED_DATA_BASE_ID="app5YQxSfp220fWtm"
|
||||||
VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"
|
VARIED_DATA_BASE_ID_2="appJ43QmP8I17zu88"
|
||||||
|
|
||||||
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--reprocess \
|
--reprocess \
|
||||||
--output-dir "$OUTPUT_DIR"
|
--output-dir "$OUTPUT_DIR"
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=airtable-large
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
|
if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then
|
||||||
echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
|
echo "Skipping Airtable ingest test because the AIRTABLE_PERSONAL_ACCESS_TOKEN is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -35,16 +39,16 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
# We are expecting fifteen directories: fourteen bases and the parent directory
|
# We are expecting fifteen directories: fourteen bases and the parent directory
|
||||||
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME"
|
"$SCRIPT_DIR"/check-num-dirs-output.sh 15 "$OUTPUT_FOLDER_NAME"
|
||||||
|
|
||||||
# We are expecting 101 files: 100 tables and the parent directory
|
# We are expecting 101 files: 100 tables and the parent directory
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/
|
"$SCRIPT_DIR"/check-num-files-output.sh 101 "$OUTPUT_FOLDER_NAME"/"$LARGE_BASE_BASE_ID"/
|
||||||
|
|
||||||
# Test on ingesting a large number of bases
|
# Test on ingesting a large number of bases
|
||||||
for i in {1..12}; do
|
for i in {1..12}; do
|
||||||
var="LARGE_WORKSPACE_BASE_ID_$i"
|
var="LARGE_WORKSPACE_BASE_ID_$i"
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}"
|
"$SCRIPT_DIR"/check-num-files-output.sh 12 "$OUTPUT_FOLDER_NAME"/"${!var}"
|
||||||
done
|
done
|
||||||
|
|
||||||
# Test on ingesting a table with lots of rows
|
# Test on ingesting a table with lots of rows
|
||||||
sh "$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json
|
"$SCRIPT_DIR"/check-num-rows-and-columns-output.sh 39999 "$OUTPUT_DIR"/"$LARGE_TABLE_BASE_ID"/"$LARGE_TABLE_TABLE_ID".json
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=azure
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
azure \
|
azure \
|
||||||
--download-dir "$DOWNLOAD_DIR" \
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--account-name azureunstructured1 \
|
--account-name azureunstructured1 \
|
||||||
--remote-url abfs://container1/
|
--remote-url abfs://container1/
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-api
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
biomed \
|
biomed \
|
||||||
@ -28,4 +32,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--max-request-time 30 \
|
--max-request-time 30 \
|
||||||
--max-retries 5 \
|
--max-retries 5 \
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -9,7 +9,11 @@ OUTPUT_FOLDER_NAME=biomed-path
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
biomed \
|
biomed \
|
||||||
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--max-retries 5 \
|
--max-retries 5 \
|
||||||
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
|
--path "oa_pdf/07/07/sbaa031.073.PMC7234218.pdf" \
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=box
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
|
if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then
|
||||||
echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
|
echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--reprocess \
|
--reprocess \
|
||||||
--verbose
|
--verbose
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -10,6 +10,10 @@ OUTPUT_FOLDER_NAME=confluence-diff
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
|
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
|
||||||
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
|
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--api-token "$CONFLUENCE_API_TOKEN" \
|
--api-token "$CONFLUENCE_API_TOKEN" \
|
||||||
--spaces testteamsp,MFS
|
--spaces testteamsp,MFS
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -12,6 +12,10 @@ OUTPUT_FOLDER_NAME=confluence-large
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
|
if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then
|
||||||
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
|
echo "Skipping Confluence ingest test because the CONFLUENCE_USER_EMAIL or CONFLUENCE_API_TOKEN env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -43,7 +47,7 @@ OUTPUT_SUBFOLDER_NAME=testteamsp1
|
|||||||
# Example:
|
# Example:
|
||||||
# Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large
|
# Output dir: unstructured/test_unstructured_ingest/structured-output/confluence-large
|
||||||
# Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1
|
# Space dir: unstructured/test_unstructured_ingest/structured-output/confluence-large/testteamsp1
|
||||||
sh "$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
|
"$SCRIPT_DIR"/check-num-dirs-output.sh 2 "$OUTPUT_FOLDER_NAME"
|
||||||
|
|
||||||
# We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided.
|
# We are expecting 250 files due to the --confluence-num-of-docs-from-each-space 250 that we provided.
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/
|
"$SCRIPT_DIR"/check-num-files-output.sh 250 "$OUTPUT_FOLDER_NAME"/"$OUTPUT_SUBFOLDER_NAME"/
|
||||||
|
|||||||
@ -7,25 +7,26 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
OUTPUT_FOLDER_NAME=delta-table
|
OUTPUT_FOLDER_NAME=delta-table
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
DESTINATION_TABLE=/tmp/delta-table-dest
|
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
|
||||||
|
|
||||||
if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
|
if [ -z "$AWS_ACCESS_KEY_ID" ] && [ -z "$AWS_SECRET_ACCESS_KEY" ]; then
|
||||||
echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set."
|
echo "Skipping Delta Table ingest test because either AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY env var was not set."
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
|
||||||
function cleanup() {
|
function cleanup() {
|
||||||
if [ -d "$DESTINATION_TABLE" ]; then
|
cleanup_dir "$DESTINATION_TABLE"
|
||||||
echo "cleaning up tmp directory: $DESTINATION_TABLE"
|
cleanup_dir "$OUTPUT_DIR"
|
||||||
rm -rf "$DESTINATION_TABLE"
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
trap cleanup EXIT
|
trap cleanup EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
delta-table \
|
delta-table \
|
||||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||||
--download-dir "$DOWNLOAD_DIR" \
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
|
--table-uri s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/ \
|
||||||
--output-dir "$OUTPUT_DIR" \
|
--output-dir "$OUTPUT_DIR" \
|
||||||
@ -34,8 +35,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--verbose \
|
--verbose \
|
||||||
delta-table \
|
delta-table \
|
||||||
--write-column json_data \
|
--write-column json_data \
|
||||||
--table-uri $DESTINATION_TABLE
|
--table-uri "$DESTINATION_TABLE"
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py
|
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=discord
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$DISCORD_TOKEN" ]; then
|
if [ -z "$DISCORD_TOKEN" ]; then
|
||||||
echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
|
echo "Skipping Discord ingest test because the DISCORD_TOKEN env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -24,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--channels 1099442333440802930,1099601456321003600 \
|
--channels 1099442333440802930,1099601456321003600 \
|
||||||
--token "$DISCORD_TOKEN" \
|
--token "$DISCORD_TOKEN" \
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=dropbox
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
|
if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then
|
||||||
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
|
echo "Skipping Dropbox ingest test because one or more of these env vars is not set:"
|
||||||
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
|
echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN"
|
||||||
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--remote-url "dropbox:// /"
|
--remote-url "dropbox:// /"
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -9,12 +9,24 @@ OUTPUT_FOLDER_NAME=elasticsearch
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
# shellcheck source=/dev/null
|
# shellcheck disable=SC1091
|
||||||
sh scripts/elasticsearch-test-helpers/create-and-check-es.sh
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
wait
|
|
||||||
|
|
||||||
# Kill the container so the script can be repeatedly run using the same ports
|
function cleanup() {
|
||||||
trap 'echo "Stopping Elasticsearch Docker container"; docker stop es-test' EXIT
|
# Kill the container so the script can be repeatedly run using the same ports
|
||||||
|
if docker ps --filter "name=es-test"; then
|
||||||
|
echo "Stopping Elasticsearch Docker container"
|
||||||
|
docker stop es-test
|
||||||
|
fi
|
||||||
|
|
||||||
|
cleanup_dir "$OUTPUT_DIR"
|
||||||
|
}
|
||||||
|
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
# shellcheck source=/dev/null
|
||||||
|
scripts/elasticsearch-test-helpers/create-and-check-es.sh
|
||||||
|
wait
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
elasticsearch \
|
elasticsearch \
|
||||||
@ -29,5 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--url http://localhost:9200 \
|
--url http://localhost:9200 \
|
||||||
--jq-query '{ethnicity, director, plot}'
|
--jq-query '{ethnicity, director, plot}'
|
||||||
|
|
||||||
echo "SCRIPT_DIR: $SCRIPT_DIR"
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gcs
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
||||||
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--remote-url gs://utic-test-ingest-fixtures/
|
--remote-url gs://utic-test-ingest-fixtures/
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=github
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none}
|
GH_READ_ONLY_ACCESS_TOKEN=${GH_READ_ONLY_ACCESS_TOKEN:-none}
|
||||||
|
|
||||||
ACCESS_TOKEN_FLAGS=""
|
ACCESS_TOKEN_FLAGS=""
|
||||||
@ -35,4 +39,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--git-file-glob '*.html,*.txt' \
|
--git-file-glob '*.html,*.txt' \
|
||||||
$ACCESS_TOKEN_FLAGS
|
$ACCESS_TOKEN_FLAGS
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=gitlab
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
gitlab \
|
gitlab \
|
||||||
--download-dir "$DOWNLOAD_DIR" \
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
@ -21,4 +25,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--git-file-glob '*.md,*.txt' \
|
--git-file-glob '*.md,*.txt' \
|
||||||
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab
|
--url https://gitlab.com/gitlab-com/content-sites/docsy-gitlab
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 2 $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=google-drive
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
|
||||||
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
|
||||||
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
|
echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
|
||||||
@ -32,4 +36,4 @@ PYTHONPATH=. unstructured/ingest/main.py \
|
|||||||
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE"
|
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE"
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=jira-diff
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then
|
if [ -z "$JIRA_INGEST_USER_EMAIL" ] || [ -z "$JIRA_INGEST_API_TOKEN" ]; then
|
||||||
echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set."
|
echo "Skipping Jira ingest test because the JIRA_INGEST_USER_EMAIL or JIRA_INGEST_API_TOKEN env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -52,4 +56,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
OUTPUT_FOLDER_NAME=local-single-file-with-encoding
|
OUTPUT_FOLDER_NAME=local-single-file-with-encoding
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
local \
|
local \
|
||||||
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||||
@ -18,4 +22,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
set +e
|
set +e
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
|
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
local \
|
local \
|
||||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||||
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
set +e
|
set +e
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
OUTPUT_FOLDER_NAME=local-single-file
|
OUTPUT_FOLDER_NAME=local-single-file
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
local \
|
local \
|
||||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||||
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
set +e
|
set +e
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -7,6 +7,10 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
OUTPUT_FOLDER_NAME=local
|
OUTPUT_FOLDER_NAME=local
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
local \
|
local \
|
||||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||||
@ -17,4 +21,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--file-glob "*.html" \
|
--file-glob "*.html" \
|
||||||
--input-path example-docs
|
--input-path example-docs
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 12 $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=notion
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$NOTION_API_KEY" ]; then
|
if [ -z "$NOTION_API_KEY" ]; then
|
||||||
echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set."
|
echo "Skipping Notion ingest test because the NOTION_API_KEY env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -25,4 +29,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--verbose
|
--verbose
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=onedrive
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then
|
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ]; then
|
||||||
echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set."
|
echo "Skipping OneDrive ingest test because the MS_CLIENT_ID, MS_CLIENT_CRED, MS_USER_PNAME env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -30,4 +34,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--path '/utic-test-ingest-fixtures' \
|
--path '/utic-test-ingest-fixtures' \
|
||||||
--recursive \
|
--recursive \
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=outlook
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
|
if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] || [ -z "$MS_USER_EMAIL" ]; then
|
||||||
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
|
echo "Skipping Outlook ingest test because the MS_CLIENT_ID or MS_CLIENT_CRED or MS_TENANT_ID or MS_USER_EMAIL env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -31,4 +35,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -9,6 +9,10 @@ OUTPUT_FOLDER_NAME=pdf-fast-reprocess
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
INPUT_PATH=$SCRIPT_DIR/download
|
INPUT_PATH=$SCRIPT_DIR/download
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
echo "REPROCESS INPUT PATH"
|
echo "REPROCESS INPUT PATH"
|
||||||
ls "$INPUT_PATH"
|
ls "$INPUT_PATH"
|
||||||
|
|
||||||
@ -26,4 +30,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -2,13 +2,18 @@
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
|
||||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||||
cd "$SCRIPT_DIR"/.. || exit 1
|
cd "$SCRIPT_DIR"/.. || exit 1
|
||||||
OUTPUT_FOLDER_NAME=s3
|
OUTPUT_FOLDER_NAME=s3
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
|
"$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 20k
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
s3 \
|
s3 \
|
||||||
@ -23,4 +28,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--anonymous
|
--anonymous
|
||||||
|
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -11,6 +11,10 @@ OUTPUT_FOLDER_NAME=salesforce
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then
|
if [ -z "$SALESFORCE_PRIVATE_KEY" ] && [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then
|
||||||
echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set."
|
echo "Skipping Salesforce ingest test because neither SALESFORCE_PRIVATE_KEY nor SALESFORCE_PRIVATE_KEY_PATH env vars are set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -37,4 +41,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--output-dir "$OUTPUT_DIR" \
|
--output-dir "$OUTPUT_DIR" \
|
||||||
--verbose
|
--verbose
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=Sharepoint
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
|
if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then
|
||||||
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
|
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -29,4 +33,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--path "Shared Documents" \
|
--path "Shared Documents" \
|
||||||
--recursive \
|
--recursive \
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=slack
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
if [ -z "$SLACK_TOKEN" ]; then
|
if [ -z "$SLACK_TOKEN" ]; then
|
||||||
echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set."
|
echo "Skipping Slack ingest test because the SLACK_TOKEN env var is not set."
|
||||||
exit 0
|
exit 0
|
||||||
@ -27,4 +31,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--start-date 2023-04-01 \
|
--start-date 2023-04-01 \
|
||||||
--end-date 2023-04-08T12:00:00-08:00
|
--end-date 2023-04-08T12:00:00-08:00
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -8,6 +8,10 @@ OUTPUT_FOLDER_NAME=wikipedia
|
|||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
# shellcheck disable=SC1091
|
||||||
|
source "$SCRIPT_DIR"/cleanup.sh
|
||||||
|
trap 'cleanup_dir "$OUTPUT_DIR"' EXIT
|
||||||
|
|
||||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
wikipedia \
|
wikipedia \
|
||||||
--download-dir "$DOWNLOAD_DIR" \
|
--download-dir "$DOWNLOAD_DIR" \
|
||||||
@ -19,4 +23,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--verbose \
|
--verbose \
|
||||||
--page-title "Open Source Software"
|
--page-title "Open Source Software"
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
|
"$SCRIPT_DIR"/check-num-files-output.sh 3 $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
set -eux -o pipefail
|
set -eu -o pipefail
|
||||||
|
|
||||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||||
cd "$SCRIPT_DIR"/.. || exit 1
|
cd "$SCRIPT_DIR"/.. || exit 1
|
||||||
@ -8,35 +8,55 @@ cd "$SCRIPT_DIR"/.. || exit 1
|
|||||||
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
|
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
|
||||||
export OMP_THREAD_LIMIT=1
|
export OMP_THREAD_LIMIT=1
|
||||||
|
|
||||||
./test_unstructured_ingest/test-ingest-s3.sh
|
scripts=(
|
||||||
./test_unstructured_ingest/test-ingest-azure.sh
|
'test-ingest-s3.sh'
|
||||||
./test_unstructured_ingest/test-ingest-box.sh
|
'test-ingest-azure.sh'
|
||||||
./test_unstructured_ingest/test-ingest-discord.sh
|
'test-ingest-box.sh'
|
||||||
./test_unstructured_ingest/test-ingest-dropbox.sh
|
'test-ingest-discord.sh'
|
||||||
./test_unstructured_ingest/test-ingest-github.sh
|
'test-ingest-dropbox.sh'
|
||||||
./test_unstructured_ingest/test-ingest-gitlab.sh
|
'test-ingest-github.sh'
|
||||||
./test_unstructured_ingest/test-ingest-google-drive.sh
|
'test-ingest-gitlab.sh'
|
||||||
./test_unstructured_ingest/test-ingest-wikipedia.sh
|
'test-ingest-google-drive.sh'
|
||||||
./test_unstructured_ingest/test-ingest-biomed-api.sh
|
'test-ingest-wikipedia.sh'
|
||||||
./test_unstructured_ingest/test-ingest-biomed-path.sh
|
'test-ingest-biomed-api.sh'
|
||||||
./test_unstructured_ingest/test-ingest-local.sh
|
'test-ingest-biomed-path.sh'
|
||||||
./test_unstructured_ingest/test-ingest-slack.sh
|
'test-ingest-local.sh'
|
||||||
./test_unstructured_ingest/test-ingest-against-api.sh
|
'test-ingest-slack.sh'
|
||||||
./test_unstructured_ingest/test-ingest-gcs.sh
|
'test-ingest-against-api.sh'
|
||||||
./test_unstructured_ingest/test-ingest-onedrive.sh
|
'test-ingest-gcs.sh'
|
||||||
./test_unstructured_ingest/test-ingest-outlook.sh
|
'test-ingest-onedrive.sh'
|
||||||
./test_unstructured_ingest/test-ingest-elasticsearch.sh
|
'test-ingest-outlook.sh'
|
||||||
./test_unstructured_ingest/test-ingest-confluence-diff.sh
|
'test-ingest-elasticsearch.sh'
|
||||||
./test_unstructured_ingest/test-ingest-confluence-large.sh
|
'test-ingest-confluence-diff.sh'
|
||||||
./test_unstructured_ingest/test-ingest-airtable-diff.sh
|
'test-ingest-confluence-large.sh'
|
||||||
./test_unstructured_ingest/test-ingest-airtable-large.sh
|
'test-ingest-airtable-diff.sh'
|
||||||
./test_unstructured_ingest/test-ingest-local-single-file.sh
|
'test-ingest-airtable-large.sh'
|
||||||
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
|
'test-ingest-local-single-file.sh'
|
||||||
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
|
'test-ingest-local-single-file-with-encoding.sh'
|
||||||
./test_unstructured_ingest/test-ingest-notion.sh
|
'test-ingest-local-single-file-with-pdf-infer-table-structure.sh'
|
||||||
./test_unstructured_ingest/test-ingest-delta-table.sh
|
'test-ingest-notion.sh'
|
||||||
./test_unstructured_ingest/test-ingest-salesforce.sh
|
'test-ingest-delta-table.sh'
|
||||||
./test_unstructured_ingest/test-ingest-jira.sh
|
'test-ingest-salesforce.sh'
|
||||||
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
'test-ingest-jira.sh'
|
||||||
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
|
## NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
||||||
./test_unstructured_ingest/test-ingest-sharepoint.sh
|
'test-ingest-pdf-fast-reprocess.sh'
|
||||||
|
'test-ingest-sharepoint.sh'
|
||||||
|
)
|
||||||
|
|
||||||
|
CURRENT_SCRIPT="none"
|
||||||
|
|
||||||
|
function print_last_run() {
|
||||||
|
if [ "$CURRENT_SCRIPT" != "none" ]; then
|
||||||
|
echo "Last ran script: $CURRENT_SCRIPT"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
trap print_last_run EXIT
|
||||||
|
|
||||||
|
for script in "${scripts[@]}"; do
|
||||||
|
CURRENT_SCRIPT=$script
|
||||||
|
echo "--------- RUNNING SCRIPT $script ---------"
|
||||||
|
echo "Running ./test_unstructured_ingest/$script"
|
||||||
|
./test_unstructured_ingest/"$script"
|
||||||
|
echo "--------- FINISHED SCRIPT $script ---------"
|
||||||
|
done
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.10.17-dev2" # pragma: no cover
|
__version__ = "0.10.17-dev3" # pragma: no cover
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user