2023-09-26 19:24:21 -04:00
#!/usr/bin/env bash
set -e
2023-11-01 15:23:44 -04:00
SRC_PATH = $( dirname " $( realpath " $0 " ) " )
SCRIPT_DIR = $( dirname " $SRC_PATH " )
2023-09-26 19:24:21 -04:00
cd " $SCRIPT_DIR " /.. || exit 1
OUTPUT_FOLDER_NAME = sharepoint-azure-dest
2023-11-02 16:41:56 -05:00
OUTPUT_ROOT = ${ OUTPUT_ROOT :- $SCRIPT_DIR }
OUTPUT_DIR = $OUTPUT_ROOT /structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR = $OUTPUT_ROOT /workdir/$OUTPUT_FOLDER_NAME
2023-09-26 19:24:21 -04:00
DOWNLOAD_DIR = $SCRIPT_DIR /download/$OUTPUT_FOLDER_NAME
2023-11-07 15:14:01 -08:00
DESTINATION_INDEX = " utic-test-ingest-fixtures-output- $( uuidgen) "
2023-09-26 19:24:21 -04:00
# The vector configs on the schema currently only exist on versions:
# 2023-07-01-Preview, 2021-04-30-Preview, 2020-06-30-Preview
API_VERSION = 2023-07-01-Preview
2023-10-02 16:47:24 -04:00
CI = ${ CI :- "false" }
2023-09-26 19:24:21 -04:00
2023-12-11 20:04:15 -05:00
if [ -z " $SHAREPOINT_CLIENT_ID " ] || [ -z " $SHAREPOINT_CRED " ] ; then
2023-12-18 23:48:21 -08:00
echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set."
exit 8
2023-09-26 19:24:21 -04:00
fi
2023-12-11 20:04:15 -05:00
if [ -z " $SHAREPOINT_PERMISSIONS_APP_ID " ] || [ -z " $SHAREPOINT_PERMISSIONS_APP_CRED " ] || [ -z " $SHAREPOINT_PERMISSIONS_TENANT " ] ; then
2023-12-18 23:48:21 -08:00
echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set."
exit 8
2023-10-13 01:38:08 +01:00
fi
2023-09-26 19:24:21 -04:00
if [ -z " $OPENAI_API_KEY " ] ; then
2023-12-18 23:48:21 -08:00
echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set."
exit 8
2023-09-26 19:24:21 -04:00
fi
if [ -z " $AZURE_SEARCH_ENDPOINT " ] && [ -z " $AZURE_SEARCH_API_KEY " ] ; then
2023-12-18 23:48:21 -08:00
echo "Skipping Sharepoint Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set."
exit 8
2023-09-26 19:24:21 -04:00
fi
2023-10-02 16:47:24 -04:00
# shellcheck disable=SC1091
source " $SCRIPT_DIR " /cleanup.sh
2023-09-26 19:24:21 -04:00
function cleanup {
2023-12-18 23:48:21 -08:00
response_code = $( curl -s -o /dev/null -w "%{http_code}" \
" https://utic-test-ingest-fixtures.search.windows.net/indexes/ $DESTINATION_INDEX ?api-version= $API_VERSION " \
--header " api-key: $AZURE_SEARCH_API_KEY " \
--header 'content-type: application/json' )
if [ " $response_code " = = "200" ] ; then
echo " deleting index $DESTINATION_INDEX "
curl -X DELETE \
" https://utic-test-ingest-fixtures.search.windows.net/indexes/ $DESTINATION_INDEX ?api-version= $API_VERSION " \
--header " api-key: $AZURE_SEARCH_API_KEY " \
--header 'content-type: application/json'
else
echo " Index $DESTINATION_INDEX does not exist, nothing to delete "
fi
cleanup_dir " $OUTPUT_DIR "
cleanup_dir " $WORK_DIR "
if [ " $CI " = = "true" ] ; then
cleanup_dir " $DOWNLOAD_DIR "
fi
2023-09-26 19:24:21 -04:00
}
trap cleanup EXIT
# Create index
echo " Creating index $DESTINATION_INDEX "
response_code = $( curl -s -o /dev/null -w "%{http_code}" -X PUT \
2023-12-18 23:48:21 -08:00
" https://utic-test-ingest-fixtures.search.windows.net/indexes/ $DESTINATION_INDEX ?api-version= $API_VERSION " \
--header " api-key: $AZURE_SEARCH_API_KEY " \
--header 'content-type: application/json' \
--data " @ $SCRIPT_DIR /files/azure_cognitive_index_schema.json " )
2023-09-26 19:24:21 -04:00
if [ " $response_code " -lt 400 ] ; then
2023-12-18 23:48:21 -08:00
echo " Index creation success: $response_code "
2023-09-26 19:24:21 -04:00
else
2023-12-18 23:48:21 -08:00
echo " Index creation failure: $response_code "
exit 1
2023-09-26 19:24:21 -04:00
fi
2023-11-02 16:41:56 -05:00
RUN_SCRIPT = ${ RUN_SCRIPT :- ./unstructured/ingest/main.py }
PYTHONPATH = ${ PYTHONPATH :- . } " $RUN_SCRIPT " \
2023-12-18 23:48:21 -08:00
sharepoint \
--download-dir " $DOWNLOAD_DIR " \
--metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes 2 \
--strategy hi_res \
--preserve-downloads \
--reprocess \
--output-dir " $OUTPUT_DIR " \
--verbose \
--client-cred " $SHAREPOINT_CRED " \
--client-id " $SHAREPOINT_CLIENT_ID " \
--site " $SHAREPOINT_SITE " \
--permissions-application-id " $SHAREPOINT_PERMISSIONS_APP_ID " \
--permissions-client-cred " $SHAREPOINT_PERMISSIONS_APP_CRED " \
--permissions-tenant " $SHAREPOINT_PERMISSIONS_TENANT " \
--path "Shared Documents" \
--recursive \
--embedding-provider "langchain-huggingface" \
2024-05-21 13:01:49 -04:00
--chunking-strategy by_title \
2023-12-18 23:48:21 -08:00
--chunk-multipage-sections \
--work-dir " $WORK_DIR " \
azure-cognitive-search \
--key " $AZURE_SEARCH_API_KEY " \
--endpoint " $AZURE_SEARCH_ENDPOINT " \
--index " $DESTINATION_INDEX "
2023-09-26 19:24:21 -04:00
# It can take some time for the index to catch up with the content that was written, this check between 10s sleeps
# to give it that time process the writes. Will timeout after checking for a minute.
docs_count_remote = 0
attempt = 1
while [ " $docs_count_remote " -eq 0 ] && [ " $attempt " -lt 6 ] ; do
2023-12-18 23:48:21 -08:00
echo " attempt $attempt : sleeping 10 seconds to let index finish catching up after writes "
sleep 10
2023-09-26 19:24:21 -04:00
2023-12-18 23:48:21 -08:00
# Check the contents of the index
docs_count_remote = $( curl " https://utic-test-ingest-fixtures.search.windows.net/indexes/ $DESTINATION_INDEX /docs/\$count?api-version= $API_VERSION " \
--header " api-key: $AZURE_SEARCH_API_KEY " \
--header 'content-type: application/json' | jq)
2023-09-26 19:24:21 -04:00
2023-12-18 23:48:21 -08:00
echo " docs count pulled from Azure: $docs_count_remote "
2023-09-26 19:24:21 -04:00
2023-12-18 23:48:21 -08:00
attempt = $(( attempt + 1 ))
2023-09-26 19:24:21 -04:00
done
docs_count_local = 0
for i in $( jq length " $OUTPUT_DIR " /**/*.json) ; do
2023-12-18 23:48:21 -08:00
docs_count_local = $(( docs_count_local + i))
2023-09-26 19:24:21 -04:00
done
2023-12-11 20:04:15 -05:00
if [ " $docs_count_remote " -ne " $docs_count_local " ] ; then
2023-12-18 23:48:21 -08:00
echo " Number of docs $docs_count_remote doesn't match the expected docs: $docs_count_local "
exit 1
2023-09-26 19:24:21 -04:00
fi