test: update ingest dest tests to follow set pattern (#1991)

### Description
Update all destination tests to match the common pattern:
* Don't omit any metadata, so the full output schema is checked
* Move the azure cognitive search destination test from the source test suite to the destination test suite
* Split the delta table test into separate source and destination tests
* Fix the azure cognitive search test and add it to the destination tests being run (it wasn't being run originally)
Roman Isecke 2023-11-03 08:46:56 -04:00 committed by GitHub
parent 668bd2967e
commit d09c8c0cab
15 changed files with 90 additions and 28 deletions

View File

@@ -1,4 +1,4 @@
-## 0.10.29-dev10
+## 0.10.29-dev11
### Enhancements

View File

@@ -82,6 +82,10 @@
"name": "date_processed",
"type": "Edm.DateTimeOffset"
},
+{
+"name": "permissions_data",
+"type": "Edm.String"
+},
{
"name": "record_locator",
"type": "Edm.String"
@@ -114,6 +118,10 @@
"name": "page_number",
"type": "Edm.String"
},
+{
+"name": "links",
+"type": "Collection(Edm.String)"
+},
{
"name": "url",
"type": "Edm.String"

View File

@@ -5,10 +5,12 @@ set -e
SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=s3-azure-cog-search-dest
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
+OUTPUT_FOLDER_NAME=azure-cog-search-dest
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)"
# The vector configs on the schema currently only exist on versions:
@@ -65,17 +67,14 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
-s3 \
---download-dir "$DOWNLOAD_DIR" \
---metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
---strategy fast \
---preserve-downloads \
---reprocess \
---output-dir "$OUTPUT_DIR" \
---verbose \
---remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
---anonymous \
---work-dir "$WORK_DIR" \
+local \
+--num-processes "$max_processes" \
+--output-dir "$OUTPUT_DIR" \
+--strategy fast \
+--verbose \
+--reprocess \
+--input-path example-docs/fake-memo.pdf \
+--work-dir "$WORK_DIR" \
azure-cognitive-search \
--key "$AZURE_SEARCH_API_KEY" \
--endpoint "$AZURE_SEARCH_ENDPOINT" \
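The destination index name is generated fresh per run, so this test can't be verified by diffing fixture output; the verification step isn't shown in this hunk. For illustration only (not part of the commit), a minimal sketch of such a check using the `azure-search-documents` package, reusing the same `AZURE_SEARCH_ENDPOINT`/`AZURE_SEARCH_API_KEY` environment variables; the `DESTINATION_INDEX` fallback value here is an assumption:

```python
# Illustrative only (not from this commit): check that the ingest run
# actually wrote documents into the freshly created destination index.
import os

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Hypothetical: the test generates the index name per run via
# "utic-test-ingest-fixtures-output-$(date +%s)".
index_name = os.environ.get("DESTINATION_INDEX", "utic-test-ingest-fixtures-output-0")

client = SearchClient(
    endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],
    index_name=index_name,
    credential=AzureKeyCredential(os.environ["AZURE_SEARCH_API_KEY"]),
)

# A non-empty index is the minimal sanity check after the pipeline finishes.
doc_count = client.get_document_count()
print(f"documents in {index_name}: {doc_count}")
assert doc_count > 0, f"no documents found in index {index_name}"
```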

View File

@@ -39,7 +39,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
---metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \

View File

@@ -41,7 +41,6 @@
#PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
# local \
# --num-processes "$max_processes" \
-# --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
# --output-dir "$OUTPUT_DIR" \
# --strategy fast \
# --verbose \

View File

@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+set -e
+SRC_PATH=$(dirname "$(realpath "$0")")
+SCRIPT_DIR=$(dirname "$SRC_PATH")
+cd "$SCRIPT_DIR"/.. || exit 1
+OUTPUT_FOLDER_NAME=delta-table-dest
+OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
+WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
+DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
+DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
+max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
+CI=${CI:-"false"}
+# shellcheck disable=SC1091
+source "$SCRIPT_DIR"/cleanup.sh
+function cleanup() {
+cleanup_dir "$DESTINATION_TABLE"
+cleanup_dir "$OUTPUT_DIR"
+cleanup_dir "$WORK_DIR"
+if [ "$CI" == "true" ]; then
+cleanup_dir "$DOWNLOAD_DIR"
+fi
+}
+trap cleanup EXIT
+PYTHONPATH=. ./unstructured/ingest/main.py \
+local \
+--num-processes "$max_processes" \
+--output-dir "$OUTPUT_DIR" \
+--strategy fast \
+--verbose \
+--reprocess \
+--input-path example-docs/fake-memo.pdf \
+--work-dir "$WORK_DIR" \
+delta-table \
+--write-column json_data \
+--table-uri "$DESTINATION_TABLE"
+python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"

View File

@@ -61,7 +61,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
---metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \

View File

@@ -44,7 +44,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
---metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \

View File

@@ -34,7 +34,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--num-processes "$max_processes" \
---metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
--strategy fast \
--verbose \

View File

@@ -81,6 +81,10 @@
"name": "date_processed",
"type": "Edm.DateTimeOffset"
},
+{
+"name": "permissions_data",
+"type": "Edm.String"
+},
{
"name": "record_locator",
"type": "Edm.String"
@@ -117,6 +121,10 @@
"name": "page_number",
"type": "Edm.String"
},
+{
+"name": "links",
+"type": "Collection(Edm.String)"
+},
{
"name": "page_name",
"type": "Edm.String"

View File

@@ -10,7 +10,15 @@ def run_check(table_uri):
table_uri=table_uri,
)
-assert len(delta_table.to_pandas()) == 10
+expected_rows = 5
+found_rows = len(delta_table.to_pandas())
+print(
+f"Checking if expected number of rows ({expected_rows}) "
+f"matches how many were found: {found_rows}"
+)
+assert (
+expected_rows == found_rows
+), f"expected number of rows doesn't match how many were found: {expected_rows}/{found_rows}"
print("table check complete")

View File

@@ -10,7 +10,6 @@ OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
-DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
CI=${CI:-"false"}
@@ -23,7 +22,6 @@ fi
source "$SCRIPT_DIR"/cleanup.sh
function cleanup() {
-cleanup_dir "$DESTINATION_TABLE"
cleanup_dir "$OUTPUT_DIR"
cleanup_dir "$WORK_DIR"
if [ "$CI" == "true" ]; then
@@ -44,13 +42,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
--preserve-downloads \
--verbose \
---work-dir "$WORK_DIR" \
-delta-table \
---write-column json_data \
---table-uri "$DESTINATION_TABLE"
+--work-dir "$WORK_DIR"
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

View File

@@ -10,7 +10,9 @@ export OMP_THREAD_LIMIT=1
all_tests=(
'azure.sh'
+'azure-cognitive-search.sh'
'box.sh'
+'delta-table.sh'
'dropbox.sh'
'gcs.sh'
's3.sh'

View File

@@ -1 +1 @@
-__version__ = "0.10.29-dev10" # pragma: no cover
+__version__ = "0.10.29-dev11" # pragma: no cover

View File

@@ -61,6 +61,12 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
data["metadata"]["data_source"]["version"] = str(version)
if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"):
data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator)
+if permissions_data := (
+data.get("metadata", {}).get("data_source", {}).get("permissions_data")
+):
+data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
+if links := data.get("metadata", {}).get("links"):
+data["metadata"]["links"] = [json.dumps(link) for link in links]
if last_modified := data.get("metadata", {}).get("last_modified"):
data["metadata"]["last_modified"] = parser.parse(last_modified).strftime(
"%Y-%m-%dT%H:%M:%S.%fZ",