mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 21:55:33 +00:00
test: update ingest dest tests to follow set pattern (#1991)
### Description
Update all destination tests to match the set pattern:
* Don't omit any metadata, so the full schema is checked
* Move the Azure Cognitive Search dest test from src to dest
* Split the delta table test into separate src and dest tests
* Fix the Azure Cognitive Search test and add it to the dest tests being run (it wasn't being run originally)
This commit is contained in:
parent
668bd2967e
commit
d09c8c0cab
@ -1,4 +1,4 @@
|
||||
## 0.10.29-dev10
|
||||
## 0.10.29-dev11
|
||||
|
||||
### Enhancements
|
||||
|
||||
|
||||
@ -82,6 +82,10 @@
|
||||
"name": "date_processed",
|
||||
"type": "Edm.DateTimeOffset"
|
||||
},
|
||||
{
|
||||
"name": "permissions_data",
|
||||
"type": "Edm.String"
|
||||
},
|
||||
{
|
||||
"name": "record_locator",
|
||||
"type": "Edm.String"
|
||||
@ -114,6 +118,10 @@
|
||||
"name": "page_number",
|
||||
"type": "Edm.String"
|
||||
},
|
||||
{
|
||||
"name": "links",
|
||||
"type": "Collection(Edm.String)"
|
||||
},
|
||||
{
|
||||
"name": "url",
|
||||
"type": "Edm.String"
|
||||
|
||||
@ -5,10 +5,12 @@ set -e
|
||||
SRC_PATH=$(dirname "$(realpath "$0")")
|
||||
SCRIPT_DIR=$(dirname "$SRC_PATH")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=s3-azure-cog-search-dest
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_FOLDER_NAME=azure-cog-search-dest
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(date +%s)"
|
||||
# The vector configs on the schema currently only exist on versions:
|
||||
@ -65,17 +67,14 @@ fi
|
||||
|
||||
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
||||
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
s3 \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--strategy fast \
|
||||
--preserve-downloads \
|
||||
--reprocess \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--verbose \
|
||||
--remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--anonymous \
|
||||
--work-dir "$WORK_DIR" \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--strategy fast \
|
||||
--verbose \
|
||||
--reprocess \
|
||||
--input-path example-docs/fake-memo.pdf \
|
||||
--work-dir "$WORK_DIR" \
|
||||
azure-cognitive-search \
|
||||
--key "$AZURE_SEARCH_API_KEY" \
|
||||
--endpoint "$AZURE_SEARCH_ENDPOINT" \
|
||||
@ -39,7 +39,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
||||
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--strategy fast \
|
||||
--verbose \
|
||||
|
||||
@ -41,7 +41,6 @@
|
||||
#PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
# local \
|
||||
# --num-processes "$max_processes" \
|
||||
# --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
# --output-dir "$OUTPUT_DIR" \
|
||||
# --strategy fast \
|
||||
# --verbose \
|
||||
|
||||
43
test_unstructured_ingest/dest/delta-table.sh
Executable file
43
test_unstructured_ingest/dest/delta-table.sh
Executable file
@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
SRC_PATH=$(dirname "$(realpath "$0")")
|
||||
SCRIPT_DIR=$(dirname "$SRC_PATH")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=delta-table-dest
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
CI=${CI:-"false"}
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
|
||||
function cleanup() {
|
||||
cleanup_dir "$DESTINATION_TABLE"
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
if [ "$CI" == "true" ]; then
|
||||
cleanup_dir "$DOWNLOAD_DIR"
|
||||
fi
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--strategy fast \
|
||||
--verbose \
|
||||
--reprocess \
|
||||
--input-path example-docs/fake-memo.pdf \
|
||||
--work-dir "$WORK_DIR" \
|
||||
delta-table \
|
||||
--write-column json_data \
|
||||
--table-uri "$DESTINATION_TABLE"
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
|
||||
@ -61,7 +61,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
||||
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--strategy fast \
|
||||
--verbose \
|
||||
|
||||
@ -44,7 +44,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
||||
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--strategy fast \
|
||||
--verbose \
|
||||
|
||||
@ -34,7 +34,6 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
|
||||
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--strategy fast \
|
||||
--verbose \
|
||||
|
||||
@ -81,6 +81,10 @@
|
||||
"name": "date_processed",
|
||||
"type": "Edm.DateTimeOffset"
|
||||
},
|
||||
{
|
||||
"name": "permissions_data",
|
||||
"type": "Edm.String"
|
||||
},
|
||||
{
|
||||
"name": "record_locator",
|
||||
"type": "Edm.String"
|
||||
@ -117,6 +121,10 @@
|
||||
"name": "page_number",
|
||||
"type": "Edm.String"
|
||||
},
|
||||
{
|
||||
"name": "links",
|
||||
"type": "Collection(Edm.String)"
|
||||
},
|
||||
{
|
||||
"name": "page_name",
|
||||
"type": "Edm.String"
|
||||
|
||||
@ -10,7 +10,15 @@ def run_check(table_uri):
|
||||
table_uri=table_uri,
|
||||
)
|
||||
|
||||
assert len(delta_table.to_pandas()) == 10
|
||||
expected_rows = 5
|
||||
found_rows = len(delta_table.to_pandas())
|
||||
print(
|
||||
f"Checking if expected number of rows ({expected_rows}) "
|
||||
f"matches how many were found: {found_rows}"
|
||||
)
|
||||
assert (
|
||||
expected_rows == found_rows
|
||||
), f"expected number of rows doesn't match how many were found: {expected_rows}/{found_rows}"
|
||||
print("table check complete")
|
||||
|
||||
|
||||
|
||||
@ -10,7 +10,6 @@ OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
|
||||
DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
CI=${CI:-"false"}
|
||||
|
||||
@ -23,7 +22,6 @@ fi
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
|
||||
function cleanup() {
|
||||
cleanup_dir "$DESTINATION_TABLE"
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
if [ "$CI" == "true" ]; then
|
||||
@ -44,13 +42,8 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
--storage_options "AWS_REGION=us-east-2,AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID,AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" \
|
||||
--preserve-downloads \
|
||||
--verbose \
|
||||
--work-dir "$WORK_DIR" \
|
||||
delta-table \
|
||||
--write-column json_data \
|
||||
--table-uri "$DESTINATION_TABLE"
|
||||
--work-dir "$WORK_DIR"
|
||||
|
||||
"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
||||
|
||||
python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
|
||||
|
||||
"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"
|
||||
|
||||
@ -10,7 +10,9 @@ export OMP_THREAD_LIMIT=1
|
||||
|
||||
all_tests=(
|
||||
'azure.sh'
|
||||
'azure-cognitive-search.sh'
|
||||
'box.sh'
|
||||
'delta-table.sh'
|
||||
'dropbox.sh'
|
||||
'gcs.sh'
|
||||
's3.sh'
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.10.29-dev10" # pragma: no cover
|
||||
__version__ = "0.10.29-dev11" # pragma: no cover
|
||||
|
||||
@ -61,6 +61,12 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
|
||||
data["metadata"]["data_source"]["version"] = str(version)
|
||||
if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"):
|
||||
data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator)
|
||||
if permissions_data := (
|
||||
data.get("metadata", {}).get("data_source", {}).get("permissions_data")
|
||||
):
|
||||
data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data)
|
||||
if links := data.get("metadata", {}).get("links"):
|
||||
data["metadata"]["links"] = [json.dumps(link) for link in links]
|
||||
if last_modified := data.get("metadata", {}).get("last_modified"):
|
||||
data["metadata"]["last_modified"] = parser.parse(last_modified).strftime(
|
||||
"%Y-%m-%dT%H:%M:%S.%fZ",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user