diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2003a11fd..710ef8a7b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -184,6 +184,7 @@ jobs: GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} + GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} run: | source .venv/bin/activate sudo apt-get update @@ -194,6 +195,7 @@ jobs: make install-ingest-s3 make install-ingest-azure make install-ingest-discord + make install-ingest-google-drive make install-ingest-github make install-ingest-gitlab make install-ingest-slack diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 06f6393ab..295b257b6 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -61,6 +61,7 @@ jobs: GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} + GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} OVERWRITE_FIXTURES: "true" run: | source .venv/bin/activate @@ -72,6 +73,7 @@ jobs: make install-ingest-s3 make install-ingest-azure make install-ingest-discord + make install-ingest-google-drive make install-ingest-github make install-ingest-gitlab make install-ingest-slack diff --git a/test_unstructured_ingest/expected-structured-output/google-drive-output/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json b/test_unstructured_ingest/expected-structured-output/google-drive-output/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json new file mode 100644 index 000000000..171179edf --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/google-drive-output/117qrVqiCoR5EjYMsDHGdy3UMkEtKr9Q8-test-drive-doc.docx.json @@ -0,0 +1,20 @@ +[ + { + "element_id": 
"7e8cd2056da73a7fefb6cd91f4e5d199", + "text": "Title", + "type": "Title", + "metadata": { + "data_source": {}, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } + }, + { + "element_id": "9870998df89c1da4e01378d0fd085106", + "text": "This is a good reason to continue", + "type": "NarrativeText", + "metadata": { + "data_source": {}, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/google-drive-output/1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o-fake.docx.json b/test_unstructured_ingest/expected-structured-output/google-drive-output/1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o-fake.docx.json new file mode 100644 index 000000000..4cc4fd68e --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/google-drive-output/1SpQuE7jHz9nMt5hfQXsiok1SgIdRYX5o-fake.docx.json @@ -0,0 +1,11 @@ +[ + { + "element_id": "dd14cbbf0e74909aac7f248a85d190af", + "text": "Lorem ipsum dolor sit amet.", + "type": "Title", + "metadata": { + "data_source": {}, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/google-drive-output/1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC-foo.txt.json b/test_unstructured_ingest/expected-structured-output/google-drive-output/1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC-foo.txt.json new file mode 100644 index 000000000..9937bd6ea --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/google-drive-output/1cTKXAreuj-wYmL38nFnqKvz3X8UKcaMC-foo.txt.json @@ -0,0 +1,11 @@ +[ + { + "element_id": "8b5b9db0c13db24256c829aa364aa90c", + "text": "three", + "type": "Title", + "metadata": { + "data_source": {}, + "filetype": "text/plain" + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/test-ingest-google-drive.sh 
b/test_unstructured_ingest/test-ingest-google-drive.sh
new file mode 100755
index 000000000..0dfef11b0
--- /dev/null
+++ b/test_unstructured_ingest/test-ingest-google-drive.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+#
+# Ingest test for the Google Drive connector: runs unstructured/ingest/main.py
+# against the Drive folder given by --drive-id and diffs the structured output
+# against the checked-in fixtures. Exits 0 without running anything when
+# GCP_INGEST_SERVICE_KEY is not set.
+
+set -e
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+cd "$SCRIPT_DIR"/.. || exit 1
+
+if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then
+    echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set."
+    echo "The Google Drive test content can be found at https://drive.google.com/drive/folders/1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr"
+    exit 0
+fi
+
+# The key is handed to main.py as a file path (--drive-service-account-key), so
+# write it from the environment to a temporary file. The EXIT trap removes that
+# file on every exit path so the credential is not left behind on disk.
+GCP_INGEST_SERVICE_KEY_FILE=$(mktemp)
+trap 'rm -f "$GCP_INGEST_SERVICE_KEY_FILE"' EXIT
+# printf writes the value verbatim; echo can treat some content as options.
+printf '%s\n' "$GCP_INGEST_SERVICE_KEY" > "$GCP_INGEST_SERVICE_KEY_FILE"
+
+PYTHONPATH=. unstructured/ingest/main.py \
+    --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
+    --drive-id 1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr \
+    --drive-service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \
+    --structured-output-dir google-drive-output \
+    --download-dir files-ingest-download/google-drive \
+    --partition-strategy hi_res \
+    --preserve-downloads \
+    --reprocess \
+    --num-processes 2
+
+OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
+
+# Disable exit-on-error for the fixture comparison step below.
+set +e
+
+# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
+if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
+
+    cp google-drive-output/* test_unstructured_ingest/expected-structured-output/google-drive-output/
+
+elif ! diff -ru test_unstructured_ingest/expected-structured-output/google-drive-output google-drive-output ; then
+
+    echo
+    echo "There are differences from the previously checked-in structured outputs."
+    echo
+    echo "If these differences are acceptable, overwrite the fixtures by setting the env var:"
+    echo
+    echo "  export OVERWRITE_FIXTURES=true"
+    echo
+    echo "and then rerun this script."
+    echo
+    echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
+    echo "to update fixtures for CI."
+    echo
+    exit 1
+
+fi
diff --git a/test_unstructured_ingest/test-ingest.sh b/test_unstructured_ingest/test-ingest.sh
index 9b5fdda24..ff0569d93 100755
--- a/test_unstructured_ingest/test-ingest.sh
+++ b/test_unstructured_ingest/test-ingest.sh
@@ -13,6 +13,7 @@ export OMP_THREAD_LIMIT=1
 ./test_unstructured_ingest/test-ingest-discord.sh
 ./test_unstructured_ingest/test-ingest-github.sh
 ./test_unstructured_ingest/test-ingest-gitlab.sh
+./test_unstructured_ingest/test-ingest-google-drive.sh
 ./test_unstructured_ingest/test-ingest-wikipedia.sh
 ./test_unstructured_ingest/test-ingest-biomed-api.sh
 ./test_unstructured_ingest/test-ingest-biomed-path.sh