diff --git a/.github/actions/base-ingest-cache/action.yml b/.github/actions/base-ingest-cache/action.yml index b83a833cf..f29d86764 100644 --- a/.github/actions/base-ingest-cache/action.yml +++ b/.github/actions/base-ingest-cache/action.yml @@ -39,7 +39,7 @@ runs: python -m pip install --upgrade setuptools fi make install-ci - make install-all-ingest + make install-ingest - name: Save Ingest Cache if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true' id: ingest-virtualenv-cache-save diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 142578885..12c261ffb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,7 +72,6 @@ jobs: - name: Install all doc and test dependencies run: | make install-ci - make install-all-ingest make check-licenses lint: @@ -273,37 +272,6 @@ jobs: python-version: ${{ matrix.python-version }} check-only: 'true' - test_ingest_unit: - strategy: - matrix: - python-version: [ "3.9","3.10" ] - runs-on: ubuntu-latest - needs: [ setup_ingest, lint ] - steps: - # actions/checkout MUST come before auth - - uses: 'actions/checkout@v4' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Get full Python version - id: full-python-version - run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT - - name: Setup virtual environment - uses: ./.github/actions/base-ingest-cache - with: - python-version: ${{ matrix.python-version }} - - name: Test Ingest (unit) - env: - NLTK_DATA: ${{ github.workspace }}/nltk_data - PYTHON: python${{ matrix.python-version }} - run: | - source .venv/bin/activate - make install-ci - make install-all-ingest - PYTHONPATH=. 
${PYTHON} -m pytest test_unstructured_ingest/unit - - test_ingest_src: strategy: matrix: @@ -378,8 +346,6 @@ jobs: PYTHON: python${{ matrix.python-version }} run: | source .venv/bin/activate - make install-ci - make install-all-ingest sudo apt-get update sudo apt-get install -y libmagic-dev poppler-utils libreoffice make install-pandoc @@ -392,103 +358,6 @@ jobs: ./test_unstructured_ingest/test-ingest-src.sh - test_ingest_dest: - environment: ci - strategy: - matrix: - python-version: ["3.9","3.10"] - runs-on: ubuntu-latest-m - needs: [setup_ingest, lint] - steps: - # actions/checkout MUST come before auth - - uses: 'actions/checkout@v4' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Get full Python version - id: full-python-version - run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT - - name: Setup virtual environment - uses: ./.github/actions/base-ingest-cache - with: - python-version: ${{ matrix.python-version }} - - name: Setup docker-compose - uses: KengoTODA/actions-setup-docker-compose@v1 - with: - version: '2.22.0' - - name: Test (end-to-end) - env: - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }} - S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }} - AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }} - AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }} - BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }} - DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }} - DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }} - DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }} - GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - MONGODB_URI: ${{ secrets.MONGODB_URI }} - MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }} - AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }} - PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}} - VECTARA_OAUTH_CLIENT_ID: ${{secrets.VECTARA_OAUTH_CLIENT_ID}} - VECTARA_OAUTH_SECRET: ${{secrets.VECTARA_OAUTH_SECRET}} - VECTARA_CUSTOMER_ID: ${{secrets.VECTARA_CUSTOMER_ID}} - ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}} - ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}} - CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}} - DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}} - DATABRICKS_USERNAME: ${{secrets.DATABRICKS_USERNAME}} - DATABRICKS_PASSWORD: ${{secrets.DATABRICKS_PASSWORD}} - DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}} - OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" - CI: "true" - NLTK_DATA: ${{ github.workspace }}/nltk_data - PYTHON: python${{ matrix.python-version }} - run: | - source .venv/bin/activate - make install-ci - make install-all-ingest - sudo apt-get update - sudo apt-get install -y libmagic-dev poppler-utils libreoffice - make install-pandoc - sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 - sudo apt-get update - sudo apt-get install -y tesseract-ocr - sudo apt-get install -y tesseract-ocr-kor - sudo apt-get install diffstat - tesseract --version - ./test_unstructured_ingest/test-ingest-dest.sh - - test_ingest_help: - environment: ci - strategy: - matrix: - python-version: ["3.9","3.10","3.11", "3.12"] - runs-on: ubuntu-latest - needs: [setup_ingest, lint] - steps: - - 
uses: 'actions/checkout@v4' - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - name: Setup virtual environment - uses: ./.github/actions/base-ingest-cache - with: - python-version: ${{ matrix.python-version }} - - name: Validate --help - run: | - source .venv/bin/activate - make install-ci - make install-all-ingest - ./test_unstructured_ingest/test-help.sh - - test_unstructured_api_unit: strategy: matrix: diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index d22a5aab9..f724e8dfc 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -105,6 +105,7 @@ jobs: sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 sudo apt-get install -y tesseract-ocr sudo apt-get install -y tesseract-ocr-kor + sudo apt-get install diffstat tesseract --version ./test_unstructured_ingest/test-ingest-src.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 293ffe581..68ccf02a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,12 @@ -## 0.15.15-dev0 +## 0.16.0 ### Enhancements +* **Remove ingest implementation.** The deprecated ingest functionality has been removed, as it is now maintained in the separate [unstructured-ingest](https://github.com/Unstructured-IO/unstructured-ingest) repository. + * Replace extras in `requirements/ingest` directory with a new `ingest.txt` extra for installing the `unstructured-ingest` library. + * Remove the `unstructured.ingest` submodule. + * Delete all shell scripts previously used for destination ingest tests. + ### Features ### Fixes diff --git a/MANIFEST.in b/MANIFEST.in index e5c39fc29..e4c7d4da5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -15,45 +15,3 @@ include requirements/extra-pptx.in include requirements/extra-xlsx.in include requirements/huggingface.in -# Ingest extras -include requirements/ingest/airtable.in -include requirements/ingest/astradb.in -include requirements/ingest/azure-cognitive-search.in -include requirements/ingest/azure.in -include requirements/ingest/biomed.in -include requirements/ingest/box.in -include requirements/ingest/chroma.in -include requirements/ingest/confluence.in -include requirements/ingest/databricks-volumes.in -include requirements/ingest/delta-table.in -include requirements/ingest/discord.in -include requirements/ingest/dropbox.in -include requirements/ingest/elasticsearch.in -include requirements/ingest/embed-aws-bedrock.in -include requirements/ingest/embed-huggingface.in -include requirements/ingest/embed-mixedbreadai.in -include requirements/ingest/embed-openai.in -include requirements/ingest/gcs.in -include requirements/ingest/github.in -include requirements/ingest/gitlab.in -include requirements/ingest/google-drive.in -include requirements/ingest/hubspot.in -include requirements/ingest/jira.in -include requirements/ingest/kafka.in -include requirements/ingest/mongodb.in -include requirements/ingest/notion.in -include requirements/ingest/onedrive.in -include requirements/ingest/opensearch.in -include requirements/ingest/outlook.in -include requirements/ingest/pinecone.in -include requirements/ingest/postgres.in -include requirements/ingest/qdrant.in -include requirements/ingest/reddit.in -include requirements/ingest/s3.in -include requirements/ingest/salesforce.in -include requirements/ingest/sftp.in -include requirements/ingest/sharepoint.in -include requirements/ingest/slack.in -include 
requirements/ingest/singlestore.in -include requirements/ingest/weaviate.in -include requirements/ingest/wikipedia.in diff --git a/Makefile b/Makefile index d9a3e1803..714992a83 100644 --- a/Makefile +++ b/Makefile @@ -99,171 +99,9 @@ install-xlsx: .PHONY: install-all-docs install-all-docs: install-base install-csv install-docx install-epub install-odt install-pypandoc install-markdown install-pdf-image install-pptx install-xlsx -.PHONY: install-all-ingest -install-all-ingest: - find requirements/ingest -type f -name "*.txt" -exec ${PYTHON} -m pip install -r '{}' ';' - - -.PHONY: install-ingest-google-drive -install-ingest-google-drive: - ${PYTHON} -m pip install -r requirements/ingest/google-drive.txt - -## install-ingest-s3: install requirements for the s3 connector -.PHONY: install-ingest-s3 -install-ingest-s3: - ${PYTHON} -m pip install -r requirements/ingest/s3.txt - -.PHONY: install-ingest-gcs -install-ingest-gcs: - ${PYTHON} -m pip install -r requirements/ingest/gcs.txt - -.PHONY: install-ingest-dropbox -install-ingest-dropbox: - ${PYTHON} -m pip install -r requirements/ingest/dropbox.txt - -.PHONY: install-ingest-azure -install-ingest-azure: - ${PYTHON} -m pip install -r requirements/ingest/azure.txt - -.PHONY: install-ingest-box -install-ingest-box: - ${PYTHON} -m pip install -r requirements/ingest/box.txt - -.PHONY: install-ingest-delta-table -install-ingest-delta-table: - ${PYTHON} -m pip install -r requirements/ingest/delta-table.txt - -.PHONY: install-ingest-discord -install-ingest-discord: - ${PYTHON} -m pip install -r requirements/ingest/discord.txt - -.PHONY: install-ingest-github -install-ingest-github: - ${PYTHON} -m pip install -r requirements/ingest/github.txt - -.PHONY: install-ingest-biomed -install-ingest-biomed: - ${PYTHON} -m pip install -r requirements/ingest/biomed.txt - -.PHONY: install-ingest-gitlab -install-ingest-gitlab: - ${PYTHON} -m pip install -r requirements/ingest/gitlab.txt - -.PHONY: install-ingest-onedrive -install-ingest-onedrive: - ${PYTHON} -m pip install -r requirements/ingest/onedrive.txt - -.PHONY: install-ingest-outlook -install-ingest-outlook: - ${PYTHON} -m pip install -r requirements/ingest/outlook.txt - -.PHONY: install-ingest-reddit -install-ingest-reddit: - ${PYTHON} -m pip install -r requirements/ingest/reddit.txt - -.PHONY: install-ingest-slack -install-ingest-slack: - ${PYTHON} -m pip install -r requirements/ingest/slack.txt - -.PHONY: install-ingest-kafka -install-ingest-kafka: - ${PYTHON} -m pip install -r requirements/ingest/kafka.txt - -.PHONY: install-ingest-wikipedia -install-ingest-wikipedia: - ${PYTHON} -m pip install -r requirements/ingest/wikipedia.txt - -.PHONY: install-ingest-elasticsearch -install-ingest-elasticsearch: - ${PYTHON} -m pip install -r requirements/ingest/elasticsearch.txt - -.PHONY: install-ingest-opensearch -install-ingest-opensearch: - ${PYTHON} -m pip install -r requirements/ingest/opensearch.txt - -.PHONY: install-ingest-confluence -install-ingest-confluence: - ${PYTHON} -m pip install -r requirements/ingest/confluence.txt - -.PHONY: install-ingest-airtable -install-ingest-airtable: - ${PYTHON} -m pip install -r requirements/ingest/airtable.txt - -.PHONY: install-ingest-sharepoint -install-ingest-sharepoint: - ${PYTHON} -m pip install -r requirements/ingest/sharepoint.txt - -.PHONY: install-ingest-singlestore -install-ingest-singlestore: - ${PYTHON} -m pip install -r requirements/ingest/singlestore.txt - -.PHONY: install-ingest-weaviate -install-ingest-weaviate: - ${PYTHON} -m pip install -r 
requirements/ingest/weaviate.txt - -.PHONY: install-ingest-local -install-ingest-local: - echo "no unique dependencies for local connector" - -.PHONY: install-ingest-notion -install-ingest-notion: - ${PYTHON} -m pip install -r requirements/ingest/notion.txt - -.PHONY: install-ingest-salesforce -install-ingest-salesforce: - ${PYTHON} -m pip install -r requirements/ingest/salesforce.txt - -.PHONY: install-ingest-jira -install-ingest-jira: - ${PYTHON} -m pip install -r requirements/ingest/jira.txt - -.PHONY: install-ingest-hubspot -install-ingest-hubspot: - ${PYTHON} -m pip install -r requirements/ingest/hubspot.txt - -.PHONY: install-ingest-sftp -install-ingest-sftp: - ${PYTHON} -m pip install -r requirements/ingest/sftp.txt - -.PHONY: install-ingest-pinecone -install-ingest-pinecone: - ${PYTHON} -m pip install -r requirements/ingest/pinecone.txt - -.PHONY: install-ingest-qdrant -install-ingest-qdrant: - ${PYTHON} -m pip install -r requirements/ingest/qdrant.txt - -.PHONY: install-ingest-chroma -install-ingest-chroma: - ${PYTHON} -m pip install -r requirements/ingest/chroma.txt - -.PHONY: install-ingest-postgres -install-ingest-postgres: - ${PYTHON} -m pip install -r requirements/ingest/postgres.txt - -.PHONY: install-ingest-mongodb -install-ingest-mongodb: - ${PYTHON} -m pip install -r requirements/ingest/mongodb.txt - -.PHONY: install-ingest-databricks-volumes -install-ingest-databricks-volumes: - ${PYTHON} -m pip install -r requirements/ingest/databricks-volumes.txt - -.PHONY: install-ingest-astradb -install-ingest-astradb: - ${PYTHON} -m pip install -r requirements/ingest/astradb.txt - -.PHONY: install-ingest-clarifai -install-ingest-clarifai: - ${PYTHON} -m pip install -r requirements/ingest/clarifai.txt - -.PHONY: install-embed-huggingface -install-embed-huggingface: - ${PYTHON} -m pip install -r requirements/ingest/embed-huggingface.txt - -.PHONY: install-unstructured-inference -install-unstructured-inference: - ${PYTHON} -m pip install -r requirements/ingest/local-inference.txt +.PHONY: install-ingest +install-ingest: + python3 -m pip install -r requirements/ingest/ingest.txt ## install-local-inference: installs requirements for local inference .PHONY: install-local-inference @@ -367,7 +205,7 @@ test-extra-xlsx: ## check: runs linters (includes tests) .PHONY: check -check: check-ruff check-black check-flake8 check-version check-flake8-print +check: check-ruff check-black check-flake8 check-version .PHONY: check-shfmt check-shfmt: @@ -385,12 +223,6 @@ check-flake8: check-licenses: @scripts/check-licenses.sh -# Check for print statements in ingest since anything going to console should be using the ingest logger -# as it has a built in filter to redact sensitive information -.PHONY: check-flake8-print -check-flake8-print: - flake8 --per-file-ignores "" ./unstructured/ingest - .PHONY: check-ruff check-ruff: # -- ruff options are determined by pyproject.toml -- diff --git a/docs/requirements.in b/docs/requirements.in index 27a82d80c..46b71caac 100644 --- a/docs/requirements.in +++ b/docs/requirements.in @@ -22,4 +22,4 @@ furo==2023.7.26 certifi>=2022.12.07 # NOTE(ronny) - Added to suppress Sphinx warnings -myst-parser \ No newline at end of file +myst-parser diff --git a/docs/requirements.txt b/docs/requirements.txt index ee5fdd1d2..e20c1267e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -10,17 +10,17 @@ babel==2.13.1 # via sphinx beautifulsoup4==4.12.2 # via - # -c base.txt + # -c ./deps/base.txt # furo certifi==2023.11.17 # via - # -c base.txt + # -c ./deps/base.txt 
# -c constraints.in # -r build.in # requests charset-normalizer==3.3.2 # via - # -c base.txt + # -c ./deps/base.txt # requests docutils==0.18.1 # via @@ -32,7 +32,7 @@ furo==2023.7.26 # via -r build.in idna==3.6 # via - # -c base.txt + # -c ./deps/base.txt # requests imagesize==1.4.1 # via sphinx @@ -56,7 +56,7 @@ myst-parser==2.0.0 # via -r build.in packaging==23.2 # via - # -c base.txt + # -c ./deps/base.txt # sphinx pygments==2.17.2 # via @@ -69,13 +69,13 @@ pyyaml==6.0.1 # via myst-parser requests==2.31.0 # via - # -c base.txt + # -c ./deps/base.txt # sphinx snowballstemmer==2.2.0 # via sphinx soupsieve==2.5 # via - # -c base.txt + # -c ./deps/base.txt # beautifulsoup4 sphinx==6.2.1 # via @@ -118,7 +118,7 @@ sphinxcontrib-serializinghtml==1.1.5 # sphinx urllib3==1.26.18 # via - # -c base.txt + # -c ./deps/base.txt # -c constraints.in # requests zipp==3.17.0 diff --git a/requirements/Makefile b/requirements/Makefile index 9c4175401..9e6b685fc 100644 --- a/requirements/Makefile +++ b/requirements/Makefile @@ -3,12 +3,8 @@ SHELL := /bin/bash BASE_REQUIREMENTS := $(shell ls ./*.in) BASE_REQUIREMENTSTXT := $(patsubst %.in,%.txt,$(BASE_REQUIREMENTS)) -INGEST_REQUIREMENTS := $(shell ls ./ingest/*.in) -INGEST_REQUIREMENTSTXT := $(patsubst %.in,%.txt,$(INGEST_REQUIREMENTS)) - - .PHONY: all -all: compile-all-base compile-ingest +all: compile-all-base .PHONY: compile-test compile-test: @@ -26,18 +22,9 @@ compile-base: compile-all-base: compile-base compile-test compile-dev @$(foreach file,$(BASE_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;) -.PHONY: compile-ingest -compile-ingest: - @$(foreach file,$(INGEST_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;) - .PHONY: clean -clean: clean-base clean-ingest +clean: clean-base .PHONY: clean-base clean-base: - rm $(BASE_REQUIREMENTSTXT) - -.PHONY: clean-ingest -clean-ingest: - rm $(INGEST_REQUIREMENTSTXT) - + rm $(BASE_REQUIREMENTSTXT) \ No newline at end of file diff --git a/requirements/base.txt b/requirements/base.txt index b0f454c0f..5ff129c06 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,7 +4,7 @@ # # pip-compile ./base.in # -anyio==4.5.0 +anyio==4.6.0 # via httpx backoff==2.2.1 # via -r ./base.in @@ -36,7 +36,7 @@ dataclasses-json==0.6.7 # unstructured-client deepdiff==8.0.1 # via unstructured-client -emoji==2.13.0 +emoji==2.14.0 # via -r ./base.in exceptiongroup==1.2.2 # via anyio @@ -44,7 +44,7 @@ filetype==1.2.0 # via -r ./base.in h11==0.14.0 # via httpcore -httpcore==1.0.5 +httpcore==1.0.6 # via httpx httpx==0.27.2 # via unstructured-client @@ -88,7 +88,7 @@ psutil==6.0.0 # via -r ./base.in pycparser==2.22 # via cffi -pypdf==5.0.0 +pypdf==5.0.1 # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client @@ -98,7 +98,7 @@ python-magic==0.4.27 # via -r ./base.in python-oxmsg==0.0.1 # via -r ./base.in -rapidfuzz==3.9.7 +rapidfuzz==3.10.0 # via -r ./base.in regex==2024.9.11 # via nltk @@ -130,7 +130,6 @@ typing-extensions==4.12.2 # via # -r ./base.in # anyio - # emoji # pypdf # python-oxmsg # typing-inspect @@ -140,7 +139,9 @@ typing-inspect==0.9.0 # dataclasses-json # unstructured-client unstructured-client==0.25.9 - # via -r ./base.in + # via + # -c ././deps/constraints.txt + # -r ./base.in urllib3==1.26.20 # via # -c ././deps/constraints.txt diff --git a/requirements/cache.txt b/requirements/cache.txt deleted file mode 100644 index d229daaec..000000000 --- a/requirements/cache.txt 
+++ /dev/null @@ -1 +0,0 @@ -# a \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt index 4df21d1ab..3ce9e87d6 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -38,7 +38,7 @@ platformdirs==4.3.6 # virtualenv pre-commit==3.8.0 # via -r ./dev.in -pyproject-hooks==1.1.0 +pyproject-hooks==1.2.0 # via # build # pip-tools @@ -46,12 +46,12 @@ pyyaml==6.0.2 # via # -c ./test.txt # pre-commit -tomli==2.0.1 +tomli==2.0.2 # via # -c ./test.txt # build # pip-tools -virtualenv==20.26.5 +virtualenv==20.26.6 # via pre-commit wheel==0.44.0 # via pip-tools diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index f606a04a4..1896204fb 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -8,7 +8,7 @@ numpy==1.26.4 # via # -c ./base.txt # pandas -pandas==2.2.2 +pandas==2.2.3 # via -r ./extra-csv.in python-dateutil==2.9.0.post0 # via @@ -20,5 +20,5 @@ six==1.16.0 # via # -c ./base.txt # python-dateutil -tzdata==2024.1 +tzdata==2024.2 # via pandas diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 048822000..db0079f9f 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,7 +4,7 @@ # # pip-compile ./extra-paddleocr.in # -anyio==4.5.0 +anyio==4.6.0 # via # -c ./base.txt # httpx @@ -32,13 +32,13 @@ exceptiongroup==1.2.2 # via # -c ./base.txt # anyio -fonttools==4.53.1 +fonttools==4.54.1 # via matplotlib h11==0.14.0 # via # -c ./base.txt # httpcore -httpcore==1.0.5 +httpcore==1.0.6 # via # -c ./base.txt # httpx @@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0 # matplotlib pyyaml==6.0.2 # via unstructured-paddleocr -rapidfuzz==3.9.7 +rapidfuzz==3.10.0 # via # -c ./base.txt # unstructured-paddleocr diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 29ace44ca..a7d3ce8cf 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -42,7 +42,7 @@ filelock==3.16.1 # transformers flatbuffers==24.3.25 # via onnxruntime -fonttools==4.53.1 +fonttools==4.54.1 # via matplotlib fsspec==2024.9.0 # via @@ -60,14 +60,14 @@ googleapis-common-protos==1.65.0 # via # google-api-core # grpcio-status -grpcio==1.66.1 +grpcio==1.66.2 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.25.0 +huggingface-hub==0.25.1 # via # timm # tokenizers @@ -119,7 +119,7 @@ numpy==1.26.4 # transformers omegaconf==2.3.0 # via effdet -onnx==1.16.2 +onnx==1.17.0 # via # -r ./extra-pdf-image.in # unstructured-inference @@ -138,7 +138,7 @@ packaging==24.1 # pikepdf # transformers # unstructured-pytesseract -pandas==2.2.2 +pandas==2.2.3 # via layoutparser pdf2image==1.17.0 # via @@ -152,7 +152,7 @@ pdfplumber==0.11.4 # via layoutparser pi-heif==0.18.0 # via -r ./extra-pdf-image.in -pikepdf==9.2.1 +pikepdf==9.3.0 # via -r ./extra-pdf-image.in pillow==10.4.0 # via @@ -194,7 +194,7 @@ pycparser==2.22 # cffi pyparsing==3.1.4 # via matplotlib -pypdf==5.0.0 +pypdf==5.0.1 # via # -c ./base.txt # -r ./extra-pdf-image.in @@ -205,7 +205,7 @@ python-dateutil==2.9.0.post0 # -c ./base.txt # matplotlib # pandas -python-multipart==0.0.9 +python-multipart==0.0.12 # via unstructured-inference pytz==2024.2 # via pandas @@ -216,7 +216,7 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.9.7 +rapidfuzz==3.10.0 # via # -c ./base.txt # unstructured-inference @@ -279,7 +279,7 @@ typing-extensions==4.12.2 # iopath # pypdf # torch -tzdata==2024.1 +tzdata==2024.2 # via 
pandas unstructured-inference==0.7.36 # via -r ./extra-pdf-image.in diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index bd931000a..ff08577a7 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -14,7 +14,7 @@ numpy==1.26.4 # pandas openpyxl==3.1.5 # via -r ./extra-xlsx.in -pandas==2.2.2 +pandas==2.2.3 # via -r ./extra-xlsx.in python-dateutil==2.9.0.post0 # via @@ -26,7 +26,7 @@ six==1.16.0 # via # -c ./base.txt # python-dateutil -tzdata==2024.1 +tzdata==2024.2 # via pandas xlrd==2.0.1 # via -r ./extra-xlsx.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index daa466d3d..7b2e04bde 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -25,7 +25,7 @@ fsspec==2024.9.0 # via # huggingface-hub # torch -huggingface-hub==0.25.0 +huggingface-hub==0.25.1 # via # tokenizers # transformers diff --git a/requirements/ingest/airtable.in b/requirements/ingest/airtable.in deleted file mode 100644 index e6e85c3c6..000000000 --- a/requirements/ingest/airtable.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -pyairtable diff --git a/requirements/ingest/airtable.txt b/requirements/ingest/airtable.txt deleted file mode 100644 index e45acd598..000000000 --- a/requirements/ingest/airtable.txt +++ /dev/null @@ -1,44 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/airtable.in -# -annotated-types==0.7.0 - # via pydantic -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -inflection==0.5.1 - # via pyairtable -pyairtable==2.3.3 - # via -r ./ingest/airtable.in -pydantic==2.9.2 - # via pyairtable -pydantic-core==2.23.4 - # via pydantic -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # pyairtable -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # pyairtable - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # pyairtable - # requests diff --git a/requirements/ingest/astradb.in b/requirements/ingest/astradb.in deleted file mode 100644 index 0c99a4c93..000000000 --- a/requirements/ingest/astradb.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -astrapy diff --git a/requirements/ingest/astradb.txt b/requirements/ingest/astradb.txt deleted file mode 100644 index 46553972a..000000000 --- a/requirements/ingest/astradb.txt +++ /dev/null @@ -1,100 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/astradb.in -# -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx -astrapy==1.4.2 - # via -r ./ingest/astradb.in -cassandra-driver==3.29.2 - # via cassio -cassio==0.1.9 - # via astrapy -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -click==8.1.7 - # via - # -c ./ingest/../base.txt - # geomet -deprecation==2.1.0 - # via astrapy -dnspython==2.6.1 - # via pymongo -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -geomet==0.2.1.post1 - # via cassandra-driver -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -h2==4.1.0 - # via httpx -hpack==4.0.0 - # via h2 -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx 
-httpx[http2]==0.27.2 - # via - # -c ./ingest/../base.txt - # astrapy -hyperframe==6.0.1 - # via h2 -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # cassio -packaging==24.1 - # via - # -c ./ingest/../base.txt - # deprecation -pymongo==4.9.1 - # via astrapy -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # cassio -six==1.16.0 - # via - # -c ./ingest/../base.txt - # geomet -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -toml==0.10.2 - # via astrapy -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -uuid6==2024.7.10 - # via astrapy diff --git a/requirements/ingest/azure-cognitive-search.in b/requirements/ingest/azure-cognitive-search.in deleted file mode 100644 index 226649fb3..000000000 --- a/requirements/ingest/azure-cognitive-search.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -azure-search-documents diff --git a/requirements/ingest/azure-cognitive-search.txt b/requirements/ingest/azure-cognitive-search.txt deleted file mode 100644 index ef220fca5..000000000 --- a/requirements/ingest/azure-cognitive-search.txt +++ /dev/null @@ -1,45 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/azure-cognitive-search.in -# -azure-common==1.1.28 - # via azure-search-documents -azure-core==1.31.0 - # via azure-search-documents -azure-search-documents==11.5.1 - # via -r ./ingest/azure-cognitive-search.in -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -isodate==0.6.1 - # via azure-search-documents -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # azure-core -six==1.16.0 - # via - # -c ./ingest/../base.txt - # azure-core - # isodate -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # azure-core - # azure-search-documents -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/azure.in b/requirements/ingest/azure.in deleted file mode 100644 index e90750100..000000000 --- a/requirements/ingest/azure.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -adlfs -fsspec diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt deleted file mode 100644 index b02308977..000000000 --- a/requirements/ingest/azure.txt +++ /dev/null @@ -1,108 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/azure.in -# -adlfs==2024.7.0 - # via -r ./ingest/azure.in -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via adlfs -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -azure-core==1.31.0 - # via - # adlfs - # azure-identity - # azure-storage-blob -azure-datalake-store==0.0.53 - # via adlfs -azure-identity==1.18.0 - # via adlfs -azure-storage-blob==12.23.0 - # via adlfs -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # azure-datalake-store - # cryptography -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # 
-c ./ingest/../base.txt - # azure-identity - # azure-storage-blob - # msal - # pyjwt -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -fsspec==2024.9.0 - # via - # -r ./ingest/azure.in - # adlfs -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests - # yarl -isodate==0.6.1 - # via azure-storage-blob -msal==1.31.0 - # via - # azure-datalake-store - # azure-identity - # msal-extensions -msal-extensions==1.2.0 - # via azure-identity -multidict==6.1.0 - # via - # aiohttp - # yarl -portalocker==2.10.1 - # via msal-extensions -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pyjwt[crypto]==2.9.0 - # via msal -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # azure-core - # azure-datalake-store - # msal -six==1.16.0 - # via - # -c ./ingest/../base.txt - # azure-core - # isodate -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # azure-core - # azure-identity - # azure-storage-blob - # multidict -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/biomed.in b/requirements/ingest/biomed.in deleted file mode 100644 index 7a231f44f..000000000 --- a/requirements/ingest/biomed.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -bs4 diff --git a/requirements/ingest/biomed.txt b/requirements/ingest/biomed.txt deleted file mode 100644 index 770ec68a4..000000000 --- a/requirements/ingest/biomed.txt +++ /dev/null @@ -1,16 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/biomed.in -# -beautifulsoup4==4.12.3 - # via - # -c ./ingest/../base.txt - # bs4 -bs4==0.0.2 - # via -r ./ingest/biomed.in -soupsieve==2.6 - # via - # -c ./ingest/../base.txt - # beautifulsoup4 diff --git a/requirements/ingest/box.in b/requirements/ingest/box.in deleted file mode 100644 index 3b123f814..000000000 --- a/requirements/ingest/box.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -boxfs -fsspec diff --git a/requirements/ingest/box.txt b/requirements/ingest/box.txt deleted file mode 100644 index 297f02410..000000000 --- a/requirements/ingest/box.txt +++ /dev/null @@ -1,65 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/box.in -# -attrs==24.2.0 - # via boxsdk -boxfs==0.3.0 - # via -r ./ingest/box.in -boxsdk[jwt]==3.13.0 - # via boxfs -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # boxsdk -fsspec==2024.9.0 - # via - # -r ./ingest/box.in - # boxfs -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pyjwt==2.9.0 - # via boxsdk -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # boxsdk -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # boxsdk - # requests-toolbelt -requests-toolbelt==1.0.0 - # via - # -c ./ingest/../base.txt - # boxsdk -six==1.16.0 - # via - # -c ./ingest/../base.txt - # python-dateutil -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # boxsdk - # requests diff --git a/requirements/ingest/chroma.in b/requirements/ingest/chroma.in deleted file mode 100644 index b94a6b462..000000000 --- 
a/requirements/ingest/chroma.in +++ /dev/null @@ -1,10 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -chromadb>0.4.14 -importlib-metadata>=8.2.0 -# Future releases adds in typer-cli which breaks the resolution of typer as a library -typer<=0.9.0 -# tenacity 9.0.0 is being installed via chroma, but other dependencies (langchain) restrict tenacity -# to <9.0.0 and resolve to 8.5.0. -# The original langchain pin: https://github.com/langchain-ai/langchain/pull/849/ -tenacity==8.5.0 diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt deleted file mode 100644 index e0bd8c909..000000000 --- a/requirements/ingest/chroma.txt +++ /dev/null @@ -1,256 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/chroma.in -# -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx - # starlette - # watchfiles -backoff==2.2.1 - # via - # -c ./ingest/../base.txt - # opentelemetry-exporter-otlp-proto-grpc - # posthog -bcrypt==4.2.0 - # via chromadb -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpx - # kubernetes - # pulsar-client - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -chroma-hnswlib==0.7.3 - # via chromadb -chromadb==0.4.17 - # via -r ./ingest/chroma.in -click==8.1.7 - # via - # -c ./ingest/../base.txt - # typer - # uvicorn -coloredlogs==15.0.1 - # via onnxruntime -deprecated==1.2.14 - # via opentelemetry-api -durationpy==0.7 - # via kubernetes -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -fastapi==0.115.0 - # via chromadb -filelock==3.16.1 - # via huggingface-hub -flatbuffers==24.3.25 - # via onnxruntime -fsspec==2024.9.0 - # via huggingface-hub -google-auth==2.35.0 - # via kubernetes -googleapis-common-protos==1.65.0 - # via opentelemetry-exporter-otlp-proto-grpc -grpcio==1.66.1 - # via - # -c ./ingest/../deps/constraints.txt - # chromadb - # opentelemetry-exporter-otlp-proto-grpc -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore - # uvicorn -httptools==0.6.1 - # via uvicorn -huggingface-hub==0.25.0 - # via tokenizers -humanfriendly==10.0 - # via coloredlogs -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests -importlib-metadata==8.5.0 - # via - # -c ./ingest/../deps/constraints.txt - # -r ./ingest/chroma.in -importlib-resources==6.4.5 - # via chromadb -kubernetes==31.0.0 - # via chromadb -monotonic==1.6 - # via posthog -mpmath==1.3.0 - # via sympy -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # chroma-hnswlib - # chromadb - # onnxruntime -oauthlib==3.2.2 - # via - # kubernetes - # requests-oauthlib -onnxruntime==1.19.2 - # via chromadb -opentelemetry-api==1.16.0 - # via - # chromadb - # opentelemetry-exporter-otlp-proto-grpc - # opentelemetry-sdk -opentelemetry-exporter-otlp-proto-grpc==1.16.0 - # via chromadb -opentelemetry-proto==1.16.0 - # via opentelemetry-exporter-otlp-proto-grpc -opentelemetry-sdk==1.16.0 - # via - # chromadb - # opentelemetry-exporter-otlp-proto-grpc -opentelemetry-semantic-conventions==0.37b0 - # via opentelemetry-sdk -overrides==7.7.0 - # via chromadb -packaging==24.1 - # via - # -c ./ingest/../base.txt - # build - # huggingface-hub - # onnxruntime -posthog==3.6.6 - # via chromadb -protobuf==4.25.5 - # via - # -c ./ingest/../deps/constraints.txt - # googleapis-common-protos - # onnxruntime - # opentelemetry-proto -pulsar-client==3.5.0 - # via chromadb 
-pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pydantic==2.9.2 - # via - # chromadb - # fastapi -pydantic-core==2.23.4 - # via pydantic -pypika==0.48.9 - # via chromadb -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # kubernetes - # posthog -python-dotenv==1.0.1 - # via uvicorn -pyyaml==6.0.2 - # via - # chromadb - # huggingface-hub - # kubernetes - # uvicorn -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # chromadb - # huggingface-hub - # kubernetes - # posthog - # requests-oauthlib -requests-oauthlib==2.0.0 - # via kubernetes -rsa==4.9 - # via google-auth -six==1.16.0 - # via - # -c ./ingest/../base.txt - # kubernetes - # posthog - # python-dateutil -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -starlette==0.38.5 - # via fastapi -sympy==1.13.3 - # via onnxruntime -tenacity==8.5.0 - # via - # -r ./ingest/chroma.in - # chromadb -tokenizers==0.19.1 - # via - # -c ./ingest/../deps/constraints.txt - # chromadb -tqdm==4.66.5 - # via - # -c ./ingest/../base.txt - # chromadb - # huggingface-hub -typer==0.9.0 - # via - # -r ./ingest/chroma.in - # chromadb -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # chromadb - # fastapi - # huggingface-hub - # opentelemetry-sdk - # pydantic - # pydantic-core - # starlette - # typer - # uvicorn -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # kubernetes - # requests -uvicorn[standard]==0.30.6 - # via chromadb -uvloop==0.20.0 - # via uvicorn -watchfiles==0.24.0 - # via uvicorn -websocket-client==1.8.0 - # via kubernetes -websockets==13.0.1 - # via uvicorn -wrapt==1.16.0 - # via - # -c ./ingest/../base.txt - # deprecated - # opentelemetry-instrumentation -zipp==3.20.2 - # via - # importlib-metadata - # importlib-resources - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/requirements/ingest/clarifai.in b/requirements/ingest/clarifai.in deleted file mode 100644 index becc852ac..000000000 --- a/requirements/ingest/clarifai.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -clarifai diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt deleted file mode 100644 index 15c99bf72..000000000 --- a/requirements/ingest/clarifai.txt +++ /dev/null @@ -1,83 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/clarifai.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -clarifai==10.7.0 - # via -r ./ingest/clarifai.in -clarifai-grpc==10.7.1 - # via clarifai -contextlib2==21.6.0 - # via schema -googleapis-common-protos==1.65.0 - # via clarifai-grpc -grpcio==1.66.1 - # via - # -c ./ingest/../deps/constraints.txt - # clarifai-grpc -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -inquirerpy==0.3.4 - # via clarifai -markdown-it-py==3.0.0 - # via rich -mdurl==0.1.2 - # via markdown-it-py -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # clarifai - # tritonclient -pfzy==0.3.4 - # via inquirerpy -pillow==10.4.0 - # via clarifai -prompt-toolkit==3.0.47 - # via inquirerpy -protobuf==4.25.5 - # via - # -c ./ingest/../deps/constraints.txt - # clarifai-grpc - # googleapis-common-protos -pygments==2.18.0 - # via rich -python-rapidjson==1.20 - # via tritonclient -pyyaml==6.0.2 - # via clarifai -requests==2.32.3 - # 
via - # -c ./ingest/../base.txt - # clarifai-grpc -rich==13.8.1 - # via clarifai -schema==0.7.5 - # via clarifai -tabulate==0.9.0 - # via - # -c ./ingest/../base.txt - # clarifai -tqdm==4.66.5 - # via - # -c ./ingest/../base.txt - # clarifai -tritonclient==2.41.1 - # via clarifai -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -wcwidth==0.2.13 - # via prompt-toolkit diff --git a/requirements/ingest/confluence.in b/requirements/ingest/confluence.in deleted file mode 100644 index 37f92cb8c..000000000 --- a/requirements/ingest/confluence.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -atlassian-python-api diff --git a/requirements/ingest/confluence.txt b/requirements/ingest/confluence.txt deleted file mode 100644 index a54fa71d8..000000000 --- a/requirements/ingest/confluence.txt +++ /dev/null @@ -1,56 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/confluence.in -# -atlassian-python-api==3.41.16 - # via -r ./ingest/confluence.in -beautifulsoup4==4.12.3 - # via - # -c ./ingest/../base.txt - # atlassian-python-api -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -deprecated==1.2.14 - # via atlassian-python-api -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -jmespath==1.0.1 - # via atlassian-python-api -oauthlib==3.2.2 - # via - # atlassian-python-api - # requests-oauthlib -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # atlassian-python-api - # requests-oauthlib -requests-oauthlib==2.0.0 - # via atlassian-python-api -six==1.16.0 - # via - # -c ./ingest/../base.txt - # atlassian-python-api -soupsieve==2.6 - # via - # -c ./ingest/../base.txt - # beautifulsoup4 -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -wrapt==1.16.0 - # via - # -c ./ingest/../base.txt - # deprecated diff --git a/requirements/ingest/databricks-volumes.in b/requirements/ingest/databricks-volumes.in deleted file mode 100644 index 8bad8aec3..000000000 --- a/requirements/ingest/databricks-volumes.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -databricks-sdk diff --git a/requirements/ingest/databricks-volumes.txt b/requirements/ingest/databricks-volumes.txt deleted file mode 100644 index ac6f34cc5..000000000 --- a/requirements/ingest/databricks-volumes.txt +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/databricks-volumes.in -# -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -databricks-sdk==0.32.3 - # via -r ./ingest/databricks-volumes.in -google-auth==2.35.0 - # via databricks-sdk -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # databricks-sdk -rsa==4.9 - # via google-auth -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/delta-table.in b/requirements/ingest/delta-table.in deleted file mode 100644 index 47d4079bd..000000000 --- a/requirements/ingest/delta-table.in +++ 
/dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -deltalake<=0.19.1 -fsspec diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt deleted file mode 100644 index 68f8dfae7..000000000 --- a/requirements/ingest/delta-table.txt +++ /dev/null @@ -1,16 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/delta-table.in -# -deltalake==0.19.1 - # via -r ./ingest/delta-table.in -fsspec==2024.9.0 - # via -r ./ingest/delta-table.in -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # pyarrow -pyarrow==17.0.0 - # via deltalake diff --git a/requirements/ingest/discord.in b/requirements/ingest/discord.in deleted file mode 100644 index 83bbeed43..000000000 --- a/requirements/ingest/discord.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -discord-py diff --git a/requirements/ingest/discord.txt b/requirements/ingest/discord.txt deleted file mode 100644 index 6368195ea..000000000 --- a/requirements/ingest/discord.txt +++ /dev/null @@ -1,36 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/discord.in -# -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via discord-py -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -discord-py==2.4.0 - # via -r ./ingest/discord.in -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -idna==3.10 - # via - # -c ./ingest/../base.txt - # yarl -multidict==6.1.0 - # via - # aiohttp - # yarl -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # multidict -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/dropbox.in b/requirements/ingest/dropbox.in deleted file mode 100644 index b9b0fe1d3..000000000 --- a/requirements/ingest/dropbox.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -dropboxdrivefs -fsspec diff --git a/requirements/ingest/dropbox.txt b/requirements/ingest/dropbox.txt deleted file mode 100644 index bbba1ba1b..000000000 --- a/requirements/ingest/dropbox.txt +++ /dev/null @@ -1,45 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/dropbox.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -dropbox==12.0.2 - # via dropboxdrivefs -dropboxdrivefs==1.4.1 - # via -r ./ingest/dropbox.in -fsspec==2024.9.0 - # via - # -r ./ingest/dropbox.in - # dropboxdrivefs -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -ply==3.11 - # via stone -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # dropbox - # dropboxdrivefs -six==1.16.0 - # via - # -c ./ingest/../base.txt - # dropbox - # stone -stone==3.3.1 - # via dropbox -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/elasticsearch.in b/requirements/ingest/elasticsearch.in deleted file mode 100644 index 5b6d0db36..000000000 --- a/requirements/ingest/elasticsearch.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -elasticsearch[async] diff --git a/requirements/ingest/elasticsearch.txt b/requirements/ingest/elasticsearch.txt deleted file mode 100644 index b23d77117..000000000 --- a/requirements/ingest/elasticsearch.txt +++ /dev/null @@ -1,47 +0,0 @@ -# -# This file is autogenerated by 
pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/elasticsearch.in -# -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via elasticsearch -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # elastic-transport -elastic-transport==8.15.0 - # via elasticsearch -elasticsearch[async]==8.15.1 - # via -r ./ingest/elasticsearch.in -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -idna==3.10 - # via - # -c ./ingest/../base.txt - # yarl -multidict==6.1.0 - # via - # aiohttp - # yarl -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # multidict -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # elastic-transport -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/embed-aws-bedrock.in b/requirements/ingest/embed-aws-bedrock.in deleted file mode 100644 index dd73b768d..000000000 --- a/requirements/ingest/embed-aws-bedrock.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -boto3 -langchain-community diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt deleted file mode 100644 index 69d0e147e..000000000 --- a/requirements/ingest/embed-aws-bedrock.txt +++ /dev/null @@ -1,191 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/embed-aws-bedrock.in -# -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via - # langchain - # langchain-community -aiosignal==1.3.1 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx -async-timeout==4.0.3 - # via - # aiohttp - # langchain -attrs==24.2.0 - # via aiohttp -boto3==1.34.131 - # via -r ./ingest/embed-aws-bedrock.in -botocore==1.34.131 - # via - # -c ./ingest/../deps/constraints.txt - # boto3 - # s3transfer -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -dataclasses-json==0.6.7 - # via - # -c ./ingest/../base.txt - # langchain-community -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # langsmith -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests - # yarl -jmespath==1.0.1 - # via - # boto3 - # botocore -jsonpatch==1.33 - # via langchain-core -jsonpointer==3.0.0 - # via jsonpatch -langchain==0.3.0 - # via langchain-community -langchain-community==0.3.0 - # via -r ./ingest/embed-aws-bedrock.in -langchain-core==0.3.2 - # via - # langchain - # langchain-community - # langchain-text-splitters -langchain-text-splitters==0.3.0 - # via langchain -langsmith==0.1.125 - # via - # langchain - # langchain-community - # langchain-core -marshmallow==3.22.0 - # via - # -c ./ingest/../base.txt - # dataclasses-json -multidict==6.1.0 - # via - # aiohttp - # yarl -mypy-extensions==1.0.0 - # via - # -c ./ingest/../base.txt - # typing-inspect -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # langchain - # langchain-community -orjson==3.10.7 - # via langsmith -packaging==24.1 - # via - # -c ./ingest/../base.txt - # langchain-core - # 
marshmallow -pydantic==2.9.2 - # via - # langchain - # langchain-core - # langsmith - # pydantic-settings -pydantic-core==2.23.4 - # via pydantic -pydantic-settings==2.5.2 - # via langchain-community -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # botocore -python-dotenv==1.0.1 - # via pydantic-settings -pyyaml==6.0.2 - # via - # langchain - # langchain-community - # langchain-core -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # langchain - # langchain-community - # langsmith -s3transfer==0.10.2 - # via boto3 -six==1.16.0 - # via - # -c ./ingest/../base.txt - # python-dateutil -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -sqlalchemy==2.0.35 - # via - # langchain - # langchain-community -tenacity==8.5.0 - # via - # langchain - # langchain-community - # langchain-core -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # langchain-core - # multidict - # pydantic - # pydantic-core - # sqlalchemy - # typing-inspect -typing-inspect==0.9.0 - # via - # -c ./ingest/../base.txt - # dataclasses-json -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # botocore - # requests -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/embed-huggingface.in b/requirements/ingest/embed-huggingface.in deleted file mode 100644 index 88b7218f8..000000000 --- a/requirements/ingest/embed-huggingface.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt - -langchain-huggingface diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt deleted file mode 100644 index 24756b413..000000000 --- a/requirements/ingest/embed-huggingface.txt +++ /dev/null @@ -1,170 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/embed-huggingface.in -# -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -filelock==3.16.1 - # via - # huggingface-hub - # torch - # transformers -fsspec==2024.9.0 - # via - # huggingface-hub - # torch -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # langsmith -huggingface-hub==0.25.0 - # via - # langchain-huggingface - # sentence-transformers - # tokenizers - # transformers -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests -jinja2==3.1.4 - # via torch -joblib==1.4.2 - # via - # -c ./ingest/../base.txt - # scikit-learn -jsonpatch==1.33 - # via langchain-core -jsonpointer==3.0.0 - # via jsonpatch -langchain-core==0.3.2 - # via langchain-huggingface -langchain-huggingface==0.1.0 - # via -r ./ingest/embed-huggingface.in -langsmith==0.1.125 - # via langchain-core -markupsafe==2.1.5 - # via jinja2 -mpmath==1.3.0 - # via sympy -networkx==3.2.1 - # via torch -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # scikit-learn - # scipy - # transformers -orjson==3.10.7 - # via langsmith -packaging==24.1 - # via - # -c ./ingest/../base.txt - # huggingface-hub - # langchain-core - # transformers -pillow==10.4.0 - # via sentence-transformers -pydantic==2.9.2 - # via - # langchain-core - # langsmith 
-pydantic-core==2.23.4 - # via pydantic -pyyaml==6.0.2 - # via - # huggingface-hub - # langchain-core - # transformers -regex==2024.9.11 - # via - # -c ./ingest/../base.txt - # transformers -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # huggingface-hub - # langsmith - # transformers -safetensors==0.4.5 - # via transformers -scikit-learn==1.5.2 - # via sentence-transformers -scipy==1.13.1 - # via - # scikit-learn - # sentence-transformers -sentence-transformers==3.1.1 - # via langchain-huggingface -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -sympy==1.13.3 - # via torch -tenacity==8.5.0 - # via langchain-core -threadpoolctl==3.5.0 - # via scikit-learn -tokenizers==0.19.1 - # via - # -c ./ingest/../deps/constraints.txt - # langchain-huggingface - # transformers -torch==2.4.1 - # via sentence-transformers -tqdm==4.66.5 - # via - # -c ./ingest/../base.txt - # huggingface-hub - # sentence-transformers - # transformers -transformers==4.44.2 - # via - # langchain-huggingface - # sentence-transformers -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # huggingface-hub - # langchain-core - # pydantic - # pydantic-core - # torch -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/embed-mixedbreadai.in b/requirements/ingest/embed-mixedbreadai.in deleted file mode 100644 index 929e3f0ae..000000000 --- a/requirements/ingest/embed-mixedbreadai.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -mixedbread-ai \ No newline at end of file diff --git a/requirements/ingest/embed-mixedbreadai.txt b/requirements/ingest/embed-mixedbreadai.txt deleted file mode 100644 index da63dcbe9..000000000 --- a/requirements/ingest/embed-mixedbreadai.txt +++ /dev/null @@ -1,56 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/embed-mixedbreadai.in -# -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # mixedbread-ai -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -mixedbread-ai==2.2.6 - # via -r ./ingest/embed-mixedbreadai.in -pydantic==2.9.2 - # via mixedbread-ai -pydantic-core==2.23.4 - # via pydantic -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # mixedbread-ai - # pydantic - # pydantic-core diff --git a/requirements/ingest/embed-octoai.in b/requirements/ingest/embed-octoai.in deleted file mode 100644 index ede6c81e8..000000000 --- a/requirements/ingest/embed-octoai.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -openai -tiktoken diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt deleted file mode 100644 index 87d04cc36..000000000 --- a/requirements/ingest/embed-octoai.txt +++ /dev/null @@ -1,87 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/embed-octoai.in -# -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c 
./ingest/../base.txt - # httpx - # openai -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -distro==1.9.0 - # via openai -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # openai -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests -jiter==0.5.0 - # via openai -openai==1.46.1 - # via -r ./ingest/embed-octoai.in -pydantic==2.9.2 - # via openai -pydantic-core==2.23.4 - # via pydantic -regex==2024.9.11 - # via - # -c ./ingest/../base.txt - # tiktoken -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # tiktoken -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # openai -tiktoken==0.7.0 - # via -r ./ingest/embed-octoai.in -tqdm==4.66.5 - # via - # -c ./ingest/../base.txt - # openai -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # openai - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/embed-openai.in b/requirements/ingest/embed-openai.in deleted file mode 100644 index fb130e9cb..000000000 --- a/requirements/ingest/embed-openai.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt - -langchain-openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt deleted file mode 100644 index 7490efc76..000000000 --- a/requirements/ingest/embed-openai.txt +++ /dev/null @@ -1,113 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/embed-openai.in -# -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx - # openai -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -distro==1.9.0 - # via openai -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # langsmith - # openai -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests -jiter==0.5.0 - # via openai -jsonpatch==1.33 - # via langchain-core -jsonpointer==3.0.0 - # via jsonpatch -langchain-core==0.3.2 - # via langchain-openai -langchain-openai==0.2.0 - # via -r ./ingest/embed-openai.in -langsmith==0.1.125 - # via langchain-core -openai==1.46.1 - # via langchain-openai -orjson==3.10.7 - # via langsmith -packaging==24.1 - # via - # -c ./ingest/../base.txt - # langchain-core -pydantic==2.9.2 - # via - # langchain-core - # langsmith - # openai -pydantic-core==2.23.4 - # via pydantic -pyyaml==6.0.2 - # via langchain-core -regex==2024.9.11 - # via - # -c ./ingest/../base.txt - # tiktoken -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # langsmith - # tiktoken -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # openai -tenacity==8.5.0 - # via langchain-core -tiktoken==0.7.0 - # via langchain-openai -tqdm==4.66.5 - # via - # -c ./ingest/../base.txt - # openai -typing-extensions==4.12.2 - # via - # -c 
./ingest/../base.txt - # anyio - # langchain-core - # openai - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/embed-vertexai.in b/requirements/ingest/embed-vertexai.in deleted file mode 100644 index ba68465a8..000000000 --- a/requirements/ingest/embed-vertexai.in +++ /dev/null @@ -1,5 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -langchain -langchain-community -langchain-google-vertexai diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt deleted file mode 100644 index a912d25cb..000000000 --- a/requirements/ingest/embed-vertexai.txt +++ /dev/null @@ -1,275 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/embed-vertexai.in -# -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via - # langchain - # langchain-community -aiosignal==1.3.1 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx -async-timeout==4.0.3 - # via - # aiohttp - # langchain -attrs==24.2.0 - # via aiohttp -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -dataclasses-json==0.6.7 - # via - # -c ./ingest/../base.txt - # langchain-community -docstring-parser==0.16 - # via google-cloud-aiplatform -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -google-api-core[grpc]==2.20.0 - # via - # google-cloud-aiplatform - # google-cloud-bigquery - # google-cloud-core - # google-cloud-resource-manager - # google-cloud-storage -google-auth==2.35.0 - # via - # google-api-core - # google-cloud-aiplatform - # google-cloud-bigquery - # google-cloud-core - # google-cloud-resource-manager - # google-cloud-storage -google-cloud-aiplatform==1.67.1 - # via langchain-google-vertexai -google-cloud-bigquery==3.25.0 - # via google-cloud-aiplatform -google-cloud-core==2.4.1 - # via - # google-cloud-bigquery - # google-cloud-storage -google-cloud-resource-manager==1.12.5 - # via google-cloud-aiplatform -google-cloud-storage==2.18.2 - # via - # google-cloud-aiplatform - # langchain-google-vertexai -google-crc32c==1.6.0 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.2 - # via - # google-cloud-bigquery - # google-cloud-storage -googleapis-common-protos[grpc]==1.65.0 - # via - # google-api-core - # grpc-google-iam-v1 - # grpcio-status -grpc-google-iam-v1==0.13.1 - # via google-cloud-resource-manager -grpcio==1.66.1 - # via - # -c ./ingest/../deps/constraints.txt - # google-api-core - # googleapis-common-protos - # grpc-google-iam-v1 - # grpcio-status -grpcio-status==1.62.3 - # via google-api-core -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # langchain-google-vertexai - # langsmith -httpx-sse==0.4.0 - # via langchain-google-vertexai -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests - # yarl -jsonpatch==1.33 - # via langchain-core -jsonpointer==3.0.0 - # via jsonpatch -langchain==0.3.0 - # via - # -r ./ingest/embed-vertexai.in - # langchain-community -langchain-community==0.3.0 - # via -r 
./ingest/embed-vertexai.in -langchain-core==0.3.2 - # via - # langchain - # langchain-community - # langchain-google-vertexai - # langchain-text-splitters -langchain-google-vertexai==2.0.1 - # via -r ./ingest/embed-vertexai.in -langchain-text-splitters==0.3.0 - # via langchain -langsmith==0.1.125 - # via - # langchain - # langchain-community - # langchain-core -marshmallow==3.22.0 - # via - # -c ./ingest/../base.txt - # dataclasses-json -multidict==6.1.0 - # via - # aiohttp - # yarl -mypy-extensions==1.0.0 - # via - # -c ./ingest/../base.txt - # typing-inspect -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # langchain - # langchain-community - # shapely -orjson==3.10.7 - # via langsmith -packaging==24.1 - # via - # -c ./ingest/../base.txt - # google-cloud-aiplatform - # google-cloud-bigquery - # langchain-core - # marshmallow -proto-plus==1.24.0 - # via - # google-api-core - # google-cloud-aiplatform - # google-cloud-resource-manager -protobuf==4.25.5 - # via - # -c ./ingest/../deps/constraints.txt - # google-api-core - # google-cloud-aiplatform - # google-cloud-resource-manager - # googleapis-common-protos - # grpc-google-iam-v1 - # grpcio-status - # proto-plus -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pydantic==2.9.2 - # via - # google-cloud-aiplatform - # langchain - # langchain-core - # langchain-google-vertexai - # langsmith - # pydantic-settings -pydantic-core==2.23.4 - # via pydantic -pydantic-settings==2.5.2 - # via langchain-community -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # google-cloud-bigquery -python-dotenv==1.0.1 - # via pydantic-settings -pyyaml==6.0.2 - # via - # langchain - # langchain-community - # langchain-core -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # google-api-core - # google-cloud-bigquery - # google-cloud-storage - # langchain - # langchain-community - # langsmith -rsa==4.9 - # via google-auth -shapely==2.0.6 - # via google-cloud-aiplatform -six==1.16.0 - # via - # -c ./ingest/../base.txt - # python-dateutil -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -sqlalchemy==2.0.35 - # via - # langchain - # langchain-community -tenacity==8.5.0 - # via - # langchain - # langchain-community - # langchain-core -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # langchain-core - # multidict - # pydantic - # pydantic-core - # sqlalchemy - # typing-inspect -typing-inspect==0.9.0 - # via - # -c ./ingest/../base.txt - # dataclasses-json -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/embed-voyageai.in b/requirements/ingest/embed-voyageai.in deleted file mode 100644 index efe01c7b0..000000000 --- a/requirements/ingest/embed-voyageai.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -langchain -langchain-voyageai diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt deleted file mode 100644 index a1d3572e8..000000000 --- a/requirements/ingest/embed-voyageai.txt +++ /dev/null @@ -1,147 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/embed-voyageai.in -# -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via - # langchain - # voyageai -aiolimiter==1.1.0 - # via voyageai -aiosignal==1.3.1 - # via aiohttp -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # 
via - # -c ./ingest/../base.txt - # httpx -async-timeout==4.0.3 - # via - # aiohttp - # langchain -attrs==24.2.0 - # via aiohttp -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # langsmith -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx - # requests - # yarl -jsonpatch==1.33 - # via langchain-core -jsonpointer==3.0.0 - # via jsonpatch -langchain==0.3.0 - # via -r ./ingest/embed-voyageai.in -langchain-core==0.3.2 - # via - # langchain - # langchain-text-splitters - # langchain-voyageai -langchain-text-splitters==0.3.0 - # via langchain -langchain-voyageai==0.1.2 - # via -r ./ingest/embed-voyageai.in -langsmith==0.1.125 - # via - # langchain - # langchain-core -multidict==6.1.0 - # via - # aiohttp - # yarl -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # langchain - # voyageai -orjson==3.10.7 - # via langsmith -packaging==24.1 - # via - # -c ./ingest/../base.txt - # langchain-core -pydantic==2.9.2 - # via - # langchain - # langchain-core - # langchain-voyageai - # langsmith -pydantic-core==2.23.4 - # via pydantic -pyyaml==6.0.2 - # via - # langchain - # langchain-core -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # langchain - # langsmith - # voyageai -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -sqlalchemy==2.0.35 - # via langchain -tenacity==8.5.0 - # via - # langchain - # langchain-core - # voyageai -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # langchain-core - # multidict - # pydantic - # pydantic-core - # sqlalchemy -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -voyageai==0.2.3 - # via langchain-voyageai -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/gcs.in b/requirements/ingest/gcs.in deleted file mode 100644 index 842468b00..000000000 --- a/requirements/ingest/gcs.in +++ /dev/null @@ -1,5 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -gcsfs -fsspec -bs4 diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt deleted file mode 100644 index c2954c0b1..000000000 --- a/requirements/ingest/gcs.txt +++ /dev/null @@ -1,120 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/gcs.in -# -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via gcsfs -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -beautifulsoup4==4.12.3 - # via - # -c ./ingest/../base.txt - # bs4 -bs4==0.0.2 - # via -r ./ingest/gcs.in -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -decorator==5.1.1 - # via gcsfs -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -fsspec==2024.9.0 - # via - # -r ./ingest/gcs.in - # gcsfs -gcsfs==2024.9.0.post1 - # via -r ./ingest/gcs.in -google-api-core==2.20.0 - # via - # google-cloud-core - # google-cloud-storage -google-auth==2.35.0 - # via - # gcsfs - # google-api-core - # google-auth-oauthlib - # google-cloud-core - # 
google-cloud-storage -google-auth-oauthlib==1.2.1 - # via gcsfs -google-cloud-core==2.4.1 - # via google-cloud-storage -google-cloud-storage==2.18.2 - # via gcsfs -google-crc32c==1.6.0 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.2 - # via google-cloud-storage -googleapis-common-protos==1.65.0 - # via google-api-core -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests - # yarl -multidict==6.1.0 - # via - # aiohttp - # yarl -oauthlib==3.2.2 - # via requests-oauthlib -proto-plus==1.24.0 - # via google-api-core -protobuf==4.25.5 - # via - # -c ./ingest/../deps/constraints.txt - # google-api-core - # googleapis-common-protos - # proto-plus -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # gcsfs - # google-api-core - # google-cloud-storage - # requests-oauthlib -requests-oauthlib==2.0.0 - # via google-auth-oauthlib -rsa==4.9 - # via google-auth -soupsieve==2.6 - # via - # -c ./ingest/../base.txt - # beautifulsoup4 -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # multidict -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/github.in b/requirements/ingest/github.in deleted file mode 100644 index 599585d7a..000000000 --- a/requirements/ingest/github.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -# NOTE - pygithub==1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436 -pygithub>1.58.0 diff --git a/requirements/ingest/github.txt b/requirements/ingest/github.txt deleted file mode 100644 index 0d8fa1ac5..000000000 --- a/requirements/ingest/github.txt +++ /dev/null @@ -1,57 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/github.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography - # pynacl -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # pyjwt -deprecated==1.2.14 - # via pygithub -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pygithub==2.4.0 - # via -r ./ingest/github.in -pyjwt[crypto]==2.9.0 - # via pygithub -pynacl==1.5.0 - # via pygithub -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # pygithub -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # pygithub -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # pygithub - # requests -wrapt==1.16.0 - # via - # -c ./ingest/../base.txt - # deprecated diff --git a/requirements/ingest/gitlab.in b/requirements/ingest/gitlab.in deleted file mode 100644 index 86be2c44b..000000000 --- a/requirements/ingest/gitlab.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -python-gitlab diff --git a/requirements/ingest/gitlab.txt b/requirements/ingest/gitlab.txt deleted file mode 100644 index 77d5743ba..000000000 --- a/requirements/ingest/gitlab.txt +++ /dev/null @@ -1,34 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/gitlab.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c 
./ingest/../base.txt - # requests -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -python-gitlab==4.11.1 - # via -r ./ingest/gitlab.in -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # python-gitlab - # requests-toolbelt -requests-toolbelt==1.0.0 - # via - # -c ./ingest/../base.txt - # python-gitlab -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/google-drive.in b/requirements/ingest/google-drive.in deleted file mode 100644 index e95e27f71..000000000 --- a/requirements/ingest/google-drive.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -google-api-python-client diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt deleted file mode 100644 index 54726088b..000000000 --- a/requirements/ingest/google-drive.txt +++ /dev/null @@ -1,66 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/google-drive.in -# -cachetools==5.5.0 - # via google-auth -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -google-api-core==2.20.0 - # via google-api-python-client -google-api-python-client==2.146.0 - # via -r ./ingest/google-drive.in -google-auth==2.35.0 - # via - # google-api-core - # google-api-python-client - # google-auth-httplib2 -google-auth-httplib2==0.2.0 - # via google-api-python-client -googleapis-common-protos==1.65.0 - # via google-api-core -httplib2==0.22.0 - # via - # google-api-python-client - # google-auth-httplib2 -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -proto-plus==1.24.0 - # via google-api-core -protobuf==4.25.5 - # via - # -c ./ingest/../deps/constraints.txt - # google-api-core - # googleapis-common-protos - # proto-plus -pyasn1==0.6.1 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 - # via google-auth -pyparsing==3.1.4 - # via httplib2 -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # google-api-core -rsa==4.9 - # via google-auth -uritemplate==4.1.1 - # via google-api-python-client -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/hubspot.in b/requirements/ingest/hubspot.in deleted file mode 100644 index 033413401..000000000 --- a/requirements/ingest/hubspot.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -hubspot-api-client -urllib3 diff --git a/requirements/ingest/hubspot.txt b/requirements/ingest/hubspot.txt deleted file mode 100644 index da9b63b37..000000000 --- a/requirements/ingest/hubspot.txt +++ /dev/null @@ -1,27 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/hubspot.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # hubspot-api-client -hubspot-api-client==9.0.0 - # via -r ./ingest/hubspot.in -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # hubspot-api-client -six==1.16.0 - # via - # -c ./ingest/../base.txt - # hubspot-api-client - # python-dateutil -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # -r ./ingest/hubspot.in - # hubspot-api-client diff --git a/requirements/ingest/ingest.txt b/requirements/ingest/ingest.txt new file mode 100644 index 000000000..957f788f1 --- /dev/null +++ b/requirements/ingest/ingest.txt @@ 
-0,0 +1,5 @@ +unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia] +s3fs>=2024.9.0 +urllib3>=1.26.20 +backoff>=2.2.1 +httpx>=0.27.2 diff --git a/requirements/ingest/jira.in b/requirements/ingest/jira.in deleted file mode 100644 index 37f92cb8c..000000000 --- a/requirements/ingest/jira.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -atlassian-python-api diff --git a/requirements/ingest/jira.txt b/requirements/ingest/jira.txt deleted file mode 100644 index 7b2f98861..000000000 --- a/requirements/ingest/jira.txt +++ /dev/null @@ -1,56 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/jira.in -# -atlassian-python-api==3.41.16 - # via -r ./ingest/jira.in -beautifulsoup4==4.12.3 - # via - # -c ./ingest/../base.txt - # atlassian-python-api -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -deprecated==1.2.14 - # via atlassian-python-api -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -jmespath==1.0.1 - # via atlassian-python-api -oauthlib==3.2.2 - # via - # atlassian-python-api - # requests-oauthlib -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # atlassian-python-api - # requests-oauthlib -requests-oauthlib==2.0.0 - # via atlassian-python-api -six==1.16.0 - # via - # -c ./ingest/../base.txt - # atlassian-python-api -soupsieve==2.6 - # via - # -c ./ingest/../base.txt - # beautifulsoup4 -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -wrapt==1.16.0 - # via - # -c ./ingest/../base.txt - # deprecated diff --git a/requirements/ingest/kafka.in b/requirements/ingest/kafka.in deleted file mode 100644 index 25b9ad2f6..000000000 --- a/requirements/ingest/kafka.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -confluent-kafka \ No newline at end of file diff --git a/requirements/ingest/kafka.txt b/requirements/ingest/kafka.txt deleted file mode 100644 index 4dbb8306d..000000000 --- a/requirements/ingest/kafka.txt +++ /dev/null @@ -1,8 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/kafka.in -# -confluent-kafka==2.5.3 - # via -r ./ingest/kafka.in diff --git a/requirements/ingest/mongodb.in b/requirements/ingest/mongodb.in deleted file mode 100644 index 48f292290..000000000 --- a/requirements/ingest/mongodb.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -pymongo diff --git a/requirements/ingest/mongodb.txt b/requirements/ingest/mongodb.txt deleted file mode 100644 index 778a13fc4..000000000 --- a/requirements/ingest/mongodb.txt +++ /dev/null @@ -1,10 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/mongodb.in -# -dnspython==2.6.1 - # via pymongo -pymongo==4.9.1 - # via -r ./ingest/mongodb.in diff --git a/requirements/ingest/notion.in 
b/requirements/ingest/notion.in deleted file mode 100644 index 47823a112..000000000 --- a/requirements/ingest/notion.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -notion-client -htmlBuilder diff --git a/requirements/ingest/notion.txt b/requirements/ingest/notion.txt deleted file mode 100644 index ea8a45578..000000000 --- a/requirements/ingest/notion.txt +++ /dev/null @@ -1,49 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/notion.in -# -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -htmlbuilder==1.0.0 - # via -r ./ingest/notion.in -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx==0.27.2 - # via - # -c ./ingest/../base.txt - # notion-client -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -notion-client==2.2.1 - # via -r ./ingest/notion.in -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio diff --git a/requirements/ingest/onedrive.in b/requirements/ingest/onedrive.in deleted file mode 100644 index c53222881..000000000 --- a/requirements/ingest/onedrive.in +++ /dev/null @@ -1,5 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -msal -Office365-REST-Python-Client -bs4 diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt deleted file mode 100644 index 88330e86e..000000000 --- a/requirements/ingest/onedrive.txt +++ /dev/null @@ -1,65 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/onedrive.in -# -beautifulsoup4==4.12.3 - # via - # -c ./ingest/../base.txt - # bs4 -bs4==0.0.2 - # via -r ./ingest/onedrive.in -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # msal - # pyjwt -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -msal==1.31.0 - # via - # -r ./ingest/onedrive.in - # office365-rest-python-client -office365-rest-python-client==2.5.13 - # via -r ./ingest/onedrive.in -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pyjwt[crypto]==2.9.0 - # via msal -pytz==2024.2 - # via office365-rest-python-client -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # msal - # office365-rest-python-client -soupsieve==2.6 - # via - # -c ./ingest/../base.txt - # beautifulsoup4 -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # office365-rest-python-client -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/opensearch.in b/requirements/ingest/opensearch.in deleted file mode 100644 index ac336e8d1..000000000 --- a/requirements/ingest/opensearch.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -opensearch-py diff --git a/requirements/ingest/opensearch.txt b/requirements/ingest/opensearch.txt deleted file mode 100644 index 03a011830..000000000 --- a/requirements/ingest/opensearch.txt +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file is autogenerated by pip-compile with 
Python 3.9 -# by the following command: -# -# pip-compile ./ingest/opensearch.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # opensearch-py - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -events==0.5 - # via opensearch-py -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -opensearch-py==2.7.1 - # via -r ./ingest/opensearch.in -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # opensearch-py -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # opensearch-py -six==1.16.0 - # via - # -c ./ingest/../base.txt - # python-dateutil -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # opensearch-py - # requests diff --git a/requirements/ingest/outlook.in b/requirements/ingest/outlook.in deleted file mode 100644 index 3b65d3029..000000000 --- a/requirements/ingest/outlook.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -msal -Office365-REST-Python-Client diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt deleted file mode 100644 index f73262791..000000000 --- a/requirements/ingest/outlook.txt +++ /dev/null @@ -1,55 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/outlook.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # msal - # pyjwt -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -msal==1.31.0 - # via - # -r ./ingest/outlook.in - # office365-rest-python-client -office365-rest-python-client==2.5.13 - # via -r ./ingest/outlook.in -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pyjwt[crypto]==2.9.0 - # via msal -pytz==2024.2 - # via office365-rest-python-client -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # msal - # office365-rest-python-client -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # office365-rest-python-client -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/pinecone.in b/requirements/ingest/pinecone.in deleted file mode 100644 index 985accf43..000000000 --- a/requirements/ingest/pinecone.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -pinecone-client>=3.7.1 diff --git a/requirements/ingest/pinecone.txt b/requirements/ingest/pinecone.txt deleted file mode 100644 index 68a6197ff..000000000 --- a/requirements/ingest/pinecone.txt +++ /dev/null @@ -1,31 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/pinecone.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # pinecone-client -pinecone-client==5.0.1 - # via -r ./ingest/pinecone.in -pinecone-plugin-inference==1.1.0 - # via pinecone-client -pinecone-plugin-interface==0.0.7 - # via - # pinecone-client - # pinecone-plugin-inference -tqdm==4.66.5 - # via - # -c ./ingest/../base.txt - # pinecone-client -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # pinecone-client -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # pinecone-client diff --git a/requirements/ingest/postgres.in b/requirements/ingest/postgres.in deleted file 
mode 100644 index f57ac1a36..000000000 --- a/requirements/ingest/postgres.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -psycopg2-binary diff --git a/requirements/ingest/postgres.txt b/requirements/ingest/postgres.txt deleted file mode 100644 index 813ca1616..000000000 --- a/requirements/ingest/postgres.txt +++ /dev/null @@ -1,8 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/postgres.in -# -psycopg2-binary==2.9.9 - # via -r ./ingest/postgres.in diff --git a/requirements/ingest/qdrant.in b/requirements/ingest/qdrant.in deleted file mode 100644 index 051f54715..000000000 --- a/requirements/ingest/qdrant.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -qdrant-client diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt deleted file mode 100644 index 0ea8c17ae..000000000 --- a/requirements/ingest/qdrant.txt +++ /dev/null @@ -1,86 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/qdrant.in -# -annotated-types==0.7.0 - # via pydantic -anyio==4.5.0 - # via - # -c ./ingest/../base.txt - # httpx -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # httpcore - # httpx -exceptiongroup==1.2.2 - # via - # -c ./ingest/../base.txt - # anyio -grpcio==1.66.1 - # via - # -c ./ingest/../deps/constraints.txt - # grpcio-tools - # qdrant-client -grpcio-tools==1.62.3 - # via qdrant-client -h11==0.14.0 - # via - # -c ./ingest/../base.txt - # httpcore -h2==4.1.0 - # via httpx -hpack==4.0.0 - # via h2 -httpcore==1.0.5 - # via - # -c ./ingest/../base.txt - # httpx -httpx[http2]==0.27.2 - # via - # -c ./ingest/../base.txt - # qdrant-client -hyperframe==6.0.1 - # via h2 -idna==3.10 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -numpy==1.26.4 - # via - # -c ./ingest/../base.txt - # qdrant-client -portalocker==2.10.1 - # via qdrant-client -protobuf==4.25.5 - # via - # -c ./ingest/../deps/constraints.txt - # grpcio-tools -pydantic==2.9.2 - # via qdrant-client -pydantic-core==2.23.4 - # via pydantic -qdrant-client==1.11.2 - # via -r ./ingest/qdrant.in -sniffio==1.3.1 - # via - # -c ./ingest/../base.txt - # anyio - # httpx -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # anyio - # pydantic - # pydantic-core -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # qdrant-client - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/requirements/ingest/reddit.in b/requirements/ingest/reddit.in deleted file mode 100644 index 5fa199c8c..000000000 --- a/requirements/ingest/reddit.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -praw diff --git a/requirements/ingest/reddit.txt b/requirements/ingest/reddit.txt deleted file mode 100644 index 2d5bd0302..000000000 --- a/requirements/ingest/reddit.txt +++ /dev/null @@ -1,36 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/reddit.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -praw==7.7.1 - # via -r ./ingest/reddit.in -prawcore==2.4.0 - # via praw -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # prawcore - # update-checker -update-checker==0.18.0 
- # via praw -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -websocket-client==1.8.0 - # via praw diff --git a/requirements/ingest/s3.in b/requirements/ingest/s3.in deleted file mode 100644 index 43c7b2ecf..000000000 --- a/requirements/ingest/s3.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -s3fs -fsspec diff --git a/requirements/ingest/s3.txt b/requirements/ingest/s3.txt deleted file mode 100644 index 98bb2313b..000000000 --- a/requirements/ingest/s3.txt +++ /dev/null @@ -1,70 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/s3.in -# -aiobotocore==2.13.3 - # via s3fs -aiohappyeyeballs==2.4.0 - # via aiohttp -aiohttp==3.10.5 - # via - # aiobotocore - # s3fs -aioitertools==0.12.0 - # via aiobotocore -aiosignal==1.3.1 - # via aiohttp -async-timeout==4.0.3 - # via aiohttp -attrs==24.2.0 - # via aiohttp -botocore==1.34.131 - # via - # -c ./ingest/../deps/constraints.txt - # aiobotocore -frozenlist==1.4.1 - # via - # aiohttp - # aiosignal -fsspec==2024.9.0 - # via - # -r ./ingest/s3.in - # s3fs -idna==3.10 - # via - # -c ./ingest/../base.txt - # yarl -jmespath==1.0.1 - # via botocore -multidict==6.1.0 - # via - # aiohttp - # yarl -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # botocore -s3fs==2024.9.0 - # via -r ./ingest/s3.in -six==1.16.0 - # via - # -c ./ingest/../base.txt - # python-dateutil -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # aioitertools - # multidict -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # botocore -wrapt==1.16.0 - # via - # -c ./ingest/../base.txt - # aiobotocore -yarl==1.11.1 - # via aiohttp diff --git a/requirements/ingest/salesforce.in b/requirements/ingest/salesforce.in deleted file mode 100644 index 69547e5dd..000000000 --- a/requirements/ingest/salesforce.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -simple-salesforce diff --git a/requirements/ingest/salesforce.txt b/requirements/ingest/salesforce.txt deleted file mode 100644 index a87fe3bf4..000000000 --- a/requirements/ingest/salesforce.txt +++ /dev/null @@ -1,76 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/salesforce.in -# -attrs==24.2.0 - # via zeep -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # pyjwt -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -isodate==0.6.1 - # via zeep -lxml==5.3.0 - # via - # -c ./ingest/../base.txt - # zeep -more-itertools==10.5.0 - # via simple-salesforce -platformdirs==4.3.6 - # via zeep -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pyjwt[crypto]==2.9.0 - # via simple-salesforce -pytz==2024.2 - # via zeep -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # requests-file - # requests-toolbelt - # simple-salesforce - # zeep -requests-file==2.1.0 - # via zeep -requests-toolbelt==1.0.0 - # via - # -c ./ingest/../base.txt - # zeep -simple-salesforce==1.12.6 - # via -r ./ingest/salesforce.in -six==1.16.0 - # via - # -c ./ingest/../base.txt - # isodate -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # simple-salesforce -urllib3==1.26.20 - # 
via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -zeep==4.2.1 - # via simple-salesforce diff --git a/requirements/ingest/sftp.in b/requirements/ingest/sftp.in deleted file mode 100644 index e91c3eb34..000000000 --- a/requirements/ingest/sftp.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -fsspec -paramiko diff --git a/requirements/ingest/sftp.txt b/requirements/ingest/sftp.txt deleted file mode 100644 index 149af18f6..000000000 --- a/requirements/ingest/sftp.txt +++ /dev/null @@ -1,27 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/sftp.in -# -bcrypt==4.2.0 - # via paramiko -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography - # pynacl -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # paramiko -fsspec==2024.9.0 - # via -r ./ingest/sftp.in -paramiko==3.5.0 - # via -r ./ingest/sftp.in -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pynacl==1.5.0 - # via paramiko diff --git a/requirements/ingest/sharepoint.in b/requirements/ingest/sharepoint.in deleted file mode 100644 index 3b65d3029..000000000 --- a/requirements/ingest/sharepoint.in +++ /dev/null @@ -1,4 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -msal -Office365-REST-Python-Client diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt deleted file mode 100644 index 324fc52dd..000000000 --- a/requirements/ingest/sharepoint.txt +++ /dev/null @@ -1,55 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/sharepoint.in -# -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # msal - # pyjwt -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -msal==1.31.0 - # via - # -r ./ingest/sharepoint.in - # office365-rest-python-client -office365-rest-python-client==2.5.13 - # via -r ./ingest/sharepoint.in -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -pyjwt[crypto]==2.9.0 - # via msal -pytz==2024.2 - # via office365-rest-python-client -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # msal - # office365-rest-python-client -typing-extensions==4.12.2 - # via - # -c ./ingest/../base.txt - # office365-rest-python-client -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests diff --git a/requirements/ingest/singlestore.in b/requirements/ingest/singlestore.in deleted file mode 100644 index 5a7e51c28..000000000 --- a/requirements/ingest/singlestore.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -singlestoredb diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt deleted file mode 100644 index 226285577..000000000 --- a/requirements/ingest/singlestore.txt +++ /dev/null @@ -1,62 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/singlestore.in -# -build==1.2.2 - # via singlestoredb -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -importlib-metadata==8.5.0 - # via - # -c 
./ingest/../deps/constraints.txt - # build -packaging==24.1 - # via - # -c ./ingest/../base.txt - # build -parsimonious==0.10.0 - # via singlestoredb -pyjwt==2.9.0 - # via singlestoredb -pyproject-hooks==1.1.0 - # via build -regex==2024.9.11 - # via - # -c ./ingest/../base.txt - # parsimonious -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # singlestoredb -singlestoredb==1.6.3 - # via -r ./ingest/singlestore.in -sqlparams==6.1.0 - # via singlestoredb -tomli==2.0.1 - # via - # build - # singlestoredb -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -wheel==0.44.0 - # via singlestoredb -zipp==3.20.2 - # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/requirements/ingest/slack.in b/requirements/ingest/slack.in deleted file mode 100644 index 5eaa80bc7..000000000 --- a/requirements/ingest/slack.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -slack_sdk diff --git a/requirements/ingest/slack.txt b/requirements/ingest/slack.txt deleted file mode 100644 index f518e3e81..000000000 --- a/requirements/ingest/slack.txt +++ /dev/null @@ -1,8 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/slack.in -# -slack-sdk==3.33.1 - # via -r ./ingest/slack.in diff --git a/requirements/ingest/weaviate.in b/requirements/ingest/weaviate.in deleted file mode 100644 index e487fcead..000000000 --- a/requirements/ingest/weaviate.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -weaviate-client diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt deleted file mode 100644 index 8c457917f..000000000 --- a/requirements/ingest/weaviate.txt +++ /dev/null @@ -1,45 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/weaviate.in -# -authlib==1.3.2 - # via weaviate-client -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -cffi==1.17.1 - # via - # -c ./ingest/../base.txt - # cryptography -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests -cryptography==43.0.1 - # via - # -c ./ingest/../base.txt - # authlib -idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -pycparser==2.22 - # via - # -c ./ingest/../base.txt - # cffi -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # weaviate-client -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -validators==0.34.0 - # via weaviate-client -weaviate-client==3.26.7 - # via -r ./ingest/weaviate.in diff --git a/requirements/ingest/wikipedia.in b/requirements/ingest/wikipedia.in deleted file mode 100644 index fb68f0930..000000000 --- a/requirements/ingest/wikipedia.in +++ /dev/null @@ -1,3 +0,0 @@ --c ../deps/constraints.txt --c ../base.txt -wikipedia diff --git a/requirements/ingest/wikipedia.txt b/requirements/ingest/wikipedia.txt deleted file mode 100644 index 790b132de..000000000 --- a/requirements/ingest/wikipedia.txt +++ /dev/null @@ -1,37 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile ./ingest/wikipedia.in -# -beautifulsoup4==4.12.3 - # via - # -c ./ingest/../base.txt - # wikipedia -certifi==2024.8.30 - # via - # -c ./ingest/../base.txt - # requests -charset-normalizer==3.3.2 - # via - # -c ./ingest/../base.txt - # requests 
-idna==3.10 - # via - # -c ./ingest/../base.txt - # requests -requests==2.32.3 - # via - # -c ./ingest/../base.txt - # wikipedia -soupsieve==2.6 - # via - # -c ./ingest/../base.txt - # beautifulsoup4 -urllib3==1.26.20 - # via - # -c ./ingest/../base.txt - # -c ./ingest/../deps/constraints.txt - # requests -wikipedia==1.4.0 - # via -r ./ingest/wikipedia.in diff --git a/requirements/test.txt b/requirements/test.txt index 9585aaa4d..6c9660091 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,7 +6,7 @@ # annotated-types==0.7.0 # via pydantic -anyio==4.5.0 +anyio==4.6.0 # via # -c ./base.txt # httpx @@ -50,7 +50,7 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.1 # via -r ./test.in -grpcio==1.66.1 +grpcio==1.66.2 # via # -c ././deps/constraints.txt # -r ./test.in @@ -58,7 +58,7 @@ h11==0.14.0 # via # -c ./base.txt # httpcore -httpcore==1.0.5 +httpcore==1.0.6 # via # -c ./base.txt # httpx @@ -115,7 +115,7 @@ packaging==24.1 # -c ./base.txt # black # pytest -pandas==2.2.2 +pandas==2.2.3 # via label-studio-sdk pathspec==0.12.1 # via black @@ -185,7 +185,7 @@ sniffio==1.3.1 # httpx toml==0.10.2 # via liccheck -tomli==2.0.1 +tomli==2.0.2 # via # autoflake # black @@ -216,7 +216,7 @@ typing-extensions==4.12.2 # mypy # pydantic # pydantic-core -tzdata==2024.1 +tzdata==2024.2 # via pandas ujson==5.10.0 # via label-studio-sdk @@ -234,7 +234,7 @@ wrapt==1.16.0 # vcrpy xmljson==0.2.1 # via label-studio-sdk -yarl==1.11.1 +yarl==1.13.1 # via vcrpy # The following packages are considered to be unsafe in a requirements file: diff --git a/scripts/airtable-test-helpers/create_scale_test_components.py b/scripts/airtable-test-helpers/create_scale_test_components.py index af279457c..e2e11cb86 100644 --- a/scripts/airtable-test-helpers/create_scale_test_components.py +++ b/scripts/airtable-test-helpers/create_scale_test_components.py @@ -3,7 +3,7 @@ import os # import pyairtable as pyair from pyairtable import Api -from unstructured.ingest.logger import logger +from unstructured.logger import logger SCALE_TEST_NUMBER_OF_RECORDS = 20_000 diff --git a/scripts/consistent-deps.sh b/scripts/consistent-deps.sh index f210c6acb..81c772894 100755 --- a/scripts/consistent-deps.sh +++ b/scripts/consistent-deps.sh @@ -16,7 +16,7 @@ function join_by { } # NOTE(alan): Add any dependency files here we don't want to include in the resolution. -excludefiles=("requirements//build.txt") +excludefiles=("requirements/ingest/ingest.txt") # Build an array of requirements files. shopt -s nullglob @@ -39,6 +39,8 @@ reqstring=$(join_by ' -r ' "${reqfiles[@]}") reqstring="-r ${reqstring}" # This pip command will attempt to resolve the dependencies without installing anything. pipcommand="pip install --dry-run --ignore-installed ${reqstring}" +echo "dry run install of the following req files:" +echo "${pipcommand}" if $pipcommand >>/dev/null; then echo "Everything looks fine!" 
else diff --git a/setup.py b/setup.py index 89813f7c1..3b698e12e 100644 --- a/setup.py +++ b/setup.py @@ -121,58 +121,10 @@ setup( "rst": rst_reqs, "tsv": tsv_reqs, "xlsx": xlsx_reqs, - # Extra requirements for data connectors - "airtable": load_requirements("requirements/ingest/airtable.in"), - "astradb": load_requirements("requirements/ingest/astradb.in"), - "azure": load_requirements("requirements/ingest/azure.in"), - "azure-cognitive-search": load_requirements( - "requirements/ingest/azure-cognitive-search.in", - ), - "biomed": load_requirements("requirements/ingest/biomed.in"), - "box": load_requirements("requirements/ingest/box.in"), - "chroma": load_requirements("requirements/ingest/chroma.in"), - "clarifai": load_requirements("requirements/ingest/clarifai.in"), - "confluence": load_requirements("requirements/ingest/confluence.in"), - "delta-table": load_requirements("requirements/ingest/delta-table.in"), - "discord": load_requirements("requirements/ingest/discord.in"), - "dropbox": load_requirements("requirements/ingest/dropbox.in"), - "elasticsearch": load_requirements("requirements/ingest/elasticsearch.in"), - "gcs": load_requirements("requirements/ingest/gcs.in"), - "github": load_requirements("requirements/ingest/github.in"), - "gitlab": load_requirements("requirements/ingest/gitlab.in"), - "google-drive": load_requirements("requirements/ingest/google-drive.in"), - "hubspot": load_requirements("requirements/ingest/hubspot.in"), - "jira": load_requirements("requirements/ingest/jira.in"), - "kafka": load_requirements("requirements/ingest/kafka.in"), - "mongodb": load_requirements("requirements/ingest/mongodb.in"), - "notion": load_requirements("requirements/ingest/notion.in"), - "onedrive": load_requirements("requirements/ingest/onedrive.in"), - "opensearch": load_requirements("requirements/ingest/opensearch.in"), - "outlook": load_requirements("requirements/ingest/outlook.in"), - "pinecone": load_requirements("requirements/ingest/pinecone.in"), - "postgres": load_requirements("requirements/ingest/postgres.in"), - "qdrant": load_requirements("requirements/ingest/qdrant.in"), - "reddit": load_requirements("requirements/ingest/reddit.in"), - "s3": load_requirements("requirements/ingest/s3.in"), - "sharepoint": load_requirements("requirements/ingest/sharepoint.in"), - "salesforce": load_requirements("requirements/ingest/salesforce.in"), - "sftp": load_requirements("requirements/ingest/sftp.in"), - "slack": load_requirements("requirements/ingest/slack.in"), - "wikipedia": load_requirements("requirements/ingest/wikipedia.in"), - "weaviate": load_requirements("requirements/ingest/weaviate.in"), # Legacy extra requirements "huggingface": load_requirements("requirements/huggingface.in"), "local-inference": all_doc_reqs, "paddleocr": load_requirements("requirements/extra-paddleocr.in"), - "embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"), - "embed-mixedbreadai": load_requirements("requirements/ingest/embed-mixedbreadai.in"), - "embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"), - "embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"), - "embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"), - "openai": load_requirements("requirements/ingest/embed-openai.in"), - "bedrock": load_requirements("requirements/ingest/embed-aws-bedrock.in"), - "databricks-volumes": load_requirements("requirements/ingest/databricks-volumes.in"), - "singlestore": 
load_requirements("requirements/ingest/singlestore.in"), }, package_dir={"unstructured": "unstructured"}, package_data={"unstructured": ["nlp/*.txt", "py.typed"]}, diff --git a/test_unstructured/embed/test_mixedbreadai.py b/test_unstructured/embed/test_mixedbreadai.py index 015342677..0121d3d48 100644 --- a/test_unstructured/embed/test_mixedbreadai.py +++ b/test_unstructured/embed/test_mixedbreadai.py @@ -22,8 +22,8 @@ def test_embed_documents_does_not_break_element_to_dict(mocker): mock_client.embeddings.side_effect = mock_embeddings - # Mock create_client to return our mock_client - mocker.patch.object(MixedbreadAIEmbeddingEncoder, "create_client", return_value=mock_client) + # Mock get_client to return our mock_client + mocker.patch.object(MixedbreadAIEmbeddingConfig, "get_client", return_value=mock_client) encoder = MixedbreadAIEmbeddingEncoder( config=MixedbreadAIEmbeddingConfig( diff --git a/test_unstructured/embed/test_octoai.py b/test_unstructured/embed/test_octoai.py index df9b302e4..6b237ff84 100644 --- a/test_unstructured/embed/test_octoai.py +++ b/test_unstructured/embed/test_octoai.py @@ -7,8 +7,8 @@ def test_embed_documents_does_not_break_element_to_dict(mocker): mock_client = mocker.MagicMock() mock_client.embed_documents.return_value = [1, 2] - # Mock create_client to return our mock_client - mocker.patch.object(OctoAIEmbeddingEncoder, "create_client", return_value=mock_client) + # Mock get_client to return our mock_client + mocker.patch.object(OctoAiEmbeddingConfig, "get_client", return_value=mock_client) encoder = OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(api_key="api_key")) elements = encoder.embed_documents( diff --git a/test_unstructured/embed/test_openai.py b/test_unstructured/embed/test_openai.py index 7d37257b8..39148a454 100644 --- a/test_unstructured/embed/test_openai.py +++ b/test_unstructured/embed/test_openai.py @@ -7,8 +7,8 @@ def test_embed_documents_does_not_break_element_to_dict(mocker): mock_client = mocker.MagicMock() mock_client.embed_documents.return_value = [1, 2] - # Mock create_client to return our mock_client - mocker.patch.object(OpenAIEmbeddingEncoder, "create_client", return_value=mock_client) + # Mock get_client to return our mock_client + mocker.patch.object(OpenAIEmbeddingConfig, "get_client", return_value=mock_client) encoder = OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(api_key="api_key")) elements = encoder.embed_documents( diff --git a/test_unstructured/embed/test_vertexai.py b/test_unstructured/embed/test_vertexai.py index f754b19a1..3899a1994 100644 --- a/test_unstructured/embed/test_vertexai.py +++ b/test_unstructured/embed/test_vertexai.py @@ -8,7 +8,7 @@ def test_embed_documents_does_not_break_element_to_dict(mocker): mock_client.embed_documents.return_value = [1, 2] # Mock create_client to return our mock_client - mocker.patch.object(VertexAIEmbeddingEncoder, "create_client", return_value=mock_client) + mocker.patch.object(VertexAIEmbeddingConfig, "get_client", return_value=mock_client) encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(api_key="api_key")) elements = encoder.embed_documents( diff --git a/test_unstructured/embed/test_voyageai.py b/test_unstructured/embed/test_voyageai.py index cd4bd0551..b759e6153 100644 --- a/test_unstructured/embed/test_voyageai.py +++ b/test_unstructured/embed/test_voyageai.py @@ -7,8 +7,8 @@ def test_embed_documents_does_not_break_element_to_dict(mocker): mock_client = mocker.MagicMock() mock_client.embed_documents.return_value = [1, 2] - # Mock create_client to 
return our mock_client - mocker.patch.object(VoyageAIEmbeddingEncoder, "create_client", return_value=mock_client) + # Mock get_client to return our mock_client + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) encoder = VoyageAIEmbeddingEncoder( config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-law-2") diff --git a/test_unstructured/ingest/utils/test_compression.py b/test_unstructured/ingest/utils/test_compression.py deleted file mode 100644 index 7699a385e..000000000 --- a/test_unstructured/ingest/utils/test_compression.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -import tarfile - -from unstructured.ingest.utils.compression import uncompress_tar_file - - -def test_uncompress_tar_file(tmpdir): - tar_filename = os.path.join(tmpdir, "test.tar") - filename = "example-docs/fake-text.txt" - - with tarfile.open(tar_filename, "w:gz") as tar: - tar.add(filename, arcname=os.path.basename(filename)) - - path = uncompress_tar_file(tar_filename, path=tmpdir.dirname) - assert path == tmpdir.dirname diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index 8d8f5a7eb..487b98b2c 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -2,7 +2,6 @@ from __future__ import annotations import json import os -import re import pytest @@ -313,32 +312,6 @@ def test_catch_overlapping_and_nested_bboxes_non_overlapping_case(): assert overlapping_cases == [] -def test_validate_data_args(): - assert utils.validate_date_args("2020-10-10") is True - - with pytest.raises(ValueError): - utils.validate_date_args("blah") - - with pytest.raises(ValueError): - utils.validate_date_args(None) - - -@pytest.mark.parametrize( - "date", ["1990-12-01", "2050-01-01T00:00:00", "2050-01-01+00:00:00", "2022-02-12T14:30:00-0500"] -) -def test_validate_date_args_accepts_standard_formats(date): - assert utils.validate_date_args(date) - - -@pytest.mark.parametrize("date", [None, "not a date", "1990-12-33"]) -def test_validate_date_args_raises_for_invalid_formats(date): - pattern1 = re.compile(r"The argument.*?(?:is None).*") - pattern2 = re.compile(r"The argument.*?(?:does not satisfy the format: YYYY-MM-DD).*") - combined_pattern = re.compile(f"({pattern1.pattern}|{pattern2.pattern})") - with pytest.raises(ValueError, match=combined_pattern): - assert utils.validate_date_args(date) - - def test_only_returns_singleton_iterable(): singleton_iterable = [42] result = utils.only(singleton_iterable) diff --git a/test_unstructured_ingest/dest/astradb.sh b/test_unstructured_ingest/dest/astradb.sh deleted file mode 100755 index 77fc0e25e..000000000 --- a/test_unstructured_ingest/dest/astradb.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=astradb-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -if [ -z "$ASTRA_DB_APPLICATION_TOKEN" ]; then - echo "Skipping Astra DB ingest test because ASTRA_DB_APPLICATION_TOKEN env var is not set." - exit 0 -fi - -if [ -z "$ASTRA_DB_API_ENDPOINT" ]; then - echo "Skipping Astra DB ingest test because ASTRA_DB_API_ENDPOINT env var is not set." 
- exit 0 -fi - -RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) -COLLECTION_NAME="astradb_test_output_$RANDOM_SUFFIX" -EMBEDDING_DIMENSION=384 - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh - -function cleanup() { - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - - python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \ - --token "$ASTRA_DB_APPLICATION_TOKEN" \ - --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ - --collection-name "$COLLECTION_NAME" down -} - -trap cleanup EXIT - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --input-path example-docs/book-war-and-peace-1p.txt \ - --work-dir "$WORK_DIR" \ - --chunking-strategy by_title \ - --chunk-max-characters 1500 \ - --chunk-multipage-sections \ - --embedding-provider "langchain-huggingface" \ - astradb \ - --token "$ASTRA_DB_APPLICATION_TOKEN" \ - --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ - --collection-name "$COLLECTION_NAME" \ - --embedding-dimension "$EMBEDDING_DIMENSION" \ - --requested-indexing-policy '{"deny": ["metadata"]}' - -python "$SCRIPT_DIR"/python/test-ingest-astradb-output.py \ - --token "$ASTRA_DB_APPLICATION_TOKEN" \ - --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ - --collection-name "$COLLECTION_NAME" check diff --git a/test_unstructured_ingest/dest/azure-cognitive-search.sh b/test_unstructured_ingest/dest/azure-cognitive-search.sh deleted file mode 100755 index 8b534939f..000000000 --- a/test_unstructured_ingest/dest/azure-cognitive-search.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_FOLDER_NAME=azure-cog-search-dest -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -UPLOAD_DIR=$WORK_DIR/upload_stage -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -AZURE_SEARCH_ENDPOINT="https://ingest-test-azure-cognitive-search.search.windows.net" - -random_id=$(uuidgen) -# index name must be all lowercase -random_id=$(echo "$random_id" | tr '[:upper:]' '[:lower:]') -DESTINATION_INDEX="utic-test-ingest-fixtures-output-$random_id" -# The vector configs on the schema currently only exist on versions: -# 2023-07-01-Preview, 2021-04-30-Preview, 2020-06-30-Preview -API_VERSION=2023-07-01-Preview - -if [ -z "$AZURE_SEARCH_API_KEY" ] || [ -z "$AZURE_SEARCH_ENDPOINT" ]; then - echo "Skipping Azure Cognitive Search ingest test because AZURE_SEARCH_API_KEY or AZURE_SEARCH_ENDPOINT env var is not set." 
- exit 8 -fi - -endpoint="$AZURE_SEARCH_ENDPOINT/indexes/$DESTINATION_INDEX?api-version=$API_VERSION" -echo "Connecting to endpoint: $endpoint" -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup { - # Index cleanup - response_code=$(curl -s -o /dev/null -w "%{http_code}" \ - "$endpoint" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json') - if [ "$response_code" == "200" ]; then - echo "deleting index $DESTINATION_INDEX" - curl -X DELETE \ - "$endpoint" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json' - else - echo "Index $DESTINATION_INDEX does not exist, nothing to delete" - fi - - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" -} - -trap cleanup EXIT - -# Create index -echo "Creating index $DESTINATION_INDEX" -response=$(curl -X PUT -s -w "\n%{http_code}" \ - "$endpoint" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json' \ - --data "@$SCRIPT_DIR/files/azure_cognitive_index_schema.json") -response_code=$(echo "$response" | tail -n 1) # get the last line -content=$(echo "$response" | head -n 1) # get the first line -if [ "$response_code" -lt 400 ]; then - echo "Index creation success: $response_code" -else - echo "Index creation failure [$response_code]: $content" - exit 1 -fi - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - --chunking-strategy by_title \ - --chunk-combine-text-under-n-chars 150 \ - --chunk-new-after-n-chars 1500 \ - --chunk-max-characters 2500 \ - --chunk-multipage-sections \ - --chunk-no-include-orig-elements \ - --embedding-provider "langchain-huggingface" \ - azure-cognitive-search \ - --key "$AZURE_SEARCH_API_KEY" \ - --endpoint "$AZURE_SEARCH_ENDPOINT" \ - --index "$DESTINATION_INDEX" - -# It can take some time for the index to catch up with the content that was written, this check between 10s sleeps -# to give it that time process the writes. Will timeout after checking for a minute. -docs_count_remote=0 -attempt=1 -while [ "$docs_count_remote" -eq 0 ] && [ "$attempt" -lt 6 ]; do - echo "attempt $attempt: sleeping 10 seconds to let index finish catching up after writes" - sleep 10 - - # Check the contents of the index - docs_count_remote=$(curl "$AZURE_SEARCH_ENDPOINT/indexes/$DESTINATION_INDEX/docs/\$count?api-version=$API_VERSION" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json' | jq) - - echo "docs count pulled from Azure Cognitive Search: $docs_count_remote" - - attempt=$((attempt + 1)) -done - -docs_count_local=0 -for i in $(jq length "$UPLOAD_DIR"/*.json); do - docs_count_local=$((docs_count_local + i)) -done - -if [ "$docs_count_remote" -ne "$docs_count_local" ]; then - echo "Number of docs in Azure Cognitive Search $docs_count_remote doesn't match the expected docs: $docs_count_local" - exit 1 -fi diff --git a/test_unstructured_ingest/dest/azure.sh b/test_unstructured_ingest/dest/azure.sh deleted file mode 100755 index 208b4a5a4..000000000 --- a/test_unstructured_ingest/dest/azure.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. 
|| exit 1 -OUTPUT_FOLDER_NAME=azure-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -if [ -z "$AZURE_DEST_CONNECTION_STR" ]; then - echo "Skipping Azure destination ingest test because the AZURE_DEST_CONNECTION_STR env var is not set." - exit 8 -fi - -CONTAINER=utic-ingest-test-fixtures-output -DIRECTORY=$(uuidgen) -REMOTE_URL_RAW="$CONTAINER/$DIRECTORY/" -REMOTE_URL="abfs://$REMOTE_URL_RAW" - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup() { - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - - python "$SCRIPT_DIR"/python/test-azure-output.py down \ - --connection-string "$AZURE_DEST_CONNECTION_STR" \ - --container "$CONTAINER" \ - --blob-path "$DIRECTORY" - -} -trap cleanup EXIT - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - azure \ - --overwrite \ - --remote-url "$REMOTE_URL" \ - --connection-string "$AZURE_DEST_CONNECTION_STR" - -# Simply check the number of files uploaded -python "$SCRIPT_DIR"/python/test-azure-output.py check \ - --expected-files 1 \ - --connection-string "$AZURE_DEST_CONNECTION_STR" \ - --container "$CONTAINER" \ - --blob-path "$DIRECTORY" diff --git a/test_unstructured_ingest/dest/box.sh b/test_unstructured_ingest/dest/box.sh deleted file mode 100755 index 37ad702dd..000000000 --- a/test_unstructured_ingest/dest/box.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash -#TODO currently box api/sdk does not work to create folders and check for content similar to other fsspec ingest tests - -# -#set -e -# -#DEST_PATH=$(dirname "$(realpath "$0")") -#SCRIPT_DIR=$(dirname "$DEST_PATH") -#cd "$SCRIPT_DIR"/.. || exit 1 -#OUTPUT_FOLDER_NAME=box-dest -#OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -#WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -#max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -#DESTINATION_BOX="box://utic-dev-tech-fixtures/utic-ingest-test-fixtures-output/$(uuidgen)/" -# -#CI=${CI:-"false"} -# -#if [ -z "$BOX_APP_CONFIG" ] && [ -z "$BOX_APP_CONFIG_PATH" ]; then -# echo "Skipping Box ingest test because neither BOX_APP_CONFIG nor BOX_APP_CONFIG_PATH env vars are set." 
-# exit 0 -#fi -# -#if [ -z "$BOX_APP_CONFIG_PATH" ]; then -# # Create temporary service key file -# BOX_APP_CONFIG_PATH=$(mktemp) -# echo "$BOX_APP_CONFIG" >"$BOX_APP_CONFIG_PATH" -#fi -# -## shellcheck disable=SC1091 -#source "$SCRIPT_DIR"/cleanup.sh -#function cleanup() { -# cleanup_dir "$OUTPUT_DIR" -# cleanup_dir "$WORK_DIR" -# if [ "$CI" == "true" ]; then -# cleanup_dir "$DOWNLOAD_DIR" -# fi -#} -#trap cleanup EXIT -# -#RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -#PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ -# local \ -# --num-processes "$max_processes" \ -# --output-dir "$OUTPUT_DIR" \ -# --strategy fast \ -# --verbose \ -# --reprocess \ -# --input-path example-docs/pdf/fake-memo.pdf \ -# --work-dir "$WORK_DIR" \ -# box \ -# --box-app-config "$BOX_APP_CONFIG_PATH" \ -# --remote-url "$DESTINATION_BOX" \ -# -## Simply check the number of files uploaded -#expected_num_files=1 diff --git a/test_unstructured_ingest/dest/chroma.sh b/test_unstructured_ingest/dest/chroma.sh deleted file mode 100755 index 926cb4380..000000000 --- a/test_unstructured_ingest/dest/chroma.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=chroma-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -DESTINATION_PATH=$SCRIPT_DIR/chroma-dest -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -CI=${CI:-"false"} - -RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) - -COLLECTION_NAME="chroma-test-output-$RANDOM_SUFFIX" - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh - -function cleanup() { - # Kill chroma background process - pgrep -f chroma-dest | xargs kill - cleanup_dir "$DESTINATION_PATH" - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} - -trap cleanup EXIT - -# Run chroma from different script so it can be forced into background -scripts/chroma-test-helpers/create-and-check-chroma.sh "$DESTINATION_PATH" -wait -sleep 5 - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --input-path example-docs/book-war-and-peace-1p.txt \ - --work-dir "$WORK_DIR" \ - --chunking-strategy by_title \ - --chunk-max-characters 1500 \ - --chunk-multipage-sections \ - --embedding-provider "langchain-huggingface" \ - chroma \ - --host "localhost" \ - --port 8000 \ - --collection-name "$COLLECTION_NAME" \ - --tenant "default_tenant" \ - --database "default_database" \ - --batch-size 80 - -python "$SCRIPT_DIR"/python/test-ingest-chroma-output.py --collection-name "$COLLECTION_NAME" diff --git a/test_unstructured_ingest/dest/clarifai.sh b/test_unstructured_ingest/dest/clarifai.sh deleted file mode 100755 index 2ed046aae..000000000 --- a/test_unstructured_ingest/dest/clarifai.sh +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. 
|| exit 1 -OUTPUT_FOLDER_NAME=clarifai-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -if [ -z "$CLARIFAI_API_KEY" ]; then - echo "Skipping Clarifai ingest test because CLARIFAI_API_KEY env var is not set." - exit 0 - -fi - -RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) -# Set the variables with default values -USER_ID="unstructured" -APP_ID="test-app-unstructured-$RANDOM_SUFFIX" - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup { - # Get response code to check if app really exists - response_code=$(curl \ - -s -o /dev/null \ - -w "%{http_code}" \ - --request GET "https://api.clarifai.com/v2/users/$USER_ID/apps/$APP_ID" \ - --header "Authorization: Key $CLARIFAI_API_KEY") - - # Cleanup (delete) index if it exists - if [ "$response_code" == "200" ]; then - echo "" - echo "deleting clarifai app $APP_ID" - curl --request DELETE "https://api.clarifai.com/v2/users/$USER_ID/apps/$APP_ID" \ - -H "Authorization: Key $CLARIFAI_API_KEY" - - else - echo "There was an error during deletion of clarifai app $APP_ID, with response code: $response_code. App might not exists in your account." - fi - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" -} - -trap cleanup EXIT - -echo "Creating Clarifai app $APP_ID" -response_code=$( - curl \ - -s -o /dev/null \ - -w "%{http_code}" \ - --location --request POST "https://api.clarifai.com/v2/users/$USER_ID/apps/" \ - --header "Content-Type: application/json" \ - --header "Authorization: Key $CLARIFAI_API_KEY" \ - --data-raw "{\"apps\": [{\"id\": \"$APP_ID\", \"default_workflow_id\": \"Universal\"}]}" -) -if [ "$response_code" -lt 400 ]; then - echo "App created successfully: $APP_ID" -else - echo "Failed to create app $APP_ID: $response_code" - exit 1 -fi - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --input-path example-docs/book-war-and-peace-1p.txt \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --chunking-strategy by_title \ - --num-processes "$max_processes" \ - --work-dir "$WORK_DIR" \ - --verbose \ - clarifai \ - --app-id "$APP_ID" \ - --user-id "$USER_ID" \ - --api-key "$CLARIFAI_API_KEY" \ - --batch-size 100 - -no_of_inputs=0 -sleep_time=5 - -max_retries=10 -retry_count=0 - -while [ "$no_of_inputs" -eq 0 ]; do - echo "checking for no of inputs in clarifai app" - sleep $sleep_time - - if [ "$retry_count" -eq "$max_retries" ]; then - echo "Reached maximum retries limit. Exiting..." - break - fi - - resp=$(curl \ - -s GET "https://api.clarifai.com/v2/users/$USER_ID/apps/$APP_ID/inputs/status" \ - -H "Authorization: Key $CLARIFAI_API_KEY") - - no_of_inputs=$(echo "$resp" | jq -r '.counts.processed' | sed 's/\x1b\[[0-9;]*m//g') - echo "Processed count: $no_of_inputs" - retry_count=$((retry_count + 1)) - -done - -EXPECTED=8 - -if [ "$no_of_inputs" -ne "$EXPECTED" ]; then - echo "Number of inputs in the clarifai app $APP_ID is not equal to expected. Test failed." - exit 1 - -fi diff --git a/test_unstructured_ingest/dest/databricks-volumes.sh b/test_unstructured_ingest/dest/databricks-volumes.sh deleted file mode 100755 index 6cf6e38a2..000000000 --- a/test_unstructured_ingest/dest/databricks-volumes.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. 
|| exit 1 -OUTPUT_FOLDER_NAME=databricks-volumes -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -DESTINATION_PATH=$SCRIPT_DIR/databricks-volumes -CI=${CI:-"false"} - -RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) - -DATABRICKS_VOLUME="test-platform" -DATABRICKS_VOLUME_PATH="databricks-volumes-test-output-$RANDOM_SUFFIX" - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh - -function cleanup() { - python "$SCRIPT_DIR"/python/test-databricks-volumes.py cleanup \ - --host "$DATABRICKS_HOST" \ - --username "$DATABRICKS_USERNAME" \ - --password "$DATABRICKS_PASSWORD" \ - --volume "$DATABRICKS_VOLUME" \ - --catalog "$DATABRICKS_CATALOG" \ - --volume-path "$DATABRICKS_VOLUME_PATH" - - cleanup_dir "$DESTINATION_PATH" - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} - -trap cleanup EXIT - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - databricks-volumes \ - --host "$DATABRICKS_HOST" \ - --username "$DATABRICKS_USERNAME" \ - --password "$DATABRICKS_PASSWORD" \ - --volume "$DATABRICKS_VOLUME" \ - --catalog "$DATABRICKS_CATALOG" \ - --volume-path "$DATABRICKS_VOLUME_PATH" - -python "$SCRIPT_DIR"/python/test-databricks-volumes.py test \ - --host "$DATABRICKS_HOST" \ - --username "$DATABRICKS_USERNAME" \ - --password "$DATABRICKS_PASSWORD" \ - --volume "$DATABRICKS_VOLUME" \ - --catalog "$DATABRICKS_CATALOG" \ - --volume-path "$DATABRICKS_VOLUME_PATH" diff --git a/test_unstructured_ingest/dest/delta-table.sh b/test_unstructured_ingest/dest/delta-table.sh deleted file mode 100755 index cf54e1054..000000000 --- a/test_unstructured_ingest/dest/delta-table.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=delta-table-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -DESTINATION_TABLE=$SCRIPT_DIR/delta-table-dest -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -CI=${CI:-"false"} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh - -function cleanup() { - cleanup_dir "$DESTINATION_TABLE" - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" -} - -trap cleanup EXIT - -# Make sure directory doesn't exist at the beginning of script as this will cause it to break -if [ -d "$DESTINATION_TABLE" ]; then - echo "cleaning up directory: $DESTINATION_TABLE" - rm -rf "$DESTINATION_TABLE" -else - echo "$DESTINATION_TABLE does not exist or is not a directory, skipping deletion" -fi - -PYTHONPATH=. 
./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - delta-table \ - --table-uri "$DESTINATION_TABLE" - -python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE" diff --git a/test_unstructured_ingest/dest/dropbox.sh b/test_unstructured_ingest/dest/dropbox.sh deleted file mode 100755 index 52ade6722..000000000 --- a/test_unstructured_ingest/dest/dropbox.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=dropbox-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -DESTINATION_DROPBOX="/test-output/$(uuidgen)" -CI=${CI:-"false"} - -if [ -z "$DROPBOX_APP_KEY" ] || [ -z "$DROPBOX_APP_SECRET" ] || [ -z "$DROPBOX_REFRESH_TOKEN" ]; then - echo "Skipping Dropbox ingest test because one or more of these env vars is not set:" - echo "DROPBOX_APP_KEY, DROPBOX_APP_SECRET, DROPBOX_REFRESH_TOKEN" - exit 8 -fi - -# Get a new access token from Dropbox -DROPBOX_RESPONSE=$(curl -s https://api.dropbox.com/oauth2/token -d refresh_token="$DROPBOX_REFRESH_TOKEN" -d grant_type=refresh_token -d client_id="$DROPBOX_APP_KEY" -d client_secret="$DROPBOX_APP_SECRET") -DROPBOX_ACCESS_TOKEN=$(jq -r '.access_token' <<<"$DROPBOX_RESPONSE") - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup() { - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - - echo "deleting test folder $DESTINATION_DROPBOX" - curl -X POST https://api.dropboxapi.com/2/files/delete_v2 \ - --header "Content-Type: application/json" \ - --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \ - --data "{\"path\":\"$DESTINATION_DROPBOX\"}" | jq -} -trap cleanup EXIT - -# Create new folder for test -echo "creating temp directory in dropbox for testing: $DESTINATION_DROPBOX" -response=$(curl -X POST -s -w "\n%{http_code}" https://api.dropboxapi.com/2/files/create_folder_v2 \ - --header "Content-Type: application/json" \ - --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \ - --data "{\"autorename\":false,\"path\":\"$DESTINATION_DROPBOX\"}") -http_code=$(tail -n1 <<<"$response") # get the last line -content=$(sed '$ d' <<<"$response") # get all but the last line which contains the status code - -if [ "$http_code" -ge 300 ]; then - echo "Failed to create temp dir in dropbox: [$http_code] $content" - exit 1 -else - echo "$http_code:" - jq <<<"$content" -fi - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - dropbox \ - --token "$DROPBOX_ACCESS_TOKEN" \ - --remote-url "dropbox://$DESTINATION_DROPBOX" - -# Simply check the number of files uploaded -expected_num_files=1 -num_files_in_dropbox=$(curl -X POST https://api.dropboxapi.com/2/files/list_folder \ - --header "Content-Type: application/json" \ - --header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \ - --data "{\"path\":\"$DESTINATION_DROPBOX/\"}" | jq '.entries | length') 
-if [ "$num_files_in_dropbox" -ne "$expected_num_files" ]; then - echo "Expected $expected_num_files files to be uploaded to dropbox, but found $num_files_in_dropbox files." - exit 1 -fi diff --git a/test_unstructured_ingest/dest/elasticsearch.sh b/test_unstructured_ingest/dest/elasticsearch.sh deleted file mode 100755 index c4e6c8fe2..000000000 --- a/test_unstructured_ingest/dest/elasticsearch.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=elasticsearch-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -CI=${CI:-"false"} -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -# shellcheck disable=SC1091 -source scripts/elasticsearch-test-helpers/common/es-dest-ingest-test-creds.env -function cleanup { - # Index cleanup - echo "Stopping Elasticsearch Docker container" - docker-compose -f scripts/elasticsearch-test-helpers/common/docker-compose.yaml down --remove-orphans -v - - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} - -trap cleanup EXIT - -echo "Creating elasticsearch instance" -# shellcheck source=/dev/null -scripts/elasticsearch-test-helpers/destination_connector/create-elasticsearch-instance.sh -wait - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/book-war-and-peace-1225p.txt \ - --work-dir "$WORK_DIR" \ - --chunking-strategy by_title \ - --chunk-combine-text-under-n-chars 200 \ - --chunk-new-after-n-chars 2500 \ - --chunk-max-characters 38000 \ - --chunk-multipage-sections \ - --embedding-provider "langchain-huggingface" \ - elasticsearch \ - --hosts http://localhost:9200 \ - --index-name ingest-test-destination \ - --username "$ELASTIC_USER" \ - --password "$ELASTIC_PASSWORD" \ - --batch-size-bytes 15000000 \ - --num-threads "$max_processes" - -desired_count=$(cat "$WORK_DIR"/upload_stage/* | jq 'length') -desired_embeddings=$(cat "$WORK_DIR"/upload_stage/* | jq '.[0]._source.embeddings' | tr -d '\n') - -PYTHONPATH=. scripts/elasticsearch-test-helpers/destination_connector/test-ingest-elasticsearch-output.py \ - --num-elements "$desired_count" \ - --embeddings "$desired_embeddings" diff --git a/test_unstructured_ingest/dest/gcs.sh b/test_unstructured_ingest/dest/gcs.sh deleted file mode 100755 index 21571a937..000000000 --- a/test_unstructured_ingest/dest/gcs.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=gcs-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -BUCKET="utic-test-ingest-fixtures-output" -DIRECTORY=$(uuidgen) -DESTINATION_GCS="gs://$BUCKET/$DIRECTORY" -CI=${CI:-"false"} - -if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then - echo "Skipping Google Drive ingest test because the GCP_INGEST_SERVICE_KEY env var is not set." 
- exit 8 -fi - -# Create temporary service key file -GCP_INGEST_SERVICE_KEY_FILE=$(mktemp) -echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE" - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup() { - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - - python "$SCRIPT_DIR"/python/test-gcs-output.py down \ - --service-account-file "$GCP_INGEST_SERVICE_KEY_FILE" \ - --bucket "$BUCKET" \ - --blob-path "$DIRECTORY" - -} - -trap cleanup EXIT - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - gcs \ - --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ - --remote-url "$DESTINATION_GCS" - -# Simply check the number of files uploaded -python "$SCRIPT_DIR"/python/test-gcs-output.py check \ - --expected-files 1 \ - --service-account-file "$GCP_INGEST_SERVICE_KEY_FILE" \ - --bucket "$BUCKET" \ - --blob-path "$DIRECTORY" diff --git a/test_unstructured_ingest/dest/kafka-local.sh b/test_unstructured_ingest/dest/kafka-local.sh deleted file mode 100755 index 9086687ed..000000000 --- a/test_unstructured_ingest/dest/kafka-local.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=local-kafka-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) - -LC_ALL=C - -# Set the variables with default values if they're not set in the environment -KAFKA_TOPIC=${KAFKA_TOPIC:-"ingest-test-$RANDOM_SUFFIX"} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup { - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" - - echo "Stopping local Kafka instance" - docker-compose -f scripts/kafka-test-helpers/docker-compose.yml down --remove-orphans -v -} - -trap cleanup EXIT - -echo "Creating local Kafka instance" -# shellcheck source=/dev/null -scripts/kafka-test-helpers/create-kafka-instance.sh -wait - -PYTHONPATH=. 
./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/layout-parser-paper.pdf \ - --work-dir "$WORK_DIR" \ - --chunking-strategy basic \ - --chunk-combine-text-under-n-chars 200 \ - --chunk-new-after-n-chars 2500 \ - --chunk-max-characters 38000 \ - --chunk-multipage-sections \ - --embedding-provider "langchain-huggingface" \ - kafka \ - --topic "$KAFKA_TOPIC" \ - --bootstrap-server "$KAFKA_BOOTSTRAP_SERVER" \ - --port 29092 \ - --confluent false - -echo "Checking for matching messages in Kafka" - -#Check the number of messages in destination topic -python "$SCRIPT_DIR"/python/test-kafka-output.py check \ - --bootstrap-server "$KAFKA_BOOTSTRAP_SERVER" \ - --topic "$KAFKA_TOPIC" \ - --confluent false \ - --port 29092 diff --git a/test_unstructured_ingest/dest/mongodb.sh b/test_unstructured_ingest/dest/mongodb.sh deleted file mode 100755 index 938af0d5f..000000000 --- a/test_unstructured_ingest/dest/mongodb.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env bash -# shellcheck disable=SC2012 - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=mongodb-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -DESTINATION_MONGO_COLLECTION="utic-test-ingest-fixtures-output-$(uuidgen)" -CI=${CI:-"false"} - -if [ -z "$MONGODB_URI" ] && [ -z "$MONGODB_DATABASE_NAME" ]; then - echo "Skipping MongoDB destination ingest test because the MONGODB_URI and MONGODB_DATABASE_NAME env var are not set." - exit 8 -fi - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup() { - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - - python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \ - --uri "$MONGODB_URI" \ - --database "$MONGODB_DATABASE_NAME" \ - --collection "$DESTINATION_MONGO_COLLECTION" down - -} - -trap cleanup EXIT - -# NOTE(robinson) - per pymongo docs, pymongo ships with its own version of the bson library, -# which is incompatible with the bson installed from pypi. bson is installed as part of the -# astradb dependencies. 
-# ref: https://pymongo.readthedocs.io/en/stable/installation.html -python -m pip uninstall -y bson pymongo -make install-ingest-mongodb - -python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \ - --uri "$MONGODB_URI" \ - --database "$MONGODB_DATABASE_NAME" \ - --collection "$DESTINATION_MONGO_COLLECTION" up - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-huggingface" \ - mongodb \ - --uri "$MONGODB_URI" \ - --database "$MONGODB_DATABASE_NAME" \ - --collection "$DESTINATION_MONGO_COLLECTION" - -python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \ - --uri "$MONGODB_URI" \ - --database "$MONGODB_DATABASE_NAME" \ - --collection "$DESTINATION_MONGO_COLLECTION" \ - check --expected-records 5 - -stage_file=$(ls -1 "$WORK_DIR"/upload_stage | head -n 1) -python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \ - --uri "$MONGODB_URI" \ - --database "$MONGODB_DATABASE_NAME" \ - --collection "$DESTINATION_MONGO_COLLECTION" \ - check-vector \ - --output-json "$WORK_DIR"/upload_stage/"$stage_file" diff --git a/test_unstructured_ingest/dest/opensearch.sh b/test_unstructured_ingest/dest/opensearch.sh deleted file mode 100755 index 003e4f286..000000000 --- a/test_unstructured_ingest/dest/opensearch.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=opensearch-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -CI=${CI:-"false"} -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup { - # Index cleanup - echo "Stopping OpenSearch Docker container" - docker-compose -f scripts/opensearch-test-helpers/common/docker-compose.yaml down --remove-orphans -v - - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} - -trap cleanup EXIT - -echo "Creating opensearch instance" -# shellcheck source=/dev/null -scripts/opensearch-test-helpers/destination_connector/create-opensearch-instance.sh -wait - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-huggingface" \ - opensearch \ - --hosts http://localhost:9247 \ - --index-name ingest-test-destination \ - --username "admin" \ - --password "admin" \ - --use-ssl \ - --batch-size-bytes 150 \ - --num-threads "$max_processes" - -scripts/opensearch-test-helpers/destination_connector/test-ingest-opensearch-output.py diff --git a/test_unstructured_ingest/dest/pgvector.sh b/test_unstructured_ingest/dest/pgvector.sh deleted file mode 100755 index 25836cf1d..000000000 --- a/test_unstructured_ingest/dest/pgvector.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. 
|| exit 1 -OUTPUT_FOLDER_NAME=sql-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -CI=${CI:-"false"} -DATABASE_TYPE="pgvector" - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup { - echo "Stopping SQL DB Docker container" - docker-compose -f scripts/sql-test-helpers/docker-compose-"$DATABASE_TYPE".yaml down --remove-orphans -v - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} - -trap cleanup EXIT - -# Create sql instance and create `elements` class -echo "Creating SQL DB instance" -# shellcheck source=/dev/null -scripts/sql-test-helpers/create-sql-instance.sh "$DATABASE_TYPE" -wait - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-huggingface" \ - sql \ - --db-type "postgresql" \ - --username unstructured \ - --password test \ - --host localhost \ - --port 5433 \ - --database elements - -"$SCRIPT_DIR"/python/test-ingest-sql-output.py "$DATABASE_TYPE" "5433" diff --git a/test_unstructured_ingest/dest/pinecone.sh b/test_unstructured_ingest/dest/pinecone.sh deleted file mode 100755 index 45adaca83..000000000 --- a/test_unstructured_ingest/dest/pinecone.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=s3-pinecone-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -writer_processes=$(((max_processes - 1) > 1 ? (max_processes - 1) : 2)) - -if [ -z "$PINECONE_API_KEY" ]; then - echo "Skipping Pinecone ingest test because PINECONE_API_KEY env var is not set." - exit 0 -fi - -RANDOM_SUFFIX=$((RANDOM % 100000 + 1)) - -# Set the variables with default values if they're not set in the environment -PINECONE_INDEX=${PINECONE_INDEX:-"ingest-test-$RANDOM_SUFFIX"} -PINECONE_HOST_POSTFIX=${PINECONE_HOST_POSTFIX:-"4627-b74a"} -PINECONE_ENVIRONMENT=${PINECONE_ENVIRONMENT:-"us-east1-gcp"} -PINECONE_PROJECT_ID=${PINECONE_PROJECT_ID:-"art8iaj"} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup { - - # Get response code to check if index exists - response_code=$(curl \ - -s -o /dev/null \ - -w "%{http_code}" \ - --request GET \ - --url "https://api.pinecone.io/indexes/$PINECONE_INDEX" \ - --header 'accept: application/json' \ - --header "Api-Key: $PINECONE_API_KEY") - - # Cleanup (delete) index if it exists - if [ "$response_code" == "200" ]; then - echo "" - echo "deleting index $PINECONE_INDEX" - curl --request DELETE \ - "https://api.pinecone.io/indexes/$PINECONE_INDEX" \ - --header "Api-Key: $PINECONE_API_KEY" \ - --header 'content-type: application/json' - - else - echo "There was an error during index deletion for index $PINECONE_INDEX, with response code: $response_code. It might be that index $PINECONE_INDEX does not exist, so there is nothing to delete." 
- fi - - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" -} - -trap cleanup EXIT - -echo "Creating index $PINECONE_INDEX" -response_code=$(curl \ - -s -o /dev/null \ - -w "%{http_code}" \ - --request POST \ - --url "https://api.pinecone.io/indexes" \ - --header "accept: application/json" \ - --header "content-type: application/json" \ - --header "Api-Key: $PINECONE_API_KEY" \ - --data ' -{ - "name": "'"$PINECONE_INDEX"'", - "dimension": 384, - "metric": "cosine", - "spec": { - "serverless": { - "cloud": "aws", - "region": "us-east-1" - } - } -} -') - -if [ "$response_code" -lt 400 ]; then - echo "Index creation success: $response_code" -else - echo "Index creation failure: $response_code" - exit 1 -fi - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/book-war-and-peace-1225p.txt \ - --work-dir "$WORK_DIR" \ - --chunking-strategy by_title \ - --chunk-combine-text-under-n-chars 150 --chunk-new-after-n-chars 1500 --chunk-max-characters 2500 --chunk-multipage-sections \ - --embedding-provider "langchain-huggingface" \ - pinecone \ - --api-key "$PINECONE_API_KEY" \ - --index-name "$PINECONE_INDEX" \ - --environment "$PINECONE_ENVIRONMENT" \ - --batch-size 80 \ - --num-processes "$writer_processes" - -# It can take some time for the index to catch up with the content that was written, this check between 10s sleeps -# to give it that time process the writes. Will timeout after checking for a minute. -num_of_vectors_remote=0 -attempt=1 -sleep_amount=30 -while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do - echo "attempt $attempt: sleeping $sleep_amount seconds to let index finish catching up after writes" - sleep $sleep_amount - - num_of_vectors_remote=$(curl --request POST \ - -s \ - --url "https://$PINECONE_INDEX-$PINECONE_PROJECT_ID.svc.aped-$PINECONE_HOST_POSTFIX.pinecone.io/describe_index_stats" \ - --header "accept: application/json" \ - --header "content-type: application/json" \ - --header "Api-Key: $PINECONE_API_KEY" | jq -r '.totalVectorCount') - - echo "vector count in Pinecone: $num_of_vectors_remote" - attempt=$((attempt + 1)) -done - -EXPECTED=1835 - -if [ "$num_of_vectors_remote" -ne $EXPECTED ]; then - echo "Number of vectors in Pinecone are $num_of_vectors_remote when the expected number is $EXPECTED. Test failed." - exit 1 -fi diff --git a/test_unstructured_ingest/dest/qdrant.sh b/test_unstructured_ingest/dest/qdrant.sh deleted file mode 100755 index ec9cf7cee..000000000 --- a/test_unstructured_ingest/dest/qdrant.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash - -set -ex - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=qdrant-dest -OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -writer_processes=$(((max_processes - 1) > 1 ? 
(max_processes - 1) : 2)) -CONTAINTER_NAME="qdrant_test" -QDRANT_PORT=6333 -QDRANT_HOST=localhost:$QDRANT_PORT -COLLECTION_NAME="qdrant-test-$(date +%s)" -EXPECTED_POINTS_COUNT=1387 -RETRIES=5 - -function stop_docker() { - docker stop $CONTAINTER_NAME -} - -docker run -d --rm \ - -p 6333:$QDRANT_PORT \ - --name $CONTAINTER_NAME qdrant/qdrant:latest - -trap stop_docker SIGINT -trap stop_docker ERR - -until curl --output /dev/null --silent --get --fail http://$QDRANT_HOST/collections; do - RETRIES=$((RETRIES - 1)) - if [ "$RETRIES" -le 0 ]; then - echo "Qdrant server failed to start" - stop_docker - exit 1 - fi - printf 'Waiting for Qdrant server to start...' - sleep 5 -done - -curl -X PUT \ - http://$QDRANT_HOST/collections/"$COLLECTION_NAME" \ - -H 'Content-Type: application/json' \ - -d '{ - "vectors": { - "size": 384, - "distance": "Cosine" - } -}' - -EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"} - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/book-war-and-peace-1225p.txt \ - --work-dir "$WORK_DIR" \ - --chunking-strategy by_title \ - --chunk-combine-text-under-n-chars 200 --chunk-new-after-n-chars 2500 --chunk-max-characters 38000 --chunk-multipage-sections \ - --embedding-provider "langchain-huggingface" \ - qdrant \ - --collection-name "$COLLECTION_NAME" \ - --location "http://"$QDRANT_HOST \ - --batch-size 80 \ - --num-processes "$writer_processes" - -response=$(curl -s -X POST \ - $QDRANT_HOST/collections/"$COLLECTION_NAME"/points/count \ - -H 'Content-Type: application/json' \ - -d '{ - "exact": true -}') - -count=$(echo "$response" | jq -r '.result.count') - -if [ "$count" -ne $EXPECTED_POINTS_COUNT ]; then - echo "Points count assertion failed. Expected: $EXPECTED. Got: $count. Test failed." - stop_docker - exit 1 -fi - -stop_docker diff --git a/test_unstructured_ingest/dest/s3.sh b/test_unstructured_ingest/dest/s3.sh deleted file mode 100755 index b8d0b901e..000000000 --- a/test_unstructured_ingest/dest/s3.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=s3-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} -DESTINATION_S3="s3://utic-ingest-test-fixtures/destination/$(uuidgen)/" -CI=${CI:-"false"} - -if [ -z "$S3_INGEST_TEST_ACCESS_KEY" ] || [ -z "$S3_INGEST_TEST_SECRET_KEY" ]; then - echo "Skipping S3 ingest test because S3_INGEST_TEST_ACCESS_KEY or S3_INGEST_TEST_SECRET_KEY env var is not set." 
- exit 8 -fi - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup() { - cleanup_dir "$WORK_DIR" - - if AWS_ACCESS_KEY_ID="$S3_INGEST_TEST_ACCESS_KEY" AWS_SECRET_ACCESS_KEY="$S3_INGEST_TEST_SECRET_KEY" aws s3 ls "$DESTINATION_S3" --region us-east-2; then - echo "deleting destination s3 location: $DESTINATION_S3" - AWS_ACCESS_KEY_ID="$S3_INGEST_TEST_ACCESS_KEY" AWS_SECRET_ACCESS_KEY="$S3_INGEST_TEST_SECRET_KEY" aws s3 rm "$DESTINATION_S3" --recursive --region us-east-2 - fi - -} -trap cleanup EXIT - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - local \ - --num-processes "$max_processes" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - s3 \ - --key "$S3_INGEST_TEST_ACCESS_KEY" \ - --secret "$S3_INGEST_TEST_SECRET_KEY" \ - --remote-url "$DESTINATION_S3" - -# Simply check the number of files uploaded -expected_num_files=1 -num_files_in_s3=$(AWS_ACCESS_KEY_ID="$S3_INGEST_TEST_ACCESS_KEY" AWS_SECRET_ACCESS_KEY="$S3_INGEST_TEST_SECRET_KEY" aws s3 ls "${DESTINATION_S3}" --region us-east-2 | grep -c "\.json$") -if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then - echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files." - exit 1 -else - echo "Expected number of files found: $num_files_in_s3/$expected_num_files" -fi diff --git a/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh b/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh deleted file mode 100755 index 5c222a459..000000000 --- a/test_unstructured_ingest/dest/sharepoint-embed-cog-index.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=sharepoint-azure-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME -DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(uuidgen)" -# The vector configs on the schema currently only exist on versions: -# 2023-07-01-Preview, 2021-04-30-Preview, 2020-06-30-Preview -API_VERSION=2023-07-01-Preview -CI=${CI:-"false"} - -if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then - echo "Skipping Sharepoint ingest test because the SHAREPOINT_CLIENT_ID or SHAREPOINT_CRED env var is not set." - exit 8 -fi - -if [ -z "$SHAREPOINT_PERMISSIONS_APP_ID" ] || [ -z "$SHAREPOINT_PERMISSIONS_APP_CRED" ] || [ -z "$SHAREPOINT_PERMISSIONS_TENANT" ]; then - echo "Skipping Sharepoint ingest test because the SHAREPOINT_PERMISSIONS_APP_ID, SHAREPOINT_PERMISSIONS_APP_CRED, or SHAREPOINT_PERMISSIONS_TENANT env var is not set." - exit 8 -fi - -if [ -z "$OPENAI_API_KEY" ]; then - echo "Skipping Sharepoint embedding ingest test because the OPENAI_API_KEY env var is not set." - exit 8 -fi - -if [ -z "$AZURE_SEARCH_ENDPOINT" ] && [ -z "$AZURE_SEARCH_API_KEY" ]; then - echo "Skipping Sharepoint Azure Cognitive Search ingest test because neither AZURE_SEARCH_ENDPOINT nor AZURE_SEARCH_API_KEY env vars are set." 
- exit 8 -fi - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh - -function cleanup { - response_code=$(curl -s -o /dev/null -w "%{http_code}" \ - "https://utic-test-ingest-fixtures.search.windows.net/indexes/$DESTINATION_INDEX?api-version=$API_VERSION" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json') - if [ "$response_code" == "200" ]; then - echo "deleting index $DESTINATION_INDEX" - curl -X DELETE \ - "https://utic-test-ingest-fixtures.search.windows.net/indexes/$DESTINATION_INDEX?api-version=$API_VERSION" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json' - else - echo "Index $DESTINATION_INDEX does not exist, nothing to delete" - fi - - cleanup_dir "$OUTPUT_DIR" - cleanup_dir "$WORK_DIR" - if [ "$CI" == "true" ]; then - cleanup_dir "$DOWNLOAD_DIR" - fi -} - -trap cleanup EXIT - -# Create index -echo "Creating index $DESTINATION_INDEX" -response_code=$(curl -s -o /dev/null -w "%{http_code}" -X PUT \ - "https://utic-test-ingest-fixtures.search.windows.net/indexes/$DESTINATION_INDEX?api-version=$API_VERSION" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json' \ - --data "@$SCRIPT_DIR/files/azure_cognitive_index_schema.json") - -if [ "$response_code" -lt 400 ]; then - echo "Index creation success: $response_code" -else - echo "Index creation failure: $response_code" - exit 1 -fi - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ - sharepoint \ - --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ - --num-processes 2 \ - --strategy hi_res \ - --preserve-downloads \ - --reprocess \ - --output-dir "$OUTPUT_DIR" \ - --verbose \ - --client-cred "$SHAREPOINT_CRED" \ - --client-id "$SHAREPOINT_CLIENT_ID" \ - --site "$SHAREPOINT_SITE" \ - --permissions-application-id "$SHAREPOINT_PERMISSIONS_APP_ID" \ - --permissions-client-cred "$SHAREPOINT_PERMISSIONS_APP_CRED" \ - --permissions-tenant "$SHAREPOINT_PERMISSIONS_TENANT" \ - --path "Shared Documents" \ - --recursive \ - --embedding-provider "langchain-huggingface" \ - --chunking-strategy by_title \ - --chunk-multipage-sections \ - --work-dir "$WORK_DIR" \ - azure-cognitive-search \ - --key "$AZURE_SEARCH_API_KEY" \ - --endpoint "$AZURE_SEARCH_ENDPOINT" \ - --index "$DESTINATION_INDEX" - -# It can take some time for the index to catch up with the content that was written, this check between 10s sleeps -# to give it that time process the writes. Will timeout after checking for a minute. 
-docs_count_remote=0 -attempt=1 -while [ "$docs_count_remote" -eq 0 ] && [ "$attempt" -lt 6 ]; do - echo "attempt $attempt: sleeping 10 seconds to let index finish catching up after writes" - sleep 10 - - # Check the contents of the index - docs_count_remote=$(curl "https://utic-test-ingest-fixtures.search.windows.net/indexes/$DESTINATION_INDEX/docs/\$count?api-version=$API_VERSION" \ - --header "api-key: $AZURE_SEARCH_API_KEY" \ - --header 'content-type: application/json' | jq) - - echo "docs count pulled from Azure: $docs_count_remote" - - attempt=$((attempt + 1)) -done - -docs_count_local=0 -for i in $(jq length "$OUTPUT_DIR"/**/*.json); do - docs_count_local=$((docs_count_local + i)) -done - -if [ "$docs_count_remote" -ne "$docs_count_local" ]; then - echo "Number of docs $docs_count_remote doesn't match the expected docs: $docs_count_local" - exit 1 -fi diff --git a/test_unstructured_ingest/dest/singlestore.sh b/test_unstructured_ingest/dest/singlestore.sh deleted file mode 100755 index a04f81370..000000000 --- a/test_unstructured_ingest/dest/singlestore.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env bash - -set -e - -DEST_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$DEST_PATH") -cd "$SCRIPT_DIR"/.. || exit 1 -OUTPUT_FOLDER_NAME=singlestore-dest -OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} -OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME -WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME -CI=${CI:-"false"} -max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} - -# shellcheck disable=SC1091 -source "$SCRIPT_DIR"/cleanup.sh -function cleanup { - # Index cleanup - echo "Stopping Singlestore Docker container" - docker compose -f scripts/singlestore-test-helpers/docker-compose.yml down --remove-orphans -v - - # Local file cleanup - cleanup_dir "$WORK_DIR" - cleanup_dir "$OUTPUT_DIR" - -} - -trap cleanup EXIT - -# Create singlestore instance and create `elements` class -echo "Creating singlestore instance" -# shellcheck source=/dev/null -docker compose -f scripts/singlestore-test-helpers/docker-compose.yml up -d --wait-timeout 60 - -DATABASE=ingest_test -USER=root -HOST=localhost -PASSWORD=password -PORT=3306 -TABLE=elements - -PYTHONPATH=. ./unstructured/ingest/main.py \ - local \ - --num-processes "$max_processes" \ - --output-dir "$OUTPUT_DIR" \ - --strategy fast \ - --verbose \ - --reprocess \ - --input-path example-docs/pdf/fake-memo.pdf \ - --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-huggingface" \ - singlestore \ - --host $HOST \ - --user $USER \ - --password $PASSWORD \ - --database $DATABASE \ - --port $PORT \ - --table-name $TABLE \ - --drop-empty-cols - -expected_num_elements=$(cat "$WORK_DIR"/embed/* | jq 'length') -./scripts/singlestore-test-helpers/test_outputs.py \ - --table-name $TABLE \ - --database $DATABASE \ - --num-elements "$expected_num_elements" diff --git a/test_unstructured_ingest/dest/sqlite.sh b/test_unstructured_ingest/dest/sqlite.sh deleted file mode 100755 index 9cd54b35e..000000000 --- a/test_unstructured_ingest/dest/sqlite.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash - -set -e - -SRC_PATH=$(dirname "$(realpath "$0")") -SCRIPT_DIR=$(dirname "$SRC_PATH") -cd "$SCRIPT_DIR"/.. 
|| exit 1
-OUTPUT_FOLDER_NAME=sql-dest
-OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
-OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
-WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
-max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
-CI=${CI:-"false"}
-DATABASE_TYPE="sqlite"
-DB_PATH=$SCRIPT_DIR/elements.db
-
-# shellcheck disable=SC1091
-source "$SCRIPT_DIR"/cleanup.sh
-function cleanup {
-  # Local file cleanup
-  cleanup_dir "$WORK_DIR"
-  cleanup_dir "$OUTPUT_DIR"
-  rm -rf "$DB_PATH"
-  if [ "$CI" == "true" ]; then
-    cleanup_dir "$DOWNLOAD_DIR"
-
-  fi
-}
-
-trap cleanup EXIT
-
-# Create sql instance and create `elements` class
-echo "Creating SQL DB instance"
-# shellcheck source=/dev/null
-scripts/sql-test-helpers/create-sql-instance.sh "$DATABASE_TYPE" "$DB_PATH"
-wait
-
-PYTHONPATH=. ./unstructured/ingest/main.py \
-  local \
-  --num-processes "$max_processes" \
-  --output-dir "$OUTPUT_DIR" \
-  --strategy fast \
-  --verbose \
-  --reprocess \
-  --input-path example-docs/pdf/fake-memo.pdf \
-  --work-dir "$WORK_DIR" \
-  sql \
-  --db-type "$DATABASE_TYPE" \
-  --username unstructured \
-  --database "$DB_PATH"
-
-"$SCRIPT_DIR"/python/test-ingest-sql-output.py "$DATABASE_TYPE" "$DB_PATH"
diff --git a/test_unstructured_ingest/dest/vectara.sh b/test_unstructured_ingest/dest/vectara.sh
deleted file mode 100755
index 0ba223d44..000000000
--- a/test_unstructured_ingest/dest/vectara.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-DEST_PATH=$(dirname "$(realpath "$0")")
-SCRIPT_DIR=$(dirname "$DEST_PATH")
-cd "$SCRIPT_DIR"/.. || exit 1
-OUTPUT_FOLDER_NAME=local-vectara-dest
-OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
-WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
-max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
-
-RANDOM_SUFFIX=$((RANDOM % 100000 + 1))
-CORPUS_NAME="test-corpus-vectara-"$RANDOM_SUFFIX
-
-# Expected size of the uploaded document
-EXPECTED_CORPUS_SIZE=8843308
-
-if [ -z "$VECTARA_OAUTH_CLIENT_ID" ] && [ -z "$VECTARA_OAUTH_SECRET" ] && [ -z "$VECTARA_CUSTOMER_ID" ]; then
-  echo "Skipping VECTARA ingest test because VECTARA_OAUTH_CLIENT_ID, VECTARA_OAUTH_SECRET, or VECTARA_CUSTOMER_ID env var is not set."
-  exit 8
-fi
-
-# shellcheck disable=SC1091
-source "$SCRIPT_DIR"/cleanup.sh
-function cleanup {
-  echo "Deleting corpus $corpus_id ($CORPUS_NAME)"
-  curl -sS -L -X POST 'https://api.vectara.io/v1/delete-corpus' \
-    -H 'Content-Type: application/json' \
-    -H 'Accept: application/json' \
-    -H "Authorization: Bearer $access_token" \
-    -H "customer-id: $VECTARA_CUSTOMER_ID" \
-    --data-raw "{
-      \"corpusId\": $corpus_id
-    }"
-
-  # Local file cleanup
-  cleanup_dir "$WORK_DIR"
-  cleanup_dir "$OUTPUT_DIR"
-}
-
-trap cleanup EXIT
-
-PYTHONPATH=. ./unstructured/ingest/main.py \
-  local \
-  --num-processes "$max_processes" \
-  --output-dir "$OUTPUT_DIR" \
-  --strategy fast \
-  --verbose \
-  --reprocess \
-  --input-path example-docs/book-war-and-peace-1225p.txt \
-  --work-dir "$WORK_DIR" \
-  vectara \
-  --customer-id "$VECTARA_CUSTOMER_ID" \
-  --oauth-client-id "$VECTARA_OAUTH_CLIENT_ID" \
-  --oauth-secret "$VECTARA_OAUTH_SECRET" \
-  --corpus-name "$CORPUS_NAME"
-
-# Get JWT token
-jwt_token_resp=$(curl -sS -XPOST -H "Content-type: application/x-www-form-urlencoded" -d \
-  "grant_type=client_credentials&client_id=$VECTARA_OAUTH_CLIENT_ID&client_secret=$VECTARA_OAUTH_SECRET" \
-  "https://vectara-prod-$VECTARA_CUSTOMER_ID.auth.us-west-2.amazoncognito.com/oauth2/token")
-access_token=$(echo "$jwt_token_resp" | jq -r '.access_token')
-
-# Get corpus ID from name
-corpora_resp=$(curl -sS -L -X POST 'https://api.vectara.io/v1/list-corpora' \
-  -H 'Content-Type: application/json' \
-  -H 'Accept: application/json' \
-  -H "customer-id: $VECTARA_CUSTOMER_ID" \
-  -H "Authorization: Bearer $access_token" \
-  --data-raw "{
-    \"numResults\": 100,
-    \"filter\": \"$CORPUS_NAME\"
-  }")
-corpus_id=$(echo "$corpora_resp" | jq -r '.corpus[0].id')
-
-# Check that the size of the corpus is as expected
-get_corpus_size=$(curl -L -X POST 'https://api.vectara.io/v1/compute-corpus-size' \
-  -H 'Content-Type: application/json' \
-  -H 'Accept: application/json' \
-  -H "customer-id: $VECTARA_CUSTOMER_ID" \
-  -H "Authorization: Bearer $access_token" \
-  --data-raw "{
-    \"corpusId\": $corpus_id
-}")
-corpus_size=$(echo "$get_corpus_size" | jq -r '.size.size')
-
-if [ "$corpus_size" == "$EXPECTED_CORPUS_SIZE" ]; then
-  echo "Corpus size is as expected: $corpus_size"
-else
-  echo "Corpus size is not as expected: $corpus_size"
-  echo "vs $EXPECTED_CORPUS_SIZE"
-  exit 1
-fi
diff --git a/test_unstructured_ingest/dest/weaviate.sh b/test_unstructured_ingest/dest/weaviate.sh
deleted file mode 100755
index 7dfa3281a..000000000
--- a/test_unstructured_ingest/dest/weaviate.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-DEST_PATH=$(dirname "$(realpath "$0")")
-SCRIPT_DIR=$(dirname "$DEST_PATH")
-cd "$SCRIPT_DIR"/.. || exit 1
-OUTPUT_FOLDER_NAME=weaviate-dest
-OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
-OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
-WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
-CI=${CI:-"false"}
-max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
-
-# shellcheck disable=SC1091
-source "$SCRIPT_DIR"/cleanup.sh
-function cleanup {
-  # Index cleanup
-  echo "Stopping Weaviate Docker container"
-  docker-compose -f scripts/weaviate-test-helpers/docker-compose.yml down --remove-orphans -v
-
-  # Local file cleanup
-  cleanup_dir "$WORK_DIR"
-  cleanup_dir "$OUTPUT_DIR"
-
-}
-
-trap cleanup EXIT
-
-# Create weaviate instance and create `elements` class
-echo "Creating weaviate instance"
-# shellcheck source=/dev/null
-scripts/weaviate-test-helpers/create-weaviate-instance.sh
-wait
-
-PYTHONPATH=. ./unstructured/ingest/main.py \
-  local \
-  --num-processes "$max_processes" \
-  --output-dir "$OUTPUT_DIR" \
-  --strategy fast \
-  --verbose \
-  --reprocess \
-  --input-path example-docs/pdf/fake-memo.pdf \
-  --work-dir "$WORK_DIR" \
-  --embedding-provider "langchain-huggingface" \
-  weaviate \
-  --host-url http://localhost:8080 \
-  --class-name elements \
-  --anonymous
-
-"$SCRIPT_DIR"/python/test-ingest-weaviate-output.py
diff --git a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
index 92d6daaa1..06e6a9009 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json
@@ -11,7 +11,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -33,7 +33,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -55,7 +55,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -77,7 +77,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -99,7 +99,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -121,7 +121,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -143,7 +143,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -165,7 +165,7 @@
       "page_number": 1,
       "data_source": {
         "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf",
-        "version": "237960874052008560436652606947751982249",
+        "version": "0x8DB214A673DD8D8",
         "record_locator": {
           "protocol": "abfs",
           "remote_file_path": "abfs://container1/"
@@ -187,7 +187,7 @@
       "page_number": 1,
       "data_source": {
         "url":
"abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -209,7 +209,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -231,7 +231,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -253,7 +253,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -275,7 +275,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -297,7 +297,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -319,7 +319,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -341,7 +341,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -363,7 +363,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -385,7 +385,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -407,7 +407,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -429,7 +429,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": 
"0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -451,7 +451,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -473,7 +473,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -495,7 +495,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -517,7 +517,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -539,7 +539,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -561,7 +561,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf", - "version": "237960874052008560436652606947751982249", + "version": "0x8DB214A673DD8D8", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json index 3cad0fd85..cca8a4dd1 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json @@ -11,7 +11,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -33,7 +33,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -55,7 +55,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -77,7 +77,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -99,7 +99,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": 
"337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -121,7 +121,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -143,7 +143,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -165,7 +165,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -187,7 +187,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -209,7 +209,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -231,7 +231,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -253,7 +253,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -275,7 +275,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -297,7 +297,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -319,7 +319,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -341,7 +341,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -363,7 +363,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -385,7 +385,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": 
"337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -407,7 +407,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -429,7 +429,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -451,7 +451,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -473,7 +473,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -495,7 +495,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -517,7 +517,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -539,7 +539,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -561,7 +561,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -583,7 +583,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -605,7 +605,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -627,7 +627,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -649,7 +649,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -671,7 +671,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": 
"337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -693,7 +693,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -715,7 +715,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -737,7 +737,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -759,7 +759,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -781,7 +781,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -803,7 +803,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -825,7 +825,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -847,7 +847,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -869,7 +869,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -891,7 +891,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -913,7 +913,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -935,7 +935,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -957,7 +957,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": 
"337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -979,7 +979,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1001,7 +1001,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1023,7 +1023,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1045,7 +1045,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1067,7 +1067,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1089,7 +1089,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1111,7 +1111,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1133,7 +1133,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1155,7 +1155,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1177,7 +1177,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1199,7 +1199,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1221,7 +1221,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1243,7 +1243,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - 
"version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1265,7 +1265,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1287,7 +1287,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1309,7 +1309,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1331,7 +1331,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1353,7 +1353,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1375,7 +1375,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1397,7 +1397,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1419,7 +1419,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1441,7 +1441,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1463,7 +1463,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1485,7 +1485,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1507,7 +1507,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1529,7 +1529,7 @@ "page_number": 2, "data_source": { "url": 
"abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1551,7 +1551,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1573,7 +1573,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1595,7 +1595,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1617,7 +1617,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1639,7 +1639,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1661,7 +1661,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1683,7 +1683,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1705,7 +1705,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1727,7 +1727,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1749,7 +1749,7 @@ "page_number": 2, "data_source": { "url": "abfs://container1/IRS-form-1987.pdf", - "version": "337148261958285544336683139132069637358", + "version": "0x8DB214AEE092B1E", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json index 4c72f31bb..870978812 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json +++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json @@ -11,7 +11,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { 
"protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -33,7 +33,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -55,7 +55,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -77,7 +77,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -99,7 +99,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -121,7 +121,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -143,7 +143,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -165,7 +165,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -187,7 +187,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -209,7 +209,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -231,7 +231,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -253,7 +253,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -275,7 +275,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -297,7 +297,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", 
"remote_file_path": "abfs://container1/" @@ -319,7 +319,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -341,7 +341,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -363,7 +363,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -385,7 +385,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -407,7 +407,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -429,7 +429,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -451,7 +451,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -473,7 +473,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -495,7 +495,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -517,7 +517,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -539,7 +539,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -561,7 +561,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -583,7 +583,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", 
"remote_file_path": "abfs://container1/" @@ -605,7 +605,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -627,7 +627,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -649,7 +649,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -671,7 +671,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -693,7 +693,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -715,7 +715,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -737,7 +737,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -759,7 +759,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -781,7 +781,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -803,7 +803,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -825,7 +825,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -847,7 +847,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -869,7 +869,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", 
"remote_file_path": "abfs://container1/" @@ -891,7 +891,7 @@ "page_number": 1, "data_source": { "url": "abfs://container1/IRS-form-1987.png", - "version": "178514357676599756686300559820761454543", + "version": "0x8DB214C1B270B0D", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" diff --git a/test_unstructured_ingest/expected-structured-output/azure/rfc854.txt.json b/test_unstructured_ingest/expected-structured-output/azure/rfc854.txt.json index 0cd30210b..91374854e 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/rfc854.txt.json +++ b/test_unstructured_ingest/expected-structured-output/azure/rfc854.txt.json @@ -10,7 +10,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -31,7 +31,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -52,7 +52,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -73,7 +73,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -94,7 +94,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -115,7 +115,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -136,7 +136,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -157,7 +157,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -178,7 +178,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -199,7 +199,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -220,7 +220,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + 
"version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -241,7 +241,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -262,7 +262,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -283,7 +283,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -304,7 +304,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -325,7 +325,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -346,7 +346,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -367,7 +367,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -388,7 +388,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -409,7 +409,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -430,7 +430,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -451,7 +451,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -472,7 +472,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -493,7 +493,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + 
"version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -514,7 +514,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -535,7 +535,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -556,7 +556,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -577,7 +577,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -598,7 +598,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -619,7 +619,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -640,7 +640,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -661,7 +661,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -682,7 +682,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -703,7 +703,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -724,7 +724,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -745,7 +745,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -766,7 +766,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + 
"version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -787,7 +787,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -808,7 +808,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -829,7 +829,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -850,7 +850,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -871,7 +871,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -892,7 +892,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -913,7 +913,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -934,7 +934,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -955,7 +955,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -976,7 +976,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -997,7 +997,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1018,7 +1018,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1039,7 +1039,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + 
"version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1060,7 +1060,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1081,7 +1081,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1102,7 +1102,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1123,7 +1123,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1144,7 +1144,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1165,7 +1165,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1186,7 +1186,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1207,7 +1207,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1228,7 +1228,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1249,7 +1249,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1270,7 +1270,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1291,7 +1291,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1312,7 +1312,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": 
"252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1333,7 +1333,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1354,7 +1354,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1375,7 +1375,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1396,7 +1396,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1417,7 +1417,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1438,7 +1438,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1459,7 +1459,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1480,7 +1480,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1501,7 +1501,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1522,7 +1522,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1543,7 +1543,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1564,7 +1564,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1585,7 +1585,7 @@ "filetype": "text/plain", "data_source": { "url": 
"abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1606,7 +1606,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1627,7 +1627,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1648,7 +1648,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1669,7 +1669,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1690,7 +1690,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1711,7 +1711,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1732,7 +1732,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1753,7 +1753,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1774,7 +1774,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1795,7 +1795,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1816,7 +1816,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1837,7 +1837,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1858,7 +1858,7 @@ "filetype": "text/plain", 
"data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1879,7 +1879,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1900,7 +1900,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1921,7 +1921,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1942,7 +1942,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1963,7 +1963,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1984,7 +1984,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2005,7 +2005,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2026,7 +2026,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2047,7 +2047,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2068,7 +2068,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2089,7 +2089,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2110,7 +2110,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2131,7 +2131,7 @@ 
"filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2152,7 +2152,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2173,7 +2173,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2194,7 +2194,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2215,7 +2215,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2236,7 +2236,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2257,7 +2257,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2278,7 +2278,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2299,7 +2299,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2320,7 +2320,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2341,7 +2341,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2362,7 +2362,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2383,7 +2383,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" 
@@ -2404,7 +2404,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2425,7 +2425,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2446,7 +2446,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2467,7 +2467,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2488,7 +2488,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2509,7 +2509,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2530,7 +2530,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2551,7 +2551,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2572,7 +2572,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2593,7 +2593,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2614,7 +2614,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2635,7 +2635,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2656,7 +2656,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": 
"abfs://container1/" @@ -2677,7 +2677,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2698,7 +2698,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2719,7 +2719,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2740,7 +2740,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2761,7 +2761,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2782,7 +2782,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2803,7 +2803,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2824,7 +2824,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2845,7 +2845,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2866,7 +2866,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2887,7 +2887,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2908,7 +2908,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2929,7 +2929,7 @@ "filetype": "text/plain", "data_source": { "url": "abfs://container1/rfc854.txt", - "version": "252402046838802114392575683859882596254", + "version": "0x8DB214DA15CE591", "record_locator": { "protocol": "abfs", 
"remote_file_path": "abfs://container1/" diff --git a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json index 387857ab5..e62bb1938 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json +++ b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json @@ -10,7 +10,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -31,7 +31,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -52,7 +52,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -81,7 +81,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -108,7 +108,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -135,7 +135,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -162,7 +162,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -189,7 +189,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -216,7 +216,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -243,7 +243,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -270,7 +270,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -297,7 +297,7 @@ "filetype": 
"text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -324,7 +324,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -351,7 +351,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -378,7 +378,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -405,7 +405,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -432,7 +432,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -459,7 +459,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -486,7 +486,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -513,7 +513,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -540,7 +540,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -567,7 +567,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -594,7 +594,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -621,7 +621,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", 
"record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -648,7 +648,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -675,7 +675,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -702,7 +702,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -729,7 +729,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -756,7 +756,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -783,7 +783,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -810,7 +810,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -837,7 +837,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -864,7 +864,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -891,7 +891,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -918,7 +918,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -945,7 +945,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -972,7 +972,7 @@ "filetype": "text/html", "data_source": { "url": 
"abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -999,7 +999,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1026,7 +1026,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1053,7 +1053,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1080,7 +1080,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1107,7 +1107,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1134,7 +1134,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1161,7 +1161,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1188,7 +1188,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1215,7 +1215,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1242,7 +1242,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1269,7 +1269,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1296,7 +1296,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { 
"protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1323,7 +1323,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1350,7 +1350,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1377,7 +1377,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1404,7 +1404,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1431,7 +1431,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1458,7 +1458,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1485,7 +1485,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1512,7 +1512,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1539,7 +1539,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1566,7 +1566,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1593,7 +1593,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1620,7 +1620,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1647,7 +1647,7 @@ "filetype": "text/html", "data_source": { "url": 
"abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1674,7 +1674,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1701,7 +1701,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1728,7 +1728,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1755,7 +1755,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1782,7 +1782,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1809,7 +1809,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1836,7 +1836,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1863,7 +1863,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1890,7 +1890,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1917,7 +1917,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1944,7 +1944,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1971,7 +1971,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { 
"protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -1998,7 +1998,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2025,7 +2025,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2052,7 +2052,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2079,7 +2079,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2106,7 +2106,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2133,7 +2133,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2160,7 +2160,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2187,7 +2187,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2214,7 +2214,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2235,7 +2235,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2256,7 +2256,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2283,7 +2283,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2316,7 +2316,7 @@ "filetype": "text/html", "data_source": { "url": 
"abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2343,7 +2343,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2364,7 +2364,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2397,7 +2397,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2430,7 +2430,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" @@ -2463,7 +2463,7 @@ "filetype": "text/html", "data_source": { "url": "abfs://container1/spring-weather.html", - "version": "162215905222974206637545574128436022861", + "version": "0x8DB214B74525BB6", "record_locator": { "protocol": "abfs", "remote_file_path": "abfs://container1/" diff --git a/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json index 39646d9a7..3c7ca733b 100644 --- a/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/box/handbook-1p.docx.json @@ -11,7 +11,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -39,7 +39,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -67,7 +67,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -89,7 +89,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -111,7 +111,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 
"data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -133,7 +133,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -155,7 +155,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -177,7 +177,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -199,7 +199,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -221,7 +221,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -243,7 +243,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -265,7 +265,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -287,7 +287,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -309,7 +309,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -332,7 +332,7 @@ "filetype": 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { "url": "box:///utic-test-ingest-fixtures/handbook-1p.docx", - "version": "83125548004193369404829885052395764226", + "version": "1255888824139", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json index 1b9c8bad3..d0025fcee 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/ideas-page.html.json @@ -11,7 +11,7 @@ "filetype": "text/html", "data_source": { "url": "box:///utic-test-ingest-fixtures/nested-1/ideas-page.html", - "version": "77943175838335685751163845636763163681", + "version": "1255892530552", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", diff --git a/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json index ef9902dfb..e9bc64409 100644 --- a/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/box/nested-1/nested-2/ideas-page.html.json @@ -11,7 +11,7 @@ "filetype": "text/html", "data_source": { "url": "box:///utic-test-ingest-fixtures/nested-1/nested-2/ideas-page.html", - "version": "293680985726204769765169474511274942733", + "version": "1255884723846", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", diff --git a/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json b/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json index 23a1ddae7..2e6dbf696 100644 --- a/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json +++ b/test_unstructured_ingest/expected-structured-output/box/science-exploration-1p.pptx.json @@ -11,7 +11,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -34,7 +34,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -57,7 +57,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -80,7 +80,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": 
"309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -103,7 +103,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -126,7 +126,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -149,7 +149,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -172,7 +172,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -195,7 +195,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -218,7 +218,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -241,7 +241,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -264,7 +264,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": "box://utic-test-ingest-fixtures", @@ -287,7 +287,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { "url": "box:///utic-test-ingest-fixtures/science-exploration-1p.pptx", - "version": "309546934335254463247992132065898582121", + "version": "1255894255490", "record_locator": { "protocol": "box", "remote_file_path": 
"box://utic-test-ingest-fixtures", diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json index 94e1c93f4..9e61bf43b 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/handbook-1p.docx.json @@ -10,13 +10,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -36,13 +38,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -62,13 +66,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -82,13 +88,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -102,13 +110,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -122,13 +132,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": 
"dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -142,13 +154,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -162,13 +176,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -182,13 +198,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -202,13 +220,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -222,13 +242,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -242,13 +264,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": 
"2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -262,13 +286,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -282,13 +308,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } }, @@ -303,13 +331,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "data_source": { - "url": "dropbox:///test-input/handbook-1p.docx", - "version": "134700592086487568162605251521926324397", + "url": "dropbox://test-input/handbook-1p.docx", + "version": "2ddaae143b824b304ab42bb607d0cd4a96e2d0d0a60a30025e4ce749a53a0b8e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACQ" - } + }, + "date_created": "1687394168.0", + "date_modified": "1697632567.0" } } } diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json index fb02cb1ff..1c500c276 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-1/ideas-page.html.json @@ -10,13 +10,15 @@ ], "filetype": "text/html", "data_source": { - "url": "dropbox:///test-input/nested-1/ideas-page.html", - "version": "67356979305728150851855820427694668063", + "url": "dropbox://test-input/nested-1/ideas-page.html", + "version": "7a31fe250cc57a9733f8d50e61b9b265c53f5dd12faedf4829e559e2c3a8845e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACw" - } + }, + "date_created": "1687394194.0", + "date_modified": "1697632566.0" } } } diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json index 564bd2577..0fa649855 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/nested-2/ideas-page.html.json @@ -10,13 +10,15 @@ ], "filetype": "text/html", "data_source": { - "url": "dropbox:///test-input/nested-2/ideas-page.html", - "version": 
"145453788782335405288844961545898675998", + "url": "dropbox://test-input/nested-2/ideas-page.html", + "version": "7a31fe250cc57a9733f8d50e61b9b265c53f5dd12faedf4829e559e2c3a8845e", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAADQ" - } + }, + "date_created": "1687394213.0", + "date_modified": "1697632566.0" } } } diff --git a/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json b/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json index c5a44f158..8e59883c8 100644 --- a/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json +++ b/test_unstructured_ingest/expected-structured-output/dropbox/science-exploration-1p.pptx.json @@ -10,13 +10,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -31,13 +33,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -52,13 +56,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -73,13 +79,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -94,13 +102,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": 
"82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -115,13 +125,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -136,13 +148,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -157,13 +171,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -178,13 +194,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -199,13 +217,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -220,13 +240,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": 
"dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -241,13 +263,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } }, @@ -262,13 +286,15 @@ ], "filetype": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "data_source": { - "url": "dropbox:///test-input/science-exploration-1p.pptx", - "version": "26035320120182381452247268381589958225", + "url": "dropbox://test-input/science-exploration-1p.pptx", + "version": "82ebb5e422916b72fa2bd283cae3b9f41b96a9d0af59f92a8edd6e9556ca5510", "record_locator": { "protocol": "dropbox", "remote_file_path": "dropbox://test-input/", "file_id": "id:De4ZYtDd-JoAAAAAAAAACA" - } + }, + "date_created": "1687394162.0", + "date_modified": "1697632567.0" } } } diff --git a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json index d3a3f0854..bcd7ef201 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/ideas-page.html.json @@ -11,7 +11,7 @@ "filetype": "text/html", "data_source": { "url": "gs://utic-test-ingest-fixtures/ideas-page.html", - "version": "199523943725186047835150971481714294476", + "version": "CJXRtOuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json index d49564e20..8c8d34a2f 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/fake-text.txt.json @@ -10,7 +10,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-1/fake-text.txt", - "version": "180263070579038859328651626981788275889", + "version": "CKyIrMaE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -32,7 +32,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-1/fake-text.txt", - "version": "180263070579038859328651626981788275889", + "version": "CKyIrMaE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -54,7 +54,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-1/fake-text.txt", - "version": "180263070579038859328651626981788275889", + "version": "CKyIrMaE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -76,7 +76,7 @@ 
"filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-1/fake-text.txt", - "version": "180263070579038859328651626981788275889", + "version": "CKyIrMaE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -98,7 +98,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-1/fake-text.txt", - "version": "180263070579038859328651626981788275889", + "version": "CKyIrMaE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -120,7 +120,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-1/fake-text.txt", - "version": "180263070579038859328651626981788275889", + "version": "CKyIrMaE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json index 662caae8c..e31d5a5e0 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-1/nested/ideas-page.html.json @@ -11,7 +11,7 @@ "filetype": "text/html", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-1/nested/ideas-page.html", - "version": "310890354306462681752199911957569001015", + "version": "CMWrx8aE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json index 7f5a3c007..22bcb125b 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/fake-text.txt.json @@ -10,7 +10,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/fake-text.txt", - "version": "198731266903969902154134165613731741332", + "version": "CPXPxMuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -32,7 +32,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/fake-text.txt", - "version": "198731266903969902154134165613731741332", + "version": "CPXPxMuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -54,7 +54,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/fake-text.txt", - "version": "198731266903969902154134165613731741332", + "version": "CPXPxMuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -76,7 +76,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/fake-text.txt", - "version": "198731266903969902154134165613731741332", + "version": "CPXPxMuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -98,7 +98,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/fake-text.txt", - "version": "198731266903969902154134165613731741332", + "version": "CPXPxMuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": 
"gs://utic-test-ingest-fixtures/", @@ -120,7 +120,7 @@ "filetype": "text/plain", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/fake-text.txt", - "version": "198731266903969902154134165613731741332", + "version": "CPXPxMuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json index 4b34ff850..b318f7a12 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/nested/ideas-page.html.json @@ -11,7 +11,7 @@ "filetype": "text/html", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/nested/ideas-page.html", - "version": "113813498010717860141768546590661839404", + "version": "COXZ3MuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", diff --git a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json index c7a6b9d3b..4931718ff 100644 --- a/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json +++ b/test_unstructured_ingest/expected-structured-output/gcs/nested-2/stanley-cups.xlsx.json @@ -12,7 +12,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", - "version": "25646232132200560657189097157576319365", + "version": "COul9MuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -37,7 +37,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", - "version": "25646232132200560657189097157576319365", + "version": "COul9MuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -61,7 +61,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", - "version": "25646232132200560657189097157576319365", + "version": "COul9MuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", @@ -86,7 +86,7 @@ "filetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "data_source": { "url": "gs://utic-test-ingest-fixtures/nested-2/stanley-cups.xlsx", - "version": "25646232132200560657189097157576319365", + "version": "COul9MuE0/8CEAE=", "record_locator": { "protocol": "gs", "remote_file_path": "gs://utic-test-ingest-fixtures/", diff --git a/test_unstructured_ingest/src/against-api.sh b/test_unstructured_ingest/src/against-api.sh index a4ff8f3ad..7f2d6a944 100755 --- a/test_unstructured_ingest/src/against-api.sh +++ b/test_unstructured_ingest/src/against-api.sh @@ -27,7 +27,7 @@ trap cleanup EXIT TEST_FILE_NAME=layout-parser-paper-with-table.pdf # including pdf-infer-table-structure to validate partition arguments are passed to the api -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --api-key 
"$UNS_API_KEY" \ diff --git a/test_unstructured_ingest/src/airtable-diff.sh b/test_unstructured_ingest/src/airtable-diff.sh index 3aa9bb638..3cd81eff7 100755 --- a/test_unstructured_ingest/src/airtable-diff.sh +++ b/test_unstructured_ingest/src/airtable-diff.sh @@ -35,7 +35,7 @@ if [ -z "$AIRTABLE_PERSONAL_ACCESS_TOKEN" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ airtable \ --download-dir "$DOWNLOAD_DIR" \ @@ -47,7 +47,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --reprocess \ --output-dir "$OUTPUT_DIR" \ --work-dir "$WORK_DIR" \ - --max-retry-time 10 \ --verbose "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME diff --git a/test_unstructured_ingest/src/airtable-large.sh b/test_unstructured_ingest/src/airtable-large.sh index d15fed2b9..c0bf06fe4 100755 --- a/test_unstructured_ingest/src/airtable-large.sh +++ b/test_unstructured_ingest/src/airtable-large.sh @@ -38,7 +38,7 @@ fi # shellcheck disable=SC1091 source ./scripts/airtable-test-helpers/component_ids.sh -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ airtable \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/astradb.sh b/test_unstructured_ingest/src/astradb.sh index 9aa89c48f..1ea211a6b 100755 --- a/test_unstructured_ingest/src/astradb.sh +++ b/test_unstructured_ingest/src/astradb.sh @@ -22,7 +22,8 @@ fi COLLECTION_NAME="ingest_test_src" -PYTHONPATH=. ./unstructured/ingest/main.py \ +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} +PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ astradb \ --token "$ASTRA_DB_APPLICATION_TOKEN" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ diff --git a/test_unstructured_ingest/src/azure.sh b/test_unstructured_ingest/src/azure.sh index 602f2de43..6744805d6 100755 --- a/test_unstructured_ingest/src/azure.sh +++ b/test_unstructured_ingest/src/azure.sh @@ -21,11 +21,11 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ azure \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --strategy hi_res \ --preserve-downloads \ diff --git a/test_unstructured_ingest/src/biomed-api.sh b/test_unstructured_ingest/src/biomed-api.sh index 75db5294e..82b29f887 100755 --- a/test_unstructured_ingest/src/biomed-api.sh +++ b/test_unstructured_ingest/src/biomed-api.sh @@ -23,7 +23,7 @@ trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 2 $OUTPUT_FOLDER_NAME 10k -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ biomed \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/biomed-path.sh b/test_unstructured_ingest/src/biomed-path.sh index 95effb0b6..12401ed8a 100755 --- a/test_unstructured_ingest/src/biomed-path.sh +++ b/test_unstructured_ingest/src/biomed-path.sh @@ -23,7 
+23,7 @@ trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 1 $OUTPUT_FOLDER_NAME 10k -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ biomed \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/box.sh b/test_unstructured_ingest/src/box.sh index e9f2462b5..3ab2f44b4 100755 --- a/test_unstructured_ingest/src/box.sh +++ b/test_unstructured_ingest/src/box.sh @@ -38,13 +38,13 @@ if [ -z "$BOX_APP_CONFIG_PATH" ]; then echo "$BOX_APP_CONFIG" >"$BOX_APP_CONFIG_PATH" fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ box \ --download-dir "$DOWNLOAD_DIR" \ --box-app-config "$BOX_APP_CONFIG_PATH" \ --remote-url box://utic-test-ingest-fixtures \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ --num-processes "$max_processes" \ --preserve-downloads \ diff --git a/test_unstructured_ingest/src/confluence-diff.sh b/test_unstructured_ingest/src/confluence-diff.sh index 5cc54f93b..dc0f71cd1 100755 --- a/test_unstructured_ingest/src/confluence-diff.sh +++ b/test_unstructured_ingest/src/confluence-diff.sh @@ -31,7 +31,7 @@ if [ -z "$CONFLUENCE_USER_EMAIL" ] || [ -z "$CONFLUENCE_API_TOKEN" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ confluence \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/confluence-large.sh b/test_unstructured_ingest/src/confluence-large.sh index 7b20d0ee0..790d675b9 100755 --- a/test_unstructured_ingest/src/confluence-large.sh +++ b/test_unstructured_ingest/src/confluence-large.sh @@ -37,7 +37,7 @@ fi # are being provided at the same time, which is a wrong way to use the connector. # We expect the test to ignore --confluence-num-of-spaces and use --confluence-list-of-spaces. 
-RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ confluence \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/delta-table.sh b/test_unstructured_ingest/src/delta-table.sh index 7faf23c40..d8ac97145 100755 --- a/test_unstructured_ingest/src/delta-table.sh +++ b/test_unstructured_ingest/src/delta-table.sh @@ -31,7 +31,7 @@ function cleanup() { trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ delta-table \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/discord.sh b/test_unstructured_ingest/src/discord.sh index 64bf18364..ca986e3b0 100755 --- a/test_unstructured_ingest/src/discord.sh +++ b/test_unstructured_ingest/src/discord.sh @@ -29,7 +29,7 @@ if [ -z "$DISCORD_TOKEN" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ discord \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/dropbox.sh b/test_unstructured_ingest/src/dropbox.sh index 414ce0846..ff2c82998 100755 --- a/test_unstructured_ingest/src/dropbox.sh +++ b/test_unstructured_ingest/src/dropbox.sh @@ -34,12 +34,12 @@ fi DROPBOX_RESPONSE=$(curl https://api.dropbox.com/oauth2/token -d refresh_token="$DROPBOX_REFRESH_TOKEN" -d grant_type=refresh_token -d client_id="$DROPBOX_APP_KEY" -d client_secret="$DROPBOX_APP_SECRET") DROPBOX_ACCESS_TOKEN=$(jq -r '.access_token' <<<"$DROPBOX_RESPONSE") -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ dropbox \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --preserve-downloads \ --reprocess \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_unstructured_ingest/src/elasticsearch.sh b/test_unstructured_ingest/src/elasticsearch.sh index 1534f0018..9141cde57 100755 --- a/test_unstructured_ingest/src/elasticsearch.sh +++ b/test_unstructured_ingest/src/elasticsearch.sh @@ -37,11 +37,11 @@ trap cleanup EXIT scripts/elasticsearch-test-helpers/source_connector/create-fill-and-check-es.sh wait -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ elasticsearch \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ diff --git a/test_unstructured_ingest/src/gcs.sh b/test_unstructured_ingest/src/gcs.sh index 77d2d86c6..5261c1169 100755 --- 
a/test_unstructured_ingest/src/gcs.sh +++ b/test_unstructured_ingest/src/gcs.sh @@ -34,12 +34,12 @@ fi GCP_INGEST_SERVICE_KEY_FILE=$(mktemp) echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE" -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ gcs \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --preserve-downloads \ --reprocess \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_unstructured_ingest/src/github.sh b/test_unstructured_ingest/src/github.sh index a34355333..bea75f359 100755 --- a/test_unstructured_ingest/src/github.sh +++ b/test_unstructured_ingest/src/github.sh @@ -37,7 +37,7 @@ elif [[ "$CI" == "true" ]]; then echo fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} #shellcheck disable=SC2086 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ github \ diff --git a/test_unstructured_ingest/src/gitlab.sh b/test_unstructured_ingest/src/gitlab.sh index 64ac21353..1bd01b488 100755 --- a/test_unstructured_ingest/src/gitlab.sh +++ b/test_unstructured_ingest/src/gitlab.sh @@ -24,7 +24,7 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ gitlab \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/google-drive.sh b/test_unstructured_ingest/src/google-drive.sh index 36a6ab79b..7e580e8a1 100755 --- a/test_unstructured_ingest/src/google-drive.sh +++ b/test_unstructured_ingest/src/google-drive.sh @@ -35,11 +35,11 @@ fi GCP_INGEST_SERVICE_KEY_FILE=$(mktemp) echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE" -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ google-drive \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.version \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.version \ --num-processes "$max_processes" \ --strategy hi_res \ --preserve-downloads \ @@ -47,7 +47,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --output-dir "$OUTPUT_DIR" \ --verbose \ --drive-id 1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr \ - --service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \ + --service-account-key-path "$GCP_INGEST_SERVICE_KEY_FILE" \ --recursive \ --extensions "pdf,docx" \ --work-dir "$WORK_DIR" diff --git a/test_unstructured_ingest/src/hubspot.sh b/test_unstructured_ingest/src/hubspot.sh index 86a75630c..d5b617569 100755 --- a/test_unstructured_ingest/src/hubspot.sh +++ b/test_unstructured_ingest/src/hubspot.sh @@ -39,7 +39,8 @@ fi # Can be used multiple 
times to specify multiple objects. # --custom-properties Custom property to process information from. Comma separated list. -PYTHONPATH=. ./unstructured/ingest/main.py \ +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} +PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ hubspot \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/jira.sh b/test_unstructured_ingest/src/jira.sh index 533fc3224..ce6b4e049 100755 --- a/test_unstructured_ingest/src/jira.sh +++ b/test_unstructured_ingest/src/jira.sh @@ -50,7 +50,7 @@ fi # Note: When any of the optional arguments are provided, connector will ingest only those components, and nothing else. # When none of the optional arguments are provided, all issues in all projects will be ingested. -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ jira \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/kafka-local.sh b/test_unstructured_ingest/src/kafka-local.sh index c2ed84d0b..36b21754f 100755 --- a/test_unstructured_ingest/src/kafka-local.sh +++ b/test_unstructured_ingest/src/kafka-local.sh @@ -57,7 +57,7 @@ python "$SCRIPT_DIR"/python/test-produce-kafka-message.py up \ --confluent false \ --port 29092 -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ kafka \ --bootstrap-server localhost \ diff --git a/test_unstructured_ingest/src/local-embed-bedrock.sh b/test_unstructured_ingest/src/local-embed-bedrock.sh index 1d23431cf..285d15a56 100755 --- a/test_unstructured_ingest/src/local-embed-bedrock.sh +++ b/test_unstructured_ingest/src/local-embed-bedrock.sh @@ -24,17 +24,17 @@ if [ -z "$AWS_ACCESS_KEY_ID" ] || [ -z "$AWS_SECRET_ACCESS_KEY" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-aws-bedrock" \ + --embedding-provider "aws-bedrock" \ --embedding-aws-access-key-id "$AWS_ACCESS_KEY_ID" \ --embedding-aws-secret-access-key "$AWS_SECRET_ACCESS_KEY" diff --git a/test_unstructured_ingest/src/local-embed-mixedbreadai.sh b/test_unstructured_ingest/src/local-embed-mixedbreadai.sh index 75d949c89..99168d7dd 100755 --- a/test_unstructured_ingest/src/local-embed-mixedbreadai.sh +++ b/test_unstructured_ingest/src/local-embed-mixedbreadai.sh @@ -22,10 +22,8 @@ function cleanup() { } trap cleanup EXIT -# Define the run script -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} - 
# Run the ingestion script with the specified parameters +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/local-embed-octoai.sh b/test_unstructured_ingest/src/local-embed-octoai.sh index e75ee6dc5..54ff3e2a0 100755 --- a/test_unstructured_ingest/src/local-embed-octoai.sh +++ b/test_unstructured_ingest/src/local-embed-octoai.sh @@ -25,7 +25,7 @@ if [ -z "$OCTOAI_API_KEY" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/local-embed-vertexai.sh b/test_unstructured_ingest/src/local-embed-vertexai.sh index b7342fa75..4ef499bc5 100755 --- a/test_unstructured_ingest/src/local-embed-vertexai.sh +++ b/test_unstructured_ingest/src/local-embed-vertexai.sh @@ -25,17 +25,17 @@ if [ -z "$GCP_INGEST_SERVICE_KEY" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-vertexai" \ + --embedding-provider "vertexai" \ --embedding-api-key "$GCP_INGEST_SERVICE_KEY" \ --embedding-model-name "textembedding-gecko@001" diff --git a/test_unstructured_ingest/src/local-embed-voyageai.sh b/test_unstructured_ingest/src/local-embed-voyageai.sh index 62f5c60d3..c5f3be1fe 100755 --- a/test_unstructured_ingest/src/local-embed-voyageai.sh +++ b/test_unstructured_ingest/src/local-embed-voyageai.sh @@ -25,7 +25,7 @@ if [ -z "$VOYAGE_API_KEY" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ @@ -35,7 +35,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-voyageai" \ + --embedding-provider "voyageai" \ --embedding-api-key "$VOYAGE_API_KEY" \ --embedding-model-name "voyage-large-2" diff --git a/test_unstructured_ingest/src/local-embed.sh b/test_unstructured_ingest/src/local-embed.sh index 0b8d540e3..210a7111c 100755 --- a/test_unstructured_ingest/src/local-embed.sh +++ b/test_unstructured_ingest/src/local-embed.sh @@ -19,17 +19,17 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ - --metadata-exclude 
coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ --verbose \ --reprocess \ --input-path example-docs/book-war-and-peace-1p.txt \ --work-dir "$WORK_DIR" \ - --embedding-provider "langchain-huggingface" + --embedding-provider "huggingface" set +e diff --git a/test_unstructured_ingest/src/local-failed-partition.sh b/test_unstructured_ingest/src/local-failed-partition.sh index dbe4f1c77..a230888b3 100755 --- a/test_unstructured_ingest/src/local-failed-partition.sh +++ b/test_unstructured_ingest/src/local-failed-partition.sh @@ -38,7 +38,7 @@ function check() { fi } -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh index 7786e1c63..575bd876f 100755 --- a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh +++ b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh @@ -22,8 +22,7 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} - +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --chunking-strategy basic \ diff --git a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh index 452686eeb..051c5fba2 100755 --- a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh +++ b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh @@ -33,14 +33,13 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} - +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --chunking-strategy by_title \ - --chunk-no-include-orig-elements \ + --no-chunk-include-orig-elements \ --chunk-max-characters 2000 \ - --chunk-no-multipage-sections \ + --no-chunk-multipage-sections \ --input-path "$ABS_INPUT_PATH" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/local-single-file-with-encoding.sh b/test_unstructured_ingest/src/local-single-file-with-encoding.sh index 016177073..3cf91223e 100755 --- a/test_unstructured_ingest/src/local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/src/local-single-file-with-encoding.sh @@ -20,11 +20,11 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ - --metadata-exclude 
filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ --encoding cp1252 \ --verbose \ diff --git a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh index 4265d0c4f..4c0ab5b36 100755 --- a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh @@ -20,11 +20,11 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ --skip-infer-table-types "xls,xlsx" \ --strategy hi_res \ diff --git a/test_unstructured_ingest/src/local-single-file.sh b/test_unstructured_ingest/src/local-single-file.sh index 14804f085..249746ed8 100755 --- a/test_unstructured_ingest/src/local-single-file.sh +++ b/test_unstructured_ingest/src/local-single-file.sh @@ -22,11 +22,11 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ --additional-partition-args '{"strategy":"ocr_only", "languages":["ind", "est"]}' \ --verbose \ diff --git a/test_unstructured_ingest/src/local.sh b/test_unstructured_ingest/src/local.sh index deac065b5..ac725144c 100755 --- a/test_unstructured_ingest/src/local.sh +++ b/test_unstructured_ingest/src/local.sh @@ -19,7 +19,7 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --num-processes "$max_processes" \ diff --git 
a/test_unstructured_ingest/src/mongodb.sh b/test_unstructured_ingest/src/mongodb.sh index 553014266..8429d7e1f 100755 --- a/test_unstructured_ingest/src/mongodb.sh +++ b/test_unstructured_ingest/src/mongodb.sh @@ -25,9 +25,10 @@ fi # astradb dependencies. # ref: https://pymongo.readthedocs.io/en/stable/installation.html python -m pip uninstall -y bson pymongo -make install-ingest-mongodb +pip install "unstructured-ingest[mongodb]" -PYTHONPATH=. ./unstructured/ingest/main.py \ +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} +PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ mongodb \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/notion.sh b/test_unstructured_ingest/src/notion.sh index 063a9199e..e80a11bfa 100755 --- a/test_unstructured_ingest/src/notion.sh +++ b/test_unstructured_ingest/src/notion.sh @@ -29,7 +29,7 @@ if [ -z "$NOTION_API_KEY" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ notion \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_unstructured_ingest/src/onedrive.sh b/test_unstructured_ingest/src/onedrive.sh index 0dfa3263a..d38b7ab80 100755 --- a/test_unstructured_ingest/src/onedrive.sh +++ b/test_unstructured_ingest/src/onedrive.sh @@ -29,11 +29,11 @@ if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_USER_PNAME" ] exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ onedrive \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --strategy hi_res \ --preserve-downloads \ diff --git a/test_unstructured_ingest/src/opensearch.sh b/test_unstructured_ingest/src/opensearch.sh index 0b0a412a3..f1d7c150e 100755 --- a/test_unstructured_ingest/src/opensearch.sh +++ b/test_unstructured_ingest/src/opensearch.sh @@ -35,11 +35,11 @@ trap cleanup EXIT scripts/opensearch-test-helpers/source_connector/create-and-check-opensearch.sh wait -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ opensearch \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ diff --git a/test_unstructured_ingest/src/outlook.sh b/test_unstructured_ingest/src/outlook.sh index 890037070..a1a5a4878 
100755 --- a/test_unstructured_ingest/src/outlook.sh +++ b/test_unstructured_ingest/src/outlook.sh @@ -29,7 +29,7 @@ if [ -z "$MS_CLIENT_ID" ] || [ -z "$MS_CLIENT_CRED" ] || [ -z "$MS_TENANT_ID" ] exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ outlook \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index a0dda9375..b27e32e8e 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -28,10 +28,10 @@ trap cleanup EXIT echo "REPROCESS INPUT PATH" ls "$INPUT_PATH" -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --strategy fast \ --reprocess \ diff --git a/test_unstructured_ingest/src/s3-compression.sh b/test_unstructured_ingest/src/s3-compression.sh index 1d1faabee..7ee066f3a 100755 --- a/test_unstructured_ingest/src/s3-compression.sh +++ b/test_unstructured_ingest/src/s3-compression.sh @@ -20,7 +20,7 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/s3-minio.sh b/test_unstructured_ingest/src/s3-minio.sh index c6011be05..85dd8f85d 100755 --- a/test_unstructured_ingest/src/s3-minio.sh +++ b/test_unstructured_ingest/src/s3-minio.sh @@ -32,13 +32,13 @@ trap cleanup EXIT scripts/minio-test-helpers/create-and-check-minio.sh wait -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_modified,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.date_created \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_modified,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.date_created \ --strategy hi_res \ --preserve-downloads \ --reprocess \ diff --git a/test_unstructured_ingest/src/s3.sh b/test_unstructured_ingest/src/s3.sh index 61e0fe13d..bfdc72c1c 100755 --- a/test_unstructured_ingest/src/s3.sh +++ b/test_unstructured_ingest/src/s3.sh @@ -23,12 +23,12 @@ trap cleanup EXIT "$SCRIPT_DIR"/check-num-files-expected-output.sh 3 $OUTPUT_FOLDER_NAME 
20k -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --strategy hi_res \ --preserve-downloads \ --reprocess \ diff --git a/test_unstructured_ingest/src/salesforce.sh b/test_unstructured_ingest/src/salesforce.sh index 8ebce46a1..54ebd0555 100755 --- a/test_unstructured_ingest/src/salesforce.sh +++ b/test_unstructured_ingest/src/salesforce.sh @@ -43,15 +43,15 @@ if [ -z "$SALESFORCE_PRIVATE_KEY_PATH" ]; then echo "$SALESFORCE_PRIVATE_KEY" >"$SALESFORCE_PRIVATE_KEY_PATH" fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ salesforce \ --categories "EmailMessage,Campaign" \ --download-dir "$DOWNLOAD_DIR" \ --username "$SALESFORCE_USERNAME" \ --consumer-key "$SALESFORCE_CONSUMER_KEY" \ - --private-key "$SALESFORCE_PRIVATE_KEY_PATH" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --preserve-downloads \ --reprocess \ diff --git a/test_unstructured_ingest/src/sftp.sh b/test_unstructured_ingest/src/sftp.sh index 3386b3a8c..e3312224d 100755 --- a/test_unstructured_ingest/src/sftp.sh +++ b/test_unstructured_ingest/src/sftp.sh @@ -33,12 +33,12 @@ trap cleanup EXIT scripts/sftp-test-helpers/create-and-check-sftp.sh wait -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ sftp \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.data_source.version \ + --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.data_source.filesize_bytes,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.last_modified,metadata.data_source.version \ --preserve-downloads \ --reprocess \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_unstructured_ingest/src/sharepoint-with-permissions.sh b/test_unstructured_ingest/src/sharepoint-with-permissions.sh index 1b00bdd96..cc16c1135 100755 --- a/test_unstructured_ingest/src/sharepoint-with-permissions.sh +++ b/test_unstructured_ingest/src/sharepoint-with-permissions.sh @@ -39,7 +39,7 @@ fi # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly # excluding metadata.data_source.permissions_data since the api has deprecation warnings. 
Will want to do a separate test for permissions data -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ sharepoint \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/sharepoint.sh b/test_unstructured_ingest/src/sharepoint.sh index ff5d0dd83..ea07410d2 100755 --- a/test_unstructured_ingest/src/sharepoint.sh +++ b/test_unstructured_ingest/src/sharepoint.sh @@ -31,7 +31,7 @@ if [ -z "$SHAREPOINT_CLIENT_ID" ] || [ -z "$SHAREPOINT_CRED" ]; then fi # excluding metadata.last_modified since this will always update as date processed because the Sharepoint connector creates documents on the fly -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ sharepoint \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/src/slack.sh b/test_unstructured_ingest/src/slack.sh index 6e76e0f34..503e67240 100755 --- a/test_unstructured_ingest/src/slack.sh +++ b/test_unstructured_ingest/src/slack.sh @@ -29,7 +29,7 @@ if [ -z "$SLACK_TOKEN" ]; then exit 8 fi -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ slack \ --num-processes "$max_processes" \ diff --git a/test_unstructured_ingest/src/wikipedia.sh b/test_unstructured_ingest/src/wikipedia.sh index 24f8c0855..21a55e572 100755 --- a/test_unstructured_ingest/src/wikipedia.sh +++ b/test_unstructured_ingest/src/wikipedia.sh @@ -24,7 +24,7 @@ function cleanup() { } trap cleanup EXIT -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} +RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ wikipedia \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_unstructured_ingest/test-help.sh b/test_unstructured_ingest/test-help.sh deleted file mode 100755 index 9ec8a9824..000000000 --- a/test_unstructured_ingest/test-help.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -set -u -o pipefail -e - -RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} -sources=$(PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" --help | sed -e '1,/Commands/ d' | awk '{NF=1}1') -echo "Checking all source: $sources" -for src in $sources; do - destinations=$(PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" "$src" --help | sed -e '1,/Destinations/ d' | awk '{NF=1}1') - for dest in $destinations; do - echo "Checking $src -> $dest" - PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" "$src" "$dest" --help - done -done diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index 1ebb3dc58..8634b330f 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -44,7 +44,8 @@ all_tests=( 'elasticsearch.sh' 'confluence-diff.sh' 'confluence-large.sh' - 'airtable-diff.sh' + # NOTE(christine): This test is disabled because it is triggering 404 client errors to the API + # 'airtable-diff.sh' # # NOTE(ryan): This test is disabled because it is triggering too many requests to the API # 'airtable-large.sh' 'local-single-file.sh' diff --git a/test_unstructured_ingest/unit/cli/test_cli.py b/test_unstructured_ingest/unit/cli/test_cli.py deleted file mode 100644 index b0fcf50cc..000000000 --- a/test_unstructured_ingest/unit/cli/test_cli.py +++ /dev/null @@ -1,18 +0,0 @@ -import click -import pytest - -from unstructured.ingest.cli.interfaces import CliMixin - - -def 
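The salesforce.sh hunk further up renames the key flag from --private-key to --private-key-path alongside the usual RUN_SCRIPT and --metadata-exclude updates. A trimmed, illustrative invocation using only flags visible in that hunk; the credential values are expected in the environment and the download directory is a placeholder:

# Sketch of the renamed flag; the real fixture also passes --preserve-downloads, --reprocess, etc.
unstructured-ingest salesforce \
  --categories "EmailMessage,Campaign" \
  --username "$SALESFORCE_USERNAME" \
  --consumer-key "$SALESFORCE_CONSUMER_KEY" \
  --private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \
  --download-dir /tmp/salesforce-download \
  --num-processes 2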
test_add_params(): - @click.command() - def sample_cmd(): - pass - - options = [ - click.Option(["--opt1"]), - click.Option(["--opt1"]), - ] - cmd = sample_cmd - with pytest.raises(ValueError): - CliMixin.add_params(cmd=cmd, params=options) diff --git a/test_unstructured_ingest/unit/connector/fsspec/test_connector_gcs.py b/test_unstructured_ingest/unit/connector/fsspec/test_connector_gcs.py deleted file mode 100644 index 60a14e987..000000000 --- a/test_unstructured_ingest/unit/connector/fsspec/test_connector_gcs.py +++ /dev/null @@ -1,35 +0,0 @@ -from unittest.mock import MagicMock - -import pytest - -from unstructured.ingest.connector.fsspec.gcs import GcsAccessConfig - - -@pytest.mark.parametrize( - ("given_access_token", "then_access_token"), - [ - (None, None), - ("/tmp/gcs.key", "/tmp/gcs.key"), - ("google_default", "google_default"), - ("cache", "cache"), - ("anon", "anon"), - ("browser", "browser"), - ("cloud", "cloud"), - ("{'some_key': 'some_value'}", {"some_key": "some_value"}), - ], -) -def test_validate_access_token(mocker, given_access_token, then_access_token): - mocked_isfile: MagicMock = mocker.patch("pathlib.Path.is_file") - mocked_isfile.return_value = True - - when_token = GcsAccessConfig(token=given_access_token).token - assert when_token == then_access_token - - -def test_fail_validate_access_token(mocker): - mocked_isfile: MagicMock = mocker.patch("pathlib.Path.is_file") - mocked_isfile.return_value = False - - given_access_token = "/tmp/gcs.key" - with pytest.raises(ValueError): - GcsAccessConfig(token=given_access_token) diff --git a/test_unstructured_ingest/unit/connector/fsspec/test_fsspec.py b/test_unstructured_ingest/unit/connector/fsspec/test_fsspec.py deleted file mode 100644 index edbe543dc..000000000 --- a/test_unstructured_ingest/unit/connector/fsspec/test_fsspec.py +++ /dev/null @@ -1,25 +0,0 @@ -from unittest.mock import MagicMock, patch - -from fsspec import AbstractFileSystem - -from unstructured.ingest.connector.fsspec.fsspec import FsspecIngestDoc, SimpleFsspecConfig -from unstructured.ingest.interfaces import ProcessorConfig, ReadConfig - - -@patch("fsspec.get_filesystem_class") -def test_version_is_string(mock_get_filesystem_class): - """ - Test that the version is a string even when the filesystem checksum is an integer. 
- """ - mock_fs = MagicMock(spec=AbstractFileSystem) - mock_fs.checksum.return_value = 1234567890 - mock_fs.info.return_value = {"etag": ""} - mock_get_filesystem_class.return_value = lambda **kwargs: mock_fs - config = SimpleFsspecConfig("s3://my-bucket", access_config={}) - doc = FsspecIngestDoc( - processor_config=ProcessorConfig(), - read_config=ReadConfig(), - connector_config=config, - remote_file_path="test.txt", - ) - assert isinstance(doc.source_metadata.version, str) diff --git a/test_unstructured_ingest/unit/connector/fsspec/test_paths.py b/test_unstructured_ingest/unit/connector/fsspec/test_paths.py deleted file mode 100644 index de3648914..000000000 --- a/test_unstructured_ingest/unit/connector/fsspec/test_paths.py +++ /dev/null @@ -1,223 +0,0 @@ -from dataclasses import dataclass -from pathlib import Path - -import pytest - -from unstructured.ingest.connector.fsspec.dropbox import ( - DropboxIngestDoc, -) -from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecIngestDoc, -) -from unstructured.ingest.connector.fsspec.sftp import SftpAccessConfig, SimpleSftpConfig -from unstructured.ingest.interfaces import ( - FsspecConfig, -) - - -@dataclass -class FakeConfigDropboxRoot: - output_dir = "/fakeuser/fake_output" - dir_path = " " - download_dir = "/fakeuser/fake_download" - path_without_protocol = " " - - -@dataclass -class FakeConfigFolder: - output_dir = "/fakeuser/fake_output" - dir_path = "fake_folder" - download_dir = "/fakeuser/fake_download" - path_without_protocol = "fake_folder" - - -def test_dropbox_root_succeeds(): - """ - Test that path joining method works for Dropbox root folder. - Note slash in front of remote_file_path. - """ - dbox = DropboxIngestDoc( - connector_config=FakeConfigDropboxRoot, - read_config=FakeConfigDropboxRoot, - processor_config=FakeConfigDropboxRoot, - remote_file_path="/fake_file.txt", - ) - output_filename = dbox._output_filename - download_filename = dbox._tmp_download_file() - - assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json") - assert download_filename == Path("/fakeuser/fake_download/fake_file.txt") - - -def test_dropbox_root_succeeds2(): - """ - Test that path joining method works for Dropbox root folder. - Note lack of slash in front of remote_file_path. This still works. - """ - dbox = DropboxIngestDoc( - connector_config=FakeConfigDropboxRoot, - read_config=FakeConfigDropboxRoot, - processor_config=FakeConfigDropboxRoot, - remote_file_path="fake_file.txt", - ) - output_filename = dbox._output_filename - download_filename = dbox._tmp_download_file() - - assert output_filename == Path("/fakeuser/fake_output/fake_file.txt.json") - assert download_filename == Path("/fakeuser/fake_download/fake_file.txt") - - -def test_dropbox_folder_succeeds(): - """ - Test that path joining method works for Dropbox root folder. - Note no slash in front of remote_file_path. - """ - dbox = DropboxIngestDoc( - connector_config=FakeConfigFolder, - read_config=FakeConfigFolder, - processor_config=FakeConfigFolder, - remote_file_path="fake_file2.txt", - ) - output_filename = dbox._output_filename - download_filename = dbox._tmp_download_file() - - assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json") - assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt") - - -def test_dropbox_folder_fails(): - """Test that path joining method gives WRONG path. Note slash in front of remote_file_path. - Path joining is sensitive. 
Note that the path is MISSING the folders.""" - dbox = DropboxIngestDoc( - connector_config=FakeConfigFolder, - read_config=FakeConfigFolder, - processor_config=FakeConfigFolder, - remote_file_path="/fake_file2.txt", - ) - output_filename = dbox._output_filename - download_filename = dbox._tmp_download_file() - - assert output_filename == Path("/fake_file2.txt.json") - assert download_filename == Path("/fake_file2.txt") - - -def test_fsspec_folder_succeeds(): - """ - Test that path joining method works for root folder. - Note no slash in front of remote_file_path. - """ - dbox = FsspecIngestDoc( - connector_config=FakeConfigFolder, - read_config=FakeConfigFolder, - processor_config=FakeConfigFolder, - remote_file_path="fake_file2.txt", - ) - output_filename = dbox._output_filename - download_filename = dbox._tmp_download_file() - - assert output_filename == Path("/fakeuser/fake_output/fake_file2.txt.json") - assert download_filename == Path("/fakeuser/fake_download/fake_file2.txt") - - -def test_fsspec_folder_fails(): - """Test that path joining method gives WRONG path. Note slash in front of remote_file_path. - Path joining is sensitive. Note that the path is MISSING the folders.""" - fstest = FsspecIngestDoc( - connector_config=FakeConfigFolder, - read_config=FakeConfigFolder, - processor_config=FakeConfigFolder, - remote_file_path="/fake_file2.txt", - ) - output_filename = fstest._output_filename - download_filename = fstest._tmp_download_file() - - assert output_filename == Path("/fake_file2.txt.json") - assert download_filename == Path("/fake_file2.txt") - - -def test_post_init_invalid_protocol(): - """Validate that an invalid protocol raises a ValueError""" - with pytest.raises(ValueError): - FsspecConfig(remote_url="ftp://example.com/path/to/file.txt") - - -def test_fsspec_path_extraction_dropbox_root(): - """Validate that the path extraction works for dropbox root""" - config = FsspecConfig(remote_url="dropbox:// /") - assert config.protocol == "dropbox" - assert config.path_without_protocol == " /" - assert config.dir_path == " " - assert config.file_path == "" - - -def test_fsspec_path_extraction_dropbox_subfolder(): - """Validate that the path extraction works for dropbox subfolder""" - config = FsspecConfig(remote_url="dropbox://path") - assert config.protocol == "dropbox" - assert config.path_without_protocol == "path" - assert config.dir_path == "path" - assert config.file_path == "" - - -def test_fsspec_path_extraction_s3_bucket_only(): - """Validate that the path extraction works for s3 bucket without filename""" - config = FsspecConfig(remote_url="s3://bucket-name") - assert config.protocol == "s3" - assert config.path_without_protocol == "bucket-name" - assert config.dir_path == "bucket-name" - assert config.file_path == "" - - -def test_fsspec_path_extraction_s3_valid_path(): - """Validate that the path extraction works for s3 bucket with filename""" - config = FsspecConfig(remote_url="s3://bucket-name/path/to/file.txt") - assert config.protocol == "s3" - assert config.path_without_protocol == "bucket-name/path/to/file.txt" - assert config.dir_path == "bucket-name" - assert config.file_path == "path/to/file.txt" - - -def test_fsspec_path_extraction_s3_invalid_path(): - """Validate that an invalid s3 path (that mimics triple slash for dropbox) - raises a ValueError""" - with pytest.raises(ValueError): - FsspecConfig(remote_url="s3:///bucket-name/path/to") - - -def test_sftp_path_extraction_post_init_with_extension(): - """Validate that the path extraction works for sftp 
with file extension""" - config = SimpleSftpConfig( - remote_url="sftp://example.com/path/to/file.txt", - access_config=SftpAccessConfig(username="username", password="password", host="", port=22), - ) - assert config.file_path == "file.txt" - assert config.dir_path == "path/to" - assert config.path_without_protocol == "path/to" - assert config.access_config.host == "example.com" - assert config.access_config.port == 22 - - -def test_sftp_path_extraction_without_extension(): - """Validate that the path extraction works for sftp without extension""" - config = SimpleSftpConfig( - remote_url="sftp://example.com/path/to/directory", - access_config=SftpAccessConfig(username="username", password="password", host="", port=22), - ) - assert config.file_path == "" - assert config.dir_path == "path/to/directory" - assert config.path_without_protocol == "path/to/directory" - assert config.access_config.host == "example.com" - assert config.access_config.port == 22 - - -def test_sftp_path_extraction_with_port(): - """Validate that the path extraction works for sftp with a non-default port""" - config = SimpleSftpConfig( - remote_url="sftp://example.com:47474/path/to/file.txt", - access_config=SftpAccessConfig(username="username", password="password", host="", port=22), - ) - assert config.file_path == "file.txt" - assert config.dir_path == "path/to" - assert config.path_without_protocol == "path/to" - assert config.access_config.host == "example.com" - assert config.access_config.port == 47474 diff --git a/test_unstructured_ingest/unit/connector/test_connector_git.py b/test_unstructured_ingest/unit/connector/test_connector_git.py deleted file mode 100644 index 88760df16..000000000 --- a/test_unstructured_ingest/unit/connector/test_connector_git.py +++ /dev/null @@ -1,61 +0,0 @@ -from pathlib import Path - -import pytest - -from unstructured.ingest.connector.git import GitAccessConfig, GitSourceConnector, SimpleGitConfig - - -@pytest.mark.parametrize( - ("given_file_path", "then_is_supported"), - [ - (Path("src/submodule/document.md"), True), - (Path("src/submodule/document.txt"), True), - (Path("src/submodule/document.pdf"), True), - (Path("src/submodule/document.doc"), True), - (Path("src/submodule/document.docx"), True), - (Path("src/submodule/document.eml"), True), - (Path("src/submodule/document.html"), True), - (Path("src/submodule/document.png"), True), - (Path("src/submodule/document.jpg"), True), - (Path("src/submodule/document.ppt"), True), - (Path("src/submodule/document.pptx"), True), - (Path("src/submodule/document.xml"), True), - (Path("src/submodule/code.py"), False), - (Path("src/submodule/Dockerfile"), False), - (Path("src/submodule/Makefile"), False), - (Path("src/submodule/LICENSE"), False), - ], -) -def test_connector_supports_file(given_file_path, then_is_supported): - when_is_supported = GitSourceConnector.is_file_type_supported(str(given_file_path)) - - assert when_is_supported == then_is_supported - - -class FakeGitSourceConnectorImpl(GitSourceConnector): - def get_ingest_docs(self): - pass - - -@pytest.mark.parametrize( - ("given_file_path", "given_file_glob", "then_matches_glob"), - [ - (Path("LICENSE"), None, True), - (Path("Makefile"), ["Makefile"], True), - (Path("src/my/super/module/main.py"), ["**/*.py"], True), - (Path("src/my/super/module/main.pyc"), ["**/*.py"], False), - ], -) -def test_connector_does_path_match_glob(given_file_path, given_file_glob, then_matches_glob): - connector_config = SimpleGitConfig( - url="some_fake_url", - 
access_config=GitAccessConfig(access_token="some_fake_token"), - file_glob=given_file_glob, - ) - connector = FakeGitSourceConnectorImpl( - processor_config=None, read_config=None, connector_config=connector_config - ) - - when_matches_glob = connector.does_path_match_glob(str(given_file_path)) - - assert when_matches_glob == then_matches_glob diff --git a/test_unstructured_ingest/unit/connector/test_salesforce_connector.py b/test_unstructured_ingest/unit/connector/test_salesforce_connector.py deleted file mode 100644 index 29643ec2b..000000000 --- a/test_unstructured_ingest/unit/connector/test_salesforce_connector.py +++ /dev/null @@ -1,57 +0,0 @@ -from pathlib import Path -from unittest.mock import MagicMock - -import pytest -from cryptography.hazmat.primitives import serialization -from cryptography.hazmat.primitives.asymmetric import dsa, ec, rsa - -from unstructured.ingest.connector.salesforce import SalesforceAccessConfig - - -def pkey_to_str(key) -> str: - return key.private_bytes( - encoding=serialization.Encoding.PEM, - format=serialization.PrivateFormat.PKCS8, - encryption_algorithm=serialization.NoEncryption(), - ).decode("utf-8") - - -def rsa_private_key() -> str: - return pkey_to_str(rsa.generate_private_key(0x10001, 2048)) - - -def brainpoolp512r1_private_key() -> str: - return pkey_to_str(ec.generate_private_key(ec.BrainpoolP512R1)) - - -def dsa_private_key() -> str: - return pkey_to_str(dsa.generate_private_key(1024)) - - -@pytest.mark.parametrize( - ("private_key", "private_key_type"), - [ - (rsa_private_key(), str), - (brainpoolp512r1_private_key(), str), - (dsa_private_key(), str), - ("some_path/priv.key", Path), - ], -) -def test_private_key_type(mocker, private_key, private_key_type): - mocked_isfile: MagicMock = mocker.patch("pathlib.Path.is_file") - mocked_isfile.return_value = True - - config = SalesforceAccessConfig(consumer_key="asdf", private_key=private_key) - actual_pkey_value, actual_pkey_type = config.get_private_key_value_and_type() - assert actual_pkey_type == private_key_type - assert actual_pkey_value == private_key - - -def test_private_key_type_fail(mocker): - mocked_isfile: MagicMock = mocker.patch("pathlib.Path.is_file") - mocked_isfile.return_value = False - - given_nonexistent_path = "some_path/priv.key" - with pytest.raises(expected_exception=ValueError): - config = SalesforceAccessConfig(consumer_key="asdf", private_key=given_nonexistent_path) - config.get_private_key_value_and_type() diff --git a/test_unstructured_ingest/unit/connector/test_serialization.py b/test_unstructured_ingest/unit/connector/test_serialization.py deleted file mode 100644 index f7043e996..000000000 --- a/test_unstructured_ingest/unit/connector/test_serialization.py +++ /dev/null @@ -1,46 +0,0 @@ -from unstructured.ingest.connector.local import LocalIngestDoc, SimpleLocalConfig -from unstructured.ingest.connector.registry import ( - create_ingest_doc_from_dict, - create_ingest_doc_from_json, -) -from unstructured.ingest.interfaces import ProcessorConfig, ReadConfig - -doc = LocalIngestDoc( - path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf", - connector_config=SimpleLocalConfig(input_path="test_unstructured_ingest/example-docs/"), - processor_config=ProcessorConfig(), - read_config=ReadConfig(), -) -doc.update_source_metadata() -serialized_json = doc.to_json() -serialized_dict = doc.to_dict() - - -def test_manual_deserialization(): - deserialized_doc = LocalIngestDoc.from_json(serialized_json) - assert doc == deserialized_doc - - -def 
test_registry_from_json(): - deserialized_doc = create_ingest_doc_from_json(serialized_json) - assert doc == deserialized_doc - - -def test_registry_from_dict(): - deserialized_doc = create_ingest_doc_from_dict(serialized_dict) - assert doc == deserialized_doc - - -def test_source_metadata_serialization(): - doc = LocalIngestDoc( - path="test_unstructured_ingest/example-docs/layout-parser-paper.pdf", - connector_config=SimpleLocalConfig(input_path="test_unstructured_ingest/example-docs/"), - processor_config=ProcessorConfig(), - read_config=ReadConfig(), - ) - serialized_json = doc.to_dict() - assert not serialized_json["_source_metadata"] - - doc.update_source_metadata() - serialized_json_w_meta = doc.to_dict() - assert serialized_json_w_meta["_source_metadata"] diff --git a/test_unstructured_ingest/unit/connector/test_sharepoint.py b/test_unstructured_ingest/unit/connector/test_sharepoint.py deleted file mode 100644 index c48747fb9..000000000 --- a/test_unstructured_ingest/unit/connector/test_sharepoint.py +++ /dev/null @@ -1,59 +0,0 @@ -from datetime import datetime -from unittest.mock import MagicMock - -import pytest - -from unstructured.ingest.connector.sharepoint import SharepointIngestDoc -from unstructured.ingest.interfaces import ProcessorConfig, ReadConfig - - -@pytest.mark.parametrize( - ("time_created", "time_last_modified", "expected_created", "expected_modified"), - [ - ( - "2023-06-16T05:05:05+00:00", - datetime(2023, 6, 16, 5, 5, 5), - "2023-06-16T05:05:05+00:00", - "2023-06-16T05:05:05", - ), - ("2023-06-16 05:05:05", "2023-06-16", "2023-06-16T05:05:05", "2023-06-16T00:00:00"), - # Add more pairs of input strings and their expected ISO format results here - ], -) -def test_datetime_handling_in_update_source_metadata( - mocker, time_created, time_last_modified, expected_created, expected_modified -): - """Test the handling of various datetime formats in update_source_metadata.""" - # Create a mock SharePoint response directly in the test - mock_sharepoint_response = mocker.MagicMock() - mock_sharepoint_response.time_created = time_created - mock_sharepoint_response.time_last_modified = time_last_modified - - # Patch the SharePoint interaction methods to use the mock response - mocker.patch( - "unstructured.ingest.connector.sharepoint.SharepointIngestDoc._fetch_file", - return_value=mock_sharepoint_response, - ) - mocker.patch( - "unstructured.ingest.connector.sharepoint.SharepointIngestDoc._fetch_page", - return_value=None, - ) - - # Instantiate your document with dummy data - ingest_doc = SharepointIngestDoc( - connector_config=MagicMock(), - site_url="dummy_url", - server_path="dummy_path", - is_page=False, - file_path="dummy_path.html", - processor_config=ProcessorConfig(), - read_config=ReadConfig(), - ) - - # Execute the method under test - ingest_doc.update_source_metadata() - - # Assertions to verify the datetime handling against expected results - assert ingest_doc.source_metadata is not None - assert ingest_doc.source_metadata.date_created.startswith(expected_created) - assert ingest_doc.source_metadata.date_modified.startswith(expected_modified) diff --git a/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py b/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py deleted file mode 100644 index 45a8a44ef..000000000 --- a/test_unstructured_ingest/unit/connector/test_sql_conform_dict.py +++ /dev/null @@ -1,169 +0,0 @@ -import datetime -from unittest.mock import Mock, patch - -from unstructured.ingest.connector.sql import 
SqlDestinationConnector - -TEST_DATA_1 = { - "element_id": "80803034fe04181c163306740700cc54", - "metadata": { - "coordinates": { - "layout_height": 792, - "layout_width": 612, - "points": [ - [72.0, 72.69200000000001], - [72.0, 83.69200000000001], - [135.8, 83.69200000000001], - [135.8, 72.69200000000001], - ], - "system": "PixelSpace", - }, - "data_source": { - "date_created": "2023-10-25 10:05:44.976775", - "date_modified": "2023-10-25 10:05:44.976775", - "date_processed": "2023-12-14T17:06:33.074057", - "permissions_data": [{"mode": 33188}], - "url": "example-docs/pdf/fake-memo.pdf", - }, - "file_directory": "example-docs", - "filename": "fake-memo.pdf", - "filetype": "application/pdf", - "languages": ["eng"], - "last_modified": "2023-10-25T10:05:44", - "page_number": 1, - }, - "text": "May 5, 2023", - "type": "UncategorizedText", - "embeddings": [ - -0.05623878538608551, - 0.008579030632972717, - 0.03698136284947395, - -0.01745658740401268, - -0.030465232208371162, - 0.00996527448296547, - ], -} - -TEST_DATA_2 = { - "metadata": { - "coordinates": {"points": [1, 2, 3]}, - "links": {"link1": "https://example.com", "link2": "https://example.org"}, - "data_source": { - "date_created": "2021-01-01T00:00:00", - "date_modified": "2021-01-02T00:00:00", - "date_processed": "2022-12-13T15:44:08", - "version": 1.1, - }, - "last_modified": "2021-01-03T00:00:00", - "page_number": 10, - }, - "embeddings": [0.1, 0.2, 0.3], -} - -TEST_DATA_3 = { - "metadata": { - "coordinates": {"points": [1, 2, 3]}, - "data_source": { - "date_created": "2021-01-01T00:00:00", - "date_modified": "2021-01-02T00:00:00", - "date_processed": "2022-12-13T15:44:08", - "version": 1.1, - }, - "last_modified": "2021-01-03T00:00:00", - "page_number": 10, - "link_texts": ["Skip to main content"], - "link_urls": ["#main-content"], - }, - "embeddings": [0.1, 0.2, 0.3], -} - - -def test_conform_dict_1(): - """Validate that the conform_dict method returns the expected output for a real example""" - # Create a mock instance of the connector class - connector = SqlDestinationConnector(write_config=Mock(), connector_config=Mock()) - - # Mock the uuid.uuid4 function to return a fixed value - with patch("uuid.uuid4", return_value="mocked_uuid"): - # Call the conform_dict method - data_out = TEST_DATA_1.copy() - connector.conform_dict(data_out) - - # Assert that the result matches the expected output - assert data_out == { - "element_id": "80803034fe04181c163306740700cc54", - "text": "May 5, 2023", - "type": "UncategorizedText", - "id": "mocked_uuid", - "file_directory": "example-docs", - "filename": "fake-memo.pdf", - "filetype": "application/pdf", - "languages": ["eng"], - "last_modified": datetime.datetime(2023, 10, 25, 10, 5, 44), - "page_number": "1", - "date_created": datetime.datetime(2023, 10, 25, 10, 5, 44, 976775), - "date_modified": datetime.datetime(2023, 10, 25, 10, 5, 44, 976775), - "date_processed": datetime.datetime(2023, 12, 14, 17, 6, 33, 74057), - "permissions_data": '[{"mode": 33188}]', - "url": "example-docs/pdf/fake-memo.pdf", - "layout_height": 792, - "layout_width": 612, - "points": "[[72.0, 72.69200000000001], [72.0, 83.69200000000001]," - " [135.8, 83.69200000000001], [135.8, 72.69200000000001]]", - "system": "PixelSpace", - "embeddings": "[-0.05623878538608551, 0.008579030632972717, " - "0.03698136284947395, -0.01745658740401268, " - "-0.030465232208371162, 0.00996527448296547]", - } - - -def test_conform_dict_2(): - """Validate that the conform_dict method returns the expected output for a simplified example""" 
- # Create a mock instance of the connector class - connector = SqlDestinationConnector(write_config=Mock(), connector_config=Mock()) - - # Mock the uuid.uuid4 function to return a fixed value - with patch("uuid.uuid4", return_value="mocked_uuid"): - # Call the conform_dict method - data_out = TEST_DATA_2.copy() - connector.conform_dict(data_out) - - # Assert that the result matches the expected output - assert data_out == { - "embeddings": "[0.1, 0.2, 0.3]", - "id": "mocked_uuid", - "links": '{"link1": "https://example.com", "link2": "https://example.org"}', - "last_modified": datetime.datetime(2021, 1, 3, 0, 0), - "page_number": "10", - "date_created": datetime.datetime(2021, 1, 1, 0, 0), - "date_modified": datetime.datetime(2021, 1, 2, 0, 0), - "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8), - "version": "1.1", - "points": "[1, 2, 3]", - } - - -def test_conform_dict_link_texts(): - """Validate that the conform_dict method returns the expected output link_texts""" - # Create a mock instance of the connector class - connector = SqlDestinationConnector(write_config=Mock(), connector_config=Mock()) - - # Mock the uuid.uuid4 function to return a fixed value - with patch("uuid.uuid4", return_value="mocked_uuid"): - # Call the conform_dict method - data_out = TEST_DATA_3.copy() - connector.conform_dict(data_out) - - # Assert that the result matches the expected output - assert data_out == { - "embeddings": "[0.1, 0.2, 0.3]", - "id": "mocked_uuid", - "last_modified": datetime.datetime(2021, 1, 3, 0, 0), - "link_texts": ["Skip to main content"], - "link_urls": ["#main-content"], - "page_number": "10", - "date_created": datetime.datetime(2021, 1, 1, 0, 0), - "date_modified": datetime.datetime(2021, 1, 2, 0, 0), - "date_processed": datetime.datetime(2022, 12, 13, 15, 44, 8), - "version": "1.1", - "points": "[1, 2, 3]", - } diff --git a/test_unstructured_ingest/unit/enhanced_dataclass/test_enhanced_dataclass.py b/test_unstructured_ingest/unit/enhanced_dataclass/test_enhanced_dataclass.py deleted file mode 100644 index 7e1727d1e..000000000 --- a/test_unstructured_ingest/unit/enhanced_dataclass/test_enhanced_dataclass.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -from dataclasses import Field, dataclass, fields - -import pytest - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field -from unstructured.ingest.enhanced_dataclass.dataclasses import EnhancedField - - -@dataclass -class AuthData(EnhancedDataClassJsonMixin): - username: str - password: str = enhanced_field(sensitive=True) - date: int = enhanced_field(overload_name="time") - - -auth = AuthData(username="my name", password="top secret", date=3) - - -def test_enhanced_field(): - fs = fields(AuthData) - for f in fs: - if f.name == "username": - assert isinstance(f, Field) - assert hasattr(f, "sensitive") is False - else: - assert isinstance(f, EnhancedField) - if f.name == "password": - assert f.sensitive is True - else: - assert not f.sensitive - - -@pytest.mark.parametrize( - ("apply_name_overload", "expected_dict"), - [ - (True, {"username": "my name", "password": "THIS IS REDACTED", "time": 3}), - (False, {"username": "my name", "password": "THIS IS REDACTED", "date": 3}), - ], -) -def test_to_json(apply_name_overload: bool, expected_dict: dict): - j = auth.to_json( - redact_sensitive=True, - redacted_text="THIS IS REDACTED", - apply_name_overload=apply_name_overload, - ) - expected = json.dumps(expected_dict) - assert j == expected - - -@pytest.mark.parametrize( - 
("apply_name_overload", "expected_dict"), - [ - (True, {"username": "my name", "password": "***REDACTED***", "time": 3}), - (False, {"username": "my name", "password": "***REDACTED***", "date": 3}), - ], -) -def test_to_dict(apply_name_overload: bool, expected_dict: dict): - d = auth.to_dict(redact_sensitive=True, apply_name_overload=apply_name_overload) - assert d == expected_dict diff --git a/test_unstructured_ingest/unit/pipeline/reformat/test_chunking.py b/test_unstructured_ingest/unit/pipeline/reformat/test_chunking.py deleted file mode 100644 index 433ee810d..000000000 --- a/test_unstructured_ingest/unit/pipeline/reformat/test_chunking.py +++ /dev/null @@ -1,156 +0,0 @@ -from __future__ import annotations - -import json -import logging -import os - -import pytest -from _pytest.logging import LogCaptureFixture - -from test_unstructured.unit_utils import ( - FixtureRequest, - Mock, - example_doc_path, - function_mock, - method_mock, -) -from unstructured.documents.elements import CompositeElement -from unstructured.ingest.interfaces import ChunkingConfig, PartitionConfig -from unstructured.ingest.pipeline.interfaces import PipelineContext -from unstructured.ingest.pipeline.reformat.chunking import Chunker - -ELEMENTS_JSON_FILE = example_doc_path( - "test_evaluate_files/unstructured_output/Bank Good Credit Loan.pptx.json" -) - - -class DescribeChunker: - """Unit tests for ingest.pipeline.reformat.chunking.Chunker""" - - # -- Chunker.run() ----------------------------------------------------------------------------- - - # -- integration test -- - def it_creates_JSON_elements(self, _ingest_docs_map_: Mock, tmpdir: str): - chunker = Chunker( - chunking_config=ChunkingConfig(chunking_strategy="by_title"), - pipeline_context=PipelineContext(work_dir=tmpdir), - partition_config=PartitionConfig(), - ) - # -- `Chunker.chunk()` defaults to writing to "{work_dir}/chunked", which is located in - # -- "/.cache" of a user's profile. 
- # -- Define `work_dir` add the "/chunked" subdirectory to it: - os.makedirs(os.path.join(tmpdir, "chunked"), exist_ok=True) - - filename = chunker.run(ELEMENTS_JSON_FILE) or "" - - head, tail = os.path.split(filename if filename else "") - # -- Check that a json file was created in `/chunked` -- - assert head.endswith("chunked") - assert tail.endswith(".json") - # -- Check contents of file -- - with open(filename) as json_f: - json_data = json.load(json_f) - assert all(d.get("type") == "CompositeElement" for d in json_data) - assert len(json_data) == 5 - - def it_returns_None_and_logs_message_without_chunking_strategy( - self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture - ): - chunker = Chunker( - chunking_config=ChunkingConfig(), - pipeline_context=PipelineContext(), - partition_config=PartitionConfig(), - ) - caplog.set_level(logging.INFO) - - assert chunker.run(ELEMENTS_JSON_FILE) is None - assert "chunking_strategy is None, skipping chunking for" in caplog.text - - def it_logs_error_on_invalid_remote_chunking_strategy( - self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture - ): - chunker = Chunker( - chunking_config=ChunkingConfig(chunking_strategy="by_invalid"), - pipeline_context=PipelineContext(), - partition_config=PartitionConfig(partition_by_api=True), - ) - - chunker.run(ELEMENTS_JSON_FILE) - - assert "Input should be 'basic', 'by_page', 'by_similarity'" in caplog.text - - def it_warns_with_nonlocal_chunking_strategy_and_partition_by_api_False( - self, _ingest_docs_map_: Mock, caplog: LogCaptureFixture - ): - chunker = Chunker( - chunking_config=ChunkingConfig(chunking_strategy="by_similarity"), - pipeline_context=PipelineContext(), - partition_config=PartitionConfig(partition_by_api=False), - ) - - chunker.run(ELEMENTS_JSON_FILE) - - assert "There is no locally available chunking_strategy:" in caplog.text - - # -- Chunker.chunk() --------------------------------------------------------------------------- - - def it_skips_chunking_if_strategy_is_None(self): - chunker = Chunker( - chunking_config=ChunkingConfig(chunking_strategy=None), - pipeline_context=PipelineContext(), - partition_config=PartitionConfig(), - ) - - assert chunker.chunk(ELEMENTS_JSON_FILE) is None - - # -- integration test -- - @pytest.mark.parametrize("strategy", ["by_title", "basic"]) - def it_chunks_locally(self, strategy: str, _ingest_docs_map_: Mock): - chunker = Chunker( - chunking_config=ChunkingConfig(chunking_strategy=strategy), - pipeline_context=PipelineContext(), - partition_config=PartitionConfig(), - ) - - chunked_elements = chunker.chunk(ELEMENTS_JSON_FILE) - - assert all(isinstance(elem, CompositeElement) for elem in chunked_elements) # type: ignore - - def it_chunks_remotely(self, _ingest_docs_map_: Mock, _partition_via_api_: Mock): - chunker = Chunker( - chunking_config=ChunkingConfig(chunking_strategy="by_similarity"), - pipeline_context=PipelineContext(), - partition_config=PartitionConfig( - partition_by_api=True, api_key="aaaaaaaaaaaaaaaaaaaaa" - ), - ) - - chunker.chunk(ELEMENTS_JSON_FILE) - - _partition_via_api_.assert_called_once_with( - filename=ELEMENTS_JSON_FILE, - api_key="aaaaaaaaaaaaaaaaaaaaa", - api_url="https://api.unstructured.io/general/v0/general", - chunking_strategy="by_similarity", - # (jennings) the sdk uses combine_under_n_chars but the ChunkingConfig param is - # combine_text_under_n_chars - combine_under_n_chars=None, - include_orig_elements=None, - max_characters=None, - multipage_sections=None, - new_after_n_chars=None, - overlap=None, - 
overlap_all=None, - ) - - # -- fixtures -------------------------------------------------------------------------------- - - @pytest.fixture() - def _ingest_docs_map_(self, request: FixtureRequest): - return method_mock(request, PipelineContext, "ingest_docs_map") - - @pytest.fixture() - def _partition_via_api_(self, request: FixtureRequest): - return function_mock( - request, "unstructured.ingest.pipeline.reformat.chunking.partition_via_api" - ) diff --git a/test_unstructured_ingest/unit/test_error.py b/test_unstructured_ingest/unit/test_error.py deleted file mode 100644 index 0c588409e..000000000 --- a/test_unstructured_ingest/unit/test_error.py +++ /dev/null @@ -1,27 +0,0 @@ -import pytest - -from unstructured.ingest.error import ( - DestinationConnectionError, - PartitionError, - SourceConnectionError, -) - - -@pytest.mark.parametrize( - ("error_class", "exception_type", "error_message"), - [ - (SourceConnectionError, ValueError, "Simulated connection error"), - (DestinationConnectionError, RuntimeError, "Simulated connection error"), - (PartitionError, FileNotFoundError, "Simulated partition error"), - ], -) -def test_custom_error_decorator(error_class, exception_type, error_message): - @error_class.wrap - def simulate_error(): - raise exception_type(error_message) - - with pytest.raises(error_class) as context: - simulate_error() - - expected_error_string = error_class.error_string.format(error_message) - assert str(context.value) == expected_error_string diff --git a/test_unstructured_ingest/unit/test_interfaces.py b/test_unstructured_ingest/unit/test_interfaces.py deleted file mode 100644 index 7a91ed9f1..000000000 --- a/test_unstructured_ingest/unit/test_interfaces.py +++ /dev/null @@ -1,281 +0,0 @@ -from __future__ import annotations - -import os -import pathlib -from dataclasses import dataclass -from typing import Any, Dict - -import pytest - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseSingleIngestDoc, - ChunkingConfig, - PartitionConfig, - ProcessorConfig, - ReadConfig, -) -from unstructured.partition.auto import partition -from unstructured.staging.base import elements_to_dicts - -DIRECTORY = pathlib.Path(__file__).parent.resolve() -EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "../..", "example-docs") -TEST_DOWNLOAD_DIR = "/tmp" -TEST_OUTPUT_DIR = "/tmp" -TEST_ID = "test" -TEST_FILE_PATH = os.path.join(EXAMPLE_DOCS_DIRECTORY, "book-war-and-peace-1p.txt") - - -@dataclass -class ExampleConfig(BaseConnectorConfig): - id: str - path: str - - -TEST_CONFIG = ExampleConfig(id=TEST_ID, path=TEST_FILE_PATH) -TEST_SOURCE_URL = "test-source-url" -TEST_VERSION = "1.1.1" -TEST_RECORD_LOCATOR = {"id": "data-source-id"} -TEST_DATE_CREATED = "2021-01-01T00:00:00" -TEST_DATE_MODIFIED = "2021-01-02T00:00:00" -TEST_DATE_PROCESSSED = "2022-12-13T15:44:08" - - -@dataclass -class ExampleIngestDoc(BaseSingleIngestDoc): - connector_config: ExampleConfig - - @property - def filename(self): - return TEST_FILE_PATH - - @property - def _output_filename(self): - return TEST_FILE_PATH + ".json" - - @property - def source_url(self) -> str: - return TEST_SOURCE_URL - - @property - def version(self) -> str: - return TEST_VERSION - - @property - def record_locator(self) -> Dict[str, Any]: - return TEST_RECORD_LOCATOR - - @property - def date_created(self) -> str: - return TEST_DATE_CREATED - - @property - def date_modified(self) -> str: - return TEST_DATE_MODIFIED - - @property - def exists(self) -> bool: - 
return True - - def cleanup_file(self): - pass - - def get_file(self): - pass - - def has_output(self): - return True - - def write_result(self, result): - pass - - -@pytest.fixture() -def partition_test_results(): - # Reusable partition test results, calculated only once - result = partition( - filename=str(TEST_FILE_PATH), - data_source_metadata=DataSourceMetadata( - url=TEST_SOURCE_URL, - version=TEST_VERSION, - record_locator=TEST_RECORD_LOCATOR, - date_created=TEST_DATE_CREATED, - date_modified=TEST_DATE_MODIFIED, - date_processed=TEST_DATE_PROCESSSED, - ), - ) - return result - - -@pytest.fixture() -def partition_file_test_results(partition_test_results): - # Reusable partition_file test results, calculated only once - return elements_to_dicts(partition_test_results) - - -def test_partition_file(): - """Validate partition_file returns a list of dictionaries with the expected keys, - metadatakeys, and data source metadata values.""" - test_ingest_doc = ExampleIngestDoc( - connector_config=TEST_CONFIG, - read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR), - ) - test_ingest_doc._date_processed = TEST_DATE_PROCESSSED - elements = test_ingest_doc.partition_file(partition_config=PartitionConfig()) - element_dicts = elements_to_dicts(elements) - assert len(element_dicts) - expected_keys = { - "element_id", - "text", - "type", - "metadata", - } - # The document in TEST_FILE_PATH does not have elements with coordinates so - # partition is not expected to return coordinates metadata. - expected_metadata_keys = { - "data_source", - "filename", - "file_directory", - "filetype", - "languages", - "last_modified", - } - for elem in element_dicts: - # Parent IDs are non-deterministic - remove them from the test - elem["metadata"].pop("parent_id", None) - - assert expected_keys == set(elem.keys()) - assert expected_metadata_keys == set(elem["metadata"].keys()) - data_source_metadata = elem["metadata"]["data_source"] - assert data_source_metadata["url"] == TEST_SOURCE_URL - assert data_source_metadata["version"] == TEST_VERSION - assert data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR - assert data_source_metadata["date_created"] == TEST_DATE_CREATED - assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED - assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSSED - - -def test_process_file_fields_include_default(mocker, partition_test_results): - """Validate when metadata_include and metadata_exclude are not set, all fields: - ("element_id", "text", "type", "metadata") are included""" - mock_partition = mocker.patch( - "unstructured.partition.auto.partition", - return_value=partition_test_results, - ) - test_ingest_doc = ExampleIngestDoc( - connector_config=TEST_CONFIG, - read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR), - ) - elements = test_ingest_doc.partition_file(partition_config=PartitionConfig()) - element_dicts = elements_to_dicts(elements) - assert len(element_dicts) - assert mock_partition.call_count == 1 - for elem in element_dicts: - # Parent IDs are non-deterministic - remove them from the test - elem["metadata"].pop("parent_id", None) - - assert {"element_id", "text", "type", "metadata"} == set(elem.keys()) - data_source_metadata = elem["metadata"]["data_source"] - assert data_source_metadata["url"] == TEST_SOURCE_URL - assert data_source_metadata["version"] == TEST_VERSION - assert 
data_source_metadata["record_locator"] == TEST_RECORD_LOCATOR - assert data_source_metadata["date_created"] == TEST_DATE_CREATED - assert data_source_metadata["date_modified"] == TEST_DATE_MODIFIED - assert data_source_metadata["date_processed"] == TEST_DATE_PROCESSSED - - -def test_process_file_metadata_includes_filename_and_filetype( - mocker, - partition_test_results, -): - """Validate when metadata_include is set to "filename,filetype", - only filename is included in metadata""" - mocker.patch( - "unstructured.partition.auto.partition", - return_value=partition_test_results, - ) - partition_config = PartitionConfig( - metadata_include=["filename", "filetype"], - ) - test_ingest_doc = ExampleIngestDoc( - connector_config=TEST_CONFIG, - read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - processor_config=ProcessorConfig(output_dir=TEST_OUTPUT_DIR), - ) - isd_elems = test_ingest_doc.process_file(partition_config=partition_config) - assert len(isd_elems) - for elem in isd_elems: - # Parent IDs are non-deterministic - remove them from the test - elem["metadata"].pop("parent_id", None) - - assert set(elem["metadata"].keys()) == {"filename", "filetype"} - - -def test_process_file_metadata_exclude_filename_pagenum(mocker, partition_test_results): - """Validate when metadata_exclude is set to "filename,page_number", - neither filename nor page_number are included in metadata""" - mocker.patch( - "unstructured.partition.auto.partition", - return_value=partition_test_results, - ) - partition_config = PartitionConfig( - metadata_exclude=["filename", "page_number"], - ) - test_ingest_doc = ExampleIngestDoc( - connector_config=TEST_CONFIG, - read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - processor_config=ProcessorConfig( - output_dir=TEST_OUTPUT_DIR, - ), - ) - isd_elems = test_ingest_doc.process_file(partition_config=partition_config) - assert len(isd_elems) - for elem in isd_elems: - assert "filename" not in elem["metadata"] - assert "page_number" not in elem["metadata"] - - -def test_process_file_flatten_metadata(mocker, partition_test_results): - mocker.patch( - "unstructured.partition.auto.partition", - return_value=partition_test_results, - ) - partition_config = PartitionConfig( - metadata_include=["filename", "file_directory", "filetype"], - flatten_metadata=True, - ) - test_ingest_doc = ExampleIngestDoc( - connector_config=TEST_CONFIG, - read_config=ReadConfig(download_dir=TEST_DOWNLOAD_DIR), - processor_config=ProcessorConfig( - output_dir=TEST_OUTPUT_DIR, - ), - ) - isd_elems = test_ingest_doc.process_file(partition_config=partition_config) - expected_keys = {"element_id", "text", "type", "filename", "file_directory", "filetype"} - for elem in isd_elems: - assert expected_keys == set(elem.keys()) - - -class DescribeChunkingConfig: - """Unit tests for unstructured.ingest.interfaces.ChunkingConfig""" - - def it_accepts_chunking_strategy_by_itself(self): - config = ChunkingConfig(chunking_strategy="basic") - assert config.chunking_strategy == "basic" - - def it_defaults_to_chunk_by_title_if_only_chunk_elements_is_True(self): - config = ChunkingConfig(chunk_elements=True) - assert config.chunking_strategy == "by_title" - - def but_it_defaults_to_chunking_strategy_over_chunk_elements(self): - config = ChunkingConfig(chunk_elements=True, chunking_strategy="basic") - assert config.chunking_strategy == "basic" - - def it_silently_accepts_unrecognized_chunker(self, caplog: pytest.LogCaptureFixture): - config = ChunkingConfig(chunking_strategy="foobar") - assert 
config.chunking_strategy == "foobar" - assert caplog.text == "" diff --git a/test_unstructured_ingest/unit/test_logger.py b/test_unstructured_ingest/unit/test_logger.py deleted file mode 100644 index 4f15aba4c..000000000 --- a/test_unstructured_ingest/unit/test_logger.py +++ /dev/null @@ -1,78 +0,0 @@ -import json - -import pytest - -from unstructured.ingest.logger import ( - default_is_data_sensitive, - hide_sensitive_fields, - redact_jsons, -) - - -@pytest.mark.parametrize( - ("key", "value", "is_sensitive"), - [ - ("username", "john_smith", False), - ("password", "13?H%", True), - ("token", "123", True), - ("AWS_CREDENTIAL", "aws_credential", True), - ("AWS_KEY", None, False), - ], -) -def test_default_is_sensitive(key, value, is_sensitive): - assert default_is_data_sensitive(key, value) == is_sensitive - - -def test_hide_sensitive_fields(): - d = { - "username": "john_smith", - "password": "13?H%", - "inner": { - "token": "123", - "AWS_KEY": None, - "inner_j_string": json.dumps( - {"account_name": "secret name", "client_id": 123, "timestamp": 123} - ), - }, - } - redacted_d = hide_sensitive_fields(d) - expected_d = { - "password": "*******", - "username": "john_smith", - "inner": { - "token": "*******", - "AWS_KEY": None, - "inner_j_string": json.dumps( - {"account_name": "*******", "client_id": "*******", "timestamp": 123} - ), - }, - } - assert redacted_d == expected_d - - -def test_redact_jsons(): - d1 = { - "username": "john_smith", - "password": "13?H%", - "inner": { - "token": "123", - "AWS_KEY": None, - "inner_j_string": json.dumps( - {"account_name": "secret name", "client_id": 123, "timestamp": 123} - ), - }, - } - - d2 = {"username": "tim67", "update_time": 456} - d3 = {"account_name": "top secret", "host": "http://localhost:8888"} - - sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})" - expected_string = ( - 'Some topic secret info ({"username": "john_smith", "password": "*******", ' - '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": ' - '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", ' - '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} ' - 'and {"account_name": "*******", "host": "http://localhost:8888"})' - ) - redacted_string = redact_jsons(sensitive_string) - assert redacted_string == expected_string diff --git a/test_unstructured_ingest/unit/test_utils.py b/test_unstructured_ingest/unit/test_utils.py deleted file mode 100644 index bf2556cbe..000000000 --- a/test_unstructured_ingest/unit/test_utils.py +++ /dev/null @@ -1,164 +0,0 @@ -import json -import typing as t -from dataclasses import dataclass, field -from datetime import datetime - -import pytest -import pytz - -from unstructured.ingest.cli.utils import extract_config -from unstructured.ingest.interfaces import BaseConfig -from unstructured.ingest.utils.string_and_date_utils import ensure_isoformat_datetime, json_to_dict - - -@dataclass -class A(BaseConfig): - a: str - - -@dataclass -class B(BaseConfig): - a: A - b: int - - -flat_data = {"a": "test", "b": 4, "c": True} - - -def test_extract_config_concrete(): - @dataclass - class C(BaseConfig): - b: B - c: bool - - c = extract_config(flat_data=flat_data, config=C) - expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True} - assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True) - - -def test_extract_config_optional(): - @dataclass - class C(BaseConfig): - c: bool - b: t.Optional[B] = None - - c = 
extract_config(flat_data=flat_data, config=C) - expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": True} - assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True) - - -def test_extract_config_union(): - @dataclass - class C(BaseConfig): - c: bool - b: t.Optional[t.Union[B, int]] = None - - c = extract_config(flat_data=flat_data, config=C) - expected_result = {"b": 4, "c": True} - assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True) - - -def test_extract_config_list(): - @dataclass - class C(BaseConfig): - c: t.List[int] - b: B - - flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]} - c = extract_config(flat_data=flat_data, config=C) - expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]} - assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True) - - -def test_extract_config_optional_list(): - @dataclass - class C(BaseConfig): - b: B - c: t.Optional[t.List[int]] = None - - flat_data = {"a": "test", "b": 4, "c": [1, 2, 3]} - c = extract_config(flat_data=flat_data, config=C) - expected_result = {"b": {"a": {"a": "test"}, "b": 4}, "c": [1, 2, 3]} - assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True) - - -def test_extract_config_dataclass_list(): - @dataclass - class C(BaseConfig): - c: bool - b: t.List[B] = field(default_factory=list) - - flat_data = {"a": "test", "c": True} - c = extract_config(flat_data=flat_data, config=C) - expected_result = {"b": [], "c": True} - assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True) - - -def test_extract_config_dict(): - @dataclass - class C(BaseConfig): - c: bool - b: t.Dict[str, B] = field(default_factory=dict) - - flat_data = {"c": True} - c = extract_config(flat_data=flat_data, config=C) - expected_result = {"c": True, "b": {}} - assert c.to_json(sort_keys=True) == json.dumps(expected_result, sort_keys=True) - - -def test_json_to_dict_valid_json(): - json_string = '{"key": "value"}' - expected_result = {"key": "value"} - assert json_to_dict(json_string) == expected_result - assert isinstance(json_to_dict(json_string), dict) - - -def test_json_to_dict_malformed_json(): - json_string = '{"key": "value"' - expected_result = '{"key": "value"' - assert json_to_dict(json_string) == expected_result - assert isinstance(json_to_dict(json_string), str) - - -def test_json_to_dict_single_quotes(): - json_string = "{'key': 'value'}" - expected_result = {"key": "value"} - assert json_to_dict(json_string) == expected_result - assert isinstance(json_to_dict(json_string), dict) - - -def test_json_to_dict_path(): - json_string = "/path/to/file.json" - expected_result = "/path/to/file.json" - assert json_to_dict(json_string) == expected_result - assert isinstance(json_to_dict(json_string), str) - - -def test_ensure_isoformat_datetime_for_datetime(): - dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0)) - assert dt == "2021-01-01T12:00:00" - - -def test_ensure_isoformat_datetime_for_datetime_with_tz(): - dt = ensure_isoformat_datetime(datetime(2021, 1, 1, 12, 0, 0, tzinfo=pytz.UTC)) - assert dt == "2021-01-01T12:00:00+00:00" - - -def test_ensure_isoformat_datetime_for_string(): - dt = ensure_isoformat_datetime("2021-01-01T12:00:00") - assert dt == "2021-01-01T12:00:00" - - -def test_ensure_isoformat_datetime_for_string2(): - dt = ensure_isoformat_datetime("2021-01-01T12:00:00+00:00") - assert dt == "2021-01-01T12:00:00+00:00" - - -def test_ensure_isoformat_datetime_fails_on_string(): - with 
pytest.raises(ValueError): - ensure_isoformat_datetime("bad timestamp") - - -def test_ensure_isoformat_datetime_fails_on_int(): - with pytest.raises(TypeError): - ensure_isoformat_datetime(1111) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index e794a070a..65162b438 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.15-dev0" # pragma: no cover +__version__ = "0.16.0" # pragma: no cover diff --git a/unstructured/embed/bedrock.py b/unstructured/embed/bedrock.py index dba52e776..b667e9558 100644 --- a/unstructured/embed/bedrock.py +++ b/unstructured/embed/bedrock.py @@ -1,62 +1,69 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List import numpy as np +from pydantic import SecretStr from unstructured.documents.elements import ( Element, ) from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: from langchain_community.embeddings import BedrockEmbeddings -@dataclass class BedrockEmbeddingConfig(EmbeddingConfig): - aws_access_key_id: str = enhanced_field(sensitive=True) - aws_secret_access_key: str = enhanced_field(sensitive=True) + aws_access_key_id: SecretStr + aws_secret_access_key: SecretStr region_name: str = "us-west-2" + @requires_dependencies( + ["boto3", "numpy", "langchain_community"], + extras="bedrock", + ) + def get_client(self) -> "BedrockEmbeddings": + # delay import only when needed + import boto3 + from langchain_community.embeddings import BedrockEmbeddings + + bedrock_runtime = boto3.client( + service_name="bedrock-runtime", + aws_access_key_id=self.aws_access_key_id.get_secret_value(), + aws_secret_access_key=self.aws_secret_access_key.get_secret_value(), + region_name=self.region_name, + ) + + bedrock_client = BedrockEmbeddings(client=bedrock_runtime) + return bedrock_client + @dataclass class BedrockEmbeddingEncoder(BaseEmbeddingEncoder): config: BedrockEmbeddingConfig - _client: Optional["BedrockEmbeddings"] = enhanced_field(init=False, default=None) - _exemplary_embedding: Optional[List[float]] = enhanced_field(init=False, default=None) - @property - def client(self) -> "BedrockEmbeddings": - if self._client is None: - self._client = self.create_client() - return self._client - - @property - def exemplary_embedding(self) -> List[float]: - if self._exemplary_embedding is None: - self._exemplary_embedding = self.client.embed_query("Q") - return self._exemplary_embedding + def get_exemplary_embedding(self) -> List[float]: + return self.embed_query(query="Q") def __post_init__(self): self.initialize() - def initialize(self): - self.bedrock_client = self.create_client() - def num_of_dimensions(self): - return np.shape(self.exemplary_embedding) + exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) def is_unit_vector(self): - return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) def embed_query(self, query): - return np.array(self.bedrock_client.embed_query(query)) + bedrock_client = self.config.get_client() + return np.array(bedrock_client.embed_query(query)) def embed_documents(self, elements: List[Element]) -> List[Element]: - 
embeddings = self.bedrock_client.embed_documents([str(e) for e in elements]) + bedrock_client = self.config.get_client() + embeddings = bedrock_client.embed_documents([str(e) for e in elements]) elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) return elements_with_embeddings @@ -67,18 +74,3 @@ class BedrockEmbeddingEncoder(BaseEmbeddingEncoder): element.embeddings = embeddings[i] elements_w_embedding.append(element) return elements - - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["boto3", "numpy", "langchain_community"], - extras="bedrock", - ) - def create_client(self) -> "BedrockEmbeddings": - # delay import only when needed - import boto3 - from langchain_community.embeddings import BedrockEmbeddings - - bedrock_runtime = boto3.client(service_name="bedrock-runtime", **self.config.to_dict()) - - bedrock_client = BedrockEmbeddings(client=bedrock_runtime) - return bedrock_client diff --git a/unstructured/embed/huggingface.py b/unstructured/embed/huggingface.py index cb98be0e8..d955f7053 100644 --- a/unstructured/embed/huggingface.py +++ b/unstructured/embed/huggingface.py @@ -1,60 +1,59 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional import numpy as np +from pydantic import Field from unstructured.documents.elements import ( Element, ) from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig -from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: from langchain_huggingface.embeddings import HuggingFaceEmbeddings -@dataclass class HuggingFaceEmbeddingConfig(EmbeddingConfig): - model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2" - model_kwargs: Optional[dict] = field(default_factory=lambda: {"device": "cpu"}) - encode_kwargs: Optional[dict] = field(default_factory=lambda: {"normalize_embeddings": False}) - cache_folder: Optional[dict] = None + model_name: Optional[str] = Field(default="sentence-transformers/all-MiniLM-L6-v2") + model_kwargs: Optional[dict] = Field(default_factory=lambda: {"device": "cpu"}) + encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False}) + cache_folder: Optional[dict] = Field(default=None) + + @requires_dependencies( + ["langchain_huggingface"], + extras="embed-huggingface", + ) + def get_client(self) -> "HuggingFaceEmbeddings": + """Creates a langchain Huggingface python client to embed elements.""" + from langchain_huggingface.embeddings import HuggingFaceEmbeddings + + client = HuggingFaceEmbeddings(**self.dict()) + return client @dataclass class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder): config: HuggingFaceEmbeddingConfig - _client: Optional["HuggingFaceEmbeddings"] = field(init=False, default=None) - _exemplary_embedding: Optional[List[float]] = field(init=False, default=None) - @property - def client(self) -> "HuggingFaceEmbeddings": - if self._client is None: - self._client = self.create_client() - return self._client - - @property - def exemplary_embedding(self) -> List[float]: - if self._exemplary_embedding is None: - self._exemplary_embedding = self.client.embed_query("Q") - return self._exemplary_embedding - - def initialize(self): - """Creates a langchain HuggingFace object to embed elements.""" - _ = self.client + def get_exemplary_embedding(self) -> List[float]: + return self.embed_query(query="Q") def num_of_dimensions(self): - return 
np.shape(self.exemplary_embedding) + exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) def is_unit_vector(self): - return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) def embed_query(self, query): - return self.client.embed_query(str(query)) + client = self.config.get_client() + return client.embed_query(str(query)) def embed_documents(self, elements: List[Element]) -> List[Element]: - embeddings = self.client.embed_documents([str(e) for e in elements]) + client = self.config.get_client() + embeddings = client.embed_documents([str(e) for e in elements]) elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) return elements_with_embeddings @@ -66,15 +65,3 @@ class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder): element.embeddings = embeddings[i] elements_w_embedding.append(element) return elements - - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["langchain_huggingface"], - extras="embed-huggingface", - ) - def create_client(self) -> "HuggingFaceEmbeddings": - """Creates a langchain Huggingface python client to embed elements.""" - from langchain_huggingface.embeddings import HuggingFaceEmbeddings - - client = HuggingFaceEmbeddings(**self.config.to_dict()) - return client diff --git a/unstructured/embed/interfaces.py b/unstructured/embed/interfaces.py index e98c0c902..a6b0a3665 100644 --- a/unstructured/embed/interfaces.py +++ b/unstructured/embed/interfaces.py @@ -2,17 +2,17 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from typing import List, Tuple +from pydantic import BaseModel + from unstructured.documents.elements import Element -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -@dataclass -class EmbeddingConfig(EnhancedDataClassJsonMixin): +class EmbeddingConfig(BaseModel): pass @dataclass -class BaseEmbeddingEncoder(EnhancedDataClassJsonMixin, ABC): +class BaseEmbeddingEncoder(ABC): config: EmbeddingConfig @abstractmethod diff --git a/unstructured/embed/mixedbreadai.py b/unstructured/embed/mixedbreadai.py index 656d41e99..d89db571f 100644 --- a/unstructured/embed/mixedbreadai.py +++ b/unstructured/embed/mixedbreadai.py @@ -3,10 +3,10 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, List, Optional import numpy as np +from pydantic import Field, SecretStr from unstructured.documents.elements import Element from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig -from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import requires_dependencies USER_AGENT = "@mixedbread-ai/unstructured" @@ -22,7 +22,6 @@ if TYPE_CHECKING: from mixedbread_ai.core import RequestOptions -@dataclass class MixedbreadAIEmbeddingConfig(EmbeddingConfig): """ Configuration class for Mixedbread AI Embedding Encoder. @@ -32,14 +31,31 @@ class MixedbreadAIEmbeddingConfig(EmbeddingConfig): model_name (str): Name of the model to use for embeddings. 
""" - api_key: str = field( - default_factory=lambda: os.environ.get("MXBAI_API_KEY"), + api_key: SecretStr = Field( + default_factory=lambda: SecretStr(os.environ.get("MXBAI_API_KEY")), ) - model_name: str = field( + model_name: str = Field( default="mixedbread-ai/mxbai-embed-large-v1", ) + @requires_dependencies( + ["mixedbread_ai"], + extras="embed-mixedbreadai", + ) + def get_client(self) -> "MixedbreadAI": + """ + Create the Mixedbread AI client. + + Returns: + MixedbreadAI: Initialized client. + """ + from mixedbread_ai.client import MixedbreadAI + + return MixedbreadAI( + api_key=self.api_key.get_secret_value(), + ) + @dataclass class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder): @@ -52,23 +68,12 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder): config: MixedbreadAIEmbeddingConfig - _client: Optional["MixedbreadAI"] = field(init=False, default=None) _exemplary_embedding: Optional[List[float]] = field(init=False, default=None) _request_options: Optional["RequestOptions"] = field(init=False, default=None) - @property - def client(self) -> "MixedbreadAI": - """Lazy initialization of the Mixedbread AI client.""" - if self._client is None: - self._client = self.create_client() - return self._client - - @property - def exemplary_embedding(self) -> List[float]: + def get_exemplary_embedding(self) -> List[float]: """Get an exemplary embedding to determine dimensions and unit vector status.""" - if self._exemplary_embedding is None: - self._exemplary_embedding = self._embed(["Q"])[0] - return self._exemplary_embedding + return self._embed(["Q"])[0] def initialize(self): if self.config.api_key is None: @@ -89,12 +94,14 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder): @property def num_of_dimensions(self): """Get the number of dimensions for the embeddings.""" - return np.shape(self.exemplary_embedding) + exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) @property def is_unit_vector(self) -> bool: """Check if the embedding is a unit vector.""" - return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) def _embed(self, texts: List[str]) -> List[List[float]]: """ @@ -110,10 +117,10 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder): batch_itr = range(0, len(texts), batch_size) responses = [] - + client = self.config.get_client() for i in batch_itr: batch = texts[i : i + batch_size] - response = self.client.embeddings( + response = client.embeddings( model=self.config.model_name, normalized=True, encoding_format=ENCODING_FORMAT, @@ -169,21 +176,3 @@ class MixedbreadAIEmbeddingEncoder(BaseEmbeddingEncoder): List[float]: Embedding of the query. """ return self._embed([query])[0] - - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["mixedbread_ai"], - extras="embed-mixedbreadai", - ) - def create_client(self) -> "MixedbreadAI": - """ - Create the Mixedbread AI client. - - Returns: - MixedbreadAI: Initialized client. 
- """ - from mixedbread_ai.client import MixedbreadAI - - return MixedbreadAI( - api_key=self.config.api_key, - ) diff --git a/unstructured/embed/octoai.py b/unstructured/embed/octoai.py index e4f7fcb38..119a41bc8 100644 --- a/unstructured/embed/octoai.py +++ b/unstructured/embed/octoai.py @@ -2,57 +2,57 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, List, Optional import numpy as np +from pydantic import Field, SecretStr from unstructured.documents.elements import ( Element, ) from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: from openai import OpenAI -OCTOAI_BASE_URL = "https://text.octoai.run/v1" - -@dataclass class OctoAiEmbeddingConfig(EmbeddingConfig): - api_key: str = enhanced_field(sensitive=True) - model_name: str = "thenlper/gte-large" + api_key: SecretStr + model_name: str = Field(default="thenlper/gte-large") + base_url: str = Field(default="https://text.octoai.run/v1") + + @requires_dependencies( + ["openai", "tiktoken"], + extras="embed-octoai", + ) + def get_client(self) -> "OpenAI": + """Creates an OpenAI python client to embed elements. Uses the OpenAI SDK.""" + from openai import OpenAI + + return OpenAI(api_key=self.api_key.get_secret_value(), base_url=self.base_url) @dataclass class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder): config: OctoAiEmbeddingConfig # Uses the OpenAI SDK - _client: Optional["OpenAI"] = field(init=False, default=None) _exemplary_embedding: Optional[List[float]] = field(init=False, default=None) - @property - def client(self) -> "OpenAI": - if self._client is None: - self._client = self.create_client() - return self._client - - @property - def exemplary_embedding(self) -> List[float]: - if self._exemplary_embedding is None: - self._exemplary_embedding = self.embed_query("Q") - return self._exemplary_embedding + def get_exemplary_embedding(self) -> List[float]: + return self.embed_query("Q") def initialize(self): pass def num_of_dimensions(self): - return np.shape(self.exemplary_embedding) + exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) def is_unit_vector(self): - return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) def embed_query(self, query): - response = self.client.embeddings.create(input=str(query), model=self.config.model_name) + client = self.config.get_client() + response = client.embeddings.create(input=str(query), model=self.config.model_name) return response.data[0].embedding def embed_documents(self, elements: List[Element]) -> List[Element]: @@ -67,14 +67,3 @@ class OctoAIEmbeddingEncoder(BaseEmbeddingEncoder): element.embeddings = embeddings[i] elements_w_embedding.append(element) return elements - - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["openai", "tiktoken"], - extras="embed-octoai", - ) - def create_client(self) -> "OpenAI": - """Creates an OpenAI python client to embed elements. 
Uses the OpenAI SDK.""" - from openai import OpenAI - - return OpenAI(api_key=self.config.api_key, base_url=OCTOAI_BASE_URL) diff --git a/unstructured/embed/openai.py b/unstructured/embed/openai.py index a2f7d6472..ad97c49d9 100644 --- a/unstructured/embed/openai.py +++ b/unstructured/embed/openai.py @@ -1,58 +1,60 @@ -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, List, Optional +from dataclasses import dataclass +from typing import TYPE_CHECKING, List import numpy as np +from pydantic import Field, SecretStr from unstructured.documents.elements import ( Element, ) from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: from langchain_openai.embeddings import OpenAIEmbeddings -@dataclass class OpenAIEmbeddingConfig(EmbeddingConfig): - api_key: str = enhanced_field(sensitive=True) - model_name: str = "text-embedding-ada-002" + api_key: SecretStr + model_name: str = Field(default="text-embedding-ada-002") + + @requires_dependencies(["langchain_openai"], extras="openai") + def get_client(self) -> "OpenAIEmbeddings": + """Creates a langchain OpenAI python client to embed elements.""" + from langchain_openai import OpenAIEmbeddings + + openai_client = OpenAIEmbeddings( + openai_api_key=self.api_key.get_secret_value(), + model=self.model_name, # type:ignore + ) + return openai_client @dataclass class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder): config: OpenAIEmbeddingConfig - _client: Optional["OpenAIEmbeddings"] = field(init=False, default=None) - _exemplary_embedding: Optional[List[float]] = field(init=False, default=None) - @property - def client(self) -> "OpenAIEmbeddings": - if self._client is None: - self._client = self.create_client() - return self._client - - @property - def exemplary_embedding(self) -> List[float]: - if self._exemplary_embedding is None: - self._exemplary_embedding = self.client.embed_query("Q") - return self._exemplary_embedding + def get_exemplary_embedding(self) -> List[float]: + return self.embed_query(query="Q") def initialize(self): pass def num_of_dimensions(self): - return np.shape(self.exemplary_embedding) + exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) def is_unit_vector(self): - return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) def embed_query(self, query): - return self.client.embed_query(str(query)) + client = self.config.get_client() + return client.embed_query(str(query)) def embed_documents(self, elements: List[Element]) -> List[Element]: - embeddings = self.client.embed_documents([str(e) for e in elements]) + client = self.config.get_client() + embeddings = client.embed_documents([str(e) for e in elements]) elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) return elements_with_embeddings @@ -63,15 +65,3 @@ class OpenAIEmbeddingEncoder(BaseEmbeddingEncoder): element.embeddings = embeddings[i] elements_w_embedding.append(element) return elements - - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies(["langchain_openai"], extras="openai") - def create_client(self) -> "OpenAIEmbeddings": - """Creates a langchain OpenAI python client to embed elements.""" - from langchain_openai 
import OpenAIEmbeddings - - openai_client = OpenAIEmbeddings( - openai_api_key=self.config.api_key, - model=self.config.model_name, # type:ignore - ) - return openai_client diff --git a/unstructured/embed/vertexai.py b/unstructured/embed/vertexai.py index edbc8c2ef..5228ed497 100644 --- a/unstructured/embed/vertexai.py +++ b/unstructured/embed/vertexai.py @@ -1,62 +1,71 @@ # type: ignore import json import os -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional import numpy as np +from pydantic import Field, SecretStr from unstructured.documents.elements import ( Element, ) from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import FileHandler, requires_dependencies if TYPE_CHECKING: from langchain_google_vertexai import VertexAIEmbeddings -@dataclass class VertexAIEmbeddingConfig(EmbeddingConfig): - api_key: str = enhanced_field(sensitive=True) - model_name: Optional[str] = "textembedding-gecko@001" + api_key: SecretStr + model_name: Optional[str] = Field(default="textembedding-gecko@001") + + def register_application_credentials(self): + application_credentials_path = os.path.join("/tmp", "google-vertex-app-credentials.json") + credentials_file = FileHandler(application_credentials_path) + credentials_file.write_file(json.dumps(json.loads(self.api_key.get_secret_value()))) + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = application_credentials_path + + @requires_dependencies( + ["langchain", "langchain_google_vertexai"], + extras="embed-vertexai", + ) + def get_client(self) -> "VertexAIEmbeddings": + """Creates a Langchain VertexAI python client to embed elements.""" + from langchain_google_vertexai import VertexAIEmbeddings + + self.register_application_credentials() + vertexai_client = VertexAIEmbeddings(model_name=self.model_name) + return vertexai_client @dataclass class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder): config: VertexAIEmbeddingConfig - _client: Optional["VertexAIEmbeddings"] = field(init=False, default=None) - _exemplary_embedding: Optional[List[float]] = field(init=False, default=None) - @property - def client(self) -> "VertexAIEmbeddings": - if self._client is None: - self._client = self.create_client() - return self._client - - @property - def exemplary_embedding(self) -> List[float]: - if self._exemplary_embedding is None: - self._exemplary_embedding = self.client.embed_query("A sample query.") - return self._exemplary_embedding + def get_exemplary_embedding(self) -> List[float]: + return self.embed_query(query="A sample query.") def initialize(self): pass def num_of_dimensions(self): - return np.shape(self.exemplary_embedding) + exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) def is_unit_vector(self): - return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) def embed_query(self, query): - result = self.client.embed_query(str(query)) + client = self.config.get_client() + result = client.embed_query(str(query)) return result def embed_documents(self, elements: List[Element]) -> List[Element]: - embeddings = self.client.embed_documents([str(e) for e in elements]) + client = self.config.get_client() + embeddings = 
client.embed_documents([str(e) for e in elements]) elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) return elements_with_embeddings @@ -67,25 +76,3 @@ class VertexAIEmbeddingEncoder(BaseEmbeddingEncoder): element.embeddings = embeddings[i] elements_w_embedding.append(element) return elements - - @property - def application_credentials_path(self): - return os.path.join("/tmp", "google-vertex-app-credentials.json") - - def register_application_credentials(self): - credentials_file = FileHandler(self.application_credentials_path) - credentials_file.write_file(json.dumps(json.loads(self.config.api_key))) - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.application_credentials_path - - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["langchain", "langchain_google_vertexai"], - extras="embed-vertexai", - ) - def create_client(self) -> "VertexAIEmbeddings": - """Creates a Langchain VertexAI python client to embed elements.""" - from langchain_google_vertexai import VertexAIEmbeddings - - self.register_application_credentials() - vertexai_client = VertexAIEmbeddings(model_name=self.config.model_name) - return vertexai_client diff --git a/unstructured/embed/voyageai.py b/unstructured/embed/voyageai.py index 56f98d365..c5dd5b61c 100644 --- a/unstructured/embed/voyageai.py +++ b/unstructured/embed/voyageai.py @@ -1,61 +1,67 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional import numpy as np +from pydantic import Field, SecretStr from unstructured.documents.elements import Element from unstructured.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import EmbeddingEncoderConnectionError from unstructured.utils import requires_dependencies if TYPE_CHECKING: from langchain_voyageai import VoyageAIEmbeddings -@dataclass class VoyageAIEmbeddingConfig(EmbeddingConfig): - api_key: str = enhanced_field(sensitive=True) + api_key: SecretStr model_name: str - batch_size: Optional[int] = None - truncation: Optional[bool] = None + batch_size: Optional[int] = Field(default=None) + truncation: Optional[bool] = Field(default=None) + + @requires_dependencies( + ["langchain", "langchain_voyageai"], + extras="embed-voyageai", + ) + def get_client(self) -> "VoyageAIEmbeddings": + """Creates a Langchain VoyageAI python client to embed elements.""" + from langchain_voyageai import VoyageAIEmbeddings + + return VoyageAIEmbeddings( + voyage_api_key=self.api_key, + model=self.model_name, + batch_size=self.batch_size, + truncation=self.truncation, + ) @dataclass class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder): config: VoyageAIEmbeddingConfig - _client: Optional["VoyageAIEmbeddings"] = field(init=False, default=None) - _exemplary_embedding: Optional[List[float]] = field(init=False, default=None) - @property - def client(self) -> "VoyageAIEmbeddings": - if self._client is None: - self._client = self.create_client() - return self._client - - @property - def exemplary_embedding(self) -> List[float]: - if self._exemplary_embedding is None: - self._exemplary_embedding = self.client.embed_query("A sample query.") - return self._exemplary_embedding + def get_exemplary_embedding(self) -> List[float]: + return self.embed_query(query="A sample query.") def initialize(self): pass @property def num_of_dimensions(self) -> tuple[int, ...]: - return np.shape(self.exemplary_embedding) + 
exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) @property def is_unit_vector(self) -> bool: - return np.isclose(np.linalg.norm(self.exemplary_embedding), 1.0) + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) def embed_documents(self, elements: List[Element]) -> List[Element]: - embeddings = self.client.embed_documents([str(e) for e in elements]) + client = self.config.get_client() + embeddings = client.embed_documents([str(e) for e in elements]) return self._add_embeddings_to_elements(elements, embeddings) def embed_query(self, query: str) -> List[float]: - return self.client.embed_query(query) + client = self.config.get_client() + return client.embed_query(query) @staticmethod def _add_embeddings_to_elements(elements, embeddings) -> List[Element]: @@ -65,19 +71,3 @@ class VoyageAIEmbeddingEncoder(BaseEmbeddingEncoder): element.embeddings = embeddings[i] elements_w_embedding.append(element) return elements - - @EmbeddingEncoderConnectionError.wrap - @requires_dependencies( - ["langchain", "langchain_voyageai"], - extras="embed-voyageai", - ) - def create_client(self) -> "VoyageAIEmbeddings": - """Creates a Langchain VoyageAI python client to embed elements.""" - from langchain_voyageai import VoyageAIEmbeddings - - return VoyageAIEmbeddings( - voyage_api_key=self.config.api_key, - model=self.config.model_name, - batch_size=self.config.batch_size, - truncation=self.config.truncation, - ) diff --git a/unstructured/ingest/README.md b/unstructured/ingest/README.md deleted file mode 100644 index f7291aa5a..000000000 --- a/unstructured/ingest/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Ingest -![Project unmaintained](https://img.shields.io/badge/project-unmaintained-red.svg) - -Project has been moved to: [Unstructured Ingest](https://github.com/Unstructured-IO/unstructured-ingest) - -This python module will be removed from this repo in the near future. diff --git a/unstructured/ingest/__init__.py b/unstructured/ingest/__init__.py deleted file mode 100644 index cae55db4a..000000000 --- a/unstructured/ingest/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from __future__ import annotations - -import warnings - -warnings.warn( - "unstructured.ingest will be removed in a future version. 
" - "Functionality moved to the unstructured-ingest project.", - DeprecationWarning, - stacklevel=2, -) diff --git a/unstructured/ingest/cli/__init__.py b/unstructured/ingest/cli/__init__.py deleted file mode 100644 index f3490ae22..000000000 --- a/unstructured/ingest/cli/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -import typing as t - -import click - -from unstructured.ingest.cli.cmds import base_dest_cmd_fns, base_src_cmd_fns - -src: t.List[click.Group] = [v().get_src_cmd() for v in base_src_cmd_fns] - -dest: t.List[click.Command] = [v().get_dest_cmd() for v in base_dest_cmd_fns] - -__all__ = [ - "src", - "dest", -] diff --git a/unstructured/ingest/cli/base/__init__.py b/unstructured/ingest/cli/base/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/cli/base/cmd.py b/unstructured/ingest/cli/base/cmd.py deleted file mode 100644 index f02a81424..000000000 --- a/unstructured/ingest/cli/base/cmd.py +++ /dev/null @@ -1,19 +0,0 @@ -import typing as t -from abc import ABC -from dataclasses import dataclass, field - -from unstructured.ingest.cli.interfaces import CliConfig -from unstructured.ingest.interfaces import BaseConfig - - -@dataclass -class BaseCmd(ABC): - cmd_name: str - cli_config: t.Optional[t.Type[BaseConfig]] = None - additional_cli_options: t.List[t.Type[CliConfig]] = field(default_factory=list) - addition_configs: t.Dict[str, t.Type[BaseConfig]] = field(default_factory=dict) - is_fsspec: bool = False - - @property - def cmd_name_key(self): - return self.cmd_name.replace("-", "_") diff --git a/unstructured/ingest/cli/base/dest.py b/unstructured/ingest/cli/base/dest.py deleted file mode 100644 index 4b3d62739..000000000 --- a/unstructured/ingest/cli/base/dest.py +++ /dev/null @@ -1,87 +0,0 @@ -import logging -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.cmd import BaseCmd -from unstructured.ingest.cli.cmd_factory import get_src_cmd -from unstructured.ingest.cli.common import ( - log_options, -) -from unstructured.ingest.cli.interfaces import BaseConfig, CliFilesStorageConfig -from unstructured.ingest.cli.utils import ( - add_options, - conform_click_options, - extract_config, - extract_configs, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner.writers import writer_map - - -@dataclass -class BaseDestCmd(BaseCmd): - write_config: t.Optional[t.Type[BaseConfig]] = None - - def get_dest_runner(self, source_cmd: str, options: dict, parent_options: dict): - src_cmd_fn = get_src_cmd(cmd_name=source_cmd) - src_cmd = src_cmd_fn() - runner = src_cmd.get_source_runner(options=parent_options) - addition_configs = self.addition_configs - if "connector_config" not in addition_configs: - addition_configs["connector_config"] = self.cli_config - if self.write_config: - addition_configs["write_config"] = self.write_config - configs = extract_configs( - options, - validate=[self.cli_config] if self.cli_config else None, - extras=addition_configs, - add_defaults=False, - ) - writer_cls = writer_map[self.cmd_name_key] - writer = writer_cls(**configs) # type: ignore - runner.writer = writer - runner.writer_kwargs = options - return runner - - def check_dest_options(self, options: dict): - extract_config(flat_data=options, config=self.cli_config) - - def dest(self, ctx: click.Context, **options): - if not ctx.parent: - raise click.ClickException("destination command called without a parent") - if not ctx.parent.info_name: - raise 
click.ClickException("parent command missing info name") - source_cmd = ctx.parent.info_name.replace("-", "_") - parent_options: dict = ctx.parent.params if ctx.parent else {} - conform_click_options(options) - verbose = parent_options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(parent_options, verbose=verbose) - log_options(options, verbose=verbose) - try: - self.check_dest_options(options=options) - runner = self.get_dest_runner( - source_cmd=source_cmd, - options=options, - parent_options=parent_options, - ) - runner.run(**parent_options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - def get_dest_cmd(self) -> click.Command: - # Dynamically create the command without the use of click decorators - fn = self.dest - fn = click.pass_context(fn) - cmd: click.Group = click.command(fn) - cmd.name = self.cmd_name - cmd.invoke_without_command = True - options = [self.cli_config] if self.cli_config else [] - options += self.additional_cli_options - if self.is_fsspec and CliFilesStorageConfig not in options: - options.append(CliFilesStorageConfig) - add_options(cmd, extras=options, is_src=False) - return cmd diff --git a/unstructured/ingest/cli/base/src.py b/unstructured/ingest/cli/base/src.py deleted file mode 100644 index 70acbced4..000000000 --- a/unstructured/ingest/cli/base/src.py +++ /dev/null @@ -1,57 +0,0 @@ -import logging -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.cmd import BaseCmd -from unstructured.ingest.cli.common import ( - log_options, -) -from unstructured.ingest.cli.interfaces import CliFilesStorageConfig -from unstructured.ingest.cli.utils import Group, add_options, conform_click_options, extract_configs -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import runner_map - - -@dataclass -class BaseSrcCmd(BaseCmd): - def get_source_runner(self, options: dict): - addition_configs = self.addition_configs - if "connector_config" not in addition_configs: - addition_configs["connector_config"] = self.cli_config - configs = extract_configs( - options, - validate=[self.cli_config] if self.cli_config else None, - extras=addition_configs, - ) - runner = runner_map[self.cmd_name_key] - return runner(**configs) # type: ignore - - def src(self, ctx: click.Context, **options): - if ctx.invoked_subcommand: - return - - conform_click_options(options) - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options, verbose=verbose) - try: - runner = self.get_source_runner(options=options) - runner.run(**options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - def get_src_cmd(self) -> click.Group: - # Dynamically create the command without the use of click decorators - fn = self.src - fn = click.pass_context(fn) - cmd: click.Group = click.group(fn, cls=Group) - cmd.name = self.cmd_name - cmd.invoke_without_command = True - extra_options = [self.cli_config] if self.cli_config else [] - extra_options += self.additional_cli_options - if self.is_fsspec and CliFilesStorageConfig not in extra_options: - extra_options.append(CliFilesStorageConfig) - add_options(cmd, extras=extra_options) - return cmd diff --git a/unstructured/ingest/cli/cli.py b/unstructured/ingest/cli/cli.py deleted file mode 100644 index fa7c3008e..000000000 --- 
a/unstructured/ingest/cli/cli.py +++ /dev/null @@ -1,32 +0,0 @@ -import click - -from unstructured.ingest.cli import dest, src -from unstructured.ingest.v2.cli.cmds import dest as dest_v2 -from unstructured.ingest.v2.cli.cmds import src as src_v2 - - -@click.group() -def ingest(): - pass - - -def get_cmd() -> click.Command: - """Construct and return a Click command object representing the main command for the CLI. - - This function adds all dest_subcommand(s) to each src_subcommand, and adds all of those - to the main command as nested subcommands. - """ - cmd = ingest - src_dict = {s.name: s for s in src} - dest_dict = {d.name: d for d in dest} - for s in src_v2: - src_dict[s.name] = s - for d in dest_v2: - dest_dict[d.name] = d - # Add all subcommands - for src_subcommand in src_dict.values(): - # Add all destination subcommands - for dest_subcommand in dest_dict.values(): - src_subcommand.add_command(dest_subcommand) - cmd.add_command(src_subcommand) - return cmd diff --git a/unstructured/ingest/cli/cmd_factory.py b/unstructured/ingest/cli/cmd_factory.py deleted file mode 100644 index 3260828cb..000000000 --- a/unstructured/ingest/cli/cmd_factory.py +++ /dev/null @@ -1,12 +0,0 @@ -import typing as t - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.cmds import base_src_cmd_fns - - -def get_src_cmd_map() -> t.Dict[str, t.Callable[[], BaseSrcCmd]]: - return {b().cmd_name_key: b for b in base_src_cmd_fns} - - -def get_src_cmd(cmd_name: str) -> t.Callable[[], BaseSrcCmd]: - return get_src_cmd_map()[cmd_name] diff --git a/unstructured/ingest/cli/cmds/__init__.py b/unstructured/ingest/cli/cmds/__init__.py deleted file mode 100644 index f75ee797e..000000000 --- a/unstructured/ingest/cli/cmds/__init__.py +++ /dev/null @@ -1,145 +0,0 @@ -from __future__ import annotations - -import collections -import typing as t - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_base_src_cmd - -from .airtable import get_base_src_cmd as airtable_base_src_cmd -from .astradb import get_base_dest_cmd as astradb_base_dest_cmd -from .astradb import get_base_src_cmd as astradb_base_src_cmd -from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd -from .biomed import get_base_src_cmd as biomed_base_src_cmd -from .chroma import get_base_dest_cmd as chroma_base_dest_cmd -from .clarifai import get_base_dest_cmd as clarifai_base_dest_cmd -from .confluence import get_base_src_cmd as confluence_base_src_cmd -from .databricks_volumes import get_base_dest_cmd as databricks_volumes_dest_cmd -from .delta_table import get_base_dest_cmd as delta_table_dest_cmd -from .delta_table import get_base_src_cmd as delta_table_base_src_cmd -from .discord import get_base_src_cmd as discord_base_src_cmd -from .elasticsearch import get_base_dest_cmd as elasticsearch_base_dest_cmd -from .elasticsearch import get_base_src_cmd as elasticsearch_base_src_cmd -from .fsspec.azure import get_base_dest_cmd as azure_base_dest_cmd -from .fsspec.azure import get_base_src_cmd as azure_base_src_cmd -from .fsspec.box import get_base_dest_cmd as box_base_dest_cmd -from .fsspec.box import get_base_src_cmd as box_base_src_cmd -from .fsspec.dropbox import get_base_dest_cmd as dropbox_base_dest_cmd -from .fsspec.dropbox import get_base_src_cmd as dropbox_base_src_cmd -from .fsspec.fsspec import get_base_dest_cmd as fsspec_base_dest_cmd -from .fsspec.fsspec import get_base_src_cmd as fsspec_base_src_cmd 
-from .fsspec.gcs import get_base_dest_cmd as gcs_base_dest_cmd -from .fsspec.gcs import get_base_src_cmd as gcs_base_src_cmd -from .fsspec.s3 import get_base_dest_cmd as s3_base_dest_cmd -from .fsspec.s3 import get_base_src_cmd as s3_base_src_cmd -from .github import get_base_src_cmd as github_base_src_cmd -from .gitlab import get_base_src_cmd as gitlab_base_src_cmd -from .google_drive import get_base_src_cmd as google_drive_base_src_cmd -from .hubspot import get_base_src_cmd as hubspot_base_src_cmd -from .jira import get_base_src_cmd as jira_base_src_cmd -from .kafka import get_base_dest_cmd as kafka_base_dest_cmd -from .kafka import get_base_src_cmd as kafka_base_src_cmd -from .local import get_base_src_cmd as local_base_src_cmd -from .mongodb import get_base_dest_cmd as mongo_base_dest_cmd -from .mongodb import get_base_src_cmd as mongodb_base_src_cmd -from .notion import get_base_src_cmd as notion_base_src_cmd -from .onedrive import get_base_src_cmd as onedrive_base_src_cmd -from .opensearch import get_base_dest_cmd as opensearch_base_dest_cmd -from .opensearch import get_base_src_cmd as opensearch_base_src_cmd -from .outlook import get_base_src_cmd as outlook_base_src_cmd -from .pinecone import get_base_dest_cmd as pinecone_base_dest_cmd -from .qdrant import get_base_dest_cmd as qdrant_base_dest_cmd -from .reddit import get_base_src_cmd as reddit_base_src_cmd -from .salesforce import get_base_src_cmd as salesforce_base_src_cmd -from .sharepoint import get_base_src_cmd as sharepoint_base_src_cmd -from .slack import get_base_src_cmd as slack_base_src_cmd -from .sql import get_base_dest_cmd as sql_base_dest_cmd -from .vectara import get_base_dest_cmd as vectara_base_dest_cmd -from .weaviate import get_base_dest_cmd as weaviate_dest_cmd -from .wikipedia import get_base_src_cmd as wikipedia_base_src_cmd - -if t.TYPE_CHECKING: - from unstructured.ingest.cli.base.dest import BaseDestCmd - -base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [ - airtable_base_src_cmd, - astradb_base_src_cmd, - azure_base_src_cmd, - biomed_base_src_cmd, - box_base_src_cmd, - confluence_base_src_cmd, - delta_table_base_src_cmd, - discord_base_src_cmd, - dropbox_base_src_cmd, - elasticsearch_base_src_cmd, - fsspec_base_src_cmd, - gcs_base_src_cmd, - github_base_src_cmd, - gitlab_base_src_cmd, - google_drive_base_src_cmd, - hubspot_base_src_cmd, - jira_base_src_cmd, - kafka_base_src_cmd, - local_base_src_cmd, - mongodb_base_src_cmd, - notion_base_src_cmd, - onedrive_base_src_cmd, - opensearch_base_src_cmd, - outlook_base_src_cmd, - reddit_base_src_cmd, - salesforce_base_src_cmd, - sftp_base_src_cmd, - sharepoint_base_src_cmd, - slack_base_src_cmd, - s3_base_src_cmd, - wikipedia_base_src_cmd, -] - -# Make sure there are not overlapping names -src_cmd_names = [b().cmd_name for b in base_src_cmd_fns] -src_duplicates = [item for item, count in collections.Counter(src_cmd_names).items() if count > 1] -if src_duplicates: - raise ValueError( - "multiple base src commands defined with the same names: {}".format( - ", ".join(src_duplicates), - ), - ) - -base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [ - astradb_base_dest_cmd, - azure_base_dest_cmd, - box_base_dest_cmd, - chroma_base_dest_cmd, - clarifai_base_dest_cmd, - databricks_volumes_dest_cmd, - dropbox_base_dest_cmd, - elasticsearch_base_dest_cmd, - fsspec_base_dest_cmd, - gcs_base_dest_cmd, - kafka_base_dest_cmd, - s3_base_dest_cmd, - azure_cognitive_search_base_dest_cmd, - delta_table_dest_cmd, - sql_base_dest_cmd, - weaviate_dest_cmd, - 
mongo_base_dest_cmd, - pinecone_base_dest_cmd, - qdrant_base_dest_cmd, - opensearch_base_dest_cmd, - vectara_base_dest_cmd, -] - -# Make sure there are not overlapping names -dest_cmd_names = [b().cmd_name for b in base_dest_cmd_fns] -dest_duplicates = [item for item, count in collections.Counter(dest_cmd_names).items() if count > 1] -if dest_duplicates: - raise ValueError( - "multiple base dest commands defined with the same names: {}".format( - ", ".join(dest_duplicates), - ), - ) - -__all__ = [ - "base_src_cmd_fns", - "base_dest_cmd_fns", -] diff --git a/unstructured/ingest/cli/cmds/airtable.py b/unstructured/ingest/cli/cmds/airtable.py deleted file mode 100644 index c7462a707..000000000 --- a/unstructured/ingest/cli/cmds/airtable.py +++ /dev/null @@ -1,69 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.airtable import SimpleAirtableConfig - - -@dataclass -class AirtableCliConfig(SimpleAirtableConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--personal-access-token"], - default=None, - help="Personal access token to authenticate into Airtable. Check: " - "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens " - "for more info", - ), - click.Option( - ["--list-of-paths"], - default=None, - help=""" - A list of paths that specify the locations to ingest data from within Airtable. - - If this argument is not set, the connector ingests all tables within each and every base. - --list-of-paths: path1 path2 path3 …. - path: base_id/table_id(optional)/view_id(optional)/ - - To obtain (base, table, view) ids in bulk, check: - https://airtable.com/developers/web/api/list-bases (base ids) - https://airtable.com/developers/web/api/get-base-schema (table and view ids) - https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids) - - To obtain specific ids from Airtable UI, go to your workspace, and copy any - relevant id from the URL structure: - https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM - appAbcDeF1ghijKlm -> base_id - tblABcdEfG1HIJkLm -> table_id - viwABCDEfg6hijKLM -> view_id - - You can also check: https://support.airtable.com/docs/finding-airtable-ids - - Here is an example for one --list-of-paths: - base1/ → gets the entirety of all tables inside base1 - base1/table1 → gets all rows and columns within table1 in base1 - base1/table1/view1 → gets the rows and columns that are - visible in view1 for the table1 in base1 - - Examples to invalid airtable_paths: - table1 → has to mention base to be valid - base1/view1 → has to mention table to be valid - """, - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="airtable", - cli_config=AirtableCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/astradb.py b/unstructured/ingest/cli/cmds/astradb.py deleted file mode 100644 index b7be8f56c..000000000 --- a/unstructured/ingest/cli/cmds/astradb.py +++ /dev/null @@ -1,99 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import CliConfig, Dict -from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig - - -@dataclass -class AstraDBCliConfig(SimpleAstraDBConfig, CliConfig): - @staticmethod - 
def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--token"], - required=True, - type=str, - help="Astra DB Token with access to the database.", - envvar="ASTRA_DB_APPLICATION_TOKEN", - show_envvar=True, - ), - click.Option( - ["--api-endpoint"], - required=True, - type=str, - help="The API endpoint for the Astra DB.", - envvar="ASTRA_DB_API_ENDPOINT", - show_envvar=True, - ), - click.Option( - ["--collection-name"], - required=False, - type=str, - help="The name of the Astra DB collection. " - "Note that the collection name must only include letters, " - "numbers, and underscores.", - ), - click.Option( - ["--namespace"], - required=False, - default=None, - type=str, - help="The Astra DB connection namespace.", - ), - ] - return options - - -@dataclass -class AstraDBCliWriteConfig(AstraDBWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--embedding-dimension"], - required=True, - default=384, - type=int, - help="The dimensionality of the embeddings", - ), - click.Option( - ["--requested-indexing-policy"], - required=False, - default=None, - type=Dict(), - help="The indexing policy to use for the collection." - 'example: \'{"deny": ["metadata"]}\' ', - ), - click.Option( - ["--batch-size"], - default=20, - type=int, - help="Number of records per batch", - ), - ] - return options - - -def get_base_src_cmd(): - from unstructured.ingest.cli.base.src import BaseSrcCmd - - cmd_cls = BaseSrcCmd( - cmd_name="astradb", - cli_config=AstraDBCliConfig, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="astradb", - cli_config=AstraDBCliConfig, - additional_cli_options=[AstraDBCliWriteConfig], - write_config=AstraDBWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/azure_cognitive_search.py b/unstructured/ingest/cli/cmds/azure_cognitive_search.py deleted file mode 100644 index 029519fb8..000000000 --- a/unstructured/ingest/cli/cmds/azure_cognitive_search.py +++ /dev/null @@ -1,65 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.azure_cognitive_search import ( - AzureCognitiveSearchWriteConfig, - SimpleAzureCognitiveSearchStorageConfig, -) - - -@dataclass -class AzureCognitiveSearchCliConfig(SimpleAzureCognitiveSearchStorageConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--key"], - required=True, - type=str, - help="Key credential used for authenticating to an Azure service.", - envvar="AZURE_SEARCH_API_KEY", - show_envvar=True, - ), - click.Option( - ["--endpoint"], - required=True, - type=str, - help="The URL endpoint of an Azure search service. 
" - "In the form of https://{{service_name}}.search.windows.net", - envvar="AZURE_SEARCH_ENDPOINT", - show_envvar=True, - ), - ] - return options - - -@dataclass -class AzureCognitiveSearchCliWriteConfig(AzureCognitiveSearchWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--index"], - required=True, - type=str, - help="The name of the index to connect to", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="azure-cognitive-search", - cli_config=AzureCognitiveSearchCliConfig, - additional_cli_options=[AzureCognitiveSearchCliWriteConfig], - write_config=AzureCognitiveSearchCliWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/biomed.py b/unstructured/ingest/cli/cmds/biomed.py deleted file mode 100644 index bafe403f3..000000000 --- a/unstructured/ingest/cli/cmds/biomed.py +++ /dev/null @@ -1,52 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.biomed import SimpleBiomedConfig - - -@dataclass -class BiomedCliConfig(SimpleBiomedConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--api-id"], - default=None, - help="ID parameter for OA Web Service API.", - ), - click.Option( - ["--api-from"], - default=None, - help="From parameter for OA Web Service API.", - ), - click.Option( - ["--api-until"], - default=None, - help="Until parameter for OA Web Service API.", - ), - click.Option( - ["--path"], - default=None, - help="PMC Open Access FTP Directory Path.", - ), - click.Option( - ["--max-request-time"], - default=45, - help="(In seconds) Max request time to OA Web Service API.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="biomed", - cli_config=BiomedCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/chroma.py b/unstructured/ingest/cli/cmds/chroma.py deleted file mode 100644 index c4a5cbcce..000000000 --- a/unstructured/ingest/cli/cmds/chroma.py +++ /dev/null @@ -1,104 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import CliConfig, Dict -from unstructured.ingest.connector.chroma import ChromaWriteConfig, SimpleChromaConfig - - -@dataclass -class ChromaCliConfig(SimpleChromaConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--path"], - required=False, - type=str, - help="Location where Chroma is persisted," "if not connecting via http.", - ), - click.Option( - ["--settings"], - required=False, - type=Dict(), - help="A dictionary of settings to communicate with the chroma server." - 'example: \'{"persist_directory":"./chroma-persist"}\' ', - ), - click.Option( - ["--tenant"], - required=False, - default="default_tenant", - type=str, - help="The tenant to use for this client. Chroma defaults to 'default_tenant'.", - ), - click.Option( - ["--database"], - required=False, - default="default_database", - type=str, - help="The database to use for this client." 
- "Chroma defaults to 'default_database'.", - ), - click.Option( - ["--host"], - required=False, - type=str, - help="The hostname of the Chroma server.", - ), - click.Option( - ["--port"], - required=False, - type=int, - help="The port of the Chroma server.", - ), - click.Option( - ["--ssl"], - required=False, - default=False, - is_flag=True, - type=bool, - help="Whether to use SSL to connect to the Chroma server.", - ), - click.Option( - ["--headers"], - required=False, - type=Dict(), - help="A dictionary of headers to send to the Chroma server." - 'example: \'{"Authorization":"Basic()"}\' ', - ), - click.Option( - ["--collection-name"], - required=True, - type=str, - help="The name of the Chroma collection to write into.", - ), - ] - return options - - -@dataclass -class ChromaCliWriteConfig(ChromaWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="chroma", - cli_config=ChromaCliConfig, - additional_cli_options=[ChromaCliWriteConfig], - write_config=ChromaWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/clarifai.py b/unstructured/ingest/cli/cmds/clarifai.py deleted file mode 100644 index 23178d172..000000000 --- a/unstructured/ingest/cli/cmds/clarifai.py +++ /dev/null @@ -1,71 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import CliConfig -from unstructured.ingest.connector.clarifai import ( - ClarifaiWriteConfig, - SimpleClarifaiConfig, -) - -CMD_NAME = "clarifai" - - -@dataclass -class ClarifaiCliConfig(SimpleClarifaiConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--api-key"], - required=True, - type=str, - help="The CLARIFAI_PAT of the user to access clarifai platform apps and models", - envvar="CLARIFAI_PAT", - show_envvar=True, - ), - click.Option( - ["--app-id"], - required=True, - type=str, - help="Clarifai app name/id", - ), - click.Option( - ["--user-id"], - required=True, - type=str, - help="Clarifai User name/ID", - ), - click.Option( - ["--dataset-id"], type=str, default=None, help="Clarifai App Dataset ID (optional)" - ), - ] - return options - - -@dataclass -class ClarifaiCliWriteConfig(ClarifaiWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.option]: - options = [ - click.Option( - ["--batch-size"], - type=int, - default=50, - help="No of inputs upload per batch", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=ClarifaiCliConfig, - additional_cli_options=[ClarifaiCliWriteConfig], - write_config=ClarifaiWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/confluence.py b/unstructured/ingest/cli/cmds/confluence.py deleted file mode 100644 index 1fc43d2ae..000000000 --- a/unstructured/ingest/cli/cmds/confluence.py +++ /dev/null @@ -1,69 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - DelimitedString, -) -from unstructured.ingest.connector.confluence import 
SimpleConfluenceConfig - - -@dataclass -class ConfluenceCliConfig(SimpleConfluenceConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--api-token"], - required=True, - help="API Token to authenticate into Confluence Cloud. " - "Check " - "https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/ " - "for more info.", - ), - click.Option( - ["--url"], - required=True, - help='URL to Confluence Cloud, e.g. "unstructured-ingest-test.atlassian.net"', - ), - click.Option( - ["--user-email"], - required=True, - help="Email to authenticate into Confluence Cloud", - ), - click.Option( - ["--spaces"], - default=None, - type=DelimitedString(), - help="A list of confluence space ids to be fetched. From each fetched space, " - "--num-of-docs-from-each-space number of docs will be ingested. " - "--spaces and --num-of-spaces cannot be used at the same time", - ), - click.Option( - ["--max-num-of-docs-from-each-space"], - default=100, - help="Number of documents to be aimed to be ingested from each fetched " - "confluence space. If any space has fewer documents, all the documents from " - "that space will be ingested. Documents are not necessarily " - "ingested in order of creation date.", - ), - click.Option( - ["--max-num-of-spaces"], - default=500, - help="Number of confluence space ids to be fetched. From each fetched space, " - "--num-of-docs-from-each-space number of docs will be ingested. " - "--spaces and --num-of-spaces cannot be used at the same time", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="confluence", - cli_config=ConfluenceCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/databricks_volumes.py b/unstructured/ingest/cli/cmds/databricks_volumes.py deleted file mode 100644 index faea5e0d4..000000000 --- a/unstructured/ingest/cli/cmds/databricks_volumes.py +++ /dev/null @@ -1,163 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import CliConfig -from unstructured.ingest.connector.databricks_volumes import ( - DatabricksVolumesWriteConfig, - SimpleDatabricksVolumesConfig, -) - -CMD_NAME = "databricks-volumes" - - -@dataclass -class DatabricksVolumesCliConfig(SimpleDatabricksVolumesConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--host"], - type=str, - default=None, - help="The Databricks host URL for either the " - "Databricks workspace endpoint or the " - "Databricks accounts endpoint.", - ), - click.Option( - ["--account-id"], - type=str, - default=None, - help="The Databricks account ID for the Databricks " - "accounts endpoint. Only has effect when Host is " - "either https://accounts.cloud.databricks.com/ (AWS), " - "https://accounts.azuredatabricks.net/ (Azure), " - "or https://accounts.gcp.databricks.com/ (GCP).", - ), - click.Option( - ["--username"], - type=str, - default=None, - help="The Databricks username part of basic authentication. " - "Only possible when Host is *.cloud.databricks.com (AWS).", - ), - click.Option( - ["--password"], - type=str, - default=None, - help="The Databricks password part of basic authentication. 
" - "Only possible when Host is *.cloud.databricks.com (AWS).", - ), - click.Option(["--client-id"], type=str, default=None), - click.Option(["--client-secret"], type=str, default=None), - click.Option( - ["--token"], - type=str, - default=None, - help="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or " - "Azure Active Directory (Azure AD) token (Azure).", - ), - click.Option( - ["--azure-workspace-resource-id"], - type=str, - default=None, - help="The Azure Resource Manager ID for the Azure Databricks workspace, " - "which is exchanged for a Databricks host URL.", - ), - click.Option( - ["--azure-client-secret"], - type=str, - default=None, - help="The Azure AD service principal’s client secret.", - ), - click.Option( - ["--azure-client-id"], - type=str, - default=None, - help="The Azure AD service principal’s application ID.", - ), - click.Option( - ["--azure-tenant-id"], - type=str, - default=None, - help="The Azure AD service principal’s tenant ID.", - ), - click.Option( - ["--azure-environment"], - type=str, - default=None, - help="The Azure environment type (such as Public, UsGov, China, and Germany) for a " - "specific set of API endpoints. Defaults to PUBLIC.", - ), - click.Option( - ["--auth-type"], - type=str, - default=None, - help="When multiple auth attributes are available in the " - "environment, use the auth type specified by this " - "argument. This argument also holds the currently " - "selected auth.", - ), - click.Option(["--cluster-id"], type=str, default=None), - click.Option(["--google-credentials"], type=str, default=None), - click.Option(["--google-service-account"], type=str, default=None), - ] - return options - - -@dataclass -class DatabricksVolumesCliWriteConfig(DatabricksVolumesWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--volume"], type=str, required=True, help="Name of volume in the Unity Catalog" - ), - click.Option( - ["--catalog"], - type=str, - required=True, - help="Name of the catalog in the Databricks Unity Catalog service", - ), - click.Option( - ["--volume-path"], - type=str, - required=False, - default=None, - help="Optional path within the volume to write to", - ), - click.Option( - ["--overwrite"], - type=bool, - is_flag=True, - help="If true, an existing file will be overwritten.", - ), - click.Option( - ["--encoding"], - type=str, - required=True, - default="utf-8", - help="Encoding applied to the data when written to the volume", - ), - click.Option( - ["--schema"], - type=str, - required=True, - default="default", - help="Schema associated with the volume to write to in the Unity Catalog service", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=DatabricksVolumesCliConfig, - additional_cli_options=[DatabricksVolumesCliWriteConfig], - write_config=DatabricksVolumesWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/delta_table.py b/unstructured/ingest/cli/cmds/delta_table.py deleted file mode 100644 index 8504c09b0..000000000 --- a/unstructured/ingest/cli/cmds/delta_table.py +++ /dev/null @@ -1,94 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import CliConfig, Dict -from unstructured.ingest.connector.delta_table import DeltaTableWriteConfig, SimpleDeltaTableConfig - 
-CMD_NAME = "delta-table" - - -@dataclass -class DeltaTableCliConfig(SimpleDeltaTableConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--table-uri"], - required=True, - help="the path of the DeltaTable", - ), - click.Option( - ["--version"], - default=None, - type=int, - help="version of the DeltaTable", - ), - click.Option( - ["--storage_options"], - required=False, - type=Dict(), - default=None, - help="a dictionary of the options to use for the storage backend, " - "passed in as a json string", - ), - click.Option( - ["--without-files"], - is_flag=True, - default=False, - help="If set, will load table without tracking files.", - ), - ] - return options - - -@dataclass -class DeltaTableCliWriteConfig(DeltaTableWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--overwrite-schema"], - is_flag=True, - default=False, - help="Flag to overwrite schema of destination table", - ), - click.Option( - ["--drop-empty-cols"], - is_flag=True, - default=False, - help="Flag to drop any columns that have no content", - ), - click.Option( - ["--mode"], - default="error", - type=click.Choice(["error", "append", "overwrite", "ignore"]), - help="How to handle existing data. Default is to error if table already exists. " - "If 'append', will add new data. " - "If 'overwrite', will replace table with new data. " - "If 'ignore', will not write anything if table already exists.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=DeltaTableCliConfig, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=DeltaTableCliConfig, - additional_cli_options=[DeltaTableCliWriteConfig], - write_config=DeltaTableWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/discord.py b/unstructured/ingest/cli/cmds/discord.py deleted file mode 100644 index 115745a6a..000000000 --- a/unstructured/ingest/cli/cmds/discord.py +++ /dev/null @@ -1,47 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - DelimitedString, -) -from unstructured.ingest.connector.discord import SimpleDiscordConfig - - -@dataclass -class DiscordCliConfig(SimpleDiscordConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--token"], - required=True, - help="Bot token used to access Discord API, must have " - "READ_MESSAGE_HISTORY scope for the bot user", - ), - click.Option( - ["--channels"], - required=True, - type=DelimitedString(), - help="Comma-delimited list of discord channel ids to ingest from.", - ), - click.Option( - ["--period"], - default=None, - type=click.IntRange(0), - help="Number of days to go back in the history of " - "discord channels, must be a number", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="discord", - cli_config=DiscordCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/elasticsearch.py b/unstructured/ingest/cli/cmds/elasticsearch.py deleted file mode 100644 index 58e3ec4d6..000000000 --- a/unstructured/ingest/cli/cmds/elasticsearch.py +++ /dev/null @@ -1,133 +0,0 @@ -import typing as t -from 
dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString -from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchWriteConfig, - SimpleElasticsearchConfig, -) - -CMD_NAME = "elasticsearch" - - -@dataclass -class ElasticsearchCliConfig(SimpleElasticsearchConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--index-name"], - required=True, - type=str, - help="Name of the Elasticsearch index to pull data from, or upload data to.", - ), - click.Option( - ["--hosts"], - type=DelimitedString(), - help='List of the Elasticsearch hosts to connect to, e.g. "http://localhost:9200"', - ), - click.Option( - ["--fields"], - type=DelimitedString(), - default=[], - help="If provided, will limit the fields returned by Elasticsearch " - "to this comma-delimited list", - ), - click.Option( - ["--username"], type=str, default=None, help="username when using basic auth" - ), - click.Option( - ["--password"], - type=str, - default=None, - help="password when using basic auth or connecting to a cloud instance", - ), - click.Option( - ["--cloud-id"], type=str, default=None, help="id used to connect to Elastic Cloud" - ), - click.Option( - ["--es-api-key"], type=str, default=None, help="api key used for authentication" - ), - click.Option( - ["--api-key-id"], - type=str, - default=None, - help="id associated with api key used for authentication: " - "https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html", # noqa: E501 - ), - click.Option( - ["--bearer-auth"], - type=str, - default=None, - help="bearer token used for HTTP bearer authentication", - ), - click.Option( - ["--ca-certs"], - type=click.Path(), - default=None, - ), - click.Option( - ["--ssl-assert-fingerprint"], - type=str, - default=None, - help="SHA256 fingerprint value", - ), - click.Option( - ["--batch-size"], - default=100, - type=click.IntRange(0), - help="how many records to read at a time per process", - ), - ] - return options - - -@dataclass -class ElasticsearchCliWriteConfig(ElasticsearchWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--batch-size-bytes"], - required=False, - default=15_000_000, - type=int, - help="Size limit (in bytes) for each batch of items to be uploaded. 
Check" - " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html" - "#_how_big_is_too_big for more information.", - ), - click.Option( - ["--num-processes"], - required=False, - default=1, - type=int, - help="Number of processes to be used while uploading content", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="elasticsearch", - cli_config=ElasticsearchCliConfig, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="elasticsearch", - cli_config=ElasticsearchCliConfig, - additional_cli_options=[ElasticsearchCliWriteConfig], - addition_configs={ - "connector_config": SimpleElasticsearchConfig, - "write_config": ElasticsearchCliWriteConfig, - }, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec/__init__.py b/unstructured/ingest/cli/cmds/fsspec/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/cli/cmds/fsspec/azure.py b/unstructured/ingest/cli/cmds/fsspec/azure.py deleted file mode 100644 index 0d5f04344..000000000 --- a/unstructured/ingest/cli/cmds/fsspec/azure.py +++ /dev/null @@ -1,94 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.fsspec.azure import ( - AzureWriteConfig, - SimpleAzureBlobStorageConfig, -) - -CMD_NAME = "azure" - - -@dataclass -class AzureCliConfig(SimpleAzureBlobStorageConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--account-key"], - default=None, - help="The storage account key. This is used for shared key " - "authentication. If any of account key, sas token or " - "client_id are not specified, anonymous access will be used.", - ), - click.Option( - ["--account-name"], - default=None, - help="The storage account name. This is used to authenticate " - "requests signed with an account key and to construct " - "the storage endpoint. It is required unless a connection " - "string is given, or if a custom domain is used with " - "anonymous authentication.", - ), - click.Option( - ["--connection-string"], - default=None, - help="If specified, this will override all other parameters. See " - "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501 - "for the connection string format.", - ), - click.Option( - ["--sas_token"], - default=None, - help="A shared access signature token to use to authenticate " - "requests instead of the account key. If account key and " - "sas token are both specified, account key will be used " - "to sign. 
If any of account key, sas token or client_id " - "are not specified, anonymous access will be used.", - ), - ] - return options - - -@dataclass -class AzureCliWriteConfig(AzureWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--overwrite"], - is_flag=True, - default=False, - show_default=True, - help="If set, will overwrite content if content already exists", - ) - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=AzureCliConfig, - is_fsspec=True, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=AzureCliConfig, - write_config=AzureCliWriteConfig, - is_fsspec=True, - additional_cli_options=[AzureCliWriteConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec/box.py b/unstructured/ingest/cli/cmds/fsspec/box.py deleted file mode 100644 index 0d7976350..000000000 --- a/unstructured/ingest/cli/cmds/fsspec/box.py +++ /dev/null @@ -1,48 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.fsspec.box import BoxWriteConfig, SimpleBoxConfig - -CMD_NAME = "box" - - -@dataclass -class BoxCliConfig(SimpleBoxConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--box-app-config"], - default=None, - type=click.Path(), - help="Path to Box app credentials as json file.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=BoxCliConfig, - is_fsspec=True, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=BoxCliConfig, - write_config=BoxWriteConfig, - is_fsspec=True, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec/dropbox.py b/unstructured/ingest/cli/cmds/fsspec/dropbox.py deleted file mode 100644 index 247643016..000000000 --- a/unstructured/ingest/cli/cmds/fsspec/dropbox.py +++ /dev/null @@ -1,51 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.fsspec.dropbox import ( - DropboxWriteConfig, - SimpleDropboxConfig, -) - -CMD_NAME = "dropbox" - - -@dataclass -class DropboxCliConfig(SimpleDropboxConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--token"], - required=True, - type=str, - help="Dropbox access token.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=DropboxCliConfig, - is_fsspec=True, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=DropboxCliConfig, - write_config=DropboxWriteConfig, - is_fsspec=True, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec/fsspec.py b/unstructured/ingest/cli/cmds/fsspec/fsspec.py deleted file mode 100644 index e2d50a278..000000000 --- 
a/unstructured/ingest/cli/cmds/fsspec/fsspec.py +++ /dev/null @@ -1,15 +0,0 @@ -from unstructured.ingest.cli.base.src import BaseSrcCmd - -CMD_NAME = "fsspec" - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd(cmd_name=CMD_NAME, is_fsspec=True) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd(cmd_name=CMD_NAME, is_fsspec=True) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec/gcs.py b/unstructured/ingest/cli/cmds/fsspec/gcs.py deleted file mode 100644 index 4664694a7..000000000 --- a/unstructured/ingest/cli/cmds/fsspec/gcs.py +++ /dev/null @@ -1,71 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - FileOrJson, -) -from unstructured.ingest.connector.fsspec.gcs import GcsWriteConfig, SimpleGcsConfig - -CMD_NAME = "gcs" - - -@dataclass -class GcsCliConfig(SimpleGcsConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - help_string = """ - Options: - - ``None``, GCSFS will attempt to guess your credentials in the - following order: gcloud CLI default, gcsfs cached token, google compute - metadata service, anonymous. - - ``'google_default'``, your default gcloud credentials will be used, - which are typically established by doing ``gcloud login`` in a terminal. - - ``'cache'``, credentials from previously successful gcsfs - authentication will be used (use this after "browser" auth succeeded) - - ``'anon'``, no authentication is performed, and you can only - access data which is accessible to allUsers (in this case, the project and - access level parameters are meaningless) - - ``'browser'``, you get an access code with which you can - authenticate via a specially provided URL - - if ``'cloud'``, we assume we are running within google compute - or google container engine, and query the internal metadata directly for - a token. - - you may supply a token generated by the - [gcloud](https://cloud.google.com/sdk/docs/) - utility; this is either a python dictionary or the name of a file - containing the JSON returned by logging in with the gcloud CLI tool. 
- """ - options = [ - click.Option( - ["--service-account-key"], - default=None, - type=FileOrJson(allow_raw_str=True), - help=help_string, - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=GcsCliConfig, - is_fsspec=True, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=GcsCliConfig, - write_config=GcsWriteConfig, - is_fsspec=True, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec/s3.py b/unstructured/ingest/cli/cmds/fsspec/s3.py deleted file mode 100644 index a185fa2e1..000000000 --- a/unstructured/ingest/cli/cmds/fsspec/s3.py +++ /dev/null @@ -1,74 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.fsspec.s3 import S3WriteConfig, SimpleS3Config - -CMD_NAME = "s3" - - -@dataclass -class S3CliConfig(SimpleS3Config, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--anonymous"], - is_flag=True, - default=False, - help="Connect to s3 without local AWS credentials.", - ), - click.Option( - ["--endpoint-url"], - type=str, - default=None, - help="Use this endpoint_url, if specified. Needed for " - "connecting to non-AWS S3 buckets.", - ), - click.Option( - ["--key"], - type=str, - default=None, - help="If not anonymous, use this access key ID, if specified. Takes precedence " - "over `aws_access_key_id` in client_kwargs.", - ), - click.Option( - ["--secret"], - type=str, - default=None, - help="If not anonymous, use this secret access key, if specified.", - ), - click.Option( - ["--token"], - type=str, - default=None, - help="If not anonymous, use this security token, if specified.", - ), - ] - return options - - -def get_base_src_cmd(): - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=S3CliConfig, - is_fsspec=True, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=S3CliConfig, - write_config=S3WriteConfig, - is_fsspec=True, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/fsspec/sftp.py b/unstructured/ingest/cli/cmds/fsspec/sftp.py deleted file mode 100644 index 01f7c615a..000000000 --- a/unstructured/ingest/cli/cmds/fsspec/sftp.py +++ /dev/null @@ -1,58 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.fsspec.sftp import SimpleSftpConfig - -CMD_NAME = "sftp" - - -@dataclass -class SftpCliConfig(SimpleSftpConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--username"], - required=True, - type=str, - help="Username for sftp connection", - ), - click.Option( - ["--password"], - required=True, - type=str, - help="Password for sftp connection", - ), - click.Option( - ["--look-for-keys"], - required=False, - default=False, - is_flag=True, - type=bool, - help="Whether to search for private key files in ~/.ssh/", - ), - click.Option( - ["--allow-agent"], - required=False, - default=False, - is_flag=True, - type=bool, - help="Whether to connect 
to the SSH agent.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=SftpCliConfig, - is_fsspec=True, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/github.py b/unstructured/ingest/cli/cmds/github.py deleted file mode 100644 index bb3f1b7f0..000000000 --- a/unstructured/ingest/cli/cmds/github.py +++ /dev/null @@ -1,54 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString -from unstructured.ingest.connector.github import SimpleGitHubConfig - - -@dataclass -class GithubCliConfig(SimpleGitHubConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--url"], - required=True, - type=str, - help="URL to GitHub repository, e.g. " - '"https://github.com/Unstructured-IO/unstructured", or ' - 'a repository owner/name pair, e.g. "Unstructured-IO/unstructured"', - ), - click.Option( - ["--git-access-token"], - default=None, - help="A GitHub or GitLab access token, " - "see https://docs.github.com/en/authentication or " - "https://docs.gitlab.com/ee/api/rest/index.html#personalprojectgroup-access-tokens", - ), - click.Option( - ["--git-branch"], - default=None, - type=str, - help="The branch for which to fetch files from. If not given," - " the default repository branch is used.", - ), - click.Option( - ["--git-file-glob"], - default=None, - type=DelimitedString(), - help="A comma-separated list of file globs to limit which " - "types of files are accepted, e.g. '*.html,*.txt'", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="github", - cli_config=GithubCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/gitlab.py b/unstructured/ingest/cli/cmds/gitlab.py deleted file mode 100644 index 5f01c4201..000000000 --- a/unstructured/ingest/cli/cmds/gitlab.py +++ /dev/null @@ -1,54 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString -from unstructured.ingest.connector.gitlab import SimpleGitlabConfig - - -@dataclass -class GitlabCliConfig(SimpleGitlabConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--url"], - required=True, - type=str, - help="URL to GitHub repository, e.g. " - '"https://github.com/Unstructured-IO/unstructured", or ' - 'a repository owner/name pair, e.g. "Unstructured-IO/unstructured"', - ), - click.Option( - ["--git-access-token"], - default=None, - help="A GitHub or GitLab access token, " - "see https://docs.github.com/en/authentication or " - "https://docs.gitlab.com/ee/api/rest/index.html#personalprojectgroup-access-tokens", - ), - click.Option( - ["--git-branch"], - default=None, - type=str, - help="The branch for which to fetch files from. If not given," - " the default repository branch is used.", - ), - click.Option( - ["--git-file-glob"], - default=None, - type=DelimitedString(), - help="A comma-separated list of file globs to limit which types of " - "files are accepted, e.g. 
'*.html,*.txt'", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="gitlab", - cli_config=GitlabCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/google_drive.py b/unstructured/ingest/cli/cmds/google_drive.py deleted file mode 100644 index 6fc9b1930..000000000 --- a/unstructured/ingest/cli/cmds/google_drive.py +++ /dev/null @@ -1,49 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - CliRecursiveConfig, - FileOrJson, -) -from unstructured.ingest.connector.google_drive import SimpleGoogleDriveConfig - - -@dataclass -class GoogleDriveCliConfig(SimpleGoogleDriveConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--drive-id"], - required=True, - type=str, - help="Google Drive File or Folder ID.", - ), - click.Option( - ["--service-account-key"], - required=True, - type=FileOrJson(), - help="Either the file path of the credentials file to use or a json string of " - "those values to use for authentication", - ), - click.Option( - ["--extension"], - default=None, - type=str, - help="Filters the files to be processed based on extension e.g. .jpg, .docx, etc.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="google-drive", - cli_config=GoogleDriveCliConfig, - additional_cli_options=[CliRecursiveConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/hubspot.py b/unstructured/ingest/cli/cmds/hubspot.py deleted file mode 100644 index 219973cb7..000000000 --- a/unstructured/ingest/cli/cmds/hubspot.py +++ /dev/null @@ -1,70 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import CliMixin, DelimitedString, Dict -from unstructured.ingest.connector.hubspot import HubSpotObjectTypes, SimpleHubSpotConfig - -OBJECT_TYPES = {t.value for t in HubSpotObjectTypes} - - -def validate_custom_property(ctx, param, value) -> t.Dict[str, t.List[str]]: - if not value: - return value - for k in value: - if k not in OBJECT_TYPES: - raise ValueError(f"Invalid object type: {k}, must be one of {OBJECT_TYPES}") - if not isinstance(value[k], list): - raise ValueError(f"Invalid type: {type(value[k])}, must be a Python list.") - return value - - -@dataclass -class HubSpotCliConfig(SimpleHubSpotConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--api-token"], - required=True, - type=str, - help="Access token to perform operations on Hubspot. \ - Check \ - https://developers.hubspot.com/docs/api/private-apps/ \ - for more info", - ), - click.Option( - ["--object-types"], - default=None, - required=False, - type=DelimitedString(choices=OBJECT_TYPES), - is_flag=False, - help=f"Object to include in the process.\ - Must be a subset of {','.join(OBJECT_TYPES)}.\ - If the argument is omitted all objects listed will be processed.", - ), - click.Option( - ["--custom-properties"], - default=None, - required=False, - type=Dict(), - is_flag=False, - callback=validate_custom_property, - help="Custom property to process information from.\ - It should be a json-like string in the form\ - :[, ..., ]\ - Must be internal name of the variable. 
If the property is missing, \ - it will be omitted.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="hubspot", - cli_config=HubSpotCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/jira.py b/unstructured/ingest/cli/cmds/jira.py deleted file mode 100644 index 74b2d5356..000000000 --- a/unstructured/ingest/cli/cmds/jira.py +++ /dev/null @@ -1,71 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - DelimitedString, -) -from unstructured.ingest.connector.jira import SimpleJiraConfig - - -@dataclass -class JiraCliConfig(SimpleJiraConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--api-token"], - required=True, - type=str, - help="API Token to authenticate into Jira (into Atlassian). \ - Check \ - https://developer.atlassian.com/cloud/jira/platform/basic-auth-for-rest-apis/ \ - for more info.", - ), - click.Option( - ["--url"], - required=True, - type=str, - help="URL to Atlassian (Jira) Cloud, e.g. " - '"unstructured-jira-connector-test.atlassian.net"', - ), - click.Option( - ["--user-email"], - required=True, - type=str, - help="Email to authenticate into Atlassian (Jira) Cloud.", - ), - click.Option( - ["--projects"], - default=None, - type=DelimitedString(), - help="Comma-delimited Project ids or keys. Use Jira UI or the " - "API to find or obtain keys. Alternatively, use API to obtain ids.", - ), - click.Option( - ["--boards"], - default=None, - type=DelimitedString(), - help="Comma-delimited Board ids. Check board URL, or use the " - "API to find the board ids.", - ), - click.Option( - ["--issues"], - default=None, - type=DelimitedString(), - help="Comma-delimited Issue ids or keys. Use Jira UI or the API to " - "find or obtain keys. 
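# The HubSpot options above pair a JSON-dict parameter type (the library's Dict
# helper) with a click callback, validate_custom_property, that checks the
# mapping before ingestion starts. The self-contained sketch below shows how
# that wiring works. JsonDict is a hypothetical stand-in for the Dict type, the
# OBJECT_TYPES set and the property name in the demo argv are illustrative
# values rather than the real HubSpotObjectTypes contents, and the validator
# raises click.BadParameter where the deleted code raised ValueError.
import json
import typing as t

import click

OBJECT_TYPES = {"calls", "emails", "tickets"}  # illustrative subset


class JsonDict(click.ParamType):
    """Parse a JSON object passed as a single CLI string into a dict."""

    name = "json-dict"

    def convert(self, value, param, ctx):
        if value is None or isinstance(value, dict):
            return value
        try:
            parsed = json.loads(value)
        except json.JSONDecodeError as exc:
            self.fail(f"{value!r} is not valid JSON: {exc}", param, ctx)
        if not isinstance(parsed, dict):
            self.fail(f"expected a JSON object, got {type(parsed).__name__}", param, ctx)
        return parsed


def validate_custom_properties(ctx, param, value) -> t.Optional[t.Dict[str, t.List[str]]]:
    """Mirror of the deleted validator: keys must be known object types and
    each value must be a list of property names."""
    if not value:
        return value
    for key, props in value.items():
        if key not in OBJECT_TYPES:
            raise click.BadParameter(f"invalid object type: {key}, must be one of {OBJECT_TYPES}")
        if not isinstance(props, list):
            raise click.BadParameter(f"properties for {key!r} must be a list, got {type(props).__name__}")
    return value


cmd = click.Command(
    name="hubspot-demo",
    params=[
        click.Option(
            ["--custom-properties"],
            type=JsonDict(),
            callback=validate_custom_properties,
            default=None,
        )
    ],
    callback=lambda **kwargs: click.echo(kwargs),
)

if __name__ == "__main__":
    cmd.main(["--custom-properties", '{"tickets": ["priority"]}'], standalone_mode=False)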
Alternatively, use API to obtain ids.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="jira", - cli_config=JiraCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/kafka.py b/unstructured/ingest/cli/cmds/kafka.py deleted file mode 100644 index afbad4888..000000000 --- a/unstructured/ingest/cli/cmds/kafka.py +++ /dev/null @@ -1,102 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import CliConfig -from unstructured.ingest.connector.kafka import KafkaWriteConfig, SimpleKafkaConfig - -CMD_NAME = "kafka" - - -@dataclass -class KafkaCliConfig(SimpleKafkaConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--bootstrap-server"], required=True, type=str, help="Broker server hostname" - ), - click.Option( - ["--port"], - required=True, - type=str, - help="The bootstrap port", - ), - click.Option( - ["--topic"], - required=True, - type=str, - help="The topic to write into.'", - ), - click.Option( - ["--kafka-api-key"], - required=False, - type=str, - help="The API KEY", - ), - click.Option( - ["--secret"], - required=False, - type=str, - help="The secret", - ), - click.Option( - ["--num-messages-to-consume"], - required=False, - type=int, - default=1, - help="The number of messages to consume before unblocking the consumer", - ), - click.Option( - ["--timeout"], - required=False, - type=float, - default=1.0, - help="Maximum time to block waiting for message(Seconds)", - ), - click.Option( - ["--confluent"], - required=False, - type=bool, - default=True, - help="Whether this Kafka instance is from Confluent", - ), - ] - return options - - -@dataclass -class KafkaCliWriteConfig(KafkaWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=4, - type=int, - help="Number of records per batch", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=KafkaCliConfig, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=KafkaCliConfig, - additional_cli_options=[KafkaCliWriteConfig], - write_config=KafkaWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/local.py b/unstructured/ingest/cli/cmds/local.py deleted file mode 100644 index ff70c44ca..000000000 --- a/unstructured/ingest/cli/cmds/local.py +++ /dev/null @@ -1,43 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - CliRecursiveConfig, - DelimitedString, -) -from unstructured.ingest.connector.local import SimpleLocalConfig - - -@dataclass -class LocalCliConfig(SimpleLocalConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--input-path"], - required=True, - type=click.Path(file_okay=True, dir_okay=True, exists=True), - help="Path to the location in the local file system that will be processed.", - ), - click.Option( - ["--file-glob"], - default=None, - type=DelimitedString(), - help="A comma-separated list of file globs to limit which types of " - "local files are 
accepted, e.g. '*.html,*.txt'", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="local", - cli_config=LocalCliConfig, - additional_cli_options=[CliRecursiveConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/mongodb.py b/unstructured/ingest/cli/cmds/mongodb.py deleted file mode 100644 index 6fbb5c365..000000000 --- a/unstructured/ingest/cli/cmds/mongodb.py +++ /dev/null @@ -1,72 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString -from unstructured.ingest.connector.mongodb import SimpleMongoDBConfig -from unstructured.ingest.interfaces import WriteConfig - -CMD_NAME = "mongodb" - - -@dataclass -class MongoDBCliConfig(SimpleMongoDBConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--uri"], - help="URI to user when connecting", - ), - click.Option( - ["--host"], - type=DelimitedString(), - help="hostname or IP address or Unix domain socket path of a single mongod or " - "mongos instance to connect to, or a list of hostnames", - ), - click.Option(["--port"], type=int, default=27017), - click.Option( - ["--database"], type=str, required=True, help="database name to connect to" - ), - click.Option( - ["--collection"], required=True, type=str, help="collection name to connect to" - ), - ] - return options - - -@dataclass -class MongoDBReadConfig(SimpleMongoDBConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=click.IntRange(0), - help="how many records to read at a time per process", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name=CMD_NAME, - cli_config=MongoDBCliConfig, - additional_cli_options=[MongoDBReadConfig], - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=MongoDBCliConfig, - write_config=WriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/notion.py b/unstructured/ingest/cli/cmds/notion.py deleted file mode 100644 index 02a9a30ed..000000000 --- a/unstructured/ingest/cli/cmds/notion.py +++ /dev/null @@ -1,48 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - CliRecursiveConfig, - DelimitedString, -) -from unstructured.ingest.connector.notion.connector import SimpleNotionConfig - - -@dataclass -class NotionCliConfig(SimpleNotionConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--notion-api-key"], - required=True, - type=str, - help="API key for Notion api", - ), - click.Option( - ["--page-ids"], - default=None, - type=DelimitedString(), - help="Notion page IDs to pull text from", - ), - click.Option( - ["--database-ids"], - default=None, - type=DelimitedString(), - help="Notion database IDs to pull text from", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="notion", - cli_config=NotionCliConfig, - additional_cli_options=[CliRecursiveConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/onedrive.py 
b/unstructured/ingest/cli/cmds/onedrive.py deleted file mode 100644 index 5bf671d9f..000000000 --- a/unstructured/ingest/cli/cmds/onedrive.py +++ /dev/null @@ -1,66 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - CliRecursiveConfig, -) -from unstructured.ingest.connector.onedrive import SimpleOneDriveConfig - - -@dataclass -class OnedriveCliConfig(SimpleOneDriveConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--client-id"], - required=True, - type=str, - help="Microsoft app client ID", - ), - click.Option( - ["--client-cred"], - required=True, - type=str, - help="Microsoft App client secret", - ), - click.Option( - ["--user-pname"], - required=True, - type=str, - help="User principal name, usually is your Azure AD email.", - ), - click.Option( - ["--tenant"], - default="common", - type=str, - help="ID or domain name associated with your Azure AD instance", - ), - click.Option( - ["--path"], - default=None, - type=str, - help="Folder to start parsing files from.", - ), - click.Option( - ["--authority-url"], - default="https://login.microsoftonline.com", - type=str, - help="Authentication token provider for Microsoft apps, default is " - "https://login.microsoftonline.com", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="onedrive", - cli_config=OnedriveCliConfig, - additional_cli_options=[CliRecursiveConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/opensearch.py b/unstructured/ingest/cli/cmds/opensearch.py deleted file mode 100644 index 0f135de15..000000000 --- a/unstructured/ingest/cli/cmds/opensearch.py +++ /dev/null @@ -1,117 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.cmds.elasticsearch import ElasticsearchCliWriteConfig -from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString -from unstructured.ingest.connector.opensearch import SimpleOpenSearchConfig - -CMD_NAME = "opensearch" - - -@dataclass -class OpenSearchCliConfig(SimpleOpenSearchConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--index-name"], - required=True, - type=str, - help="Name of the OpenSearch index to pull data from, or upload data to.", - ), - click.Option( - ["--hosts"], - type=DelimitedString(), - help='List of the OpenSearch hosts to connect to, e.g. 
"http://localhost:9200"', - ), - click.Option( - ["--fields"], - type=DelimitedString(), - default=[], - help="If provided, will limit the fields returned by OpenSearch " - "to this comma-delimited list", - ), - click.Option( - ["--username"], type=str, default=None, help="username when using basic auth" - ), - click.Option( - ["--password"], - type=str, - default=None, - help="password when using basic auth", - ), - click.Option( - ["--use-ssl"], - type=bool, - default=False, - is_flag=True, - help="use ssl for the connection", - ), - click.Option( - ["--verify-certs"], - type=bool, - default=False, - is_flag=True, - help="whether to verify SSL certificates", - ), - click.Option( - ["--ssl-show-warn"], - type=bool, - default=False, - is_flag=True, - help="show warning when verify certs is disabled", - ), - click.Option( - ["--ca-certs"], - type=click.Path(), - default=None, - help="path to CA bundle", - ), - click.Option( - ["--client-cert"], - type=click.Path(), - default=None, - help="path to the file containing the private key and the certificate," - " or cert only if using client_key", - ), - click.Option( - ["--client-key"], - type=click.Path(), - default=None, - help="path to the file containing the private key" - " if using separate cert and key files", - ), - click.Option( - ["--batch-size"], - default=100, - type=click.IntRange(0), - help="how many records to read at a time per process", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="opensearch", - cli_config=OpenSearchCliConfig, - ) - return cmd_cls - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="opensearch", - cli_config=OpenSearchCliConfig, - additional_cli_options=[ElasticsearchCliWriteConfig], - addition_configs={ - "connector_config": SimpleOpenSearchConfig, - "write_config": ElasticsearchCliWriteConfig, - }, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/outlook.py b/unstructured/ingest/cli/cmds/outlook.py deleted file mode 100644 index 7b4e66968..000000000 --- a/unstructured/ingest/cli/cmds/outlook.py +++ /dev/null @@ -1,67 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - CliRecursiveConfig, - DelimitedString, -) -from unstructured.ingest.connector.outlook import SimpleOutlookConfig - - -@dataclass -class OutlookCliConfig(SimpleOutlookConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--client-id"], - required=True, - type=str, - help="Microsoft app client ID", - ), - click.Option( - ["--user-email"], - required=True, - type=str, - help="Outlook email to download messages from.", - ), - click.Option( - ["--tenant"], - default="common", - help="ID or domain name associated with your Azure AD instance", - ), - click.Option( - ["--outlook-folders"], - default=None, - type=DelimitedString(), - help="Folders to download email messages from. " - "Do not specify subfolders. 
Use quotes if spaces in folder names.", - ), - click.Option( - ["--client-cred"], - default=None, - type=str, - help="Microsoft App client secret", - ), - click.Option( - ["--authority-url"], - default="https://login.microsoftonline.com", - type=str, - help="Authentication token provider for Microsoft apps, default is " - "https://login.microsoftonline.com", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="outlook", - cli_config=OutlookCliConfig, - additional_cli_options=[CliRecursiveConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/pinecone.py b/unstructured/ingest/cli/cmds/pinecone.py deleted file mode 100644 index 91d476669..000000000 --- a/unstructured/ingest/cli/cmds/pinecone.py +++ /dev/null @@ -1,71 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.pinecone import PineconeWriteConfig, SimplePineconeConfig - - -@dataclass -class PineconeCliConfig(SimplePineconeConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--api-key"], - required=True, - type=str, - help="API key used for authenticating to a Pinecone instance.", - envvar="PINECONE_API_KEY", - show_envvar=True, - ), - click.Option( - ["--index-name"], - required=True, - type=str, - help="The name of the pinecone index to connect to.", - ), - click.Option( - ["--environment"], - required=True, - type=str, - help="The environment where the index lives. Eg. 'gcp-starter' or 'us-east1-gcp'", - ), - ] - return options - - -@dataclass -class PineconeCliWriteConfig(PineconeWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=50, - type=int, - help="Number of records per batch", - ), - click.Option( - ["--num-processes"], - default=2, - type=int, - help="Number of parallel processes with which to upload elements", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="pinecone", - cli_config=PineconeCliConfig, - additional_cli_options=[PineconeCliWriteConfig], - write_config=PineconeWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/qdrant.py b/unstructured/ingest/cli/cmds/qdrant.py deleted file mode 100644 index 1a0847614..000000000 --- a/unstructured/ingest/cli/cmds/qdrant.py +++ /dev/null @@ -1,124 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig - - -@dataclass -class QdrantCliConfig(SimpleQdrantConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--collection-name"], - required=True, - type=str, - help="The name of the Qdrant collection to use.", - ), - click.Option( - ["--location"], - type=str, - help="The location of the Qdrant cluster.", - ), - click.Option( - ["--url"], - type=str, - help="The location of the Qdrant cluster.", - ), - click.Option( - ["--port"], - type=int, - default=6333, - help="Port of the REST API interface. Default: 6333.", - ), - click.Option( - ["--grpc-port"], - type=int, - default=6334, - help="Port of the gRPC interface. 
Default: 6334.", - ), - click.Option( - ["--prefer-grpc"], - type=bool, - is_flag=True, - help="Whether to use gPRC interface whenever possible in methods. Default: False.", - ), - click.Option( - ["--https"], - type=bool, - is_flag=True, - help="Whether to use HTTPS(SSL) protocol. Default: False.", - ), - click.Option( - ["--prefix"], - type=str, - help="Prefix to add the REST API endpoints.", - ), - click.Option( - ["--timeout"], - type=int, - help="Timeout for operations. Default: 5.0 seconds for REST, unlimited for gRPC.", - ), - click.Option( - ["--host"], - type=str, - help="Host name of the Qdrant service.", - ), - click.Option( - ["--path"], - type=str, - help="Persistence path for QdrantLocal.", - ), - click.Option( - ["--force-disable-check-same-thread"], - type=bool, - is_flag=True, - help="Whether to force disable check same thread for QdrantLocal.", - ), - click.Option( - ["--api-key"], - type=str, - help="API key for authentication in Qdrant Cloud. Default: None.", - envvar="QDRANT_API_KEY", - show_envvar=True, - ), - ] - return options - - -@dataclass -class QdrantCliWriteConfig(QdrantWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=50, - type=int, - help="Number of points to upload per batch", - ), - click.Option( - ["--num-processes"], - default=2, - type=int, - help="Number of parallel processes with which to upload", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="qdrant", - cli_config=QdrantCliConfig, - additional_cli_options=[QdrantCliWriteConfig], - write_config=QdrantWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/reddit.py b/unstructured/ingest/cli/cmds/reddit.py deleted file mode 100644 index 067b74250..000000000 --- a/unstructured/ingest/cli/cmds/reddit.py +++ /dev/null @@ -1,67 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.reddit import SimpleRedditConfig - - -@dataclass -class RedditCliConfig(SimpleRedditConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--client-id"], - required=True, - type=str, - help="The client ID, see " - "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites" # noqa: E501 - " for more information.", - ), - click.Option( - ["--client-secret"], - required=True, - type=str, - help="The client secret, see " - "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites" # noqa: E501 - " for more information.", - ), - click.Option( - ["--subreddit-name"], - required=True, - type=str, - help='The name of a subreddit, without the "r\\", e.g. "machinelearning"', - ), - click.Option( - ["--search-query"], - default=None, - type=str, - help="If set, return posts using this query. 
Otherwise, use hot posts.", - ), - click.Option( - ["--num-posts"], - required=True, - type=click.IntRange(0), - help="If set, limits the number of posts to pull in.", - ), - click.Option( - ["--user-agent"], - required=True, - type=str, - help="user agent request header to use when calling Reddit API", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="reddit", - cli_config=RedditCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/salesforce.py b/unstructured/ingest/cli/cmds/salesforce.py deleted file mode 100644 index a6d7119a1..000000000 --- a/unstructured/ingest/cli/cmds/salesforce.py +++ /dev/null @@ -1,58 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - CliRecursiveConfig, - DelimitedString, -) -from unstructured.ingest.connector.salesforce import SimpleSalesforceConfig - - -@dataclass -class SalesforceCliConfig(SimpleSalesforceConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - possible_categories = ["Account", "Case", "Campaign", "EmailMessage", "Lead"] - options = [ - click.Option( - ["--username"], - required=True, - type=str, - help="Salesforce username usually looks like an email.", - ), - click.Option( - ["--consumer-key"], - required=True, - type=str, - help="For the Salesforce JWT auth. Found in Consumer Details.", - ), - click.Option( - ["--private-key"], - required=True, - type=str, - help="Path to the private key or its contents for the Salesforce JWT auth. " - "Key file is usually named server.key.", - ), - click.Option( - ["--categories"], - default=None, - required=True, - type=DelimitedString(choices=possible_categories), - help="Comma-delimited salesforce categories to download. " - "Currently only {}.".format(", ".join(possible_categories)), - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="salesforce", - cli_config=SalesforceCliConfig, - additional_cli_options=[CliRecursiveConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/sharepoint.py b/unstructured/ingest/cli/cmds/sharepoint.py deleted file mode 100644 index 5c6185eef..000000000 --- a/unstructured/ingest/cli/cmds/sharepoint.py +++ /dev/null @@ -1,66 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - CliRecursiveConfig, -) -from unstructured.ingest.connector.sharepoint import SimpleSharepointConfig - - -@dataclass -class SharepointCliConfig(SimpleSharepointConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--client-id"], - default=None, - type=str, - help="Sharepoint app client ID", - ), - click.Option( - ["--client-cred"], - default=None, - type=str, - help="Sharepoint app secret", - ), - click.Option( - ["--site"], - default=None, - type=str, - help="Sharepoint site url. Process either base url e.g \ - https://[tenant].sharepoint.com or relative sites \ - https://[tenant].sharepoint.com/sites/. 
\ - To process all sites within the tenant pass a site url as \ - https://[tenant]-admin.sharepoint.com.\ - This requires the app to be registered at a tenant level", - ), - click.Option( - ["--path"], - default="Shared Documents", - type=str, - help="Path from which to start parsing files. If the connector is to \ - process all sites within the tenant this filter will be applied to \ - all sites document libraries. Default 'Shared Documents'", - ), - click.Option( - ["--files-only"], - is_flag=True, - default=False, - help="Process only files.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="sharepoint", - cli_config=SharepointCliConfig, - additional_cli_options=[CliRecursiveConfig], - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/slack.py b/unstructured/ingest/cli/cmds/slack.py deleted file mode 100644 index 7112849e1..000000000 --- a/unstructured/ingest/cli/cmds/slack.py +++ /dev/null @@ -1,56 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, - DelimitedString, -) -from unstructured.ingest.connector.slack import SimpleSlackConfig - - -@dataclass -class SlackCliConfig(SimpleSlackConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--token"], - required=True, - type=str, - help="Bot token used to access Slack API, must have channels:history " - "scope for the bot user", - ), - click.Option( - ["--channels"], - required=True, - type=DelimitedString(), - help="Comma-delimited list of Slack channel IDs to pull messages from, " - "can be a public or private channel", - ), - click.Option( - ["--start-date"], - default=None, - type=str, - help="Start date/time in formats YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or " - "YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz", - ), - click.Option( - ["--end-date"], - default=None, - type=str, - help="End date/time in formats YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or " - "YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="slack", - cli_config=SlackCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/sql.py b/unstructured/ingest/cli/cmds/sql.py deleted file mode 100644 index 7b4800e55..000000000 --- a/unstructured/ingest/cli/cmds/sql.py +++ /dev/null @@ -1,66 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import CliConfig -from unstructured.ingest.connector.sql import SimpleSqlConfig -from unstructured.ingest.interfaces import WriteConfig - -SQL_DRIVERS = {"postgresql", "sqlite"} - - -@dataclass -class SqlCliConfig(SimpleSqlConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--db-type"], - required=True, - type=click.Choice(SQL_DRIVERS), - help="Type of the database backend", - ), - click.Option( - ["--username"], - default=None, - type=str, - help="DB username", - ), - click.Option( - ["--password"], - default=None, - type=str, - help="DB password", - ), - click.Option( - ["--host"], - default=None, - type=str, - help="DB host", - ), - click.Option( - ["--port"], - default=None, - type=int, - help="DB host connection port", - ), - click.Option( - ["--database"], - default=None, - type=str, - help="Database name. 
For sqlite databases, this is the path to the .db file.", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="sql", - cli_config=SqlCliConfig, - write_config=WriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/vectara.py b/unstructured/ingest/cli/cmds/vectara.py deleted file mode 100644 index 0c623362b..000000000 --- a/unstructured/ingest/cli/cmds/vectara.py +++ /dev/null @@ -1,66 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import CliConfig -from unstructured.ingest.connector.vectara import SimpleVectaraConfig, WriteConfig - - -@dataclass -class VectaraCliWriteConfig(SimpleVectaraConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--customer-id"], - required=True, - type=str, - help="The Vectara customer-id.", - envvar="VECTARA_CUSTOMER_ID", - show_envvar=True, - ), - click.Option( - ["--oauth-client-id"], - required=True, - type=str, - help="Vectara OAuth2 client ID.", - envvar="VECTARA_OAUTH_CLIENT_ID", - show_envvar=True, - ), - click.Option( - ["--oauth-secret"], - required=True, - type=str, - help="Vectara OAuth2 secret.", - envvar="VECTARA_OAUTH_SECRET", - show_envvar=True, - ), - click.Option( - ["--corpus-name"], - required=False, - type=str, - default=None, - help="The Vectara corpus-name.", - ), - click.Option( - ["--token-url"], - required=False, - default="https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token", - type=str, - help="The Vectara endpoint for token refresh. Needs curly brackets for customer_id", - ), - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name="vectara", - cli_config=VectaraCliWriteConfig, - additional_cli_options=[], - write_config=WriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/weaviate.py b/unstructured/ingest/cli/cmds/weaviate.py deleted file mode 100644 index 69107a9c2..000000000 --- a/unstructured/ingest/cli/cmds/weaviate.py +++ /dev/null @@ -1,98 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString -from unstructured.ingest.connector.weaviate import SimpleWeaviateConfig, WeaviateWriteConfig - -CMD_NAME = "weaviate" - - -@dataclass -class WeaviateCliConfig(SimpleWeaviateConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--host-url"], - required=True, - help="Weaviate instance url", - ), - click.Option( - ["--class-name"], - default=None, - type=str, - help="Name of the class to push the records into, e.g: Pdf-elements", - ), - click.Option( - ["--access-token"], default=None, type=str, help="Used to create the bearer token." - ), - click.Option( - ["--refresh-token"], - default=None, - type=str, - help="Will tie this value to the bearer token. 
If not provided, " - "the authentication will expire once the lifetime of the access token is up.", - ), - click.Option( - ["--api-key"], - default=None, - type=str, - ), - click.Option( - ["--client-secret"], - default=None, - type=str, - ), - click.Option( - ["--scope"], - default=None, - type=DelimitedString(), - ), - click.Option( - ["--username"], - default=None, - type=str, - ), - click.Option( - ["--password"], - default=None, - type=str, - ), - click.Option( - ["--anonymous"], - is_flag=True, - default=False, - type=bool, - help="if set, all auth values will be ignored", - ), - ] - return options - - -@dataclass -class WeaviateCliWriteConfig(WeaviateWriteConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ) - ] - return options - - -def get_base_dest_cmd(): - from unstructured.ingest.cli.base.dest import BaseDestCmd - - cmd_cls = BaseDestCmd( - cmd_name=CMD_NAME, - cli_config=WeaviateCliConfig, - additional_cli_options=[WeaviateCliWriteConfig], - write_config=WeaviateWriteConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/cmds/wikipedia.py b/unstructured/ingest/cli/cmds/wikipedia.py deleted file mode 100644 index a25f5c44c..000000000 --- a/unstructured/ingest/cli/cmds/wikipedia.py +++ /dev/null @@ -1,40 +0,0 @@ -import typing as t -from dataclasses import dataclass - -import click - -from unstructured.ingest.cli.base.src import BaseSrcCmd -from unstructured.ingest.cli.interfaces import ( - CliConfig, -) -from unstructured.ingest.connector.wikipedia import SimpleWikipediaConfig - - -@dataclass -class WikipediaCliConfig(SimpleWikipediaConfig, CliConfig): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--page-title"], - required=True, - type=str, - help='Title of a Wikipedia page, e.g. "Open source software".', - ), - click.Option( - ["--auto-suggest"], - default=True, - is_flag=True, - help="Whether to automatically suggest a page if the exact page was not found." 
- " Set to False if the wrong Wikipedia page is fetched.", - ), - ] - return options - - -def get_base_src_cmd() -> BaseSrcCmd: - cmd_cls = BaseSrcCmd( - cmd_name="wikipedia", - cli_config=WikipediaCliConfig, - ) - return cmd_cls diff --git a/unstructured/ingest/cli/common.py b/unstructured/ingest/cli/common.py deleted file mode 100644 index 53dacafaf..000000000 --- a/unstructured/ingest/cli/common.py +++ /dev/null @@ -1,7 +0,0 @@ -import logging - -from unstructured.ingest.logger import ingest_log_streaming_init - - -def log_options(options: dict, verbose=False): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py deleted file mode 100644 index 4703a1c47..000000000 --- a/unstructured/ingest/cli/interfaces.py +++ /dev/null @@ -1,656 +0,0 @@ -from __future__ import annotations - -import json -import os.path -import typing as t -from abc import abstractmethod -from dataclasses import fields -from gettext import gettext, ngettext -from pathlib import Path - -import click -from dataclasses_json.core import Json -from typing_extensions import Self - -from unstructured.chunking import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT -from unstructured.ingest.interfaces import ( - BaseConfig, - ChunkingConfig, - EmbeddingConfig, - FileStorageConfig, - PartitionConfig, - PermissionsConfig, - ProcessorConfig, - ReadConfig, - RetryStrategyConfig, -) - - -class Dict(click.ParamType): - name = "dict" - - def convert( - self, - value: t.Any, - param: t.Optional[click.Parameter] = None, - ctx: t.Optional[click.Context] = None, - ) -> t.Any: - try: - return json.loads(value) - except json.JSONDecodeError: - self.fail( - gettext( - "{value} is not a valid json value.", - ).format(value=value), - param, - ctx, - ) - - -class FileOrJson(click.ParamType): - name = "file-or-json" - - def __init__(self, allow_raw_str: bool = False): - self.allow_raw_str = allow_raw_str - - def convert( - self, - value: t.Any, - param: t.Optional[click.Parameter] = None, - ctx: t.Optional[click.Context] = None, - ) -> t.Any: - # check if valid file - full_path = os.path.abspath(os.path.expanduser(value)) - if os.path.isfile(full_path): - return str(Path(full_path).resolve()) - if isinstance(value, str): - try: - return json.loads(value) - except json.JSONDecodeError: - if self.allow_raw_str: - return value - self.fail( - gettext( - "{value} is not a valid json string nor an existing filepath.", - ).format(value=value), - param, - ctx, - ) - - -class DelimitedString(click.ParamType): - name = "delimited-string" - - def __init__(self, delimiter: str = ",", choices: t.Optional[t.List[str]] = None): - self.choices = choices if choices else [] - self.delimiter = delimiter - - def convert( - self, - value: t.Any, - param: t.Optional[click.Parameter] = None, - ctx: t.Optional[click.Context] = None, - ) -> t.Any: - # In case a list is provided as the default, will not break - if isinstance(value, list): - split = [str(v).strip() for v in value] - else: - split = [v.strip() for v in value.split(self.delimiter)] - if not self.choices: - return split - choices_str = ", ".join(map(repr, self.choices)) - for s in split: - if s not in self.choices: - self.fail( - ngettext( - "{value!r} is not {choice}.", - "{value!r} is not one of {choices}.", - len(self.choices), - ).format(value=s, choice=choices_str, choices=choices_str), - param, - ctx, - ) - return split - - -class CliMixin: - @staticmethod - @abstractmethod - def 
get_cli_options() -> t.List[click.Option]: - pass - - @classmethod - def add_cli_options(cls, cmd: click.Command) -> None: - options_to_add = cls.get_cli_options() - CliMixin.add_params(cmd, params=options_to_add) - - def add_params(cmd: click.Command, params: t.List[click.Parameter]): - existing_opts = [] - for param in cmd.params: - existing_opts.extend(param.opts) - - for param in params: - for opt in param.opts: - if opt in existing_opts: - raise ValueError(f"{opt} is already defined on the command {cmd.name}") - existing_opts.append(opt) - cmd.params.append(param) - - -class CliConfig(BaseConfig, CliMixin): - pass - - -class CliRetryStrategyConfig(RetryStrategyConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--max-retries"], - default=None, - type=int, - help="If provided, will use this max retry for " - "back off strategy if http calls fail", - ), - click.Option( - ["--max-retry-time"], - default=None, - type=float, - help="If provided, will attempt retries for this long as part " - "of back off strategy if http calls fail", - ), - ] - return options - - @classmethod - def from_dict(cls, kvs: Json, **kwargs): - """ - Return None if none of the fields are being populated - """ - if isinstance(kvs, dict): - field_names = {field.name for field in fields(cls) if field.name in kvs} - field_values = [kvs.get(n) for n in field_names if kvs.get(n)] - if not field_values: - return None - return super().from_dict(kvs=kvs, **kwargs) - - -class CliProcessorConfig(ProcessorConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--reprocess"], - is_flag=True, - default=False, - help="Reprocess a downloaded file even if the relevant structured " - "output .json file in output directory already exists.", - ), - click.Option( - ["--output-dir"], - default="structured-output", - help="Where to place structured output .json files.", - ), - click.Option( - ["--work-dir"], - type=str, - default=str( - (Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve(), - ), - show_default=True, - help="Where to place working files when processing each step", - ), - click.Option( - ["--num-processes"], - default=2, - show_default=True, - help="Number of parallel processes with which to process docs", - ), - click.Option( - ["--raise-on-error"], - is_flag=True, - default=False, - help="If set, will raise an error if any doc in the pipeline fails. Otherwise will " - "log the error and continue with other docs", - ), - click.Option(["-v", "--verbose"], is_flag=True, default=False), - ] - return options - - -class CliReadConfig(ReadConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to a location at" - "`$HOME/.cache/unstructured/ingest//`.", - ), - click.Option( - ["--re-download"], - is_flag=True, - default=False, - help="Re-download files even if they are already present in download dir.", - ), - click.Option( - ["--preserve-downloads"], - is_flag=True, - default=False, - help="Preserve downloaded files. Otherwise each file is removed " - "after being processed successfully.", - ), - click.Option( - ["--download-only"], - is_flag=True, - default=False, - help="Download any files that are not already present in either --download-dir or " - "the default download ~/.cache/... 
location in case --download-dir " - "is not specified and " - "skip processing them through unstructured.", - ), - click.Option( - ["--max-docs"], - default=None, - type=int, - help="If specified, process at most the specified number of documents.", - ), - ] - return options - - -class CliPartitionConfig(PartitionConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--pdf-infer-table-structure"], - is_flag=True, - default=False, - help="Partition will include the table's text_as_html " "in the response metadata.", - ), - click.Option( - ["--strategy"], - default="auto", - help="The method that will be used to process the documents. " - "Default: auto. Other strategies include `fast` and `hi_res`.", - ), - click.Option( - ["--ocr-languages"], - default=None, - type=DelimitedString(delimiter="+"), - help="A list of language packs to specify which languages to use for OCR, " - "separated by '+' e.g. 'eng+deu' to use the English and German language packs. " - "The appropriate Tesseract " - "language pack needs to be installed.", - ), - click.Option( - ["--encoding"], - default=None, - help="Text encoding to use when reading documents. By default the encoding is " - "detected automatically.", - ), - click.Option( - ["--skip-infer-table-types"], - type=DelimitedString(), - default=None, - help="Optional list of document types to skip table extraction on", - ), - click.Option( - ["--additional-partition-args"], - type=Dict(), - help="A json string representation of values to pass through to partition()", - ), - click.Option( - ["--fields-include"], - type=DelimitedString(), - default=["element_id", "text", "type", "metadata", "embeddings"], - help="Comma-delimited list. If set, include the specified top-level " - "fields in an element.", - ), - click.Option( - ["--flatten-metadata"], - is_flag=True, - default=False, - help="Results in flattened json elements. " - "Specifically, the metadata key values are brought to " - "the top-level of the element, and the `metadata` key itself is removed.", - ), - click.Option( - ["--metadata-include"], - default=[], - type=DelimitedString(), - help="Comma-delimited list. If set, include the specified metadata " - "fields if they exist and drop all other fields. ", - ), - click.Option( - ["--metadata-exclude"], - default=[], - type=DelimitedString(), - help="Comma-delimited list. If set, drop the specified metadata " - "fields if they exist.", - ), - click.Option( - ["--partition-by-api"], - is_flag=True, - default=False, - help="Use a remote API to partition the files." - " Otherwise, use the function from partition.auto", - ), - click.Option( - ["--partition-endpoint"], - default="https://api.unstructured.io/general/v0/general", - help="If partitioning via api, use the following host. 
" - "Default: https://api.unstructured.io/general/v0/general", - ), - click.Option( - ["--api-key"], - default=None, - help="API Key for partition endpoint.", - ), - click.Option( - ["--hi-res-model-name"], - default=None, - help="Model name for hi-res strategy.", - ), - ] - return options - - -class CliRecursiveConfig(CliConfig): - recursive: bool - - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders " - "otherwise stop at the files in provided folder level.", - ), - ] - return options - - -class CliFilesStorageConfig(FileStorageConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--remote-url"], - required=True, - help="Remote fsspec URL formatted as `protocol://dir/path`", - ), - click.Option( - ["--uncompress"], - type=bool, - default=False, - is_flag=True, - help="Uncompress any archived files. Currently supporting zip and tar " - "files based on file extension.", - ), - click.Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders " - "otherwise stop at the files in provided folder level.", - ), - click.Option( - ["--file-glob"], - default=None, - type=DelimitedString(), - help="A comma-separated list of file globs to limit which types of " - "local files are accepted, e.g. '*.html,*.txt'", - ), - ] - return options - - -class CliEmbeddingConfig(EmbeddingConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - from unstructured.embed import EMBEDDING_PROVIDER_TO_CLASS_MAP - - options = [ - click.Option( - ["--embedding-provider"], - help="Type of the embedding class to be used. Can be one of: " - f"{list(EMBEDDING_PROVIDER_TO_CLASS_MAP)}", - type=click.Choice(list(EMBEDDING_PROVIDER_TO_CLASS_MAP)), - ), - click.Option( - ["--embedding-api-key"], - help="API key for the embedding model, for the case an API key is needed.", - type=str, - default=None, - ), - click.Option( - ["--embedding-model-name"], - help="Embedding model name, if needed. " - "Chooses a particular LLM between different options, to embed with it.", - type=str, - default=None, - ), - click.Option( - ["--embedding-aws-access-key-id"], - help="AWS access key used for AWS-based embedders, such as bedrock", - type=str, - default=None, - ), - click.Option( - ["--embedding-aws-secret-access-key"], - help="AWS secret key used for AWS-based embedders, such as bedrock", - type=str, - default=None, - ), - click.Option( - ["--embedding-aws-region"], - help="AWS region used for AWS-based embedders, such as bedrock", - type=str, - default="us-west-2", - ), - ] - return options - - @classmethod - def from_dict(cls, kvs: Json, **kwargs): - """ - Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. 
- This allows CLI arguments to be prepended with embedding_ during CLI invocation but - doesn't require that as part of the field names in this class - """ - if isinstance(kvs, dict): - new_kvs = { - k[len("embedding_") :]: v # noqa: E203 - for k, v in kvs.items() - if k.startswith("embedding_") - } - if len(new_kvs.keys()) == 0: - return None - if not new_kvs.get("provider"): - return None - return super().from_dict(new_kvs, **kwargs) - return super().from_dict(kvs, **kwargs) - - -class CliChunkingConfig(ChunkingConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--chunk-elements"], - is_flag=True, - default=False, - help="Deprecated, use --chunking-strategy instead.", - ), - click.Option( - ["--chunking-strategy"], - type=str, - help="The rule-set to use to form chunks. Omit to disable chunking.", - ), - click.Option( - ["--chunk-combine-text-under-n-chars"], - type=int, - help=( - "Combine consecutive chunks when the first does not exceed this length and" - " the second will fit without exceeding the hard-maximum length. Only" - " operative for 'by_title' chunking-strategy." - ), - ), - click.Option( - ["--chunk-include-orig-elements/--chunk-no-include-orig-elements"], - is_flag=True, - default=True, - help=( - "When chunking, add the original elements consolidated to form each chunk to" - " `.metadata.orig_elements` on that chunk." - ), - ), - click.Option( - ["--chunk-max-characters"], - type=int, - default=CHUNK_MAX_CHARS_DEFAULT, - show_default=True, - help=( - "Hard maximum chunk length. No chunk will exceed this length. An oversized" - " element will be divided by text-splitting to fit this window." - ), - ), - click.Option( - ["--chunk-multipage-sections/--chunk-no-multipage-sections"], - is_flag=True, - default=CHUNK_MULTI_PAGE_DEFAULT, - help=( - "Ignore page boundaries when chunking such that elements from two different" - " pages can appear in the same chunk. Only operative for 'by_title'" - " chunking-strategy." - ), - ), - click.Option( - ["--chunk-new-after-n-chars"], - type=int, - help=( - "Soft-maximum chunk length. Another element will not be added to a chunk of" - " this length even when it would fit without exceeding the hard-maximum" - " length." - ), - ), - click.Option( - ["--chunk-overlap"], - type=int, - default=0, - show_default=True, - help=( - "Prefix chunk text with last overlap=N characters of prior chunk. Only" - " applies to oversized chunks divided by text-splitting. To apply overlap to" - " non-oversized chunks use the --overlap-all option." - ), - ), - click.Option( - ["--chunk-overlap-all"], - is_flag=True, - default=False, - help=( - "Apply overlap to chunks formed from whole elements as well as those formed" - " by text-splitting oversized elements. Overlap length is taken from the --overlap" - " option value." - ), - ), - ] - return options - - @classmethod - def from_dict(cls, kvs: Json, **kwargs: t.Any) -> t.Optional[Self]: - """Extension of dataclass from_dict() to avoid a naming conflict with other CLI params. 
- - This allows CLI arguments to be prefixed with "chunking_" during CLI invocation but doesn't - require that as part of the field names in this class - """ - if not isinstance(kvs, dict): - return super().from_dict(kvs=kvs, **kwargs) - - options: t.Dict[str, t.Any] = kvs.copy() - chunk_elements = options.pop("chunk_elements", None) - chunking_strategy = options.pop("chunking_strategy", None) - # -- when neither are specified, chunking is not requested -- - if not chunk_elements and not chunking_strategy: - return None - - def iter_kv_pairs() -> t.Iterator[t.Tuple[str, t.Any]]: - # -- newer `chunking_strategy` option takes precedence over legacy `chunk_elements` -- - if chunking_strategy: - yield "chunking_strategy", chunking_strategy - # -- but legacy case is still supported, equivalent to `chunking_strategy="by_title" -- - elif chunk_elements: - yield "chunking_strategy", "by_title" - - yield from ( - (key[len("chunk_") :], value) - for key, value in options.items() - if key.startswith("chunk_") - ) - - new_kvs = dict(iter_kv_pairs()) - return None if len(new_kvs) == 0 else super().from_dict(kvs=new_kvs, **kwargs) - - -class CliPermissionsConfig(PermissionsConfig, CliMixin): - @staticmethod - def get_cli_options() -> t.List[click.Option]: - options = [ - click.Option( - ["--permissions-application-id"], - type=str, - help="Microsoft Graph API application id", - ), - click.Option( - ["--permissions-client-cred"], - type=str, - help="Microsoft Graph API application credentials", - ), - click.Option( - ["--permissions-tenant"], - type=str, - help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.", - ), - ] - return options - - @classmethod - def from_dict(cls, kvs: Json, **kwargs): - """ - Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. - This allows CLI arguments to be prepended with permissions_ during CLI invocation but - doesn't require that as part of the field names in this class. It also checks if the - CLI params are provided as intended. 
- """ - - if isinstance(kvs, dict): - permissions_application_id = kvs.get("permissions_application_id") - permissions_client_cred = kvs.get("permissions_client_cred") - permissions_tenant = kvs.get("permissions_tenant") - permission_values = [ - permissions_application_id, - permissions_client_cred, - permissions_tenant, - ] - if any(permission_values) and not all(permission_values): - raise ValueError( - "Please provide either none or all of the following optional values:\n" - "--permissions-application-id\n" - "--permissions-client-cred\n" - "--permissions-tenant", - ) - - new_kvs = { - k[len("permissions_") :]: v # noqa: E203 - for k, v in kvs.items() - if k.startswith("permissions_") - } - if len(new_kvs.keys()) == 0: - return None - return super().from_dict(kvs=new_kvs, **kwargs) - return super().from_dict(kvs=kvs, **kwargs) diff --git a/unstructured/ingest/cli/utils.py b/unstructured/ingest/cli/utils.py deleted file mode 100644 index 701355f26..000000000 --- a/unstructured/ingest/cli/utils.py +++ /dev/null @@ -1,205 +0,0 @@ -import typing as t -from dataclasses import fields, is_dataclass -from gettext import gettext as _ - -import click - -from unstructured.ingest.cli.interfaces import ( - CliChunkingConfig, - CliConfig, - CliEmbeddingConfig, - CliPartitionConfig, - CliPermissionsConfig, - CliProcessorConfig, - CliReadConfig, - CliRetryStrategyConfig, -) -from unstructured.ingest.interfaces import BaseConfig -from unstructured.ingest.logger import logger - - -def conform_click_options(options: dict): - # Click sets all multiple fields as tuple, this needs to be updated to list - for k, v in options.items(): - if isinstance(v, tuple): - options[k] = list(v) - - -def extract_config(flat_data: dict, config: t.Type[BaseConfig]) -> BaseConfig: - """ - To be able to extract a nested dataclass from a flat dictionary (as in one coming - from a click-based options input), the config class is dynamically looked through for - nested dataclass fields and new nested dictionaries are created to conform to the - shape the overall class expects whn parsing from a dict. During the process, this will create - copies of the original dictionary to avoid pruning fields but this isn't a - problem since the `from_dict()` method ignores unneeded values. - - Not handling more complex edge cases for now such as nested types i.e Union[List[List[...]]] - """ - - def conform_dict(inner_d: dict, inner_config: t.Type[BaseConfig]): - # Catch edge cases (i.e. 
Dict[str, ...]) where underlying type is not a concrete Class, - # causing 'issubclass() arg 1 must be a class' errors, return False - def is_subclass(instance, class_type) -> bool: - try: - return issubclass(instance, class_type) - except Exception: - return False - - dd = inner_d.copy() - for field in fields(inner_config): - f_type = field.type - # Handle the case where the type of a value if a Union (possibly optional) - if t.get_origin(f_type) is t.Union: - union_values = t.get_args(f_type) - # handle List types - union_values = [ - t.get_args(u)[0] if t.get_origin(u) is list else u for u in union_values - ] - # Ignore injected NoneType when optional - concrete_union_values = [v for v in union_values if not is_subclass(v, type(None))] - dataclass_union_values = [v for v in concrete_union_values if is_dataclass(v)] - non_dataclass_union_values = [ - v for v in concrete_union_values if not is_dataclass(v) - ] - if not dataclass_union_values: - continue - # Check if the key for this field already exists in the dictionary, - # if so it might map to one of these non dataclass fields and this - # can't be enforced - if non_dataclass_union_values and field.name in dd: - continue - if len(dataclass_union_values) > 1: - logger.warning( - "more than one dataclass type possible for field {}, " - "not extracting: {}".format(field.name, ", ".join(dataclass_union_values)) - ) - continue - f_type = dataclass_union_values[0] - origin = t.get_origin(f_type) - if origin: - f_type = origin - if is_subclass(f_type, BaseConfig): - dd[field.name] = conform_dict(inner_d=dd, inner_config=f_type) - return dd - - adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config) - return config.from_dict(adjusted_dict, apply_name_overload=False) - - -def extract_configs( - data: dict, - extras: t.Optional[t.Dict[str, t.Type[BaseConfig]]] = None, - validate: t.Optional[t.List[t.Type[BaseConfig]]] = None, - add_defaults: bool = True, -) -> t.Dict[str, BaseConfig]: - """ - Extract all common configs used across CLI command and validate that any - command-specific configs have all their needed information from the Click - options that are passed in during invocation. 
- """ - validate = validate if validate else [] - res = ( - { - "read_config": extract_config(flat_data=data, config=CliReadConfig), - "partition_config": extract_config(flat_data=data, config=CliPartitionConfig), - "embedding_config": extract_config(flat_data=data, config=CliEmbeddingConfig), - "chunking_config": extract_config(flat_data=data, config=CliChunkingConfig), - "processor_config": extract_config(flat_data=data, config=CliProcessorConfig), - "permissions_config": extract_config(flat_data=data, config=CliPermissionsConfig), - "retry_strategy_config": extract_config(flat_data=data, config=CliRetryStrategyConfig), - } - if add_defaults - else {} - ) - if extras: - for k, conf in extras.items(): - try: - res[k] = extract_config(flat_data=data, config=conf) - except Exception as e: - logger.error(f"failed to extract config from {conf.__name__}") - raise e - for v in validate: - try: - extract_config(flat_data=data, config=v) - except Exception as e: - raise Exception(f"failed to validate config {v.__name__}") from e - - return res - - -def add_options( - cmd: click.Command, extras: t.List[t.Type[CliConfig]], is_src: bool = True -) -> click.Command: - configs: t.List[t.Type[CliConfig]] = ( - [ - CliPartitionConfig, - CliReadConfig, - CliEmbeddingConfig, - CliChunkingConfig, - CliProcessorConfig, - CliPermissionsConfig, - CliRetryStrategyConfig, - ] - if is_src - else [] - ) - # make sure what's unique to this cmd appears first - extras.extend(configs) - for config in extras: - try: - config.add_cli_options(cmd=cmd) - except ValueError as e: - raise ValueError(f"failed to set configs from {config.__name__}: {e}") - return cmd - - -class Group(click.Group): - def parse_args(self, ctx, args): - """ - This allows for subcommands to be called with the --help flag without breaking - if parent command is missing any of its required parameters - """ - - try: - return super().parse_args(ctx, args) - except click.MissingParameter: - if "--help" not in args: - raise - - # remove the required params so that help can display - for param in self.params: - param.required = False - return super().parse_args(ctx, args) - - def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: - """ - Copy of the original click.Group format_commands() method but replacing - 'Commands' -> 'Destinations' - """ - commands = [] - for subcommand in self.list_commands(ctx): - cmd = self.get_command(ctx, subcommand) - # What is this, the tool lied about a command. 
Ignore it - if cmd is None: - continue - if cmd.hidden: - continue - - commands.append((subcommand, cmd)) - - # allow for 3 times the default spacing - if len(commands): - if formatter.width: - limit = formatter.width - 6 - max(len(cmd[0]) for cmd in commands) - else: - limit = -6 - max(len(cmd[0]) for cmd in commands) - - rows = [] - for subcommand, cmd in commands: - help = cmd.get_short_help_str(limit) - rows.append((subcommand, help)) - - if rows: - with formatter.section(_("Destinations")): - formatter.write_dl(rows) diff --git a/unstructured/ingest/connector/__init__.py b/unstructured/ingest/connector/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/connector/airtable.py b/unstructured/ingest/connector/airtable.py deleted file mode 100644 index 27669d4a3..000000000 --- a/unstructured/ingest/connector/airtable.py +++ /dev/null @@ -1,309 +0,0 @@ -import typing as t -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from pyairtable import Api - - -@dataclass -class AirtableAccessConfig(AccessConfig): - personal_access_token: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleAirtableConfig(BaseConnectorConfig): - """Connector config where: - auth_token is the authentication token to authenticate into Airtable. - - Check https://support.airtable.com/docs/airtable-api-key-deprecation-notice - for more info on authentication. - """ - - access_config: AirtableAccessConfig - list_of_paths: t.Optional[str] = None - - -@dataclass -class AirtableTableMeta: - """Metadata specifying a table id, a base id which the table is stored in, - and an optional view id in case particular rows and fields are to be ingested""" - - base_id: str - table_id: str - view_id: t.Optional[str] = None - - -@dataclass -class AirtableIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing). - - Current implementation creates an Airtable connection object - to fetch each document, rather than creating it for each thread. 
- """ - - connector_config: SimpleAirtableConfig - table_meta: AirtableTableMeta - registry_name: str = "airtable" - - @property - def filename(self): - return ( - Path(self.read_config.download_dir) - / self.table_meta.base_id - / f"{self.table_meta.table_id}.csv" - ).resolve() - - @property - def _output_filename(self): - """Create output file path based on output directory, base id, and table id""" - output_file = f"{self.table_meta.table_id}.json" - return Path(self.processor_config.output_dir) / self.table_meta.base_id / output_file - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "base_id": self.table_meta.base_id, - "table_id": self.table_meta.table_id, - "view_id": self.table_meta.view_id, - } - - @property - def version(self) -> t.Optional[str]: - return None - - @requires_dependencies(["pyairtable"], extras="airtable") - def _query_table(self): - from pyairtable import Api - - api = Api(self.connector_config.access_config.personal_access_token) - table = api.table(self.table_meta.base_id, self.table_meta.table_id) - table_url = table.url - rows = table.all( - view=self.table_meta.view_id, - ) - return rows, table_url - - @SourceConnectionNetworkError.wrap - def _get_table_rows(self): - rows, table_url = self._query_table() - - if len(rows) == 0: - logger.info("Empty document, retrieved table but it has no rows.") - return rows, table_url - - def update_source_metadata(self, **kwargs): - """Gets file metadata from the current table.""" - - rows, table_url = kwargs.get("rows_tuple", self._get_table_rows()) - if rows is None or len(rows) < 1: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - dates = [r.get("createdTime", "") for r in rows] - dates.sort() - - date_created = datetime.strptime( - dates[0], - "%Y-%m-%dT%H:%M:%S.%fZ", - ).isoformat() - - date_modified = datetime.strptime( - dates[-1], - "%Y-%m-%dT%H:%M:%S.%fZ", - ).isoformat() - - self.source_metadata = SourceMetadata( - date_created=date_created, - date_modified=date_modified, - source_url=table_url, - exists=True, - ) - - @SourceConnectionError.wrap - @requires_dependencies(["pandas"]) - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - import pandas as pd - - rows, table_url = self._get_table_rows() - self.update_source_metadata(rows_tuple=(rows, table_url)) - if rows is None: - raise ValueError( - "Failed to retrieve rows from table " - f"{self.table_meta.base_id}/{self.table_meta.table_id}. Check logs", - ) - # NOTE: Might be a good idea to add pagination for large tables - df = pd.DataFrame.from_dict( - [row["fields"] for row in rows], - ).sort_index(axis=1) - - self.document = df.to_csv() - self.filename.parent.mkdir(parents=True, exist_ok=True) - - with open(self.filename, "w", encoding="utf8") as f: - f.write(self.document) - - -airtable_id_prefixes = ["app", "tbl", "viw"] - - -def raise_airtable_path_error(piece): - if any(piece[:3] == prefix for prefix in airtable_id_prefixes): - raise ( - ValueError( - "Path components are not correctly ordered.\ - Valid path structures: \ - - base_id/table_id/view_id , \ - - base_id/table_id, \ - - base_id .\ - It is also possible to leave --airtable-list-of-paths \ - argument empty (this will ingest everything).", - ) - ) - else: - raise ( - ValueError( - """Path components are not valid Airtable ids. 
- base_id should look like: appAbcDeF1ghijKlm, - table_id should look like: tblAbcDeF1ghijKlm, - view_id should look like: viwAbcDeF1ghijKlm""", - ) - ) - - -def check_path_validity(path): - pieces = path.split("/") - assert ( - 1 <= len(pieces) <= 3 - ), "Path should be composed of between 1-3 \ - components (base_id, table_id, view_id)." - - for i, piece in enumerate(pieces): - try: - assert piece[:3] == airtable_id_prefixes[i] - except AssertionError: - raise_airtable_path_error(piece) - - -@dataclass -class AirtableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Fetches tables or views from an Airtable org.""" - - connector_config: SimpleAirtableConfig - _api: t.Optional["Api"] = field(init=False, default=None) - - @property - def api(self): - if self._api is None: - self._api = Api(self.connector_config.access_config.personal_access_token) - return self._api - - @api.setter - def api(self, api: "Api"): - self._api = api - - def check_connection(self): - import requests - - try: - self.api.request(method="HEAD", url=self.api.build_url("meta", "bases")) - except requests.HTTPError as http_error: - logger.error(f"failed to validate connection: {http_error}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {http_error}") - - @requires_dependencies(["pyairtable"], extras="airtable") - def initialize(self): - from pyairtable import Api - - self.base_ids_to_fetch_tables_from = [] - if self.connector_config.list_of_paths: - self.list_of_paths = self.connector_config.list_of_paths.split() - - self.api = Api(self.connector_config.access_config.personal_access_token) - - @requires_dependencies(["pyairtable"], extras="airtable") - def use_all_bases(self): - from pyairtable.metadata import get_api_bases - - self.base_ids_to_fetch_tables_from = [ - base["id"] for base in get_api_bases(self.api)["bases"] - ] - - @requires_dependencies(["pyairtable"], extras="airtable") - def fetch_table_ids(self): - from pyairtable.metadata import get_base_schema - - bases = [ - (base_id, self.api.base(base_id)) for base_id in self.base_ids_to_fetch_tables_from - ] - - metadata_for_each_base = [ - (base_id, get_base_schema(base)["tables"]) for base_id, base in bases - ] - - baseid_tableid_viewid_tuples = [ - (base_id, table["id"], None) - for base_id, base_metadata in metadata_for_each_base - for table in base_metadata - ] - - return baseid_tableid_viewid_tuples - - def get_ingest_docs(self): - """Fetches documents in an Airtable org.""" - - # When no list of paths provided, the connector ingests everything. - if not self.connector_config.list_of_paths: - self.use_all_bases() - baseid_tableid_viewid_tuples = self.fetch_table_ids() - - # When there is a list of paths, the connector checks the validity - # of the paths, and fetches table_ids to be ingested, based on the paths. 
- else: - self.paths = self.connector_config.list_of_paths.split() - self.paths = [path.strip("/") for path in self.paths] - - [check_path_validity(path) for path in self.paths] - - self.base_ids_to_fetch_tables_from = [] - baseid_tableid_viewid_tuples = [] - - for path in self.paths: - components = path.split("/") - if len(components) == 1: # only a base_id is provided - self.base_ids_to_fetch_tables_from.append(components[0]) - elif len(components) == 2: # a base_id and a table_id are provided - baseid_tableid_viewid_tuples.append((components[0], components[1], None)) - elif len(components) == 3: # a base_id, table_id, and a view_id are provided - baseid_tableid_viewid_tuples.append( - (components[0], components[1], components[2]), - ) - - baseid_tableid_viewid_tuples += self.fetch_table_ids() - return [ - AirtableIngestDoc( - processor_config=self.processor_config, - connector_config=self.connector_config, - read_config=self.read_config, - table_meta=AirtableTableMeta(base_id, table_id, view_id), - ) - for base_id, table_id, view_id in baseid_tableid_viewid_tuples - ] diff --git a/unstructured/ingest/connector/astradb.py b/unstructured/ingest/connector/astradb.py deleted file mode 100644 index 2642ea191..000000000 --- a/unstructured/ingest/connector/astradb.py +++ /dev/null @@ -1,238 +0,0 @@ -import copy -import typing as t -from dataclasses import dataclass, field -from pathlib import Path - -from unstructured import __name__ as integration_name -from unstructured.__version__ import __version__ as integration_version -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from astrapy.db import AstraDB, AstraDBCollection - -NON_INDEXED_FIELDS = ["metadata._node_content", "content"] - - -@dataclass -class AstraDBAccessConfig(AccessConfig): - token: str = enhanced_field(sensitive=True) - api_endpoint: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleAstraDBConfig(BaseConnectorConfig): - access_config: AstraDBAccessConfig - collection_name: str - namespace: t.Optional[str] = None - - -@dataclass -class AstraDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleAstraDBConfig - metadata: t.Dict[str, str] = field(default_factory=dict) - registry_name: str = "astradb" - - @property - def filename(self): - return ( - Path(self.read_config.download_dir) - / self.connector_config.collection_name - / f"{self.metadata['_id']}.txt" - ).resolve() - - @property - def _output_filename(self): - return ( - Path(self.processor_config.output_dir) - / self.connector_config.collection_name - / f"{self.metadata['_id']}.json" - ).resolve() - - def update_source_metadata(self, **kwargs): - if not self.metadata: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - self.source_metadata = SourceMetadata( - exists=True, - ) - - @SourceConnectionError.wrap - @requires_dependencies(["astrapy"], 
extras="astradb") - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - self.filename.parent.mkdir(parents=True, exist_ok=True) - - flattened_dict = flatten_dict(dictionary=self.metadata) - str_values = [str(value) for value in flattened_dict.values()] - concatenated_values = "\n".join(str_values) - - with open(self.filename, "w") as f: - f.write(concatenated_values) - - -@dataclass -class AstraDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleAstraDBConfig - _astra_db: t.Optional["AstraDB"] = field(init=False, default=None) - _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None) - - @property - @requires_dependencies(["astrapy"], extras="astradb") - def astra_db_collection(self) -> "AstraDBCollection": - if self._astra_db_collection is None: - from astrapy.db import AstraDB - - # Build the Astra DB object. - # caller_name/version for Astra DB tracking - self._astra_db = AstraDB( - api_endpoint=self.connector_config.access_config.api_endpoint, - token=self.connector_config.access_config.token, - namespace=self.connector_config.namespace, - caller_name=integration_name, - caller_version=integration_version, - ) - - # Create and connect to the collection - self._astra_db_collection = self._astra_db.collection( - collection_name=self.connector_config.collection_name, - ) - return self._astra_db_collection # type: ignore - - @requires_dependencies(["astrapy"], extras="astradb") - @SourceConnectionError.wrap # type: ignore - def initialize(self): - _ = self.astra_db_collection - - @requires_dependencies(["astrapy"], extras="astradb") - def check_connection(self): - try: - _ = self.astra_db_collection - except Exception as e: - logger.error(f"Failed to validate connection {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - @requires_dependencies(["astrapy"], extras="astradb") - def get_ingest_docs(self): # type: ignore - # Perform the find operation - astra_docs = list(self.astra_db_collection.paginated_find()) - - doc_list = [] - for record in astra_docs: - doc = AstraDBIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - metadata=record, - ) - - doc.update_source_metadata() - - doc_list.append(doc) - - return doc_list - - -@dataclass -class AstraDBWriteConfig(WriteConfig): - embedding_dimension: int - requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None - batch_size: int = 20 - - -@dataclass -class AstraDBDestinationConnector(BaseDestinationConnector): - write_config: AstraDBWriteConfig - connector_config: SimpleAstraDBConfig - _astra_db: t.Optional["AstraDB"] = field(init=False, default=None) - _astra_db_collection: t.Optional["AstraDBCollection"] = field(init=False, default=None) - - def to_dict(self, **kwargs): - """ - The _astra_db_collection variable in this dataclass breaks deepcopy due to: - TypeError: cannot pickle '_thread.lock' object - When serializing, remove it, meaning client data will need to be reinitialized - when deserialized - """ - self_cp = copy.copy(self) - - if hasattr(self_cp, "_astra_db_collection"): - setattr(self_cp, "_astra_db_collection", None) - - return _asdict(self_cp, **kwargs) - - @property - @requires_dependencies(["astrapy"], extras="astradb") - def astra_db_collection(self) -> "AstraDBCollection": - if self._astra_db_collection is None: - from astrapy.db import AstraDB - - collection_name = self.connector_config.collection_name - 
embedding_dimension = self.write_config.embedding_dimension - - # If the user has requested an indexing policy, pass it to the Astra DB - requested_indexing_policy = self.write_config.requested_indexing_policy - options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None - - # caller_name/version for Astra DB tracking - self._astra_db = AstraDB( - api_endpoint=self.connector_config.access_config.api_endpoint, - token=self.connector_config.access_config.token, - namespace=self.connector_config.namespace, - caller_name=integration_name, - caller_version=integration_version, - ) - - # Create and connect to the newly created collection - self._astra_db_collection = self._astra_db.create_collection( - collection_name=collection_name, - dimension=embedding_dimension, - options=options, - ) - return self._astra_db_collection - - @requires_dependencies(["astrapy"], extras="astradb") - @DestinationConnectionError.wrap - def initialize(self): - _ = self.astra_db_collection - - @requires_dependencies(["astrapy"], extras="astradb") - def check_connection(self): - try: - _ = self.astra_db_collection - except Exception as e: - logger.error(f"Failed to validate connection {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info(f"Inserting / updating {len(elements_dict)} documents to Astra DB.") - - astra_batch_size = self.write_config.batch_size - - for batch in batch_generator(elements_dict, astra_batch_size): - self._astra_db_collection.insert_many(batch) - - def normalize_dict(self, element_dict: dict) -> dict: - return { - "$vector": element_dict.pop("embeddings", None), - "content": element_dict.pop("text", None), - "metadata": element_dict, - } diff --git a/unstructured/ingest/connector/azure_cognitive_search.py b/unstructured/ingest/connector/azure_cognitive_search.py deleted file mode 100644 index fc932eb5e..000000000 --- a/unstructured/ingest/connector/azure_cognitive_search.py +++ /dev/null @@ -1,142 +0,0 @@ -import json -import typing as t -import uuid -from dataclasses import dataclass, field - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError, WriteError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from azure.search.documents import SearchClient - - -@dataclass -class AzureCognitiveSearchAccessConfig(AccessConfig): - key: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleAzureCognitiveSearchStorageConfig(BaseConnectorConfig): - endpoint: str - access_config: AzureCognitiveSearchAccessConfig - - -@dataclass -class AzureCognitiveSearchWriteConfig(WriteConfig): - index: str - - -@dataclass -class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector): - write_config: AzureCognitiveSearchWriteConfig - connector_config: SimpleAzureCognitiveSearchStorageConfig - _client: t.Optional["SearchClient"] = field(init=False, default=None) - - @requires_dependencies(["azure.search"], extras="azure-cognitive-search") - def generate_client(self) -> "SearchClient": - from azure.core.credentials import AzureKeyCredential - from azure.search.documents import SearchClient - - # Create a client - credential = 
AzureKeyCredential(self.connector_config.access_config.key) - return SearchClient( - endpoint=self.connector_config.endpoint, - index_name=self.write_config.index, - credential=credential, - ) - - @property - def client(self) -> "SearchClient": - if self._client is None: - self._client = self.generate_client() - return self._client - - def check_connection(self): - try: - self.client.get_document_count() - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def initialize(self): - _ = self.client - - def conform_dict(self, data: dict) -> None: - """ - updates the dictionary that is from each Element being converted into a dict/json - into a dictionary that conforms to the schema expected by the - Azure Cognitive Search index - """ - from dateutil import parser # type: ignore - - data["id"] = str(uuid.uuid4()) - - if points := data.get("metadata", {}).get("coordinates", {}).get("points"): - data["metadata"]["coordinates"]["points"] = json.dumps(points) - if version := data.get("metadata", {}).get("data_source", {}).get("version"): - data["metadata"]["data_source"]["version"] = str(version) - if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"): - data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator) - if permissions_data := ( - data.get("metadata", {}).get("data_source", {}).get("permissions_data") - ): - data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data) - if links := data.get("metadata", {}).get("links"): - data["metadata"]["links"] = [json.dumps(link) for link in links] - if last_modified := data.get("metadata", {}).get("last_modified"): - data["metadata"]["last_modified"] = parser.parse(last_modified).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - if date_created := data.get("metadata", {}).get("data_source", {}).get("date_created"): - data["metadata"]["data_source"]["date_created"] = parser.parse(date_created).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - if date_modified := data.get("metadata", {}).get("data_source", {}).get("date_modified"): - data["metadata"]["data_source"]["date_modified"] = parser.parse(date_modified).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - if date_processed := data.get("metadata", {}).get("data_source", {}).get("date_processed"): - data["metadata"]["data_source"]["date_processed"] = parser.parse( - date_processed, - ).strftime("%Y-%m-%dT%H:%M:%S.%fZ") - if page_number := data.get("metadata", {}).get("page_number"): - data["metadata"]["page_number"] = str(page_number) - - @requires_dependencies(["azure"], extras="azure-cognitive-search") - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - import azure.core.exceptions - - logger.info( - f"writing {len(elements_dict)} documents to destination " - f"index at {self.write_config.index}", - ) - try: - results = self.client.upload_documents(documents=elements_dict) - - except azure.core.exceptions.HttpResponseError as http_error: - raise WriteError(f"http error: {http_error}") from http_error - errors = [] - success = [] - for result in results: - if result.succeeded: - success.append(result) - else: - errors.append(result) - logger.debug(f"results: {len(success)} successes, {len(errors)} failures") - if errors: - raise WriteError( - ", ".join( - [ - f"{error.key}: [{error.status_code}] {error.error_message}" - for error in errors - ], - ), - ) diff --git 
a/unstructured/ingest/connector/biomed.py b/unstructured/ingest/connector/biomed.py deleted file mode 100644 index 7371699e3..000000000 --- a/unstructured/ingest/connector/biomed.py +++ /dev/null @@ -1,313 +0,0 @@ -import os -import typing as t -import urllib.request -from dataclasses import dataclass -from ftplib import FTP, error_perm -from pathlib import Path - -import requests -from requests.adapters import HTTPAdapter - -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, -) -from unstructured.ingest.logger import logger -from unstructured.utils import ( - validate_date_args, -) - -DOMAIN = "ftp.ncbi.nlm.nih.gov" -FTP_DOMAIN = f"ftp://{DOMAIN}" -PMC_DIR = "pub/pmc" -PDF_DIR = "oa_pdf" - - -@dataclass -class BiomedFileMeta: - ftp_path: str - download_filepath: str - output_filepath: str - - -@dataclass -class SimpleBiomedConfig(BaseConnectorConfig): - """Connector config where path is the FTP directory path and - id_, from_, until, format are API parameters.""" - - path: t.Optional[str] = None - # OA Web Service API Options - api_id: t.Optional[str] = None - api_from: t.Optional[str] = None - api_until: t.Optional[str] = None - max_request_time: int = 45 - - def validate_api_inputs(self): - valid = False - - if self.api_from: - valid = validate_date_args(self.api_from) - - if self.api_until: - valid = validate_date_args(self.api_until) - - return valid - - def __post_init__(self): - self.is_file = False - self.is_dir = False - self.is_api = False - - if not self.path: - is_valid = self.validate_api_inputs() - if not is_valid: - raise ValueError( - "Path argument or at least one of the " - "OA Web Service arguments MUST be provided.", - ) - - self.is_api = True - else: - self.path = self.path.strip("/") - is_valid = self.path.lower().startswith(PDF_DIR) - - if not is_valid: - raise ValueError(f"Path MUST start with {PDF_DIR}") - - ftp = FTP(DOMAIN) - ftp.login() - - path = Path(PMC_DIR) / self.path - response = "" - try: - if path.suffix == ".pdf": - response = ftp.cwd(str(path.parent)) - self.is_file = True - else: - response = ftp.cwd(str(path)) - except error_perm as exc: - if "no such file or directory" in exc.args[0].lower(): - raise ValueError(f"The path: {path} is not valid.") - elif "not a directory" in exc.args[0].lower(): - self.is_file = True - elif "command successful" in response: - self.is_dir = True - else: - raise ValueError( - "Something went wrong when validating the path: {path}.", - ) - - -@dataclass -class BiomedIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleBiomedConfig - file_meta: BiomedFileMeta - registry_name: str = "biomed" - - @property - def filename(self): - return Path(self.file_meta.download_filepath).resolve() # type: ignore - - @property - def _output_filename(self): - return Path(f"{self.file_meta.output_filepath}.json").resolve() - - def cleanup_file(self): - if ( - not self.read_config.preserve_downloads - and self.filename.is_file() - and not self.read_config.download_only - ): - logger.debug(f"Cleaning up {self}") - Path.unlink(self.filename) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - download_path = self.file_meta.download_filepath # type: ignore - dir_ = Path(os.path.dirname(download_path)) # type: ignore - if not dir_.is_dir(): - 
logger.debug(f"Creating directory: {dir_}") - - if dir_: - dir_.mkdir(parents=True, exist_ok=True) - self._retrieve() - logger.debug(f"File downloaded: {self.file_meta.download_filepath}") - - @SourceConnectionNetworkError.wrap - def _retrieve(self): - urllib.request.urlretrieve( - self.file_meta.ftp_path, # type: ignore - self.file_meta.download_filepath, - ) - - -class BiomedSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Objects of this class support fetching documents from Biomedical literature FTP directory""" - - connector_config: SimpleBiomedConfig - - def get_base_endpoints_url(self) -> str: - endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf" - - if self.connector_config.api_id: - endpoint_url += f"&id={self.connector_config.api_id}" - - if self.connector_config.api_from: - endpoint_url += f"&from={self.connector_config.api_from}" - - if self.connector_config.api_until: - endpoint_url += f"&until={self.connector_config.api_until}" - - return endpoint_url - - def _list_objects_api(self) -> t.List[BiomedFileMeta]: - from bs4 import BeautifulSoup - - def urls_to_metadata(urls): - files = [] - for url in urls: - parts = url.split(PDF_DIR) - if len(parts) > 1: - local_path = parts[1].strip("/") - files.append( - BiomedFileMeta( - ftp_path=url, - download_filepath=(Path(self.read_config.download_dir) / local_path) - .resolve() - .as_posix(), - output_filepath=(Path(self.processor_config.output_dir) / local_path) - .resolve() - .as_posix(), - ), - ) - - return files - - files: t.List[BiomedFileMeta] = [] - - endpoint_url = self.get_base_endpoints_url() - - while endpoint_url: - session = requests.Session() - adapter = HTTPAdapter() - session.mount("http://", adapter) - session.mount("https://", adapter) - response = self._get_request(session=session, endpoint_url=endpoint_url) - soup = BeautifulSoup(response.content, features="lxml") - urls = [link["href"] for link in soup.find_all("link")] - - if not urls: - return files - - endpoint_url = urls[-1] if "resumptiontoken" in urls[-1].lower() else None - if endpoint_url: - urls = urls[:-1] - - files.extend(urls_to_metadata(urls)) - - return files - - @SourceConnectionNetworkError.wrap - def _get_request(self, session: requests.Session, endpoint_url: str) -> requests.Response: - return session.get(endpoint_url, timeout=self.connector_config.max_request_time) - - def _list_objects(self) -> t.List[BiomedFileMeta]: - files = [] - - # Conform to mypy, null check performed elsewhere. 
- # Wouldn't be in this method unless self.config.path exists - path: str = self.connector_config.path if self.connector_config.path else "" - - def traverse(path, download_dir, output_dir): - full_path = Path(PMC_DIR) / path - logger.debug(f"Traversing directory: {full_path}") - - ftp = FTP(DOMAIN) - ftp.login() - - try: - response = ftp.cwd(str(full_path)) - except error_perm: - raise ValueError(f"{full_path} is not a valid directory.") - - if "command successful" in response.lower(): - sub_paths = [path / p for p in ftp.nlst()] - - if not sub_paths: - return - - ext = Path(sub_paths[0]).suffix - if ext: - for sub_path in sub_paths: - ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{sub_path}" - local_path = "/".join(str(sub_path).split("/")[1:]) - files.append( - BiomedFileMeta( - ftp_path=ftp_path, - download_filepath=(Path(self.read_config.download_dir) / local_path) - .resolve() - .as_posix(), - output_filepath=( - Path(self.processor_config.output_dir) / local_path - ) - .resolve() - .as_posix(), - ), - ) - - else: - for sub_path in sub_paths: - traverse(sub_path, download_dir, output_dir) - - else: - raise ValueError(f"{full_path} is not a valid directory.") - - ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{self.connector_config.path}" - if self.connector_config.is_file: - local_path = "/".join(path.split("/")[1:]) - return [ - BiomedFileMeta( - ftp_path=ftp_path, - download_filepath=(Path(self.read_config.download_dir) / local_path) - .resolve() - .as_posix(), - output_filepath=(Path(self.processor_config.output_dir) / local_path) - .resolve() - .as_posix(), - ), - ] - else: - traverse( - Path(path), - Path(self.read_config.download_dir), - Path(self.processor_config.output_dir), - ) - - return files - - def initialize(self): - pass - - def check_connection(self): - resp = requests.head(self.get_base_endpoints_url()) - try: - resp.raise_for_status() - except requests.HTTPError as http_error: - raise SourceConnectionError(f"failed to validate connection: {http_error}") - - def get_ingest_docs(self): - files = self._list_objects_api() if self.connector_config.is_api else self._list_objects() - return [ - BiomedIngestDoc( - processor_config=self.processor_config, - connector_config=self.connector_config, - read_config=self.read_config, - file_meta=file, - ) - for file in files - ] diff --git a/unstructured/ingest/connector/chroma.py b/unstructured/ingest/connector/chroma.py deleted file mode 100644 index 547b988a2..000000000 --- a/unstructured/ingest/connector/chroma.py +++ /dev/null @@ -1,159 +0,0 @@ -import copy -import typing as t -import uuid -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from chromadb import Collection as ChromaCollection - - -@dataclass -class ChromaAccessConfig(AccessConfig): - settings: t.Optional[t.Dict[str, str]] = None - headers: t.Optional[t.Dict[str, str]] = None - - -@dataclass -class SimpleChromaConfig(BaseConnectorConfig): - access_config: ChromaAccessConfig - collection_name: str - path: t.Optional[str] = None - tenant: t.Optional[str] = "default_tenant" - database: t.Optional[str] = 
"default_database" - host: t.Optional[str] = None - port: t.Optional[int] = None - ssl: bool = False - - -@dataclass -class ChromaWriteConfig(WriteConfig): - batch_size: int = 100 - - -@dataclass -class ChromaDestinationConnector(BaseDestinationConnector): - write_config: ChromaWriteConfig - connector_config: SimpleChromaConfig - _collection: t.Optional["ChromaCollection"] = None - - @property - def chroma_collection(self): - if self._collection is None: - self._collection = self.create_collection() - return self._collection - - def initialize(self): - pass - - @DestinationConnectionError.wrap - def check_connection(self): - _ = self.chroma_collection - - def to_dict(self, **kwargs): - """ - The _collection variable in this dataclass breaks deepcopy due to: - TypeError: cannot pickle 'module' object - When serializing, remove it, meaning collection data will need to be reinitialized - when deserialized - """ - self_cp = copy.copy(self) - if hasattr(self_cp, "_collection"): - setattr(self_cp, "_collection", None) - return _asdict(self_cp, **kwargs) - - @requires_dependencies(["chromadb"], extras="chroma") - def create_collection(self) -> "ChromaCollection": - import chromadb - - if self.connector_config.path: - chroma_client = chromadb.PersistentClient( - path=self.connector_config.path, - settings=self.connector_config.settings, - tenant=self.connector_config.tenant, - database=self.connector_config.database, - ) - - elif self.connector_config.host and self.connector_config.port: - chroma_client = chromadb.HttpClient( - host=self.connector_config.host, - port=self.connector_config.port, - ssl=self.connector_config.ssl, - headers=self.connector_config.access_config.headers, - settings=self.connector_config.access_config.settings, - tenant=self.connector_config.tenant, - database=self.connector_config.database, - ) - else: - raise ValueError("Chroma connector requires either path or host and port to be set.") - - collection = chroma_client.get_or_create_collection( - name=self.connector_config.collection_name - ) - return collection - - @DestinationConnectionError.wrap - @requires_dependencies(["chromadb"], extras="chroma") - def upsert_batch(self, batch): - collection = self.chroma_collection - - try: - # Chroma wants lists even if there is only one element - # Upserting to prevent duplicates - collection.upsert( - ids=batch["ids"], - documents=batch["documents"], - embeddings=batch["embeddings"], - metadatas=batch["metadatas"], - ) - except Exception as e: - raise ValueError(f"chroma error: {e}") from e - - @staticmethod - def prepare_chroma_list(chunk: t.Tuple[t.Dict[str, t.Any]]) -> t.Dict[str, t.List[t.Any]]: - """Helper function to break a tuple of dicts into list of parallel lists for ChromaDb. 
- ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3]}""" - chroma_dict = {} - chroma_dict["ids"] = [x.get("id") for x in chunk] - chroma_dict["documents"] = [x.get("document") for x in chunk] - chroma_dict["embeddings"] = [x.get("embedding") for x in chunk] - chroma_dict["metadatas"] = [x.get("metadata") for x in chunk] - # Make sure all lists are of the same length - assert ( - len(chroma_dict["ids"]) - == len(chroma_dict["documents"]) - == len(chroma_dict["embeddings"]) - == len(chroma_dict["metadatas"]) - ) - return chroma_dict - - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info(f"Inserting / updating {len(elements_dict)} documents to destination ") - - chroma_batch_size = self.write_config.batch_size - - for chunk in batch_generator(elements_dict, chroma_batch_size): - self.upsert_batch(self.prepare_chroma_list(chunk)) - - def normalize_dict(self, element_dict: dict) -> dict: - element_id = element_dict.get("element_id", str(uuid.uuid4())) - return { - "id": element_id, - "embedding": element_dict.pop("embeddings", None), - "document": element_dict.pop("text", None), - "metadata": flatten_dict( - element_dict, separator="-", flatten_lists=True, remove_none=True - ), - } diff --git a/unstructured/ingest/connector/clarifai.py b/unstructured/ingest/connector/clarifai.py deleted file mode 100644 index 1c1e06412..000000000 --- a/unstructured/ingest/connector/clarifai.py +++ /dev/null @@ -1,122 +0,0 @@ -import typing as t -import uuid -from dataclasses import dataclass, field - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from clarifai.client.input import Inputs - - -@dataclass -class ClarifaiAccessConfig(AccessConfig): - api_key: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleClarifaiConfig(BaseConnectorConfig): - access_config: ClarifaiAccessConfig - app_id: str - user_id: str - dataset_id: t.Optional[str] = None - - -@dataclass -class ClarifaiWriteConfig(WriteConfig): - batch_size: int = 50 - - -@dataclass -class ClarifaiDestinationConnector(BaseDestinationConnector): - write_config: ClarifaiWriteConfig - connector_config: SimpleClarifaiConfig - _client: t.Optional["Inputs"] = field(init=False, default=None) - - @property - @requires_dependencies(["clarifai"], extras="clarifai") - def client(self) -> "Inputs": - if self._client is None: - from clarifai.client.input import Inputs - - access_conf = self.connector_config.access_config - try: - if access_conf.api_key is not None: - clarifai_pat = access_conf.api_key - except Exception as e: - raise (f"please provide clarifai PAT key : {e}") - - self._client = Inputs( - app_id=self.connector_config.app_id, - user_id=self.connector_config.user_id, - pat=clarifai_pat, - ) - return self._client - - @requires_dependencies(["clarifai"], extras="clarifai") - @DestinationConnectionError.wrap - def initialize(self): - _ = self.client - - def check_connection(self): - try: - _ = [inp for inp in self.client.list_inputs(page_no=1, per_page=1)] # noqa: C416 - except Exception as e: - logger.error(f"Failed to validate connection {e}", exc_info=True) - raise DestinationConnectionError(f"failed to 
validate connection: {e}") - - def normalize_dict(self, element_dict: dict) -> dict: - """Modifying schema of the dict in order to compile with clarifai input formats""" - return { - "input_id": str(uuid.uuid4().hex), - "text": element_dict.pop("text", None), - "metadata": { - **flatten_dict( - element_dict, - separator="_", - flatten_lists=True, - remove_none=True, - ), - }, - } - - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - from google.protobuf.struct_pb2 import Struct - - logger.info( - f"writing {len(elements_dict)} objects to destination " - f"app {self.connector_config.app_id} " - ) - try: - batch_size = self.write_config.batch_size - for idx in range(0, len(elements_dict), batch_size): - batch_dict = elements_dict[idx : batch_size + idx] - input_batch = [] - for elem in batch_dict: - meta_struct = Struct() - meta_struct.update(elem["metadata"]) - input_batch.append( - self._client.get_text_input( - input_id=elem["input_id"], - raw_text=elem["text"], - dataset_id=self.connector_config.dataset_id, - metadata=meta_struct, - ) - ) - result_id = self._client.upload_inputs(inputs=input_batch) - logger.debug( - f"Input posted successfully into {self.connector_config.app_id}. \ - Result id: {result_id}" - ) - - except Exception as e: - raise e diff --git a/unstructured/ingest/connector/confluence.py b/unstructured/ingest/connector/confluence.py deleted file mode 100644 index 4e1369349..000000000 --- a/unstructured/ingest/connector/confluence.py +++ /dev/null @@ -1,285 +0,0 @@ -import math -import typing as t -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path - -import requests - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from atlassian import Confluence - - -@dataclass -class ConfluenceAccessConfig(AccessConfig): - api_token: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleConfluenceConfig(BaseConnectorConfig): - """Connector config where: - user_email is the email to authenticate into Confluence Cloud, - api_token is the api token to authenticate into Confluence Cloud, - and url is the URL pointing to the Confluence Cloud instance. - - Check https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/ - for more info on the api_token. - """ - - user_email: str - access_config: ConfluenceAccessConfig - url: str - max_num_of_spaces: int = 500 - max_num_of_docs_from_each_space: int = 100 - spaces: t.List[str] = field(default_factory=list) - - -@dataclass -class ConfluenceDocumentMeta: - """Metadata specifying: - id for the confluence space that the document locates in, - and the id of document that is being reached to. 
- """ - - space_id: str - document_id: str - - -def scroll_wrapper(func): - def wrapper(*args, **kwargs): - """Wraps a function to obtain scroll functionality.""" - number_of_items_to_fetch = kwargs["number_of_items_to_fetch"] - del kwargs["number_of_items_to_fetch"] - - kwargs["limit"] = min(100, number_of_items_to_fetch) - kwargs["start"] = kwargs.get("start", 0) - - all_results = [] - num_iterations = math.ceil(number_of_items_to_fetch / kwargs["limit"]) - - for _ in range(num_iterations): - response = func(*args, **kwargs) - if isinstance(response, list): - all_results += func(*args, **kwargs) - elif isinstance(response, dict): - all_results += func(*args, **kwargs)["results"] - - kwargs["start"] += kwargs["limit"] - - return all_results[:number_of_items_to_fetch] - - return wrapper - - -@dataclass -class ConfluenceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing). - - Current implementation creates a Confluence connection object - to fetch each doc, rather than creating a it for each thread. - """ - - connector_config: SimpleConfluenceConfig - document_meta: ConfluenceDocumentMeta - registry_name: str = "confluence" - - # TODO: remove one of filename or _tmp_download_file, using a wrapper - @property - def filename(self): - if not self.read_config.download_dir: - return None - return ( - Path(self.read_config.download_dir) - / self.document_meta.space_id - / f"{self.document_meta.document_id}.html" - ).resolve() - - @property - def _output_filename(self): - """Create output file path based on output directory, space id and document id.""" - output_file = f"{self.document_meta.document_id}.json" - return Path(self.processor_config.output_dir) / self.document_meta.space_id / output_file - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "url": self.connector_config.url, - "page_id": self.document_meta.document_id, - } - - @SourceConnectionNetworkError.wrap - @requires_dependencies(["atlassian"], extras="Confluence") - def _get_page(self): - from atlassian import Confluence - from atlassian.errors import ApiError - - try: - confluence = Confluence( - self.connector_config.url, - username=self.connector_config.user_email, - password=self.connector_config.access_config.api_token, - ) - result = confluence.get_page_by_id( - page_id=self.document_meta.document_id, - expand="history.lastUpdated,version,body.view", - ) - except ApiError as e: - logger.error(e) - return None - return result - - def update_source_metadata(self, **kwargs): - """Fetches file metadata from the current page.""" - page = kwargs.get("page", self._get_page()) - if page is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - document_history = page["history"] - date_created = datetime.strptime( - document_history["createdDate"], - "%Y-%m-%dT%H:%M:%S.%fZ", - ).isoformat() - if last_updated := document_history.get("lastUpdated", {}).get("when", ""): - date_modified = datetime.strptime( - last_updated, - "%Y-%m-%dT%H:%M:%S.%fZ", - ).isoformat() - else: - date_modified = date_created - version = page["version"]["number"] - self.source_metadata = SourceMetadata( - date_created=date_created, - date_modified=date_modified, - version=version, - source_url=page["_links"].get("self", None), - exists=True, - ) - - @SourceConnectionError.wrap - @requires_dependencies(["atlassian"], extras="confluence") - @BaseSingleIngestDoc.skip_if_file_exists - def 
get_file(self): - # TODO: instead of having a separate connection object for each doc, - # have a separate connection object for each process - - result = self._get_page() - self.update_source_metadata(page=result) - if result is None: - raise ValueError(f"Failed to retrieve page with ID {self.document_meta.document_id}") - self.document = result["body"]["view"]["value"] - self.filename.parent.mkdir(parents=True, exist_ok=True) - with open(self.filename, "w", encoding="utf8") as f: - f.write(self.document) - - -@dataclass -class ConfluenceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Fetches body fields from all documents within all spaces in a Confluence Cloud instance.""" - - connector_config: SimpleConfluenceConfig - _confluence: t.Optional["Confluence"] = field(init=False, default=None) - - @property - def confluence(self) -> "Confluence": - from atlassian import Confluence - - if self._confluence is None: - self._confluence = Confluence( - url=self.connector_config.url, - username=self.connector_config.user_email, - password=self.connector_config.access_config.api_token, - ) - return self._confluence - - @requires_dependencies(["atlassian"], extras="Confluence") - def check_connection(self): - url = "rest/api/space" - try: - self.confluence.request(method="HEAD", path=url) - except requests.HTTPError as http_error: - logger.error(f"failed to validate connection: {http_error}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {http_error}") - - @requires_dependencies(["atlassian"], extras="Confluence") - def initialize(self): - self.list_of_spaces = None - if self.connector_config.spaces: - self.list_of_spaces = self.connector_config.spaces - if self.connector_config.max_num_of_spaces: - logger.warning( - """--confluence-list-of-spaces and --confluence-num-of-spaces cannot - be used at the same time. 
Connector will only fetch the - --confluence-list-of-spaces that you've provided.""", - ) - - @requires_dependencies(["atlassian"], extras="Confluence") - def _get_space_ids(self): - """Fetches spaces in a confluence domain.""" - - get_spaces_with_scroll = scroll_wrapper(self.confluence.get_all_spaces) - - all_results = get_spaces_with_scroll( - number_of_items_to_fetch=self.connector_config.max_num_of_spaces, - ) - - space_ids = [space["key"] for space in all_results] - return space_ids - - @requires_dependencies(["atlassian"], extras="Confluence") - def _get_docs_ids_within_one_space( - self, - space_id: str, - content_type: str = "page", - ): - get_pages_with_scroll = scroll_wrapper(self.confluence.get_all_pages_from_space) - results = get_pages_with_scroll( - space=space_id, - number_of_items_to_fetch=self.connector_config.max_num_of_docs_from_each_space, - content_type=content_type, - ) - - doc_ids = [(space_id, doc["id"]) for doc in results] - return doc_ids - - @requires_dependencies(["atlassian"], extras="Confluence") - def _get_doc_ids_within_spaces(self): - space_ids = self._get_space_ids() if not self.list_of_spaces else self.list_of_spaces - - doc_ids_all = [self._get_docs_ids_within_one_space(space_id=id) for id in space_ids] - - doc_ids_flattened = [ - (space_id, doc_id) - for doc_ids_space in doc_ids_all - for space_id, doc_id in doc_ids_space - ] - return doc_ids_flattened - - def get_ingest_docs(self): - """Fetches all documents in a confluence space.""" - doc_ids = self._get_doc_ids_within_spaces() - return [ - ConfluenceIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - document_meta=ConfluenceDocumentMeta(space_id, doc_id), - ) - for space_id, doc_id in doc_ids - ] diff --git a/unstructured/ingest/connector/databricks_volumes.py b/unstructured/ingest/connector/databricks_volumes.py deleted file mode 100644 index 5662d65cd..000000000 --- a/unstructured/ingest/connector/databricks_volumes.py +++ /dev/null @@ -1,137 +0,0 @@ -import copy -import json -import os -import typing as t -from dataclasses import dataclass, field -from io import BytesIO -from pathlib import PurePath - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - BaseSingleIngestDoc, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from databricks.sdk import WorkspaceClient - - -@dataclass -class DatabricksVolumesAccessConfig(AccessConfig): - account_id: t.Optional[str] = None - username: t.Optional[str] = None - password: t.Optional[str] = enhanced_field(default=None, sensitive=True) - client_id: t.Optional[str] = None - client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True) - token: t.Optional[str] = enhanced_field(default=None, sensitive=True) - profile: t.Optional[str] = None - azure_workspace_resource_id: t.Optional[str] = None - azure_client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True) - azure_client_id: t.Optional[str] = None - azure_tenant_id: t.Optional[str] = None - azure_environment: t.Optional[str] = None - auth_type: t.Optional[str] = None - cluster_id: t.Optional[str] = None - google_credentials: t.Optional[str] = None - 
google_service_account: t.Optional[str] = None - - -@dataclass -class SimpleDatabricksVolumesConfig(BaseConnectorConfig): - access_config: DatabricksVolumesAccessConfig - host: t.Optional[str] = None - - -@dataclass -class DatabricksVolumesWriteConfig(WriteConfig): - volume: str - catalog: str - volume_path: t.Optional[str] = None - overwrite: bool = False - encoding: str = "utf-8" - schema: str = "default" - - @property - def path(self) -> str: - path = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}" - if self.volume_path: - path = f"{path}/{self.volume_path}" - return path - - -@dataclass -class DatabricksVolumesDestinationConnector(BaseDestinationConnector): - write_config: DatabricksVolumesWriteConfig - connector_config: SimpleDatabricksVolumesConfig - _client: t.Optional["WorkspaceClient"] = field(init=False, default=None) - - def to_dict(self, **kwargs): - self_cp = copy.copy(self) - if hasattr(self_cp, "_client"): - setattr(self_cp, "_client", None) - return _asdict(self_cp, **kwargs) - - @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes") - def generate_client(self) -> "WorkspaceClient": - from databricks.sdk import WorkspaceClient - - return WorkspaceClient( - host=self.connector_config.host, **self.connector_config.access_config.to_dict() - ) - - @property - def client(self) -> "WorkspaceClient": - if self._client is None: - self._client = self.generate_client() - return self._client - - def check_connection(self): - try: - assert self.client.current_user.me().active - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def initialize(self): - _ = self.client - - def write_dict( - self, - *args, - elements_dict: t.List[t.Dict[str, t.Any]], - filename: t.Optional[str] = None, - indent: int = 4, - encoding: str = "utf-8", - **kwargs, - ) -> None: - output_folder = self.write_config.path - output_folder = os.path.join(output_folder) # Make sure folder ends with file seperator - filename = ( - filename.strip(os.sep) if filename else filename - ) # Make sure filename doesn't begin with file seperator - output_path = str(PurePath(output_folder, filename)) if filename else output_folder - logger.debug(f"uploading content to {output_path}") - self.client.files.upload( - file_path=output_path, - contents=BytesIO(json.dumps(elements_dict).encode(encoding=self.write_config.encoding)), - overwrite=self.write_config.overwrite, - ) - - def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]: - pass - - def write(self, docs: t.List[BaseSingleIngestDoc]) -> None: - for doc in docs: - file_path = doc.base_output_filename - filename = file_path if file_path else None - with open(doc._output_filename) as json_file: - logger.debug(f"uploading content from {doc._output_filename}") - json_list = json.load(json_file) - self.write_dict(elements_dict=json_list, filename=filename) diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py deleted file mode 100644 index 1382ed05d..000000000 --- a/unstructured/ingest/connector/delta_table.py +++ /dev/null @@ -1,203 +0,0 @@ -import os -import typing as t -from dataclasses import dataclass -from datetime import datetime as dt -from multiprocessing import Process -from pathlib import Path - -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import 
( - BaseConnectorConfig, - BaseDestinationConnector, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from deltalake import DeltaTable - - -@dataclass -class SimpleDeltaTableConfig(BaseConnectorConfig): - table_uri: t.Union[str, Path] - version: t.Optional[int] = None - storage_options: t.Optional[t.Dict[str, str]] = None - without_files: bool = False - - -@dataclass -class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleDeltaTableConfig - uri: str - modified_date: str - created_at: str - registry_name: str = "delta-table" - - def uri_filename(self) -> str: - basename = os.path.basename(self.uri) - return os.path.splitext(basename)[0] - - @property - def filename(self): - return (Path(self.read_config.download_dir) / f"{self.uri_filename()}.csv").resolve() - - @property - def _output_filename(self): - """Create filename document id combined with a hash of the query to uniquely identify - the output file.""" - return Path(self.processor_config.output_dir) / f"{self.uri_filename()}.json" - - def _create_full_tmp_dir_path(self): - self.filename.parent.mkdir(parents=True, exist_ok=True) - self._output_filename.parent.mkdir(parents=True, exist_ok=True) - - @requires_dependencies(["fsspec"], extras="delta-table") - def _get_fs_from_uri(self): - from fsspec.core import url_to_fs - - try: - fs, _ = url_to_fs(self.uri) - except ImportError as error: - raise ImportError( - f"uri {self.uri} may be associated with a filesystem that " - f"requires additional dependencies: {error}", - ) - return fs - - def update_source_metadata(self, **kwargs): - fs = kwargs.get("fs", self._get_fs_from_uri()) - version = ( - fs.checksum(self.uri) if fs.protocol != "gs" else fs.info(self.uri).get("etag", "") - ) - file_exists = fs.exists(self.uri) - self.source_metadata = SourceMetadata( - date_created=self.created_at, - date_modified=self.modified_date, - version=version, - source_url=self.uri, - exists=file_exists, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - fs = self._get_fs_from_uri() - self.update_source_metadata(fs=fs) - logger.info(f"using a {fs} filesystem to collect table data") - self._create_full_tmp_dir_path() - - df = self._get_df(filesystem=fs) - - logger.info(f"writing {len(df)} rows to {self.filename}") - df.to_csv(self.filename) - - @SourceConnectionNetworkError.wrap - def _get_df(self, filesystem): - import pyarrow.parquet as pq - - return pq.ParquetDataset(self.uri, filesystem=filesystem).read_pandas().to_pandas() - - -@dataclass -class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleDeltaTableConfig - delta_table: t.Optional["DeltaTable"] = None - - def check_connection(self): - pass - - @requires_dependencies(["deltalake"], extras="delta-table") - def initialize(self): - from deltalake import DeltaTable - - self.delta_table = DeltaTable( - table_uri=self.connector_config.table_uri, - version=self.connector_config.version, - storage_options=self.connector_config.storage_options, - without_files=self.connector_config.without_files, - ) - rows = self.delta_table.to_pyarrow_dataset().count_rows() - if not rows > 0: - raise ValueError(f"no data found at {self.connector_config.table_uri}") - logger.info(f"processing {rows} rows of data") - 
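
Editor's note: the deleted Delta Table source connector above loads the table once, refuses to proceed when it contains no rows, and then, in get_ingest_docs just below, pairs every data file URI with the modification time recorded in the table's add actions. A minimal standalone sketch of that pairing, assuming a hypothetical local table at ./my_table and using only the deltalake calls that appear in this connector, might look roughly like:

import os
from datetime import datetime

from deltalake import DeltaTable

# Hypothetical table location; any URI accepted by DeltaTable would do.
table = DeltaTable(table_uri="./my_table")

# One add action per data file; "path" and "modification_time" are columns of
# the resulting dataframe, exactly as consumed by get_ingest_docs() below.
actions = table.get_add_actions().to_pandas()
mod_times = {row["path"]: str(row["modification_time"]) for _, row in actions.iterrows()}

# Table creation time is stored as milliseconds since the epoch.
created_at = datetime.fromtimestamp(table.metadata().created_time / 1000)

# Each file URI becomes one ingest doc, carrying its own modification date
# plus the shared table creation date.
docs = [
    (uri, mod_times[os.path.basename(uri)], str(created_at))
    for uri in table.file_uris()
]
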
- def get_ingest_docs(self): - """Batches the results into distinct docs""" - if not self.delta_table: - raise ValueError("delta table was never initialized") - actions = self.delta_table.get_add_actions().to_pandas() - mod_date_dict = { - row["path"]: str(row["modification_time"]) for _, row in actions.iterrows() - } - created_at = dt.fromtimestamp(self.delta_table.metadata().created_time / 1000) - return [ - DeltaTableIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - uri=uri, - modified_date=mod_date_dict[os.path.basename(uri)], - created_at=str(created_at), - ) - for uri in self.delta_table.file_uris() - ] - - -@dataclass -class DeltaTableWriteConfig(WriteConfig): - drop_empty_cols: bool = False - mode: t.Literal["error", "append", "overwrite", "ignore"] = "error" - schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None - engine: t.Literal["pyarrow", "rust"] = "pyarrow" - - -@dataclass -class DeltaTableDestinationConnector(BaseDestinationConnector): - write_config: DeltaTableWriteConfig - connector_config: SimpleDeltaTableConfig - - @requires_dependencies(["deltalake"], extras="delta-table") - def initialize(self): - pass - - def check_connection(self): - pass - - @requires_dependencies(["deltalake"], extras="delta-table") - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - from deltalake.writer import write_deltalake - - from unstructured.ingest.utils.table import convert_to_pandas_dataframe - - df = convert_to_pandas_dataframe( - elements_dict=elements_dict, - drop_empty_cols=self.write_config.drop_empty_cols, - ) - logger.info( - f"writing {len(df)} rows to destination table " - f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}", - ) - writer_kwargs = { - "table_or_uri": self.connector_config.table_uri, - "data": df, - "mode": self.write_config.mode, - "engine": self.write_config.engine, - } - if self.write_config.schema_mode is not None: - writer_kwargs["schema_mode"] = self.write_config.schema_mode - # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause - # ingest to fail, even though all tasks are completed normally. 
Putting the writer into a - # process mitigates this issue by ensuring python interpreter waits properly for deltalake's - # rust backend to finish - writer = Process( - target=write_deltalake, - kwargs=writer_kwargs, - ) - writer.start() - writer.join() diff --git a/unstructured/ingest/connector/discord.py b/unstructured/ingest/connector/discord.py deleted file mode 100644 index bfbfc8fbd..000000000 --- a/unstructured/ingest/connector/discord.py +++ /dev/null @@ -1,180 +0,0 @@ -import datetime as dt -import typing as t -from dataclasses import dataclass -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import ( - requires_dependencies, -) - - -@dataclass -class DiscordAccessConfig(AccessConfig): - token: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleDiscordConfig(BaseConnectorConfig): - """Connector config where channels is a comma separated list of - Discord channels to pull messages from. - """ - - # Discord Specific Options - access_config: DiscordAccessConfig - channels: t.List[str] - period: t.Optional[int] = None - - -@dataclass -class DiscordIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. 
- """ - - connector_config: SimpleDiscordConfig - channel: str - days: t.Optional[int] = None - registry_name: str = "discord" - - # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file - # __post_init__ for multiprocessing simplicity (no Path objects in initially - # instantiated object) - def _tmp_download_file(self): - channel_file = self.channel + ".txt" - return Path(self.read_config.download_dir) / channel_file - - @property - def _output_filename(self): - output_file = self.channel + ".json" - return Path(self.processor_config.output_dir) / output_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @SourceConnectionNetworkError.wrap - @requires_dependencies(dependencies=["discord"], extras="discord") - def _get_messages(self): - """Actually fetches the data from discord.""" - import discord - from discord.ext import commands - - messages: t.List[discord.Message] = [] - jumpurl: t.List[str] = [] - intents = discord.Intents.default() - intents.message_content = True - bot = commands.Bot(command_prefix=">", intents=intents) - - @bot.event - async def on_ready(): - try: - after_date = None - if self.days: - after_date = dt.datetime.utcnow() - dt.timedelta(days=self.days) - channel = bot.get_channel(int(self.channel)) - jumpurl.append(channel.jump_url) # type: ignore - async for msg in channel.history(after=after_date): # type: ignore - messages.append(msg) - await bot.close() - except Exception: - logger.error("Error fetching messages") - await bot.close() - raise - - bot.run(self.connector_config.access_config.token) - jump_url = None if len(jumpurl) < 1 else jumpurl[0] - return messages, jump_url - - def update_source_metadata(self, **kwargs): - messages, jump_url = kwargs.get("messages_tuple", self._get_messages()) - if messages == []: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - dates = [m.created_at for m in messages if m.created_at] - dates.sort() - self.source_metadata = SourceMetadata( - date_created=dates[0].isoformat(), - date_modified=dates[-1].isoformat(), - source_url=jump_url, - exists=True, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - self._create_full_tmp_dir_path() - - messages, jump_url = self._get_messages() - self.update_source_metadata(messages_tuple=(messages, jump_url)) - if messages == []: - raise ValueError(f"Failed to retrieve messages from Discord channel {self.channel}") - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - with open(self._tmp_download_file(), "w") as f: - for m in messages: - f.write(m.content + "\n") - - @property - def filename(self): - """The filename of the file created from a discord channel""" - return self._tmp_download_file() - - @property - def version(self) -> t.Optional[str]: - return None - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "channel": self.channel, - } - - -class DiscordSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Objects of this class support fetching document(s) from""" - - connector_config: SimpleDiscordConfig - - def initialize(self): - pass - - @requires_dependencies(dependencies=["discord"], extras="discord") - def check_connection(self): - import asyncio - - import discord - from discord.client import Client - - intents = discord.Intents.default() - try: - client = Client(intents=intents) - 
asyncio.run(client.start(token=self.connector_config.access_config.token)) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def get_ingest_docs(self): - return [ - DiscordIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - channel=channel, - days=self.connector_config.period, - ) - for channel in self.connector_config.channels - ] diff --git a/unstructured/ingest/connector/elasticsearch.py b/unstructured/ingest/connector/elasticsearch.py deleted file mode 100644 index aa8ff1d9e..000000000 --- a/unstructured/ingest/connector/elasticsearch.py +++ /dev/null @@ -1,397 +0,0 @@ -import copy -import hashlib -import typing as t -import uuid -from dataclasses import dataclass, field -from pathlib import Path - -from dataclasses_json.core import Json - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - BaseIngestDocBatch, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.data_prep import generator_batching_wbytes -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from elasticsearch import Elasticsearch - - -@dataclass -class ElasticsearchAccessConfig(AccessConfig): - hosts: t.Optional[t.List[str]] = None - username: t.Optional[str] = None - password: t.Optional[str] = enhanced_field(default=None, sensitive=True) - cloud_id: t.Optional[str] = None - api_key: t.Optional[str] = enhanced_field( - default=None, sensitive=True, overload_name="es_api_key" - ) - api_key_id: t.Optional[str] = None - bearer_auth: t.Optional[str] = enhanced_field(default=None, sensitive=True) - ca_certs: t.Optional[str] = None - ssl_assert_fingerprint: t.Optional[str] = enhanced_field(default=None, sensitive=True) - - def to_dict(self, **kwargs) -> t.Dict[str, Json]: - d = super().to_dict(**kwargs) - # Update auth related fields to conform to what the SDK expects based on the - # supported methods: - # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html - if not self.ca_certs: - # ES library already sets a default for this, don't want to - # introduce data by setting it to None - d.pop("ca_certs") - if self.password and (self.cloud_id or self.ca_certs or self.ssl_assert_fingerprint): - d.pop("password") - d["basic_auth"] = ("elastic", self.password) - elif not self.cloud_id and self.username and self.password: - d.pop("username", None) - d.pop("password", None) - d["basic_auth"] = (self.username, self.password) - elif self.api_key and self.api_key_id: - d.pop("api_key_id", None) - d.pop("api_key", None) - d["api_key"] = (self.api_key_id, self.api_key) - # This doesn't exist on the client init, remove: - d.pop("api_key_id", None) - return d - - -@dataclass -class SimpleElasticsearchConfig(BaseConnectorConfig): - """Connector config where: - url is the url to access the elasticsearch server, - index_name is the name of the index to reach to, - """ - - index_name: str - batch_size: 
int = 100 - fields: t.List[str] = field(default_factory=list) - access_config: ElasticsearchAccessConfig = None - - -@dataclass -class ElasticsearchDocumentMeta: - """Metadata specifying: - name of the elasticsearch index that is being reached to, - and the id of document that is being reached to, - """ - - index_name: str - document_id: str - - -@dataclass -class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Current implementation creates a python Elasticsearch client to fetch each doc, - rather than creating a client for each thread. - """ - - connector_config: SimpleElasticsearchConfig - document_meta: ElasticsearchDocumentMeta - document: dict = field(default_factory=dict) - registry_name: str = "elasticsearch" - - # TODO: remove one of filename or _tmp_download_file, using a wrapper - @property - def filename(self): - f = self.document_meta.document_id - if self.connector_config.fields: - f = "{}-{}".format( - f, - hashlib.sha256(",".join(self.connector_config.fields).encode()).hexdigest()[:8], - ) - return ( - Path(self.read_config.download_dir) / self.document_meta.index_name / f"{f}.txt" - ).resolve() - - @property - def _output_filename(self): - """Create filename document id combined with a hash of the query to uniquely identify - the output file.""" - # Generate SHA256 hash and take the first 8 characters - filename = self.document_meta.document_id - if self.connector_config.fields: - filename = "{}-{}".format( - filename, - hashlib.sha256(",".join(self.connector_config.fields).encode()).hexdigest()[:8], - ) - output_file = f"{filename}.json" - return ( - Path(self.processor_config.output_dir) / self.connector_config.index_name / output_file - ) - - def update_source_metadata(self, **kwargs): - if self.document is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - self.source_metadata = SourceMetadata( - version=self.document["_version"], - exists=True, - ) - - @SourceConnectionError.wrap - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - pass - - @property - def date_created(self) -> t.Optional[str]: - return None - - @property - def date_modified(self) -> t.Optional[str]: - return None - - @property - def source_url(self) -> t.Optional[str]: - return None - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "hosts": self.connector_config.access_config.hosts, - "index_name": self.connector_config.index_name, - "document_id": self.document_meta.document_id, - } - - -@dataclass -class ElasticsearchIngestDocBatch(BaseIngestDocBatch): - connector_config: SimpleElasticsearchConfig - ingest_docs: t.List[ElasticsearchIngestDoc] = field(default_factory=list) - list_of_ids: t.List[str] = field(default_factory=list) - registry_name: str = "elasticsearch_batch" - - def __post_init__(self): - # Until python3.8 is deprecated, this is a limitation of dataclass inheritance - # to make it a required field - if len(self.list_of_ids) == 0: - raise ValueError("list_of_ids is required") - - @property - def unique_id(self) -> str: - return ",".join(sorted(self.list_of_ids)) - - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def _get_docs(self): - from elasticsearch import Elasticsearch - from elasticsearch.helpers import scan - - es = 
Elasticsearch(**self.connector_config.access_config.to_dict(apply_name_overload=False)) - scan_query = { - "_source": self.connector_config.fields, - "version": True, - "query": {"ids": {"values": self.list_of_ids}}, - } - - result = scan( - es, - query=scan_query, - scroll="1m", - index=self.connector_config.index_name, - ) - return list(result) - - @SourceConnectionError.wrap - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def get_files(self): - documents = self._get_docs() - for doc in documents: - ingest_doc = ElasticsearchIngestDoc( - processor_config=self.processor_config, - read_config=self.read_config, - connector_config=self.connector_config, - document=doc, - document_meta=ElasticsearchDocumentMeta( - self.connector_config.index_name, doc["_id"] - ), - ) - ingest_doc.update_source_metadata() - doc_body = doc["_source"] - filename = ingest_doc.filename - flattened_dict = flatten_dict(dictionary=doc_body) - str_values = [str(value) for value in flattened_dict.values()] - concatenated_values = "\n".join(str_values) - - filename.parent.mkdir(parents=True, exist_ok=True) - with open(filename, "w", encoding="utf8") as f: - f.write(concatenated_values) - self.ingest_docs.append(ingest_doc) - - -@dataclass -class ElasticsearchSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Fetches particular fields from all documents in a given elasticsearch cluster and index""" - - connector_config: SimpleElasticsearchConfig - _es: t.Optional["Elasticsearch"] = field(init=False, default=None) - - @property - def es(self): - from elasticsearch import Elasticsearch - - if self._es is None: - self._es = Elasticsearch( - **self.connector_config.access_config.to_dict(apply_name_overload=False) - ) - return self._es - - def check_connection(self): - try: - self.es.perform_request("HEAD", "/", headers={"accept": "application/json"}) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def __post_init__(self): - self.scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}} - - def initialize(self): - pass - - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def _get_doc_ids(self): - """Fetches all document ids in an index""" - from elasticsearch.helpers import scan - - hits = scan( - self.es, - query=self.scan_query, - scroll="1m", - index=self.connector_config.index_name, - ) - - return [hit["_id"] for hit in hits] - - def get_ingest_docs(self): - """Fetches all documents in an index, using ids that are fetched with _get_doc_ids""" - ids = self._get_doc_ids() - id_batches = [ - ids[ - i - * self.connector_config.batch_size : (i + 1) # noqa - * self.connector_config.batch_size - ] - for i in range( - (len(ids) + self.connector_config.batch_size - 1) - // self.connector_config.batch_size - ) - ] - return [ - ElasticsearchIngestDocBatch( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - list_of_ids=batched_ids, - ) - for batched_ids in id_batches - ] - - -@dataclass -class ElasticsearchWriteConfig(WriteConfig): - batch_size_bytes: int = 15_000_000 - num_processes: int = 1 - - -@dataclass -class ElasticsearchDestinationConnector(BaseDestinationConnector): - write_config: ElasticsearchWriteConfig - connector_config: SimpleElasticsearchConfig - _client: t.Optional["Elasticsearch"] = field(init=False, default=None) - - def to_dict(self, **kwargs): - """ - The 
_client variable in this dataclass breaks deepcopy due to: - TypeError: cannot pickle '_thread.lock' object - When serializing, remove it, meaning client data will need to be reinitialized - when deserialized - """ - self_cp = copy.copy(self) - if hasattr(self_cp, "_client"): - setattr(self_cp, "_client", None) - return _asdict(self_cp, **kwargs) - - @DestinationConnectionError.wrap - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def generate_client(self) -> "Elasticsearch": - from elasticsearch import Elasticsearch - - return Elasticsearch( - **self.connector_config.access_config.to_dict(apply_name_overload=False) - ) - - @property - def client(self): - if self._client is None: - self._client = self.generate_client() - return self._client - - def initialize(self): - _ = self.client - - @DestinationConnectionError.wrap - def check_connection(self): - try: - assert self.client.ping() - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info( - f"writing document batches to destination" - f" index named {self.connector_config.index_name}" - f" at {self.connector_config.access_config.hosts}" - f" with batch size (in bytes) {self.write_config.batch_size_bytes}" - f" with {self.write_config.num_processes} (number of) processes" - ) - from elasticsearch.helpers import parallel_bulk - - for batch in generator_batching_wbytes( - elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes - ): - for success, info in parallel_bulk( - self.client, batch, thread_count=self.write_config.num_processes - ): - if not success: - logger.error( - "upload failed for a batch in elasticsearch destination connector:", info - ) - - def normalize_dict(self, element_dict: dict) -> dict: - return { - "_index": self.connector_config.index_name, - "_id": str(uuid.uuid4()), - "_source": { - "element_id": element_dict.pop("element_id", None), - "embeddings": element_dict.pop("embeddings", None), - "text": element_dict.pop("text", None), - "type": element_dict.pop("type", None), - "metadata": flatten_dict( - element_dict.pop("metadata", None), - separator="-", - ), - }, - } diff --git a/unstructured/ingest/connector/fsspec/__init__.py b/unstructured/ingest/connector/fsspec/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/connector/fsspec/azure.py b/unstructured/ingest/connector/fsspec/azure.py deleted file mode 100644 index 169cda6a0..000000000 --- a/unstructured/ingest/connector/fsspec/azure.py +++ /dev/null @@ -1,78 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecDestinationConnector, - FsspecIngestDoc, - FsspecSourceConnector, - FsspecWriteConfig, - SimpleFsspecConfig, - WriteTextConfig, -) -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError -from unstructured.ingest.interfaces import AccessConfig -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -@dataclass -class AzureWriteTextConfig(WriteTextConfig): - overwrite: bool = False - - -@dataclass -class AzureWriteConfig(FsspecWriteConfig): - write_text_config: 
t.Optional[AzureWriteTextConfig] = None - - -@dataclass -class AzureAccessConfig(AccessConfig): - account_name: t.Optional[str] = enhanced_field(default=None, sensitive=True) - account_key: t.Optional[str] = enhanced_field(default=None, sensitive=True) - connection_string: t.Optional[str] = enhanced_field(default=None, sensitive=True) - sas_token: t.Optional[str] = enhanced_field(default=None, sensitive=True) - - -@dataclass -class SimpleAzureBlobStorageConfig(SimpleFsspecConfig): - access_config: AzureAccessConfig = None - - -@dataclass -class AzureBlobStorageIngestDoc(FsspecIngestDoc): - connector_config: SimpleAzureBlobStorageConfig - registry_name: str = "azure" - - @SourceConnectionError.wrap - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - def get_file(self): - super().get_file() - - -@dataclass -class AzureBlobStorageSourceConnector(FsspecSourceConnector): - connector_config: SimpleAzureBlobStorageConfig - - def __post_init__(self): - self.ingest_doc_cls: t.Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc - - -@dataclass -class AzureBlobStorageDestinationConnector(FsspecDestinationConnector): - connector_config: SimpleAzureBlobStorageConfig - write_config: AzureWriteConfig - - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - def initialize(self): - super().initialize() - - @requires_dependencies(["adlfs"], extras="azure") - def check_connection(self): - from adlfs import AzureBlobFileSystem - - try: - AzureBlobFileSystem(**self.connector_config.get_access_config()) - except ValueError as connection_error: - logger.error(f"failed to validate connection: {connection_error}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {connection_error}") diff --git a/unstructured/ingest/connector/fsspec/box.py b/unstructured/ingest/connector/fsspec/box.py deleted file mode 100644 index 67a56fa69..000000000 --- a/unstructured/ingest/connector/fsspec/box.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -Box Connector -Box does not make it simple to download files with an App. -First of all, this does not work with a free Box account. -Make sure the App service email is a collaborator for your folder (co-owner or editor) -Make sure you have the 'write all files' application scope -Maybe check 'Make api calls as the as-user header' -REAUTHORIZE app after making any of the above changes -""" - -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecDestinationConnector, - FsspecIngestDoc, - FsspecSourceConnector, - FsspecWriteConfig, - SimpleFsspecConfig, -) -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError -from unstructured.ingest.interfaces import AccessConfig -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -class AccessTokenError(Exception): - """There is a problem with the Access Token.""" - - -@dataclass -class BoxWriteConfig(FsspecWriteConfig): - pass - - -@dataclass -class BoxAccessConfig(AccessConfig): - box_app_config: t.Optional[str] = None - - -@dataclass -class SimpleBoxConfig(SimpleFsspecConfig): - access_config: BoxAccessConfig = None - - @requires_dependencies(["boxfs"], extras="box") - def get_access_config(self) -> dict: - # Return access_kwargs with oauth. The oauth object can not be stored directly in the config - # because it is not serializable. 
- from boxsdk import JWTAuth - - access_kwargs_with_oauth: dict[str, t.Any] = { - "oauth": JWTAuth.from_settings_file( - self.access_config.box_app_config, - ), - } - access_config: dict[str, t.Any] = self.access_config.to_dict() - access_config.pop("box_app_config", None) - access_kwargs_with_oauth.update(access_config) - - return access_kwargs_with_oauth - - -@dataclass -class BoxIngestDoc(FsspecIngestDoc): - connector_config: SimpleBoxConfig - registry_name: str = "box" - - @SourceConnectionError.wrap - @requires_dependencies(["boxfs", "fsspec"], extras="box") - def get_file(self): - super().get_file() - - -@dataclass -class BoxSourceConnector(FsspecSourceConnector): - connector_config: SimpleBoxConfig - - @requires_dependencies(["boxfs"], extras="box") - def check_connection(self): - from boxfs import BoxFileSystem - - try: - BoxFileSystem(**self.connector_config.get_access_config()) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def __post_init__(self): - self.ingest_doc_cls: t.Type[BoxIngestDoc] = BoxIngestDoc - - -@dataclass -class BoxDestinationConnector(FsspecDestinationConnector): - connector_config: SimpleBoxConfig - write_config: BoxWriteConfig - - @requires_dependencies(["boxfs", "fsspec"], extras="box") - def initialize(self): - super().initialize() - - @requires_dependencies(["boxfs"], extras="box") - def check_connection(self): - from boxfs import BoxFileSystem - - try: - BoxFileSystem(**self.connector_config.get_access_config()) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") diff --git a/unstructured/ingest/connector/fsspec/dropbox.py b/unstructured/ingest/connector/fsspec/dropbox.py deleted file mode 100644 index 23647bb6d..000000000 --- a/unstructured/ingest/connector/fsspec/dropbox.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Dropbox Connector -The Dropbox Connector presents a couple abnormal situations. -1) They don't have an unexpiring token -2) They require a forward slash `/` in front of the remote_file_path. This presents -some real problems creating paths. When appending a path that begins with a -forward slash to any path, whether using the / shorthand or joinpath, causes the -starting path to disappear. So the `/` needs to be stripped off. -3) To list and get files from the root directory Dropbox you need a ""," ", or " /" -""" - -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Type - -from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecDestinationConnector, - FsspecIngestDoc, - FsspecSourceConnector, - FsspecWriteConfig, - SimpleFsspecConfig, -) -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError -from unstructured.ingest.interfaces import AccessConfig -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -class MissingFolderError(Exception): - """There is no folder by that name. 
For root try `dropbox:// /`""" - - -@dataclass -class DropboxAccessConfig(AccessConfig): - token: str = enhanced_field(sensitive=True) - - -@dataclass -class DropboxWriteConfig(FsspecWriteConfig): - pass - - -@dataclass -class SimpleDropboxConfig(SimpleFsspecConfig): - access_config: DropboxAccessConfig = None - - -@dataclass -class DropboxIngestDoc(FsspecIngestDoc): - connector_config: SimpleDropboxConfig - registry_name: str = "dropbox" - - @SourceConnectionError.wrap - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - def get_file(self): - super().get_file() - - @property - def _output_filename(self): - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here. - # Dropbox uses an empty string `""`, or a space `" "`` or a `" /"` to list root - if self.connector_config.dir_path == " ": - return Path(self.processor_config.output_dir) / re.sub( - "^/", - "", - f"{self.remote_file_path}.json", - ) - else: - return ( - Path(self.processor_config.output_dir) - / f"{self.remote_file_path.replace(f'/{self.connector_config.dir_path}/', '')}.json" - ) - - def _tmp_download_file(self): - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here. - # Dropbox uses an empty string `""`, or a space `" "`` or a `" /"` to list root - download_dir: str = self.read_config.download_dir if self.read_config.download_dir else "" - if not download_dir: - return "" - if self.connector_config.dir_path == " ": - return Path(download_dir) / re.sub( - "^/", - "", - self.remote_file_path, - ) - else: - return Path(download_dir) / self.remote_file_path.replace( - f"/{self.connector_config.dir_path}/", - "", - ) - - -@dataclass -class DropboxSourceConnector(FsspecSourceConnector): - connector_config: SimpleDropboxConfig - - def __post_init__(self): - self.ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - def initialize(self): - from fsspec import AbstractFileSystem, get_filesystem_class - - try: - self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( - **self.connector_config.get_access_config(), - ) - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here. - ls_output = self.fs.ls(f"/{self.connector_config.path_without_protocol}") - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - if ls_output and len(ls_output) >= 1: - return - elif ls_output: - raise ValueError( - f"No objects found in {self.connector_config.remote_url}.", - ) - else: - raise MissingFolderError( - "There is no folder by that name. For root try `dropbox:// /`", - ) - - def _list_files(self): - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here. 
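For reference, a small standalone pathlib example of the leading-slash behavior described in the comments above (the paths are hypothetical):

    from pathlib import PurePosixPath

    base = PurePosixPath("/tmp/output")
    # Joining a segment that starts with "/" discards the base path entirely...
    print(base / "/folder/file.json")   # PurePosixPath('/folder/file.json')
    # ...which is why the connector strips the leading slash before joining.
    print(base / "folder/file.json")    # PurePosixPath('/tmp/output/folder/file.json')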
- if not self.connector_config.recursive: - # fs.ls does not walk directories - # directories that are listed in cloud storage can cause problems because they are seen - # as 0byte files - return [ - x.get("name") - for x in self.fs.ls( - f"/{self.connector_config.path_without_protocol}", - detail=True, - ) - if x.get("size") - ] - else: - # fs.find will recursively walk directories - # "size" is a common key for all the cloud protocols with fs - return [ - k - for k, v in self.fs.find( - f"/{self.connector_config.path_without_protocol}", - detail=True, - ).items() - if v.get("size") - ] - - -@dataclass -class DropboxDestinationConnector(FsspecDestinationConnector): - connector_config: SimpleFsspecConfig - write_config: DropboxWriteConfig diff --git a/unstructured/ingest/connector/fsspec/fsspec.py b/unstructured/ingest/connector/fsspec/fsspec.py deleted file mode 100644 index 1b60a1d87..000000000 --- a/unstructured/ingest/connector/fsspec/fsspec.py +++ /dev/null @@ -1,359 +0,0 @@ -import fnmatch -import json -import os -import typing as t -from abc import ABC -from contextlib import suppress -from dataclasses import dataclass -from pathlib import Path, PurePath - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.error import ( - DestinationConnectionError, - SourceConnectionError, - SourceConnectionNetworkError, -) -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseDestinationConnector, - BaseSingleIngestDoc, - BaseSourceConnector, - FsspecConfig, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.compression import ( - TAR_FILE_EXT, - ZIP_FILE_EXT, - CompressionSourceConnectorMixin, -) -from unstructured.utils import ( - requires_dependencies, -) - -SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [ - "s3", - "s3a", - "abfs", - "az", - "gs", - "gcs", - "box", - "dropbox", - "sftp", -] - - -@dataclass -class SimpleFsspecConfig(FsspecConfig, BaseConnectorConfig): - pass - - -@dataclass -class FsspecIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. 
- """ - - connector_config: SimpleFsspecConfig - remote_file_path: str - - def _tmp_download_file(self): - download_dir = self.read_config.download_dir if self.read_config.download_dir else "" - return Path(download_dir) / self.remote_file_path.replace( - f"{self.connector_config.dir_path}/", - "", - ) - - @property - def _output_filename(self): - # Dynamically parse filename , can change if remote path was pointing to the single - # file, a directory, or nested directory - if self.remote_file_path == self.connector_config.path_without_protocol: - file = self.remote_file_path.split("/")[-1] - filename = f"{file}.json" - else: - path_without_protocol = ( - self.connector_config.path_without_protocol - if self.connector_config.path_without_protocol.endswith("/") - else f"{self.connector_config.path_without_protocol}/" - ) - filename = f"{self.remote_file_path.replace(path_without_protocol, '')}.json" - return Path(self.processor_config.output_dir) / filename - - def _create_full_tmp_dir_path(self): - """Includes "directories" in the object path""" - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the file from the current filesystem and stores it locally.""" - from fsspec import AbstractFileSystem, get_filesystem_class - - self._create_full_tmp_dir_path() - fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( - **self.connector_config.get_access_config(), - ) - self._get_file(fs=fs) - fs.get(rpath=self.remote_file_path, lpath=self._tmp_download_file().as_posix()) - self.update_source_metadata() - - @SourceConnectionNetworkError.wrap - def _get_file(self, fs): - fs.get(rpath=self.remote_file_path, lpath=self._tmp_download_file().as_posix()) - - @requires_dependencies(["fsspec"]) - def update_source_metadata(self): - from fsspec import AbstractFileSystem, get_filesystem_class - - fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( - **self.connector_config.get_access_config(), - ) - - date_created = None - with suppress(NotImplementedError): - date_created = fs.created(self.remote_file_path).isoformat() - - date_modified = None - with suppress(NotImplementedError): - date_modified = fs.modified(self.remote_file_path).isoformat() - - version = ( - fs.checksum(self.remote_file_path) - if self.connector_config.protocol != "gs" - else fs.info(self.remote_file_path).get("etag", "") - ) - file_exists = fs.exists(self.remote_file_path) - self.source_metadata = SourceMetadata( - date_created=date_created, - date_modified=date_modified, - version=str(version), - source_url=f"{self.connector_config.protocol}://{self.remote_file_path}", - exists=file_exists, - ) - - @property - def filename(self): - """The filename of the file after downloading from cloud""" - return self._tmp_download_file() - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - """Returns the equivalent of ls in dict""" - return { - "protocol": self.connector_config.protocol, - "remote_file_path": self.remote_file_path, - } - - -@dataclass -class FsspecSourceConnector( - SourceConnectorCleanupMixin, - CompressionSourceConnectorMixin, - BaseSourceConnector, -): - """Objects of this class support fetching document(s) from""" - - connector_config: SimpleFsspecConfig - - def check_connection(self): - from fsspec import get_filesystem_class - - try: - fs = get_filesystem_class(self.connector_config.protocol)( - 
**self.connector_config.get_access_config(), - ) - fs.ls(path=self.connector_config.path_without_protocol, detail=False) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def __post_init__(self): - self.ingest_doc_cls: t.Type[FsspecIngestDoc] = FsspecIngestDoc - - def initialize(self): - from fsspec import AbstractFileSystem, get_filesystem_class - - self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( - **self.connector_config.get_access_config(), - ) - - """Verify that we can get metadata for an object; this validates the connection info.""" - ls_output = self.fs.ls(self.connector_config.path_without_protocol, detail=False) - if len(ls_output) < 1: - raise ValueError( - f"No objects found in {self.connector_config.remote_url}.", - ) - - def _list_files(self): - if not self.connector_config.recursive: - # fs.ls does not walk directories - # directories that are listed in cloud storage can cause problems - # because they are seen as 0 byte files - return [ - x.get("name") - for x in self.fs.ls(self.connector_config.path_without_protocol, detail=True) - if x.get("size") > 0 - ] - else: - # fs.find will recursively walk directories - # "size" is a common key for all the cloud protocols with fs - return [ - k - for k, v in self.fs.find( - self.connector_config.path_without_protocol, - detail=True, - ).items() - if v.get("size") > 0 - ] - - def does_path_match_glob(self, path: str) -> bool: - if self.connector_config.file_glob is None: - return True - patterns = self.connector_config.file_glob - for pattern in patterns: - if fnmatch.filter([path], pattern): - return True - logger.debug(f"The file {path!r} is discarded as it does not match any given glob.") - return False - - def get_ingest_docs(self): - raw_files = self._list_files() - # If glob filters are provided, use them to filter filepaths - files = [f for f in raw_files if self.does_path_match_glob(f)] - # remove compressed files - compressed_file_ext = TAR_FILE_EXT + ZIP_FILE_EXT - compressed_files = [] - uncompressed_files = [] - docs: t.List[BaseSingleIngestDoc] = [] - for file in files: - if any(file.endswith(ext) for ext in compressed_file_ext): - compressed_files.append(file) - else: - uncompressed_files.append(file) - docs.extend( - [ - self.ingest_doc_cls( - read_config=self.read_config, - connector_config=self.connector_config, - processor_config=self.processor_config, - remote_file_path=file, - ) - for file in uncompressed_files - ], - ) - if not self.connector_config.uncompress: - return docs - for compressed_file in compressed_files: - compressed_doc = self.ingest_doc_cls( - read_config=self.read_config, - processor_config=self.processor_config, - connector_config=self.connector_config, - remote_file_path=compressed_file, - ) - try: - local_ingest_docs = self.process_compressed_doc(doc=compressed_doc) - logger.info(f"adding {len(local_ingest_docs)} from {compressed_file}") - docs.extend(local_ingest_docs) - finally: - compressed_doc.cleanup_file() - return docs - - -@dataclass -class WriteTextConfig(EnhancedDataClassJsonMixin, ABC): - pass - - -@dataclass -class FsspecWriteConfig(WriteConfig): - write_text_config: t.Optional[WriteTextConfig] = None - - def get_write_text_config(self) -> t.Dict[str, t.Any]: - if write_text_kwargs := self.write_text_config: - return write_text_kwargs.to_dict() - return {} - - -@dataclass -class FsspecDestinationConnector(BaseDestinationConnector): -
connector_config: SimpleFsspecConfig - write_config: FsspecWriteConfig - - def initialize(self): - from fsspec import AbstractFileSystem, get_filesystem_class - - self.fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( - **self.connector_config.get_access_config(), - ) - self.check_connection() - - def check_connection(self): - from fsspec import AbstractFileSystem, get_filesystem_class - - try: - fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( - **self.connector_config.get_access_config(), - ) - - # e.g. Dropbox path starts with / - bucket_name = "/" if self.connector_config.path_without_protocol.startswith("/") else "" - bucket_name += self.connector_config.dir_path.split("/")[0] - - logger.info(f"checking connection for destination {bucket_name}") - fs.ls(path=bucket_name, detail=False) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def write_dict( - self, - *args, - elements_dict: t.List[t.Dict[str, t.Any]], - filename: t.Optional[str] = None, - indent: int = 4, - encoding: str = "utf-8", - **kwargs, - ) -> None: - from fsspec import AbstractFileSystem, get_filesystem_class - - fs: AbstractFileSystem = get_filesystem_class(self.connector_config.protocol)( - **self.connector_config.get_access_config(), - ) - - logger.info(f"Writing content using filesystem: {type(fs).__name__}") - - output_folder = self.connector_config.path_without_protocol - output_folder = os.path.join(output_folder) # Make sure folder ends with a file separator - filename = ( - filename.strip(os.sep) if filename else filename - ) # Make sure filename doesn't begin with a file separator - output_path = str(PurePath(output_folder, filename)) if filename else output_folder - full_output_path = f"{self.connector_config.protocol}://{output_path}" - logger.debug(f"uploading content to {full_output_path}") - write_text_configs = self.write_config.get_write_text_config() if self.write_config else {} - fs.write_text( - full_output_path, - json.dumps(elements_dict, indent=indent), - encoding=encoding, - **write_text_configs, - ) - - def get_elements_dict(self, docs: t.List[BaseSingleIngestDoc]) -> t.List[t.Dict[str, t.Any]]: - pass - - def write(self, docs: t.List[BaseSingleIngestDoc]) -> None: - for doc in docs: - file_path = doc.base_output_filename - filename = file_path if file_path else None - with open(doc._output_filename) as json_file: - logger.debug(f"uploading content from {doc._output_filename}") - json_list = json.load(json_file) - self.write_dict(elements_dict=json_list, filename=filename) diff --git a/unstructured/ingest/connector/fsspec/gcs.py b/unstructured/ingest/connector/fsspec/gcs.py deleted file mode 100644 index db5b0de44..000000000 --- a/unstructured/ingest/connector/fsspec/gcs.py +++ /dev/null @@ -1,82 +0,0 @@ -import typing as t -from dataclasses import dataclass -from pathlib import Path -from typing import Type - -from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecDestinationConnector, - FsspecIngestDoc, - FsspecSourceConnector, - FsspecWriteConfig, - SimpleFsspecConfig, -) -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError -from unstructured.ingest.interfaces import AccessConfig -from unstructured.ingest.utils.string_and_date_utils import json_to_dict -from unstructured.utils import requires_dependencies - - -@dataclass -class
GcsAccessConfig(AccessConfig): - token: t.Optional[str] = enhanced_field( - default=None, sensitive=True, overload_name="service_account_key" - ) - - def __post_init__(self): - ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud" - - # Case: null value - if not self.token: - return - # Case: one of auth constants - if self.token in ALLOWED_AUTH_VALUES: - return - # Case: token as json - if isinstance(json_to_dict(self.token), dict): - self.token = json_to_dict(self.token) - return - # Case: path to token - if Path(self.token).is_file(): - return - - raise ValueError("Invalid auth token value") - - -@dataclass -class GcsWriteConfig(FsspecWriteConfig): - pass - - -@dataclass -class SimpleGcsConfig(SimpleFsspecConfig): - access_config: GcsAccessConfig = None - - -@dataclass -class GcsIngestDoc(FsspecIngestDoc): - connector_config: SimpleGcsConfig - registry_name: str = "gcs" - - @SourceConnectionError.wrap - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - def get_file(self): - super().get_file() - - -@dataclass -class GcsSourceConnector(FsspecSourceConnector): - connector_config: SimpleGcsConfig - - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - def initialize(self): - super().initialize() - - def __post_init__(self): - self.ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc - - -@dataclass -class GcsDestinationConnector(FsspecDestinationConnector): - connector_config: SimpleGcsConfig - write_config: GcsWriteConfig diff --git a/unstructured/ingest/connector/fsspec/s3.py b/unstructured/ingest/connector/fsspec/s3.py deleted file mode 100644 index 799276a27..000000000 --- a/unstructured/ingest/connector/fsspec/s3.py +++ /dev/null @@ -1,62 +0,0 @@ -import typing as t -from dataclasses import dataclass -from typing import Type - -from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecDestinationConnector, - FsspecIngestDoc, - FsspecSourceConnector, - FsspecWriteConfig, - SimpleFsspecConfig, -) -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.interfaces import AccessConfig -from unstructured.utils import requires_dependencies - - -@dataclass -class S3AccessConfig(AccessConfig): - anon: bool = enhanced_field(default=False, overload_name="anonymous") - endpoint_url: t.Optional[str] = None - key: t.Optional[str] = enhanced_field(default=None, sensitive=True) - secret: t.Optional[str] = enhanced_field(default=None, sensitive=True) - token: t.Optional[str] = enhanced_field(default=None, sensitive=True) - - -@dataclass -class S3WriteConfig(FsspecWriteConfig): - pass - - -@dataclass -class SimpleS3Config(SimpleFsspecConfig): - access_config: S3AccessConfig = enhanced_field(default=None) - - -@dataclass -class S3IngestDoc(FsspecIngestDoc): - connector_config: SimpleS3Config - remote_file_path: str - registry_name: str = "s3" - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - def get_file(self): - super().get_file() - - -@dataclass -class S3SourceConnector(FsspecSourceConnector): - connector_config: SimpleS3Config - - def __post_init__(self): - self.ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc - - -@dataclass -class S3DestinationConnector(FsspecDestinationConnector): - connector_config: SimpleS3Config - write_config: S3WriteConfig - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - def initialize(self): - super().initialize() diff --git a/unstructured/ingest/connector/fsspec/sftp.py b/unstructured/ingest/connector/fsspec/sftp.py deleted file mode 100644 index f179fc233..000000000 --- 
a/unstructured/ingest/connector/fsspec/sftp.py +++ /dev/null @@ -1,81 +0,0 @@ -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Type -from urllib.parse import urlparse - -from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecIngestDoc, - FsspecSourceConnector, - SimpleFsspecConfig, -) -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError -from unstructured.ingest.interfaces import AccessConfig -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -@dataclass -class SftpAccessConfig(AccessConfig): - username: str - password: str = enhanced_field(sensitive=True) - host: str = "" - port: int = 22 - look_for_keys: bool = False - allow_agent: bool = False - - -@dataclass -class SimpleSftpConfig(SimpleFsspecConfig): - access_config: SftpAccessConfig = None - - def __post_init__(self): - super().__post_init__() - - _, ext = os.path.splitext(self.remote_url) - parsed_url = urlparse(self.remote_url) - if ext: - # We only want the file_path if it has an extension - self.file_path = Path(self.remote_url).name - self.dir_path = Path(parsed_url.path).parent.as_posix().lstrip("/") - self.path_without_protocol = self.dir_path - else: - self.file_path = "" - self.dir_path = parsed_url.path.lstrip("/") - self.path_without_protocol = self.dir_path - self.access_config.host = parsed_url.hostname or self.access_config.host - self.access_config.port = parsed_url.port or self.access_config.port - - -@dataclass -class SftpIngestDoc(FsspecIngestDoc): - connector_config: SimpleSftpConfig - registry_name: str = "sftp" - - @SourceConnectionError.wrap - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def get_file(self): - super().get_file() - - -@dataclass -class SftpSourceConnector(FsspecSourceConnector): - connector_config: SimpleSftpConfig - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def initialize(self): - super().initialize() - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def check_connection(self): - from fsspec.implementations.sftp import SFTPFileSystem - - try: - SFTPFileSystem(**self.connector_config.get_access_config()) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def __post_init__(self): - self.ingest_doc_cls: Type[SftpIngestDoc] = SftpIngestDoc diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py deleted file mode 100644 index e03b6f4e7..000000000 --- a/unstructured/ingest/connector/git.py +++ /dev/null @@ -1,124 +0,0 @@ -import fnmatch -import typing as t -from dataclasses import dataclass, field -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, -) -from unstructured.ingest.logger import logger - - -@dataclass -class GitAccessConfig(AccessConfig): - access_token: t.Optional[str] = enhanced_field( - default=None, sensitive=True, overload_name="git_access_token" - ) - - -@dataclass -class SimpleGitConfig(BaseConnectorConfig): - url: str - access_config: GitAccessConfig - branch: t.Optional[str] = enhanced_field(default=None, 
overload_name="git_branch") - file_glob: t.Optional[t.List[str]] = enhanced_field(default=None, overload_name="git_file_glob") - repo_path: str = field(init=False, repr=False) - - -@dataclass -class GitIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleGitConfig = field(repr=False) - path: str - - @property - def filename(self): - return (Path(self.read_config.download_dir) / self.path).resolve() - - @property - def _output_filename(self): - return Path(self.processor_config.output_dir) / f"{self.path}.json" - - @property - def record_locator(self) -> t.Dict[str, t.Any]: - record_locator = { - "repo_path": self.connector_config.repo_path, - "file_path": self.path, - } - if self.connector_config.branch is not None: - record_locator["branch"] = self.connector_config.branch - return record_locator - - def _create_full_tmp_dir_path(self): - """includes directories in the gitlab repository""" - self.filename.parent.mkdir(parents=True, exist_ok=True) - - def update_source_metadata(self, **kwargs): - raise NotImplementedError() - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - self._create_full_tmp_dir_path() - self._fetch_and_write() - - def _fetch_content(self) -> None: - raise NotImplementedError() - - def _fetch_and_write(self) -> None: - raise NotImplementedError() - - -@dataclass -class GitSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleGitConfig - - def initialize(self): - pass - - def check_connection(self): - pass - - @staticmethod - def is_file_type_supported(path: str) -> bool: - # Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files - # TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc.
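For reference, a standalone sketch of the fnmatch-based glob filtering used by does_path_match_glob below (and by the fsspec connector above); the patterns and paths are hypothetical:

    import fnmatch

    patterns = ["*.pdf", "docs/*.md"]  # hypothetical file_glob values
    paths = ["docs/readme.md", "src/app.py", "report.pdf"]
    # fnmatch.filter returns the subset of paths matching a pattern;
    # a non-empty result means the path passes the glob filter.
    matched = [p for p in paths if any(fnmatch.filter([p], pat) for pat in patterns)]
    print(matched)  # ['docs/readme.md', 'report.pdf']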
- supported = path.endswith( - ( - ".md", - ".txt", - ".pdf", - ".doc", - ".docx", - ".eml", - ".heic", - ".html", - ".png", - ".jpg", - ".ppt", - ".pptx", - ".xml", - ), - ) - if not supported: - logger.debug( - f"The file {path!r} is discarded as it does not contain a supported filetype.", - ) - return supported - - def does_path_match_glob(self, path: str) -> bool: - if not self.connector_config.file_glob: - return True - patterns = self.connector_config.file_glob - for pattern in patterns: - if fnmatch.filter([path], pattern): - return True - logger.debug(f"The file {path!r} is discarded as it does not match any given glob.") - return False diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py deleted file mode 100644 index 2a63b8f32..000000000 --- a/unstructured/ingest/connector/github.py +++ /dev/null @@ -1,173 +0,0 @@ -import typing as t -from dataclasses import dataclass -from datetime import datetime -from urllib.parse import urlparse - -import requests - -from unstructured.ingest.connector.git import ( - GitIngestDoc, - GitSourceConnector, - SimpleGitConfig, -) -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import SourceMetadata -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from github.Repository import Repository - - -@dataclass -class SimpleGitHubConfig(SimpleGitConfig): - def __post_init__(self): - parsed_gh_url = urlparse(self.url) - path_fragments = [fragment for fragment in parsed_gh_url.path.split("/") if fragment] - - # If a scheme and netloc are provided, ensure they are correct - # Additionally, ensure that the path contains two fragments - if ( - (parsed_gh_url.scheme and parsed_gh_url.scheme != "https") - or (parsed_gh_url.netloc and parsed_gh_url.netloc != "github.com") - or len(path_fragments) != 2 - ): - raise ValueError( - 'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"' - ' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".', - ) - - # If there's no issues, store the core repository info - self.repo_path = parsed_gh_url.path - - @SourceConnectionError.wrap - @requires_dependencies(["github"], extras="github") - def get_repo(self) -> "Repository": - from github import Github - - github = Github(self.access_config.access_token) - return github.get_repo(self.repo_path) - - -@dataclass -class GitHubIngestDoc(GitIngestDoc): - connector_config: SimpleGitHubConfig - registry_name: str = "github" - - @property - def date_created(self) -> t.Optional[str]: - return None - - @requires_dependencies(["github"], extras="github") - def _fetch_file(self): - from github.GithubException import UnknownObjectException - - try: - content_file = self.connector_config.get_repo().get_contents(self.path) - except UnknownObjectException: - logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}") - return None - - return content_file - - @SourceConnectionNetworkError.wrap - def _fetch_content(self, content_file): - contents = b"" - if ( - not content_file.content # type: ignore - and content_file.encoding == "none" # type: ignore - and content_file.size # type: ignore - ): - logger.info("File too large for the GitHub API, using direct download link instead.") - # NOTE: Maybe add a raise_for_status to catch connection timeout or HTTP Errors? 
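A sketch of what the NOTE above could look like if implemented; this is a standalone example, not the shipped connector code, and the 30-second timeout is an arbitrary choice:

    import logging
    import typing as t

    import requests

    logger = logging.getLogger(__name__)

    def download_via_link(download_url: str) -> t.Optional[bytes]:
        try:
            # timeout guards against hung connections; raise_for_status turns
            # 4xx/5xx responses into an HTTPError instead of returning them silently
            response = requests.get(download_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:  # HTTPError, ConnectionError, Timeout, ...
            logger.info("Direct download link has failed (%s)... Skipping this file.", e)
            return None
        return response.content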
- response = requests.get(content_file.download_url) # type: ignore - if response.status_code != 200: - logger.info("Direct download link has failed... Skipping this file.") - return None - else: - contents = response.content - else: - contents = content_file.decoded_content # type: ignore - return contents - - def update_source_metadata(self, **kwargs): - content_file = kwargs.get("content_file", self._fetch_file()) - if content_file is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - - date_modified = datetime.strptime( - content_file.last_modified, - "%a, %d %b %Y %H:%M:%S %Z", - ).isoformat() - self.source_metadata = SourceMetadata( - date_modified=date_modified, - version=content_file.etag, - source_url=content_file.download_url, - exists=True, - ) - - def _fetch_and_write(self) -> None: - content_file = self._fetch_file() - self.update_source_metadata(content_file=content_file) - contents = self._fetch_content(content_file) - if contents is None: - raise ValueError( - f"Failed to retrieve file from repo " - f"{self.connector_config.url}/{self.path}. Check logs", - ) - with open(self.filename, "wb") as f: - f.write(contents) - - -@dataclass -class GitHubSourceConnector(GitSourceConnector): - connector_config: SimpleGitHubConfig - - @requires_dependencies(["github"], extras="github") - def check_connection(self): - from github import Consts - from github.GithubRetry import GithubRetry - from github.Requester import Requester - - try: - requester = Requester( - auth=self.connector_config.access_config.access_token, - base_url=Consts.DEFAULT_BASE_URL, - timeout=Consts.DEFAULT_TIMEOUT, - user_agent=Consts.DEFAULT_USER_AGENT, - per_page=Consts.DEFAULT_PER_PAGE, - verify=True, - retry=GithubRetry(), - pool_size=None, - ) - url_base = ( - "/repositories/" if isinstance(self.connector_config.repo_path, int) else "/repos/" - ) - url = f"{url_base}{self.connector_config.repo_path}" - headers, _ = requester.requestJsonAndCheck("HEAD", url) - logger.debug(f"headers from HEAD request: {headers}") - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def get_ingest_docs(self): - repo = self.connector_config.get_repo() - # Load the Git tree with all files, and then create Ingest docs - # for all blobs, i.e. 
all files, ignoring directories - sha = self.connector_config.branch or repo.default_branch - git_tree = repo.get_git_tree(sha, recursive=True) - return [ - GitHubIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - path=element.path, - ) - for element in git_tree.tree - if element.type == "blob" - and self.is_file_type_supported(element.path) - and (not self.connector_config.file_glob or self.does_path_match_glob(element.path)) - ] diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py deleted file mode 100644 index 1d1e6c5f8..000000000 --- a/unstructured/ingest/connector/gitlab.py +++ /dev/null @@ -1,142 +0,0 @@ -import typing as t -from dataclasses import dataclass -from urllib.parse import urlparse - -from unstructured.ingest.connector.git import ( - GitIngestDoc, - GitSourceConnector, - SimpleGitConfig, -) -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import SourceMetadata -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from gitlab.v4.objects.projects import Project - - -@dataclass -class SimpleGitlabConfig(SimpleGitConfig): - base_url: str = "https://gitlab.com" - - def __post_init__(self): - parsed_gh_url = urlparse(self.url) - # If a scheme or netloc are provided, use the parsed base url - if parsed_gh_url.scheme or parsed_gh_url.netloc: - self.base_url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}" - self.repo_path = parsed_gh_url.path - while self.repo_path.startswith("/"): - self.repo_path = self.repo_path[1:] - - @SourceConnectionError.wrap - @requires_dependencies(["gitlab"], extras="gitlab") - def get_project(self) -> "Project": - from gitlab import Gitlab - - gitlab = Gitlab(self.base_url, private_token=self.access_config.access_token) - return gitlab.projects.get(self.repo_path) - - -@dataclass -class GitLabIngestDoc(GitIngestDoc): - connector_config: SimpleGitlabConfig - registry_name: str = "gitlab" - - @property - def date_created(self) -> t.Optional[str]: - return None - - @property - def date_modified(self) -> t.Optional[str]: - return None - - @property - def source_url(self) -> t.Optional[str]: - return None - - @SourceConnectionNetworkError.wrap - @requires_dependencies(["gitlab"], extras="gitlab") - def _fetch_content(self): - from gitlab.exceptions import GitlabHttpError - - try: - project = self.connector_config.get_project() - content_file = project.files.get( - self.path, - ref=self.connector_config.branch or project.default_branch, - ) - except GitlabHttpError as e: - if e.response_code == 404: - logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}") - return None - raise - return content_file - - def update_source_metadata(self, **kwargs): - content_file = kwargs.get("content_file", self._fetch_content()) - if content_file is None: - self.source_metadata = SourceMetadata( - exists=None, - ) - return - self.source_metadata = SourceMetadata( - version=content_file.attributes.get("last_commit_id", ""), - exists=True, - ) - - def _fetch_and_write(self) -> None: - content_file = self._fetch_content() - self.update_source_metadata(content_file=content_file) - if content_file is None: - raise ValueError( - f"Failed to retrieve file from repo " - f"{self.connector_config.url}/{self.path}. 
Check logs.", - ) - contents = content_file.decode() - with open(self.filename, "wb") as f: - f.write(contents) - - -@dataclass -class GitLabSourceConnector(GitSourceConnector): - connector_config: SimpleGitlabConfig - - @requires_dependencies(["gitlab"], extras="gitlab") - def check_connection(self): - from gitlab import Gitlab - from gitlab.exceptions import GitlabError - - try: - gitlab = Gitlab( - self.connector_config.base_url, - private_token=self.connector_config.access_config.access_token, - ) - gitlab.auth() - except GitlabError as gitlab_error: - logger.error(f"failed to validate connection: {gitlab_error}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {gitlab_error}") - - def get_ingest_docs(self): - # Load the Git tree with all files, and then create Ingest docs - # for all blobs, i.e. all files, ignoring directories - project = self.connector_config.get_project() - ref = self.connector_config.branch or project.default_branch - git_tree = project.repository_tree( - ref=ref, - recursive=True, - iterator=True, - all=True, - ) - return [ - GitLabIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - path=element["path"], - ) - for element in git_tree - if element["type"] == "blob" - and self.is_file_type_supported(element["path"]) - and (not self.connector_config.file_glob or self.does_path_match_glob(element["path"])) - ] diff --git a/unstructured/ingest/connector/google_drive.py b/unstructured/ingest/connector/google_drive.py deleted file mode 100644 index e3b0f931c..000000000 --- a/unstructured/ingest/connector/google_drive.py +++ /dev/null @@ -1,348 +0,0 @@ -import io -import json -import os -import typing as t -from dataclasses import dataclass, field -from datetime import datetime -from mimetypes import guess_extension -from pathlib import Path - -from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSessionHandle, - BaseSingleIngestDoc, - BaseSourceConnector, - ConfigSessionHandleMixin, - IngestDocCleanupMixin, - IngestDocSessionHandleMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.string_and_date_utils import json_to_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from googleapiclient.discovery import Resource as GoogleAPIResource - from googleapiclient.http import MediaIoBaseDownload - -FILE_FORMAT = "{id}-{name}{ext}" -DIRECTORY_FORMAT = "{id}-{name}" - - -@dataclass -class GoogleDriveSessionHandle(BaseSessionHandle): - service: "GoogleAPIResource" - - -@requires_dependencies(["googleapiclient"], extras="google-drive") -def create_service_account_object(key_path: t.Union[str, dict], id=None): - """ - Creates a service object for interacting with Google Drive. - - Providing a drive id enforces a key validation process. - - Args: - key_path: Path to Google Drive service account json file. (or the actual json) - id: ID of a file on Google Drive. File has to be either publicly accessible or accessible - to the service account. 
- - Returns: - Service account object - """ - from google.auth import default, exceptions - from google.oauth2 import service_account - from googleapiclient.discovery import build - from googleapiclient.errors import HttpError - - # Service account key can be a dict or a file path(str) - # But the dict may come in as a string - key_path = json_to_dict(key_path) - - try: - if isinstance(key_path, dict): - creds = service_account.Credentials.from_service_account_info(key_path) - elif isinstance(key_path, str): - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path - creds, _ = default() - else: - raise ValueError( - f"key path not recognized as a dictionary or a file path: " - f"[{type(key_path)}] {key_path}", - ) - service = build("drive", "v3", credentials=creds) - - if id: - service.files().list( - spaces="drive", - fields="files(id)", - pageToken=None, - corpora="user", - q=f"'{id}' in parents", - ).execute() - - except HttpError as exc: - raise ValueError(f"{exc.reason}") - except exceptions.DefaultCredentialsError: - raise ValueError("The provided API key is invalid.") - - return service - - -@dataclass -class GoogleDriveAccessConfig(AccessConfig): - service_account_key: t.Union[str, dict] = enhanced_field(sensitive=True) - - -@dataclass -class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig): - """Connector config where drive_id is the id of the document to process or - the folder to process all documents from.""" - - # Google Drive Specific Options - drive_id: str - access_config: GoogleDriveAccessConfig - extension: t.Optional[str] = None - recursive: bool = False - - def create_session_handle( - self, - ) -> GoogleDriveSessionHandle: - service = create_service_account_object(self.access_config.service_account_key) - return GoogleDriveSessionHandle(service=service) - - -@dataclass -class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleGoogleDriveConfig - meta: t.Dict[str, str] = field(default_factory=dict) - registry_name: str = "google_drive" - - @property - def filename(self): - return Path(self.meta.get("download_filepath")).resolve() # type: ignore - - @property - def _output_filename(self): - return Path(f"{self.meta.get('output_filepath')}.json").resolve() - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "drive_id": self.connector_config.drive_id, - "file_id": self.meta["id"], - } - - @requires_dependencies(["googleapiclient"], extras="google-drive") - def update_source_metadata(self): - from googleapiclient.errors import HttpError - - try: - file_obj = ( - self.session_handle.service.files() - .get( - fileId=self.meta["id"], - fields="id, createdTime, modifiedTime, version, webContentLink", - ) - .execute() - ) - except HttpError as e: - if e.status_code == 404: - logger.error(f"File {self.meta['name']} not found") - self.source_metadata = SourceMetadata( - exists=True, - ) - return - raise - - date_created = None - if dc := file_obj.get("createdTime", ""): - date_created = datetime.strptime( - dc, - "%Y-%m-%dT%H:%M:%S.%fZ", - ).isoformat() - - date_modified = None - if dm := file_obj.get("modifiedTime", ""): - date_modified = datetime.strptime( - dm, - "%Y-%m-%dT%H:%M:%S.%fZ", - ).isoformat() - - self.source_metadata = SourceMetadata( - date_created=date_created, - date_modified=date_modified, - version=file_obj.get("version", ""), - source_url=file_obj.get("webContentLink", ""), - exists=True, - ) - - @SourceConnectionNetworkError.wrap - 
def _run_downloader(self, downloader: "MediaIoBaseDownload") -> bool: - downloaded = False - while downloaded is False: - _, downloaded = downloader.next_chunk() - return downloaded - - @requires_dependencies(["googleapiclient"], extras="google-drive") - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - from googleapiclient.http import MediaIoBaseDownload - - if self.meta.get("mimeType", "").startswith("application/vnd.google-apps"): - export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get( - self.meta.get("mimeType"), # type: ignore - ) - if not export_mime: - logger.info( - f"File not supported. Name: {self.meta.get('name')} " - f"ID: {self.meta.get('id')} " - f"MimeType: {self.meta.get('mimeType')}", - ) - return - - request = self.session_handle.service.files().export_media( - fileId=self.meta.get("id"), - mimeType=export_mime, - ) - else: - request = self.session_handle.service.files().get_media(fileId=self.meta.get("id")) - file = io.BytesIO() - downloader = MediaIoBaseDownload(file, request) - self.update_source_metadata() - downloaded = self._run_downloader(downloader=downloader) - - saved = False - if downloaded and file: - dir_ = Path(self.meta["download_dir"]) - if dir_: - if not dir_.is_dir(): - logger.debug(f"Creating directory: {self.meta.get('download_dir')}") - - if dir_: - dir_.mkdir(parents=True, exist_ok=True) - - with open(self.filename, "wb") as handler: - handler.write(file.getbuffer()) - saved = True - logger.debug(f"File downloaded: {self.filename}.") - if not saved: - logger.error(f"Error while downloading and saving file: {self.filename}.") - - def write_result(self): - """Write the structured json result for this doc. result must be json serializable.""" - if self.read_config.download_only: - return - self._output_filename.parent.mkdir(parents=True, exist_ok=True) - with open(self._output_filename, "w") as output_f: - output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2)) - logger.info(f"Wrote {self._output_filename}") - - -@dataclass -class GoogleDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Objects of this class support fetching documents from Google Drive""" - - connector_config: SimpleGoogleDriveConfig - - def _list_objects(self, drive_id, recursive=False): - files = [] - service = self.connector_config.create_session_handle().service - - def traverse(drive_id, download_dir, output_dir, recursive=False): - page_token = None - while True: - response = ( - service.files() - .list( - spaces="drive", - fields="nextPageToken, files(id, name, mimeType)", - pageToken=page_token, - corpora="user", - q=f"'{drive_id}' in parents", - ) - .execute() - ) - - for meta in response.get("files", []): - if meta.get("mimeType") == "application/vnd.google-apps.folder": - dir_ = DIRECTORY_FORMAT.format(name=meta.get("name"), id=meta.get("id")) - if recursive: - download_sub_dir = (download_dir / dir_).resolve() - output_sub_dir = (output_dir / dir_).resolve() - traverse(meta.get("id"), download_sub_dir, output_sub_dir, True) - else: - ext = "" - if not Path(meta.get("name")).suffixes: - guess = guess_extension(meta.get("mimeType")) - ext = guess if guess else ext - - if meta.get("mimeType", "").startswith("application/vnd.google-apps"): - export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(meta.get("mimeType")) - if not export_mime: - logger.info( - f"File {meta.get('name')} has an " - f"unsupported MimeType {meta.get('mimeType')}", - ) - continue - - if not ext: - guess = 
guess_extension(export_mime) - ext = guess if guess else ext - - # TODO (Habeeb): Consider filtering at the query level. - if ( - self.connector_config.extension - and self.connector_config.extension != ext - ): # noqa: SIM102 - logger.debug( - f"File {meta.get('name')} does not match " - f"the file type {self.connector_config.extension}", - ) - continue - - name = FILE_FORMAT.format(name=meta.get("name"), id=meta.get("id"), ext=ext) - meta["download_dir"] = str(download_dir) - meta["download_filepath"] = (download_dir / name).resolve().as_posix() - meta["output_dir"] = str(output_dir) - meta["output_filepath"] = (output_dir / name).resolve().as_posix() - files.append(meta) - - page_token = response.get("nextPageToken", None) - if page_token is None: - break - - traverse( - drive_id, - Path(self.read_config.download_dir), - Path(self.processor_config.output_dir), - recursive, - ) - return files - - def initialize(self): - pass - - def check_connection(self): - try: - self.connector_config.create_session_handle().service - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def get_ingest_docs(self): - files = self._list_objects(self.connector_config.drive_id, self.connector_config.recursive) - return [ - GoogleDriveIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - meta=file, - ) - for file in files - ] diff --git a/unstructured/ingest/connector/hubspot.py b/unstructured/ingest/connector/hubspot.py deleted file mode 100644 index 3f01f4e81..000000000 --- a/unstructured/ingest/connector/hubspot.py +++ /dev/null @@ -1,278 +0,0 @@ -import typing as t -from dataclasses import dataclass -from enum import Enum -from functools import reduce -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSessionHandle, - BaseSingleIngestDoc, - BaseSourceConnector, - ConfigSessionHandleMixin, - IngestDocCleanupMixin, - IngestDocSessionHandleMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from hubspot import HubSpot - -CONTENT_TAG = "content" - - -class HubSpotObjectTypes(Enum): - CALLS = "calls" - COMMUNICATIONS = "communications" - EMAILS = "emails" - NOTES = "notes" - PRODUCTS = "products" - TICKETS = "tickets" - - -@dataclass -class HubSpotSessionHandle(BaseSessionHandle): - service: "HubSpot" - - -@dataclass -class HubSpotAccessConfig(AccessConfig): - api_token: str = enhanced_field(repr=False, sensitive=True) - - -@dataclass -class SimpleHubSpotConfig(ConfigSessionHandleMixin, BaseConnectorConfig): - access_config: HubSpotAccessConfig - params: t.Optional[str] = None - properties: t.Optional[dict] = None - object_types: t.Optional[t.List[str]] = None - custom_properties: t.Optional[t.Dict[str, t.List[str]]] = None - - @requires_dependencies(["hubspot"], extras="hubspot") - def create_session_handle(self) -> HubSpotSessionHandle: - from hubspot import HubSpot - - service = HubSpot(access_token=self.access_config.api_token) - return HubSpotSessionHandle(service=service) - - -@dataclass -class HubSpotIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc): - 
connector_config: SimpleHubSpotConfig - object_id: str - object_type: str - content_properties: t.List[str] - registry_name: str = "hubspot" - - def __post_init__(self): - self._add_custom_properties() - - @property - def filename(self): - return ( - Path(self.read_config.download_dir) - / f"{self.object_type}/{self.object_id}.txt" # type: ignore - ).resolve() - - @property - def _output_filename(self): - return ( - Path(self.processor_config.output_dir) - / f"{self.object_type}/{self.object_id}.json" # type: ignore - ).resolve() - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - f"{self.registry_name}_id": self.object_id, - } - - @property - def version(self) -> t.Optional[str]: - return None - - @property - def source_url(self) -> t.Optional[str]: - return None - - def _add_custom_properties(self): - if (self.connector_config.custom_properties is not None) and ( - (cprops := self.connector_config.custom_properties.get(self.object_type)) is not None - ): - self.content_properties += cprops - - def _join_object_properties(self, obj) -> str: - return "\n".join( - [ - obj.properties[cprop] - for cprop in self.content_properties - if (obj.properties.get(cprop) is not None) - ], - ) - - def _resolve_getter(self): - method_path = "" - if self.object_type in [ - HubSpotObjectTypes.CALLS.value, - HubSpotObjectTypes.COMMUNICATIONS.value, - HubSpotObjectTypes.EMAILS.value, - HubSpotObjectTypes.NOTES.value, - ]: - method_path = f"crm.objects.{self.object_type}.basic_api.get_by_id" - if self.object_type in [ - HubSpotObjectTypes.PRODUCTS.value, - HubSpotObjectTypes.TICKETS.value, - ]: - method_path = f"crm.{self.object_type}.basic_api.get_by_id" - - method = reduce(getattr, method_path.split("."), self.session_handle.service) - return method - - @requires_dependencies(["hubspot"], extras="hubspot") - def _fetch_obj(self, check_only=False): - from hubspot.crm.objects.exceptions import NotFoundException - - get_by_id_method = self._resolve_getter() - try: - response = get_by_id_method( - self.object_id, - properties=([] if check_only else self.content_properties), - ) - except NotFoundException as e: - logger.error(e) - return None - return response - - def update_source_metadata(self, **kwargs) -> None: - obj = kwargs.get("object", self._fetch_obj(check_only=True)) # type: ignore - if obj is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - self.source_metadata = SourceMetadata( - date_created=obj.created_at.isoformat(), - date_modified=obj.updated_at.isoformat(), - exists=True, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - obj = self._fetch_obj() - if obj is None: - raise ValueError( - f"Failed to retrieve object {self.registry_name}", - f"with ID {self.object_id}", - ) - self.update_source_metadata(object=obj) - output = self._join_object_properties(obj) - self.filename.parent.mkdir(parents=True, exist_ok=True) - with open(self.filename, "w", encoding="utf8") as f: - f.write(output) - return - - -@dataclass -class HubSpotSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleHubSpotConfig - - def initialize(self): - self.hubspot = self.connector_config.create_session_handle().service - - def check_connection(self): - return self.connector_config.create_session_handle().service - - @requires_dependencies(["hubspot"], extras="hubspot") - def _list_objects(self, get_page_method, object_type: str, content_properties: t.List[str]): - try: - 
objects = get_page_method() - except Exception as e: - logger.error(e) - logger.error( - f"Failed to retrieve {object_type}, omitting processing...", - ) - return [] - return [ - HubSpotIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - object_id=obj.id, - object_type=object_type, - content_properties=content_properties, - ) - for obj in objects.results - ] - - def _get_calls(self) -> t.List[HubSpotIngestDoc]: - return self._list_objects( - self.hubspot.crm.objects.calls.basic_api.get_page, - HubSpotObjectTypes.CALLS.value, - ["hs_call_title", "hs_call_body"], - ) - - def _get_communications(self) -> t.List[HubSpotIngestDoc]: - return self._list_objects( - self.hubspot.crm.objects.communications.basic_api.get_page, - HubSpotObjectTypes.COMMUNICATIONS.value, - ["hs_communication_body"], - ) - - def _get_emails(self) -> t.List[HubSpotIngestDoc]: - return self._list_objects( - self.hubspot.crm.objects.emails.basic_api.get_page, - HubSpotObjectTypes.EMAILS.value, - ["hs_email_subject", "hs_email_text"], - ) - - def _get_notes(self) -> t.List[HubSpotIngestDoc]: - return self._list_objects( - self.hubspot.crm.objects.notes.basic_api.get_page, - HubSpotObjectTypes.NOTES.value, - ["hs_note_body"], - ) - - def _get_products(self) -> t.List[HubSpotIngestDoc]: - return self._list_objects( - self.hubspot.crm.products.basic_api.get_page, - HubSpotObjectTypes.PRODUCTS.value, - ["description"], - ) - - def _get_tickets(self) -> t.List[HubSpotIngestDoc]: - return self._list_objects( - self.hubspot.crm.tickets.basic_api.get_page, - HubSpotObjectTypes.TICKETS.value, - ["subject", "content"], - ) - - def get_ingest_docs(self): - obj_method_resolver = { - HubSpotObjectTypes.CALLS.value: self._get_calls, - HubSpotObjectTypes.COMMUNICATIONS.value: self._get_communications, - HubSpotObjectTypes.EMAILS.value: self._get_emails, - HubSpotObjectTypes.NOTES.value: self._get_notes, - HubSpotObjectTypes.PRODUCTS.value: self._get_products, - HubSpotObjectTypes.TICKETS.value: self._get_tickets, - } - - if self.connector_config.object_types is not None: - obj_method_resolver = { - obj_name: obj_method_resolver.get(obj_name) # type: ignore - for obj_name in self.connector_config.object_types - } - - ingest_docs: t.List[HubSpotIngestDoc] = [] - for obj_name, obj_method in obj_method_resolver.items(): - logger.info(f"Retrieving - {obj_name}") - results: t.List[HubSpotIngestDoc] = obj_method() # type: ignore - ingest_docs += results # type: ignore - - return ingest_docs diff --git a/unstructured/ingest/connector/jira.py b/unstructured/ingest/connector/jira.py deleted file mode 100644 index d29e1f2dc..000000000 --- a/unstructured/ingest/connector/jira.py +++ /dev/null @@ -1,469 +0,0 @@ -import math -import typing as t -from collections import abc -from dataclasses import dataclass, field -from datetime import datetime -from functools import cached_property -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSessionHandle, - BaseSingleIngestDoc, - BaseSourceConnector, - ConfigSessionHandleMixin, - IngestDocCleanupMixin, - IngestDocSessionHandleMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from 
atlassian import Jira - - -@dataclass -class JiraSessionHandle(BaseSessionHandle): - service: "Jira" - - -@requires_dependencies(["atlassian"], extras="jira") -def create_jira_object(url, user_email, api_token): - """ - Creates a jira object for interacting with Jira Cloud. - Args: - url: URL to Jira Cloud organization - user_email: Email for the user with the permissions - api_token: API Token, generated for the user - - Returns: - Jira object - """ - from atlassian import Jira - - jira = Jira( - url, - username=user_email, - password=api_token, - ) - - response = jira.get_permissions("BROWSE_PROJECTS") - permitted = response["permissions"]["BROWSE_PROJECTS"]["havePermission"] - - if permitted: - return jira - - else: - raise ValueError( - """The user with the provided *user_email* and the *api_token* - is not permitted to browse projects for the jira organization - for the provided *url*. Try checking user_email, api_token, - and the url arguments.""", - ) - - -@dataclass -class JiraAccessConfig(AccessConfig): - api_token: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleJiraConfig(ConfigSessionHandleMixin, BaseConnectorConfig): - """Connector config where: - user_email is the email to authenticate into Atlassian (Jira) Cloud, - api_token is the api token to authenticate into Atlassian (Jira) Cloud, - url is the URL pointing to the Atlassian (Jira) Cloud instance, - list_of_projects is a list of project that is aimed to be ingested. - - Check ... - for more info on the api_token. - """ - - user_email: str - access_config: JiraAccessConfig - url: str - projects: t.Optional[t.List[str]] = None - boards: t.Optional[t.List[str]] = None - issues: t.Optional[t.List[str]] = None - - def create_session_handle( - self, - ) -> JiraSessionHandle: - service = create_jira_object( - url=self.url, user_email=self.user_email, api_token=self.access_config.api_token - ) - return JiraSessionHandle(service=service) - - -@dataclass -class JiraFileMeta: - """Metadata specifying: - project_id: id for the jira project that the issue locates in, and - issue_key: key for the issue that is being reached to. - """ - - project_id: str - board_id: t.Optional[str] - issue_key: str - issue_id: str - - -# An implementation to obtain nested-defaultdict functionality. -# Keys have default values in a recursive manner, allowing -# limitless templates to parse an api response object. 
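For readers skimming this removed connector, the nested-defaultdict idea described in the comment above (and implemented just below by nested_object_to_field_getter and FieldGetter) boils down to: looking up a missing key yields an empty getter instead of raising KeyError, so templated field access over a Jira issue payload never fails on absent fields. A minimal standalone sketch of that behavior, where SafeFields and wrap are illustrative names rather than the connector's own:

from collections import abc

class SafeFields(dict):
    # Illustrative stand-in for the FieldGetter idea: an absent key returns an
    # empty SafeFields rather than raising KeyError.
    def __getitem__(self, key):
        value = super().__getitem__(key) if key in self else None
        return SafeFields({}) if value is None else value

def wrap(obj):
    # Recursively wrap nested mappings so every level tolerates missing keys.
    if isinstance(obj, abc.Mapping):
        return SafeFields({k: wrap(v) for k, v in obj.items()})
    return obj

fields = wrap({"assignee": {"displayName": "Ada"}})
print(fields["assignee"]["displayName"])  # prints: Ada
print(fields["reporter"]["displayName"])  # prints: {}  (empty getter, no KeyError)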
-def nested_object_to_field_getter(object): - if isinstance(object, abc.Mapping): - new_object = {} - for k, v in object.items(): - if isinstance(v, abc.Mapping): - new_object[k] = FieldGetter(nested_object_to_field_getter(v)) - else: - new_object[k] = v - return FieldGetter(new_object) - else: - return object - - -class FieldGetter(dict): - def __getitem__(self, key): - value = super().__getitem__(key) if key in self else None - if value is None: - value = FieldGetter({}) - return value - - -def form_templated_string(issue, parsed_fields, c_sep="|||", r_sep="\n\n\n"): - """Forms a template string via parsing the fields from the API response object on the issue - The template string will be saved to the disk, and then will be processed by partition.""" - return r_sep.join( - [ - _get_id_fields_for_issue(issue), - _get_project_fields_for_issue(parsed_fields), - _get_dropdown_fields_for_issue(parsed_fields), - _get_subtasks_for_issue(parsed_fields), - _get_comments_for_issue(parsed_fields), - _get_text_fields_for_issue(parsed_fields), - ], - ) - - -DEFAULT_C_SEP = " " * 5 -DEFAULT_R_SEP = "\n" - - -def _get_id_fields_for_issue(issue, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP): - id, key = issue["id"], issue["key"] - return f"IssueID_IssueKey:{id}{c_sep}{key}{r_sep}" - - -def _get_project_fields_for_issue(issue, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP): - if "project" in issue: - return ( - f"""ProjectID_Key:{issue["project"]["key"]}{c_sep}{issue["project"]["name"]}{r_sep}""" - ) - else: - return "" - - -def _get_dropdown_fields_for_issue(issue, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP): - return f""" - IssueType:{issue["issuetype"]["name"]} - {r_sep} - Status:{issue["status"]["name"]} - {r_sep} - Priority:{issue["priority"]} - {r_sep} - AssigneeID_Name:{issue["assignee"]["accountId"]}{c_sep}{issue["assignee"]["displayName"]} - {r_sep} - ReporterAdr_Name:{issue["reporter"]["emailAddress"]}{c_sep}{issue["reporter"]["displayName"]} - {r_sep} - Labels:{c_sep.join(issue["labels"])} - {r_sep} - Components:{c_sep.join([component["name"] for component in issue["components"]])} - {r_sep} - """ - - -def _get_subtasks_for_issue(issue): - return "" - - -def _get_text_fields_for_issue(issue, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP): - return f""" - {issue["summary"]} - {r_sep} - {issue["description"]} - {r_sep} - {c_sep.join([atch["self"] for atch in issue["attachment"]])} - {r_sep} - """ - - -def _get_comments_for_issue(issue, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP): - return c_sep.join( - [_get_fields_for_comment(comment) for comment in issue["comment"]["comments"]], - ) - - -def _get_fields_for_comment(comment, c_sep=DEFAULT_C_SEP, r_sep=DEFAULT_R_SEP): - return f"{comment['author']['displayName']}{c_sep}{comment['body']}{r_sep}" - - -def scroll_wrapper(func, results_key="results"): - def wrapper(*args, **kwargs): - """Wraps a function to obtain scroll functionality. 
- Function needs to be able to accept 'start' and 'limit' arguments.""" - if "number_of_items_to_fetch" in kwargs: - number_of_items_to_fetch = kwargs["number_of_items_to_fetch"] - del kwargs["number_of_items_to_fetch"] - else: - number_of_items_to_fetch = 100 - - kwargs["limit"] = min(100, number_of_items_to_fetch) - kwargs["start"] = kwargs.get("start", 0) - - all_results = [] - num_iterations = math.ceil(number_of_items_to_fetch / kwargs["limit"]) - - for _ in range(num_iterations): - response = func(*args, **kwargs) - if isinstance(response, list): - all_results += func(*args, **kwargs) - elif isinstance(response, dict): - if results_key not in response: - raise KeyError( - "Response object has no known keys to \ - access the results, such as 'results' or 'values'.", - ) - all_results += func(*args, **kwargs)[results_key] - kwargs["start"] += kwargs["limit"] - - return all_results[:number_of_items_to_fetch] - - return wrapper - - -@dataclass -class JiraIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing). - - Current implementation creates a Jira connection object - to fetch each doc, rather than creating a it for each thread. - """ - - connector_config: SimpleJiraConfig - file_meta: t.Optional[JiraFileMeta] = None - registry_name: str = "jira" - - @cached_property - def record_locator(self): # Values must be JSON-serializable - """A dictionary with any data necessary to uniquely identify the document on - the source system.""" - return { - "base_url": self.connector_config.url, - "issue_key": self.file_meta.issue_key, - } - - @cached_property - @SourceConnectionNetworkError.wrap - def issue(self): - """Gets issue data""" - jira = self.session_handle.service - return jira.issue(self.file_meta.issue_key) - - @cached_property - def parsed_fields(self): - return nested_object_to_field_getter(self.issue["fields"]) - - @property - def grouping_folder_name(self): - if self.file_meta.board_id: - return self.file_meta.board_id - else: - return self.file_meta.project_id - - @property - def filename(self): - download_file = f"{self.file_meta.issue_id}.txt" - - return ( - Path(self.read_config.download_dir) / self.grouping_folder_name / download_file - ).resolve() - - @property - def _output_filename(self): - """Create output file path.""" - output_file = f"{self.file_meta.issue_id}.json" - - return ( - Path(self.processor_config.output_dir) / self.grouping_folder_name / output_file - ).resolve() - - @property - def version(self) -> t.Optional[str]: - return None - - def update_source_metadata(self, **kwargs) -> None: - exists = bool(self.issue) - if not exists: - self.source_metadata = SourceMetadata( - exists=exists, - ) - return - - self.source_metadata = SourceMetadata( - date_created=datetime.strptime( - self.parsed_fields["created"], - "%Y-%m-%dT%H:%M:%S.%f%z", - ).isoformat(), - date_modified=datetime.strptime( - self.parsed_fields["updated"], - "%Y-%m-%dT%H:%M:%S.%f%z", - ).isoformat(), - source_url=f"{self.connector_config.url}/browse/{self.file_meta.issue_key}", - exists=exists, - ) - - @SourceConnectionError.wrap - @requires_dependencies(["atlassian"], extras="jira") - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - document = form_templated_string(self.issue, self.parsed_fields) - self.update_source_metadata() - self.filename.parent.mkdir(parents=True, exist_ok=True) - - with open(self.filename, "w", encoding="utf8") as f: - 
f.write(document) - - -@dataclass -class JiraSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Fetches issues from projects in an Atlassian (Jira) Cloud instance.""" - - connector_config: SimpleJiraConfig - _jira: t.Optional["Jira"] = field(init=False, default=None) - - @property - def jira(self) -> "Jira": - if self._jira is None: - try: - self._jira = self.connector_config.create_session_handle().service - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - return self._jira - - @requires_dependencies(["atlassian"], extras="jira") - def initialize(self): - _ = self.jira - - def check_connection(self): - _ = self.jira - - @requires_dependencies(["atlassian"], extras="jira") - def _get_all_project_ids(self): - """Fetches ids for all projects in a Jira domain.""" - project_ids = [project["key"] for project in self.jira.projects()] - return project_ids - - @requires_dependencies(["atlassian"], extras="jira") - def _get_issues_within_one_project( - self, - project_id: str, - ): - get_issues_with_scroll = scroll_wrapper(self.jira.get_all_project_issues) - results = get_issues_with_scroll(project=project_id, fields=["key"]) - - return [(issue["key"], issue["id"], None) for issue in results] - - @requires_dependencies(["atlassian"], extras="jira") - def _get_issue_keys_within_projects(self, project_ids: t.Optional[t.List[str]] = None): - if project_ids is None: - # for when a component list is provided, without any projects - if bool(self.connector_config.boards or self.connector_config.issues): - return [] - # for when no components are provided. all projects will be ingested - else: - return self._get_all_project_ids() - - # for when a component list is provided, including some projects - issue_keys_all = [self._get_issues_within_one_project(project_id=id) for id in project_ids] - - issue_keys_flattened = [ - (issue_key, issue_id, None) - for issue_keys_project in issue_keys_all - for issue_key, issue_id, board_id in issue_keys_project - ] - - return issue_keys_flattened - - def _get_issues_within_one_board(self, board_id: str): - get_issues_with_scroll = scroll_wrapper( - self.jira.get_issues_for_board, - results_key="issues", - ) - results = get_issues_with_scroll(board_id=board_id, fields=["key"], jql=None) - - return [(issue["key"], issue["id"], board_id) for issue in results] - - def _get_issue_keys_within_boards(self, board_ids): - if board_ids is None: - return [] - - issue_keys_all = [self._get_issues_within_one_board(board_id=id) for id in board_ids] - - issue_keys_flattened = [ - (issue_key, issue_id, board_id) - for issue_keys_board in issue_keys_all - for issue_key, issue_id, board_id in issue_keys_board - ] - return issue_keys_flattened - - def get_issues_info(self, issues): - issues_info = [self.jira.get_issue(issue, ["key", "id"]) for issue in issues] - return [(info["key"], info["id"], None) for info in issues_info] - - def get_issue_keys_for_given_components(self): - issues = [] - - if self.connector_config.projects: - issues += self._get_issue_keys_within_projects(self.connector_config.projects) - if self.connector_config.boards: - issues += self._get_issue_keys_within_boards(self.connector_config.boards) - if self.connector_config.issues: - issues += self.get_issues_info(self.connector_config.issues) - - return issues - - def get_ingest_docs(self): - """Fetches all issues in a project.""" - if bool( - self.connector_config.projects - or 
self.connector_config.boards - or self.connector_config.issues, - ): - issue_keys_and_ids = self.get_issue_keys_for_given_components() - else: - # gets all issue ids from all projects - issue_keys_and_ids = self._get_issue_keys_within_projects() - - return [ - JiraIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - file_meta=JiraFileMeta( - issue_id=issue_id, - issue_key=issue_key, - project_id=issue_key.split("-")[0], - board_id=board_id, - ), - ) - for issue_key, issue_id, board_id in issue_keys_and_ids - ] diff --git a/unstructured/ingest/connector/kafka.py b/unstructured/ingest/connector/kafka.py deleted file mode 100644 index 4510cf3d7..000000000 --- a/unstructured/ingest/connector/kafka.py +++ /dev/null @@ -1,294 +0,0 @@ -import base64 -import json -import socket -import typing as t -from dataclasses import dataclass -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - BaseIngestDoc, - BaseSingleIngestDoc, - BaseSourceConnector, - ConfigSessionHandleMixin, - IngestDocCleanupMixin, - IngestDocSessionHandleMixin, - SourceConnectorCleanupMixin, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from confluent_kafka import Consumer, Producer - - -@dataclass -class KafkaAccessConfig(AccessConfig): - kafka_api_key: t.Optional[str] = enhanced_field(sensitive=True) - secret: t.Optional[str] = enhanced_field(sensitive=True) - - -@dataclass -class SimpleKafkaConfig(ConfigSessionHandleMixin, BaseConnectorConfig): - bootstrap_server: str - port: str - topic: str - access_config: KafkaAccessConfig - confluent: t.Optional[bool] = True - num_messages_to_consume: t.Optional[int] = 1 - timeout: t.Optional[float] = 1.0 - - -@dataclass -class KafkaIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a message and writing processed results.""" - - connector_config: SimpleKafkaConfig - raw_content: str - raw_filename: str - registry_name: str = "kafka" - - def _tmp_download_file(self): - topic_file = self.connector_config.topic + "-" + self.raw_filename - return Path(self.read_config.download_dir) / topic_file - - @property - def version(self) -> t.Optional[str]: - return None - - @property - def source_url(self) -> t.Optional[str]: - return None - - @property - def filename(self): - """The filename of the file created""" - return self._tmp_download_file() - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @property - def _output_filename(self): - """Create filename document id combined with a hash of the query to uniquely identify - the output file.""" - output_file = self.connector_config.topic + ".json" - return Path(self.processor_config.output_dir) / output_file - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - self._create_full_tmp_dir_path() - - pdf_data = base64.b64decode(self.raw_content) - - with open(self.filename, "wb") as file: - file.write(pdf_data) - - -@dataclass -class KafkaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Source connector for 
Kafka. - Main job is to consume from a Kafka topic and create instances of - KakfaIngestDoc. - Note that messages have the format of: - : the name of the file (with correct file extension) - : base64 encoded (whether was binary or not) - """ - - connector_config: SimpleKafkaConfig - _consumer: t.Optional["Consumer"] = None - - def check_connection(self): - try: - self.kafka_consumer - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def initialize(self): - topic = self.connector_config.topic - logger.info(f"Subscribing to topic: {topic}") - self.kafka_consumer.subscribe([topic]) - - @property - def kafka_consumer(self): - if self._consumer is None: - self._consumer = self.create_consumer() - return self._consumer - - @requires_dependencies(["confluent_kafka"], extras="kafka") - def create_consumer(self) -> "Consumer": - from confluent_kafka import Consumer - - is_confluent = self.connector_config.confluent - bootstrap = self.connector_config.bootstrap_server - port = self.connector_config.port - - conf = { - "bootstrap.servers": f"{bootstrap}:{port}", - "client.id": socket.gethostname(), - "group.id": "your_group_id", - "enable.auto.commit": "false", - "auto.offset.reset": "earliest", - "message.max.bytes": 10485760, - } - - if is_confluent: - kafka_api_key = self.connector_config.access_config.kafka_api_key - secret = self.connector_config.access_config.secret - conf["sasl.mechanism"] = "PLAIN" - conf["security.protocol"] = "SASL_SSL" - conf["sasl.username"] = kafka_api_key - conf["sasl.password"] = secret - - consumer = Consumer(conf) - logger.debug(f"Kafka Consumer connected to bootstrap: {bootstrap}") - return consumer - - @SourceConnectionError.wrap - def get_ingest_docs(self): - from confluent_kafka import KafkaError - - consumer = self.kafka_consumer - running = True - - collected = [] - num_messages_to_consume = self.connector_config.num_messages_to_consume - logger.info(f"Config set for blocking on {num_messages_to_consume} messages") - # Consume specified number of messages - while running: - msg = consumer.poll(timeout=self.connector_config.timeout) - if msg is None: - logger.debug("No Kafka messages found") - continue - if msg.error(): - if msg.error().code() == KafkaError._PARTITION_EOF: - # End of partition event - logger.error( - "%% %s [%d] reached end at offset %d\n" - % (msg.topic(), msg.partition(), msg.offset()) - ) - else: - collected.append(json.loads(msg.value().decode("utf8"))) - if len(collected) >= num_messages_to_consume: - logger.debug(f"Found {len(collected)} messages, stopping") - consumer.commit(asynchronous=False) - break - - return [ - KafkaIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - raw_filename=msg["filename"], - raw_content=msg["content"], - ) - for msg in collected - ] - - -@dataclass -class KafkaWriteConfig(WriteConfig): - batch_size: int = 4 - - -@dataclass -class KafkaDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConnector): - """Connector to write BaseIngestDoc types to Kafka - Writes messages to Kafka in the format: - "type" - "text": - "filename": - """ - - write_config: KafkaWriteConfig - connector_config: SimpleKafkaConfig - _producer: t.Optional["Producer"] = None - - @property - def kafka_producer(self): - if self._producer is None: - self._producer = self.create_producer() - return self._producer - - def initialize(self): - 
pass - - @requires_dependencies(["confluent_kafka"], extras="kafka") - def create_producer(self) -> "Producer": - from confluent_kafka import Producer - - is_confluent = self.connector_config.confluent - bootstrap = self.connector_config.bootstrap_server - port = self.connector_config.port - - conf = { - "bootstrap.servers": f"{bootstrap}:{port}", - "client.id": socket.gethostname(), - } - - if is_confluent: - api_key = self.connector_config.access_config.kafka_api_key - secret = self.connector_config.access_config.secret - conf["sasl.mechanism"] = "PLAIN" - conf["security.protocol"] = "SASL_SSL" - conf["sasl.username"] = api_key - conf["sasl.password"] = secret - - producer = Producer(conf) - logger.debug(f"Connected to bootstrap: {bootstrap}") - return producer - - def check_connection(self): - try: - self.kafka_producer - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - @DestinationConnectionError.wrap - def upload_msg(self, batch) -> int: - logger.debug(f"Uploading batch: {batch}") - topic = self.connector_config.topic - producer = self.kafka_producer - uploaded = 0 - for i in range(len(batch)): - filename = f'{batch[i].pop("filename")}' - producer.produce(topic, key=filename, value=str(batch[i])) - uploaded += 1 - return uploaded - - @DestinationConnectionError.wrap - def write_dict(self, *args, dict_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info(f"Writing {len(dict_list)} documents to Kafka") - num_uploaded = 0 - - for chunk in batch_generator(dict_list, self.write_config.batch_size): - num_uploaded += self.upload_msg(chunk) # noqa: E203 - - producer = self.kafka_producer - producer.flush() - logger.info(f"Uploaded {num_uploaded} documents to Kafka") - - def write(self, docs: t.List[BaseIngestDoc]) -> None: - content_list: t.List[t.Dict[str, t.Any]] = [] - for doc in docs: - local_path = doc._output_filename - with open(local_path) as json_file: - dict_content = json.load(json_file) - for content in dict_content: - content_list.append( - { - "type": content["type"], - "text": content["text"], - "filename": content["metadata"]["filename"], - } - ) - self.write_dict(dict_list=content_list) diff --git a/unstructured/ingest/connector/local.py b/unstructured/ingest/connector/local.py deleted file mode 100644 index 417828606..000000000 --- a/unstructured/ingest/connector/local.py +++ /dev/null @@ -1,139 +0,0 @@ -import fnmatch -import glob -import os -import typing as t -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path - -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - SourceMetadata, -) -from unstructured.ingest.logger import logger - - -@dataclass -class SimpleLocalConfig(BaseConnectorConfig): - # Local specific options - input_path: str - recursive: bool = False - file_glob: t.Optional[t.List[str]] = None - - def __post_init__(self): - if os.path.isfile(self.input_path): - self.input_path_is_file = True - else: - self.input_path_is_file = False - - -@dataclass -class LocalIngestDoc(BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). 
- """ - - connector_config: SimpleLocalConfig - path: str - registry_name: str = "local" - - @property - def base_filename(self) -> t.Optional[str]: - download_path = Path(self.connector_config.input_path).resolve() - full_path = Path(self.filename).resolve() - if download_path.is_file(): - download_path = download_path.parent - relative_path = full_path.relative_to(download_path) - return str(relative_path) - - @property - def filename(self): - """The filename of the local file to be processed""" - return Path(self.path) - - def cleanup_file(self): - """Not applicable to local file system""" - - def get_file(self): - """Not applicable to local file system""" - - def update_source_metadata(self, **kwargs) -> None: - try: - out = os.lstat(self.path) - self._source_metadata = SourceMetadata( - exists=True, - date_created=str(datetime.fromtimestamp(out.st_ctime)), - date_modified=str(datetime.fromtimestamp(out.st_mtime)), - permissions_data=[{"mode": out.st_mode}], - source_url=self.path, - ) - except FileNotFoundError: - self._source_metadata = SourceMetadata(exists=False) - - @property - def _output_filename(self) -> Path: - """Returns output filename for the doc - If input path argument is a file itself, it returns the filename of the doc. - If input path argument is a folder, it returns the relative path of the doc. - """ - input_path = Path(self.connector_config.input_path) - basename = ( - f"{self.base_filename}.json" - if input_path.is_file() - else f"{Path(self.path).relative_to(input_path)}.json" - ) - return Path(self.processor_config.output_dir) / basename - - -@dataclass -class LocalSourceConnector(BaseSourceConnector): - """Objects of this class support fetching document(s) from local file system""" - - def check_connection(self): - pass - - connector_config: SimpleLocalConfig - - def __post_init__(self): - self.ingest_doc_cls: t.Type[LocalIngestDoc] = LocalIngestDoc - - def cleanup(self, cur_dir=None): - """Not applicable to local file system""" - - def initialize(self): - """Not applicable to local file system""" - - def _list_files(self): - if self.connector_config.input_path_is_file: - return glob.glob(f"{self.connector_config.input_path}") - elif self.connector_config.recursive: - return glob.glob( - f"{self.connector_config.input_path}/**", - recursive=self.connector_config.recursive, - ) - else: - return glob.glob(f"{self.connector_config.input_path}/*") - - def does_path_match_glob(self, path: str) -> bool: - if self.connector_config.file_glob is None: - return True - patterns = self.connector_config.file_glob - for pattern in patterns: - if fnmatch.filter([path], pattern): - return True - logger.debug(f"The file {path!r} is discarded as it does not match any given glob.") - return False - - def get_ingest_docs(self): - return [ - self.ingest_doc_cls( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - path=file, - ) - for file in self._list_files() - if os.path.isfile(file) and self.does_path_match_glob(file) - ] diff --git a/unstructured/ingest/connector/mongodb.py b/unstructured/ingest/connector/mongodb.py deleted file mode 100644 index ae73ecbec..000000000 --- a/unstructured/ingest/connector/mongodb.py +++ /dev/null @@ -1,284 +0,0 @@ -import copy -import typing as t -from dataclasses import dataclass, field -from pathlib import Path - -from unstructured.__version__ import __version__ as unstructured_version -from unstructured.ingest.enhanced_dataclass import enhanced_field -from 
unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError, WriteError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - BaseIngestDocBatch, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from pymongo import MongoClient - - -SERVER_API_VERSION = "1" - - -def parse_userinfo(userinfo: str) -> t.Tuple[str, str]: - user, _, passwd = userinfo.partition(":") - return user, passwd - - -@dataclass -class MongoDBAccessConfig(AccessConfig): - uri: t.Optional[str] = enhanced_field(sensitive=True, default=None) - - -@dataclass -class SimpleMongoDBConfig(BaseConnectorConfig): - access_config: MongoDBAccessConfig - host: t.Optional[str] = None - database: t.Optional[str] = None - collection: t.Optional[str] = None - port: int = 27017 - batch_size: int = 100 - - @requires_dependencies(["pymongo"], extras="mongodb") - def generate_client(self) -> "MongoClient": - from pymongo import MongoClient - from pymongo.driver_info import DriverInfo - from pymongo.server_api import ServerApi - - if self.access_config.uri: - return MongoClient( - self.access_config.uri, - server_api=ServerApi(version=SERVER_API_VERSION), - driver=DriverInfo(name="unstructured", version=unstructured_version), - ) - else: - return MongoClient( - host=self.host, - port=self.port, - server_api=ServerApi(version=SERVER_API_VERSION), - ) - - def get_collection(self, client): - database = client[self.database] - return database.get_collection(name=self.collection) - - -@dataclass -class MongoDBDocumentMeta: - collection: str - document_id: str - date_created: str - - -@dataclass -class MongoDBIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleMongoDBConfig - document_meta: MongoDBDocumentMeta - document: dict = field(default_factory=dict) - registry_name: str = "mongodb" - - @property - def filename(self): - return ( - Path(self.read_config.download_dir) - / self.connector_config.collection - / f"{self.document_meta.document_id}.txt" - ).resolve() - - @property - def _output_filename(self): - return ( - Path(self.processor_config.output_dir) - / self.connector_config.collection - / f"{self.document_meta.document_id}.json" - ) - - def update_source_metadata(self, **kwargs): - if self.document is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - self.source_metadata = SourceMetadata( - date_created=self.document_meta.date_created, - exists=True, - ) - - @SourceConnectionError.wrap - @requires_dependencies(["pymongo"], extras="mongodb") - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - pass - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "host": self.connector_config.host, - "collection": self.connector_config.collection, - "document_id": self.document_meta.document_id, - } - - -@dataclass -class MongoDBIngestDocBatch(BaseIngestDocBatch): - connector_config: SimpleMongoDBConfig - ingest_docs: t.List[MongoDBIngestDoc] = field(default_factory=list) - list_of_ids: t.List[str] = field(default_factory=list) - registry_name: str = "mongodb_batch" - - @property - def unique_id(self) -> str: - return ",".join(sorted(self.list_of_ids)) - 
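One detail worth calling out in this removed MongoDB connector: the source connector further down splits the collection's distinct _id values into batches using plain ceil division, (len(ids) + batch_size - 1) // batch_size slices, and each MongoDBIngestDocBatch then refetches its slice by ObjectId. A minimal standalone sketch of that chunking arithmetic, with chunk_ids as an illustrative helper name, not part of the connector:

import typing as t

def chunk_ids(ids: t.List[str], batch_size: int) -> t.List[t.List[str]]:
    # Ceil division: (len(ids) + batch_size - 1) // batch_size slices, each
    # holding at most batch_size ids, with the remainder in the last slice.
    num_batches = (len(ids) + batch_size - 1) // batch_size
    return [ids[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)]

assert chunk_ids(["a", "b", "c", "d", "e"], 2) == [["a", "b"], ["c", "d"], ["e"]]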
- @requires_dependencies(["pymongo"], extras="mongodb") - def _get_docs(self) -> t.List[dict]: - """Fetches all documents in a collection.""" - from bson.objectid import ObjectId - - # Note for future. Maybe this could use other client - client = self.connector_config.generate_client() - collection = self.connector_config.get_collection(client) - # MondoDB expects a list of ObjectIds - list_of_object_ids = [] - for x in self.list_of_ids: - list_of_object_ids.append(ObjectId(x)) - return list(collection.find({"_id": {"$in": list_of_object_ids}})) - - def get_files(self): - documents = self._get_docs() - for doc in documents: - ingest_doc = MongoDBIngestDoc( - processor_config=self.processor_config, - read_config=self.read_config, - connector_config=self.connector_config, - document_meta=MongoDBDocumentMeta( - collection=self.connector_config.collection, - document_id=str(doc.get("_id")), - date_created=doc.get("_id").generation_time.isoformat(), - ), - document=doc, - ) - ingest_doc.update_source_metadata() - del doc["_id"] - filename = ingest_doc.filename - flattened_dict = flatten_dict(dictionary=doc) - str_values = [str(value) for value in flattened_dict.values()] - concatenated_values = "\n".join(str_values) - - filename.parent.mkdir(parents=True, exist_ok=True) - with open(filename, "w", encoding="utf8") as f: - f.write(concatenated_values) - - self.ingest_docs.append(ingest_doc) - - -@dataclass -class MongoDBSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleMongoDBConfig - _client: t.Optional["MongoClient"] = field(init=False, default=None) - - @property - def client(self) -> "MongoClient": - if self._client is None: - self._client = self.connector_config.generate_client() - return self._client - - def check_connection(self): - try: - self.client.admin.command("ping") - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def initialize(self): - _ = self.client - - @requires_dependencies(["pymongo"], extras="mongodb") - def _get_doc_ids(self) -> t.List[str]: - """Fetches all document ids in a collection.""" - collection = self.connector_config.get_collection(self.client) - return [str(x) for x in collection.distinct("_id")] - - def get_ingest_docs(self): - """Fetches all documents in an index, using ids that are fetched with _get_doc_ids""" - ids = self._get_doc_ids() - id_batches = [ - ids[ - i - * self.connector_config.batch_size : (i + 1) # noqa - * self.connector_config.batch_size - ] - for i in range( - (len(ids) + self.connector_config.batch_size - 1) - // self.connector_config.batch_size - ) - ] - - return [ - MongoDBIngestDocBatch( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - list_of_ids=batched_ids, - ) - for batched_ids in id_batches - ] - - -@dataclass -class MongoDBDestinationConnector(BaseDestinationConnector): - connector_config: SimpleMongoDBConfig - _client: t.Optional["MongoClient"] = field(init=False, default=None) - - def to_dict(self, **kwargs): - """ - The _client variable in this dataclass breaks deepcopy due to: - TypeError: cannot pickle '_thread.lock' object - When serializing, remove it, meaning client data will need to be reinitialized - when deserialized - """ - self_cp = copy.copy(self) - if hasattr(self_cp, "_client"): - setattr(self_cp, "_client", None) - return _asdict(self_cp, **kwargs) - - @property - def client(self) -> 
"MongoClient": - if self._client is None: - self._client = self.connector_config.generate_client() - return self._client - - @requires_dependencies(["pymongo"], extras="mongodb") - def check_connection(self): - try: - self.client.admin.command("ping") - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def initialize(self): - _ = self.client - - @requires_dependencies(["pymongo"], extras="mongodb") - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info( - f"writing {len(elements_dict)} documents to destination " - f"database {self.connector_config.database}, " - f"at collection {self.connector_config.collection}", - ) - - collection = self.connector_config.get_collection(self.client) - try: - collection.insert_many(elements_dict) - except Exception as e: - logger.error(f"failed to write records: {e}", exc_info=True) - raise WriteError(f"failed to write records: {e}") diff --git a/unstructured/ingest/connector/notion/__init__.py b/unstructured/ingest/connector/notion/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/connector/notion/client.py b/unstructured/ingest/connector/notion/client.py deleted file mode 100644 index dfb9e8e48..000000000 --- a/unstructured/ingest/connector/notion/client.py +++ /dev/null @@ -1,233 +0,0 @@ -from typing import Any, Generator, List, Optional, Tuple - -import backoff -import httpx -import notion_client.errors -from notion_client import Client as NotionClient -from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint -from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint -from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint -from notion_client.api_endpoints import Endpoint -from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint -from notion_client.errors import RequestTimeoutError - -from unstructured.ingest.connector.notion.types.block import Block -from unstructured.ingest.connector.notion.types.database import Database -from unstructured.ingest.connector.notion.types.database_properties import ( - map_cells, -) -from unstructured.ingest.connector.notion.types.page import Page -from unstructured.ingest.ingest_backoff import RetryHandler -from unstructured.ingest.interfaces import RetryStrategyConfig - -retryable_exceptions = ( - httpx.TimeoutException, - httpx.HTTPStatusError, - notion_client.errors.HTTPResponseError, -) - - -def get_retry_handler(endpoint: Endpoint) -> Optional[RetryHandler]: - if retry_strategy_config := getattr(endpoint, "retry_strategy_config"): - return RetryHandler( - backoff.expo, - retryable_exceptions, - max_time=retry_strategy_config.max_retry_time, - max_tries=retry_strategy_config.max_retries, - logger=endpoint.parent.logger, - start_log_level=endpoint.parent.logger.level, - backoff_log_level=endpoint.parent.logger.level, - ) - return None - - -class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint): - def __init__( - self, - *args, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.retry_strategy_config = retry_strategy_config - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]: - resp: dict = ( - 
self.retry_handler(super().list, block_id=block_id, **kwargs) - if self.retry_handler - else super().list(block_id=block_id, **kwargs) - ) # type: ignore - child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])] - return child_blocks, resp - - def iterate_list( - self, - block_id: str, - **kwargs: Any, - ) -> Generator[List[Block], None, None]: - while True: - response: dict = ( - self.retry_handler(super().list, block_id=block_id, **kwargs) - if self.retry_handler - else super().list(block_id=block_id, **kwargs) - ) # type: ignore - child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])] - yield child_blocks - - next_cursor = response.get("next_cursor") - if not response.get("has_more") or not next_cursor: - return - - -class DatabasesEndpoint(NotionDatabasesEndpoint): - def __init__( - self, - *args, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.retry_strategy_config = retry_strategy_config - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def retrieve(self, database_id: str, **kwargs: Any) -> Database: - resp: dict = ( - self.retry_handler(super().retrieve, database_id=database_id, **kwargs) - if (self.retry_handler) - else (super().retrieve(database_id=database_id, **kwargs)) - ) # type: ignore - return Database.from_dict(data=resp) - - def retrieve_status(self, database_id: str, **kwargs) -> int: - request = self.parent._build_request( - method="HEAD", - path=f"databases/{database_id}", - auth=kwargs.get("auth"), - ) - try: - response: httpx.Response = ( - self.retry_handler(self.parent.client.send, request) - if (self.retry_handler) - else (self.parent.client.send(request)) - ) # type: ignore - return response.status_code - except httpx.TimeoutException: - raise RequestTimeoutError() - - def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]: - """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database. 
- - *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)* - """ # noqa: E501 - resp: dict = ( - self.retry_handler(super().query, database_id=database_id, **kwargs) - if (self.retry_handler) - else (super().query(database_id=database_id, **kwargs)) - ) # type: ignore - pages = [Page.from_dict(data=p) for p in resp.pop("results")] - for p in pages: - p.properties = map_cells(p.properties) - return pages, resp - - def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]: - while True: - response: dict = ( - self.retry_handler(super().query, database_id=database_id, **kwargs) - if (self.retry_handler) - else (super().query(database_id=database_id, **kwargs)) - ) # type: ignore - pages = [Page.from_dict(data=p) for p in response.pop("results", [])] - for p in pages: - p.properties = map_cells(p.properties) - yield pages - - next_cursor = response.get("next_cursor") - if not response.get("has_more") or not next_cursor: - return - - -class BlocksEndpoint(NotionBlocksEndpoint): - def __init__( - self, - *args: Any, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs: Any, - ) -> None: - super().__init__(*args, **kwargs) - self.retry_strategy_config = retry_strategy_config - self.children = BlocksChildrenEndpoint( - retry_strategy_config=retry_strategy_config, - *args, - **kwargs, - ) - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def retrieve(self, block_id: str, **kwargs: Any) -> Block: - resp: dict = ( - self.retry_handler(super().retrieve, block_id=block_id, **kwargs) - if (self.retry_handler) - else (super().retrieve(block_id=block_id, **kwargs)) - ) # type: ignore - return Block.from_dict(data=resp) - - -class PagesEndpoint(NotionPagesEndpoint): - def __init__( - self, - *args, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs, - ): - super().__init__(*args, **kwargs) - self.retry_strategy_config = retry_strategy_config - - @property - def retry_handler(self) -> Optional[RetryHandler]: - return get_retry_handler(self) - - def retrieve(self, page_id: str, **kwargs: Any) -> Page: - resp: dict = ( - self.retry_handler(super().retrieve, page_id=page_id, **kwargs) - if (self.retry_handler) - else (super().retrieve(page_id=page_id, **kwargs)) - ) # type: ignore - return Page.from_dict(data=resp) - - def retrieve_status(self, page_id: str, **kwargs) -> int: - request = self.parent._build_request( - method="HEAD", - path=f"pages/{page_id}", - auth=kwargs.get("auth"), - ) - try: - response: httpx.Response = ( - self.retry_handler(self.parent.client.send, request) - if (self.retry_handler) - else (self.parent.client.send(request)) - ) # type: ignore - return response.status_code - except httpx.TimeoutException: - raise RequestTimeoutError() - - -class Client(NotionClient): - def __init__( - self, - *args: Any, - retry_strategy_config: Optional[RetryStrategyConfig] = None, - **kwargs: Any, - ) -> None: - super().__init__(*args, **kwargs) - self.blocks = BlocksEndpoint(retry_strategy_config=retry_strategy_config, parent=self) - self.pages = PagesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) - self.databases = DatabasesEndpoint(retry_strategy_config=retry_strategy_config, parent=self) diff --git a/unstructured/ingest/connector/notion/connector.py b/unstructured/ingest/connector/notion/connector.py deleted file mode 100644 index c9588cc47..000000000 --- a/unstructured/ingest/connector/notion/connector.py +++ 
/dev/null @@ -1,468 +0,0 @@ -import typing as t -from dataclasses import dataclass, field -from pathlib import Path -from uuid import UUID - -import httpx - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - RetryStrategyConfig, - SourceConnectorCleanupMixin, -) -from unstructured.ingest.logger import logger -from unstructured.utils import ( - requires_dependencies, -) - -NOTION_API_VERSION = "2022-06-28" -if t.TYPE_CHECKING: - from unstructured.ingest.connector.notion.client import Client as NotionClient - - -@dataclass -class NotionAccessConfig(AccessConfig): - notion_api_key: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleNotionConfig(BaseConnectorConfig): - """Connector config to process all messages by channel id's.""" - - access_config: NotionAccessConfig - page_ids: t.Optional[t.List[str]] = None - database_ids: t.Optional[t.List[str]] = None - recursive: bool = False - - def __post_init__(self): - if self.page_ids: - self.page_ids = [str(UUID(p.strip())) for p in self.page_ids] - - if self.database_ids: - self.database_ids = [str(UUID(d.strip())) for d in self.database_ids] - - -@dataclass -class NotionPageIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. - """ - - page_id: str - connector_config: SimpleNotionConfig - registry_name: str = "notion_page" - retry_strategy_config: t.Optional[RetryStrategyConfig] = None - - def _tmp_download_file(self): - page_file = self.page_id + ".html" - return Path(self.read_config.download_dir) / page_file - - @property - def _output_filename(self): - page_file = self.page_id + ".json" - return Path(self.processor_config.output_dir) / page_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_client(self): - from unstructured.ingest.connector.notion.client import Client as NotionClient - - # Pin the version of the api to avoid schema changes - return NotionClient( - notion_version=NOTION_API_VERSION, - auth=self.connector_config.access_config.notion_api_key, - logger=logger, - log_level=logger.level, - retry_strategy_config=self.retry_strategy_config, - ) - - @BaseSingleIngestDoc.skip_if_file_exists - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_file(self): - from notion_client import APIErrorCode, APIResponseError - - from unstructured.ingest.connector.notion.helpers import extract_page_html - - self._create_full_tmp_dir_path() - - client = self.get_client() - - try: - text_extraction = extract_page_html( - client=client, - page_id=self.page_id, - logger=logger, - ) - self.check_exists = True - self.file_exists = True - if html := text_extraction.html: - with open(self._tmp_download_file(), "w") as page_file: - page_file.write(html.render(pretty=True)) - - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - logger.error(f"Error: {error}") - - 
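get_file above delegates to extract_page_html (shown later in this diff), which walks a page's child blocks through the client's iterate_list generator; like the databases endpoint's iterate_query, it relies on Notion's standard has_more / next_cursor pagination. A minimal standalone sketch of that convention, where paginate and fetch_page are illustrative names and fetch_page stands in for any Notion list/query call returning results, has_more, and next_cursor:

import typing as t

def paginate(fetch_page: t.Callable[..., dict], **kwargs) -> t.Iterator[t.List[dict]]:
    # Notion-style cursor pagination: request a page, yield its results, then
    # continue from next_cursor until has_more is false or no cursor is returned.
    while True:
        response = fetch_page(**kwargs)
        yield response.get("results", [])
        next_cursor = response.get("next_cursor")
        if not response.get("has_more") or not next_cursor:
            return
        kwargs["start_cursor"] = next_cursor

# Usage sketch (assuming a stock notion_client.Client instance named client):
#   for blocks in paginate(client.blocks.children.list, block_id=page_id):
#       ...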
@requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_file_metadata(self): - from notion_client import APIErrorCode, APIResponseError - - client = self.get_client() - - # The Notion block endpoint gives more hierarchical information (parent,child relationships) - # than the pages endpoint so choosing to use that one to get metadata about the page - try: - self.file_metadata = client.pages.retrieve(page_id=self.page_id) # type: ignore - self.check_exists = True - self.file_exists = True - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - logger.error(f"Error: {error}") - - @property - def date_created(self) -> t.Optional[str]: - """The date the document was created on the source system.""" - if not hasattr(self, "file_metadata") or not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.created_time if self.file_metadata else None - - @property - def date_modified(self) -> t.Optional[str]: - """The date the document was last modified on the source system.""" - if not hasattr(self, "file_metadata") or not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.last_edited_time if self.file_metadata else None - - @property - def exists(self) -> t.Optional[bool]: - """Whether the document exists on the remote source.""" - if self.check_exists: - return self.file_exists - - self.get_file_metadata() - - return self.file_exists - - @property - def filename(self): - """The filename of the file created from a notion page""" - return self._tmp_download_file() - - -@dataclass -class NotionDatabaseIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. 
- """ - - database_id: str - connector_config: SimpleNotionConfig - retry_strategy_config: t.Optional[RetryStrategyConfig] = None - registry_name: str = "notion_database" - - def _tmp_download_file(self): - page_file = self.database_id + ".html" - return Path(self.read_config.download_dir) / page_file - - @property - def _output_filename(self): - page_file = self.database_id + ".json" - return Path(self.processor_config.output_dir) / page_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_client(self): - from unstructured.ingest.connector.notion.client import Client as NotionClient - - # Pin the version of the api to avoid schema changes - return NotionClient( - notion_version=NOTION_API_VERSION, - auth=self.connector_config.access_config.notion_api_key, - logger=logger, - log_level=logger.level, - retry_strategy_config=self.retry_strategy_config, - ) - - @BaseSingleIngestDoc.skip_if_file_exists - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_file(self): - from notion_client import APIErrorCode, APIResponseError - - from unstructured.ingest.connector.notion.helpers import extract_database_html - - self._create_full_tmp_dir_path() - - client = self.get_client() - - try: - text_extraction = extract_database_html( - client=client, - database_id=self.database_id, - logger=logger, - ) - self.check_exists = True - self.file_exists = True - if html := text_extraction.html: - with open(self._tmp_download_file(), "w") as page_file: - page_file.write(html.render(pretty=True)) - - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - logger.error(f"Error: {error}") - - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_file_metadata(self): - from notion_client import APIErrorCode, APIResponseError - - client = self.get_client() - - # The Notion block endpoint gives more hierarchical information (parent,child relationships) - # than the pages endpoint so choosing to use that one to get metadata about the page - try: - self.file_metadata = client.databases.retrieve( - database_id=self.database_id, - ) # type: ignore - self.check_exists = True - self.file_exists = True - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - logger.error(f"Error: {error}") - - @property - def date_created(self) -> t.Optional[str]: - """The date the document was created on the source system.""" - if not hasattr(self, "file_metadata") or not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.created_time if self.file_metadata else None - - @property - def date_modified(self) -> t.Optional[str]: - """The date the document was last modified on the source system.""" - if not hasattr(self, "file_metadata") or not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.last_edited_time if self.file_metadata else None - - @property - def exists(self) -> t.Optional[bool]: - """Whether the document exists on the remote source.""" - if self.check_exists: - return self.file_exists - - self.get_file_metadata() - - return self.file_exists - - @property - def filename(self): - """The filename of the file created from a notion page""" - return self._tmp_download_file() - - -@dataclass -class 
NotionSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Objects of this class support fetching document(s) from""" - - connector_config: SimpleNotionConfig - retry_strategy_config: t.Optional[RetryStrategyConfig] = None - _client: t.Optional["NotionClient"] = field(init=False, default=None) - - @property - def client(self) -> "NotionClient": - if self._client is None: - self._client = self.create_client() - return self._client - - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def create_client(self) -> "NotionClient": - from unstructured.ingest.connector.notion.client import Client as NotionClient - - return NotionClient( - notion_version=NOTION_API_VERSION, - auth=self.connector_config.access_config.notion_api_key, - logger=logger, - log_level=logger.level, - retry_strategy_config=self.retry_strategy_config, - ) - - def check_connection(self): - try: - request = self.client._build_request("HEAD", "users") - response = self.client.client.send(request) - response.raise_for_status() - except httpx.HTTPStatusError as http_error: - logger.error(f"failed to validate connection: {http_error}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {http_error}") - - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def initialize(self): - """Verify that can get metadata for an object, validates connections info.""" - _ = self.client - - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_child_page_content(self, page_id: str): - from unstructured.ingest.connector.notion.helpers import ( - get_recursive_content_from_page, - ) - - # sanity check that database id is valid - resp_code = self.client.pages.retrieve_status(page_id=page_id) - if resp_code != 200: - raise ValueError( - f"page associated with page id could not be found: {page_id}", - ) - - child_content = get_recursive_content_from_page( - client=self.client, - page_id=page_id, - logger=logger, - ) - return child_content - - def get_child_content(self, page_id: str): - from unstructured.ingest.connector.notion.helpers import ( - get_recursive_content_from_page, - ) - - child_content = get_recursive_content_from_page( - client=self.client, - page_id=page_id, - logger=logger, - ) - return child_content - - @requires_dependencies(dependencies=["notion_client"], extras="notion") - def get_child_database_content(self, database_id: str): - from unstructured.ingest.connector.notion.helpers import ( - get_recursive_content_from_database, - ) - - # sanity check that database id is valid - resp_code = self.client.databases.retrieve_status(database_id=database_id) - if resp_code != 200: - raise ValueError( - f"database associated with database id could not be found: {database_id}", - ) - - child_content = get_recursive_content_from_database( - client=self.client, - database_id=database_id, - logger=logger, - ) - return child_content - - def get_ingest_docs(self): - docs: t.List[BaseSingleIngestDoc] = [] - if self.connector_config.page_ids: - docs += [ - NotionPageIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - retry_strategy_config=self.retry_strategy_config, - read_config=self.read_config, - page_id=page_id, - ) - for page_id in self.connector_config.page_ids - ] - if self.connector_config.database_ids: - docs += [ - NotionDatabaseIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - retry_strategy_config=self.retry_strategy_config, 
- read_config=self.read_config, - database_id=database_id, - ) - for database_id in self.connector_config.database_ids - ] - if self.connector_config.recursive: - logger.info("Getting recursive content") - child_pages = [] - child_databases = [] - if self.connector_config.page_ids: - for page_id in self.connector_config.page_ids: - child_content = self.get_child_page_content(page_id=page_id) - child_pages.extend(child_content.child_pages) - child_databases.extend(child_content.child_databases) - - if self.connector_config.database_ids: - for database_id in self.connector_config.database_ids: - child_content = self.get_child_database_content(database_id=database_id) - child_pages.extend(child_content.child_pages) - child_databases.extend(child_content.child_databases) - - # Remove duplicates - child_pages = list(set(child_pages)) - if self.connector_config.page_ids: - child_pages = [c for c in child_pages if c not in self.connector_config.page_ids] - - child_databases = list(set(child_databases)) - if self.connector_config.database_ids: - child_databases = [ - db for db in child_databases if db not in self.connector_config.database_ids - ] - - if child_pages: - logger.info( - "Adding the following child page ids: {}".format(", ".join(child_pages)), - ) - docs += [ - NotionPageIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - retry_strategy_config=self.retry_strategy_config, - read_config=self.read_config, - page_id=page_id, - ) - for page_id in child_pages - ] - - if child_databases: - logger.info( - "Adding the following child database ids: {}".format( - ", ".join(child_databases), - ), - ) - docs += [ - NotionDatabaseIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - retry_strategy_config=self.retry_strategy_config, - read_config=self.read_config, - database_id=database_id, - ) - for database_id in child_databases - ] - - return docs diff --git a/unstructured/ingest/connector/notion/helpers.py b/unstructured/ingest/connector/notion/helpers.py deleted file mode 100644 index a09fa083b..000000000 --- a/unstructured/ingest/connector/notion/helpers.py +++ /dev/null @@ -1,584 +0,0 @@ -import enum -import logging -from dataclasses import dataclass, field -from typing import List, Optional, Tuple -from urllib.parse import urlparse -from uuid import UUID - -from htmlBuilder.attributes import Style, Type -from htmlBuilder.tags import ( - Body, - Div, - Head, - Html, - HtmlTag, - Ol, - Table, - Td, - Th, - Title, - Tr, - Ul, -) -from notion_client.errors import APIResponseError - -import unstructured.ingest.connector.notion.types.blocks as notion_blocks -from unstructured.ingest.connector.notion.client import Client -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.block import Block -from unstructured.ingest.connector.notion.types.database import Database - - -@dataclass -class TextExtractionResponse: - text: Optional[str] = None - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -@dataclass -class HtmlExtractionResponse: - html: Optional[HtmlTag] = None - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -def extract_page_html( - client: Client, - page_id: str, - logger: logging.Logger, -) -> HtmlExtractionResponse: - page_id_uuid = UUID(page_id) - html_elements: List[Tuple[BlockBase, HtmlTag]] = [] - 
parent_block: Block = client.blocks.retrieve(block_id=page_id) # type: ignore - head = None - if isinstance(parent_block.block, notion_blocks.ChildPage): - head = Head([], Title([], parent_block.block.title)) - child_pages: List[str] = [] - child_databases: List[str] = [] - parents: List[Tuple[int, Block]] = [(0, parent_block)] - processed_block_ids = [] - while len(parents) > 0: - level, parent = parents.pop(0) - parent_html = parent.get_html() - if parent_html: - html_elements.append((parent.block, parent_html)) - logger.debug(f"processing block: {parent}") - if isinstance(parent.block, notion_blocks.ChildPage) and parent.id != str(page_id_uuid): - child_pages.append(parent.id) - continue - if isinstance(parent.block, notion_blocks.ChildDatabase): - child_databases.append(parent.id) - continue - if isinstance(parent.block, notion_blocks.Table): - table_response = build_table(client=client, table=parent) - html_elements.append((parent.block, table_response.table_html)) - child_pages.extend(table_response.child_pages) - child_databases.extend(table_response.child_databases) - continue - if isinstance(parent.block, notion_blocks.ColumnList): - column_html = build_columned_list(client=client, column_parent=parent) - html_elements.append((parent.block, column_html)) - continue - if isinstance(parent.block, notion_blocks.BulletedListItem): - bullet_list_resp = build_bulleted_list_children( - client=client, - bulleted_list_item_parent=parent, - ) - if bullet_list_children := bullet_list_resp.child_list: - html_elements.append((parent.block, bullet_list_children)) - continue - if isinstance(parent.block, notion_blocks.NumberedListItem): - numbered_list_resp = build_numbered_list_children( - client=client, - numbered_list_item_parent=parent, - ) - if numbered_list_children := numbered_list_resp.child_list: - html_elements.append((parent.block, numbered_list_children)) - continue - if parent.block.can_have_children() and parent.has_children: - children = [] - for children_block in client.blocks.children.iterate_list( # type: ignore - block_id=parent.id, - ): - children.extend(children_block) - if children: - logger.debug(f"Adding {len(children)} children from parent: {parent}") - for child in children: - if child.id not in processed_block_ids: - parents.append((level + 1, child)) - processed_block_ids.append(parent) - - # Join list items - joined_html_elements = [] - numbered_list_items = [] - bullet_list_items = [] - for block, html in html_elements: - if isinstance(block, notion_blocks.BulletedListItem): - bullet_list_items.append(html) - continue - if isinstance(block, notion_blocks.NumberedListItem): - numbered_list_items.append(html) - continue - if len(numbered_list_items) > 0: - joined_html_elements.append(Ol([], numbered_list_items)) - numbered_list_items = [] - if len(bullet_list_items) > 0: - joined_html_elements.append(Ul([], bullet_list_items)) - bullet_list_items = [] - joined_html_elements.append(html) - - body = Body([], joined_html_elements) - all_elements = [body] - if head: - all_elements = [head] + all_elements - full_html = Html([], all_elements) - return HtmlExtractionResponse( - full_html, - child_pages=child_pages, - child_databases=child_databases, - ) - - -def extract_database_html( - client: Client, - database_id: str, - logger: logging.Logger, -) -> HtmlExtractionResponse: - logger.debug(f"processing database id: {database_id}") - database: Database = client.databases.retrieve(database_id=database_id) # type: ignore - property_keys = list(database.properties.keys()) - 
property_keys = sorted(property_keys) - table_html_rows = [] - child_pages: List[str] = [] - child_databases: List[str] = [] - # Create header row - table_html_rows.append(Tr([], [Th([], k) for k in property_keys])) - - all_pages = [] - for page_chunk in client.databases.iterate_query(database_id=database_id): # type: ignore - all_pages.extend(page_chunk) - - logger.debug(f"Creating {len(all_pages)} rows") - for page in all_pages: - if is_database_url(client=client, url=page.url): - child_databases.append(page.id) - if is_page_url(client=client, url=page.url): - child_pages.append(page.id) - properties = page.properties - inner_html = [properties.get(k).get_html() for k in property_keys] # type: ignore - table_html_rows.append( - Tr( - [], - [Td([], cell) for cell in [html if html else Div([], []) for html in inner_html]], - ), - ) - - table_html = Table([], table_html_rows) - - return HtmlExtractionResponse( - html=table_html, - child_pages=child_pages, - child_databases=child_databases, - ) - - -@dataclass -class ChildExtractionResponse: - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -class QueueEntryType(enum.Enum): - DATABASE = "database" - PAGE = "page" - - -@dataclass -class QueueEntry: - type: QueueEntryType - id: UUID - - -def get_recursive_content_from_page( - client: Client, - page_id: str, - logger: logging.Logger, -) -> ChildExtractionResponse: - return get_recursive_content( - client=client, - init_entry=QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id)), - logger=logger, - ) - - -def get_recursive_content_from_database( - client: Client, - database_id: str, - logger: logging.Logger, -) -> ChildExtractionResponse: - return get_recursive_content( - client=client, - init_entry=QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)), - logger=logger, - ) - - -def get_recursive_content( - client: Client, - init_entry: QueueEntry, - logger: logging.Logger, -) -> ChildExtractionResponse: - parents: List[QueueEntry] = [init_entry] - child_pages: List[str] = [] - child_dbs: List[str] = [] - processed: List[str] = [] - while len(parents) > 0: - parent: QueueEntry = parents.pop() - processed.append(str(parent.id)) - if parent.type == QueueEntryType.PAGE: - logger.debug(f"Getting child data from page: {parent.id}") - page_children = [] - try: - for children_block in client.blocks.children.iterate_list( # type: ignore - block_id=str(parent.id), - ): - page_children.extend(children_block) - except APIResponseError as api_error: - logger.error(f"failed to get page with id {parent.id}: {api_error}") - if str(parent.id) in child_pages: - child_pages.remove(str(parent.id)) - continue - if not page_children: - continue - - # Extract child pages - child_pages_from_page = [ - c for c in page_children if isinstance(c.block, notion_blocks.ChildPage) - ] - if child_pages_from_page: - child_page_blocks: List[notion_blocks.ChildPage] = [ - p.block - for p in child_pages_from_page - if isinstance(p.block, notion_blocks.ChildPage) - ] - logger.debug( - "found child pages from parent page {}: {}".format( - parent.id, - ", ".join([block.title for block in child_page_blocks]), - ), - ) - new_pages = [p.id for p in child_pages_from_page if p.id not in processed] - new_pages = list(set(new_pages)) - child_pages.extend(new_pages) - parents.extend( - [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], - ) - - # Extract child databases - child_dbs_from_page = [ - c for c in page_children if isinstance(c.block, 
notion_blocks.ChildDatabase) - ] - if child_dbs_from_page: - child_db_blocks: List[notion_blocks.ChildDatabase] = [ - c.block - for c in page_children - if isinstance(c.block, notion_blocks.ChildDatabase) - ] - logger.debug( - "found child database from parent page {}: {}".format( - parent.id, - ", ".join([block.title for block in child_db_blocks]), - ), - ) - new_dbs = [db.id for db in child_dbs_from_page if db.id not in processed] - new_dbs = list(set(new_dbs)) - child_dbs.extend(new_dbs) - parents.extend( - [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], - ) - - linked_to_others: List[notion_blocks.LinkToPage] = [ - c.block for c in page_children if isinstance(c.block, notion_blocks.LinkToPage) - ] - for link in linked_to_others: - if (page_id := link.page_id) and ( - page_id not in processed and page_id not in child_pages - ): - child_pages.append(page_id) - parents.append(QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id))) - if (database_id := link.database_id) and ( - database_id not in processed and database_id not in child_dbs - ): - child_dbs.append(database_id) - parents.append( - QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)), - ) - - elif parent.type == QueueEntryType.DATABASE: - logger.debug(f"Getting child data from database: {parent.id}") - database_pages = [] - try: - for page_entries in client.databases.iterate_query( # type: ignore - database_id=str(parent.id), - ): - database_pages.extend(page_entries) - except APIResponseError as api_error: - logger.error(f"failed to get database with id {parent.id}: {api_error}") - if str(parent.id) in child_dbs: - child_dbs.remove(str(parent.id)) - continue - if not database_pages: - continue - - child_pages_from_db = [ - p for p in database_pages if is_page_url(client=client, url=p.url) - ] - if child_pages_from_db: - logger.debug( - "found child pages from parent database {}: {}".format( - parent.id, - ", ".join([p.url for p in child_pages_from_db]), - ), - ) - new_pages = [p.id for p in child_pages_from_db if p.id not in processed] - child_pages.extend(new_pages) - parents.extend( - [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], - ) - - child_dbs_from_db = [ - p for p in database_pages if is_database_url(client=client, url=p.url) - ] - if child_dbs_from_db: - logger.debug( - "found child database from parent database {}: {}".format( - parent.id, - ", ".join([db.url for db in child_dbs_from_db]), - ), - ) - new_dbs = [db.id for db in child_dbs_from_db if db.id not in processed] - child_dbs.extend(new_dbs) - parents.extend( - [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], - ) - - return ChildExtractionResponse( - child_pages=child_pages, - child_databases=child_dbs, - ) - - -def is_valid_uuid(uuid_str: str) -> bool: - try: - UUID(uuid_str) - return True - except Exception: - return False - - -def get_uuid_from_url(path: str) -> Optional[str]: - strings = path.split("-") - if len(strings) > 0 and is_valid_uuid(strings[-1]): - return strings[-1] - return None - - -def is_page_url(client: Client, url: str): - parsed_url = urlparse(url) - path = parsed_url.path.split("/")[-1] - if parsed_url.netloc != "www.notion.so": - return False - page_uuid = get_uuid_from_url(path=path) - if not page_uuid: - return False - check_resp = client.pages.retrieve_status(page_id=page_uuid) - return check_resp == 200 - - -def is_database_url(client: Client, url: str): - parsed_url = urlparse(url) - path = parsed_url.path.split("/")[-1] - if parsed_url.netloc != 
"www.notion.so": - return False - database_uuid = get_uuid_from_url(path=path) - if not database_uuid: - return False - check_resp = client.databases.retrieve_status(database_id=database_uuid) - return check_resp == 200 - - -@dataclass -class BuildTableResponse: - table_html: HtmlTag - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -def build_table(client: Client, table: Block) -> BuildTableResponse: - if not isinstance(table.block, notion_blocks.Table): - raise ValueError(f"block type not table: {type(table.block)}") - rows: List[notion_blocks.TableRow] = [] - child_pages: List[str] = [] - child_databases: List[str] = [] - for row_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=table.id, - ): - rows.extend( - [row.block for row in row_chunk if isinstance(row.block, notion_blocks.TableRow)], - ) - - # Extract child databases and pages - for row in rows: - for c in row.cells: - for rt in c.rich_texts: - if mention := rt.mention: - if mention.type == "page" and (page := mention.page): - child_pages.append(page.id) - if mention.type == "database" and (database := mention.database): - child_databases.append(database.id) - - header: Optional[notion_blocks.TableRow] = None - if table.block.has_column_header: - header = rows.pop(0) - table_html_rows = [] - if header: - header.is_header = True - table_html_rows.append(header.get_html()) - table_html_rows.extend([row.get_html() for row in rows]) - html_table = Table([], table_html_rows) - - return BuildTableResponse( - table_html=html_table, - child_pages=child_pages, - child_databases=child_databases, - ) - - -def build_columned_list(client: Client, column_parent: Block) -> HtmlTag: - if not isinstance(column_parent.block, notion_blocks.ColumnList): - raise ValueError(f"block type not column list: {type(column_parent.block)}") - columns: List[Block] = [] - for column_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=column_parent.id, - ): - columns.extend(column_chunk) - num_columns = len(columns) - columns_content = [] - for column in columns: - for column_content_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=column.id, - ): - columns_content.append( - Div( - [Style(f"width:{100/num_columns}%; float: left")], - [content.block.get_html() for content in column_content_chunk], - ), - ) - - return Div([], columns_content) - - -@dataclass -class BulletedListResponse: - html: HtmlTag - child_list: Optional[HtmlTag] = None - - -bulleted_list_styles = ["circle", "square", "disc"] - - -def build_bulleted_list_children( - client: Client, - bulleted_list_item_parent: Block, - list_style_ind: int = 0, -) -> BulletedListResponse: - if not isinstance(bulleted_list_item_parent.block, notion_blocks.BulletedListItem): - raise ValueError( - f"block type not bulleted list item: {type(bulleted_list_item_parent.block)}", - ) - html = bulleted_list_item_parent.get_html() - if html: - html.attributes = [Style("margin-left: 10px")] - if not bulleted_list_item_parent.has_children: - return BulletedListResponse( - html=html, - ) - children = [] - for child_block in client.blocks.children.iterate_list( # type: ignore - block_id=bulleted_list_item_parent.id, - ): - children.extend(child_block) - if not children: - return BulletedListResponse( - html=bulleted_list_item_parent.get_html(), - ) - child_html = [] - for child in children: - child_resp = build_bulleted_list_children( - client=client, - bulleted_list_item_parent=child, - 
list_style_ind=(list_style_ind + 1) % len(bulleted_list_styles), - ) - child_html.append(child_resp.html) - if child_children := child_resp.child_list: - child_html.append(child_children) - - return BulletedListResponse( - html=html, - child_list=Ul( - [Style(f"list-style-type: {bulleted_list_styles[list_style_ind]}")], - child_html, - ), - ) - - -@dataclass -class NumberedListResponse: - html: HtmlTag - child_list: Optional[HtmlTag] = None - - -numbered_list_types = ["a", "i", "1"] - - -def build_numbered_list_children( - client: Client, - numbered_list_item_parent: Block, - type_attr_ind=0, -) -> NumberedListResponse: - if not isinstance(numbered_list_item_parent.block, notion_blocks.NumberedListItem): - raise ValueError( - f"block type not numbered list item: {type(numbered_list_item_parent.block)}", - ) - html = numbered_list_item_parent.get_html() - if html: - html.attributes = [Style("margin-left: 10px")] - if not numbered_list_item_parent.has_children: - return NumberedListResponse( - html=html, - ) - children = [] - for child_block in client.blocks.children.iterate_list( # type: ignore - block_id=numbered_list_item_parent.id, - ): - children.extend(child_block) - if not children: - return NumberedListResponse( - html=numbered_list_item_parent.get_html(), - ) - child_html = [] - for child in children: - child_resp = build_numbered_list_children( - client=client, - numbered_list_item_parent=child, - type_attr_ind=(type_attr_ind + 1) % len(numbered_list_types), - ) - child_html.append(child_resp.html) - if child_children := child_resp.child_list: - child_html.append(child_children) - - return NumberedListResponse( - html=html, - child_list=Ol([Type(numbered_list_types[type_attr_ind])], child_html), - ) diff --git a/unstructured/ingest/connector/notion/interfaces.py b/unstructured/ingest/connector/notion/interfaces.py deleted file mode 100644 index bcfa788d5..000000000 --- a/unstructured/ingest/connector/notion/interfaces.py +++ /dev/null @@ -1,32 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Optional - -from htmlBuilder.tags import HtmlTag - - -class FromJSONMixin(ABC): - @classmethod - @abstractmethod - def from_dict(cls, data: dict): - pass - - -class GetHTMLMixin(ABC): - @abstractmethod - def get_html(self) -> Optional[HtmlTag]: - pass - - -class BlockBase(FromJSONMixin, GetHTMLMixin): - @staticmethod - @abstractmethod - def can_have_children() -> bool: - pass - - -class DBPropertyBase(FromJSONMixin): - pass - - -class DBCellBase(FromJSONMixin, GetHTMLMixin): - pass diff --git a/unstructured/ingest/connector/notion/types/__init__.py b/unstructured/ingest/connector/notion/types/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/connector/notion/types/block.py b/unstructured/ingest/connector/notion/types/block.py deleted file mode 100644 index 7159816d9..000000000 --- a/unstructured/ingest/connector/notion/types/block.py +++ /dev/null @@ -1,95 +0,0 @@ -# https://developers.notion.com/reference/page -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import ( - BlockBase, - FromJSONMixin, - GetHTMLMixin, -) -from unstructured.ingest.connector.notion.types import blocks -from unstructured.ingest.connector.notion.types.parent import Parent -from unstructured.ingest.connector.notion.types.user import PartialUser - -block_type_mapping = { - "bookmark": blocks.Bookmark, - "breadcrumb": blocks.Breadcrumb, - 
"bulleted_list_item": blocks.BulletedListItem, - "callout": blocks.Callout, - "child_database": blocks.ChildDatabase, - "child_page": blocks.ChildPage, - "code": blocks.Code, - "column": blocks.Column, - "column_list": blocks.ColumnList, - "divider": blocks.Divider, - "heading_1": blocks.Heading, - "heading_2": blocks.Heading, - "heading_3": blocks.Heading, - "embed": blocks.Embed, - "equation": blocks.Equation, - "file": blocks.File, - "image": blocks.Image, - "link_preview": blocks.LinkPreview, - "link_to_page": blocks.LinkToPage, - "numbered_list_item": blocks.NumberedListItem, - "paragraph": blocks.Paragraph, - "pdf": blocks.PDF, - "quote": blocks.Quote, - "synced_block": blocks.SyncBlock, - "table": blocks.Table, - "table_of_contents": blocks.TableOfContents, - "table_row": blocks.TableRow, - "template": blocks.Template, - "to_do": blocks.ToDo, - "toggle": blocks.Toggle, - "unsupported": blocks.Unsupported, - "video": blocks.Video, -} - - -@dataclass -class Block(FromJSONMixin, GetHTMLMixin): - id: str - type: str - created_time: str - created_by: PartialUser - last_edited_time: str - last_edited_by: PartialUser - archived: bool - has_children: bool - parent: Parent - block: BlockBase - object: str = "block" - request_id: Optional[str] = None - - def __repr__(self): - return f"{self.__class__.__name__}(id={self.id}, type={self.type})" - - @classmethod - def from_dict(cls, data: dict): - t = data["type"] - block_data = data.pop(t) - created_by = data.pop("created_by") - last_edited_by = data.pop("last_edited_by") - parent = data.pop("parent") - try: - block = cls( - created_by=PartialUser.from_dict(created_by), - last_edited_by=PartialUser.from_dict(last_edited_by), - parent=Parent.from_dict(parent), - block=block_type_mapping[t].from_dict(block_data), # type: ignore - **data, - ) - except KeyError as ke: - raise KeyError(f"failed to map to associated block type -> {t}: {block_data}") from ke - except TypeError as te: - raise TypeError(f"failed to map to associated block type -> {t}: {block_data}") from te - - return block - - def get_html(self) -> Optional[HtmlTag]: - if self.block: - return self.block.get_html() - return None diff --git a/unstructured/ingest/connector/notion/types/blocks/__init__.py b/unstructured/ingest/connector/notion/types/blocks/__init__.py deleted file mode 100644 index 5cd158bc8..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/__init__.py +++ /dev/null @@ -1,63 +0,0 @@ -from .bookmark import Bookmark -from .breadcrumb import Breadcrumb -from .bulleted_list_item import BulletedListItem -from .callout import Callout -from .child_database import ChildDatabase -from .child_page import ChildPage -from .code import Code -from .column_list import Column, ColumnList -from .divider import Divider -from .embed import Embed -from .equation import Equation -from .file import File -from .heading import Heading -from .image import Image -from .link_preview import LinkPreview -from .link_to_page import LinkToPage -from .numbered_list import NumberedListItem -from .paragraph import Paragraph -from .pdf import PDF -from .quote import Quote -from .synced_block import DuplicateSyncedBlock, OriginalSyncedBlock, SyncBlock -from .table import Table, TableRow -from .table_of_contents import TableOfContents -from .template import Template -from .todo import ToDo -from .toggle import Toggle -from .unsupported import Unsupported -from .video import Video - -__all__ = [ - "Bookmark", - "Breadcrumb", - "BulletedListItem", - "Callout", - "ChildDatabase", - "ChildPage", - 
"Code", - "Column", - "ColumnList", - "Divider", - "Embed", - "Equation", - "File", - "Heading", - "Image", - "LinkPreview", - "LinkToPage", - "NumberedListItem", - "Paragraph", - "PDF", - "Quote", - "SyncBlock", - "OriginalSyncedBlock", - "DuplicateSyncedBlock", - "Table", - "TableRow", - "TableOfContents", - "Template", - "ToDo", - "Toggle", - "Unsupported", - "Video", -] diff --git a/unstructured/ingest/connector/notion/types/blocks/bookmark.py b/unstructured/ingest/connector/notion/types/blocks/bookmark.py deleted file mode 100644 index 46804475f..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/bookmark.py +++ /dev/null @@ -1,40 +0,0 @@ -# https://developers.notion.com/reference/block#bookmark -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, Br, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Bookmark(BlockBase): - url: str - caption: List[RichText] = field(default_factory=list) - - @classmethod - def from_dict(cls, data: dict): - captions = data.pop("caption", []) - return cls( - url=data["url"], - caption=[RichText.from_dict(c) for c in captions], - ) - - def get_html(self) -> Optional[HtmlTag]: - texts = [] - if self.url: - texts.append(A([Href(self.url)], self.url)) - if self.caption: - texts.append(Div([], [rt.get_html() for rt in self.caption])) - if not texts: - return None - joined = [Br()] * (len(texts) * 2 - 1) - joined[0::2] = texts - - return Div([], joined) - - @staticmethod - def can_have_children() -> bool: - return False diff --git a/unstructured/ingest/connector/notion/types/blocks/breadcrumb.py b/unstructured/ingest/connector/notion/types/blocks/breadcrumb.py deleted file mode 100644 index d6b1626a2..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/breadcrumb.py +++ /dev/null @@ -1,21 +0,0 @@ -# https://developers.notion.com/reference/block#breadcrumb -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class Breadcrumb(BlockBase): - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls() - - def get_html(self) -> Optional[HtmlTag]: - pass diff --git a/unstructured/ingest/connector/notion/types/blocks/bulleted_list_item.py b/unstructured/ingest/connector/notion/types/blocks/bulleted_list_item.py deleted file mode 100644 index 5db911dd2..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/bulleted_list_item.py +++ /dev/null @@ -1,31 +0,0 @@ -# https://developers.notion.com/reference/block#bulleted-list-item -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import HtmlTag, Li - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class BulletedListItem(BlockBase): - color: str - children: List[dict] = field(default_factory=list) - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - return cls( - color=data["color"], - children=data.get("children", []), - 
rich_text=[RichText.from_dict(rt) for rt in rich_text], - ) - - def get_html(self) -> Optional[HtmlTag]: - return Li([], [rt.get_html() for rt in self.rich_text]) diff --git a/unstructured/ingest/connector/notion/types/blocks/callout.py b/unstructured/ingest/connector/notion/types/blocks/callout.py deleted file mode 100644 index 6ea2bb130..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/callout.py +++ /dev/null @@ -1,94 +0,0 @@ -# https://developers.notion.com/reference/block#callout -from dataclasses import dataclass, field -from typing import List, Optional, Union - -from htmlBuilder.attributes import Href, Style -from htmlBuilder.tags import A, Div, HtmlTag, P - -from unstructured.ingest.connector.notion.interfaces import ( - BlockBase, - FromJSONMixin, - GetHTMLMixin, -) -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class EmojiIcon(FromJSONMixin, GetHTMLMixin): - emoji: str - type: str = "emoji" - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return P([], self.emoji) - - -@dataclass -class ExternalIconContent(FromJSONMixin): - url: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class ExternalIcon(FromJSONMixin, GetHTMLMixin): - external: ExternalIconContent - type: str = "external" - - @classmethod - def from_dict(cls, data: dict): - return cls(external=ExternalIconContent.from_dict(data=data.pop("external")), **data) - - def get_html(self) -> Optional[HtmlTag]: - if self.external: - return A([Href(self.external.url)], [self.external.url]) - else: - return None - - -class Icon(FromJSONMixin): - @classmethod - def from_dict(cls, data: dict) -> Union[EmojiIcon, ExternalIcon]: - t = data.get("type") - if t == "emoji": - return EmojiIcon.from_dict(data) - elif t == "external": - return ExternalIcon.from_dict(data) - else: - raise ValueError(f"Unexpected icon type: {t} ({data})") - - -@dataclass -class Callout(BlockBase): - color: str - icon: Optional[Union[EmojiIcon, ExternalIcon]] = None - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - return cls( - color=data["color"], - icon=Icon.from_dict(data.pop("icon")), - rich_text=[RichText.from_dict(rt) for rt in rich_text], - ) - - def get_html(self) -> Optional[HtmlTag]: - elements = [] - if self.icon and self.icon.get_html(): - elements.append(self.icon.get_html()) - if self.rich_text: - elements.extend([rt.get_html() for rt in self.rich_text]) - attributes = [] - if self.color: - attributes.append(Style(f"color:{self.color}")) - return Div(attributes, elements) diff --git a/unstructured/ingest/connector/notion/types/blocks/child_database.py b/unstructured/ingest/connector/notion/types/blocks/child_database.py deleted file mode 100644 index 578b400f2..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/child_database.py +++ /dev/null @@ -1,23 +0,0 @@ -# https://developers.notion.com/reference/block#child-database -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag, P - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class ChildDatabase(BlockBase): - title: str - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) 
- - def get_html(self) -> Optional[HtmlTag]: - return P([], self.title) diff --git a/unstructured/ingest/connector/notion/types/blocks/child_page.py b/unstructured/ingest/connector/notion/types/blocks/child_page.py deleted file mode 100644 index 6ee6f9047..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/child_page.py +++ /dev/null @@ -1,23 +0,0 @@ -# https://developers.notion.com/reference/block#child-page -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag, P - -from unstructured.ingest.connector.notion.interfaces import BlockBase, GetHTMLMixin - - -@dataclass -class ChildPage(BlockBase, GetHTMLMixin): - title: str - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return P([], self.title) diff --git a/unstructured/ingest/connector/notion/types/blocks/code.py b/unstructured/ingest/connector/notion/types/blocks/code.py deleted file mode 100644 index 3a6d80e36..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/code.py +++ /dev/null @@ -1,43 +0,0 @@ -# https://developers.notion.com/reference/block#code -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import Br, Div, HtmlTag -from htmlBuilder.tags import Code as HtmlCode - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Code(BlockBase): - language: str - rich_text: List[RichText] = field(default_factory=list) - caption: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - caption = data.pop("caption", []) - return cls( - language=data["language"], - rich_text=[RichText.from_dict(rt) for rt in rich_text], - caption=[RichText.from_dict(c) for c in caption], - ) - - def get_html(self) -> Optional[HtmlTag]: - texts = [] - if self.rich_text: - texts.append(HtmlCode([], [rt.get_html() for rt in self.rich_text])) - if self.caption: - texts.append(Div([], [rt.get_html() for rt in self.caption])) - if not texts: - return None - joined = [Br()] * (len(texts) * 2 - 1) - joined[0::2] = texts - - return Div([], joined) diff --git a/unstructured/ingest/connector/notion/types/blocks/column_list.py b/unstructured/ingest/connector/notion/types/blocks/column_list.py deleted file mode 100644 index d2df367c2..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/column_list.py +++ /dev/null @@ -1,35 +0,0 @@ -# https://developers.notion.com/reference/block#column-list-and-column -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class ColumnList(BlockBase): - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - return cls() - - def get_html(self) -> Optional[HtmlTag]: - return None - - -@dataclass -class Column(BlockBase): - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - return cls() - - def get_html(self) -> Optional[HtmlTag]: - return None diff --git a/unstructured/ingest/connector/notion/types/blocks/divider.py 
b/unstructured/ingest/connector/notion/types/blocks/divider.py deleted file mode 100644 index 33fc01e7b..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/divider.py +++ /dev/null @@ -1,22 +0,0 @@ -# https://developers.notion.com/reference/block#divider -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.attributes import Style -from htmlBuilder.tags import Hr, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class Divider(BlockBase): - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls() - - def get_html(self) -> Optional[HtmlTag]: - return Hr([Style("border-top: 3px solid #bbb")]) diff --git a/unstructured/ingest/connector/notion/types/blocks/embed.py b/unstructured/ingest/connector/notion/types/blocks/embed.py deleted file mode 100644 index 561fe828a..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/embed.py +++ /dev/null @@ -1,36 +0,0 @@ -# https://developers.notion.com/reference/block#embed -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, Br, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Embed(BlockBase): - url: str - caption: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls(caption=[RichText.from_dict(d) for d in data.pop("caption", [])], **data) - - def get_html(self) -> Optional[HtmlTag]: - texts = [] - if self.url: - texts.append(A([Href(self.url)], self.url)) - if self.caption: - texts.append(Div([], [rt.get_html() for rt in self.caption])) - if not texts: - return None - joined = [Br()] * (len(texts) * 2 - 1) - joined[0::2] = texts - - return Div([], joined) diff --git a/unstructured/ingest/connector/notion/types/blocks/equation.py b/unstructured/ingest/connector/notion/types/blocks/equation.py deleted file mode 100644 index ccab3d04d..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/equation.py +++ /dev/null @@ -1,23 +0,0 @@ -# https://developers.notion.com/reference/block#equation -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class Equation(BlockBase): - expression: str - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return Div([], self.expression) diff --git a/unstructured/ingest/connector/notion/types/blocks/file.py b/unstructured/ingest/connector/notion/types/blocks/file.py deleted file mode 100644 index ad7fe54be..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/file.py +++ /dev/null @@ -1,49 +0,0 @@ -# https://developers.notion.com/reference/block#file -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, Br, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.file import External -from 
unstructured.ingest.connector.notion.types.file import File as FileContent -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class File(BlockBase): - type: str - external: Optional[External] = None - file: Optional[FileContent] = None - caption: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - caption = [RichText.from_dict(rt) for rt in data.pop("caption", [])] - t = data["type"] - file = cls(type=t, caption=caption) - if t == "external": - file.external = External.from_dict(data["external"]) - elif t == "file": - file.file = FileContent.from_dict(data["file"]) - return file - - def get_html(self) -> Optional[HtmlTag]: - texts = [] - if self.file: - texts.append(A([Href(self.file.url)], self.file.url)) - if self.external: - texts.append(A([Href(self.external.url)], self.external.url)) - if self.caption: - texts.append(Div([], [rt.get_html() for rt in self.caption])) - if not texts: - return None - joined = [Br()] * (len(texts) * 2 - 1) - joined[0::2] = texts - - return Div([], joined) diff --git a/unstructured/ingest/connector/notion/types/blocks/heading.py b/unstructured/ingest/connector/notion/types/blocks/heading.py deleted file mode 100644 index 86983f585..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/heading.py +++ /dev/null @@ -1,37 +0,0 @@ -# https://developers.notion.com/reference/block#headings -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Style -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Heading(BlockBase): - color: str - is_toggleable: bool - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - heading = cls(**data) - heading.rich_text = [RichText.from_dict(rt) for rt in rich_text] - return heading - - def get_html(self) -> Optional[HtmlTag]: - if not self.rich_text: - return None - - texts = [rt.get_html() for rt in self.rich_text] - attributes = [] - if self.color and self.color != "default": - attributes.append(Style(f"color: {self.color}")) - return Div(attributes, texts) diff --git a/unstructured/ingest/connector/notion/types/blocks/image.py b/unstructured/ingest/connector/notion/types/blocks/image.py deleted file mode 100644 index d9c5203c4..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/image.py +++ /dev/null @@ -1,21 +0,0 @@ -# https://developers.notion.com/reference/block#image -from typing import Optional - -from htmlBuilder.attributes import Src -from htmlBuilder.tags import HtmlTag, Img - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.file import FileObject - - -class Image(BlockBase, FileObject): - @staticmethod - def can_have_children() -> bool: - return False - - def get_html(self) -> Optional[HtmlTag]: - if self.external: - return Img([Src(self.external.url)], []) - if self.file: - return Img([Src(self.file.url)], []) - return None diff --git a/unstructured/ingest/connector/notion/types/blocks/link_preview.py b/unstructured/ingest/connector/notion/types/blocks/link_preview.py deleted file mode 100644 
index 913df1f72..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/link_preview.py +++ /dev/null @@ -1,24 +0,0 @@ -# https://developers.notion.com/reference/block#link-preview -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class LinkPreview(BlockBase): - url: str - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return A([Href(self.url)], self.url) diff --git a/unstructured/ingest/connector/notion/types/blocks/link_to_page.py b/unstructured/ingest/connector/notion/types/blocks/link_to_page.py deleted file mode 100644 index ed9156d26..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/link_to_page.py +++ /dev/null @@ -1,29 +0,0 @@ -# https://developers.notion.com/reference/block#link-to-page -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class LinkToPage(BlockBase): - type: str - page_id: Optional[str] = None - database_id: Optional[str] = None - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - if page_id := self.page_id: - return Div([], page_id) - if database_id := self.database_id: - return Div([], database_id) - return None diff --git a/unstructured/ingest/connector/notion/types/blocks/numbered_list.py b/unstructured/ingest/connector/notion/types/blocks/numbered_list.py deleted file mode 100644 index b0051bc80..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/numbered_list.py +++ /dev/null @@ -1,29 +0,0 @@ -# https://developers.notion.com/reference/block#numbered-list-item -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import HtmlTag, Li - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class NumberedListItem(BlockBase): - color: str - children: List[dict] = field(default_factory=list) - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - numbered_list = cls(**data) - numbered_list.rich_text = [RichText.from_dict(rt) for rt in rich_text] - return numbered_list - - def get_html(self) -> Optional[HtmlTag]: - return Li([], [rt.get_html() for rt in self.rich_text]) diff --git a/unstructured/ingest/connector/notion/types/blocks/paragraph.py b/unstructured/ingest/connector/notion/types/blocks/paragraph.py deleted file mode 100644 index bc31e4cba..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/paragraph.py +++ /dev/null @@ -1,31 +0,0 @@ -# https://developers.notion.com/reference/block#paragraph -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import Br, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class 
Paragraph(BlockBase): - color: str - children: List[dict] = field(default_factory=list) - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - paragraph = cls(**data) - paragraph.rich_text = [RichText.from_dict(rt) for rt in rich_text] - return paragraph - - def get_html(self) -> Optional[HtmlTag]: - if not self.rich_text: - return Br() - return Div([], [rt.get_html() for rt in self.rich_text]) diff --git a/unstructured/ingest/connector/notion/types/blocks/pdf.py b/unstructured/ingest/connector/notion/types/blocks/pdf.py deleted file mode 100644 index 61ef3a820..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/pdf.py +++ /dev/null @@ -1,49 +0,0 @@ -# https://developers.notion.com/reference/block#pdf -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, Br, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.file import External, File -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class PDF(BlockBase): - type: str - caption: List[RichText] = field(default_factory=list) - external: Optional[External] = None - file: Optional[File] = None - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - caption = data.pop("caption", []) - t = data["type"] - paragraph = cls(type=t) - paragraph.caption = [RichText.from_dict(c) for c in caption] - if t == "external": - paragraph.external = External.from_dict(data["external"]) - elif t == "file": - paragraph.file = File.from_dict(data["file"]) - return paragraph - - def get_html(self) -> Optional[HtmlTag]: - texts = [] - if self.external: - texts.append(A([Href(self.external.url)], self.external.url)) - if self.file: - texts.append(A([Href(self.file.url)], self.file.url)) - if self.caption: - texts.append(Div([], [rt.get_html() for rt in self.caption])) - if not texts: - return None - joined = [Br()] * (len(texts) * 2 - 1) - joined[0::2] = texts - - return Div([], joined) diff --git a/unstructured/ingest/connector/notion/types/blocks/quote.py b/unstructured/ingest/connector/notion/types/blocks/quote.py deleted file mode 100644 index 1469f1d2a..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/quote.py +++ /dev/null @@ -1,37 +0,0 @@ -# https://developers.notion.com/reference/block#quote -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Style -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Quote(BlockBase): - color: str - children: List[dict] = field(default_factory=list) - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - quote = cls(**data) - quote.rich_text = [RichText.from_dict(rt) for rt in rich_text] - return quote - - def get_html(self) -> Optional[HtmlTag]: - if not self.rich_text: - return None - - texts = [rt.get_html() for rt in self.rich_text] - attributes = [] - if self.color 
and self.color != "default": - attributes.append(Style(f"color: {self.color}")) - return Div(attributes, texts) diff --git a/unstructured/ingest/connector/notion/types/blocks/synced_block.py b/unstructured/ingest/connector/notion/types/blocks/synced_block.py deleted file mode 100644 index b4cd2da10..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/synced_block.py +++ /dev/null @@ -1,57 +0,0 @@ -# https://developers.notion.com/reference/block#synced-block -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class OriginalSyncedBlock(BlockBase): - synced_from: Optional[str] = None - children: List[dict] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - return cls(children=data["children"]) - - def get_html(self) -> Optional[HtmlTag]: - return None - - -@dataclass -class DuplicateSyncedBlock(BlockBase): - type: str - block_id: str - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return None - - -class SyncBlock(BlockBase): - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - if "synced_from" in data: - return OriginalSyncedBlock.from_dict(data) - else: - return DuplicateSyncedBlock.from_dict(data) - - def get_html(self) -> Optional[HtmlTag]: - return None diff --git a/unstructured/ingest/connector/notion/types/blocks/table.py b/unstructured/ingest/connector/notion/types/blocks/table.py deleted file mode 100644 index 785827563..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/table.py +++ /dev/null @@ -1,63 +0,0 @@ -# https://developers.notion.com/reference/block#table -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import HtmlTag, Td, Th, Tr - -from unstructured.ingest.connector.notion.interfaces import ( - BlockBase, - FromJSONMixin, -) -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Table(BlockBase): - table_width: int - has_column_header: bool - has_row_header: bool - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return None - - -@dataclass -class TableCell(FromJSONMixin): - rich_texts: List[RichText] - - @classmethod - def from_dict(cls, data: dict): - return cls(rich_texts=[RichText.from_dict(rt) for rt in data.pop("rich_texts", [])]) - - def get_html(self, is_header: bool) -> Optional[HtmlTag]: - if is_header: - return Th([], [rt.get_html() for rt in self.rich_texts]) - else: - return Td([], [rt.get_html() for rt in self.rich_texts]) - - -# https://developers.notion.com/reference/block#table-rows -@dataclass -class TableRow(BlockBase): - is_header: bool = False - cells: List[TableCell] = field(default_factory=list) - - @classmethod - def from_dict(cls, data: dict): - cells = data.get("cells", []) - return cls(cells=[TableCell.from_dict({"rich_texts": c}) for c in cells]) - - @staticmethod - def can_have_children() -> bool: - return False - - def get_html(self) -> Optional[HtmlTag]: - return Tr([], [cell.get_html(is_header=self.is_header) for cell in self.cells]) diff 
--git a/unstructured/ingest/connector/notion/types/blocks/table_of_contents.py b/unstructured/ingest/connector/notion/types/blocks/table_of_contents.py deleted file mode 100644 index f753f6074..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/table_of_contents.py +++ /dev/null @@ -1,23 +0,0 @@ -# https://developers.notion.com/reference/block#table-of-contents -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class TableOfContents(BlockBase): - color: str - - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return None diff --git a/unstructured/ingest/connector/notion/types/blocks/template.py b/unstructured/ingest/connector/notion/types/blocks/template.py deleted file mode 100644 index 45056876f..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/template.py +++ /dev/null @@ -1,30 +0,0 @@ -# https://developers.notion.com/reference/block#template -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Template(BlockBase): - children: List[dict] = field(default_factory=list) - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - template = cls(**data) - template.rich_text = [RichText.from_dict(rt) for rt in rich_text] - return template - - def get_html(self) -> Optional[HtmlTag]: - if not self.rich_text: - return None - return Div([], [rt.get_html() for rt in self.rich_text]) diff --git a/unstructured/ingest/connector/notion/types/blocks/todo.py b/unstructured/ingest/connector/notion/types/blocks/todo.py deleted file mode 100644 index 3e03b2ce0..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/todo.py +++ /dev/null @@ -1,42 +0,0 @@ -# https://developers.notion.com/reference/block#to-do -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Checked, Style, Type -from htmlBuilder.tags import Div, HtmlTag, Input - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class ToDo(BlockBase): - color: str - checked: bool = False - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - todo = cls(**data) - todo.rich_text = [RichText.from_dict(rt) for rt in rich_text] - return todo - - def get_html(self) -> Optional[HtmlTag]: - if not self.rich_text: - return None - - elements = [] - check_input_attributes = [Type("checkbox")] - if self.checked: - check_input_attributes.append(Checked("")) - elements.append(Input(check_input_attributes)) - elements.extend([rt.get_html() for rt in self.rich_text]) - attributes = [] - if self.color and self.color != "default": - attributes.append(Style(f"color: {self.color}")) - return Div(attributes, elements) diff --git 
a/unstructured/ingest/connector/notion/types/blocks/toggle.py b/unstructured/ingest/connector/notion/types/blocks/toggle.py deleted file mode 100644 index 8619eb7de..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/toggle.py +++ /dev/null @@ -1,37 +0,0 @@ -# https://developers.notion.com/reference/block#toggle-blocks -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Style -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Toggle(BlockBase): - color: str - children: List[dict] = field(default_factory=list) - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - toggle = cls(**data) - toggle.rich_text = [RichText.from_dict(rt) for rt in rich_text] - return toggle - - def get_html(self) -> Optional[HtmlTag]: - if not self.rich_text: - return None - - texts = [rt.get_html() for rt in self.rich_text] - attributes = [] - if self.color and self.color != "default": - attributes.append(Style(f"color: {self.color}")) - return Div(attributes, texts) diff --git a/unstructured/ingest/connector/notion/types/blocks/unsupported.py b/unstructured/ingest/connector/notion/types/blocks/unsupported.py deleted file mode 100644 index 6e28b8cf2..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/unsupported.py +++ /dev/null @@ -1,20 +0,0 @@ -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class Unsupported(BlockBase): - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls() - - def get_html(self) -> Optional[HtmlTag]: - return None diff --git a/unstructured/ingest/connector/notion/types/blocks/video.py b/unstructured/ingest/connector/notion/types/blocks/video.py deleted file mode 100644 index 2523adf70..000000000 --- a/unstructured/ingest/connector/notion/types/blocks/video.py +++ /dev/null @@ -1,22 +0,0 @@ -# https://developers.notion.com/reference/block#image -from typing import Optional - -from htmlBuilder.attributes import Src -from htmlBuilder.tags import HtmlTag, Source -from htmlBuilder.tags import Video as VideoHtml - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.file import FileObject - - -class Video(BlockBase, FileObject): - @staticmethod - def can_have_children() -> bool: - return False - - def get_html(self) -> Optional[HtmlTag]: - if self.external: - return VideoHtml([], [Source([Src(self.external.url)], [self.external.url])]) - if self.file: - return VideoHtml([], [Source([Src(self.file.url)], [self.file.url])]) - return None diff --git a/unstructured/ingest/connector/notion/types/database.py b/unstructured/ingest/connector/notion/types/database.py deleted file mode 100644 index db5718cf3..000000000 --- a/unstructured/ingest/connector/notion/types/database.py +++ /dev/null @@ -1,72 +0,0 @@ -# https://developers.notion.com/reference/database -from dataclasses import dataclass, field -from typing import Dict, List, Optional - -from htmlBuilder.tags import Div, HtmlTag, Span - -from 
unstructured.ingest.connector.notion.interfaces import ( - DBPropertyBase, - FromJSONMixin, - GetHTMLMixin, -) -from unstructured.ingest.connector.notion.types.database_properties import ( - map_properties, -) -from unstructured.ingest.connector.notion.types.file import FileObject -from unstructured.ingest.connector.notion.types.parent import Parent -from unstructured.ingest.connector.notion.types.rich_text import RichText -from unstructured.ingest.connector.notion.types.user import PartialUser - - -@dataclass -class Database(FromJSONMixin, GetHTMLMixin): - id: str - created_time: str - created_by: PartialUser - last_edited_time: str - last_edited_by: PartialUser - archived: bool - parent: Parent - url: str - is_inline: bool - public_url: str - request_id: Optional[str] = None - properties: Dict[str, DBPropertyBase] = field(default_factory=dict) - title: List[RichText] = field(default_factory=list) - description: List[RichText] = field(default_factory=list) - icon: Optional[FileObject] = None - cover: Optional[FileObject] = None - object: str = "database" - - @classmethod - def from_dict(cls, data: dict): - created_by = data.pop("created_by") - last_edited_by = data.pop("last_edited_by") - icon = data.pop("icon") - cover = data.pop("cover") - parent = data.pop("parent") - title = data.pop("title") - description = data.pop("description") - page = cls( - properties=map_properties(data.pop("properties", {})), - created_by=PartialUser.from_dict(created_by), - last_edited_by=PartialUser.from_dict(last_edited_by), - icon=FileObject.from_dict(icon) if icon else None, - cover=FileObject.from_dict(cover) if cover else None, - parent=Parent.from_dict(parent), - title=[RichText.from_dict(data=r) for r in title], - description=[RichText.from_dict(data=r) for r in description], - **data, - ) - - return page - - def get_html(self) -> Optional[HtmlTag]: - spans = [] - if title := self.title: - spans.append(Span([], [rt.get_html() for rt in title])) - if description := self.description: - spans.append(Span([], [rt.get_html() for rt in description])) - if spans: - return Div([], spans) - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/__init__.py b/unstructured/ingest/connector/notion/types/database_properties/__init__.py deleted file mode 100644 index 100111365..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/__init__.py +++ /dev/null @@ -1,106 +0,0 @@ -from typing import Dict - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - -from .checkbox import Checkbox, CheckboxCell -from .created_by import CreatedBy, CreatedByCell -from .created_time import CreatedTime, CreatedTimeCell -from .date import Date, DateCell -from .email import Email, EmailCell -from .files import Files, FilesCell -from .formula import Formula, FormulaCell -from .last_edited_by import LastEditedBy, LastEditedByCell -from .last_edited_time import LastEditedTime, LastEditedTimeCell -from .multiselect import MultiSelect, MultiSelectCell -from .number import Number, NumberCell -from .people import People, PeopleCell -from .phone_number import PhoneNumber, PhoneNumberCell -from .relation import Relation, RelationCell -from .rich_text import RichText, RichTextCell -from .rollup import Rollup, RollupCell -from .select import Select, SelectCell -from .status import Status, StatusCell -from .title import Title, TitleCell -from .unique_id import UniqueID, UniqueIDCell -from .url import URL, URLCell -from .verification import 
Verification, VerificationCell - -db_prop_type_mapping = { - "checkbox": Checkbox, - "created_by": CreatedBy, - "created_time": CreatedTime, - "date": Date, - "email": Email, - "files": Files, - "formula": Formula, - "last_edited_by": LastEditedBy, - "last_edited_time": LastEditedTime, - "multi_select": MultiSelect, - "number": Number, - "people": People, - "phone_number": PhoneNumber, - "relation": Relation, - "rich_text": RichText, - "rollup": Rollup, - "select": Select, - "status": Status, - "title": Title, - "unique_id": UniqueID, - "url": URL, - "verification": Verification, -} - - -def map_properties(props: Dict[str, dict]) -> Dict[str, DBPropertyBase]: - mapped_dict = {} - for k, v in props.items(): - try: - mapped_dict[k] = db_prop_type_mapping[v["type"]].from_dict(v) # type: ignore - except KeyError as ke: - raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke - - return mapped_dict - - -db_cell_type_mapping = { - "checkbox": CheckboxCell, - "created_by": CreatedByCell, - "created_time": CreatedTimeCell, - "date": DateCell, - "email": EmailCell, - "files": FilesCell, - "formula": FormulaCell, - "last_edited_by": LastEditedByCell, - "last_edited_time": LastEditedTimeCell, - "multi_select": MultiSelectCell, - "number": NumberCell, - "people": PeopleCell, - "phone_number": PhoneNumberCell, - "relation": RelationCell, - "rich_text": RichTextCell, - "rollup": RollupCell, - "select": SelectCell, - "status": StatusCell, - "title": TitleCell, - "unique_id": UniqueIDCell, - "url": URLCell, - "verification": VerificationCell, -} - - -def map_cells(props: Dict[str, dict]) -> Dict[str, DBCellBase]: - mapped_dict = {} - for k, v in props.items(): - try: - t = v["type"] - mapped_dict[k] = db_cell_type_mapping[t].from_dict(v) # type: ignore - except KeyError as ke: - raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke - - return mapped_dict - - -__all__ = [ - "map_properties", - "map_cells", -] diff --git a/unstructured/ingest/connector/notion/types/database_properties/checkbox.py b/unstructured/ingest/connector/notion/types/database_properties/checkbox.py deleted file mode 100644 index b60d187a1..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/checkbox.py +++ /dev/null @@ -1,38 +0,0 @@ -# https://developers.notion.com/reference/property-object#checkbox -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.attributes import Checked, Type -from htmlBuilder.tags import Div, HtmlTag, Input - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - - -@dataclass -class Checkbox(DBPropertyBase): - id: str - name: str - type: str = "checkbox" - checkbox: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class CheckboxCell(DBCellBase): - id: str - checkbox: bool - name: Optional[str] = None - type: str = "checkbox" - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - check_input_attributes = [Type("checkbox")] - if self.checkbox: - check_input_attributes.append(Checked("")) - return Div([], Input(check_input_attributes)) diff --git a/unstructured/ingest/connector/notion/types/database_properties/created_by.py b/unstructured/ingest/connector/notion/types/database_properties/created_by.py deleted file mode 100644 index 034b0c1c4..000000000 --- 
a/unstructured/ingest/connector/notion/types/database_properties/created_by.py +++ /dev/null @@ -1,35 +0,0 @@ -# https://developers.notion.com/reference/property-object#created-by -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase -from unstructured.ingest.connector.notion.types.user import People - - -@dataclass -class CreatedBy(DBPropertyBase): - id: str - name: str - type: str = "created_by" - created_by: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class CreatedByCell(DBCellBase): - id: str - created_by: People - type: str = "created_by" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(created_by=People.from_dict(data.pop("created_by")), **data) - - def get_html(self) -> Optional[HtmlTag]: - return self.created_by.get_html() diff --git a/unstructured/ingest/connector/notion/types/database_properties/created_time.py b/unstructured/ingest/connector/notion/types/database_properties/created_time.py deleted file mode 100644 index 86c1173d6..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/created_time.py +++ /dev/null @@ -1,34 +0,0 @@ -# https://developers.notion.com/reference/property-object#created-time -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - - -@dataclass -class CreatedTime(DBPropertyBase): - id: str - name: str - type: str = "created_time" - created_time: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class CreatedTimeCell(DBCellBase): - id: str - created_time: str - type: str = "created_time" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return Div([], self.created_time) diff --git a/unstructured/ingest/connector/notion/types/database_properties/date.py b/unstructured/ingest/connector/notion/types/database_properties/date.py deleted file mode 100644 index 779ef60cc..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/date.py +++ /dev/null @@ -1,41 +0,0 @@ -# https://developers.notion.com/reference/property-object#date -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase -from unstructured.ingest.connector.notion.types.date import Date as DateType - - -@dataclass -class Date(DBPropertyBase): - id: str - name: str - type: str = "date" - date: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class DateCell(DBCellBase): - id: str - date: Optional[DateType] = None - name: Optional[str] = None - type: str = "date" - - @classmethod - def from_dict(cls, data: dict): - date = None - date_data = data.pop("date") - if date_data: - date = DateType.from_dict(date_data) - return cls(date=date, **data) - - def get_html(self) -> Optional[HtmlTag]: - if date := self.date: - return date.get_html() - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/email.py 
b/unstructured/ingest/connector/notion/types/database_properties/email.py deleted file mode 100644 index 1303770a8..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/email.py +++ /dev/null @@ -1,36 +0,0 @@ -# https://developers.notion.com/reference/property-object#email -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - - -@dataclass -class Email(DBPropertyBase): - id: str - name: str - type: str = "email" - email: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class EmailCell(DBCellBase): - id: str - email: str - name: Optional[str] = None - type: str = "email" - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - if email := self.email: - return Div([], email) - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/files.py b/unstructured/ingest/connector/notion/types/database_properties/files.py deleted file mode 100644 index 680ee15ba..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/files.py +++ /dev/null @@ -1,37 +0,0 @@ -# https://developers.notion.com/reference/property-object#files -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase -from unstructured.ingest.connector.notion.types.file import FileObject - - -@dataclass -class Files(DBPropertyBase): - id: str - name: str - type: str = "files" - files: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class FilesCell(DBCellBase): - id: str - files: List[FileObject] - type: str = "files" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(files=[FileObject.from_dict(f) for f in data.pop("files", [])], **data) - - def get_html(self) -> Optional[HtmlTag]: - if not self.files: - return None - return Div([], [f.get_html() for f in self.files]) diff --git a/unstructured/ingest/connector/notion/types/database_properties/formula.py b/unstructured/ingest/connector/notion/types/database_properties/formula.py deleted file mode 100644 index b1921367e..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/formula.py +++ /dev/null @@ -1,49 +0,0 @@ -# https://developers.notion.com/reference/property-object#formula -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class FormulaProp(FromJSONMixin): - expression: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class Formula(DBPropertyBase): - id: str - name: str - formula: FormulaProp - type: str = "formula" - - @classmethod - def from_dict(cls, data: dict): - return cls(formula=FormulaProp.from_dict(data.pop("formula", {})), **data) - - -@dataclass -class FormulaCell(DBCellBase): - id: str - formula: dict - type: str = "formula" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - formula = 
self.formula - t = formula.get("type") - return Div([], str(formula[t])) diff --git a/unstructured/ingest/connector/notion/types/database_properties/last_edited_by.py b/unstructured/ingest/connector/notion/types/database_properties/last_edited_by.py deleted file mode 100644 index a1a2d0a9c..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/last_edited_by.py +++ /dev/null @@ -1,34 +0,0 @@ -# https://developers.notion.com/reference/property-object#last-edited-by -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase -from unstructured.ingest.connector.notion.types.user import People - - -@dataclass -class LastEditedBy(DBPropertyBase): - @classmethod - def from_dict(cls, data: dict): - return cls() - - def get_text(self) -> Optional[str]: - return None - - -@dataclass -class LastEditedByCell(DBCellBase): - id: str - last_edited_by: People - type: str = "last_edited_by" - - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(last_edited_by=People.from_dict(data.pop("last_edited_by", {})), **data) - - def get_html(self) -> Optional[HtmlTag]: - return self.last_edited_by.get_html() diff --git a/unstructured/ingest/connector/notion/types/database_properties/last_edited_time.py b/unstructured/ingest/connector/notion/types/database_properties/last_edited_time.py deleted file mode 100644 index 4c9e00981..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/last_edited_time.py +++ /dev/null @@ -1,34 +0,0 @@ -# https://developers.notion.com/reference/property-object#last-edited-time -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - - -@dataclass -class LastEditedTime(DBPropertyBase): - id: str - name: str - type: str = "last_edited_time" - last_edited_time: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class LastEditedTimeCell(DBCellBase): - id: str - last_edited_time: str - type: str = "last_edited_time" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return Div([], self.last_edited_time) diff --git a/unstructured/ingest/connector/notion/types/database_properties/multiselect.py b/unstructured/ingest/connector/notion/types/database_properties/multiselect.py deleted file mode 100644 index 7534ab82d..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/multiselect.py +++ /dev/null @@ -1,73 +0,0 @@ -# https://developers.notion.com/reference/property-object#multi-select -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Style -from htmlBuilder.tags import Div, HtmlTag, Span - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class MultiSelectOption(FromJSONMixin): - color: str - id: str - name: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class MultiSelectProp(FromJSONMixin): - options: List[MultiSelectOption] = field(default_factory=list) - - @classmethod - def from_dict(cls, data: dict): - return 
cls(options=[MultiSelectOption.from_dict(o) for o in data.get("options", [])]) - - -@dataclass -class MultiSelect(DBPropertyBase): - id: str - name: str - multi_select: MultiSelectProp - type: str = "multi_select" - - @classmethod - def from_dict(cls, data: dict): - return cls( - multi_select=data.pop("multi_select", {}), - **data, - ) - - -@dataclass -class MultiSelectCell(DBCellBase): - id: str - multi_select: List[MultiSelectOption] - type: str = "multi_select" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls( - multi_select=[MultiSelectOption.from_dict(o) for o in data.pop("multi_select", [])], - **data, - ) - - def get_html(self) -> Optional[HtmlTag]: - if not self.multi_select: - return None - option_spans = [] - for option in self.multi_select: - option_attributes = [] - if option.color and option.color != "default": - option_attributes.append(Style(f"color: {option.color}")) - option_spans.append(Span(option_attributes, option.name)) - return Div([], option_spans) diff --git a/unstructured/ingest/connector/notion/types/database_properties/number.py b/unstructured/ingest/connector/notion/types/database_properties/number.py deleted file mode 100644 index 599981fc0..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/number.py +++ /dev/null @@ -1,49 +0,0 @@ -# https://developers.notion.com/reference/property-object#number -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class NumberProp(FromJSONMixin): - format: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class Number(DBPropertyBase): - id: str - name: str - number: NumberProp - type: str = "number" - - @classmethod - def from_dict(cls, data: dict): - return cls(number=NumberProp.from_dict(data.pop("number")), **data) - - -@dataclass -class NumberCell(DBCellBase): - id: str - number: Optional[int] = None - type: str = "number" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - if number := self.number: - return Div([], str(number)) - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/people.py b/unstructured/ingest/connector/notion/types/database_properties/people.py deleted file mode 100644 index 44e66b2e8..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/people.py +++ /dev/null @@ -1,40 +0,0 @@ -# https://developers.notion.com/reference/property-object#people -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import Div, HtmlTag, Span - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase -from unstructured.ingest.connector.notion.types.user import People as PeopleType - - -@dataclass -class People(DBPropertyBase): - id: str - name: str - type: str = "people" - people: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class PeopleCell(DBCellBase): - id: str - people: List[PeopleType] - type: str = "people" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(people=[PeopleType.from_dict(p) for p in data.pop("people", {})], **data) - - def get_html(self) -> 
Optional[HtmlTag]: - if not self.people: - return None - people_spans = [] - for person in self.people: - people_spans.append(Span([], person.get_html())) - return Div([], people_spans) diff --git a/unstructured/ingest/connector/notion/types/database_properties/phone_number.py b/unstructured/ingest/connector/notion/types/database_properties/phone_number.py deleted file mode 100644 index 58a5c9170..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/phone_number.py +++ /dev/null @@ -1,36 +0,0 @@ -# https://developers.notion.com/reference/property-object#phone-number -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - - -@dataclass -class PhoneNumber(DBPropertyBase): - id: str - name: str - type: str = "phone_number" - phone_number: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class PhoneNumberCell(DBCellBase): - id: str - phone_number: Optional[str] - name: Optional[str] = None - type: str = "phone_number" - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - if phone_number := self.phone_number: - return Div([], phone_number) - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/relation.py b/unstructured/ingest/connector/notion/types/database_properties/relation.py deleted file mode 100644 index 35c283a11..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/relation.py +++ /dev/null @@ -1,67 +0,0 @@ -# https://developers.notion.com/reference/property-object#relation -from dataclasses import dataclass -from typing import Optional -from urllib.parse import unquote - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class DualProperty(FromJSONMixin): - synced_property_id: str - synced_property_name: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class RelationProp(FromJSONMixin): - database_id: str - type: str - dual_property: DualProperty - - @classmethod - def from_dict(cls, data: dict): - t = data.get("type") - if t == "dual_property": - dual_property = DualProperty.from_dict(data.pop(t)) - else: - raise ValueError(f"{t} type not recognized") - - return cls(dual_property=dual_property, **data) - - -@dataclass -class Relation(DBPropertyBase): - id: str - name: str - relation: RelationProp - type: str = "relation" - - @classmethod - def from_dict(cls, data: dict): - return cls(relation=RelationProp.from_dict(data.pop("relation")), **data) - - -@dataclass -class RelationCell(DBCellBase): - id: str - has_more: bool - relation: list - type: str = "relation" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return Div([], unquote(self.id)) diff --git a/unstructured/ingest/connector/notion/types/database_properties/rich_text.py b/unstructured/ingest/connector/notion/types/database_properties/rich_text.py deleted file mode 100644 index 2bd56c2c9..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/rich_text.py +++ /dev/null @@ -1,43 +0,0 @@ -# https://developers.notion.com/reference/property-object#rich-text -from 
dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import Div, HtmlTag, Span - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase -from unstructured.ingest.connector.notion.types.rich_text import ( - RichText as RichTextType, -) - - -@dataclass -class RichText(DBPropertyBase): - id: str - name: str - type: str = "rich_text" - rich_text: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class RichTextCell(DBCellBase): - id: str - rich_text: List[RichTextType] - name: Optional[str] = None - type: str = "rich_text" - - @classmethod - def from_dict(cls, data: dict): - return cls( - rich_text=[RichTextType.from_dict(rt) for rt in data.pop("rich_text", [])], - **data, - ) - - def get_html(self) -> Optional[HtmlTag]: - if not self.rich_text: - return None - spans = [Span([], rt.get_html()) for rt in self.rich_text] - return Div([], spans) diff --git a/unstructured/ingest/connector/notion/types/database_properties/rollup.py b/unstructured/ingest/connector/notion/types/database_properties/rollup.py deleted file mode 100644 index 5134b40c4..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/rollup.py +++ /dev/null @@ -1,56 +0,0 @@ -# https://developers.notion.com/reference/property-object#rollup -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag, Span - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class RollupProp(FromJSONMixin): - function: str - relation_property_id: str - relation_property_name: str - rollup_property_id: str - rollup_property_name: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class Rollup(DBPropertyBase): - id: str - name: str - rollup: RollupProp - type: str = "rollup" - - @classmethod - def from_dict(cls, data: dict): - return cls(rollup=RollupProp.from_dict(data.pop("rollup")), **data) - - -@dataclass -class RollupCell(DBCellBase): - id: str - rollup: dict - type: str = "rollup" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - rollup = self.rollup - t = rollup.get("type") - v = rollup[t] - if isinstance(v, list): - return Div([], [Span([], str(x)) for x in v]) - return Div([], str(v)) diff --git a/unstructured/ingest/connector/notion/types/database_properties/select.py b/unstructured/ingest/connector/notion/types/database_properties/select.py deleted file mode 100644 index 550f2ffed..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/select.py +++ /dev/null @@ -1,68 +0,0 @@ -# https://developers.notion.com/reference/property-object#select -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Style -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class SelectOption(FromJSONMixin): - color: str - id: str - name: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class SelectProp(FromJSONMixin): - options: List[SelectOption] = field(default_factory=list) - - @classmethod - def from_dict(cls, data: dict): - return cls(options=[SelectOption.from_dict(o) for o 
in data.get("options", [])]) - - -@dataclass -class Select(DBPropertyBase): - id: str - name: str - select: SelectProp - type: str = "select" - - @classmethod - def from_dict(cls, data: dict): - return cls(select=SelectProp.from_dict(data.pop("select", {})), **data) - - -@dataclass -class SelectCell(DBCellBase): - id: str - select: Optional[SelectOption] - type: str = "select" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - select_data = data.pop("select") - select = None - if select_data: - select = SelectOption.from_dict(select_data) - return cls(select=select, **data) - - def get_html(self) -> Optional[HtmlTag]: - if select := self.select: - select_attr = [] - if select.color and select.color != "default": - select_attr.append(Style(f"color: {select.color}")) - return Div(select_attr, select.name) - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/status.py b/unstructured/ingest/connector/notion/types/database_properties/status.py deleted file mode 100644 index 8139b98a6..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/status.py +++ /dev/null @@ -1,80 +0,0 @@ -# https://developers.notion.com/reference/property-object#status -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Style -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class StatusOption(FromJSONMixin): - color: str - id: str - name: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class StatusGroup(FromJSONMixin): - color: str - id: str - name: str - option_ids: List[str] = field(default_factory=List[str]) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class StatusProp(FromJSONMixin): - options: List[StatusOption] = field(default_factory=list) - groups: List[StatusGroup] = field(default_factory=list) - - @classmethod - def from_dict(cls, data: dict): - return cls( - options=[StatusOption.from_dict(o) for o in data.get("options", [])], - groups=[StatusGroup.from_dict(g) for g in data.get("groups", [])], - ) - - -@dataclass -class Status(DBPropertyBase): - id: str - name: str - status: StatusProp - type: str = "status" - - @classmethod - def from_dict(cls, data: dict): - return cls(status=StatusProp.from_dict(data.pop("status", {})), **data) - - -@dataclass -class StatusCell(DBCellBase): - id: str - status: Optional[StatusOption] - type: str = "status" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(status=StatusOption.from_dict(data.pop("status", {})), **data) - - def get_html(self) -> Optional[HtmlTag]: - if status := self.status: - select_attr = [] - if status.color and status.color != "default": - select_attr.append(Style(f"color: {status.color}")) - return Div(select_attr, status.name) - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/title.py b/unstructured/ingest/connector/notion/types/database_properties/title.py deleted file mode 100644 index aaee0e6ad..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/title.py +++ /dev/null @@ -1,37 +0,0 @@ -# https://developers.notion.com/reference/property-object#title -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import Div, HtmlTag - -from 
unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Title(DBPropertyBase): - id: str - name: str - type: str = "title" - title: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class TitleCell(DBCellBase): - id: str - title: List[RichText] - type: str = "title" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(title=[RichText.from_dict(rt) for rt in data.pop("title", [])], **data) - - def get_html(self) -> Optional[HtmlTag]: - if not self.title: - return None - return Div([], [rt.get_html() for rt in self.title]) diff --git a/unstructured/ingest/connector/notion/types/database_properties/unique_id.py b/unstructured/ingest/connector/notion/types/database_properties/unique_id.py deleted file mode 100644 index 643f2c07a..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/unique_id.py +++ /dev/null @@ -1,50 +0,0 @@ -# https://developers.notion.com/reference/property-object#title -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, -) - - -@dataclass -class UniqueID(DBPropertyBase): - id: str - name: str - type: str = "unique_id" - unique_id: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class UniqueIDCellData(FromJSONMixin): - prefix: str - number: int - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class UniqueIDCell(DBCellBase): - id: str - unique_id: Optional[UniqueIDCellData] - type: str = "title" - name: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(unique_id=UniqueIDCellData.from_dict(data.pop("unique_id")), **data) - - def get_html(self) -> Optional[HtmlTag]: - if unique_id := self.unique_id: - return Div([], f"{unique_id.prefix}-{unique_id.number}") - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/url.py b/unstructured/ingest/connector/notion/types/database_properties/url.py deleted file mode 100644 index 8233ae9c2..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/url.py +++ /dev/null @@ -1,37 +0,0 @@ -# https://developers.notion.com/reference/property-object#url -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase - - -@dataclass -class URL(DBPropertyBase): - id: str - name: str - type: str = "url" - url: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class URLCell(DBCellBase): - id: str - url: Optional[str] = None - name: Optional[str] = None - type: str = "url" - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - if url := self.url: - return A([Href(url)], url) - return None diff --git a/unstructured/ingest/connector/notion/types/database_properties/verification.py b/unstructured/ingest/connector/notion/types/database_properties/verification.py deleted file mode 100644 index 
03ade8e3b..000000000 --- a/unstructured/ingest/connector/notion/types/database_properties/verification.py +++ /dev/null @@ -1,78 +0,0 @@ -# https://developers.notion.com/reference/property-object#url -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag, Span - -from unstructured.ingest.connector.notion.interfaces import ( - DBCellBase, - DBPropertyBase, - FromJSONMixin, - GetHTMLMixin, -) -from unstructured.ingest.connector.notion.types.date import Date -from unstructured.ingest.connector.notion.types.user import People - - -@dataclass -class Verification(DBPropertyBase): - id: str - name: str - type: str = "verification" - verification: dict = field(default_factory=dict) - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class VerificationData(FromJSONMixin, GetHTMLMixin): - state: Optional[str] - verified_by: Optional[People] - date: Optional[Date] - - @classmethod - def from_dict(cls, data: dict): - verified_by = data.pop("verified_by", None) - date = data.pop("date", None) - return cls( - verified_by=People.from_dict(data=verified_by) if verified_by else None, - date=Date.from_dict(data=date) if date else None, - **data, - ) - - def get_html(self) -> Optional[HtmlTag]: - elements = [] - if state := self.state: - elements.append(Span([], state)) - if (verified_by := self.verified_by) and (verified_by_html := verified_by.get_html()): - elements.append(verified_by_html) - if (date := self.date) and (date_html := date.get_html()): - elements.append(date_html) - if elements: - return Div([], elements) - return None - - -@dataclass -class VerificationCell(DBCellBase): - id: str - verification: Optional[VerificationData] - name: Optional[str] = None - type: str = "verification" - - @classmethod - def from_dict(cls, data: dict): - return cls(verification=VerificationData.from_dict(data.pop("verification")), **data) - - def get_html(self) -> Optional[HtmlTag]: - elements = [] - if name := self.name: - elements.append(Span([], name)) - if (verification := self.verification) and (verification_html := verification.get_html()): - elements.append(verification_html) - - if elements: - return Div([], elements) - return None diff --git a/unstructured/ingest/connector/notion/types/date.py b/unstructured/ingest/connector/notion/types/date.py deleted file mode 100644 index 7c6dcf1fd..000000000 --- a/unstructured/ingest/connector/notion/types/date.py +++ /dev/null @@ -1,26 +0,0 @@ -# https://developers.notion.com/reference/property-value-object#date-property-values -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin - - -@dataclass -class Date(FromJSONMixin, GetHTMLMixin): - start: str - end: Optional[str] = None - time_zone: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - text = f"{self.start}" - if end := self.end: - text += f" - {end}" - if self.time_zone: - text += f" {self.time_zone}" - return Div([], text) diff --git a/unstructured/ingest/connector/notion/types/file.py b/unstructured/ingest/connector/notion/types/file.py deleted file mode 100644 index 6ade2d1e4..000000000 --- a/unstructured/ingest/connector/notion/types/file.py +++ /dev/null @@ -1,51 +0,0 @@ -# https://developers.notion.com/reference/file-object -from dataclasses import dataclass -from typing import 
Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin - - -@dataclass -class External(FromJSONMixin): - url: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class File(FromJSONMixin): - url: str - expiry_time: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class FileObject(FromJSONMixin, GetHTMLMixin): - type: str - external: Optional[External] = None - file: Optional[File] = None - - @classmethod - def from_dict(cls, data: dict): - t = data["type"] - file_object = cls(type=t) - if t == "external": - file_object.external = External.from_dict(data["external"]) - elif t == "file": - file_object.file = File.from_dict(data["file"]) - return file_object - - def get_html(self) -> Optional[HtmlTag]: - if self.file: - return A([Href(self.file.url)], self.file.url) - if self.external: - return A([Href(self.external.url)], self.external.url) - return None diff --git a/unstructured/ingest/connector/notion/types/page.py b/unstructured/ingest/connector/notion/types/page.py deleted file mode 100644 index 1bbda85c7..000000000 --- a/unstructured/ingest/connector/notion/types/page.py +++ /dev/null @@ -1,44 +0,0 @@ -# https://developers.notion.com/reference/page -from dataclasses import dataclass -from typing import Optional - -from unstructured.ingest.connector.notion.interfaces import FromJSONMixin -from unstructured.ingest.connector.notion.types.file import FileObject -from unstructured.ingest.connector.notion.types.parent import Parent -from unstructured.ingest.connector.notion.types.user import PartialUser - - -@dataclass -class Page(FromJSONMixin): - id: str - created_time: str - created_by: PartialUser - last_edited_time: str - last_edited_by: PartialUser - archived: bool - properties: dict - parent: Parent - url: str - public_url: str - request_id: Optional[str] = None - object: str = "page" - icon: Optional[FileObject] = None - cover: Optional[FileObject] = None - - @classmethod - def from_dict(cls, data: dict): - created_by = data.pop("created_by") - last_edited_by = data.pop("last_edited_by") - icon = data.pop("icon") - cover = data.pop("cover") - parent = data.pop("parent") - page = cls( - created_by=PartialUser.from_dict(created_by), - last_edited_by=PartialUser.from_dict(last_edited_by), - icon=FileObject.from_dict(icon) if icon else None, - cover=FileObject.from_dict(cover) if cover else None, - parent=Parent.from_dict(parent), - **data, - ) - - return page diff --git a/unstructured/ingest/connector/notion/types/parent.py b/unstructured/ingest/connector/notion/types/parent.py deleted file mode 100644 index f78c16673..000000000 --- a/unstructured/ingest/connector/notion/types/parent.py +++ /dev/null @@ -1,66 +0,0 @@ -# https://developers.notion.com/reference/parent-object -from dataclasses import dataclass - -from unstructured.ingest.connector.notion.interfaces import FromJSONMixin - - -# https://developers.notion.com/reference/parent-object#database-parent -@dataclass -class DatabaseParent(FromJSONMixin): - database_id: str - type: str = "database_id" - - @classmethod - def from_dict(cls, data: dict): - return cls(database_id=data["database_id"]) - - -# https://developers.notion.com/reference/parent-object#page-parent -@dataclass -class PageParent(FromJSONMixin): - page_id: str - type: str = "page_id" - - @classmethod - def from_dict(cls, data: dict): - return 
cls(page_id=data["page_id"]) - - -# https://developers.notion.com/reference/parent-object#workspace-parent -@dataclass -class WorkspaceParent(FromJSONMixin): - type: str = "workspace" - workspace: bool = True - - @classmethod - def from_dict(cls, data: dict): - return cls() - - -# https://developers.notion.com/reference/parent-object#block-parent -@dataclass -class BlockParent(FromJSONMixin): - block_id: str - type: str = "block_id" - - @classmethod - def from_dict(cls, data: dict): - return cls(block_id=data["block_id"]) - - -@dataclass -class Parent(FromJSONMixin): - block_id: str - type: str = "block_id" - - @classmethod - def from_dict(cls, data: dict): - t = data["type"] - if t == "database_id": - return DatabaseParent.from_dict(data) - elif t == "page_id": - return PageParent.from_dict(data) - elif t == "workspace": - return WorkspaceParent.from_dict(data) - elif t == "block_id": - return BlockParent.from_dict(data) diff --git a/unstructured/ingest/connector/notion/types/rich_text.py b/unstructured/ingest/connector/notion/types/rich_text.py deleted file mode 100644 index ae71a0a78..000000000 --- a/unstructured/ingest/connector/notion/types/rich_text.py +++ /dev/null @@ -1,189 +0,0 @@ -# https://developers.notion.com/reference/rich-text -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.attributes import Href, Style -from htmlBuilder.tags import A, B, Code, Div, HtmlTag, I, S, Span, U -from htmlBuilder.tags import Text as HtmlText - -from unstructured.ingest.connector.notion.interfaces import ( - FromJSONMixin, - GetHTMLMixin, -) -from unstructured.ingest.connector.notion.types.date import Date -from unstructured.ingest.connector.notion.types.user import People - - -@dataclass -class Annotations(FromJSONMixin): - bold: bool - code: bool - italic: bool - strikethrough: bool - underline: bool - color: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class Equation(FromJSONMixin, GetHTMLMixin): - expression: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return Code([], self.expression) if self.expression else None - - -@dataclass -class MentionDatabase(FromJSONMixin, GetHTMLMixin): - id: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return Div([], self.id) if self.id else None - - -@dataclass -class MentionLinkPreview(FromJSONMixin, GetHTMLMixin): - url: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return A([Href(self.url)], self.url) if self.url else None - - -@dataclass -class MentionPage(FromJSONMixin, GetHTMLMixin): - id: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return Div([], self.id) if self.id else None - - -@dataclass -class MentionTemplate(FromJSONMixin): - template_mention_date: Optional[str] - template_mention_user: Optional[str] - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class Mention(FromJSONMixin, GetHTMLMixin): - type: str - database: Optional[MentionDatabase] = None - date: Optional[Date] = None - link_preview: Optional[MentionLinkPreview] = None - page: Optional[MentionPage] = None - template_mention: Optional[MentionTemplate] = None - user: Optional[People] = None - - @classmethod - def from_dict(cls, data: dict): - t = data["type"] - 
mention = cls(type=t) - if t == "date": - mention.date = Date.from_dict(data["date"]) - elif t == "database": - mention.database = MentionDatabase.from_dict(data["database"]) - elif t == "link_preview": - mention.link_preview = MentionLinkPreview.from_dict(data["link_preview"]) - elif t == "page": - mention.page = MentionPage.from_dict(data["page"]) - elif t == "template_mention": - mention.template_mention = MentionTemplate.from_dict(data["template_mention"]) - elif t == "user": - mention.user = People.from_dict(data["user"]) - - return mention - - def get_html(self) -> Optional[HtmlTag]: - t = self.type - if t == "date": - return self.date.get_html() if self.date else None - elif t == "database": - return self.database.get_html() if self.database else None - elif t == "link_preview": - return self.link_preview.get_html() if self.link_preview else None - elif t == "page": - return self.page.get_html() if self.page else None - elif t == "user": - return self.user.get_html() if self.user else None - return None - - -@dataclass -class Text(FromJSONMixin): - content: str - link: Optional[dict] - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class RichText(FromJSONMixin, GetHTMLMixin): - type: str - plain_text: str - annotations: Optional[Annotations] = None - href: Optional[str] = None - text: Optional[Text] = None - mention: Optional[Mention] = None - equation: Optional[Equation] = None - - def get_html(self) -> Optional[HtmlTag]: - text = HtmlText(self.plain_text) - if self.href: - text = A([Href(self.href)], text) - if self.annotations: - annotations = self.annotations - if annotations.bold: - text = B([], text) - if annotations.code: - text = Code([], text) - if annotations.italic: - text = I([], text) - if annotations.strikethrough: - text = S([], text) - if annotations.underline: - text = U([], text) - if annotations.color and annotations.color != "default": - if isinstance(text, HtmlText): - text = Span([], text) - text.attributes.append(Style(f"color:{annotations.color}")) - return text - - @classmethod - def from_dict(cls, data: dict): - t = data["type"] - rich_text = cls( - annotations=Annotations.from_dict(data.pop("annotations")), - **data, - ) - if t == "text": - rich_text.text = Text.from_dict(data["text"]) - elif t == "mention": - rich_text.mention = Mention.from_dict(data["mention"]) - elif t == "equation": - rich_text.equation = Equation.from_dict(data["equation"]) - - return rich_text diff --git a/unstructured/ingest/connector/notion/types/user.py b/unstructured/ingest/connector/notion/types/user.py deleted file mode 100644 index 4574c0b8f..000000000 --- a/unstructured/ingest/connector/notion/types/user.py +++ /dev/null @@ -1,76 +0,0 @@ -# https://developers.notion.com/reference/user -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin - - -@dataclass -class PartialUser(FromJSONMixin): - id: str - object: str = "user" - - @classmethod - def from_dict(cls, data: dict): - return cls(id=data["id"]) - - -@dataclass -class User(FromJSONMixin, GetHTMLMixin): - object: dict - id: str - type: Optional[str] = None - name: Optional[str] = None - avatar_url: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_text(self) -> Optional[str]: - text = self.name - if self.avatar_url: - text = 
f"[{text}]({self.avatar_url}" - return text - - def get_html(self) -> Optional[HtmlTag]: - if self.avatar_url: - return A([Href(self.avatar_url)], self.name) - else: - return Div([], self.name) - - -@dataclass -class People(User): - person: dict = field(default_factory=dict) - - -@dataclass -class Bots(FromJSONMixin, GetHTMLMixin): - object: dict - id: str - bot: dict - owner: dict - type: str - workspace_name: str - name: Optional[str] = None - avatar_url: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_text(self) -> Optional[str]: - text = self.name - if self.avatar_url: - text = f"[{text}]({self.avatar_url}" - return text - - def get_html(self) -> Optional[HtmlTag]: - if self.avatar_url: - return A([Href(self.avatar_url)], self.name) - else: - return Div([], self.name) diff --git a/unstructured/ingest/connector/onedrive.py b/unstructured/ingest/connector/onedrive.py deleted file mode 100644 index 303e7f8fc..000000000 --- a/unstructured/ingest/connector/onedrive.py +++ /dev/null @@ -1,232 +0,0 @@ -import typing as t -from dataclasses import dataclass, field -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.string_and_date_utils import ensure_isoformat_datetime -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from office365.graph_client import GraphClient - from office365.onedrive.driveitems.driveItem import DriveItem -MAX_MB_SIZE = 512_000_000 - - -@dataclass -class OneDriveAccessConfig(AccessConfig): - client_credential: str = enhanced_field(repr=False, sensitive=True, overload_name="client_cred") - - -@dataclass -class SimpleOneDriveConfig(BaseConnectorConfig): - access_config: OneDriveAccessConfig - client_id: str - user_pname: str - tenant: str = field(repr=False) - authority_url: t.Optional[str] = field(repr=False, default="https://login.microsoftonline.com") - path: t.Optional[str] = field(default="") - recursive: bool = False - - def __post_init__(self): - if not (self.client_id and self.access_config.client_credential and self.user_pname): - raise ValueError( - "Please provide all the following mandatory values:" - "\n-ms-client_id\n-ms-client_cred\n-ms-user-pname", - ) - self.token_factory = self._acquire_token - - @SourceConnectionError.wrap - @requires_dependencies(["msal"]) - def _acquire_token(self): - from msal import ConfidentialClientApplication - - try: - app = ConfidentialClientApplication( - authority=f"{self.authority_url}/{self.tenant}", - client_id=self.client_id, - client_credential=self.access_config.client_credential, - ) - token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - except ValueError as exc: - logger.error("Couldn't set up credentials for OneDrive") - raise exc - return token - - -@dataclass -class OneDriveIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleOneDriveConfig - file_name: str - file_path: str - registry_name: str = "onedrive" - - def __post_init__(self): - self.ext = Path(self.file_name).suffix - if not self.ext: - raise ValueError("Unsupported file without extension.") - - 
self.server_relative_path = self.file_path + "/" + self.file_name - self._set_download_paths() - - def _set_download_paths(self) -> None: - """Parses the folder structure from the source and creates the download and output paths""" - download_path = Path(f"{self.read_config.download_dir}") - output_path = Path(f"{self.processor_config.output_dir}") - - if parent_path := self.file_path: - download_path = ( - download_path if parent_path == "" else (download_path / parent_path).resolve() - ) - output_path = ( - output_path if parent_path == "" else (output_path / parent_path).resolve() - ) - - self.download_dir = download_path - self.download_filepath = (download_path / self.file_name).resolve() - output_filename = output_filename = self.file_name + ".json" - self.output_dir = output_path - self.output_filepath = (output_path / output_filename).resolve() - - @property - def filename(self): - return Path(self.download_filepath).resolve() - - @property - def _output_filename(self): - return Path(self.output_filepath).resolve() - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "user_pname": self.connector_config.user_pname, - "server_relative_path": self.server_relative_path, - } - - @SourceConnectionNetworkError.wrap - @requires_dependencies(["office365"], extras="onedrive") - def _fetch_file(self): - from office365.graph_client import GraphClient - - client = GraphClient(self.connector_config.token_factory) - root = client.users[self.connector_config.user_pname].drive.get().execute_query().root - file = root.get_by_path(self.server_relative_path).get().execute_query() - return file - - def update_source_metadata(self, **kwargs): - file = kwargs.get("file", self._fetch_file()) - if file is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - - version = None - if (n_versions := len(file.versions)) > 0: - version = file.versions[n_versions - 1].properties.get("id", None) - - self.source_metadata = SourceMetadata( - date_created=ensure_isoformat_datetime(timestamp=file.created_datetime), - date_modified=ensure_isoformat_datetime(timestamp=file.last_modified_datetime), - version=version, - source_url=file.parent_reference.path + "/" + self.file_name, - exists=True, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - file = self._fetch_file() - self.update_source_metadata(file=file) - if file is None: - raise ValueError( - f"Failed to retrieve file {self.file_path}/{self.file_name}", - ) - - fsize = file.get_property("size", 0) - self.output_dir.mkdir(parents=True, exist_ok=True) - - if not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - - if fsize > MAX_MB_SIZE: - logger.info(f"Downloading file with size: {fsize} bytes in chunks") - with self.filename.open(mode="wb") as f: - file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query() - else: - with self.filename.open(mode="wb") as f: - file.download(f).execute_query() - logger.info(f"File downloaded: {self.filename}") - return - - -@dataclass -class OneDriveSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleOneDriveConfig - _client: t.Optional["GraphClient"] = field(init=False, default=None) - - @property - def client(self) -> "GraphClient": - from office365.graph_client import GraphClient - - if self._client is None: - self._client = GraphClient(self.connector_config.token_factory) - 
return self._client - - @requires_dependencies(["office365"], extras="onedrive") - def initialize(self): - _ = self.client - - @requires_dependencies(["office365"], extras="onedrive") - def check_connection(self): - try: - token_resp: dict = self.connector_config.token_factory() - if error := token_resp.get("error"): - raise SourceConnectionError( - "{} ({})".format(error, token_resp.get("error_description")) - ) - _ = self.client - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def _list_objects(self, folder, recursive) -> t.List["DriveItem"]: - drive_items = folder.children.get().execute_query() - files = [d for d in drive_items if d.is_file] - if not recursive: - return files - folders = [d for d in drive_items if d.is_folder] - for f in folders: - files += self._list_objects(f, recursive) - return files - - def _gen_ingest_doc(self, file: "DriveItem") -> OneDriveIngestDoc: - file_path = file.parent_reference.path.split(":")[-1] - file_path = file_path[1:] if file_path[0] == "/" else file_path - return OneDriveIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - file_name=file.name, - file_path=file_path, - ) - - def get_ingest_docs(self): - root = self.client.users[self.connector_config.user_pname].drive.get().execute_query().root - if fpath := self.connector_config.path: - root = root.get_by_path(fpath).get().execute_query() - if root is None or not root.is_folder: - raise ValueError(f"Unable to find directory, given: {fpath}") - files = self._list_objects(root, self.connector_config.recursive) - return [self._gen_ingest_doc(f) for f in files] diff --git a/unstructured/ingest/connector/opensearch.py b/unstructured/ingest/connector/opensearch.py deleted file mode 100644 index 543bfbc39..000000000 --- a/unstructured/ingest/connector/opensearch.py +++ /dev/null @@ -1,219 +0,0 @@ -import typing as t -from dataclasses import dataclass, field - -from dataclasses_json.core import Json - -from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchDestinationConnector, - ElasticsearchDocumentMeta, - ElasticsearchIngestDoc, - ElasticsearchIngestDocBatch, - ElasticsearchSourceConnector, - SimpleElasticsearchConfig, -) -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError -from unstructured.ingest.interfaces import AccessConfig, BaseSingleIngestDoc -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.data_prep import generator_batching_wbytes -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from opensearchpy import OpenSearch - -"""Since the actual OpenSearch project is a fork of Elasticsearch, we are relying -heavily on the Elasticsearch connector code, inheriting the functionality as much as possible.""" - - -@dataclass -class OpenSearchAccessConfig(AccessConfig): - hosts: t.Optional[t.List[str]] = None - username: t.Optional[str] = None - password: t.Optional[str] = enhanced_field(default=None, sensitive=True) - use_ssl: bool = False - verify_certs: bool = False - ssl_show_warn: bool = False - ca_certs: t.Optional[str] = None - client_cert: t.Optional[str] = None - client_key: t.Optional[str] = None - - def to_dict(self, **kwargs) -> t.Dict[str, Json]: - d = super().to_dict(**kwargs) 
- d["http_auth"] = (self.username, self.password) - return d - - -@dataclass -class SimpleOpenSearchConfig(SimpleElasticsearchConfig): - access_config: OpenSearchAccessConfig = None - - -@dataclass -class OpenSearchIngestDoc(ElasticsearchIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Current implementation creates a python OpenSearch client to fetch each doc, - rather than creating a client for each thread. - """ - - connector_config: SimpleOpenSearchConfig - registry_name: str = "opensearch" - - @SourceConnectionError.wrap - @requires_dependencies(["opensearchpy"], extras="opensearch") - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - pass - - -@dataclass -class OpenSearchIngestDocBatch(ElasticsearchIngestDocBatch): - connector_config: SimpleOpenSearchConfig - ingest_docs: t.List[OpenSearchIngestDoc] = field(default_factory=list) - registry_name: str = "opensearch_batch" - - @requires_dependencies(["opensearchpy"], extras="opensearch") - def _get_docs(self): - from opensearchpy import OpenSearch - from opensearchpy.helpers import scan - - ops = OpenSearch(**self.connector_config.access_config.to_dict(apply_name_overload=False)) - scan_query = { - "_source": self.connector_config.fields, - "version": True, - "query": {"ids": {"values": self.list_of_ids}}, - } - - result = scan( - ops, - query=scan_query, - scroll="1m", - index=self.connector_config.index_name, - ) - return list(result) - - @SourceConnectionError.wrap - @requires_dependencies(["opensearchpy"], extras="opensearch") - def get_files(self): - documents = self._get_docs() - for doc in documents: - ingest_doc = OpenSearchIngestDoc( - processor_config=self.processor_config, - read_config=self.read_config, - connector_config=self.connector_config, - document=doc, - document_meta=ElasticsearchDocumentMeta( - self.connector_config.index_name, doc["_id"] - ), - ) - ingest_doc.update_source_metadata() - doc_body = doc["_source"] - filename = ingest_doc.filename - flattened_dict = flatten_dict(dictionary=doc_body) - str_values = [str(value) for value in flattened_dict.values()] - concatenated_values = "\n".join(str_values) - - filename.parent.mkdir(parents=True, exist_ok=True) - with open(filename, "w", encoding="utf8") as f: - f.write(concatenated_values) - self.ingest_docs.append(ingest_doc) - - -@dataclass -class OpenSearchSourceConnector(ElasticsearchSourceConnector): - """Fetches particular fields from all documents in a given opensearch cluster and index""" - - connector_config: SimpleOpenSearchConfig - _ops: t.Optional["OpenSearch"] = field(init=False, default=None) - - @property - def ops(self): - from opensearchpy import OpenSearch - - if self._ops is None: - self._ops = OpenSearch( - **self.connector_config.access_config.to_dict(apply_name_overload=False) - ) - return self._ops - - def check_connection(self): - try: - assert self.ops.ping() - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - @requires_dependencies(["opensearchpy"], extras="opensearch") - def _get_doc_ids(self): - """Fetches all document ids in an index""" - from opensearchpy.helpers import scan - - hits = scan( - self.ops, - query=self.scan_query, - scroll="1m", - index=self.connector_config.index_name, - ) - - return [hit["_id"] for hit in hits] - - def get_ingest_docs(self): - """Fetches all documents in an index, using ids that are fetched with 
_get_doc_ids""" - ids = self._get_doc_ids() - id_batches = [ - ids[ - i - * self.connector_config.batch_size : (i + 1) # noqa - * self.connector_config.batch_size - ] - for i in range( - (len(ids) + self.connector_config.batch_size - 1) - // self.connector_config.batch_size - ) - ] - return [ - OpenSearchIngestDocBatch( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - list_of_ids=batched_ids, - ) - for batched_ids in id_batches - ] - - -@dataclass -class OpenSearchDestinationConnector(ElasticsearchDestinationConnector): - connector_config: SimpleOpenSearchConfig - _client: t.Optional["OpenSearch"] = field(init=False, default=None) - - @DestinationConnectionError.wrap - @requires_dependencies(["opensearchpy"], extras="opensearch") - def generate_client(self) -> "OpenSearch": - from opensearchpy import OpenSearch - - return OpenSearch(**self.connector_config.access_config.to_dict(apply_name_overload=False)) - - @requires_dependencies(["opensearchpy"], extras="opensearch") - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]]) -> None: - logger.info( - f"writing document batches to destination" - f" index named {self.connector_config.index_name}" - f" at {self.connector_config.access_config.hosts}" - f" with batch size (in bytes) {self.write_config.batch_size_bytes}" - f" with {self.write_config.num_processes} (number of) processes" - ) - from opensearchpy.helpers import parallel_bulk - - for batch in generator_batching_wbytes( - elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes - ): - for success, info in parallel_bulk( - self.client, batch, thread_count=self.write_config.num_processes - ): - if not success: - logger.error( - "upload failed for a batch in opensearch destination connector:", info - ) diff --git a/unstructured/ingest/connector/outlook.py b/unstructured/ingest/connector/outlook.py deleted file mode 100644 index 58684a6db..000000000 --- a/unstructured/ingest/connector/outlook.py +++ /dev/null @@ -1,285 +0,0 @@ -import hashlib -import os -import typing as t -from collections import defaultdict -from dataclasses import dataclass, field -from itertools import chain -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -MAX_NUM_EMAILS = 1000000 # Maximum number of emails per folder -if t.TYPE_CHECKING: - from office365.graph_client import GraphClient - - -class MissingFolderError(Exception): - """There are no root folders with those names.""" - - -@dataclass -class OutlookAccessConfig(AccessConfig): - client_credential: str = enhanced_field(repr=False, sensitive=True, overload_name="client_cred") - - -@dataclass -class SimpleOutlookConfig(BaseConnectorConfig): - """This class is getting the token.""" - - access_config: OutlookAccessConfig - user_email: str - client_id: str - tenant: t.Optional[str] = field(repr=False, default="common") - authority_url: t.Optional[str] = field(repr=False, default="https://login.microsoftonline.com") - outlook_folders: t.List[str] = field(default_factory=list) - recursive: bool = False - registry_name: str = 
"outlook" - - def __post_init__(self): - if not (self.client_id and self.access_config.client_credential and self.user_email): - raise ValueError( - "Please provide one of the following mandatory values:" - "\nclient_id\nclient_cred\nuser_email", - ) - self.token_factory = self._acquire_token - - @requires_dependencies(["msal"]) - def _acquire_token(self): - from msal import ConfidentialClientApplication - - try: - app = ConfidentialClientApplication( - authority=f"{self.authority_url}/{self.tenant}", - client_id=self.client_id, - client_credential=self.access_config.client_credential, - ) - token = app.acquire_token_for_client( - scopes=["https://graph.microsoft.com/.default"], - ) - except ValueError as exc: - logger.error("Couldn't set up credentials for Outlook") - raise exc - return token - - @requires_dependencies(["office365"], extras="outlook") - def _get_client(self): - from office365.graph_client import GraphClient - - return GraphClient(self.token_factory) - - -@dataclass -class OutlookIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleOutlookConfig - message_id: str - registry_name: str = "outlook" - - def __post_init__(self): - self._set_download_paths() - - def hash_mail_name(self, id): - """Outlook email ids are 152 char long. Hash to shorten to 16.""" - return hashlib.sha256(id.encode("utf-8")).hexdigest()[:16] - - def _set_download_paths(self) -> None: - """Creates paths for downloading and parsing.""" - download_path = Path(f"{self.read_config.download_dir}") - output_path = Path(f"{self.processor_config.output_dir}") - - self.download_dir = download_path - self.download_filepath = ( - download_path / f"{self.hash_mail_name(self.message_id)}.eml" - ).resolve() - oname = f"{self.hash_mail_name(self.message_id)}.eml.json" - self.output_dir = output_path - self.output_filepath = (output_path / oname).resolve() - - @property - def filename(self): - return Path(self.download_filepath).resolve() - - @property - def _output_filename(self): - return Path(self.output_filepath).resolve() - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "message_id": self.message_id, - "user_email": self.connector_config.user_email, - } - - @requires_dependencies(["office365"], extras="outlook") - def update_source_metadata(self, **kwargs): - from office365.runtime.client_request_exception import ClientRequestException - - try: - client = self.connector_config._get_client() - msg = ( - client.users[self.connector_config.user_email] - .messages[self.message_id] - .get() - .execute_query() - ) - except ClientRequestException as e: - if e.response.status_code == 404: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - raise - self.source_metadata = SourceMetadata( - date_created=msg.created_datetime.isoformat(), - date_modified=msg.last_modified_datetime.isoformat(), - version=msg.get_property("changeKey"), - source_url=msg.get_property("webLink"), - exists=True, - ) - - @SourceConnectionNetworkError.wrap - def _run_download(self, local_file): - client = self.connector_config._get_client() - client.users[self.connector_config.user_email].messages[self.message_id].download( - local_file, - ).execute_query() - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - @requires_dependencies(["office365"], extras="outlook") - def get_file(self): - """Relies on Office365 python sdk message object to do the download.""" - try: - self.connector_config._get_client() - self.update_source_metadata() - if 
not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - - with open( - os.path.join( - self.download_dir, - self.hash_mail_name(self.message_id) + ".eml", - ), - "wb", - ) as local_file: - self._run_download(local_file=local_file) - - except Exception as e: - logger.error( - f"Error while downloading and saving file: {self.hash_mail_name(self.message_id)}.", - ) - logger.error(e) - return - logger.info(f"File downloaded: {self.hash_mail_name(self.message_id)}") - return - - -@dataclass -class OutlookSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleOutlookConfig - _client: t.Optional["GraphClient"] = field(init=False, default=None) - - @property - def client(self) -> "GraphClient": - if self._client is None: - self._client = self.connector_config._get_client() - return self._client - - def initialize(self): - try: - self.get_folder_ids() - except Exception as e: - raise SourceConnectionError(f"failed to validate connection: {e}") - - def check_connection(self): - try: - _ = self.client - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def recurse_folders(self, folder_id, main_folder_dict): - """We only get a count of subfolders for any folder. - Have to make additional calls to get subfolder ids.""" - subfolders = ( - self.client.users[self.connector_config.user_email] - .mail_folders[folder_id] - .child_folders.get() - .execute_query() - ) - for subfolder in subfolders: - for k, v in main_folder_dict.items(): - if subfolder.get_property("parentFolderId") in v: - v.append(subfolder.id) - if subfolder.get_property("childFolderCount") > 0: - self.recurse_folders(subfolder.id, main_folder_dict) - - def get_folder_ids(self): - """Sets the mail folder ids and subfolder ids for requested root mail folders.""" - self.root_folders = defaultdict(list) - root_folders_with_subfolders = [] - get_root_folders = ( - self.client.users[self.connector_config.user_email].mail_folders.get().execute_query() - ) - - for folder in get_root_folders: - self.root_folders[folder.display_name].append(folder.id) - if folder.get_property("childFolderCount") > 0: - root_folders_with_subfolders.append(folder.id) - - for folder in root_folders_with_subfolders: - self.recurse_folders(folder, self.root_folders) - - # Narrow down all mail folder ids (plus all subfolders) to the ones that were requested. - self.selected_folder_ids = list( - chain.from_iterable( - [ - v - for k, v in self.root_folders.items() - if k.lower() in [x.lower() for x in self.connector_config.outlook_folders] - ], - ), - ) - if not self.selected_folder_ids: - raise MissingFolderError( - "There are no root folders with the names: " - f"{self.connector_config.outlook_folders}", - ) - - def get_ingest_docs(self): - """Returns a list of all the message objects that are in the requested root folder(s).""" - filtered_messages = [] - - # Get all the relevant messages in the selected folders/subfolders. - for folder_id in self.selected_folder_ids: - messages = ( - self.client.users[self.connector_config.user_email] - .mail_folders[folder_id] - .messages.get() - .top(MAX_NUM_EMAILS) # Prevents the return from paging - .execute_query() - ) - # Skip empty list if there are no messages in folder. 
- if messages: - filtered_messages.append(messages) - return [ - OutlookIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - message_id=message.id, - ) - for message in list(chain.from_iterable(filtered_messages)) - ] diff --git a/unstructured/ingest/connector/pinecone.py b/unstructured/ingest/connector/pinecone.py deleted file mode 100644 index 6599185a1..000000000 --- a/unstructured/ingest/connector/pinecone.py +++ /dev/null @@ -1,142 +0,0 @@ -import copy -import json -import multiprocessing as mp -import typing as t -import uuid -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError, WriteError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - ConfigSessionHandleMixin, - IngestDocSessionHandleMixin, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from pinecone import Index as PineconeIndex - - -@dataclass -class PineconeAccessConfig(AccessConfig): - api_key: str = enhanced_field(sensitive=True) - - -@dataclass -class SimplePineconeConfig(ConfigSessionHandleMixin, BaseConnectorConfig): - index_name: str - environment: str - access_config: PineconeAccessConfig - - -@dataclass -class PineconeWriteConfig(WriteConfig): - batch_size: int = 50 - num_processes: int = 1 - - -@dataclass -class PineconeDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConnector): - write_config: PineconeWriteConfig - connector_config: SimplePineconeConfig - _index: t.Optional["PineconeIndex"] = None - - def to_dict(self, **kwargs): - """ - The _index variable in this dataclass breaks deepcopy due to: - TypeError: cannot pickle '_thread.lock' object - When serializing, remove it, meaning client data will need to be reinitialized - when deserialized - """ - self_cp = copy.copy(self) - if hasattr(self_cp, "_index"): - setattr(self_cp, "_index", None) - return _asdict(self_cp, **kwargs) - - @property - def pinecone_index(self): - if self._index is None: - self._index = self.create_index() - return self._index - - def initialize(self): - pass - - @requires_dependencies(["pinecone"], extras="pinecone") - def create_index(self) -> "PineconeIndex": - from pinecone import Pinecone - - from unstructured import __version__ as unstructured_version - - pc = Pinecone( - api_key=self.connector_config.access_config.api_key, - source_tag=f"unstructured=={unstructured_version}", - ) - - index = pc.Index(self.connector_config.index_name) - logger.debug(f"Connected to index: {pc.describe_index(self.connector_config.index_name)}") - return index - - @DestinationConnectionError.wrap - def check_connection(self): - _ = self.pinecone_index - - @DestinationConnectionError.wrap - @requires_dependencies(["pinecone"], extras="pinecone") - def upsert_batch(self, batch): - import pinecone.exceptions - - index = self.pinecone_index - try: - response = index.upsert(batch) - except pinecone.exceptions.PineconeApiException as api_error: - raise WriteError(f"http error: {api_error}") from api_error - logger.debug(f"results: {response}") - - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], 
**kwargs) -> None: - logger.info( - f"Upserting {len(elements_dict)} elements to destination " - f"index at {self.connector_config.index_name}", - ) - - pinecone_batch_size = self.write_config.batch_size - - logger.info(f"using {self.write_config.num_processes} processes to upload") - if self.write_config.num_processes == 1: - for chunk in batch_generator(elements_dict, pinecone_batch_size): - self.upsert_batch(chunk) # noqa: E203 - - else: - with mp.Pool( - processes=self.write_config.num_processes, - ) as pool: - pool.map( - self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size)) - ) - - def normalize_dict(self, element_dict: dict) -> dict: - # While flatten_dict enables indexing on various fields, - # element_serialized enables easily reloading the element object to memory. - # element_serialized is formed without text/embeddings to avoid data bloating. - return { - "id": str(uuid.uuid4()), - "values": element_dict.pop("embeddings", None), - "metadata": { - "text": element_dict.pop("text", None), - "element_serialized": json.dumps(element_dict), - **flatten_dict( - element_dict, - separator="-", - flatten_lists=True, - remove_none=True, - ), - }, - } diff --git a/unstructured/ingest/connector/qdrant.py b/unstructured/ingest/connector/qdrant.py deleted file mode 100644 index da19c2dae..000000000 --- a/unstructured/ingest/connector/qdrant.py +++ /dev/null @@ -1,145 +0,0 @@ -import json -import multiprocessing as mp -import typing as t -import uuid -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError, WriteError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - ConfigSessionHandleMixin, - IngestDocSessionHandleMixin, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from qdrant_client import QdrantClient - - -@dataclass -class QdrantAccessConfig(AccessConfig): - api_key: t.Optional[str] = enhanced_field(sensitive=True) - - -@dataclass -class SimpleQdrantConfig(ConfigSessionHandleMixin, BaseConnectorConfig): - collection_name: str - location: t.Optional[str] = None - url: t.Optional[str] = None - port: t.Optional[int] = 6333 - grpc_port: t.Optional[int] = 6334 - prefer_grpc: t.Optional[bool] = False - https: t.Optional[bool] = None - prefix: t.Optional[str] = None - timeout: t.Optional[float] = None - host: t.Optional[str] = None - path: t.Optional[str] = None - force_disable_check_same_thread: t.Optional[bool] = False - access_config: t.Optional[QdrantAccessConfig] = None - - -@dataclass -class QdrantWriteConfig(WriteConfig): - batch_size: int = 50 - num_processes: int = 1 - - -@dataclass -class QdrantDestinationConnector(IngestDocSessionHandleMixin, BaseDestinationConnector): - write_config: QdrantWriteConfig - connector_config: SimpleQdrantConfig - _client: t.Optional["QdrantClient"] = None - - @property - def qdrant_client(self): - if self._client is None: - self._client = self.create_client() - return self._client - - def initialize(self): - ... 
# fmt: skip - - @requires_dependencies(["qdrant_client"], extras="qdrant") - def create_client(self) -> "QdrantClient": - from qdrant_client import QdrantClient - - client = QdrantClient( - location=self.connector_config.location, - url=self.connector_config.url, - port=self.connector_config.port, - grpc_port=self.connector_config.grpc_port, - prefer_grpc=self.connector_config.prefer_grpc, - https=self.connector_config.https, - api_key=( - self.connector_config.access_config.api_key - if self.connector_config.access_config - else None - ), - prefix=self.connector_config.prefix, - timeout=self.connector_config.timeout, - host=self.connector_config.host, - path=self.connector_config.path, - force_disable_check_same_thread=self.connector_config.force_disable_check_same_thread, - ) - - return client - - @DestinationConnectionError.wrap - def check_connection(self): - self.qdrant_client.get_collections() - - @DestinationConnectionError.wrap - @requires_dependencies(["qdrant_client"], extras="qdrant") - def upsert_batch(self, batch: t.List[t.Dict[str, t.Any]]): - from qdrant_client import models - - client = self.qdrant_client - try: - points: list[models.PointStruct] = [models.PointStruct(**item) for item in batch] - response = client.upsert( - self.connector_config.collection_name, points=points, wait=True - ) - except Exception as api_error: - raise WriteError(f"Qdrant error: {api_error}") from api_error - logger.debug(f"results: {response}") - - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info( - f"Upserting {len(elements_dict)} elements to " - f"{self.connector_config.collection_name}", - ) - - qdrant_batch_size = self.write_config.batch_size - - logger.info(f"using {self.write_config.num_processes} processes to upload") - if self.write_config.num_processes == 1: - for chunk in batch_generator(elements_dict, qdrant_batch_size): - self.upsert_batch(chunk) - - else: - with mp.Pool( - processes=self.write_config.num_processes, - ) as pool: - pool.map(self.upsert_batch, list(batch_generator(elements_dict, qdrant_batch_size))) - - def normalize_dict(self, element_dict: dict) -> dict: - return { - "id": str(uuid.uuid4()), - "vector": element_dict.pop("embeddings", {}), - "payload": { - "text": element_dict.pop("text", None), - "element_serialized": json.dumps(element_dict), - **flatten_dict( - element_dict, - separator="-", - flatten_lists=True, - ), - }, - } diff --git a/unstructured/ingest/connector/reddit.py b/unstructured/ingest/connector/reddit.py deleted file mode 100644 index 18f8ba7c7..000000000 --- a/unstructured/ingest/connector/reddit.py +++ /dev/null @@ -1,166 +0,0 @@ -import typing as t -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from praw import Reddit - - -@dataclass -class RedditAccessConfig(AccessConfig): - client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True) - - -@dataclass -class SimpleRedditConfig(BaseConnectorConfig): - access_config: RedditAccessConfig - 
subreddit_name: str - num_posts: int - user_agent: str - client_id: str - search_query: t.Optional[str] = None - - def __post_init__(self): - if self.num_posts <= 0: - raise ValueError("The number of Reddit posts to fetch must be positive.") - - -@dataclass -class RedditIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleRedditConfig = field(repr=False) - post_id: str - registry_name: str = "reddit" - - def _create_full_tmp_dir_path(self): - self.filename.parent.mkdir(parents=True, exist_ok=True) - - @SourceConnectionNetworkError.wrap - @requires_dependencies(["praw"]) - def get_post(self): - from praw import Reddit - from praw.models import Submission - - reddit = Reddit( - client_id=self.connector_config.client_id, - client_secret=self.connector_config.access_config.client_secret, - user_agent=self.connector_config.user_agent, - ) - post = Submission(reddit, self.post_id) - return post - - def update_source_metadata(self, **kwargs): - post = kwargs.get("post", self.get_post()) - if post is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - - file_exists = (post.author != "[deleted]" or post.auth is not None) and ( - post.selftext != "[deleted]" or post.selftext != "[removed]" - ) - - self.source_metadata = SourceMetadata( - date_created=datetime.utcfromtimestamp(post.created_utc).isoformat(), - source_url=post.permalink, - exists=file_exists, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - self._create_full_tmp_dir_path() - # Write the title plus the body, if any - post = self.get_post() - self.update_source_metadata(post=post) - if post is None: - raise ValueError( - f"Failed to retrieve post {self.post_id}", - ) - - text_to_write = f"# {post.title}\n{post.selftext}" - with open(self.filename, "w", encoding="utf8") as f: - f.write(text_to_write) - - @property - def filename(self) -> Path: - return (Path(self.read_config.download_dir) / f"{self.post_id}.md").resolve() - - @property - def _output_filename(self): - return Path(self.processor_config.output_dir) / f"{self.post_id}.json" - - @property - def date_modified(self) -> t.Optional[str]: - return None - - @property - def version(self) -> t.Optional[str]: - return None - - -@dataclass -class RedditSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleRedditConfig - _reddit: t.Optional["Reddit"] = field(init=False, default=None) - - @property - def reddit(self) -> "Reddit": - from praw import Reddit - - if self._reddit is None: - self._reddit = Reddit( - client_id=self.connector_config.client_id, - client_secret=self.connector_config.access_config.client_secret, - user_agent=self.connector_config.user_agent, - ) - return self._reddit - - @requires_dependencies(["praw"], extras="reddit") - def initialize(self): - _ = self.reddit - - def check_connection(self): - from praw.endpoints import API_PATH - from prawcore import ResponseException - - try: - self.reddit._objectify_request(method="HEAD", params=None, path=API_PATH["me"]) - except ResponseException as response_error: - logger.error(f"failed to validate connection: {response_error}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {response_error}") - - def get_ingest_docs(self): - subreddit = self.reddit.subreddit(self.connector_config.subreddit_name) - if self.connector_config.search_query: - posts = subreddit.search( - 
self.connector_config.search_query, - limit=self.connector_config.num_posts, - ) - else: - posts = subreddit.hot(limit=self.connector_config.num_posts) - return [ - RedditIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - post_id=post.id, - ) - for post in posts - ] diff --git a/unstructured/ingest/connector/registry.py b/unstructured/ingest/connector/registry.py deleted file mode 100644 index 35250d6f0..000000000 --- a/unstructured/ingest/connector/registry.py +++ /dev/null @@ -1,109 +0,0 @@ -import json -from typing import Dict, Type, cast - -from unstructured.ingest.connector.airtable import AirtableIngestDoc -from unstructured.ingest.connector.astradb import AstraDBIngestDoc -from unstructured.ingest.connector.biomed import BiomedIngestDoc -from unstructured.ingest.connector.confluence import ConfluenceIngestDoc -from unstructured.ingest.connector.delta_table import DeltaTableIngestDoc -from unstructured.ingest.connector.discord import DiscordIngestDoc -from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchIngestDoc, - ElasticsearchIngestDocBatch, -) -from unstructured.ingest.connector.fsspec.azure import AzureBlobStorageIngestDoc -from unstructured.ingest.connector.fsspec.box import BoxIngestDoc -from unstructured.ingest.connector.fsspec.dropbox import DropboxIngestDoc -from unstructured.ingest.connector.fsspec.gcs import GcsIngestDoc -from unstructured.ingest.connector.fsspec.s3 import S3IngestDoc -from unstructured.ingest.connector.fsspec.sftp import SftpIngestDoc -from unstructured.ingest.connector.github import GitHubIngestDoc -from unstructured.ingest.connector.gitlab import GitLabIngestDoc -from unstructured.ingest.connector.google_drive import GoogleDriveIngestDoc -from unstructured.ingest.connector.hubspot import HubSpotIngestDoc -from unstructured.ingest.connector.jira import JiraIngestDoc -from unstructured.ingest.connector.kafka import KafkaIngestDoc -from unstructured.ingest.connector.local import LocalIngestDoc -from unstructured.ingest.connector.mongodb import MongoDBIngestDoc, MongoDBIngestDocBatch -from unstructured.ingest.connector.notion.connector import ( - NotionDatabaseIngestDoc, - NotionPageIngestDoc, -) -from unstructured.ingest.connector.onedrive import OneDriveIngestDoc -from unstructured.ingest.connector.opensearch import OpenSearchIngestDoc, OpenSearchIngestDocBatch -from unstructured.ingest.connector.outlook import OutlookIngestDoc -from unstructured.ingest.connector.reddit import RedditIngestDoc -from unstructured.ingest.connector.salesforce import SalesforceIngestDoc -from unstructured.ingest.connector.sharepoint import SharepointIngestDoc -from unstructured.ingest.connector.slack import SlackIngestDoc -from unstructured.ingest.connector.wikipedia import ( - WikipediaIngestHTMLDoc, - WikipediaIngestSummaryDoc, - WikipediaIngestTextDoc, -) -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.interfaces import BaseIngestDoc - -INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = { - "airtable": AirtableIngestDoc, - "astradb": AstraDBIngestDoc, - "azure": AzureBlobStorageIngestDoc, - "biomed": BiomedIngestDoc, - "box": BoxIngestDoc, - "confluence": ConfluenceIngestDoc, - "delta-table": DeltaTableIngestDoc, - "discord": DiscordIngestDoc, - "dropbox": DropboxIngestDoc, - "elasticsearch": ElasticsearchIngestDoc, - "elasticsearch_batch": ElasticsearchIngestDocBatch, - "gcs": GcsIngestDoc, - 
"github": GitHubIngestDoc, - "gitlab": GitLabIngestDoc, - "google_drive": GoogleDriveIngestDoc, - "hubspot": HubSpotIngestDoc, - "jira": JiraIngestDoc, - "kafka": KafkaIngestDoc, - "local": LocalIngestDoc, - "mongodb": MongoDBIngestDoc, - "mongodb_batch": MongoDBIngestDocBatch, - "notion_database": NotionDatabaseIngestDoc, - "notion_page": NotionPageIngestDoc, - "onedrive": OneDriveIngestDoc, - "opensearch": OpenSearchIngestDoc, - "opensearch_batch": OpenSearchIngestDocBatch, - "outlook": OutlookIngestDoc, - "reddit": RedditIngestDoc, - "s3": S3IngestDoc, - "salesforce": SalesforceIngestDoc, - "sftp": SftpIngestDoc, - "sharepoint": SharepointIngestDoc, - "slack": SlackIngestDoc, - "wikipedia_html": WikipediaIngestHTMLDoc, - "wikipedia_text": WikipediaIngestTextDoc, - "wikipedia_summary": WikipediaIngestSummaryDoc, -} - - -def create_ingest_doc_from_json(ingest_doc_json: str) -> BaseIngestDoc: - try: - ingest_doc_dict: dict = json.loads(ingest_doc_json) - except TypeError as te: - raise TypeError( - f"failed to load json string when deserializing IngestDoc: {ingest_doc_json}", - ) from te - return create_ingest_doc_from_dict(ingest_doc_dict) - - -def create_ingest_doc_from_dict(ingest_doc_dict: dict) -> BaseIngestDoc: - ingest_doc_dict = ingest_doc_dict.copy() - if "registry_name" not in ingest_doc_dict: - raise ValueError(f"registry_name not present in ingest doc: {ingest_doc_dict}") - registry_name = ingest_doc_dict.pop("registry_name") - try: - ingest_doc_cls = INGEST_DOC_NAME_TO_CLASS[registry_name] - return cast(BaseIngestDoc, ingest_doc_cls.from_dict(ingest_doc_dict)) - except KeyError: - raise ValueError( - f"Error: Received unknown IngestDoc name: {registry_name} while deserializing", - "IngestDoc.", - ) diff --git a/unstructured/ingest/connector/salesforce.py b/unstructured/ingest/connector/salesforce.py deleted file mode 100644 index b17810120..000000000 --- a/unstructured/ingest/connector/salesforce.py +++ /dev/null @@ -1,301 +0,0 @@ -""" -Salesforce Connector -Able to download Account, Case, Campaign, EmailMessage, Lead -Salesforce returns everything as a list of json. -This saves each entry as a separate file to be partitioned. 
-Using JWT authorization -https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm -https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm -""" - -import json -import typing as t -from collections import OrderedDict -from dataclasses import dataclass, field -from datetime import datetime -from email.utils import formatdate -from pathlib import Path -from string import Template -from textwrap import dedent - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -class MissingCategoryError(Exception): - """There are no categories with that name.""" - - -SALESFORCE_API_VERSION = "57.0" - -ACCEPTED_CATEGORIES = ["Account", "Case", "Campaign", "EmailMessage", "Lead"] - -EMAIL_TEMPLATE = Template( - """MIME-Version: 1.0 -Date: $date -Message-ID: $message_identifier -Subject: $subject -From: $from_email -To: $to_email -Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" ---00000000000095c9b205eff92630 -Content-Type: text/plain; charset="UTF-8" -$textbody ---00000000000095c9b205eff92630 -Content-Type: text/html; charset="UTF-8" -$htmlbody ---00000000000095c9b205eff92630-- -""", -) - - -@dataclass -class SalesforceAccessConfig(AccessConfig): - consumer_key: str = enhanced_field(sensitive=True) - private_key: str = enhanced_field(sensitive=True) - - @requires_dependencies(["cryptography"]) - def get_private_key_value_and_type(self) -> t.Tuple[str, t.Type]: - from cryptography.hazmat.primitives import serialization - - try: - serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None) - except ValueError: - pass - else: - return self.private_key, str - - if Path(self.private_key).is_file(): - return self.private_key, Path - - raise ValueError("private_key does not contain PEM private key or path") - - -@dataclass -class SimpleSalesforceConfig(BaseConnectorConfig): - """Connector specific attributes""" - - access_config: SalesforceAccessConfig - categories: t.List[str] - username: str - recursive: bool = False - - @requires_dependencies(["simple_salesforce"], extras="salesforce") - def get_client(self): - from simple_salesforce import Salesforce - - pkey_value, pkey_type = self.access_config.get_private_key_value_and_type() - - return Salesforce( - username=self.username, - consumer_key=self.access_config.consumer_key, - privatekey_file=pkey_value if pkey_type is Path else None, - privatekey=pkey_value if pkey_type is str else None, - version=SALESFORCE_API_VERSION, - ) - - -@dataclass -class SalesforceIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleSalesforceConfig - record_type: str - record_id: str - registry_name: str = "salesforce" - _record: OrderedDict = field(default_factory=lambda: OrderedDict()) - - @property - def record(self): - if not self._record: - self._record = self.get_record() - return self._record - - def get_file_extension(self) -> str: - if self.record_type == "EmailMessage": - extension = ".eml" - elif self.record_type in ["Account", "Lead", "Case", "Campaign"]: - extension = ".xml" - 
else: - raise MissingCategoryError( - f"There are no categories with the name: {self.record_type}", - ) - return extension - - def _tmp_download_file(self) -> Path: - record_file = self.record_id + self.get_file_extension() - return Path(self.read_config.download_dir) / self.record_type / record_file - - @property - def _output_filename(self) -> Path: - record_file = self.record_id + self.get_file_extension() + ".json" - return Path(self.processor_config.output_dir) / self.record_type / record_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - def _xml_for_record(self, record: OrderedDict) -> str: - """Creates partitionable xml file from a record""" - import xml.etree.ElementTree as ET - - def flatten_dict(data, parent, prefix=""): - for key, value in data.items(): - if isinstance(value, OrderedDict): - flatten_dict(value, parent, prefix=f"{prefix}{key}.") - else: - item = ET.Element("item") - item.text = f"{prefix}{key}: {value}" - parent.append(item) - - root = ET.Element("root") - flatten_dict(record, root) - xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode() - return xml_string - - def _eml_for_record(self, email_json: t.Dict[str, t.Any]) -> str: - from dateutil import parser # type: ignore - - """Recreates standard expected .eml format using template.""" - eml = EMAIL_TEMPLATE.substitute( - date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()), - message_identifier=email_json.get("MessageIdentifier"), - subject=email_json.get("Subject"), - from_email=email_json.get("FromAddress"), - to_email=email_json.get("ToAddress"), - textbody=email_json.get("TextBody"), - # TODO: This is a hack to get emails to process correctly. - # The HTML partitioner seems to have issues with
and text without tags like
- htmlbody=email_json.get("HtmlBody", "") # "" because you can't .replace None - .replace("
", "
") - .replace(" OrderedDict: - # Get record from Salesforce based on id - response = self._get_response() - logger.debug(f"response was returned for salesforce record id: {self.record_id}") - records = response["records"] - if not records: - raise ValueError( - f"No record found with record id {self.record_id}: {json.dumps(response)}" - ) - record_json = records[0] - return record_json - - def update_source_metadata(self) -> None: # type: ignore - record_json = self.record - - date_format = "%Y-%m-%dT%H:%M:%S.000+0000" - self.source_metadata = SourceMetadata( - date_created=datetime.strptime(record_json["CreatedDate"], date_format).isoformat(), - date_modified=datetime.strptime( - record_json["LastModifiedDate"], - date_format, - ).isoformat(), - # SystemModstamp is Timestamp if record has been modified by person or automated system - version=record_json.get("SystemModstamp"), - source_url=record_json["attributes"].get("url"), - exists=True, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - """Saves individual json records locally.""" - self._create_full_tmp_dir_path() - record = self.record - - self.update_source_metadata() - - try: - if self.record_type == "EmailMessage": - document = self._eml_for_record(record) - else: - document = self._xml_for_record(record) - - with open(self._tmp_download_file(), "w") as page_file: - page_file.write(document) - - except Exception as e: - logger.error( - f"Error while downloading and saving file: {self.record_id}.", - ) - logger.error(e) - - @property - def filename(self): - """The filename of the file created from a Salesforce record""" - return self._tmp_download_file() - - -@dataclass -class SalesforceSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleSalesforceConfig - - def __post_init__(self): - self.ingest_doc_cls: t.Type[SalesforceIngestDoc] = SalesforceIngestDoc - - def initialize(self): - pass - - @requires_dependencies(["simple_salesforce"], extras="salesforce") - def check_connection(self): - from simple_salesforce.exceptions import SalesforceError - - try: - self.connector_config.get_client() - except SalesforceError as salesforce_error: - logger.error(f"failed to validate connection: {salesforce_error}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {salesforce_error}") - - @requires_dependencies(["simple_salesforce"], extras="salesforce") - def get_ingest_docs(self) -> t.List[SalesforceIngestDoc]: - """Get Salesforce Ids for the records. - Send them to next phase where each doc gets downloaded into the - appropriate format for partitioning. 
- """ - from simple_salesforce.exceptions import SalesforceMalformedRequest - - client = self.connector_config.get_client() - - ingest_docs = [] - for record_type in self.connector_config.categories: - if record_type not in ACCEPTED_CATEGORIES: - raise ValueError(f"{record_type} not currently an accepted Salesforce category") - - try: - # Get ids from Salesforce - records = client.query_all( - f"select Id from {record_type}", - ) - for record in records["records"]: - ingest_docs.append( - SalesforceIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - record_type=record_type, - record_id=record["Id"], - ), - ) - except SalesforceMalformedRequest as e: - raise SalesforceMalformedRequest(f"Problem with Salesforce query: {e}") - - return ingest_docs diff --git a/unstructured/ingest/connector/sharepoint.py b/unstructured/ingest/connector/sharepoint.py deleted file mode 100644 index c65722404..000000000 --- a/unstructured/ingest/connector/sharepoint.py +++ /dev/null @@ -1,573 +0,0 @@ -import json -import os -import typing as t -from dataclasses import dataclass -from html import unescape -from pathlib import Path -from urllib.parse import urlparse - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.interfaces import PermissionsConfig as SharepointPermissionsConfig -from unstructured.ingest.logger import logger -from unstructured.ingest.utils.string_and_date_utils import ensure_isoformat_datetime -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from office365.sharepoint.client_context import ClientContext - from office365.sharepoint.files.file import File - from office365.sharepoint.publishing.pages.page import SitePage - -MAX_MB_SIZE = 512_000_000 -CONTENT_LABELS = ["CanvasContent1", "LayoutWebpartsContent1", "TimeCreated"] - - -@dataclass -class SharepointAccessConfig(AccessConfig): - client_cred: str = enhanced_field(repr=False, sensitive=True) - - -@dataclass -class SimpleSharepointConfig(BaseConnectorConfig): - access_config: SharepointAccessConfig - client_id: str - site: str - path: str - process_pages: bool = enhanced_field(default=True, init=False) - recursive: bool = False - files_only: bool = False - permissions_config: t.Optional[SharepointPermissionsConfig] = None - - def __post_init__(self): - if not (self.client_id and self.access_config.client_cred and self.site): - raise ValueError( - "Please provide one of the following mandatory values:" - "\n--client-id\n--client-cred\n--site", - ) - self.process_pages = not self.files_only - - @requires_dependencies(["office365"], extras="sharepoint") - def get_site_client(self, site_url: str = "") -> "ClientContext": - from office365.runtime.auth.client_credential import ClientCredential - from office365.sharepoint.client_context import ClientContext - - try: - site_client = ClientContext(site_url or self.site).with_credentials( - ClientCredential(self.client_id, self.access_config.client_cred), - ) - except Exception: - logger.error("Couldn't set Sharepoint client.") - raise - return site_client - - def get_permissions_client(self): - try: - permissions_connector = 
SharepointPermissionsConnector(self.permissions_config) - assert permissions_connector.access_token - return permissions_connector - except Exception as e: - logger.error("Couldn't obtain Sharepoint permissions ingestion access token:", e) - - -@dataclass -class SharepointIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleSharepointConfig - site_url: str - server_path: str - is_page: bool - file_path: str - registry_name: str = "sharepoint" - - def __post_init__(self): - self.extension = Path(self.file_path).suffix if not self.is_page else ".html" - self.extension = ".html" if self.extension == ".aspx" else self.extension - if not self.extension: - raise ValueError("Unsupported file without extension.") - - self._set_download_paths() - - def _set_download_paths(self) -> None: - """Parses the folder structure from the source and creates the download and output paths""" - download_path = Path(f"{self.read_config.download_dir}") - output_path = Path(f"{self.processor_config.output_dir}") - parent = Path(self.file_path).with_suffix(self.extension) - self.download_dir = (download_path / parent.parent).resolve() - self.download_filepath = (download_path / parent).resolve() - output_filename = str(parent) + ".json" - self.output_dir = (output_path / parent.parent).resolve() - self.output_filepath = (output_path / output_filename).resolve() - - @property - def filename(self): - return Path(self.download_filepath).resolve() - - @property - def _output_filename(self): - return Path(self.output_filepath).resolve() - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "server_path": self.server_path, - "site_url": self.site_url, - } - - @SourceConnectionNetworkError.wrap - @requires_dependencies(["office365"], extras="sharepoint") - def _fetch_file(self, properties_only: bool = False): - """Retrieves the actual page/file from the Sharepoint instance""" - from office365.runtime.client_request_exception import ClientRequestException - - site_client = self.connector_config.get_site_client(self.site_url) - - try: - if self.is_page: - file = site_client.web.get_file_by_server_relative_path("/" + self.server_path) - file = file.listItemAllFields.select(CONTENT_LABELS).get().execute_query() - else: - file = site_client.web.get_file_by_server_relative_url(self.server_path) - if properties_only: - file = file.get().execute_query() - except ClientRequestException as e: - if e.response.status_code == 404: - return None - raise - return file - - def _fetch_page(self): - site_client = self.connector_config.get_site_client(self.site_url) - try: - page = ( - site_client.site_pages.pages.get_by_url(self.server_path) - .expand(["FirstPublished", "Modified", "Version"]) - .get() - .execute_query() - ) - except Exception as e: - logger.error(f"Failed to retrieve page {self.server_path} from site {self.site_url}") - logger.error(e) - return None - return page - - def update_permissions_data(self): - def parent_name_matches(parent_type, permissions_filename, ingest_doc_filepath): - permissions_filename = permissions_filename.split("_SEP_") - ingest_doc_filepath = ingest_doc_filepath.split("/") - - if parent_type == "sites": - return permissions_filename[0] == ingest_doc_filepath[1] - - elif parent_type == "SitePages" or parent_type == "Shared Documents": - return True - - permissions_data = None - permissions_dir = Path(self.processor_config.output_dir) / "permissions_data" - - if permissions_dir.is_dir(): - parent_type = self.file_path.split("/")[0] - - if 
parent_type == "sites": - read_dir = permissions_dir / "sites" - elif parent_type == "SitePages" or parent_type == "Shared Documents": - read_dir = permissions_dir / "other" - else: - read_dir = permissions_dir / "other" - - for filename in os.listdir(read_dir): - permissions_docname = os.path.splitext(filename)[0].split("_SEP_")[1] - ingestdoc_docname = self.file_path.split("/")[-1] - - if ingestdoc_docname == permissions_docname and parent_name_matches( - parent_type=parent_type, - permissions_filename=filename, - ingest_doc_filepath=self.file_path, - ): - with open(read_dir / filename) as f: - permissions_data = json.loads(f.read()) - - return permissions_data - - def update_source_metadata(self, **kwargs): - if self.is_page: - page = self._fetch_page() - if page is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - self.source_metadata = SourceMetadata( - date_created=page.get_property("FirstPublished", None), - date_modified=page.get_property("Modified", None), - version=page.get_property("Version", ""), - source_url=page.absolute_url, - exists=True, - permissions_data=( - self.update_permissions_data() - if self.connector_config.permissions_config - else None - ), - ) - return - - file = self._fetch_file(True) - if file is None: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - self.source_metadata = SourceMetadata( - date_created=ensure_isoformat_datetime(timestamp=file.time_created), - date_modified=ensure_isoformat_datetime(timestamp=file.time_last_modified), - version=file.major_version, - source_url=file.properties.get("LinkingUrl", None), - exists=True, - permissions_data=( - self.update_permissions_data() if self.connector_config.permissions_config else None - ), - ) - - def _download_page(self): - """Formats and saves locally page content""" - content = self._fetch_file() - self.update_source_metadata() - pld = (content.properties.get("LayoutWebpartsContent1", "") or "") + ( - content.properties.get("CanvasContent1", "") or "" - ) - if pld != "": - pld = unescape(pld) - else: - logger.info( - f"Page {self.server_path} has no retrievable content. \ - Dumping empty doc.", - ) - pld = "
<div></div>
" - - self.output_dir.mkdir(parents=True, exist_ok=True) - if not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - with self.filename.open(mode="w") as f: - f.write(pld) - logger.info(f"File downloaded: {self.filename}") - - def _download_file(self): - file = self._fetch_file() - self.update_source_metadata() - fsize = file.length - self.output_dir.mkdir(parents=True, exist_ok=True) - - if not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - - if fsize > MAX_MB_SIZE: - logger.info(f"Downloading file with size: {fsize} bytes in chunks") - with self.filename.open(mode="wb") as f: - file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query() - else: - with self.filename.open(mode="wb") as f: - file.download(f).execute_query() - logger.info(f"File downloaded: {self.filename}") - - @BaseSingleIngestDoc.skip_if_file_exists - @SourceConnectionError.wrap - @requires_dependencies(["office365"]) - def get_file(self): - if self.is_page: - self._download_page() - else: - self._download_file() - return - - -@dataclass -class SharepointSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleSharepointConfig - - def check_connection(self): - try: - site_client = self.connector_config.get_site_client() - site_client.site_pages.pages.get().execute_query() - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - @requires_dependencies(["office365"], extras="sharepoint") - def _list_files(self, folder, recursive) -> t.List["File"]: - from office365.runtime.client_request_exception import ClientRequestException - - try: - objects = folder.expand(["Files", "Folders"]).get().execute_query() - files = list(objects.files) - if not recursive: - return files - for f in objects.folders: - if "/Forms" in f.serverRelativeUrl: - continue - files += self._list_files(f, recursive) - return files - except ClientRequestException as e: - if e.response.status_code != 404: - logger.info("Caught an error while processing documents %s", e.response.text) - return [] - - def _prepare_ingest_doc(self, obj: t.Union["File", "SitePage"], base_url, is_page=False): - if is_page: - file_path = obj.get_property("Url", "") - server_path = file_path if file_path[0] != "/" else file_path[1:] - if (url_path := (urlparse(base_url).path)) and (url_path != "/"): - file_path = url_path[1:] + "/" + file_path - else: - server_path = obj.serverRelativeUrl - file_path = obj.serverRelativeUrl[1:] - - return SharepointIngestDoc( - processor_config=self.processor_config, - read_config=self.read_config, - connector_config=self.connector_config, - site_url=base_url, - server_path=server_path, - is_page=is_page, - file_path=file_path, - ) - - @requires_dependencies(["office365"], extras="sharepoint") - def _list_pages(self, site_client) -> list: - from office365.runtime.client_request_exception import ClientRequestException - - try: - site_pages = site_client.site_pages.pages.get().execute_query() - except ClientRequestException as e: - logger.info( - "Caught an error while retrieving site pages from %s \n%s", - site_client.base_url, - e.response.text, - ) - return [] - - return [self._prepare_ingest_doc(page, site_client.base_url, True) for page in site_pages] - - def _ingest_site_docs(self, site_client) -> 
t.List["SharepointIngestDoc"]: - root_folder = site_client.web.get_folder_by_server_relative_path(self.connector_config.path) - files = self._list_files(root_folder, self.connector_config.recursive) - if not files: - logger.info( - f"No processable files at path {self.connector_config.path}\ - for site {site_client.base_url}", - ) - output = [] - for file in files: - try: - output.append(self._prepare_ingest_doc(file, site_client.base_url)) - except ValueError as e: - logger.error("Unable to process file %s", file.properties["Name"]) - logger.error(e) - if self.connector_config.process_pages: - page_output = self._list_pages(site_client) - if not page_output: - logger.info(f"Couldn't process pages for site {site_client.base_url}") - output = output + page_output - return output - - def initialize(self): - pass - - def get_ingest_docs(self): - base_site_client = self.connector_config.get_site_client() - - if not all( - getattr(self.connector_config.permissions_config, attr, False) - for attr in ["application_id", "client_cred", "tenant"] - ): - logger.info( - "Permissions config is not fed with 'application_id', 'client_cred' and 'tenant'." - "Skipping permissions ingestion.", - ) - else: - permissions_client = self.connector_config.get_permissions_client() - if permissions_client: - permissions_client.write_all_permissions(self.processor_config.output_dir) - - if not base_site_client.is_tenant: - return self._ingest_site_docs(base_site_client) - tenant = base_site_client.tenant - tenant_sites = tenant.get_site_properties_from_sharepoint_by_filters().execute_query() - tenant_sites = {s.url for s in tenant_sites if (s.url is not None)} - ingest_docs: t.List[SharepointIngestDoc] = [] - for site_url in tenant_sites: - logger.info(f"Processing docs for site: {site_url}") - site_client = self.connector_config.get_site_client(site_url) - ingest_docs = ingest_docs + self._ingest_site_docs(site_client) - return ingest_docs - - -@dataclass -class SharepointPermissionsConnector: - def __init__(self, permissions_config): - self.permissions_config: SharepointPermissionsConfig = permissions_config - self.initialize() - - def initialize(self): - self.access_token: str = self.get_access_token() - - @requires_dependencies(["requests"], extras="sharepoint") - def get_access_token(self) -> str: - import requests - - url = ( - f"https://login.microsoftonline.com/{self.permissions_config.tenant}/oauth2/v2.0/token" - ) - headers = {"Content-Type": "application/x-www-form-urlencoded"} - data = { - "client_id": self.permissions_config.application_id, - "scope": "https://graph.microsoft.com/.default", - "client_secret": self.permissions_config.client_cred, - "grant_type": "client_credentials", - } - response = requests.post(url, headers=headers, data=data) - return response.json()["access_token"] - - def validated_response(self, response): - if response.status_code == 200: - return response.json() - else: - logger.info(f"Request failed with status code {response.status_code}:") - logger.info(response.text) - - @requires_dependencies(["requests"], extras="sharepoint") - def get_sites(self): - import requests - - url = "https://graph.microsoft.com/v1.0/sites" - params = { - "$select": "webUrl, id", - } - - headers = { - "Authorization": f"Bearer {self.access_token}", - } - - response = requests.get(url, params=params, headers=headers) - return self.validated_response(response) - - @requires_dependencies(["requests"], extras="sharepoint") - def get_drives(self, site): - import requests - - url = 
f"https://graph.microsoft.com/v1.0/sites/{site}/drives" - - headers = { - "Authorization": f"Bearer {self.access_token}", - } - - response = requests.get(url, headers=headers) - - return self.validated_response(response) - - @requires_dependencies(["requests"], extras="sharepoint") - def get_drive_items(self, site, drive_id): - import requests - - url = f"https://graph.microsoft.com/v1.0/sites/{site}/drives/{drive_id}/root/children" - - headers = { - "Authorization": f"Bearer {self.access_token}", - } - - response = requests.get(url, headers=headers) - - return self.validated_response(response) - - def extract_site_name_from_weburl(self, weburl): - split_path = urlparse(weburl).path.lstrip("/").split("/") - - if split_path[0] == "sites": - return "sites", split_path[1] - - elif split_path[0] == "Shared%20Documents": - return "Shared Documents", "Shared Documents" - - elif split_path[0] == "personal": - return "Personal", "Personal" - - elif split_path[0] == "_layouts": - return "layouts", "layouts" - - # if other weburl structures are found, additional logic might need to be implemented - - logger.warning( - """Couldn't extract sitename, unknown site or parent type. Skipping permissions - ingestion for the document with the URL:""", - weburl, - ) - - return None, None - - @requires_dependencies(["requests"], extras="sharepoint") - def get_permissions_for_drive_item(self, site, drive_id, item_id): - import requests - - url = f"https://graph.microsoft.com/v1.0/sites/ \ - {site}/drives/{drive_id}/items/{item_id}/permissions" - - headers = { - "Authorization": f"Bearer {self.access_token}", - } - - response = requests.get(url, headers=headers) - - return self.validated_response(response) - - def write_all_permissions(self, output_dir): - sites = [(site["id"], site["webUrl"]) for site in self.get_sites()["value"]] - drive_ids = [] - - logger.info("Obtaining drive data for sites for permissions (rbac)") - for site_id, site_url in sites: - drives = self.get_drives(site_id) - if drives: - drives_for_site = drives["value"] - drive_ids.extend([(site_id, drive["id"]) for drive in drives_for_site]) - - logger.info("Obtaining item data from drives for permissions (rbac)") - item_ids = [] - for site, drive_id in drive_ids: - drive_items = self.get_drive_items(site, drive_id) - if drive_items: - item_ids.extend( - [ - (site, drive_id, item["id"], item["name"], item["webUrl"]) - for item in drive_items["value"] - ], - ) - - permissions_dir = Path(output_dir) / "permissions_data" - - logger.info("Writing permissions data to disk") - for site, drive_id, item_id, item_name, item_web_url in item_ids: - res = self.get_permissions_for_drive_item(site, drive_id, item_id) - if res: - parent_type, parent_name = self.extract_site_name_from_weburl(item_web_url) - - if parent_type == "sites": - write_path = permissions_dir / "sites" / f"{parent_name}_SEP_{item_name}.json" - - elif parent_type == "Personal" or parent_type == "Shared Documents": - write_path = permissions_dir / "other" / f"{parent_name}_SEP_{item_name}.json" - else: - write_path = permissions_dir / "other" / f"{parent_name}_SEP_{item_name}.json" - - if not Path(os.path.dirname(write_path)).is_dir(): - os.makedirs(os.path.dirname(write_path)) - - with open(write_path, "w") as f: - json.dump(res["value"], f) diff --git a/unstructured/ingest/connector/slack.py b/unstructured/ingest/connector/slack.py deleted file mode 100644 index 4f6a8ce42..000000000 --- a/unstructured/ingest/connector/slack.py +++ /dev/null @@ -1,224 +0,0 @@ -import typing as t -import 
xml.etree.ElementTree as ET -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import ( - requires_dependencies, - validate_date_args, -) - -DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z") - - -@dataclass -class SlackAccessConfig(AccessConfig): - token: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleSlackConfig(BaseConnectorConfig): - """Connector config to process all messages by channel id's.""" - - access_config: SlackAccessConfig - channels: t.List[str] - start_date: t.Optional[str] = None - end_date: t.Optional[str] = None - - def validate_inputs(self): - oldest_valid = True - latest_valid = True - - if self.start_date: - oldest_valid = validate_date_args(self.start_date) - - if self.end_date: - latest_valid = validate_date_args(self.end_date) - - return oldest_valid, latest_valid - - def __post_init__(self): - oldest_valid, latest_valid = self.validate_inputs() - if not oldest_valid and not latest_valid: - raise ValueError( - "Start and/or End dates are not valid. ", - ) - - -@dataclass -class SlackIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. 
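As context for the `DATE_FORMATS` tuple and the date validation above, a small self-contained sketch of the conversion the removed `convert_datetime` helper performed (date string to Unix timestamp for the Slack API); the function name here is illustrative:

```python
from datetime import datetime
from typing import Optional

DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")


def to_slack_timestamp(date_time: str) -> Optional[float]:
    # Try each accepted format in turn; the first one that parses wins.
    for fmt in DATE_FORMATS:
        try:
            return datetime.strptime(date_time, fmt).timestamp()
        except ValueError:
            continue
    return None


print(to_slack_timestamp("2024-01-01"))                 # naive date, local-timezone dependent
print(to_slack_timestamp("2024-01-01T12:30:00+00:00"))  # 1704112200.0
```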
- """ - - connector_config: SimpleSlackConfig - channel: str - registry_name: str = "slack" - - # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file - # __post_init__ for multiprocessing simplicity (no Path objects in initially - # instantiated object) - def _tmp_download_file(self): - channel_file = self.channel + ".xml" - return Path(self.read_config.download_dir) / channel_file - - @property - def _output_filename(self): - output_file = self.channel + ".json" - return Path(self.processor_config.output_dir) / output_file - - @property - def version(self) -> t.Optional[str]: - return None - - @property - def source_url(self) -> t.Optional[str]: - return None - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @SourceConnectionNetworkError.wrap - @requires_dependencies(dependencies=["slack_sdk"], extras="slack") - def _fetch_messages(self): - from slack_sdk import WebClient - - self.client = WebClient(token=self.connector_config.access_config.token) - oldest = "0" - latest = "0" - if self.connector_config.start_date: - oldest = self.convert_datetime(self.connector_config.start_date) - - if self.connector_config.end_date: - latest = self.convert_datetime(self.connector_config.end_date) - - result = self.client.conversations_history( - channel=self.channel, - oldest=oldest, - latest=latest, - ) - return result - - def update_source_metadata(self, **kwargs): - result = kwargs.get("result", self._fetch_messages()) - if result is None: - self.source_metadata = SourceMetadata( - exists=True, - ) - return - timestamps = [m["ts"] for m in result["messages"]] - timestamps.sort() - date_created = None - date_modified = None - if len(timestamps) > 0: - date_created = datetime.fromtimestamp(float(timestamps[0])).isoformat() - date_modified = datetime.fromtimestamp( - float(timestamps[len(timestamps) - 1]), - ).isoformat() - - self.source_metadata = SourceMetadata( - date_created=date_created, - date_modified=date_modified, - exists=True, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - @requires_dependencies(dependencies=["slack_sdk"], extras="slack") - def get_file(self): - from slack_sdk.errors import SlackApiError - - """Fetches the data from a slack channel and stores it locally.""" - - self._create_full_tmp_dir_path() - - result = self._fetch_messages() - self.update_source_metadata(result=result) - root = ET.Element("messages") - for message in result["messages"]: - message_elem = ET.SubElement(root, "message") - text_elem = ET.SubElement(message_elem, "text") - text_elem.text = message.get("text") - - cursor = None - while True: - try: - response = self.client.conversations_replies( - channel=self.channel, - ts=message["ts"], - cursor=cursor, - ) - - for reply in response["messages"]: - reply_msg = reply.get("text") - text_elem.text = "".join([str(text_elem.text), " ", reply_msg]) - - if not response["has_more"]: - break - - cursor = response["response_metadata"]["next_cursor"] - - except SlackApiError as e: - logger.error(f"Error retrieving replies: {e.response['error']}") - tree = ET.ElementTree(root) - tree.write(self._tmp_download_file(), encoding="utf-8", xml_declaration=True) - - def convert_datetime(self, date_time): - for format in DATE_FORMATS: - try: - return datetime.strptime(date_time, format).timestamp() - except ValueError: - pass - - @property - def filename(self): - """The filename of the file created from a slack channel""" - return 
self._tmp_download_file() - - -class SlackSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - """Objects of this class support fetching document(s) from""" - - connector_config: SimpleSlackConfig - - @requires_dependencies(dependencies=["slack_sdk"], extras="slack") - def check_connection(self): - from slack_sdk import WebClient - from slack_sdk.errors import SlackClientError - - try: - client = WebClient(token=self.connector_config.access_config.token) - client.users_identity() - except SlackClientError as slack_error: - logger.error(f"failed to validate connection: {slack_error}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {slack_error}") - - def initialize(self): - """Verify that can get metadata for an object, validates connections info.""" - - def get_ingest_docs(self): - return [ - SlackIngestDoc( - connector_config=self.connector_config, - processor_config=self.processor_config, - read_config=self.read_config, - channel=channel, - ) - for channel in self.connector_config.channels - ] diff --git a/unstructured/ingest/connector/sql.py b/unstructured/ingest/connector/sql.py deleted file mode 100644 index 21f1f4a1f..000000000 --- a/unstructured/ingest/connector/sql.py +++ /dev/null @@ -1,196 +0,0 @@ -import copy -import json -import typing as t -import uuid -from dataclasses import dataclass, field - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -ELEMENTS_TABLE_NAME = "elements" - - -@dataclass -class SqlAccessConfig(AccessConfig): - username: t.Optional[str] - password: t.Optional[str] = enhanced_field(sensitive=True) - - -@dataclass -class SimpleSqlConfig(BaseConnectorConfig): - db_type: t.Optional[str] - host: t.Optional[str] - database: t.Optional[str] - port: t.Optional[int] - access_config: SqlAccessConfig - - def __post_init__(self): - if (self.db_type == "sqlite") and (self.database is None): - raise ValueError( - "A sqlite connection requires a path to a *.db file " - "through the `database` argument" - ) - - @property - def connection(self): - if self.db_type == "postgresql": - return self._make_psycopg_connection - elif self.db_type == "sqlite": - return self._make_sqlite_connection - raise ValueError(f"Unsupported database {self.db_type} connection.") - - def _make_sqlite_connection(self): - from sqlite3 import connect - - return connect(database=self.database) - - @requires_dependencies(["psycopg2"], extras="postgres") - def _make_psycopg_connection(self): - from psycopg2 import connect - - return connect( - user=self.access_config.username, - password=self.access_config.password, - dbname=self.database, - host=self.host, - port=self.port, - ) - - -@dataclass -class SqlDestinationConnector(BaseDestinationConnector): - connector_config: SimpleSqlConfig - _client: t.Optional[t.Any] = field(init=False, default=None) - - def to_dict(self, **kwargs): - """ - The _client variable in this dataclass breaks deepcopy due to: - TypeError: cannot pickle '_thread.lock' object - When serializing, remove it, meaning client data will need to be reinitialized - when deserialized - """ - self_cp = copy.copy(self) - if hasattr(self_cp, "_client"): - setattr(self_cp, "_client", 
None) - return _asdict(self_cp, **kwargs) - - @property - def client(self): - if self._client is None: - self._client = self.connector_config.connection() - return self._client - - @DestinationConnectionError.wrap - def initialize(self): - _ = self.client - - def check_connection(self): - try: - cursor = self.client.cursor() - cursor.execute("SELECT 1;") - cursor.close() - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def conform_dict(self, data: dict) -> None: - """ - Updates the element dictionary to conform to the sql schema - """ - from datetime import datetime - - data["id"] = str(uuid.uuid4()) - - # Dict as string formatting - if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"): - # Explicit casting otherwise fails schema type checking - data["metadata"]["data_source"]["record_locator"] = str(json.dumps(record_locator)) - - # Array of items as string formatting - if (embeddings := data.get("embeddings")) and ( - self.connector_config.db_type != "postgresql" - ): - data["embeddings"] = str(json.dumps(embeddings)) - - if points := data.get("metadata", {}).get("coordinates", {}).get("points"): - data["metadata"]["coordinates"]["points"] = str(json.dumps(points)) - - if links := data.get("metadata", {}).get("links", {}): - data["metadata"]["links"] = str(json.dumps(links)) - - if permissions_data := ( - data.get("metadata", {}).get("data_source", {}).get("permissions_data") - ): - data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data) - - if sent_from := data.get("metadata", {}).get("sent_from", {}): - data["metadata"]["sent_from"] = str(json.dumps(sent_from)) - - if sent_to := data.get("metadata", {}).get("sent_to", {}): - data["metadata"]["sent_to"] = str(json.dumps(sent_to)) - - # Datetime formatting - if date_created := data.get("metadata", {}).get("data_source", {}).get("date_created"): - data["metadata"]["data_source"]["date_created"] = datetime.fromisoformat(date_created) - - if date_modified := data.get("metadata", {}).get("data_source", {}).get("date_modified"): - data["metadata"]["data_source"]["date_modified"] = datetime.fromisoformat(date_modified) - - if date_processed := data.get("metadata", {}).get("data_source", {}).get("date_processed"): - data["metadata"]["data_source"]["date_processed"] = datetime.fromisoformat( - date_processed - ) - - if last_modified := data.get("metadata", {}).get("last_modified", {}): - data["metadata"]["last_modified"] = datetime.fromisoformat(last_modified) - - # String casting - if version := data.get("metadata", {}).get("data_source", {}).get("version"): - data["metadata"]["data_source"]["version"] = str(version) - - if page_number := data.get("metadata", {}).get("page_number"): - data["metadata"]["page_number"] = str(page_number) - - if data.get("metadata", {}).get("data_source", None): - data.update(data.get("metadata", {}).pop("data_source", None)) - if data.get("metadata", {}).get("coordinates", None): - data.update(data.get("metadata", {}).pop("coordinates", None)) - if data.get("metadata", {}): - data.update(data.pop("metadata", None)) - - @DestinationConnectionError.wrap - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info( - f"writing {len(elements_dict)} objects to database {self.connector_config.database} " - f"at {self.connector_config.host}" - ) - - with self.client as conn: - cursor = 
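For the `write_dict` logic that follows, a minimal self-contained illustration (assumed one-off schema, in-memory SQLite) of the same pattern: a per-element parameterized INSERT, `?` placeholders for sqlite versus `%s` for PostgreSQL, and JSON-encoding list values such as embeddings:

```python
import json
import sqlite3

ELEMENTS_TABLE_NAME = "elements"

conn = sqlite3.connect(":memory:")
conn.execute(f"CREATE TABLE {ELEMENTS_TABLE_NAME} (id TEXT, text TEXT, embeddings TEXT)")

elem = {"id": "abc-123", "text": "hello world", "embeddings": [0.1, 0.2]}
db_type = "sqlite"

# Keys drive the column list; placeholder style depends on the driver.
placeholders = ",".join("?" if db_type == "sqlite" else "%s" for _ in elem)
query = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(elem.keys())}) VALUES({placeholders})"
# sqlite has no array type, so list values are stored as JSON strings.
values = [json.dumps(v) if isinstance(v, list) else v for v in elem.values()]

cursor = conn.cursor()
cursor.execute(query, values)
conn.commit()
print(conn.execute(f"SELECT * FROM {ELEMENTS_TABLE_NAME}").fetchall())
```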
conn.cursor() - - # Since we have no guarantee that each element will have the same keys - # we insert each element individually - for elem in elements_dict: - query = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(elem.keys())}) \ - VALUES({','.join(['?' if self.connector_config.db_type=='sqlite' else '%s' for x in elem])})" # noqa E501 - values = [] - for v in elem.values(): - if self.connector_config.db_type == "sqlite" and isinstance(v, list): - values.append(json.dumps(v)) - else: - values.append(v) - cursor.execute(query, values) - - conn.commit() - cursor.close() - - # Leaving contexts doesn't close the connection, so doing it here - conn.close() diff --git a/unstructured/ingest/connector/vectara.py b/unstructured/ingest/connector/vectara.py deleted file mode 100644 index e94ff9c4f..000000000 --- a/unstructured/ingest/connector/vectara.py +++ /dev/null @@ -1,248 +0,0 @@ -import datetime -import json -import typing as t -import uuid -from dataclasses import dataclass, field - -import requests - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - BaseIngestDoc, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.staging.base import flatten_dict - -BASE_URL = "https://api.vectara.io/v1" - - -@dataclass -class VectaraAccessConfig(AccessConfig): - oauth_client_id: str = enhanced_field(sensitive=True) - oauth_secret: str = enhanced_field(sensitive=True) - - -@dataclass -class SimpleVectaraConfig(BaseConnectorConfig): - access_config: VectaraAccessConfig - customer_id: str - corpus_name: t.Optional[str] = None - corpus_id: t.Optional[str] = None - token_url: str = "https://vectara-prod-{}.auth.us-west-2.amazoncognito.com/oauth2/token" - - -@dataclass -class VectaraDestinationConnector(BaseDestinationConnector): - write_config: WriteConfig - connector_config: SimpleVectaraConfig - _jwt_token: t.Optional[str] = field(init=False, default=None) - _jwt_token_expires_ts: t.Optional[float] = field(init=False, default=None) - - @property - def jwt_token(self): - if ( - not self._jwt_token - or self._jwt_token_expires_ts - datetime.datetime.now().timestamp() <= 60 - ): - self._jwt_token = self._get_jwt_token() - return self._jwt_token - - @DestinationConnectionError.wrap - def vectara(self): - """ - Check the connection for Vectara and validate corpus exists. - - If more than one corpus with the same name exists - then return a message - - If exactly one corpus exists with this name - use it. - - If does not exist - create it. 
- """ - try: - # Get token if not already set - self.jwt_token - - list_corpora_response = self._request( - endpoint="list-corpora", - data={"numResults": 1, "filter": self.connector_config.corpus_name}, - ) - - possible_corpora_ids_names_map = { - corpus.get("id"): corpus.get("name") - for corpus in list_corpora_response.get("corpus") - if corpus.get("name") == self.connector_config.corpus_name - } - - if len(possible_corpora_ids_names_map) > 1: - return f"Multiple Corpora exist with name {self.connector_config.corpus_name}" - if len(possible_corpora_ids_names_map) == 1: - self.connector_config.corpus_id = list(possible_corpora_ids_names_map.keys())[0] - else: - data = { - "corpus": { - "name": self.connector_config.corpus_name, - } - } - create_corpus_response = self._request(endpoint="create-corpus", data=data) - self.connector_config.corpus_id = create_corpus_response.get("corpusId") - - except Exception as e: - logger.error(f"failed to create Vectara connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to create Vectara connection: {e}") - - def initialize(self): - self.vectara() - - def _request( - self, - endpoint: str, - http_method: str = "POST", - params: t.Mapping[str, t.Any] = None, - data: t.Mapping[str, t.Any] = None, - ): - url = f"{BASE_URL}/{endpoint}" - - headers = { - "Content-Type": "application/json", - "Accept": "application/json", - "Authorization": f"Bearer {self.jwt_token}", - "customer-id": self.connector_config.customer_id, - "X-source": "unstructured", - } - - response = requests.request( - method=http_method, url=url, headers=headers, params=params, data=json.dumps(data) - ) - response.raise_for_status() - return response.json() - - # Get Oauth2 JWT token - def _get_jwt_token(self): - """Connect to the server and get a JWT token.""" - token_endpoint = self.connector_config.token_url.format(self.connector_config.customer_id) - headers = { - "Content-Type": "application/x-www-form-urlencoded", - } - data = { - "grant_type": "client_credentials", - "client_id": self.connector_config.access_config.oauth_client_id, - "client_secret": self.connector_config.access_config.oauth_secret, - } - - response = requests.request(method="POST", url=token_endpoint, headers=headers, data=data) - response.raise_for_status() - response_json = response.json() - - request_time = datetime.datetime.now().timestamp() - self._jwt_token_expires_ts = request_time + response_json.get("expires_in") - - return response_json.get("access_token") - - @DestinationConnectionError.wrap - def check_connection(self): - try: - self.vectara() - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise DestinationConnectionError(f"failed to validate connection: {e}") - - def _delete_doc(self, doc_id: str) -> None: - """ - Delete a document from the Vectara corpus. - - Args: - url (str): URL of the page to delete. - doc_id (str): ID of the document to delete. 
- """ - body = { - "customer_id": self.connector_config.customer_id, - "corpus_id": self.connector_config.corpus_id, - "document_id": doc_id, - } - self._request(endpoint="delete-doc", data=body) - - def _index_document(self, document: t.Dict[str, t.Any]) -> None: - """ - Index a document (by uploading it to the Vectara corpus) from the document dictionary - """ - body = { - "customer_id": self.connector_config.customer_id, - "corpus_id": self.connector_config.corpus_id, - "document": document, - } - - try: - result = self._request(endpoint="index", data=body, http_method="POST") - except Exception as e: - logger.info(f"Exception {e} while indexing document {document['documentId']}") - return - - if ( - "status" in result - and result["status"] - and ( - "ALREADY_EXISTS" in result["status"]["code"] - or ( - "CONFLICT" in result["status"]["code"] - and "Indexing doesn't support updating documents" - in result["status"]["statusDetail"] - ) - ) - ): - logger.info(f"Document {document['documentId']} already exists, re-indexing") - self._delete_doc(document["documentId"]) - result = self._request(endpoint="index", data=body, http_method="POST") - return - - if "status" in result and result["status"] and "OK" in result["status"]["code"]: - logger.info(f"Indexing document {document['documentId']} succeeded") - else: - logger.info(f"Indexing document {document['documentId']} failed, response = {result}") - - def write_dict(self, *args, docs_list: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info(f"Inserting / updating {len(docs_list)} documents to Vectara ") - for vdoc in docs_list: - self._index_document(vdoc) - - def write(self, docs: t.List[BaseIngestDoc]) -> None: - docs_list: t.Dict[t.Dict[str, t.Any]] = [] - - def get_metadata(element) -> t.Dict[str, t.Any]: - """ - Select which meta-data fields to include and optionaly map them to a new new. 
- remove the "metadata-" prefix from the keys - """ - metadata_map = { - "page_number": "page_number", - "data_source-url": "url", - "filename": "filename", - "filetype": "filetype", - "last_modified": "last_modified", - } - md = flatten_dict(element, separator="-", flatten_lists=True) - md = {k.replace("metadata-", ""): v for k, v in md.items()} - md = {metadata_map[k]: v for k, v in md.items() if k in metadata_map} - return md - - for doc in docs: - local_path = doc._output_filename - with open(local_path) as json_file: - dict_content = json.load(json_file) - vdoc = { - "documentId": str(uuid.uuid4()), - "title": dict_content[0].get("metadata", {}).get("data_source", {}).get("url"), - "section": [ - { - "text": element.pop("text", None), - "metadataJson": json.dumps(get_metadata(element)), - } - for element in dict_content - ], - } - logger.info( - f"Extending {len(vdoc)} json elements from content in {local_path}", - ) - docs_list.append(vdoc) - self.write_dict(docs_list=docs_list) diff --git a/unstructured/ingest/connector/weaviate.py b/unstructured/ingest/connector/weaviate.py deleted file mode 100644 index 5039b2f99..000000000 --- a/unstructured/ingest/connector/weaviate.py +++ /dev/null @@ -1,187 +0,0 @@ -import copy -import json -import typing as t -from dataclasses import dataclass, field - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError -from unstructured.ingest.interfaces import ( - AccessConfig, - BaseConnectorConfig, - BaseDestinationConnector, - WriteConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from weaviate import Client - - -@dataclass -class WeaviateAccessConfig(AccessConfig): - access_token: t.Optional[str] = enhanced_field(default=None, sensitive=True) - refresh_token: t.Optional[str] = enhanced_field(default=None, sensitive=True) - api_key: t.Optional[str] = enhanced_field(default=None, sensitive=True) - client_secret: t.Optional[str] = enhanced_field(default=None, sensitive=True) - scope: t.Optional[t.List[str]] = None - username: t.Optional[str] = None - password: t.Optional[str] = enhanced_field(default=None, sensitive=True) - anonymous: bool = False - - -@dataclass -class SimpleWeaviateConfig(BaseConnectorConfig): - access_config: WeaviateAccessConfig - host_url: str - class_name: str - - -@dataclass -class WeaviateWriteConfig(WriteConfig): - batch_size: int = 100 - - -@dataclass -class WeaviateDestinationConnector(BaseDestinationConnector): - write_config: WeaviateWriteConfig - connector_config: SimpleWeaviateConfig - _client: t.Optional["Client"] = field(init=False, default=None) - - def to_dict(self, **kwargs): - """ - The _client variable in this dataclass breaks deepcopy due to: - TypeError: cannot pickle '_thread.lock' object - When serializing, remove it, meaning client data will need to be reinitialized - when deserialized - """ - self_cp = copy.copy(self) - if hasattr(self_cp, "_client"): - setattr(self_cp, "_client", None) - return _asdict(self_cp, **kwargs) - - @property - @requires_dependencies(["weaviate"], extras="weaviate") - def client(self) -> "Client": - if self._client is None: - from weaviate import Client - - auth = self._resolve_auth_method() - self._client = Client(url=self.connector_config.host_url, auth_client_secret=auth) - return self._client - - 
@requires_dependencies(["weaviate"], extras="weaviate") - @DestinationConnectionError.wrap - def initialize(self): - _ = self.client - - @requires_dependencies(["weaviate"], extras="weaviate") - def check_connection(self): - try: - _ = self.client - except Exception as e: - logger.error(f"Failed to validate connection {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def _resolve_auth_method(self): - access_configs = self.connector_config.access_config - if access_configs.anonymous: - return None - - if access_configs.access_token: - from weaviate.auth import AuthBearerToken - - return AuthBearerToken( - access_token=access_configs.access_token, - refresh_token=access_configs.refresh_token, - ) - elif access_configs.api_key: - from weaviate.auth import AuthApiKey - - return AuthApiKey(api_key=access_configs.api_key) - elif access_configs.client_secret: - from weaviate.auth import AuthClientCredentials - - return AuthClientCredentials( - client_secret=access_configs.client_secret, scope=access_configs.scope - ) - elif access_configs.username and access_configs.password: - from weaviate.auth import AuthClientPassword - - return AuthClientPassword( - username=access_configs.username, - password=access_configs.password, - scope=access_configs.scope, - ) - return None - - def conform_dict(self, data: dict) -> None: - """ - Updates the element dictionary to conform to the Weaviate schema - """ - from dateutil import parser - - # Dict as string formatting - if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"): - # Explicit casting otherwise fails schema type checking - data["metadata"]["data_source"]["record_locator"] = str(json.dumps(record_locator)) - - # Array of items as string formatting - if points := data.get("metadata", {}).get("coordinates", {}).get("points"): - data["metadata"]["coordinates"]["points"] = str(json.dumps(points)) - - if links := data.get("metadata", {}).get("links", {}): - data["metadata"]["links"] = str(json.dumps(links)) - - if permissions_data := ( - data.get("metadata", {}).get("data_source", {}).get("permissions_data") - ): - data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data) - - # Datetime formatting - if date_created := data.get("metadata", {}).get("data_source", {}).get("date_created"): - data["metadata"]["data_source"]["date_created"] = parser.parse(date_created).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - if date_modified := data.get("metadata", {}).get("data_source", {}).get("date_modified"): - data["metadata"]["data_source"]["date_modified"] = parser.parse(date_modified).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - if date_processed := data.get("metadata", {}).get("data_source", {}).get("date_processed"): - data["metadata"]["data_source"]["date_processed"] = parser.parse( - date_processed - ).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - if last_modified := data.get("metadata", {}).get("last_modified", {}): - data["metadata"]["last_modified"] = parser.parse(last_modified).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - # String casting - if version := data.get("metadata", {}).get("data_source", {}).get("version"): - data["metadata"]["data_source"]["version"] = str(version) - - if page_number := data.get("metadata", {}).get("page_number"): - data["metadata"]["page_number"] = str(page_number) - - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - logger.info( - f"writing {len(elements_dict)} objects to 
destination " - f"class {self.connector_config.class_name} " - f"at {self.connector_config.host_url}", - ) - - self.client.batch.configure(batch_size=self.write_config.batch_size) - with self.client.batch as b: - for e in elements_dict: - vector = e.pop("embeddings", None) - b.add_data_object( - e, - self.connector_config.class_name, - vector=vector, - ) diff --git a/unstructured/ingest/connector/wikipedia.py b/unstructured/ingest/connector/wikipedia.py deleted file mode 100644 index 239e4636c..000000000 --- a/unstructured/ingest/connector/wikipedia.py +++ /dev/null @@ -1,208 +0,0 @@ -import typing as t -from dataclasses import dataclass, field -from pathlib import Path - -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseSingleIngestDoc, - BaseSourceConnector, - IngestDocCleanupMixin, - SourceConnectorCleanupMixin, - SourceMetadata, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from wikipedia import WikipediaPage - - -@dataclass -class SimpleWikipediaConfig(BaseConnectorConfig): - page_title: str - auto_suggest: bool = False - - -@dataclass -class WikipediaIngestDoc(IngestDocCleanupMixin, BaseSingleIngestDoc): - connector_config: SimpleWikipediaConfig = field(repr=False) - - @property - @requires_dependencies(["wikipedia"], extras="wikipedia") - def page(self) -> "WikipediaPage": - import wikipedia - - return wikipedia.page( - self.connector_config.page_title, - auto_suggest=self.connector_config.auto_suggest, - ) - - def get_filename_prefix(self) -> str: - title: str = str(self.connector_config.page_title) - title = " ".join(title.split()).replace(" ", "-") - return title - - @property - def filename(self) -> Path: - raise NotImplementedError() - - @property - def text(self) -> str: - raise NotImplementedError() - - @property - def _output_filename(self): - raise NotImplementedError() - - @property - def date_created(self) -> t.Optional[str]: - return None - - @property - def date_modified(self) -> t.Optional[str]: - return None - - @property - def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: - return { - "page_title": self.connector_config.page_title, - "page_url": self.source_metadata.source_url, # type: ignore - } - - def _create_full_tmp_dir_path(self): - self.filename.parent.mkdir(parents=True, exist_ok=True) - - @requires_dependencies(["wikipedia"], extras="wikipedia") - def update_source_metadata(self): - from wikipedia.exceptions import PageError - - try: - page = self.page - except PageError: - self.source_metadata = SourceMetadata( - exists=False, - ) - return - - self.source_metadata = SourceMetadata( - version=page.revision_id, - source_url=page.url, - exists=True, - ) - - @SourceConnectionError.wrap - @BaseSingleIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - self._create_full_tmp_dir_path() - self.update_source_metadata() - with open(self.filename, "w", encoding="utf8") as f: - f.write(self.text) - - -@dataclass -class WikipediaIngestHTMLDoc(WikipediaIngestDoc): - registry_name: str = "wikipedia_html" - - @property - def filename(self) -> Path: - return ( - Path(self.read_config.download_dir) / f"{self.get_filename_prefix()}.html" - ).resolve() - - @property - def text(self): - return self._get_html() - - @SourceConnectionNetworkError.wrap - def _get_html(self): - return self.page.html() - 
- @property - def _output_filename(self): - return Path(self.processor_config.output_dir) / f"{self.get_filename_prefix()}-html.json" - - -@dataclass -class WikipediaIngestTextDoc(WikipediaIngestDoc): - registry_name: str = "wikipedia_text" - - @property - def filename(self) -> Path: - return (Path(self.read_config.download_dir) / f"{self.get_filename_prefix()}.txt").resolve() - - @property - def text(self): - return self._get_content() - - @SourceConnectionNetworkError.wrap - def _get_content(self): - return self.page.content - - @property - def _output_filename(self): - return Path(self.processor_config.output_dir) / f"{self.get_filename_prefix()}-txt.json" - - -@dataclass -class WikipediaIngestSummaryDoc(WikipediaIngestDoc): - registry_name: str = "wikipedia_summary" - - @property - def filename(self) -> Path: - return ( - Path(self.read_config.download_dir) / f"{self.get_filename_prefix()}-summary.txt" - ).resolve() - - @property - def text(self): - return self._get_summary() - - @SourceConnectionNetworkError.wrap - def _get_summary(self): - return self.page.summary - - @property - def _output_filename(self): - return Path(self.processor_config.output_dir) / f"{self.get_filename_prefix()}-summary.json" - - -@dataclass -class WikipediaSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): - connector_config: SimpleWikipediaConfig - - def initialize(self): - pass - - @requires_dependencies(["wikipedia"], extras="wikipedia") - def check_connection(self): - import wikipedia - - try: - wikipedia.page( - self.connector_config.page_title, - auto_suggest=self.connector_config.auto_suggest, - ) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def get_ingest_docs(self): - return [ - WikipediaIngestTextDoc( - processor_config=self.processor_config, - connector_config=self.connector_config, - read_config=self.read_config, - ), - WikipediaIngestHTMLDoc( - processor_config=self.processor_config, - connector_config=self.connector_config, - read_config=self.read_config, - ), - WikipediaIngestSummaryDoc( - processor_config=self.processor_config, - connector_config=self.connector_config, - read_config=self.read_config, - ), - ] diff --git a/unstructured/ingest/enhanced_dataclass/__init__.py b/unstructured/ingest/enhanced_dataclass/__init__.py deleted file mode 100644 index 38c598c4a..000000000 --- a/unstructured/ingest/enhanced_dataclass/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .dataclasses import enhanced_field -from .json_mixin import EnhancedDataClassJsonMixin - -__all__ = ["enhanced_field", "EnhancedDataClassJsonMixin"] diff --git a/unstructured/ingest/enhanced_dataclass/core.py b/unstructured/ingest/enhanced_dataclass/core.py deleted file mode 100644 index 8fd79af39..000000000 --- a/unstructured/ingest/enhanced_dataclass/core.py +++ /dev/null @@ -1,99 +0,0 @@ -import _thread -import copy -import functools -from dataclasses import fields - -from dataclasses_json.core import ( - Collection, - Enum, - Mapping, - _encode_overrides, - _handle_undefined_parameters_safe, - _user_overrides_or_exts, - is_dataclass, -) - - -def _recursive_repr(user_function): - # Copied from dataclasses as this method isn't exposed for importing - repr_running = set() - - @functools.wraps(user_function) - def wrapper(self): - key = id(self), _thread.get_ident() - if key in repr_running: - return "..." 
- repr_running.add(key) - try: - result = user_function(self) - finally: - repr_running.discard(key) - return result - - return wrapper - - -def _asdict( - obj, - encode_json=False, - redact_sensitive=False, - redacted_text="***REDACTED***", - apply_name_overload: bool = True, -): - """ - A re-implementation of `asdict` (based on the original in the `dataclasses` - source) to support arbitrary Collection and Mapping types. - """ - if is_dataclass(obj): - result = [] - overrides = _user_overrides_or_exts(obj) - for field in fields(obj): - if overrides[field.name].encoder: - value = getattr(obj, field.name) - else: - value = _asdict( - getattr(obj, field.name), - encode_json=encode_json, - redact_sensitive=redact_sensitive, - redacted_text=redacted_text, - apply_name_overload=apply_name_overload, - ) - if getattr(field, "sensitive", False) and redact_sensitive and value: - value = redacted_text - if getattr(field, "overload_name", None) and apply_name_overload: - overload_name = getattr(field, "overload_name") - result.append((overload_name, value)) - else: - result.append((field.name, value)) - - result = _handle_undefined_parameters_safe(cls=obj, kvs=dict(result), usage="to") - return _encode_overrides( - dict(result), _user_overrides_or_exts(obj), encode_json=encode_json - ) - elif isinstance(obj, Mapping): - return { - _asdict( - k, - encode_json=encode_json, - redact_sensitive=redact_sensitive, - redacted_text=redacted_text, - ): _asdict( - v, - encode_json=encode_json, - redact_sensitive=redact_sensitive, - redacted_text=redacted_text, - ) - for k, v in obj.items() - } - elif isinstance(obj, Collection) and not isinstance(obj, (str, bytes, Enum)): - return [ - _asdict( - v, - encode_json=encode_json, - redact_sensitive=redact_sensitive, - redacted_text=redacted_text, - ) - for v in obj - ] - else: - return copy.deepcopy(obj) diff --git a/unstructured/ingest/enhanced_dataclass/dataclasses.py b/unstructured/ingest/enhanced_dataclass/dataclasses.py deleted file mode 100644 index a58fb3b79..000000000 --- a/unstructured/ingest/enhanced_dataclass/dataclasses.py +++ /dev/null @@ -1,54 +0,0 @@ -import typing as t -from dataclasses import MISSING, Field - -from unstructured.ingest.enhanced_dataclass.core import _recursive_repr - - -class EnhancedField(Field): - def __init__(self, *args, sensitive=False, overload_name: t.Optional[str] = None): - super().__init__(*args) - self.sensitive = sensitive - self.overload_name = overload_name - - @_recursive_repr - def __repr__(self): - # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map - fields_array = [ - f"name={self.name!r}", - f"type={self.type!r}", - f"default={self.default!r}", - f"default_factory={self.default_factory!r}", - f"init={self.init!r}", - f"repr={self.repr!r}", - f"hash={self.hash!r}", - f"compare={self.compare!r}", - f"metadata={self.metadata!r}", - f"sensitive={self.sensitive!r}", - f"overload_name={self.overload_name!r}", - f"_field_type={self._field_type}", - ] - if kw_only := getattr(self, "kw_only", None): - fields_array.append(f"kw_only={kw_only!r}") - return "Field({})".format(",".join(fields_array)) - - -def enhanced_field( - *, - default=MISSING, - default_factory=MISSING, - init: bool = True, - repr: bool = True, - hash=None, - compare: bool = True, - metadata=None, - kw_only=MISSING, - sensitive: bool = False, - overload_name: t.Optional[str] = None, -): - if default is not MISSING and default_factory is not MISSING: - raise ValueError("cannot specify both default and default_factory") - 
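A simplified stand-in for the sensitive-field redaction `_asdict` performs above: here the flag lives in standard dataclass field metadata rather than the removed `EnhancedField` subclass, but the redaction decision is the same:

```python
from dataclasses import dataclass, field, fields

REDACTED = "***REDACTED***"


@dataclass
class AccessConfig:
    username: str
    password: str = field(metadata={"sensitive": True})


def to_redacted_dict(obj) -> dict:
    out = {}
    for f in fields(obj):
        value = getattr(obj, f.name)
        # Only replace truthy values, mirroring the `and value` guard in _asdict.
        if f.metadata.get("sensitive") and value:
            value = REDACTED
        out[f.name] = value
    return out


print(to_redacted_dict(AccessConfig(username="svc-user", password="hunter2")))
# {'username': 'svc-user', 'password': '***REDACTED***'}
```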
args = [default, default_factory, init, repr, hash, compare, metadata] - # Support for kw_only added in 3.10, to support as low as 3.8, need to dynamically map - if "kw_only" in EnhancedField.__slots__: - args.append(kw_only) - return EnhancedField(*args, sensitive=sensitive, overload_name=overload_name) diff --git a/unstructured/ingest/enhanced_dataclass/json_mixin.py b/unstructured/ingest/enhanced_dataclass/json_mixin.py deleted file mode 100644 index 04f365a6b..000000000 --- a/unstructured/ingest/enhanced_dataclass/json_mixin.py +++ /dev/null @@ -1,125 +0,0 @@ -from __future__ import annotations - -import json -from dataclasses import InitVar, fields -from typing import Any, Callable, Optional, Type, TypeVar, Union - -import dataclasses_json.core as dataclasses_json_core -from dataclasses_json import DataClassJsonMixin - -from unstructured.ingest.enhanced_dataclass.core import _asdict - -A = TypeVar("A", bound="EnhancedDataClassJsonMixin") - -# Monkey-patch _decode_dataclass class to support name override -og_decode_dataclass = dataclasses_json_core._decode_dataclass - - -def custom_decode_dataclass(cls, kvs, infer_missing): - dataclass_fields = fields(cls) - for f in [ - field - for field in dataclass_fields - if hasattr(field, "overload_name") and getattr(field, "overload_name", None) - ]: - field_name = f.name - overload_name = getattr(f, "overload_name") - if isinstance(kvs, dict) and overload_name in kvs: - kvs[field_name] = kvs.pop(overload_name) - return og_decode_dataclass(cls, kvs, infer_missing) - - -dataclasses_json_core._decode_dataclass = custom_decode_dataclass - - -class EnhancedDataClassJsonMixin(DataClassJsonMixin): - """A mixin class extending DataClassJsonMixin. - - This class extends the functionality of DataClassJsonMixin to provide enhanced functionality - for JSON serialization and deserialization. It introduces options for redacting sensitive - information, custom encoding, and more advanced schema handling. - - Attributes: - N/A (No additional attributes) - - Methods: - to_json: Serialize the object to JSON format with customizable options. - from_dict: Deserialize a dictionary into an object of this class. - to_dict: Convert the object to a dictionary with customizable options. - schema: Generate a schema for validating and parsing JSON data based on this class. 
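The `custom_decode_dataclass` monkey-patch above exists to honor `overload_name` on the way back in from a dict. A standalone illustration of just the key remapping step (the field and key names here are hypothetical):

```python
def remap_overloaded_keys(kvs: dict, overloads: dict) -> dict:
    # overloads maps field_name -> overload_name; incoming dicts may use either key.
    out = dict(kvs)
    for field_name, overload_name in overloads.items():
        if overload_name in out:
            out[field_name] = out.pop(overload_name)
    return out


print(remap_overloaded_keys({"api-key": "abc", "host": "h"}, {"api_key": "api-key"}))
# {'host': 'h', 'api_key': 'abc'}
```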
- """ - - @classmethod - def check_init_var(cls): - ann = cls.__dict__.get("__annotations__", {}) - init_vars = {k: v for k, v in ann.items() if isinstance(v, InitVar)} - if init_vars: - raise TypeError( - "Class {} has the following fields defined with an InitVar which " - "cannot be used with EnhancedDataClassJsonMixin: {}".format( - cls.__name__, ", ".join(init_vars.keys()) - ) - ) - - def to_json( - self, - *, - skipkeys: bool = False, - ensure_ascii: bool = True, - check_circular: bool = True, - allow_nan: bool = True, - indent: Optional[Union[int, str]] = None, - separators: Optional[tuple[str, str]] = None, - default: Optional[Callable[..., Any]] = None, - sort_keys: bool = False, - redact_sensitive: bool = False, - redacted_text: str = "***REDACTED***", - apply_name_overload: bool = True, - **kw: Any, - ) -> str: - self.check_init_var() - return json.dumps( - self.to_dict( - encode_json=False, - redact_sensitive=redact_sensitive, - redacted_text=redacted_text, - apply_name_overload=apply_name_overload, - ), - cls=dataclasses_json_core._ExtendedEncoder, - skipkeys=skipkeys, - ensure_ascii=ensure_ascii, - check_circular=check_circular, - allow_nan=allow_nan, - indent=indent, - separators=separators, - default=default, - sort_keys=sort_keys, - **kw, - ) - - @classmethod - def from_dict( - cls: Type[A], - kvs: dataclasses_json_core.Json, - *, - infer_missing=False, - apply_name_overload=False, - ) -> A: - cls.check_init_var() - return dataclasses_json_core._decode_dataclass(cls, kvs, infer_missing) - - def to_dict( - self, - encode_json: bool = False, - redact_sensitive: bool = False, - redacted_text: str = "***REDACTED***", - apply_name_overload: bool = True, - ) -> dict[str, dataclasses_json_core.Json]: - self.check_init_var() - return _asdict( - self, - encode_json=encode_json, - redact_sensitive=redact_sensitive, - redacted_text=redacted_text, - apply_name_overload=apply_name_overload, - ) diff --git a/unstructured/ingest/error.py b/unstructured/ingest/error.py deleted file mode 100644 index 8397caf6d..000000000 --- a/unstructured/ingest/error.py +++ /dev/null @@ -1,49 +0,0 @@ -from abc import ABC -from functools import wraps - - -class CustomError(Exception, ABC): - error_string: str - - @classmethod - def wrap(cls, f): - """ - Provides a wrapper for a function that catches any exception and - re-raises it as the customer error. If the exception itself is already an instance - of the custom error, re-raises original error. 
- """ - - @wraps(f) - def wrapper(*args, **kwargs): - try: - return f(*args, **kwargs) - except BaseException as error: - if not isinstance(error, cls) and not issubclass(type(error), cls): - raise cls(cls.error_string.format(str(error))) from error - raise - - return wrapper - - -class SourceConnectionError(CustomError): - error_string = "Error in getting data from upstream data source: {}" - - -class SourceConnectionNetworkError(SourceConnectionError): - error_string = "Error in connecting to upstream data source: {}" - - -class DestinationConnectionError(CustomError): - error_string = "Error in connecting to downstream data source: {}" - - -class EmbeddingEncoderConnectionError(CustomError): - error_string = "Error in connecting to the embedding model provider: {}" - - -class WriteError(CustomError): - error_string = "Error in writing to downstream data source: {}" - - -class PartitionError(CustomError): - error_string = "Error in partitioning content: {}" diff --git a/unstructured/ingest/evaluate.py b/unstructured/ingest/evaluate.py deleted file mode 100755 index c6446ac9d..000000000 --- a/unstructured/ingest/evaluate.py +++ /dev/null @@ -1,349 +0,0 @@ -#! /usr/bin/env python3 - -from typing import List, Optional, Tuple, Union - -import click - -from unstructured.metrics.evaluate import ( - ElementTypeMetricsCalculator, - ObjectDetectionAggregatedMetricsCalculator, - ObjectDetectionPerClassMetricsCalculator, - TableStructureMetricsCalculator, - TextExtractionMetricsCalculator, - filter_metrics, - get_mean_grouping, -) - - -@click.group() -def main(): - pass - - -@main.command() -@click.option("--output_dir", type=str, help="Directory to structured output.") -@click.option("--source_dir", type=str, help="Directory to source.") -@click.option( - "--output_list", - type=str, - multiple=True, - help="Optional: list of selected structured output file names under the \ - directory to be evaluate. If none, all files under directory will be use.", -) -@click.option( - "--source_list", - type=str, - multiple=True, - help="Optional: list of selected source file names under the directory \ - to be evaluate. If none, all files under directory will be use.", -) -@click.option( - "--export_dir", - type=str, - default="metrics", - help="Directory to save the output evaluation metrics to. Default to \ - your/working/dir/metrics/", -) -@click.option("--group_by", type=str, help="Input field for aggregration, or leave blank if none.") -@click.option( - "--weights", - type=(int, int, int), - default=(2, 1, 1), - show_default=True, - help="A list of weights to the Levenshtein distance calculation. 
Takes input as --weights 2 2 2\ - See text_extraction.py/calculate_edit_distance for more details.", -) -@click.option( - "--visualize", - is_flag=True, - show_default=True, - default=False, - help="Add the flag to show progress bar.", -) -@click.option( - "--output_type", - type=str, - default="json", - show_default=True, - help="Takes in either `txt` or `json` as output_type.", -) -def measure_text_extraction_accuracy_command( - output_dir: str, - source_dir: str, - export_dir: str, - weights: Tuple[int, int, int], - visualize: bool, - output_type: str, - output_list: Optional[List[str]] = None, - source_list: Optional[List[str]] = None, - group_by: Optional[str] = None, -): - return ( - TextExtractionMetricsCalculator( - documents_dir=output_dir, - ground_truths_dir=source_dir, - group_by=group_by, - weights=weights, - document_type=output_type, - ) - .on_files(document_paths=output_list, ground_truth_paths=source_list) - .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True) - ) - - -@main.command() -@click.option("--output_dir", type=str, help="Directory to structured output.") -@click.option("--source_dir", type=str, help="Directory to structured source.") -@click.option( - "--output_list", - type=str, - multiple=True, - help="Optional: list of selected structured output file names under the \ - directory to be evaluate. If none, all files under directory will be used.", -) -@click.option( - "--source_list", - type=str, - multiple=True, - help="Optional: list of selected source file names under the directory \ - to be evaluate. If none, all files under directory will be used.", -) -@click.option( - "--export_dir", - type=str, - default="metrics", - help="Directory to save the output evaluation metrics to. Default to \ - your/working/dir/metrics/", -) -@click.option( - "--visualize", - is_flag=True, - show_default=True, - default=False, - help="Add the flag to show progress bar.", -) -def measure_element_type_accuracy_command( - output_dir: str, - source_dir: str, - export_dir: str, - visualize: bool, - output_list: Optional[List[str]] = None, - source_list: Optional[List[str]] = None, -): - return ( - ElementTypeMetricsCalculator( - documents_dir=output_dir, - ground_truths_dir=source_dir, - ) - .on_files(document_paths=output_list, ground_truth_paths=source_list) - .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True) - ) - - -@main.command() -@click.option( - "--group_by", - type=str, - required=True, - help="The category to group by; valid values are 'doctype' and 'connector'.", -) -@click.option( - "--data_input", - type=str, - required=True, - help="A datafram or path to the CSV/TSV file containing the data", -) -@click.option( - "--export_dir", - type=str, - default="metrics", - help="Directory to save the output evaluation metrics to. Default to \ - your/working/dir/metrics/", -) -@click.option( - "--eval_name", - type=str, - help="Evaluated metric. Expecting one of 'text_extraction' or 'element_type'", -) -@click.option( - "--agg_name", - type=str, - help="String to use with export filename. Default is `cct` for `text_extraction` \ - and `element-type` for `element_type`", -) -@click.option( - "--export_filename", type=str, help="Optional. Define your file name for the output here." 
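The same evaluation can be driven from Python rather than this CLI wrapper, since the command simply delegates to `TextExtractionMetricsCalculator`. A sketch, assuming `unstructured` is installed and the two directories exist (the paths are placeholders; passing `None` to `on_files` evaluates every file, as the help text above states):

```python
from unstructured.metrics.evaluate import TextExtractionMetricsCalculator

result = (
    TextExtractionMetricsCalculator(
        documents_dir="structured-output/",  # placeholder path
        ground_truths_dir="gold-standard/",  # placeholder path
        group_by="doctype",
        weights=(2, 1, 1),
        document_type="json",
    )
    .on_files(document_paths=None, ground_truth_paths=None)
    .calculate(export_dir="metrics", visualize_progress=False, display_agg_df=True)
)
```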
-) -def get_mean_grouping_command( - group_by: str, - data_input: str, - export_dir: str, - eval_name: str, - agg_name: Optional[str] = None, - export_filename: Optional[str] = None, -): - return get_mean_grouping( - group_by=group_by, - data_input=data_input, - export_dir=export_dir, - eval_name=eval_name, - agg_name=agg_name, - export_filename=export_filename, - ) - - -@main.command() -@click.option("--output_dir", type=str, help="Directory to structured output.") -@click.option("--source_dir", type=str, help="Directory to structured source.") -@click.option( - "--output_list", - type=str, - multiple=True, - help="Optional: list of selected structured output file names under the \ - directory to be evaluate. If none, all files under directory will be used.", -) -@click.option( - "--source_list", - type=str, - multiple=True, - help="Optional: list of selected source file names under the directory \ - to be evaluate. If none, all files under directory will be used.", -) -@click.option( - "--export_dir", - type=str, - default="metrics", - help="Directory to save the output evaluation metrics to. Default to \ - your/working/dir/metrics/", -) -@click.option( - "--visualize", - is_flag=True, - show_default=True, - default=False, - help="Add the flag to show progress bar.", -) -@click.option( - "--cutoff", - type=float, - show_default=True, - default=0.8, - help="The cutoff value for the element level alignment. \ - If not set, a default value is used", -) -def measure_table_structure_accuracy_command( - output_dir: str, - source_dir: str, - export_dir: str, - visualize: bool, - output_list: Optional[List[str]] = None, - source_list: Optional[List[str]] = None, - cutoff: Optional[float] = None, -): - return ( - TableStructureMetricsCalculator( - documents_dir=output_dir, - ground_truths_dir=source_dir, - cutoff=cutoff, - ) - .on_files(document_paths=output_list, ground_truth_paths=source_list) - .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True) - ) - - -@main.command() -@click.option("--output_dir", type=str, help="Directory to structured output.") -@click.option("--source_dir", type=str, help="Directory to structured source.") -@click.option( - "--output_list", - type=str, - multiple=True, - help=( - "Optional: list of selected structured output file names under the " - "directory to be evaluated. If none, all files under directory will be used." - ), -) -@click.option( - "--source_list", - type=str, - multiple=True, - help="Optional: list of selected source file names under the directory \ - to be evaluate. If none, all files under directory will be used.", -) -@click.option( - "--export_dir", - type=str, - default="metrics", - help="Directory to save the output evaluation metrics to. 
Default to \ - your/working/dir/metrics/", -) -@click.option( - "--visualize", - is_flag=True, - show_default=True, - default=False, - help="Add the flag to show progress bar.", -) -def measure_object_detection_metrics_command( - output_dir: str, - source_dir: str, - export_dir: str, - visualize: bool, - output_list: Optional[List[str]] = None, - source_list: Optional[List[str]] = None, -): - aggregated_df = ( - ObjectDetectionAggregatedMetricsCalculator( - documents_dir=output_dir, - ground_truths_dir=source_dir, - ) - .on_files(document_paths=output_list, ground_truth_paths=source_list) - .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True) - ) - per_class_df = ( - ObjectDetectionPerClassMetricsCalculator( - documents_dir=output_dir, - ground_truths_dir=source_dir, - ) - .on_files(document_paths=output_list, ground_truth_paths=source_list) - .calculate(export_dir=export_dir, visualize_progress=visualize, display_agg_df=True) - ) - return aggregated_df, per_class_df - - -@main.command() -@click.option( - "--data_input", type=str, required=True, help="Takes in path to data file as .tsv .csv .txt" -) -@click.option( - "--filter_list", - type=str, - required=True, - help="Takes in list of string to filter the data_input.", -) -@click.option( - "--filter_by", - type=str, - required=True, - help="Field from data_input to match with filter_list. Default is `filename`.", -) -@click.option( - "--export_filename", type=str, help="Export filename. Required when return_type is `file`" -) -@click.option("--export_dir", type=str, help="Export directory.") -@click.option("--return_type", type=str, help="`dataframe` or `file`. Default is `file`.") -def filter_metrics_command( - data_input: str, - filter_list: Union[str, List[str]], - filter_by: str = "filename", - export_filename: Optional[str] = None, - export_dir: str = "metrics", - return_type: str = "file", -): - return filter_metrics( - data_input, filter_list, filter_by, export_filename, export_dir, return_type - ) - - -if __name__ == "__main__": - main() diff --git a/unstructured/ingest/img/unstructured_ingest_cli_pipeline_diagram.png b/unstructured/ingest/img/unstructured_ingest_cli_pipeline_diagram.png deleted file mode 100644 index cf2c94f47..000000000 Binary files a/unstructured/ingest/img/unstructured_ingest_cli_pipeline_diagram.png and /dev/null differ diff --git a/unstructured/ingest/ingest_backoff/__init__.py b/unstructured/ingest/ingest_backoff/__init__.py deleted file mode 100644 index 81d08bf36..000000000 --- a/unstructured/ingest/ingest_backoff/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from ._wrapper import RetryHandler - -__all__ = ["RetryHandler"] diff --git a/unstructured/ingest/ingest_backoff/_common.py b/unstructured/ingest/ingest_backoff/_common.py deleted file mode 100644 index 5b1f87759..000000000 --- a/unstructured/ingest/ingest_backoff/_common.py +++ /dev/null @@ -1,102 +0,0 @@ -import logging -import sys -import traceback - - -# Default startup handler -def _log_start(details, logger, log_level): - max_tried = details.get("max_tries") - max_time = details.get("max_time") - if max_tried is not None and max_time is not None: - s = "%.1fs or %d tries" - s_args = [max_time, max_tried] - elif max_tried is not None: - s = "%d tries" - s_args = [max_tried] - else: - s = "%.1fs" - s_args = [max_time] - exception = details.get("exception") - if isinstance(exception, tuple): - exception = list(exception) - elif not isinstance(exception, list): - exception = [exception] - exception_s = ", 
".join([e.__name__ for e in exception]) - if log_level >= logging.INFO: - msg = f"Attempting %s(...), will retry for {s} given these issues: %s" - log_args = [details["target"].__name__] + s_args + [exception_s] - else: - msg = f"Attempting %s(%s), will retry for {s} given these issues: %s" - target_input_list = [] - if args := details.get("args"): - target_input_list.extend([str(d) for d in args]) - if kwargs := details.get("kwargs"): - target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()]) - target_input = ", ".join(target_input_list) if target_input_list else "" - log_args = ( - [ - details["target"].__name__, - target_input, - ] - + s_args - + [exception_s] - ) - logger.log(log_level, msg, *log_args) - - -# Default backoff handler -def _log_backoff(details, logger, log_level): - if log_level >= logging.INFO: - msg = "Backing off %s(...) for %.1fs (%s)" - log_args = [details["target"].__name__, details["tries"]] - else: - msg = "Backing off %.1fs seconds after %d tries calling function %s(%s) -> %s" - target_input_list = [] - if args := details.get("args"): - target_input_list.extend([str(d) for d in args]) - if kwargs := details.get("kwargs"): - target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()]) - target_input = ", ".join(target_input_list) if target_input_list else "" - log_args = [ - details["wait"], - details["tries"], - details["target"].__name__, - target_input, - ] - exc_typ, exc, _ = sys.exc_info() - if exc is not None: - exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1] - log_args.append(exc_fmt.rstrip("\n")) - else: - log_args.append(str(details["value"])) - logger.log(log_level, msg, *log_args) - - -# Default giveup handler -def _log_giveup(details, logger, log_level): - if log_level >= logging.INFO: - msg = "Giving up %s(...) after %.1fs (%s)" - log_args = [details["target"].__name__, details["tries"]] - else: - msg = "Giving up after %d tries (%.1fs) calling function %s(%s) -> %s" - target_input_list = [] - if args := details.get("args"): - target_input_list.extend([str(d) for d in args]) - if kwargs := details.get("kwargs"): - target_input_list.extend([f"{k}={str(v)}" for k, v in kwargs.items()]) - target_input = ", ".join(target_input_list) if target_input_list else "..." 
- log_args = [ - details["tries"], - details["wait"], - details["target"].__name__, - target_input, - ] - - exc_typ, exc, _ = sys.exc_info() - if exc is not None: - exc_fmt = traceback.format_exception_only(exc_typ, exc)[-1] - log_args.append(exc_fmt.rstrip("\n")) - else: - log_args.append(details["value"]) - - logger.log(log_level, msg, *log_args) diff --git a/unstructured/ingest/ingest_backoff/_wrapper.py b/unstructured/ingest/ingest_backoff/_wrapper.py deleted file mode 100644 index 66e9d193a..000000000 --- a/unstructured/ingest/ingest_backoff/_wrapper.py +++ /dev/null @@ -1,122 +0,0 @@ -# coding:utf-8 -import logging -from collections.abc import Iterable as IterableType -from typing import Any, Iterable, Optional, Type, Union - -from backoff import _sync -from backoff._common import _config_handlers, _prepare_logger -from backoff._jitter import full_jitter -from backoff._typing import ( - _Handler, - _Jitterer, - _MaybeCallable, - _MaybeLogger, - _MaybeSequence, - _Predicate, - _WaitGenerator, -) - -from unstructured.ingest.ingest_backoff._common import _log_backoff, _log_giveup, _log_start - - -class RetryHandler: - def __init__( - self, - wait_gen: _WaitGenerator, - exception: _MaybeSequence[Type[Exception]], - *, - max_tries: Optional[_MaybeCallable[int]] = None, - max_time: Optional[_MaybeCallable[float]] = None, - jitter: Union[_Jitterer, None] = full_jitter, - giveup: _Predicate[Exception] = lambda e: False, - on_start: Union[_Handler, Iterable[_Handler], None] = None, - on_success: Union[_Handler, Iterable[_Handler], None] = None, - on_backoff: Union[_Handler, Iterable[_Handler], None] = None, - on_giveup: Union[_Handler, Iterable[_Handler], None] = None, - raise_on_giveup: bool = True, - logger: _MaybeLogger = "backoff", - start_log_level: int = logging.INFO, - backoff_log_level: int = logging.INFO, - giveup_log_level: int = logging.ERROR, - **wait_gen_kwargs: Any, - ): - prepared_logger = _prepare_logger(logger) - on_success = _config_handlers(on_success) - on_start = _config_handlers( - on_start, - default_handler=_log_start, - logger=prepared_logger, - log_level=start_log_level, - ) - on_backoff = _config_handlers( - on_backoff, - default_handler=_log_backoff, - logger=prepared_logger, - log_level=backoff_log_level, - ) - on_giveup = _config_handlers( - on_giveup, - default_handler=_log_giveup, - logger=prepared_logger, - log_level=giveup_log_level, - ) - prepared_logger.debug( - "Initiating retry handler with " - "max_tries={}, " - "max_time={}, " - "exception={}, " - "start_log_level={}, " - "backoff_log_level={}, " - "giveup_log_level={}".format( - max_tries, - max_time, - ( - ", ".join([e.__name__ for e in exception]) - if isinstance(exception, IterableType) - else exception.__name__ - ), - logging.getLevelName(start_log_level), - logging.getLevelName(backoff_log_level), - logging.getLevelName(giveup_log_level), - ), - ) - self.on_start = on_start - self.on_success = on_success - self.on_backoff = on_backoff - self.on_giveup = on_giveup - self.jitter = jitter - self.giveup = giveup - self.raise_on_giveup = raise_on_giveup - self.wait_gen_kwargs = wait_gen_kwargs - self.wait_gen = wait_gen - self.exception = exception - self.max_tries = max_tries - self.max_time = max_time - - def __call__(self, target, *args, **kwargs): - _sync._call_handlers( - self.on_start, - target=target, - args=args, - kwargs=kwargs, - tries=None, - elapsed=None, - max_tries=self.max_tries, - max_time=self.max_time, - exception=self.exception, - ) - wrapped_func = _sync.retry_exception( - target, - 
self.wait_gen, - self.exception, - max_tries=self.max_tries, - max_time=self.max_time, - jitter=self.jitter, - giveup=self.giveup, - on_success=self.on_success, - on_backoff=self.on_backoff, - on_giveup=self.on_giveup, - raise_on_giveup=self.raise_on_giveup, - wait_gen_kwargs=self.wait_gen_kwargs, - ) - return wrapped_func(*args, **kwargs) diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py deleted file mode 100644 index 95edd13b1..000000000 --- a/unstructured/ingest/interfaces.py +++ /dev/null @@ -1,845 +0,0 @@ -"""Defines Abstract Base Classes (ABC's) core to batch processing documents -through Unstructured.""" - -from __future__ import annotations - -import functools -import json -import os -import re -from abc import ABC, abstractmethod -from dataclasses import InitVar, dataclass, field -from datetime import datetime -from pathlib import Path -from typing import Any, Optional, Type, TypeVar - -from dataclasses_json import DataClassJsonMixin -from dataclasses_json.core import Json, _decode_dataclass - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field -from unstructured.ingest.enhanced_dataclass.core import _asdict -from unstructured.ingest.error import PartitionError, SourceConnectionError -from unstructured.ingest.logger import logger -from unstructured.partition.api import partition_via_api -from unstructured.staging.base import elements_to_dicts, flatten_dict - -A = TypeVar("A", bound="DataClassJsonMixin") - -# -- Needed to resolve TypeError raised by using InitVar and __future__.annotations -# -- See more here: https://stackoverflow.com/questions/70400639/ -InitVar.__call__ = lambda *args: None # type: ignore - -SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [ - "s3", - "s3a", - "abfs", - "az", - "gs", - "gcs", - "box", - "dropbox", - "sftp", -] - - -@dataclass -class BaseSessionHandle(ABC): - """Abstract Base Class for sharing resources that are local to an individual process. - e.g., a connection for making a request for fetching documents.""" - - -@dataclass -class BaseConfig(EnhancedDataClassJsonMixin, ABC): - pass - - -@dataclass -class AccessConfig(BaseConfig): - """Meant to designate holding any sensitive information associated with other configs - and also for access specific configs.""" - - -@dataclass -class RetryStrategyConfig(BaseConfig): - """ - Contains all info needed for decorator to pull from `self` for backoff - and retry triggered by exception. - - Args: - max_retries: The maximum number of attempts to make before giving - up. Once exhausted, the exception will be allowed to escape. - The default value of None means there is no limit to the - number of tries. If a callable is passed, it will be - evaluated at runtime and its return value used. - max_retry_time: The maximum total amount of time to try for before - giving up. Once expired, the exception will be allowed to - escape. If a callable is passed, it will be - evaluated at runtime and its return value used. 
- """ - - max_retries: Optional[int] = None - max_retry_time: Optional[float] = None - - -@dataclass -class PartitionConfig(BaseConfig): - # where to write structured data outputs - pdf_infer_table_structure: bool = False - strategy: str = "auto" - ocr_languages: Optional[list[str]] = None - encoding: Optional[str] = None - additional_partition_args: dict[str, Any] = field(default_factory=dict) - skip_infer_table_types: Optional[list[str]] = None - fields_include: list[str] = field( - default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"], - ) - flatten_metadata: bool = False - metadata_exclude: list[str] = field(default_factory=list) - metadata_include: list[str] = field(default_factory=list) - partition_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general" - partition_by_api: bool = False - api_key: Optional[str] = str(enhanced_field(default=None, sensitive=True)) or None - hi_res_model_name: Optional[str] = None - - -@dataclass -class ProcessorConfig(BaseConfig): - reprocess: bool = False - verbose: bool = False - work_dir: str = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve()) - output_dir: str = "structured-output" - num_processes: int = 2 - raise_on_error: bool = False - - -@dataclass -class FileStorageConfig(BaseConfig): - remote_url: str - uncompress: bool = False - recursive: bool = False - file_glob: Optional[list[str]] = None - - -@dataclass -class FsspecConfig(FileStorageConfig): - access_config: Optional[AccessConfig] = None - protocol: str = field(init=False) - path_without_protocol: str = field(init=False) - dir_path: str = field(init=False) - file_path: str = field(init=False) - - def get_access_config(self) -> dict[str, Any]: - if self.access_config: - return self.access_config.to_dict(apply_name_overload=False) - else: - return {} - - def __post_init__(self): - self.protocol, self.path_without_protocol = self.remote_url.split("://") - if self.protocol not in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS: - raise ValueError( - f"Protocol {self.protocol} not supported yet, only " - f"{SUPPORTED_REMOTE_FSSPEC_PROTOCOLS} are supported.", - ) - - # dropbox root is an empty string - match = re.match(rf"{self.protocol}://([\s])/", self.remote_url) - if match and self.protocol == "dropbox": - self.dir_path = " " - self.file_path = "" - return - - # dropbox paths can start with slash - match = re.match(rf"{self.protocol}:///([^/\s]+?)/([^\s]*)", self.remote_url) - if match and self.protocol == "dropbox": - self.dir_path = match.group(1) - self.file_path = match.group(2) or "" - return - - # just a path with no trailing prefix - match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.remote_url) - if match: - self.dir_path = match.group(1) - self.file_path = "" - return - - # valid path with a dir and/or file - match = re.match(rf"{self.protocol}://([^/\s]+?)/([^\s]*)", self.remote_url) - if not match: - raise ValueError( - f"Invalid path {self.remote_url}. 
" - f"Expected :///.", - ) - self.dir_path = match.group(1) - self.file_path = match.group(2) or "" - - -@dataclass -class ReadConfig(BaseConfig): - # where raw documents are stored for processing, and then removed if not preserve_downloads - download_dir: Optional[str] = "" - re_download: bool = False - preserve_downloads: bool = False - download_only: bool = False - max_docs: Optional[int] = None - - -@dataclass -class EmbeddingConfig(BaseConfig): - provider: str - api_key: Optional[str] = str(enhanced_field(default=None, sensitive=True)) or None - model_name: Optional[str] = None - aws_access_key_id: Optional[str] = None - aws_secret_access_key: Optional[str] = None - aws_region: Optional[str] = None - - def get_embedder(self) -> BaseEmbeddingEncoder: - kwargs: dict[str, Any] = {} - if self.api_key: - kwargs["api_key"] = self.api_key - if self.model_name: - kwargs["model_name"] = self.model_name - # TODO make this more dynamic to map to encoder configs - if self.provider == "langchain-openai": - from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder - - return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs)) - elif self.provider == "langchain-huggingface": - from unstructured.embed.huggingface import ( - HuggingFaceEmbeddingConfig, - HuggingFaceEmbeddingEncoder, - ) - - return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs)) - elif self.provider == "octoai": - from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder - - return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs)) - elif self.provider == "langchain-aws-bedrock": - from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder - - return BedrockEmbeddingEncoder( - config=BedrockEmbeddingConfig( - aws_access_key_id=self.aws_access_key_id, - aws_secret_access_key=self.aws_secret_access_key, - region_name=self.aws_region, - ) - ) - elif self.provider == "langchain-vertexai": - from unstructured.embed.vertexai import ( - VertexAIEmbeddingConfig, - VertexAIEmbeddingEncoder, - ) - - return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs)) - elif self.provider == "langchain-voyageai": - from unstructured.embed.voyageai import ( - VoyageAIEmbeddingConfig, - VoyageAIEmbeddingEncoder, - ) - - return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**kwargs)) - elif self.provider == "mixedbread-ai": - from unstructured.embed.mixedbreadai import ( - MixedbreadAIEmbeddingConfig, - MixedbreadAIEmbeddingEncoder, - ) - - return MixedbreadAIEmbeddingEncoder(config=MixedbreadAIEmbeddingConfig(**kwargs)) - else: - raise ValueError(f"{self.provider} not a recognized encoder") - - -@dataclass -class ChunkingConfig(BaseConfig): - chunk_elements: InitVar[bool] = False - chunking_strategy: Optional[str] = None - combine_text_under_n_chars: Optional[int] = None - include_orig_elements: Optional[bool] = None - max_characters: Optional[int] = None - multipage_sections: Optional[bool] = None - new_after_n_chars: Optional[int] = None - overlap: Optional[int] = None - overlap_all: Optional[bool] = None - - def __post_init__(self, chunk_elements: bool) -> None: - """Resolve chunking_strategy if chunk_elements is True. - - If chunk_elements is True and chunking_strategy is None, default to 'by_title'. Otherwise, - do nothing and keep the defined value of chunking_strategy." 
- """ - if chunk_elements and self.chunking_strategy is None: - self.chunking_strategy = "by_title" - - -@dataclass -class PermissionsConfig(BaseConfig): - application_id: Optional[str] = enhanced_field(overload_name="permissions_application_id") - tenant: Optional[str] = enhanced_field(overload_name="permissions_tenant") - client_cred: Optional[str] = enhanced_field( - default=None, sensitive=True, overload_name="permissions_client_cred" - ) - - -# module-level variable to store session handle -global_write_session_handle: Optional[BaseSessionHandle] = None - - -@dataclass -class WriteConfig(BaseConfig): - pass - - -@dataclass -class BaseConnectorConfig(BaseConfig, ABC): - """Abstract definition on which to define connector-specific attributes.""" - - -@dataclass -class SourceMetadata(EnhancedDataClassJsonMixin, ABC): - date_created: Optional[str] = None - date_modified: Optional[str] = None - version: Optional[str] = None - source_url: Optional[str] = None - exists: Optional[bool] = None - permissions_data: Optional[list[dict[str, Any]]] = None - - -class IngestDocJsonMixin(EnhancedDataClassJsonMixin): - """ - Inherently, DataClassJsonMixin does not add in any @property fields to the json/dict - created from the dataclass. This explicitly sets properties to look for on the IngestDoc - class when creating the json/dict for serialization purposes. - """ - - metadata_properties = [ - "date_created", - "date_modified", - "date_processed", - "exists", - "permissions_data", - "version", - "source_url", - ] - properties_to_serialize = [ - "base_filename", - "filename", - "_output_filename", - "record_locator", - "_source_metadata", - "unique_id", - ] - - def add_props(self, as_dict: dict[str, Any], props: list[str]): - for prop in props: - val = getattr(self, prop) - if isinstance(val, Path): - val = str(val) - if isinstance(val, DataClassJsonMixin): - val = val.to_dict(encode_json=False) - as_dict[prop] = val - - def to_dict(self, **kwargs) -> dict[str, Json]: - as_dict = _asdict(self, **kwargs) - if "_session_handle" in as_dict: - as_dict.pop("_session_handle", None) - self.add_props(as_dict=as_dict, props=self.properties_to_serialize) - if getattr(self, "_source_metadata") is not None: - self.add_props(as_dict=as_dict, props=self.metadata_properties) - return as_dict - - @classmethod - def from_dict( - cls: Type[A], kvs: Json, *, infer_missing=False, apply_name_overload: bool = True - ) -> A: - doc = super().from_dict( - kvs=kvs, infer_missing=infer_missing, apply_name_overload=apply_name_overload - ) - if meta := kvs.get("_source_metadata"): - setattr(doc, "_source_metadata", SourceMetadata.from_dict(meta)) - if date_processed := kvs.get("_date_processed"): - setattr(doc, "_date_processed", date_processed) - return doc - - -class BatchIngestDocJsonMixin(EnhancedDataClassJsonMixin): - """ - Inherently, DataClassJsonMixin does not add in any @property fields to the json/dict - created from the dataclass. This explicitly sets properties to look for on the IngestDoc - class when creating the json/dict for serialization purposes. 
- """ - - properties_to_serialize = ["unique_id"] - - def add_props(self, as_dict: dict[str, Any], props: list[str]): - for prop in props: - val = getattr(self, prop) - if isinstance(val, Path): - val = str(val) - if isinstance(val, DataClassJsonMixin): - val = val.to_dict(encode_json=False) - as_dict[prop] = val - - def to_dict(self, encode_json=False) -> dict[str, Json]: - as_dict = _asdict(self, encode_json=encode_json) - self.add_props(as_dict=as_dict, props=self.properties_to_serialize) - return as_dict - - @classmethod - def from_dict(cls: Type[A], kvs: Json, *, infer_missing=False) -> A: - doc = _decode_dataclass(cls, kvs, infer_missing) - return doc - - -@dataclass -class BaseIngestDoc(ABC): - processor_config: ProcessorConfig - read_config: ReadConfig - connector_config: BaseConnectorConfig - - @property - @abstractmethod - def unique_id(self) -> str: - pass - - -@dataclass -class BaseSingleIngestDoc(BaseIngestDoc, IngestDocJsonMixin, ABC): - """An "ingest document" is specific to a connector, and provides - methods to fetch a single raw document, store it locally for processing, any cleanup - needed after successful processing of the doc, and the ability to write the doc's - structured outputs once processed. - - Crucially, it is not responsible for the actual processing of the raw document. - """ - - _source_metadata: Optional[SourceMetadata] = field(init=False, default=None) - _date_processed: Optional[str] = field(init=False, default=None) - - @property - def source_metadata(self) -> SourceMetadata: - if self._source_metadata is None: - self.update_source_metadata() - # Provide guarantee that the field was set by update_source_metadata() - if self._source_metadata is None: - raise ValueError("failed to set source metadata") - return self._source_metadata - - @source_metadata.setter - def source_metadata(self, value: SourceMetadata): - self._source_metadata = value - - @property - def date_created(self) -> Optional[str]: - """The date the document was created on the source system.""" - return self.source_metadata.date_created - - @property - def date_modified(self) -> Optional[str]: - """The date the document was last modified on the source system.""" - return self.source_metadata.date_modified - - @property - def date_processed(self) -> Optional[str]: - """The date the document was last processed by Unstructured. 
- self._date_processed is assigned internally in self.partition_file()""" - return self._date_processed - - @property - def exists(self) -> Optional[bool]: - """Whether the document exists on the remote source.""" - return self.source_metadata.exists - - @property - @abstractmethod - def filename(self): - """The local filename of the document after fetching from remote source.""" - - @property - def base_filename(self) -> Optional[str]: - if self.read_config.download_dir and self.filename: - download_path = str(Path(self.read_config.download_dir).resolve()) - full_path = str(self.filename) - base_path = full_path.replace(download_path, "") - return base_path - return None - - @property - def base_output_filename(self) -> Optional[str]: - if self.processor_config.output_dir and self._output_filename: - output_path = str(Path(self.processor_config.output_dir).resolve()) - full_path = str(self._output_filename) - base_path = full_path.replace(output_path, "") - return base_path - return None - - @property - @abstractmethod - def _output_filename(self): - """Filename of the structured output for this doc.""" - - @property - def record_locator(self) -> Optional[dict[str, Any]]: # Values must be JSON-serializable - """A dictionary with any data necessary to uniquely identify the document on - the source system.""" - return None - - @property - def unique_id(self) -> str: - return self.filename - - @property - def source_url(self) -> Optional[str]: - """The url of the source document.""" - return self.source_metadata.source_url # type: ignore - - @property - def version(self) -> Optional[str]: - """The version of the source document, this could be the last modified date, an - explicit version number, or anything else that can be used to uniquely identify - the version of the document.""" - return self.source_metadata.version # type: ignore - - @property - def permissions_data(self) -> Optional[list[dict[str, Any]]]: - """Access control data, aka permissions or sharing, from the source system.""" - if self.source_metadata is None: - self.update_source_metadata() - return self.source_metadata.permissions_data # type: ignore - - @abstractmethod - def cleanup_file(self): - """Removes the local copy the file (or anything else) after successful processing.""" - - @staticmethod - def skip_if_file_exists(func): - """Decorator that checks if a file exists, is not empty, and should not re-download, - if so log a message indicating as much and skip the decorated function.""" - - @functools.wraps(func) - def wrapper(self, *args, **kwargs): - if ( - not self.read_config.re_download - and self.filename.is_file() - and self.filename.stat().st_size - ): - logger.debug(f"File exists: {self.filename}, skipping {func.__name__}") - return None - return func(self, *args, **kwargs) - - return wrapper - - # TODO: set as @abstractmethod and pass or raise NotImplementedError - def update_source_metadata(self, **kwargs) -> None: - """Sets the SourceMetadata and the properties for the doc""" - self._source_metadata = SourceMetadata() - - def update_permissions_data(self): - """Sets the _permissions_data property for the doc. 
- This property is later used to fill the corresponding SourceMetadata.permissions_data field, - and after that carries on to the permissions_data property.""" - self._permissions_data: Optional[list[dict[str, Any]]] = None - - # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods - # in addition to or instead of get_file() - @abstractmethod - @SourceConnectionError.wrap - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - - def has_output(self) -> bool: - """Determine if structured output for this doc already exists.""" - return self._output_filename.is_file() and self._output_filename.stat().st_size - - @PartitionError.wrap - def partition_file( - self, - partition_config: PartitionConfig, - **partition_kwargs, - ) -> list[Element]: - from unstructured.partition.auto import partition - - if not partition_config.partition_by_api: - logger.debug("Using local partition") - elements = partition( - filename=str(self.filename), - data_source_metadata=DataSourceMetadata( - url=self.source_url, - version=self.version, - record_locator=self.record_locator, - date_created=self.date_created, - date_modified=self.date_modified, - date_processed=self.date_processed, - permissions_data=self.permissions_data, - ), - **partition_kwargs, - ) - else: - endpoint = partition_config.partition_endpoint - - logger.debug(f"Using remote partition ({endpoint})") - - elements = partition_via_api( - filename=str(self.filename), - api_key=partition_config.api_key, - api_url=endpoint, - **partition_kwargs, - ) - # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then - # pass the stringified json here - return elements - - def process_file( - self, - partition_config: PartitionConfig, - **partition_kwargs, - ) -> Optional[list[dict[str, Any]]]: - self._date_processed = datetime.utcnow().isoformat() - if self.read_config.download_only: - return None - logger.info(f"Processing {self.filename}") - - elements = self.partition_file(partition_config=partition_config, **partition_kwargs) - element_dicts = elements_to_dicts(elements) - - self.isd_elems_no_filename: list[dict[str, Any]] = [] - for elem in element_dicts: - if partition_config.metadata_exclude and partition_config.metadata_include: - raise ValueError( - "Arguments `--metadata-include` and `--metadata-exclude` are " - "mutually exclusive with each other.", - ) - elif partition_config.metadata_exclude: - ex_list = partition_config.metadata_exclude - for ex in ex_list: - if "." 
in ex: # handle nested fields - nested_fields = ex.split(".") - current_elem = elem - for f in nested_fields[:-1]: - if f in current_elem: - current_elem = current_elem[f] - field_to_exclude = nested_fields[-1] - if field_to_exclude in current_elem: - current_elem.pop(field_to_exclude, None) - else: # handle top-level fields - elem["metadata"].pop(ex, None) # type: ignore[attr-defined] - elif partition_config.metadata_include: - in_list = partition_config.metadata_include - for k in list(elem["metadata"].keys()): # type: ignore[attr-defined] - if k not in in_list: - elem["metadata"].pop(k, None) # type: ignore[attr-defined] - in_list = partition_config.fields_include - elem = {k: v for k, v in elem.items() if k in in_list} - - if partition_config.flatten_metadata and "metadata" in elem: - metadata = elem.pop("metadata") - elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"])) - - self.isd_elems_no_filename.append(elem) - - return self.isd_elems_no_filename - - -@dataclass -class BaseIngestDocBatch(BaseIngestDoc, BatchIngestDocJsonMixin, ABC): - ingest_docs: list[BaseSingleIngestDoc] = field(default_factory=list) - - @abstractmethod - @SourceConnectionError.wrap - def get_files(self): - """Fetches the "remote" docs and stores it locally on the filesystem.""" - - -@dataclass -class BaseConnector(EnhancedDataClassJsonMixin, ABC): - @abstractmethod - def check_connection(self): - pass - - -@dataclass -class BaseSourceConnector(BaseConnector, ABC): - """Abstract Base Class for a connector to a remote source, e.g. S3 or Google Drive.""" - - processor_config: ProcessorConfig - read_config: ReadConfig - connector_config: BaseConnectorConfig - - @abstractmethod - def cleanup(self, cur_dir=None): - """Any additional cleanup up need after processing is complete. E.g., removing - temporary download dirs that are empty. - - By convention, documents that failed to process are typically not cleaned up.""" - - @abstractmethod - def initialize(self): - """Initializes the connector. Should also validate the connector is properly - configured: e.g., list a single a document from the source.""" - - @abstractmethod - def get_ingest_docs(self): - """Returns all ingest docs (derived from BaseIngestDoc). - This does not imply downloading all the raw documents themselves, - rather each IngestDoc is capable of fetching its content (in another process) - with IngestDoc.get_file().""" - - -@dataclass -class BaseDestinationConnector(BaseConnector, ABC): - write_config: WriteConfig - connector_config: BaseConnectorConfig - - def __init__(self, write_config: WriteConfig, connector_config: BaseConnectorConfig): - self.write_config = write_config - self.connector_config = connector_config - - def conform_dict(self, data: dict[str, Any]) -> None: - """ - When the original dictionary needs to be modified in place - """ - return - - def normalize_dict(self, element_dict: dict[str, Any]) -> dict[str, Any]: - """ - When the original dictionary needs to be mapped to a new one - """ - return element_dict - - @abstractmethod - def initialize(self): - """Initializes the connector. 
Should also validate the connector is properly - configured.""" - - def write(self, docs: list[BaseSingleIngestDoc]) -> None: - elements_dict = self.get_elements_dict(docs=docs) - self.modify_and_write_dict(elements_dict=elements_dict) - - def get_elements_dict(self, docs: list[BaseSingleIngestDoc]) -> list[dict[str, Any]]: - dict_list: list[dict[str, Any]] = [] - for doc in docs: - local_path = doc._output_filename - with open(local_path) as json_file: - dict_content = json.load(json_file) - logger.info( - f"Extending {len(dict_content)} json elements from content in {local_path}", - ) - dict_list.extend(dict_content) - return dict_list - - @abstractmethod - def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None: - pass - - def modify_and_write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None: - """ - Modify in this instance means this method wraps calls to conform_dict() and - normalize() before actually processing the content via write_dict() - """ - for d in elements_dict: - self.conform_dict(data=d) - elements_dict_normalized = [self.normalize_dict(element_dict=d) for d in elements_dict] - return self.write_dict(*args, elements_dict=elements_dict_normalized, **kwargs) - - def write_elements(self, elements: list[Element], *args, **kwargs) -> None: - elements_dict = [e.to_dict() for e in elements] - self.modify_and_write_dict(*args, elements_dict=elements_dict, **kwargs) - - -class SourceConnectorCleanupMixin: - read_config: ReadConfig - - def cleanup(self, cur_dir=None): - """Recursively clean up downloaded files and directories.""" - if self.read_config.preserve_downloads or self.read_config.download_only: - return - if cur_dir is None: - cur_dir = self.read_config.download_dir - if cur_dir is None or not Path(cur_dir).is_dir(): - return - sub_dirs = os.listdir(cur_dir) - os.chdir(cur_dir) - for sub_dir in sub_dirs: - # don't traverse symlinks, not that there every should be any - if os.path.isdir(sub_dir) and not os.path.islink(sub_dir): - self.cleanup(sub_dir) - os.chdir("..") - if len(os.listdir(cur_dir)) == 0: - os.rmdir(cur_dir) - - -class PermissionsCleanupMixin: - processor_config: ProcessorConfig - - def cleanup_permissions(self, cur_dir=None): - def has_no_folders(folder_path): - folders = [ - item - for item in os.listdir(folder_path) - if os.path.isdir(os.path.join(folder_path, item)) - ] - return len(folders) == 0 - - """Recursively clean up downloaded files and directories.""" - if cur_dir is None: - cur_dir = Path(self.processor_config.output_dir, "permissions_data") - if not Path(cur_dir).exists(): - return - if Path(cur_dir).is_file(): - cur_file = cur_dir - os.remove(cur_file) - return - sub_dirs = os.listdir(cur_dir) - os.chdir(cur_dir) - for sub_dir in sub_dirs: - # don't traverse symlinks, not that there every should be any - if not os.path.islink(sub_dir): - self.cleanup_permissions(sub_dir) - os.chdir("..") - if has_no_folders(cur_dir): - os.rmdir(cur_dir) - - -class IngestDocCleanupMixin: - read_config: ReadConfig - - @property - @abstractmethod - def filename(self): - """The local filename of the document after fetching from remote source.""" - - def cleanup_file(self): - """Removes the local copy of the file after successful processing.""" - if ( - not self.read_config.preserve_downloads - and self.filename.is_file() - and not self.read_config.download_only - ): - logger.debug(f"Cleaning up {self}") - os.unlink(self.filename) - - -class ConfigSessionHandleMixin: - @abstractmethod - def 
create_session_handle(self) -> BaseSessionHandle: - """Creates a session handle that will be assigned on each IngestDoc to share - session related resources across all document handling for a given subprocess.""" - - -@dataclass -class IngestDocSessionHandleMixin: - connector_config: ConfigSessionHandleMixin - _session_handle: Optional[BaseSessionHandle] = field(default=None, init=False) - - @property - def session_handle(self): - """If a session handle is not assigned, creates a new one and assigns it.""" - if self._session_handle is None: - self._session_handle = self.connector_config.create_session_handle() - return self._session_handle - - @session_handle.setter - def session_handle(self, session_handle: BaseSessionHandle): - self._session_handle = session_handle diff --git a/unstructured/ingest/logger.py b/unstructured/ingest/logger.py deleted file mode 100644 index ed4e7180e..000000000 --- a/unstructured/ingest/logger.py +++ /dev/null @@ -1,130 +0,0 @@ -import ast -import json -import logging -import typing as t - -logger = logging.getLogger("unstructured.ingest") - - -def default_is_data_sensitive(k: str, v: t.Any) -> bool: - sensitive_fields = [ - "account_name", - "client_id", - ] - sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"] - return ( - v - and any([s in k.lower() for s in sensitive_triggers]) # noqa: C419 - or k.lower() in sensitive_fields - ) - - -def hide_sensitive_fields( - data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive -) -> dict: - """ - Will recursively look through every k, v pair in this dict and any nested ones and run - is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if - any string value can be parsed as valid json and process that dict as well and replace - the original string with the json.dumps() version of the redacted dict. - """ - new_data = data.copy() - for k, v in new_data.items(): - if is_sensitive_fn(k, v): - new_data[k] = "*******" - if isinstance(v, dict): - new_data[k] = hide_sensitive_fields(v) - if isinstance(v, str): - # Need to take into account strings generated via json.dumps() or simply printing a dict - try: - json_data = json.loads(v) - if isinstance(json_data, dict): - updated_data = hide_sensitive_fields(json_data) - new_data[k] = json.dumps(updated_data) - except json.JSONDecodeError: - pass - - return new_data - - -def redact_jsons(s: str) -> str: - """ - Takes in a generic string and pulls out all valid json content. Leverages - hide_sensitive_fields() to redact any sensitive information and replaces the - original json with the new redacted format. There can be any number of valid - jsons in a generic string and this will work. Having extra '{' without a - closing '}' will cause this to break though. i.e '{ text, {"a": 3}'. 
- - """ - chars = list(s) - if "{" not in chars: - return s - i = 0 - jsons = [] - i = 0 - while i < len(chars): - char = chars[i] - if char == "{": - stack = [char] - current = [char] - while len(stack) != 0 and i < len(chars): - i += 1 - char = chars[i] - current.append(char) - if char == "{": - stack.append(char) - if char == "}": - stack.pop(-1) - jsons.append("".join(current)) - continue - i += 1 - for j in jsons: - try: - formatted_j = json.dumps(json.loads(j)) - except json.JSONDecodeError: - formatted_j = json.dumps(ast.literal_eval(j)) - hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j))) - s = s.replace(j, hidden_j) - return s - - -class SensitiveFormatter(logging.Formatter): - def format(self, record): - s = super().format(record=record) - return redact_jsons(s) - - -def remove_root_handlers(logger: logging.Logger) -> None: - # NOTE(robinson) - in some environments such as Google Colab, there is a root handler - # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs. - # Removing these when they exist prevents this behavior - if logger.root.hasHandlers(): - for handler in logger.root.handlers: - logger.root.removeHandler(handler) - - -def ingest_log_streaming_init(level: int) -> None: - handler = logging.StreamHandler() - handler.name = "ingest_log_handler" - formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s") - handler.setFormatter(formatter) - - # Only want to add the handler once - if "ingest_log_handler" not in [h.name for h in logger.handlers]: - logger.addHandler(handler) - - remove_root_handlers(logger) - logger.setLevel(level) - - -def make_default_logger(level: int) -> logging.Logger: - """Return a custom logger.""" - logger = logging.getLogger("unstructured.ingest") - handler = logging.StreamHandler() - handler.name = "ingest_log_handler" - formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s") - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(level) - remove_root_handlers(logger) - return logger diff --git a/unstructured/ingest/main.py b/unstructured/ingest/main.py deleted file mode 100755 index ead616f40..000000000 --- a/unstructured/ingest/main.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python3 -from unstructured.ingest.cli.cli import get_cmd - - -def main(): - ingest_cmd = get_cmd() - ingest_cmd() - - -if __name__ == "__main__": - main() diff --git a/unstructured/ingest/pipeline/__init__.py b/unstructured/ingest/pipeline/__init__.py deleted file mode 100644 index 439647b60..000000000 --- a/unstructured/ingest/pipeline/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -from .doc_factory import DocFactory -from .interfaces import PipelineContext, ReformatNode -from .partition import Partitioner -from .permissions import PermissionsDataCleaner -from .pipeline import Pipeline -from .reformat.chunking import Chunker -from .reformat.embedding import Embedder -from .source import Reader -from .write import Writer - -__all__ = [ - "DocFactory", - "Partitioner", - "Reader", - "Embedder", - "PipelineContext", - "Pipeline", - "Writer", - "Chunker", - "ReformatNode", - "PermissionsDataCleaner", -] diff --git a/unstructured/ingest/pipeline/copy.py b/unstructured/ingest/pipeline/copy.py deleted file mode 100644 index 5ec195265..000000000 --- a/unstructured/ingest/pipeline/copy.py +++ /dev/null @@ -1,19 +0,0 @@ -import os -import shutil -from pathlib import Path - -from unstructured.ingest.connector.registry import 
create_ingest_doc_from_dict -from unstructured.ingest.logger import logger -from unstructured.ingest.pipeline.interfaces import CopyNode - - -class Copier(CopyNode): - def run(self, json_path: str): - filename = os.path.basename(json_path) - doc_hash = os.path.splitext(filename)[0] - ingest_doc_dict = self.pipeline_context.ingest_docs_map[doc_hash] - ingest_doc = create_ingest_doc_from_dict(ingest_doc_dict) - desired_output = ingest_doc._output_filename - Path(desired_output).parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Copying {json_path} -> {desired_output}") - shutil.copy(json_path, desired_output) diff --git a/unstructured/ingest/pipeline/doc_factory.py b/unstructured/ingest/pipeline/doc_factory.py deleted file mode 100644 index 38feca4e4..000000000 --- a/unstructured/ingest/pipeline/doc_factory.py +++ /dev/null @@ -1,12 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.pipeline.interfaces import DocFactoryNode - - -@dataclass -class DocFactory(DocFactoryNode): - def run(self, *args, **kwargs) -> t.Iterable[dict]: - docs = self.source_doc_connector.get_ingest_docs() - json_docs = [doc.to_dict() for doc in docs] - return json_docs diff --git a/unstructured/ingest/pipeline/interfaces.py b/unstructured/ingest/pipeline/interfaces.py deleted file mode 100644 index 8db9e536c..000000000 --- a/unstructured/ingest/pipeline/interfaces.py +++ /dev/null @@ -1,265 +0,0 @@ -import hashlib -import json -import logging -import multiprocessing as mp -import typing as t -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from multiprocessing.managers import DictProxy -from pathlib import Path - -import backoff -from dataclasses_json import DataClassJsonMixin - -from unstructured.ingest.error import SourceConnectionNetworkError -from unstructured.ingest.ingest_backoff import RetryHandler -from unstructured.ingest.interfaces import ( - BaseDestinationConnector, - BaseSourceConnector, - PartitionConfig, - ProcessorConfig, - ReadConfig, - RetryStrategyConfig, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger - - -@dataclass -class PipelineContext(ProcessorConfig): - """ - Data that gets shared across each pipeline node - """ - - def __post_init__(self): - self._ingest_docs_map: t.Optional[DictProxy] = None - - @property - def ingest_docs_map(self) -> DictProxy: - if self._ingest_docs_map is None: - raise ValueError("ingest_docs_map never initialized") - return self._ingest_docs_map - - @ingest_docs_map.setter - def ingest_docs_map(self, value: DictProxy): - self._ingest_docs_map = value - - -@dataclass -class PipelineNode(DataClassJsonMixin, ABC): - """ - Class that encapsulates logic to run during a single pipeline step - """ - - pipeline_context: PipelineContext - - def __call__(self, iterable: t.Optional[t.Iterable[t.Any]] = None) -> t.Any: - iterable = iterable if iterable else [] - if iterable: - logger.info( - f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore - ) - - self.initialize() - if not self.supported_multiprocessing(): - if iterable: - self.result = self.run(iterable) - else: - self.result = self.run() - elif self.pipeline_context.num_processes == 1: - if iterable: - self.result = [self.run(it) for it in iterable] - else: - self.result = self.run() - else: - with mp.Pool( - processes=self.pipeline_context.num_processes, - initializer=ingest_log_streaming_init, - initargs=(logging.DEBUG if self.pipeline_context.verbose else logging.INFO,), - ) as pool: - 
self.result = pool.map(self.run, iterable) - # Remove None which may be caused by failed docs that didn't raise an error - if isinstance(self.result, t.Iterable): - self.result = [r for r in self.result if r is not None] - return self.result - - def supported_multiprocessing(self) -> bool: - return True - - @abstractmethod - def run(self, *args, **kwargs) -> t.Optional[t.Any]: - pass - - def initialize(self): - if path := self.get_path(): - logger.info(f"Creating {path}") - path.mkdir(parents=True, exist_ok=True) - ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO) - - def get_path(self) -> t.Optional[Path]: - return None - - -@dataclass -class DocFactoryNode(PipelineNode): - """ - Encapsulated logic to generate a list of ingest docs - """ - - source_doc_connector: BaseSourceConnector - - def initialize(self): - logger.info( - f"Running doc factory to generate ingest docs. " - f"Source connector: {self.source_doc_connector.to_json()}", - ) - super().initialize() - self.source_doc_connector.initialize() - - @abstractmethod - def run(self, *args, **kwargs) -> t.Iterable[dict]: - pass - - def supported_multiprocessing(self) -> bool: - return False - - -@dataclass -class SourceNode(PipelineNode): - """A pipeline node representing logic to pull data from a source using base ingest documents. - - This class encapsulates the logic for pulling data from a specified source using base ingest - documents. The output of this logic is expected to be in JSON format representing the data - itself. - - Attributes: - read_config: A configuration object specifying how to read data from the source. - retry_strategy_config: Optional configuration specifying the strategy for network errors. - - Properties: - retry_strategy: A retry handler configured based on the retry strategy configuration. - - Methods: - initialize: Initializes the source node and logs the process. - run: Abstract method for downloading data associated with ingest documents. - """ - - read_config: ReadConfig - retry_strategy_config: t.Optional[RetryStrategyConfig] = None - - @property - def retry_strategy(self) -> t.Optional[RetryHandler]: - if retry_strategy_config := self.retry_strategy_config: - return RetryHandler( - backoff.expo, - SourceConnectionNetworkError, - max_time=retry_strategy_config.max_retry_time, - max_tries=retry_strategy_config.max_retries, - logger=logger, - start_log_level=logger.level, - backoff_log_level=logger.level, - ) - return None - - def initialize(self): - logger.info("Running source node to download data associated with ingest docs") - super().initialize() - - @abstractmethod - def run(self, ingest_doc_json: str) -> t.Optional[str]: - pass - - -@dataclass -class PartitionNode(PipelineNode): - """ - Encapsulates logic to run partition on the json files as the output of the source node - """ - - partition_config: PartitionConfig - partition_kwargs: dict = field(default_factory=dict) - - def initialize(self): - logger.info( - f"Running partition node to extract content from json files. 
" - f"Config: {self.partition_config.to_json()}, " - f"partition kwargs: {json.dumps(self.partition_kwargs)}]", - ) - super().initialize() - - def create_hash(self) -> str: - hash_dict = self.partition_config.to_dict() - hash_dict["partition_kwargs"] = self.partition_kwargs - return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32] - - @abstractmethod - def run(self, json_path: str) -> t.Optional[str]: - pass - - def get_path(self) -> Path: - return (Path(self.pipeline_context.work_dir) / "partitioned").resolve() - - -@dataclass -class ReformatNode(PipelineNode, ABC): - """ - Encapsulated any logic to reformat the output List[Element] - content from partition before writing it - """ - - @abstractmethod - def run(self, elements_json: str) -> t.Optional[str]: - pass - - -@dataclass -class WriteNode(PipelineNode): - """ - Encapsulated logic to write the final result to a downstream data connection - """ - - dest_doc_connector: BaseDestinationConnector - - @abstractmethod - def run(self, json_paths: t.List[str]): - pass - - def initialize(self): - logger.info( - f"Running write node to upload content. " - f"Destination connector: {self.dest_doc_connector.to_json(redact_sensitive=True)}]", - ) - super().initialize() - self.dest_doc_connector.initialize() - - def supported_multiprocessing(self) -> bool: - return False - - -@dataclass -class CopyNode(PipelineNode): - """ - Encapsulated logic to copy the final result of the pipeline to the designated output location. - """ - - def initialize(self): - logger.info("Running copy node to move content to desired output location") - super().initialize() - - @abstractmethod - def run(self, json_path: str): - pass - - -@dataclass -class PermissionsNode(PipelineNode): - """ - Encapsulated logic to do operations on permissions related data. 
- """ - - def initialize(self): - logger.info("Running permissions node to cleanup the permissions folder") - super().initialize() - - @abstractmethod - def run(self): - pass diff --git a/unstructured/ingest/pipeline/partition.py b/unstructured/ingest/pipeline/partition.py deleted file mode 100644 index 4aa2ccc86..000000000 --- a/unstructured/ingest/pipeline/partition.py +++ /dev/null @@ -1,60 +0,0 @@ -import hashlib -import json -import typing as t -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - -from unstructured.ingest.connector.registry import create_ingest_doc_from_dict -from unstructured.ingest.error import PartitionError -from unstructured.ingest.logger import logger -from unstructured.ingest.pipeline.interfaces import PartitionNode -from unstructured.ingest.pipeline.utils import get_ingest_doc_hash - - -@dataclass -class Partitioner(PartitionNode): - @PartitionError.wrap - def run(self, ingest_doc_dict) -> Optional[str]: - try: - doc = create_ingest_doc_from_dict(ingest_doc_dict) - doc_filename_hash = get_ingest_doc_hash(ingest_doc_dict) - hashed_filename = hashlib.sha256( - f"{self.create_hash()}{doc_filename_hash}".encode(), - ).hexdigest()[:32] - self.pipeline_context.ingest_docs_map[hashed_filename] = ingest_doc_dict - doc_filename = f"{hashed_filename}.json" - json_path = (Path(self.get_path()) / doc_filename).resolve() - if ( - not self.pipeline_context.reprocess - and json_path.is_file() - and json_path.stat().st_size - ): - logger.info(f"File exists: {json_path}, skipping partition") - return str(json_path) - partition_kwargs: t.Dict[str, t.Any] = { - "strategy": self.partition_config.strategy, - "encoding": self.partition_config.encoding, - "pdf_infer_table_structure": self.partition_config.pdf_infer_table_structure, - "languages": self.partition_config.ocr_languages, - "hi_res_model_name": self.partition_config.hi_res_model_name, - } - if self.partition_config.skip_infer_table_types: - partition_kwargs["skip_infer_table_types"] = ( - self.partition_config.skip_infer_table_types - ) - if self.partition_config.additional_partition_args: - partition_kwargs.update(self.partition_config.additional_partition_args) - elements = doc.process_file( - partition_config=self.partition_config, - **partition_kwargs, - ) - with open(json_path, "w", encoding="utf8") as output_f: - logger.info(f"writing partitioned content to {json_path}") - json.dump(elements, output_f, ensure_ascii=False, indent=2, sort_keys=True) - return str(json_path) - except Exception as e: - if self.pipeline_context.raise_on_error: - raise - logger.error(f"failed to partition doc: {ingest_doc_dict}, {e}", exc_info=True) - return None diff --git a/unstructured/ingest/pipeline/permissions.py b/unstructured/ingest/pipeline/permissions.py deleted file mode 100644 index 5a93b3cca..000000000 --- a/unstructured/ingest/pipeline/permissions.py +++ /dev/null @@ -1,12 +0,0 @@ -from dataclasses import dataclass - -from unstructured.ingest.interfaces import PermissionsCleanupMixin, ProcessorConfig -from unstructured.ingest.pipeline.interfaces import PermissionsNode - - -@dataclass -class PermissionsDataCleaner(PermissionsNode, PermissionsCleanupMixin): - processor_config: ProcessorConfig - - def run(self): - self.cleanup_permissions() diff --git a/unstructured/ingest/pipeline/pipeline.py b/unstructured/ingest/pipeline/pipeline.py deleted file mode 100644 index 6c6897885..000000000 --- a/unstructured/ingest/pipeline/pipeline.py +++ /dev/null @@ -1,117 +0,0 @@ -import logging -import 
multiprocessing as mp -from dataclasses import dataclass, field -from typing import Any, Optional - -from dataclasses_json import DataClassJsonMixin - -from unstructured.ingest.connector.registry import create_ingest_doc_from_dict -from unstructured.ingest.interfaces import BaseIngestDocBatch, BaseSingleIngestDoc -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.pipeline.copy import Copier -from unstructured.ingest.pipeline.interfaces import ( - DocFactoryNode, - PartitionNode, - PipelineContext, - ReformatNode, - SourceNode, - WriteNode, -) -from unstructured.ingest.pipeline.permissions import PermissionsDataCleaner -from unstructured.ingest.pipeline.utils import get_ingest_doc_hash - - -@dataclass -class Pipeline(DataClassJsonMixin): - pipeline_context: PipelineContext - doc_factory_node: DocFactoryNode - source_node: SourceNode - partition_node: Optional[PartitionNode] = None - write_node: Optional[WriteNode] = None - reformat_nodes: "list[ReformatNode]" = field(default_factory=list) - permissions_node: Optional[PermissionsDataCleaner] = None - - def initialize(self): - ingest_log_streaming_init(logging.DEBUG if self.pipeline_context.verbose else logging.INFO) - - def get_nodes_str(self): - nodes = [self.doc_factory_node, self.source_node, self.partition_node] - nodes.extend(self.reformat_nodes) - if self.write_node: - nodes.append(self.write_node) - nodes.append(Copier(pipeline_context=self.pipeline_context)) - return " -> ".join([node.__class__.__name__ for node in nodes]) - - def expand_batch_docs(self, dict_docs: "list[dict[str, Any]]") -> "list[dict[str, Any]]": - expanded_docs: list[dict[str, Any]] = [] - for d in dict_docs: - doc = create_ingest_doc_from_dict(d) - if isinstance(doc, BaseSingleIngestDoc): - expanded_docs.append(doc.to_dict()) - elif isinstance(doc, BaseIngestDocBatch): - expanded_docs.extend([single_doc.to_dict() for single_doc in doc.ingest_docs]) - else: - raise ValueError( - f"type of doc ({type(doc)}) is not a recognized type: " - f"BaseSingleIngestDoc or BaseSingleIngestDoc" - ) - return expanded_docs - - def run(self): - logger.info( - f"running pipeline: {self.get_nodes_str()} " - f"with config: {self.pipeline_context.to_json()}", - ) - self.initialize() - manager = mp.Manager() - self.pipeline_context.ingest_docs_map = manager.dict() - # -- Get the documents to be processed -- - dict_docs = self.doc_factory_node() - dict_docs = [manager.dict(d) for d in dict_docs] - if not dict_docs: - logger.info("no docs found to process") - return - logger.info( - f"processing {len(dict_docs)} docs via " - f"{self.pipeline_context.num_processes} processes", - ) - for doc in dict_docs: - self.pipeline_context.ingest_docs_map[get_ingest_doc_hash(doc)] = doc - fetched_filenames = self.source_node(iterable=dict_docs) - if self.source_node.read_config.download_only: - logger.info("stopping pipeline after downloading files") - return - if not fetched_filenames: - logger.info("No files to run partition over") - return - # -- To support batches ingest docs, expand those into the populated single ingest - # -- docs after downloading content - dict_docs = self.expand_batch_docs(dict_docs=dict_docs) - if self.partition_node is None: - raise ValueError("partition node not set") - partitioned_jsons = self.partition_node(iterable=dict_docs) - if not partitioned_jsons: - logger.info("No files to process after partitioning") - return - for reformat_node in self.reformat_nodes: - reformatted_jsons = 
reformat_node(iterable=partitioned_jsons)
-                if not reformatted_jsons:
-                    logger.info(f"No files to process after {reformat_node.__class__.__name__}")
-                    return
-                partitioned_jsons = reformatted_jsons
-
-        # -- Copy the final destination to the desired location --
-        copier = Copier(
-            pipeline_context=self.pipeline_context,
-        )
-        copier(iterable=partitioned_jsons)
-
-        if self.write_node:
-            logger.info(
-                f"uploading elements from {len(partitioned_jsons)} "
-                "document(s) to the destination"
-            )
-            self.write_node(iterable=partitioned_jsons)
-
-        if self.permissions_node:
-            self.permissions_node.cleanup_permissions()
diff --git a/unstructured/ingest/pipeline/reformat/__init__.py b/unstructured/ingest/pipeline/reformat/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/unstructured/ingest/pipeline/reformat/chunking.py b/unstructured/ingest/pipeline/reformat/chunking.py
deleted file mode 100644
index b061cfa1c..000000000
--- a/unstructured/ingest/pipeline/reformat/chunking.py
+++ /dev/null
@@ -1,129 +0,0 @@
-from __future__ import annotations
-
-import hashlib
-import json
-import os.path
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-from unstructured.chunking import dispatch
-from unstructured.documents.elements import Element, assign_and_map_hash_ids
-from unstructured.ingest.interfaces import ChunkingConfig, PartitionConfig
-from unstructured.ingest.logger import logger
-from unstructured.ingest.pipeline.interfaces import ReformatNode
-from unstructured.partition.api import partition_via_api
-from unstructured.staging.base import elements_from_json, elements_to_dicts
-
-
-@dataclass
-class Chunker(ReformatNode):
-    """Implementation for the chunking node in the ingest pipeline.
-
-    Parameters
-    ----------
-    pipeline_context: PipelineContext (inherited from parent class)
-    chunking_config: ChunkingConfig
-    partition_config: PartitionConfig
-    """
-
-    chunking_config: ChunkingConfig
-    partition_config: PartitionConfig
-
-    def initialize(self):
-        logger.info(
-            f"Running chunking node. Chunking config: {self.chunking_config.to_json()}]",
-        )
-        super().initialize()
-
-    def create_hash(self) -> str:
-        hash_dict = self.chunking_config.to_dict()
-        return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
-
-    def run(self, elements_json: str) -> Optional[str]:
-        try:
-            elements_json_filename = os.path.basename(elements_json)
-            filename_ext = os.path.basename(elements_json_filename)
-            filename = os.path.splitext(filename_ext)[0]
-            hashed_filename = hashlib.sha256(
-                f"{self.create_hash()}{filename}".encode(),
-            ).hexdigest()[:32]
-            json_filename = f"{hashed_filename}.json"
-            json_path = (Path(self.get_path()) / json_filename).resolve()
-            self.pipeline_context.ingest_docs_map[hashed_filename] = (
-                self.pipeline_context.ingest_docs_map[filename]
-            )
-            if (
-                not self.pipeline_context.reprocess
-                and json_path.is_file()
-                and json_path.stat().st_size
-            ):
-                logger.debug(f"File exists: {json_path}, skipping chunking")
-                return str(json_path)
-
-            chunked_elements = self.chunk(elements_json)
-
-            # -- return if chunking_strategy is None --
-            if chunked_elements is None:
-                logger.info(f"chunking_strategy is None, skipping chunking for {filename_ext}")
-                return
-
-            assign_and_map_hash_ids(chunked_elements)
-
-            element_dicts = elements_to_dicts(chunked_elements)
-            with open(json_path, "w", encoding="utf8") as output_f:
-                logger.info(f"writing chunking content to {json_path}")
-                json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
-            return str(json_path)
-
-        except Exception as e:
-            if self.pipeline_context.raise_on_error:
-                raise
-            logger.error(f"failed to run chunking on file {elements_json}, {e}", exc_info=True)
-            return None
-
-    def get_path(self) -> Path:
-        return (Path(self.pipeline_context.work_dir) / "chunked").resolve()
-
-    def chunk(self, elements_json_file: str) -> Optional[list[Element]]:
-        """Called by Chunker.run() to properly execute the defined chunking_strategy."""
-        # -- No chunking_strategy means no chunking --
-        if self.chunking_config.chunking_strategy is None:
-            return
-        # -- Chunk locally for open-source chunking strategies, even when partitioning remotely --
-        if self.chunking_config.chunking_strategy in ("basic", "by_title"):
-            return dispatch.chunk(
-                elements=elements_from_json(filename=elements_json_file),
-                chunking_strategy=self.chunking_config.chunking_strategy,
-                combine_text_under_n_chars=self.chunking_config.combine_text_under_n_chars,
-                include_orig_elements=self.chunking_config.include_orig_elements,
-                max_characters=self.chunking_config.max_characters,
-                multipage_sections=self.chunking_config.multipage_sections,
-                new_after_n_chars=self.chunking_config.new_after_n_chars,
-                overlap=self.chunking_config.overlap,
-                overlap_all=self.chunking_config.overlap_all,
-            )
-        # -- Chunk remotely --
-        if self.partition_config.partition_by_api:
-            return partition_via_api(
-                filename=elements_json_file,
-                # -- (jennings) If api_key or api_url are None, partition_via_api will raise an
-                # -- error, which will be caught and logged by Chunker.run()
-                api_key=self.partition_config.api_key, # type: ignore
-                api_url=self.partition_config.partition_endpoint, # type: ignore
-                chunking_strategy=self.chunking_config.chunking_strategy,
-                combine_under_n_chars=self.chunking_config.combine_text_under_n_chars,
-                include_orig_elements=self.chunking_config.include_orig_elements,
-                max_characters=self.chunking_config.max_characters,
-                multipage_sections=self.chunking_config.multipage_sections,
-                new_after_n_chars=self.chunking_config.new_after_n_chars,
-                overlap=self.chunking_config.overlap,
-                overlap_all=self.chunking_config.overlap_all,
-            )
-        # -- Warn that the defined chunking_strategy is not locally available --
-        logger.warning(
-            f"There is no locally available chunking_strategy:"
-            f" {self.chunking_config.chunking_strategy}."
-            f" If trying to partition remotely, check that `partition_by_api`, `api_url`,"
-            f" and `api_key` are correctly defined."
-        )
diff --git a/unstructured/ingest/pipeline/reformat/embedding.py b/unstructured/ingest/pipeline/reformat/embedding.py
deleted file mode 100644
index 58d47b429..000000000
--- a/unstructured/ingest/pipeline/reformat/embedding.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import hashlib
-import json
-import os.path
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-from unstructured.ingest.interfaces import (
-    EmbeddingConfig,
-)
-from unstructured.ingest.logger import logger
-from unstructured.ingest.pipeline.interfaces import ReformatNode
-from unstructured.staging.base import elements_from_json, elements_to_dicts
-
-
-@dataclass
-class Embedder(ReformatNode):
-    embedder_config: EmbeddingConfig
-
-    def initialize(self):
-        logger.info(
-            f"Running embedding node. Embedding config: {self.embedder_config.to_json()}]",
-        )
-        super().initialize()
-
-    def create_hash(self) -> str:
-        hash_dict = self.embedder_config.to_dict()
-        return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
-
-    def run(self, elements_json: str) -> Optional[str]:
-        try:
-            elements_json_filename = os.path.basename(elements_json)
-            filename_ext = os.path.basename(elements_json_filename)
-            filename = os.path.splitext(filename_ext)[0]
-            hashed_filename = hashlib.sha256(
-                f"{self.create_hash()}{filename}".encode(),
-            ).hexdigest()[:32]
-            json_filename = f"{hashed_filename}.json"
-            json_path = (Path(self.get_path()) / json_filename).resolve()
-            self.pipeline_context.ingest_docs_map[hashed_filename] = (
-                self.pipeline_context.ingest_docs_map[filename]
-            )
-            if (
-                not self.pipeline_context.reprocess
-                and json_path.is_file()
-                and json_path.stat().st_size
-            ):
-                logger.debug(f"File exists: {json_path}, skipping embedding")
-                return str(json_path)
-            elements = elements_from_json(filename=elements_json)
-            embedder = self.embedder_config.get_embedder()
-            embedded_elements = embedder.embed_documents(elements=elements)
-            element_dicts = elements_to_dicts(embedded_elements)
-            with open(json_path, "w", encoding="utf8") as output_f:
-                logger.info(f"writing embeddings content to {json_path}")
-                json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
-            return str(json_path)
-        except Exception as e:
-            if self.pipeline_context.raise_on_error:
-                raise
-            logger.error(f"failed to embed content from file {elements_json}, {e}", exc_info=True)
-            return None
-
-    def get_path(self) -> Path:
-        return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
diff --git a/unstructured/ingest/pipeline/source.py b/unstructured/ingest/pipeline/source.py
deleted file mode 100644
index ee1087a07..000000000
--- a/unstructured/ingest/pipeline/source.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import os
-import typing as t
-from dataclasses import dataclass
-
-from unstructured.ingest.connector.registry import create_ingest_doc_from_dict
-from unstructured.ingest.interfaces import (
-    BaseIngestDocBatch,
-    BaseSessionHandle,
-    BaseSingleIngestDoc,
-    IngestDocSessionHandleMixin,
-)
-from unstructured.ingest.logger import logger
-from unstructured.ingest.pipeline.interfaces import SourceNode - -# module-level variable to store session handle -session_handle: t.Optional[BaseSessionHandle] = None - - -@dataclass -class Reader(SourceNode): - def get_single(self, doc: BaseSingleIngestDoc, ingest_doc_dict: dict) -> str: - if ( - not self.read_config.re_download - and doc.filename.is_file() - and doc.filename.stat().st_size - ): - logger.info(f"File exists: {doc.filename}, skipping download") - # Still need to fetch metadata if file exists locally - doc.update_source_metadata() - else: - serialized_doc = doc.to_json(redact_sensitive=True) - logger.debug(f"Fetching {serialized_doc} - PID: {os.getpid()}") - if self.retry_strategy: - self.retry_strategy(doc.get_file) - else: - doc.get_file() - for k, v in doc.to_dict().items(): - ingest_doc_dict[k] = v - return doc.filename - - def get_batch(self, doc_batch: BaseIngestDocBatch, ingest_doc_dict: dict) -> t.List[str]: - if self.retry_strategy: - self.retry_strategy(doc_batch.get_files) - else: - doc_batch.get_files() - for k, v in doc_batch.to_dict().items(): - ingest_doc_dict[k] = v - return [doc.filename for doc in doc_batch.ingest_docs] - - def run(self, ingest_doc_dict: dict) -> t.Optional[t.Union[str, t.List[str]]]: - try: - global session_handle - doc = create_ingest_doc_from_dict(ingest_doc_dict) - if isinstance(doc, IngestDocSessionHandleMixin): - if session_handle is None: - # create via doc.session_handle, which is a property that creates a - # session handle if one is not already defined - session_handle = doc.session_handle - else: - doc._session_handle = session_handle - if isinstance(doc, BaseSingleIngestDoc): - return self.get_single(doc=doc, ingest_doc_dict=ingest_doc_dict) - elif isinstance(doc, BaseIngestDocBatch): - return self.get_batch(doc_batch=doc, ingest_doc_dict=ingest_doc_dict) - else: - raise ValueError( - f"type of doc ({type(doc)}) is not a recognized type: " - f"BaseSingleIngestDoc or BaseSingleIngestDoc" - ) - except Exception as e: - if self.pipeline_context.raise_on_error: - raise - logger.error( - f"failed to get data associated with source doc: {ingest_doc_dict}, {e}", - exc_info=True, - ) - return None diff --git a/unstructured/ingest/pipeline/utils.py b/unstructured/ingest/pipeline/utils.py deleted file mode 100644 index bcd6aa2ab..000000000 --- a/unstructured/ingest/pipeline/utils.py +++ /dev/null @@ -1,6 +0,0 @@ -import hashlib - - -def get_ingest_doc_hash(json_as_dict: dict) -> str: - hashed = hashlib.sha256(json_as_dict["unique_id"].encode()).hexdigest()[:32] - return hashed diff --git a/unstructured/ingest/pipeline/write.py b/unstructured/ingest/pipeline/write.py deleted file mode 100644 index 7a0540983..000000000 --- a/unstructured/ingest/pipeline/write.py +++ /dev/null @@ -1,18 +0,0 @@ -import os.path -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.connector.registry import create_ingest_doc_from_dict -from unstructured.ingest.pipeline.interfaces import WriteNode - - -@dataclass -class Writer(WriteNode): - def run(self, json_paths: t.List[str]): - ingest_docs = [] - for json_path in json_paths: - filename = os.path.basename(json_path) - doc_hash = os.path.splitext(filename)[0] - ingest_doc_dict = self.pipeline_context.ingest_docs_map[doc_hash] - ingest_docs.append(create_ingest_doc_from_dict(ingest_doc_dict)) - self.dest_doc_connector.write(docs=ingest_docs) diff --git a/unstructured/ingest/processor.py b/unstructured/ingest/processor.py deleted file mode 100644 index cf4c775cd..000000000 --- 
a/unstructured/ingest/processor.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import annotations - -import multiprocessing as mp -from contextlib import suppress -from typing import Optional - -from unstructured.ingest.interfaces import ( - BaseDestinationConnector, - BaseSourceConnector, - ChunkingConfig, - EmbeddingConfig, - PartitionConfig, - PermissionsConfig, - ProcessorConfig, - RetryStrategyConfig, -) -from unstructured.ingest.pipeline import ( - Chunker, - DocFactory, - Embedder, - Partitioner, - PermissionsDataCleaner, - Pipeline, - PipelineContext, - Reader, - ReformatNode, - Writer, -) - -with suppress(RuntimeError): - mp.set_start_method("spawn") - - -def process_documents( - processor_config: ProcessorConfig, - source_doc_connector: BaseSourceConnector, - partition_config: PartitionConfig, - dest_doc_connector: Optional[BaseDestinationConnector] = None, - chunking_config: Optional[ChunkingConfig] = None, - embedder_config: Optional[EmbeddingConfig] = None, - permissions_config: Optional[PermissionsConfig] = None, - retry_strategy_config: Optional[RetryStrategyConfig] = None, -) -> None: - pipeline_config = PipelineContext.from_dict(processor_config.to_dict()) - doc_factory = DocFactory( - pipeline_context=pipeline_config, - source_doc_connector=source_doc_connector, - ) - reader = Reader( - pipeline_context=pipeline_config, - retry_strategy_config=retry_strategy_config, - read_config=source_doc_connector.read_config, - ) - partitioner = Partitioner(pipeline_context=pipeline_config, partition_config=partition_config) - reformat_nodes: list[ReformatNode] = [] - if chunking_config: - reformat_nodes.append( - Chunker( - pipeline_context=pipeline_config, - chunking_config=chunking_config, - partition_config=partition_config, - ), - ) - if embedder_config: - reformat_nodes.append( - Embedder( - pipeline_context=pipeline_config, - embedder_config=embedder_config, - ), - ) - writer = ( - Writer( - pipeline_context=pipeline_config, - dest_doc_connector=dest_doc_connector, - ) - if dest_doc_connector - else None - ) - permissions_data_cleaner = ( - PermissionsDataCleaner(pipeline_context=pipeline_config, processor_config=processor_config) - if permissions_config - else None - ) - pipeline = Pipeline( - pipeline_context=pipeline_config, - doc_factory_node=doc_factory, - source_node=reader, - partition_node=partitioner, - reformat_nodes=reformat_nodes, - write_node=writer, - permissions_node=permissions_data_cleaner, - ) - pipeline.run() diff --git a/unstructured/ingest/runner/__init__.py b/unstructured/ingest/runner/__init__.py deleted file mode 100644 index 872ebb10d..000000000 --- a/unstructured/ingest/runner/__init__.py +++ /dev/null @@ -1,104 +0,0 @@ -import typing as t -from typing import Type - -from .airtable import AirtableRunner -from .astradb import AstraDBRunner -from .base_runner import Runner -from .biomed import BiomedRunner -from .confluence import ConfluenceRunner -from .delta_table import DeltaTableRunner -from .discord import DiscordRunner -from .elasticsearch import ElasticSearchRunner -from .fsspec.azure import AzureRunner -from .fsspec.box import BoxRunner -from .fsspec.dropbox import DropboxRunner -from .fsspec.fsspec import FsspecRunner -from .fsspec.gcs import GCSRunner -from .fsspec.s3 import S3Runner -from .fsspec.sftp import SftpRunner -from .github import GithubRunner -from .gitlab import GitlabRunner -from .google_drive import GoogleDriveRunner -from .hubspot import HubSpotRunner -from .jira import JiraRunner -from .kafka import KafkaRunner -from .local 
import LocalRunner -from .mongodb import MongoDBRunner -from .notion import NotionRunner -from .onedrive import OneDriveRunner -from .opensearch import OpenSearchRunner -from .outlook import OutlookRunner -from .reddit import RedditRunner -from .salesforce import SalesforceRunner -from .sharepoint import SharePointRunner -from .slack import SlackRunner -from .wikipedia import WikipediaRunner - -runner_map: t.Dict[str, Type[Runner]] = { - "airtable": AirtableRunner, - "astradb": AstraDBRunner, - "azure": AzureRunner, - "biomed": BiomedRunner, - "box": BoxRunner, - "confluence": ConfluenceRunner, - "delta_table": DeltaTableRunner, - "discord": DiscordRunner, - "dropbox": DropboxRunner, - "elasticsearch": ElasticSearchRunner, - "fsspec": FsspecRunner, - "gcs": GCSRunner, - "github": GithubRunner, - "gitlab": GitlabRunner, - "gdrive": GoogleDriveRunner, - "google_drive": GoogleDriveRunner, - "hubspot": HubSpotRunner, - "jira": JiraRunner, - "kafka": KafkaRunner, - "local": LocalRunner, - "mongodb": MongoDBRunner, - "notion": NotionRunner, - "onedrive": OneDriveRunner, - "opensearch": OpenSearchRunner, - "outlook": OutlookRunner, - "reddit": RedditRunner, - "s3": S3Runner, - "salesforce": SalesforceRunner, - "sftp": SftpRunner, - "sharepoint": SharePointRunner, - "slack": SlackRunner, - "wikipedia": WikipediaRunner, -} - -__all__ = [ - "AirtableRunner", - "AstraRunner", - "AzureRunner", - "BiomedRunner", - "BoxRunner", - "ConfluenceRunner", - "DeltaTableRunner", - "DiscordRunner", - "DropboxRunner", - "ElasticSearchRunner", - "FsspecRunner", - "GCSRunner", - "GoogleDriveRunner", - "GithubRunner", - "GitlabRunner", - "JiraRunner", - "KafkaRunner", - "LocalRunner", - "MongoDBRunner", - "NotionRunner", - "OneDriveRunner", - "OpenSearchRunner", - "OutlookRunner", - "RedditRunner", - "S3Runner", - "SalesforceRunner", - "SharePointRunner", - "SlackRunner", - "WikipediaRunner", - "runner_map", - "Runner", -] diff --git a/unstructured/ingest/runner/airtable.py b/unstructured/ingest/runner/airtable.py deleted file mode 100644 index ec148221c..000000000 --- a/unstructured/ingest/runner/airtable.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.airtable import SimpleAirtableConfig - - -@dataclass -class AirtableRunner(Runner): - connector_config: "SimpleAirtableConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - self.connector_config.access_config.personal_access_token.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="airtable", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.airtable import ( - AirtableSourceConnector, - ) - - return AirtableSourceConnector diff --git a/unstructured/ingest/runner/astradb.py b/unstructured/ingest/runner/astradb.py deleted file mode 100644 index a07c66b93..000000000 --- a/unstructured/ingest/runner/astradb.py +++ /dev/null @@ -1,34 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from 
unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.astradb import SimpleAstraDBConfig - - -@dataclass -class AstraDBRunner(Runner): - connector_config: "SimpleAstraDBConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - str(self.connector_config.access_config.api_endpoint).encode("utf-8"), - ) - self.read_config.download_dir = update_download_dir_hash( - connector_name="astradb", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.astradb import ( - AstraDBSourceConnector, - ) - - return AstraDBSourceConnector diff --git a/unstructured/ingest/runner/base_runner.py b/unstructured/ingest/runner/base_runner.py deleted file mode 100644 index dbc9c58d1..000000000 --- a/unstructured/ingest/runner/base_runner.py +++ /dev/null @@ -1,89 +0,0 @@ -import logging -import typing as t -from abc import ABC, abstractmethod -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseDestinationConnector, - BaseSourceConnector, - ChunkingConfig, - EmbeddingConfig, - PartitionConfig, - PermissionsConfig, - ProcessorConfig, - ReadConfig, - RetryStrategyConfig, -) -from unstructured.ingest.logger import ingest_log_streaming_init -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.writers.base_writer import Writer - - -@dataclass -class Runner(EnhancedDataClassJsonMixin, ABC): - connector_config: BaseConnectorConfig - processor_config: ProcessorConfig - read_config: ReadConfig - partition_config: PartitionConfig - writer: t.Optional[Writer] = None - writer_kwargs: t.Optional[dict] = None - embedding_config: t.Optional[EmbeddingConfig] = None - chunking_config: t.Optional[ChunkingConfig] = None - permissions_config: t.Optional[PermissionsConfig] = None - retry_strategy_config: t.Optional[RetryStrategyConfig] = None - - def run(self, *args, **kwargs): - ingest_log_streaming_init(logging.DEBUG if self.processor_config.verbose else logging.INFO) - self.update_read_config() - source_connector = self.get_source_connector() - self.process_documents( - source_doc_connector=source_connector, - ) - - @abstractmethod - def update_read_config(self): - pass - - @abstractmethod - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - pass - - def get_source_connector(self) -> BaseSourceConnector: - source_connector_cls = self.get_source_connector_cls() - return source_connector_cls( - processor_config=self.processor_config, - connector_config=self.connector_config, - read_config=self.read_config, - ) - - def get_dest_doc_connector(self) -> t.Optional[BaseDestinationConnector]: - writer_kwargs = self.writer_kwargs if self.writer_kwargs else {} - if self.writer: - return self.writer.get_connector(**writer_kwargs) - return None - - def get_permissions_config(self) -> t.Optional[PermissionsConfig]: - if self.permissions_config is None: - return None - - permissions_config_filled = bool( - self.permissions_config.application_id - and self.permissions_config.client_cred - and self.permissions_config.tenant, - ) - - return self.permissions_config if permissions_config_filled else None - - def 
process_documents(self, source_doc_connector: BaseSourceConnector): - process_documents( - processor_config=self.processor_config, - source_doc_connector=source_doc_connector, - partition_config=self.partition_config, - dest_doc_connector=self.get_dest_doc_connector(), - embedder_config=self.embedding_config, - chunking_config=self.chunking_config, - permissions_config=self.get_permissions_config(), - retry_strategy_config=self.retry_strategy_config, - ) diff --git a/unstructured/ingest/runner/biomed.py b/unstructured/ingest/runner/biomed.py deleted file mode 100644 index 045d4486c..000000000 --- a/unstructured/ingest/runner/biomed.py +++ /dev/null @@ -1,45 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.biomed import SimpleBiomedConfig - - -@dataclass -class BiomedRunner(Runner): - connector_config: "SimpleBiomedConfig" - - def update_read_config(self): - base_path = ( - self.connector_config.path - if self.connector_config.path - else "{}-{}-{}".format( - self.connector_config.api_id if self.connector_config.api_id else "", - self.connector_config.api_from if self.connector_config.api_from else "", - self.connector_config.api_until if self.connector_config.api_until else "", - ) - ) - - hashed_dir_name = hashlib.sha256( - base_path.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="biomed", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.biomed import ( - BiomedSourceConnector, - ) - - return BiomedSourceConnector diff --git a/unstructured/ingest/runner/confluence.py b/unstructured/ingest/runner/confluence.py deleted file mode 100644 index 3f6057512..000000000 --- a/unstructured/ingest/runner/confluence.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.confluence import SimpleConfluenceConfig - - -@dataclass -class ConfluenceRunner(Runner): - connector_config: "SimpleConfluenceConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - self.connector_config.url.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="confluence", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.confluence import ( - ConfluenceSourceConnector, - ) - - return ConfluenceSourceConnector diff --git a/unstructured/ingest/runner/delta_table.py b/unstructured/ingest/runner/delta_table.py deleted file mode 100644 index 5dc418710..000000000 --- a/unstructured/ingest/runner/delta_table.py +++ /dev/null @@ -1,34 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import 
BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.delta_table import SimpleDeltaTableConfig - - -@dataclass -class DeltaTableRunner(Runner): - connector_config: "SimpleDeltaTableConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - str(self.connector_config.table_uri).encode("utf-8"), - ) - self.read_config.download_dir = update_download_dir_hash( - connector_name="delta_table", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.delta_table import ( - DeltaTableSourceConnector, - ) - - return DeltaTableSourceConnector diff --git a/unstructured/ingest/runner/discord.py b/unstructured/ingest/runner/discord.py deleted file mode 100644 index 28f11a9be..000000000 --- a/unstructured/ingest/runner/discord.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.discord import SimpleDiscordConfig - - -@dataclass -class DiscordRunner(Runner): - connector_config: "SimpleDiscordConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - ",".join(self.connector_config.channels).encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="discord", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.discord import ( - DiscordSourceConnector, - ) - - return DiscordSourceConnector diff --git a/unstructured/ingest/runner/elasticsearch.py b/unstructured/ingest/runner/elasticsearch.py deleted file mode 100644 index a1cb75b84..000000000 --- a/unstructured/ingest/runner/elasticsearch.py +++ /dev/null @@ -1,40 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.elasticsearch import SimpleElasticsearchConfig - - -@dataclass -class ElasticSearchRunner(Runner): - connector_config: "SimpleElasticsearchConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - "{}_{}".format( - ",".join(self.connector_config.access_config.hosts), - self.connector_config.index_name, - ).encode( - "utf-8", - ), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="elasticsearch", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchSourceConnector, - ) - - return ElasticsearchSourceConnector diff --git a/unstructured/ingest/runner/fsspec/__init__.py 
b/unstructured/ingest/runner/fsspec/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/runner/fsspec/azure.py b/unstructured/ingest/runner/fsspec/azure.py deleted file mode 100644 index e92f4502f..000000000 --- a/unstructured/ingest/runner/fsspec/azure.py +++ /dev/null @@ -1,30 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_remote_url - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.azure import SimpleAzureBlobStorageConfig - - -@dataclass -class AzureRunner(Runner): - connector_config: "SimpleAzureBlobStorageConfig" - - def update_read_config(self): - self.read_config.download_dir = update_download_dir_remote_url( - connector_name="azure", - read_config=self.read_config, - remote_url=self.connector_config.remote_url, # type: ignore - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.fsspec.azure import ( - AzureBlobStorageSourceConnector, - ) - - return AzureBlobStorageSourceConnector diff --git a/unstructured/ingest/runner/fsspec/box.py b/unstructured/ingest/runner/fsspec/box.py deleted file mode 100644 index c219576f5..000000000 --- a/unstructured/ingest/runner/fsspec/box.py +++ /dev/null @@ -1,28 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_remote_url - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.box import SimpleBoxConfig - - -@dataclass -class BoxRunner(Runner): - connector_config: "SimpleBoxConfig" - - def update_read_config(self): - self.read_config.download_dir = update_download_dir_remote_url( - connector_name="box", - read_config=self.read_config, - remote_url=self.connector_config.remote_url, # type: ignore - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.fsspec.box import BoxSourceConnector - - return BoxSourceConnector diff --git a/unstructured/ingest/runner/fsspec/dropbox.py b/unstructured/ingest/runner/fsspec/dropbox.py deleted file mode 100644 index ef408918c..000000000 --- a/unstructured/ingest/runner/fsspec/dropbox.py +++ /dev/null @@ -1,30 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_remote_url - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.dropbox import SimpleDropboxConfig - - -@dataclass -class DropboxRunner(Runner): - connector_config: "SimpleDropboxConfig" - - def update_read_config(self): - self.read_config.download_dir = update_download_dir_remote_url( - connector_name="dropbox", - read_config=self.read_config, - remote_url=self.connector_config.remote_url, # type: ignore - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.fsspec.dropbox import ( - DropboxSourceConnector, - ) - - 
return DropboxSourceConnector diff --git a/unstructured/ingest/runner/fsspec/fsspec.py b/unstructured/ingest/runner/fsspec/fsspec.py deleted file mode 100644 index e98251a81..000000000 --- a/unstructured/ingest/runner/fsspec/fsspec.py +++ /dev/null @@ -1,40 +0,0 @@ -import typing as t -import warnings -from dataclasses import dataclass -from urllib.parse import urlparse - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_remote_url - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.fsspec import SimpleFsspecConfig - - -@dataclass -class FsspecRunner(Runner): - connector_config: "SimpleFsspecConfig" - - def update_read_config(self): - self.read_config.download_dir = update_download_dir_remote_url( - connector_name="fsspec", - read_config=self.read_config, - remote_url=self.fsspec_config.remote_url, # type: ignore - logger=logger, - ) - - protocol = urlparse(self.fsspec_config.remote_url).scheme # type: ignore - warnings.warn( - f"`fsspec` protocol {protocol} is not directly supported by `unstructured`," - " so use it at your own risk. Supported protocols are `gcs`, `gs`, `s3`, `s3a`," - "`dropbox`, `abfs`, `az` and `sftp`.", - UserWarning, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.fsspec.fsspec import ( - FsspecSourceConnector, - ) - - return FsspecSourceConnector diff --git a/unstructured/ingest/runner/fsspec/gcs.py b/unstructured/ingest/runner/fsspec/gcs.py deleted file mode 100644 index 1c3e043e3..000000000 --- a/unstructured/ingest/runner/fsspec/gcs.py +++ /dev/null @@ -1,28 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_remote_url - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.gcs import SimpleGcsConfig - - -@dataclass -class GCSRunner(Runner): - connector_config: "SimpleGcsConfig" - - def update_read_config(self): - self.read_config.download_dir = update_download_dir_remote_url( - connector_name="gcs", - read_config=self.read_config, - remote_url=self.connector_config.remote_url, # type: ignore - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.fsspec.gcs import GcsSourceConnector - - return GcsSourceConnector diff --git a/unstructured/ingest/runner/fsspec/s3.py b/unstructured/ingest/runner/fsspec/s3.py deleted file mode 100644 index 086e2a58d..000000000 --- a/unstructured/ingest/runner/fsspec/s3.py +++ /dev/null @@ -1,28 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_remote_url - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.s3 import SimpleS3Config - - -@dataclass -class S3Runner(Runner): - connector_config: "SimpleS3Config" - - def update_read_config(self): - self.read_config.download_dir = update_download_dir_remote_url( - connector_name="s3", - read_config=self.read_config, - 
remote_url=self.connector_config.remote_url, # type: ignore - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.fsspec.s3 import S3SourceConnector - - return S3SourceConnector diff --git a/unstructured/ingest/runner/fsspec/sftp.py b/unstructured/ingest/runner/fsspec/sftp.py deleted file mode 100644 index db73ad7e1..000000000 --- a/unstructured/ingest/runner/fsspec/sftp.py +++ /dev/null @@ -1,28 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_remote_url - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.sftp import SimpleSftpConfig - - -@dataclass -class SftpRunner(Runner): - connector_config: "SimpleSftpConfig" - - def update_read_config(self): - self.read_config.download_dir = update_download_dir_remote_url( - connector_name="sftp", - read_config=self.read_config, - remote_url=self.connector_config.remote_url, # type: ignore - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.fsspec.sftp import SftpSourceConnector - - return SftpSourceConnector diff --git a/unstructured/ingest/runner/github.py b/unstructured/ingest/runner/github.py deleted file mode 100644 index 86cf191be..000000000 --- a/unstructured/ingest/runner/github.py +++ /dev/null @@ -1,37 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.github import SimpleGitHubConfig - - -@dataclass -class GithubRunner(Runner): - connector_config: "SimpleGitHubConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - f"{self.connector_config.url}_{self.connector_config.branch}".encode( - "utf-8", - ), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="github", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.github import ( - GitHubSourceConnector, - ) - - return GitHubSourceConnector diff --git a/unstructured/ingest/runner/gitlab.py b/unstructured/ingest/runner/gitlab.py deleted file mode 100644 index c6b8e5c3a..000000000 --- a/unstructured/ingest/runner/gitlab.py +++ /dev/null @@ -1,37 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.gitlab import SimpleGitlabConfig - - -@dataclass -class GitlabRunner(Runner): - connector_config: "SimpleGitlabConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - f"{self.connector_config.url}_{self.connector_config.branch}".encode( - "utf-8", - ), - ) - - self.read_config.download_dir = update_download_dir_hash( - 
connector_name="gitlab", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.gitlab import ( - GitLabSourceConnector, - ) - - return GitLabSourceConnector diff --git a/unstructured/ingest/runner/google_drive.py b/unstructured/ingest/runner/google_drive.py deleted file mode 100644 index 8972c9a15..000000000 --- a/unstructured/ingest/runner/google_drive.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.google_drive import SimpleGoogleDriveConfig - - -@dataclass -class GoogleDriveRunner(Runner): - connector_config: "SimpleGoogleDriveConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - self.connector_config.drive_id.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="google_drive", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.google_drive import ( - GoogleDriveSourceConnector, - ) - - return GoogleDriveSourceConnector diff --git a/unstructured/ingest/runner/hubspot.py b/unstructured/ingest/runner/hubspot.py deleted file mode 100644 index 2e988e759..000000000 --- a/unstructured/ingest/runner/hubspot.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.hubspot import SimpleHubSpotConfig - - -@dataclass -class HubSpotRunner(Runner): - connector_config: "SimpleHubSpotConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - self.connector_config.access_config.api_token.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="hubspot", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.hubspot import ( - HubSpotSourceConnector, - ) - - return HubSpotSourceConnector diff --git a/unstructured/ingest/runner/jira.py b/unstructured/ingest/runner/jira.py deleted file mode 100644 index d632de9d8..000000000 --- a/unstructured/ingest/runner/jira.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.jira import SimpleJiraConfig - - -@dataclass -class JiraRunner(Runner): - connector_config: "SimpleJiraConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - 
self.connector_config.url.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="jira", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.jira import ( - JiraSourceConnector, - ) - - return JiraSourceConnector diff --git a/unstructured/ingest/runner/kafka.py b/unstructured/ingest/runner/kafka.py deleted file mode 100644 index ba8a75094..000000000 --- a/unstructured/ingest/runner/kafka.py +++ /dev/null @@ -1,34 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.kafka import SimpleKafkaConfig - - -@dataclass -class KafkaRunner(Runner): - connector_config: "SimpleKafkaConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - str(self.connector_config.bootstrap_server).encode("utf-8"), - ) - self.read_config.download_dir = update_download_dir_hash( - connector_name="kafka", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.kafka import ( - KafkaSourceConnector, - ) - - return KafkaSourceConnector diff --git a/unstructured/ingest/runner/local.py b/unstructured/ingest/runner/local.py deleted file mode 100644 index a8c4ab19c..000000000 --- a/unstructured/ingest/runner/local.py +++ /dev/null @@ -1,23 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.runner.base_runner import Runner - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.local import SimpleLocalConfig - - -@dataclass -class LocalRunner(Runner): - connector_config: "SimpleLocalConfig" - - def update_read_config(self): - pass - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.local import ( - LocalSourceConnector, - ) - - return LocalSourceConnector diff --git a/unstructured/ingest/runner/mongodb.py b/unstructured/ingest/runner/mongodb.py deleted file mode 100644 index bdde249cd..000000000 --- a/unstructured/ingest/runner/mongodb.py +++ /dev/null @@ -1,34 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.mongodb import SimpleMongoDBConfig - - -@dataclass -class MongoDBRunner(Runner): - connector_config: "SimpleMongoDBConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - str(self.connector_config.access_config.uri).encode("utf-8"), - ) - self.read_config.download_dir = update_download_dir_hash( - connector_name="mongodb", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.mongodb import ( - 
MongoDBSourceConnector, - ) - - return MongoDBSourceConnector diff --git a/unstructured/ingest/runner/notion.py b/unstructured/ingest/runner/notion.py deleted file mode 100644 index ee7fd9c5e..000000000 --- a/unstructured/ingest/runner/notion.py +++ /dev/null @@ -1,61 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.notion.connector import SimpleNotionConfig - - -@dataclass -class NotionRunner(Runner): - connector_config: "SimpleNotionConfig" - - def update_read_config(self): - if not self.connector_config.page_ids and not self.connector_config.database_ids: - raise ValueError("no page ids nor database ids provided") - - if self.connector_config.page_ids and self.connector_config.database_ids: - hashed_dir_name = hashlib.sha256( - "{},{}".format( - ",".join(self.connector_config.page_ids), - ",".join(self.connector_config.database_ids), - ).encode("utf-8"), - ) - elif self.connector_config.page_ids: - hashed_dir_name = hashlib.sha256( - ",".join(self.connector_config.page_ids).encode("utf-8"), - ) - elif self.connector_config.database_ids: - hashed_dir_name = hashlib.sha256( - ",".join(self.connector_config.database_ids).encode("utf-8"), - ) - else: - raise ValueError("could not create local cache directory name") - - self.read_config.download_dir = update_download_dir_hash( - connector_name="notion", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.notion.connector import ( - NotionSourceConnector, - ) - - return NotionSourceConnector - - def get_source_connector(self) -> BaseSourceConnector: - source_connector_cls = self.get_source_connector_cls() - return source_connector_cls( - processor_config=self.processor_config, - connector_config=self.connector_config, - read_config=self.read_config, - retry_strategy_config=self.retry_strategy_config, - ) diff --git a/unstructured/ingest/runner/onedrive.py b/unstructured/ingest/runner/onedrive.py deleted file mode 100644 index 6c2312614..000000000 --- a/unstructured/ingest/runner/onedrive.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.onedrive import SimpleOneDriveConfig - - -@dataclass -class OneDriveRunner(Runner): - connector_config: "SimpleOneDriveConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - f"{self.connector_config.tenant}_{self.connector_config.user_pname}".encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="onedrive", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.onedrive import ( - OneDriveSourceConnector, - ) - - return OneDriveSourceConnector diff --git 
a/unstructured/ingest/runner/opensearch.py b/unstructured/ingest/runner/opensearch.py deleted file mode 100644 index e3ce03a71..000000000 --- a/unstructured/ingest/runner/opensearch.py +++ /dev/null @@ -1,40 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.opensearch import SimpleOpenSearchConfig - - -@dataclass -class OpenSearchRunner(Runner): - connector_config: "SimpleOpenSearchConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - "{}_{}".format( - ",".join(self.connector_config.access_config.hosts), - self.connector_config.index_name, - ).encode( - "utf-8", - ), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="opensearch", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.opensearch import ( - OpenSearchSourceConnector, - ) - - return OpenSearchSourceConnector diff --git a/unstructured/ingest/runner/outlook.py b/unstructured/ingest/runner/outlook.py deleted file mode 100644 index 3672dacef..000000000 --- a/unstructured/ingest/runner/outlook.py +++ /dev/null @@ -1,33 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.outlook import SimpleOutlookConfig - - -@dataclass -class OutlookRunner(Runner): - connector_config: "SimpleOutlookConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256(self.connector_config.user_email.encode("utf-8")) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="outlook", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.outlook import ( - OutlookSourceConnector, - ) - - return OutlookSourceConnector diff --git a/unstructured/ingest/runner/reddit.py b/unstructured/ingest/runner/reddit.py deleted file mode 100644 index 0d59acd74..000000000 --- a/unstructured/ingest/runner/reddit.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.reddit import SimpleRedditConfig - - -@dataclass -class RedditRunner(Runner): - connector_config: "SimpleRedditConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - self.connector_config.subreddit_name.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="reddit", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def 
get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.reddit import ( - RedditSourceConnector, - ) - - return RedditSourceConnector diff --git a/unstructured/ingest/runner/salesforce.py b/unstructured/ingest/runner/salesforce.py deleted file mode 100644 index 06326e556..000000000 --- a/unstructured/ingest/runner/salesforce.py +++ /dev/null @@ -1,33 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.salesforce import SimpleSalesforceConfig - - -@dataclass -class SalesforceRunner(Runner): - connector_config: "SimpleSalesforceConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256(self.connector_config.username.encode("utf-8")) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="salesforce", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.salesforce import ( - SalesforceSourceConnector, - ) - - return SalesforceSourceConnector diff --git a/unstructured/ingest/runner/sharepoint.py b/unstructured/ingest/runner/sharepoint.py deleted file mode 100644 index f5e0dd36b..000000000 --- a/unstructured/ingest/runner/sharepoint.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.sharepoint import SimpleSharepointConfig - - -@dataclass -class SharePointRunner(Runner): - connector_config: "SimpleSharepointConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - f"{self.connector_config.site}_{self.connector_config.path}".encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="sharepoint", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.sharepoint import ( - SharepointSourceConnector, - ) - - return SharepointSourceConnector diff --git a/unstructured/ingest/runner/slack.py b/unstructured/ingest/runner/slack.py deleted file mode 100644 index 2d4231473..000000000 --- a/unstructured/ingest/runner/slack.py +++ /dev/null @@ -1,33 +0,0 @@ -import hashlib -import typing as t - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.slack import SimpleSlackConfig - - -class SlackRunner(Runner): - connector_config: "SimpleSlackConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - ",".join(self.connector_config.channels).encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="slack", 
- read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.slack import ( - SlackSourceConnector, - ) - - return SlackSourceConnector diff --git a/unstructured/ingest/runner/utils.py b/unstructured/ingest/runner/utils.py deleted file mode 100644 index 0816923ed..000000000 --- a/unstructured/ingest/runner/utils.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -import hashlib -import logging -from pathlib import Path - -from unstructured.ingest.interfaces import ( - ReadConfig, -) - - -def update_download_dir_remote_url( - connector_name: str, - read_config: ReadConfig, - remote_url: str, - logger: logging.Logger, -) -> str: - hashed_dir_name = hashlib.sha256(remote_url.encode("utf-8")) - return update_download_dir_hash( - connector_name=connector_name, - read_config=read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - -def update_download_dir_hash( - connector_name: str, - read_config: ReadConfig, - hashed_dir_name: hashlib._Hash, - logger: logging.Logger, -) -> str: - if not read_config.download_dir: - cache_path = Path.home() / ".cache" / "unstructured" / "ingest" - if not cache_path.exists(): - cache_path.mkdir(parents=True, exist_ok=True) - download_dir = cache_path / connector_name / hashed_dir_name.hexdigest()[:10] - if read_config.preserve_downloads: - logger.warning( - f"Preserving downloaded files but download_dir is not specified," - f" using {download_dir}", - ) - new_download_dir = str(download_dir) - logger.debug(f"updating download directory to: {new_download_dir}") - else: - new_download_dir = read_config.download_dir - return new_download_dir diff --git a/unstructured/ingest/runner/wikipedia.py b/unstructured/ingest/runner/wikipedia.py deleted file mode 100644 index 7a67dcd43..000000000 --- a/unstructured/ingest/runner/wikipedia.py +++ /dev/null @@ -1,35 +0,0 @@ -import hashlib -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseSourceConnector -from unstructured.ingest.logger import logger -from unstructured.ingest.runner.base_runner import Runner -from unstructured.ingest.runner.utils import update_download_dir_hash - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.wikipedia import SimpleWikipediaConfig - - -@dataclass -class WikipediaRunner(Runner): - connector_config: "SimpleWikipediaConfig" - - def update_read_config(self): - hashed_dir_name = hashlib.sha256( - self.connector_config.page_title.encode("utf-8"), - ) - - self.read_config.download_dir = update_download_dir_hash( - connector_name="wikipedia", - read_config=self.read_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]: - from unstructured.ingest.connector.wikipedia import ( - WikipediaSourceConnector, - ) - - return WikipediaSourceConnector diff --git a/unstructured/ingest/runner/writers/__init__.py b/unstructured/ingest/runner/writers/__init__.py deleted file mode 100644 index 8b07adb9e..000000000 --- a/unstructured/ingest/runner/writers/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -import typing as t - -from .astradb import AstraDBWriter -from .azure_cognitive_search import AzureCognitiveSearchWriter -from .base_writer import Writer -from .chroma import ChromaWriter -from .clarifai import ClarifaiWriter -from .databricks_volumes import DatabricksVolumesWriter -from .delta_table import 
DeltaTableWriter -from .elasticsearch import ElasticsearchWriter -from .fsspec.azure import AzureWriter -from .fsspec.box import BoxWriter -from .fsspec.dropbox import DropboxWriter -from .fsspec.gcs import GcsWriter -from .fsspec.s3 import S3Writer -from .kafka import KafkaWriter -from .mongodb import MongodbWriter -from .opensearch import OpenSearchWriter -from .pinecone import PineconeWriter -from .qdrant import QdrantWriter -from .sql import SqlWriter -from .vectara import VectaraWriter -from .weaviate import WeaviateWriter - -writer_map: t.Dict[str, t.Type[Writer]] = { - "astradb": AstraDBWriter, - "azure": AzureWriter, - "azure_cognitive_search": AzureCognitiveSearchWriter, - "box": BoxWriter, - "chroma": ChromaWriter, - "clarifai": ClarifaiWriter, - "databricks_volumes": DatabricksVolumesWriter, - "delta_table": DeltaTableWriter, - "dropbox": DropboxWriter, - "elasticsearch": ElasticsearchWriter, - "gcs": GcsWriter, - "kafka": KafkaWriter, - "mongodb": MongodbWriter, - "opensearch": OpenSearchWriter, - "pinecone": PineconeWriter, - "qdrant": QdrantWriter, - "s3": S3Writer, - "sql": SqlWriter, - "vectara": VectaraWriter, - "weaviate": WeaviateWriter, -} - -__all__ = ["writer_map"] diff --git a/unstructured/ingest/runner/writers/astradb.py b/unstructured/ingest/runner/writers/astradb.py deleted file mode 100644 index b12ee7234..000000000 --- a/unstructured/ingest/runner/writers/astradb.py +++ /dev/null @@ -1,22 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.astradb import AstraDBWriteConfig, SimpleAstraDBConfig - - -@dataclass -class AstraDBWriter(Writer, EnhancedDataClassJsonMixin): - write_config: "AstraDBWriteConfig" - connector_config: "SimpleAstraDBConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.astradb import ( - AstraDBDestinationConnector, - ) - - return AstraDBDestinationConnector diff --git a/unstructured/ingest/runner/writers/azure_cognitive_search.py b/unstructured/ingest/runner/writers/azure_cognitive_search.py deleted file mode 100644 index 69204e3f3..000000000 --- a/unstructured/ingest/runner/writers/azure_cognitive_search.py +++ /dev/null @@ -1,24 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.azure_cognitive_search import ( - AzureCognitiveSearchWriteConfig, - SimpleAzureCognitiveSearchStorageConfig, - ) - - -@dataclass -class AzureCognitiveSearchWriter(Writer): - connector_config: "SimpleAzureCognitiveSearchStorageConfig" - write_config: "AzureCognitiveSearchWriteConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.azure_cognitive_search import ( - AzureCognitiveSearchDestinationConnector, - ) - - return AzureCognitiveSearchDestinationConnector diff --git a/unstructured/ingest/runner/writers/base_writer.py b/unstructured/ingest/runner/writers/base_writer.py deleted file mode 100644 index e28d11b07..000000000 --- a/unstructured/ingest/runner/writers/base_writer.py +++ /dev/null @@ -1,26 +0,0 @@ -import typing as t -from abc import ABC, 
abstractmethod -from dataclasses import dataclass - -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseDestinationConnector, - WriteConfig, -) - - -@dataclass -class Writer(ABC): - connector_config: BaseConnectorConfig - write_config: WriteConfig - - @abstractmethod - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - pass - - def get_connector(self, **kwargs) -> BaseDestinationConnector: - connector_cls = self.get_connector_cls() - return connector_cls( - write_config=self.write_config, - connector_config=self.connector_config, - ) diff --git a/unstructured/ingest/runner/writers/chroma.py b/unstructured/ingest/runner/writers/chroma.py deleted file mode 100644 index e41753d01..000000000 --- a/unstructured/ingest/runner/writers/chroma.py +++ /dev/null @@ -1,22 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.chroma import ChromaWriteConfig, SimpleChromaConfig - - -@dataclass -class ChromaWriter(Writer, EnhancedDataClassJsonMixin): - write_config: "ChromaWriteConfig" - connector_config: "SimpleChromaConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.chroma import ( - ChromaDestinationConnector, - ) - - return ChromaDestinationConnector diff --git a/unstructured/ingest/runner/writers/clarifai.py b/unstructured/ingest/runner/writers/clarifai.py deleted file mode 100644 index 9742e1eee..000000000 --- a/unstructured/ingest/runner/writers/clarifai.py +++ /dev/null @@ -1,19 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.clarifai import ClarifaiWriteConfig, SimpleClarifaiConfig - - -@dataclass -class ClarifaiWriter(Writer): - write_config: "ClarifaiWriteConfig" - connector_config: "SimpleClarifaiConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.clarifai import ClarifaiDestinationConnector - - return ClarifaiDestinationConnector diff --git a/unstructured/ingest/runner/writers/databricks_volumes.py b/unstructured/ingest/runner/writers/databricks_volumes.py deleted file mode 100644 index 74703f850..000000000 --- a/unstructured/ingest/runner/writers/databricks_volumes.py +++ /dev/null @@ -1,25 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.databricks_volumes import ( - DatabricksVolumesWriteConfig, - SimpleDatabricksVolumesConfig, - ) - - -@dataclass -class DatabricksVolumesWriter(Writer, EnhancedDataClassJsonMixin): - write_config: "DatabricksVolumesWriteConfig" - connector_config: "SimpleDatabricksVolumesConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.databricks_volumes import ( - DatabricksVolumesDestinationConnector, - ) - - return DatabricksVolumesDestinationConnector diff --git 
a/unstructured/ingest/runner/writers/delta_table.py b/unstructured/ingest/runner/writers/delta_table.py deleted file mode 100644 index 6337e03d9..000000000 --- a/unstructured/ingest/runner/writers/delta_table.py +++ /dev/null @@ -1,24 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.delta_table import ( - DeltaTableWriteConfig, - SimpleDeltaTableConfig, - ) - - -@dataclass -class DeltaTableWriter(Writer): - write_config: "DeltaTableWriteConfig" - connector_config: "SimpleDeltaTableConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.delta_table import ( - DeltaTableDestinationConnector, - ) - - return DeltaTableDestinationConnector diff --git a/unstructured/ingest/runner/writers/elasticsearch.py b/unstructured/ingest/runner/writers/elasticsearch.py deleted file mode 100644 index 7ce8b451f..000000000 --- a/unstructured/ingest/runner/writers/elasticsearch.py +++ /dev/null @@ -1,24 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchWriteConfig, - SimpleElasticsearchConfig, - ) - - -@dataclass -class ElasticsearchWriter(Writer): - connector_config: "SimpleElasticsearchConfig" - write_config: "ElasticsearchWriteConfig" - - def get_connector_cls(self) -> BaseDestinationConnector: - from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchDestinationConnector, - ) - - return ElasticsearchDestinationConnector diff --git a/unstructured/ingest/runner/writers/fsspec/__init__.py b/unstructured/ingest/runner/writers/fsspec/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/runner/writers/fsspec/azure.py b/unstructured/ingest/runner/writers/fsspec/azure.py deleted file mode 100644 index 66835898e..000000000 --- a/unstructured/ingest/runner/writers/fsspec/azure.py +++ /dev/null @@ -1,24 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.azure import ( - AzureWriteConfig, - SimpleAzureBlobStorageConfig, - ) - - -@dataclass -class AzureWriter(Writer): - connector_config: "SimpleAzureBlobStorageConfig" - write_config: "AzureWriteConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.fsspec.azure import ( - AzureBlobStorageDestinationConnector, - ) - - return AzureBlobStorageDestinationConnector diff --git a/unstructured/ingest/runner/writers/fsspec/box.py b/unstructured/ingest/runner/writers/fsspec/box.py deleted file mode 100644 index 5f4599a40..000000000 --- a/unstructured/ingest/runner/writers/fsspec/box.py +++ /dev/null @@ -1,21 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.box import BoxWriteConfig, SimpleBoxConfig - - -@dataclass -class 
BoxWriter(Writer): - connector_config: "SimpleBoxConfig" - write_config: "BoxWriteConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.fsspec.box import ( - BoxDestinationConnector, - ) - - return BoxDestinationConnector diff --git a/unstructured/ingest/runner/writers/fsspec/dropbox.py b/unstructured/ingest/runner/writers/fsspec/dropbox.py deleted file mode 100644 index 0c9389079..000000000 --- a/unstructured/ingest/runner/writers/fsspec/dropbox.py +++ /dev/null @@ -1,21 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.dropbox import DropboxWriteConfig, SimpleDropboxConfig - - -@dataclass -class DropboxWriter(Writer): - connector_config: "SimpleDropboxConfig" - write_config: "DropboxWriteConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.fsspec.dropbox import ( - DropboxDestinationConnector, - ) - - return DropboxDestinationConnector diff --git a/unstructured/ingest/runner/writers/fsspec/gcs.py b/unstructured/ingest/runner/writers/fsspec/gcs.py deleted file mode 100644 index 728a109d2..000000000 --- a/unstructured/ingest/runner/writers/fsspec/gcs.py +++ /dev/null @@ -1,19 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.gcs import GcsWriteConfig, SimpleGcsConfig - - -@dataclass -class GcsWriter(Writer): - connector_config: "SimpleGcsConfig" - write_config: "GcsWriteConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.fsspec.gcs import GcsDestinationConnector - - return GcsDestinationConnector diff --git a/unstructured/ingest/runner/writers/fsspec/s3.py b/unstructured/ingest/runner/writers/fsspec/s3.py deleted file mode 100644 index 64d2b3131..000000000 --- a/unstructured/ingest/runner/writers/fsspec/s3.py +++ /dev/null @@ -1,21 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.fsspec.s3 import S3WriteConfig, SimpleS3Config - - -@dataclass -class S3Writer(Writer): - connector_config: "SimpleS3Config" - write_config: "S3WriteConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.fsspec.s3 import ( - S3DestinationConnector, - ) - - return S3DestinationConnector diff --git a/unstructured/ingest/runner/writers/kafka.py b/unstructured/ingest/runner/writers/kafka.py deleted file mode 100644 index f8e5a3e3d..000000000 --- a/unstructured/ingest/runner/writers/kafka.py +++ /dev/null @@ -1,21 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.kafka import KafkaWriteConfig, SimpleKafkaConfig - - -@dataclass -class KafkaWriter(Writer): - write_config: "KafkaWriteConfig" - connector_config: "SimpleKafkaConfig" - - def 
get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.kafka import ( - KafkaDestinationConnector, - ) - - return KafkaDestinationConnector diff --git a/unstructured/ingest/runner/writers/mongodb.py b/unstructured/ingest/runner/writers/mongodb.py deleted file mode 100644 index 5798a0161..000000000 --- a/unstructured/ingest/runner/writers/mongodb.py +++ /dev/null @@ -1,21 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.mongodb import MongoDBWriteConfig, SimpleMongoDBConfig - - -@dataclass -class MongodbWriter(Writer): - write_config: "MongoDBWriteConfig" - connector_config: "SimpleMongoDBConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.mongodb import ( - MongoDBDestinationConnector, - ) - - return MongoDBDestinationConnector diff --git a/unstructured/ingest/runner/writers/opensearch.py b/unstructured/ingest/runner/writers/opensearch.py deleted file mode 100644 index f0c62b578..000000000 --- a/unstructured/ingest/runner/writers/opensearch.py +++ /dev/null @@ -1,26 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchWriteConfig, - ) - from unstructured.ingest.connector.opensearch import ( - SimpleOpenSearchConfig, - ) - - -@dataclass -class OpenSearchWriter(Writer): - connector_config: "SimpleOpenSearchConfig" - write_config: "ElasticsearchWriteConfig" - - def get_connector_cls(self) -> BaseDestinationConnector: - from unstructured.ingest.connector.opensearch import ( - OpenSearchDestinationConnector, - ) - - return OpenSearchDestinationConnector diff --git a/unstructured/ingest/runner/writers/pinecone.py b/unstructured/ingest/runner/writers/pinecone.py deleted file mode 100644 index 86fd9580a..000000000 --- a/unstructured/ingest/runner/writers/pinecone.py +++ /dev/null @@ -1,21 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.pinecone import PineconeWriteConfig, SimplePineconeConfig - - -@dataclass -class PineconeWriter(Writer): - write_config: "PineconeWriteConfig" - connector_config: "SimplePineconeConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.pinecone import ( - PineconeDestinationConnector, - ) - - return PineconeDestinationConnector diff --git a/unstructured/ingest/runner/writers/qdrant.py b/unstructured/ingest/runner/writers/qdrant.py deleted file mode 100644 index e7e632405..000000000 --- a/unstructured/ingest/runner/writers/qdrant.py +++ /dev/null @@ -1,19 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.qdrant import QdrantWriteConfig, SimpleQdrantConfig - - -@dataclass -class QdrantWriter(Writer): - write_config: "QdrantWriteConfig" - 
connector_config: "SimpleQdrantConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.qdrant import QdrantDestinationConnector - - return QdrantDestinationConnector diff --git a/unstructured/ingest/runner/writers/sql.py b/unstructured/ingest/runner/writers/sql.py deleted file mode 100644 index 70c710a1f..000000000 --- a/unstructured/ingest/runner/writers/sql.py +++ /dev/null @@ -1,22 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.sql import SimpleSqlConfig - from unstructured.ingest.interfaces import WriteConfig - - -@dataclass -class SqlWriter(Writer): - write_config: "WriteConfig" - connector_config: "SimpleSqlConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.sql import ( - SqlDestinationConnector, - ) - - return SqlDestinationConnector diff --git a/unstructured/ingest/runner/writers/vectara.py b/unstructured/ingest/runner/writers/vectara.py deleted file mode 100644 index f29128022..000000000 --- a/unstructured/ingest/runner/writers/vectara.py +++ /dev/null @@ -1,22 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.vectara import SimpleVectaraConfig, VectaraWriteConfig - - -@dataclass -class VectaraWriter(Writer, EnhancedDataClassJsonMixin): - write_config: "VectaraWriteConfig" - connector_config: "SimpleVectaraConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.vectara import ( - VectaraDestinationConnector, - ) - - return VectaraDestinationConnector diff --git a/unstructured/ingest/runner/writers/weaviate.py b/unstructured/ingest/runner/writers/weaviate.py deleted file mode 100644 index 96c7b0071..000000000 --- a/unstructured/ingest/runner/writers/weaviate.py +++ /dev/null @@ -1,21 +0,0 @@ -import typing as t -from dataclasses import dataclass - -from unstructured.ingest.interfaces import BaseDestinationConnector -from unstructured.ingest.runner.writers.base_writer import Writer - -if t.TYPE_CHECKING: - from unstructured.ingest.connector.weaviate import SimpleWeaviateConfig, WeaviateWriteConfig - - -@dataclass -class WeaviateWriter(Writer): - write_config: "WeaviateWriteConfig" - connector_config: "SimpleWeaviateConfig" - - def get_connector_cls(self) -> t.Type[BaseDestinationConnector]: - from unstructured.ingest.connector.weaviate import ( - WeaviateDestinationConnector, - ) - - return WeaviateDestinationConnector diff --git a/unstructured/ingest/utils/__init__.py b/unstructured/ingest/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/utils/compression.py b/unstructured/ingest/utils/compression.py deleted file mode 100644 index 41f4b3240..000000000 --- a/unstructured/ingest/utils/compression.py +++ /dev/null @@ -1,117 +0,0 @@ -import copy -import os -import sys -import tarfile -import zipfile -from dataclasses import dataclass -from pathlib import Path -from typing import List, Optional - -from unstructured.ingest.connector.local import 
LocalSourceConnector, SimpleLocalConfig -from unstructured.ingest.interfaces import ( - BaseConnectorConfig, - BaseSingleIngestDoc, - ProcessorConfig, - ReadConfig, -) -from unstructured.ingest.logger import logger - -ZIP_FILE_EXT = [".zip"] -TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"] - - -def uncompress_file(filename: str, path: Optional[str] = None) -> str: - """ - Takes in a compressed zip or tar file and uncompresses it - """ - # Create path if it doesn't already exist - if path: - Path(path).mkdir(parents=True, exist_ok=True) - - if any(filename.endswith(ext) for ext in ZIP_FILE_EXT): - return uncompress_zip_file(zip_filename=filename, path=path) - elif any(filename.endswith(ext) for ext in TAR_FILE_EXT): - return uncompress_tar_file(tar_filename=filename, path=path) - else: - raise ValueError( - "filename {} not a recognized compressed extension: {}".format( - filename, - ", ".join(ZIP_FILE_EXT + TAR_FILE_EXT), - ), - ) - - -def uncompress_zip_file(zip_filename: str, path: Optional[str] = None) -> str: - head, tail = os.path.split(zip_filename) - for ext in ZIP_FILE_EXT: - if tail.endswith(ext): - tail = tail[: -(len(ext))] - break - path = path if path else os.path.join(head, f"{tail}-zip-uncompressed") - logger.info(f"extracting zip {zip_filename} -> {path}") - with zipfile.ZipFile(zip_filename) as zfile: - zfile.extractall(path=path) - return path - - -def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str: - head, tail = os.path.split(tar_filename) - for ext in TAR_FILE_EXT: - if tail.endswith(ext): - tail = tail[: -(len(ext))] - break - - path = path if path else os.path.join(head, f"{tail}-tar-uncompressed") - logger.info(f"extracting tar {tar_filename} -> {path}") - with tarfile.open(tar_filename, "r:gz") as tfile: - # NOTE(robinson: Mitigate against malicious content being extracted from the tar file. - # This was added in Python 3.12 - # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters - if sys.version_info >= (3, 12): - tfile.extraction_filter = tarfile.tar_filter - else: - logger.warning( - "Extraction filtering for tar files is available for Python 3.12 and above. " - "Consider upgrading your Python version to improve security. " - "See https://docs.python.org/3/library/tarfile.html#extraction-filters" - ) - tfile.extractall(path=path) - return path - - -@dataclass -class CompressionSourceConnectorMixin: - processor_config: ProcessorConfig - read_config: ReadConfig - connector_config: BaseConnectorConfig - - def process_compressed_doc(self, doc: BaseSingleIngestDoc) -> List[BaseSingleIngestDoc]: - """ - Utility function which helps process compressed files. 
Extracts the contents and returns - generated ingest docs via local source connector - """ - # Download the raw file to local - doc.get_file() - path = uncompress_file(filename=str(doc.filename)) - new_read_configs = copy.copy(self.read_config) - new_process_configs = copy.copy(self.processor_config) - relative_path = path.replace(self.read_config.download_dir, "") - - if self.processor_config.output_dir.endswith(os.sep): - new_process_configs.output_dir = f"{self.processor_config.output_dir}{relative_path}" - else: - new_process_configs.output_dir = ( - f"{self.processor_config.output_dir}{os.sep}{relative_path}" - ) - - local_connector = LocalSourceConnector( - connector_config=SimpleLocalConfig( - input_path=path, - recursive=True, - ), - read_config=new_read_configs, - processor_config=new_process_configs, - ) - logger.info(f"Created local source connector: {local_connector.to_json()}") - local_connector.initialize() - return local_connector.get_ingest_docs() diff --git a/unstructured/ingest/utils/data_prep.py b/unstructured/ingest/utils/data_prep.py deleted file mode 100644 index 722de16e4..000000000 --- a/unstructured/ingest/utils/data_prep.py +++ /dev/null @@ -1,29 +0,0 @@ -import itertools -import json - - -def batch_generator(iterable, batch_size=100): - """A helper function to break an iterable into batches of size batch_size.""" - it = iter(iterable) - chunk = tuple(itertools.islice(it, batch_size)) - while chunk: - yield chunk - chunk = tuple(itertools.islice(it, batch_size)) - - -def generator_batching_wbytes(iterable, batch_size_limit_bytes=15_000_000): - """A helper function to break an iterable into chunks of specified bytes.""" - current_batch, current_batch_size = [], 0 - - for item in iterable: - item_size_bytes = len(json.dumps(item).encode("utf-8")) - - if current_batch_size + item_size_bytes <= batch_size_limit_bytes: - current_batch.append(item) - current_batch_size += item_size_bytes - else: - yield current_batch - current_batch, current_batch_size = [item], item_size_bytes - - if current_batch: - yield current_batch diff --git a/unstructured/ingest/utils/string_and_date_utils.py b/unstructured/ingest/utils/string_and_date_utils.py deleted file mode 100644 index 89f1ca84d..000000000 --- a/unstructured/ingest/utils/string_and_date_utils.py +++ /dev/null @@ -1,39 +0,0 @@ -import json -import typing as t -from datetime import datetime - -from dateutil import parser - - -def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]: - """Helper function attempts to deserialize json string to a dictionary.""" - try: - return json.loads(json_string) - except json.JSONDecodeError: - # Not neccessary an error if it is a path or malformed json - pass - try: - # This is common when single quotes are used instead of double quotes - return json.loads(json_string.replace("'", '"')) - except json.JSONDecodeError: - # Not neccessary an error if it is a path - pass - return json_string - - -def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str: - """ - Ensures that the input value is converted to an ISO format datetime string. - Handles both datetime objects and strings. 
- """ - if isinstance(timestamp, datetime): - return timestamp.isoformat() - elif isinstance(timestamp, str): - try: - # Parse the datetime string in various formats - dt = parser.parse(timestamp) - return dt.isoformat() - except ValueError as e: - raise ValueError(f"String '{timestamp}' could not be parsed as a datetime.") from e - else: - raise TypeError(f"Expected input type datetime or str, but got {type(timestamp)}.") diff --git a/unstructured/ingest/utils/table.py b/unstructured/ingest/utils/table.py deleted file mode 100644 index 65fd7b92f..000000000 --- a/unstructured/ingest/utils/table.py +++ /dev/null @@ -1,24 +0,0 @@ -import typing as t - -import pandas as pd - -from unstructured.staging.base import flatten_dict, get_default_pandas_dtypes - - -def convert_to_pandas_dataframe( - elements_dict: t.List[t.Dict[str, t.Any]], - drop_empty_cols: bool = False, -) -> pd.DataFrame: - # Flatten metadata if it hasn't already been flattened - for d in elements_dict: - if metadata := d.pop("metadata", None): - d.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"])) - - df = pd.DataFrame.from_dict( - elements_dict, - ) - dt = {k: v for k, v in get_default_pandas_dtypes().items() if k in df.columns} - df = df.astype(dt) - if drop_empty_cols: - df.dropna(axis=1, how="all", inplace=True) - return df diff --git a/unstructured/ingest/v2/README.md b/unstructured/ingest/v2/README.md deleted file mode 100644 index f7291aa5a..000000000 --- a/unstructured/ingest/v2/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Ingest -![Project unmaintained](https://img.shields.io/badge/project-unmaintained-red.svg) - -Project has been moved to: [Unstructured Ingest](https://github.com/Unstructured-IO/unstructured-ingest) - -This python module will be removed from this repo in the near future. 
diff --git a/unstructured/ingest/v2/__init__.py b/unstructured/ingest/v2/__init__.py deleted file mode 100644 index 9d48db4f9..000000000 --- a/unstructured/ingest/v2/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from __future__ import annotations diff --git a/unstructured/ingest/v2/assets/pipeline.excalidraw b/unstructured/ingest/v2/assets/pipeline.excalidraw deleted file mode 100644 index d59bc99dd..000000000 --- a/unstructured/ingest/v2/assets/pipeline.excalidraw +++ /dev/null @@ -1,1417 +0,0 @@ -{ - "type": "excalidraw", - "version": 2, - "source": "https://excalidraw.com", - "elements": [ - { - "id": "Y3a1yUDvwFK9AB6KmSl9a", - "type": "rectangle", - "x": 637.48046875, - "y": 239.11328125, - "width": 322.44921875, - "height": 97.015625, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1a", - "roundness": { - "type": 3 - }, - "seed": 2131406971, - "version": 139, - "versionNonce": 1482689781, - "isDeleted": false, - "boundElements": [ - { - "type": "text", - "id": "7paHS6cDsoMgh1vsOhizN" - }, - { - "id": "e6DNVpQ-gH7v6WNDWWSPD", - "type": "arrow" - } - ], - "updated": 1715951675553, - "link": null, - "locked": false - }, - { - "id": "7paHS6cDsoMgh1vsOhizN", - "type": "text", - "x": 759.9351119995117, - "y": 275.12109375, - "width": 77.53993225097656, - "height": 25, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1b", - "roundness": null, - "seed": 860081397, - "version": 12, - "versionNonce": 1588840341, - "isDeleted": false, - "boundElements": null, - "updated": 1715951674833, - "link": null, - "locked": false, - "text": "Indexing", - "fontSize": 20, - "fontFamily": 1, - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "Y3a1yUDvwFK9AB6KmSl9a", - "originalText": "Indexing", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 205, - "versionNonce": 1999066491, - "index": "b1c", - "isDeleted": false, - "id": "LZrKOvKX6nGWVOrEpPaPS", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 637.244140625, - "y": 406.7421875, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "width": 322.44921875, - "height": 97.015625, - "seed": 882087163, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "SjYgGO3cAHPreH7mJVBdm" - }, - { - "id": "e6DNVpQ-gH7v6WNDWWSPD", - "type": "arrow" - }, - { - "id": "Dn6kngn7QXyxmlCbzgO2R", - "type": "arrow" - } - ], - "updated": 1715951678396, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 88, - "versionNonce": 1992691451, - "index": "b1d", - "isDeleted": false, - "id": "SjYgGO3cAHPreH7mJVBdm", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 741.9687957763672, - "y": 442.75, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 112.99990844726562, - "height": 25, - "seed": 820854171, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951530614, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": 
"Downloading", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "LZrKOvKX6nGWVOrEpPaPS", - "originalText": "Downloading", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 252, - "versionNonce": 1617745173, - "index": "b1e", - "isDeleted": false, - "id": "62UjU0YjVR7TvLe7hLQCV", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "dotted", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 644.884765625, - "y": 586.75390625, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "width": 322.44921875, - "height": 97.015625, - "seed": 1549110491, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "vRabBFX0KOEkJ6d4rZF5D" - }, - { - "id": "Dn6kngn7QXyxmlCbzgO2R", - "type": "arrow" - }, - { - "id": "0Q1io01It2PX9ESFiW49G", - "type": "arrow" - } - ], - "updated": 1715951680142, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 146, - "versionNonce": 1440901275, - "index": "b1f", - "isDeleted": false, - "id": "vRabBFX0KOEkJ6d4rZF5D", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 739.9794387817383, - "y": 622.76171875, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 132.25987243652344, - "height": 25, - "seed": 560281979, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951539363, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Uncompressing", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "62UjU0YjVR7TvLe7hLQCV", - "originalText": "Uncompressing", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 329, - "versionNonce": 1236647227, - "index": "b1g", - "isDeleted": false, - "id": "GZLTgdXXsgXo-4rDdd7BN", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 642.740234375, - "y": 752.87109375, - "strokeColor": "#1e1e1e", - "backgroundColor": "#a5d8ff", - "width": 322.44921875, - "height": 97.015625, - "seed": 857787003, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "3nbrNuxDWK3BIkJVVUKYs" - }, - { - "id": "0Q1io01It2PX9ESFiW49G", - "type": "arrow" - }, - { - "id": "5rxlnALV4R8RNKSSzjawZ", - "type": "arrow" - } - ], - "updated": 1715951692576, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 237, - "versionNonce": 1218981717, - "index": "b1h", - "isDeleted": false, - "id": "3nbrNuxDWK3BIkJVVUKYs", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 748.6249008178711, - "y": 788.87890625, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 110.67988586425781, - "height": 25, - "seed": 590856987, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951571504, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Partitioning", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "GZLTgdXXsgXo-4rDdd7BN", - "originalText": "Partitioning", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 425, - "versionNonce": 1862353237, - "index": "b1i", - "isDeleted": false, - "id": 
"JGKFyGpX1KS2mJhIpFiBT", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "dotted", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 642.431640625, - "y": 916.02734375, - "strokeColor": "#1e1e1e", - "backgroundColor": "#eebefa", - "width": 322.44921875, - "height": 97.015625, - "seed": 1945073307, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "mPevqaKIOyvM1_XLXsPLZ" - }, - { - "id": "5rxlnALV4R8RNKSSzjawZ", - "type": "arrow" - }, - { - "id": "xsN-wlmdU5K7UGi95CYsI", - "type": "arrow" - } - ], - "updated": 1715951696070, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 340, - "versionNonce": 937753339, - "index": "b1j", - "isDeleted": false, - "id": "mPevqaKIOyvM1_XLXsPLZ", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 765.1862869262695, - "y": 952.03515625, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 76.93992614746094, - "height": 25, - "seed": 161213243, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951559401, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Chunking", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "JGKFyGpX1KS2mJhIpFiBT", - "originalText": "Chunking", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 527, - "versionNonce": 1327555355, - "index": "b1k", - "isDeleted": false, - "id": "7SOrKIkV23-VpsfKkBWnF", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "dotted", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 641.716796875, - "y": 1079.15234375, - "strokeColor": "#1e1e1e", - "backgroundColor": "#eebefa", - "width": 322.44921875, - "height": 97.015625, - "seed": 1437476219, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "-UFDNMIXOpAYsEf9ubpNz" - }, - { - "id": "xsN-wlmdU5K7UGi95CYsI", - "type": "arrow" - }, - { - "id": "foUafDsehtG66kl3x246k", - "type": "arrow" - } - ], - "updated": 1715951698569, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 451, - "versionNonce": 1228878331, - "index": "b1l", - "isDeleted": false, - "id": "-UFDNMIXOpAYsEf9ubpNz", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 756.0714492797852, - "y": 1115.16015625, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 93.73991394042969, - "height": 25, - "seed": 1633795611, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951569483, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Embedding", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "7SOrKIkV23-VpsfKkBWnF", - "originalText": "Embedding", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 421, - "versionNonce": 1862165339, - "index": "b1m", - "isDeleted": false, - "id": "JncRqJ0FdwNeHFO0WQj7j", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "dotted", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 641.271484375, - "y": 1250.0859375, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 322.44921875, - "height": 97.015625, - "seed": 207501755, - "groupIds": [], - 
"frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "4aD6_9mkOZYxvLuujjZJ3" - }, - { - "id": "foUafDsehtG66kl3x246k", - "type": "arrow" - }, - { - "id": "bZvxt2MfEmkgYplJGYvAF", - "type": "arrow" - } - ], - "updated": 1715951685444, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 335, - "versionNonce": 1654728507, - "index": "b1n", - "isDeleted": false, - "id": "4aD6_9mkOZYxvLuujjZJ3", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 767.2161254882812, - "y": 1286.09375, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 70.5599365234375, - "height": 25, - "seed": 696601179, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951578801, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Staging", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "JncRqJ0FdwNeHFO0WQj7j", - "originalText": "Staging", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 405, - "versionNonce": 2565851, - "index": "b1o", - "isDeleted": false, - "id": "YZqdS6HqxV0eCvZhb-1TG", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 637.533203125, - "y": 1406.921875, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 322.44921875, - "height": 97.015625, - "seed": 586095477, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [ - { - "type": "text", - "id": "X0wnY-7I3y5NxPAIay-cU" - }, - { - "id": "bZvxt2MfEmkgYplJGYvAF", - "type": "arrow" - } - ], - "updated": 1715952782049, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 327, - "versionNonce": 236892981, - "index": "b1p", - "isDeleted": false, - "id": "X0wnY-7I3y5NxPAIay-cU", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 754.2878494262695, - "y": 1442.9296875, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 88.93992614746094, - "height": 25, - "seed": 1170597077, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715952784484, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Uploading", - "textAlign": "center", - "verticalAlign": "middle", - "containerId": "YZqdS6HqxV0eCvZhb-1TG", - "originalText": "Uploading", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "e6DNVpQ-gH7v6WNDWWSPD", - "type": "arrow", - "x": 792.36328125, - "y": 344.94140625, - "width": 0, - "height": 56.38671875, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1r", - "roundness": { - "type": 2 - }, - "seed": 1826370165, - "version": 50, - "versionNonce": 1269906229, - "isDeleted": false, - "boundElements": null, - "updated": 1715951643784, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 0, - 56.38671875 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "Y3a1yUDvwFK9AB6KmSl9a", - "focus": 0.03933516663234279, - "gap": 8.8125 - }, - "endBinding": { - "elementId": "LZrKOvKX6nGWVOrEpPaPS", - "focus": 
-0.037869335045489234, - "gap": 5.4140625 - }, - "startArrowhead": null, - "endArrowhead": "arrow" - }, - { - "id": "Dn6kngn7QXyxmlCbzgO2R", - "type": "arrow", - "x": 796.0859375, - "y": 512.30078125, - "width": 0, - "height": 62.3828125, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1s", - "roundness": { - "type": 2 - }, - "seed": 414059669, - "version": 60, - "versionNonce": 138024373, - "isDeleted": false, - "boundElements": null, - "updated": 1715951647788, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 0, - 62.3828125 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "LZrKOvKX6nGWVOrEpPaPS", - "focus": 0.014779458974887034, - "gap": 8.54296875 - }, - "endBinding": { - "elementId": "62UjU0YjVR7TvLe7hLQCV", - "focus": -0.06217064217960677, - "gap": 12.0703125 - }, - "startArrowhead": null, - "endArrowhead": "arrow" - }, - { - "id": "0Q1io01It2PX9ESFiW49G", - "type": "arrow", - "x": 796.01953125, - "y": 695.125, - "width": 0, - "height": 47.18359375, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1t", - "roundness": { - "type": 2 - }, - "seed": 2076044405, - "version": 53, - "versionNonce": 518155253, - "isDeleted": false, - "boundElements": null, - "updated": 1715951652693, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 0, - 47.18359375 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "62UjU0YjVR7TvLe7hLQCV", - "focus": 0.06258252874120199, - "gap": 11.35546875 - }, - "endBinding": { - "elementId": "GZLTgdXXsgXo-4rDdd7BN", - "focus": -0.049281015663803655, - "gap": 10.5625 - }, - "startArrowhead": null, - "endArrowhead": "arrow" - }, - { - "id": "5rxlnALV4R8RNKSSzjawZ", - "type": "arrow", - "x": 796.625, - "y": 862.3984375, - "width": 0, - "height": 40.19921875, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1u", - "roundness": { - "type": 2 - }, - "seed": 343257781, - "version": 31, - "versionNonce": 60053493, - "isDeleted": false, - "boundElements": null, - "updated": 1715951657891, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 0, - 40.19921875 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "GZLTgdXXsgXo-4rDdd7BN", - "focus": 0.04552557936690613, - "gap": 12.51171875 - }, - "endBinding": { - "elementId": "JGKFyGpX1KS2mJhIpFiBT", - "focus": -0.0436115182865519, - "gap": 13.4296875 - }, - "startArrowhead": null, - "endArrowhead": "arrow" - }, - { - "id": "xsN-wlmdU5K7UGi95CYsI", - "type": "arrow", - "x": 795.421875, - "y": 1024.8828125, - "width": 0, - "height": 39.421875, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1v", - "roundness": { - "type": 2 - }, - "seed": 1318887093, - "version": 38, - "versionNonce": 303905173, - "isDeleted": false, - "boundElements": null, - "updated": 1715951661064, - 
"link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 0, - 39.421875 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "JGKFyGpX1KS2mJhIpFiBT", - "focus": 0.05107393363780634, - "gap": 11.83984375 - }, - "endBinding": { - "elementId": "7SOrKIkV23-VpsfKkBWnF", - "focus": -0.04664009594534023, - "gap": 14.84765625 - }, - "startArrowhead": null, - "endArrowhead": "arrow" - }, - { - "id": "foUafDsehtG66kl3x246k", - "type": "arrow", - "x": 792.3203125, - "y": 1187.8671875, - "width": 0, - "height": 44.78515625, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1w", - "roundness": { - "type": 2 - }, - "seed": 1280415829, - "version": 34, - "versionNonce": 1235268021, - "isDeleted": false, - "boundElements": null, - "updated": 1715951664610, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 0, - 44.78515625 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "7SOrKIkV23-VpsfKkBWnF", - "focus": 0.06587762123396368, - "gap": 11.69921875 - }, - "endBinding": { - "elementId": "JncRqJ0FdwNeHFO0WQj7j", - "focus": -0.06311555840914873, - "gap": 17.43359375 - }, - "startArrowhead": null, - "endArrowhead": "arrow" - }, - { - "id": "bZvxt2MfEmkgYplJGYvAF", - "type": "arrow", - "x": 789.81640625, - "y": 1358.8125, - "width": 0.08602962445024787, - "height": 35.25, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1x", - "roundness": { - "type": 2 - }, - "seed": 288196725, - "version": 41, - "versionNonce": 714813627, - "isDeleted": false, - "boundElements": null, - "updated": 1715952782050, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 0.08602962445024787, - 35.25 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "JncRqJ0FdwNeHFO0WQj7j", - "focus": 0.07864610464341526, - "gap": 11.7109375 - }, - "endBinding": { - "elementId": "YZqdS6HqxV0eCvZhb-1TG", - "focus": -0.05395713956897283, - "gap": 12.859375 - }, - "startArrowhead": null, - "endArrowhead": "arrow" - }, - { - "id": "u-6rLKVGZ91K-do_X6_7h", - "type": "rectangle", - "x": 1014.77734375, - "y": 243.0625, - "width": 22.22265625, - "height": 22.22265625, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b1y", - "roundness": { - "type": 3 - }, - "seed": 643949941, - "version": 184, - "versionNonce": 115789461, - "isDeleted": false, - "boundElements": null, - "updated": 1715951856984, - "link": null, - "locked": false - }, - { - "id": "i8TMmsB--w6DYXWYRe_qm", - "type": "text", - "x": 1059.00390625, - "y": 242.80859375, - "width": 758.3992919921875, - "height": 25, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "b20", - "roundness": null, - "seed": 2000384187, - "version": 169, - "versionNonce": 848966645, - "isDeleted": false, - "boundElements": null, - "updated": 1715951856984, - "link": null, - "locked": 
false, - "text": "Steps associated with getting data from a source and ready for processing", - "fontSize": 20, - "fontFamily": 1, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "Steps associated with getting data from a source and ready for processing", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 271, - "versionNonce": 1366945109, - "index": "b21", - "isDeleted": false, - "id": "UMttgjHgvnZXjUlDiqbaB", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1015.4722290039062, - "y": 297.1875, - "strokeColor": "#1e1e1e", - "backgroundColor": "#a5d8ff", - "width": 22.22265625, - "height": 22.22265625, - "seed": 2058850293, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [], - "updated": 1715951856984, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 298, - "versionNonce": 1658550965, - "index": "b22", - "isDeleted": false, - "id": "hf4pKQ55184WTVhdPC92w", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1059.6987915039062, - "y": 296.93359375, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "width": 365.3796691894531, - "height": 25, - "seed": 1703659861, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951856984, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Creating structured/enriched content", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "Creating structured/enriched content", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 269, - "versionNonce": 1600412693, - "index": "b23", - "isDeleted": false, - "id": "N4kjMAQ-BqLtvUxn3gpN_", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1017.2026977539062, - "y": 354.03125, - "strokeColor": "#1e1e1e", - "backgroundColor": "#eebefa", - "width": 22.22265625, - "height": 22.22265625, - "seed": 548622613, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [], - "updated": 1715951856984, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 292, - "versionNonce": 252318069, - "index": "b24", - "isDeleted": false, - "id": "VZCSNlIntRGixA1659IRA", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1061.4292602539062, - "y": 353.77734375, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffec99", - "width": 367.4396667480469, - "height": 25, - "seed": 347235957, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951856984, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Reformatting the structured content", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "Reformatting the structured content", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 249, - "versionNonce": 521280213, - "index": "b25", - "isDeleted": false, - "id": "-mFRWLXO9Tam2O1loV1l8", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1017.7183227539062, - "y": 
410.453125, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 22.22265625, - "height": 22.22265625, - "seed": 1321641467, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [], - "updated": 1715951856984, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 299, - "versionNonce": 2014443573, - "index": "b26", - "isDeleted": false, - "id": "l8FTa1uhh3FXC4DdeCjJX", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1061.9448852539062, - "y": 410.19921875, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 652.2393798828125, - "height": 25, - "seed": 345386651, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715951856984, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Steps associated with uploading the final result to a destination", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "Steps associated with uploading the final result to a destination", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 358, - "versionNonce": 998367509, - "index": "b27", - "isDeleted": false, - "id": "3uQWJDRthA7AWVdHSokLt", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1018.3490600585938, - "y": 538.45703125, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 22.22265625, - "height": 22.22265625, - "seed": 1078125621, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [], - "updated": 1715952831362, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 418, - "versionNonce": 2035692411, - "index": "b28", - "isDeleted": false, - "id": "4iycrxYTvkePRrwE9d55_", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1062.5756225585938, - "y": 538.203125, - "strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 135.0398712158203, - "height": 25, - "seed": 1059231125, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715952836177, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Required step", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "Required step", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "type": "rectangle", - "version": 409, - "versionNonce": 1303811067, - "index": "b2B", - "isDeleted": false, - "id": "Jr-S8g5xKeXX4hA1S9VNt", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "dotted", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1019.7730331420898, - "y": 589.04296875, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "width": 22.22265625, - "height": 22.22265625, - "seed": 832846773, - "groupIds": [], - "frameId": null, - "roundness": { - "type": 3 - }, - "boundElements": [], - "updated": 1715952853068, - "link": null, - "locked": false - }, - { - "type": "text", - "version": 481, - "versionNonce": 989351029, - "index": "b2C", - "isDeleted": false, - "id": "23iPs-E6gExYad4eWTKFP", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "angle": 0, - "x": 1063.9995956420898, - "y": 588.7890625, - 
"strokeColor": "#1e1e1e", - "backgroundColor": "#ffc9c9", - "width": 133.33987426757812, - "height": 25, - "seed": 963443989, - "groupIds": [], - "frameId": null, - "roundness": null, - "boundElements": [], - "updated": 1715952857188, - "link": null, - "locked": false, - "fontSize": 20, - "fontFamily": 1, - "text": "Optional Step", - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "Optional Step", - "autoResize": true, - "lineHeight": 1.25 - } - ], - "appState": { - "gridSize": null, - "viewBackgroundColor": "#ffffff" - }, - "files": {} -} \ No newline at end of file diff --git a/unstructured/ingest/v2/assets/pipeline.png b/unstructured/ingest/v2/assets/pipeline.png deleted file mode 100644 index 9cfcf64e8..000000000 Binary files a/unstructured/ingest/v2/assets/pipeline.png and /dev/null differ diff --git a/unstructured/ingest/v2/assets/sequence.png b/unstructured/ingest/v2/assets/sequence.png deleted file mode 100644 index 6b79db305..000000000 Binary files a/unstructured/ingest/v2/assets/sequence.png and /dev/null differ diff --git a/unstructured/ingest/v2/assets/sequence.txt b/unstructured/ingest/v2/assets/sequence.txt deleted file mode 100644 index 618859a6a..000000000 --- a/unstructured/ingest/v2/assets/sequence.txt +++ /dev/null @@ -1,38 +0,0 @@ -title Ingest Flow - - -Pipeline->Index: Pipeline.indexer_step.run() -Index->Data Provider:fetch list of docs with metadata -Data Provider->Index: -Index->Local Filesystem:for each record, save the metadata as a json file -Index->Pipeline: pipeline records a list of files -Pipeline->Download: Pipeline.downloader_step(records) -Download->Local Filesystem: Fetch the associated metadata -Local Filesystem->Download: -Download->Data Provider: Get raw data from data provider -Download->Local Filesystem: Persist the data as raw files -Download->Pipeline: Send back a reference to the local file to process -Pipeline-->Uncompress: Optionally run if flag set to True -Uncompress->Local Filesystem: Extract tar and zip files -Uncompress->Local Filesystem: New metadata records are created for new extracted files -Uncompress->Pipeline: Send back list of pointers to new metadata files -Pipeline->Partition: Pipeline.partitioner_step(downloaded_data) -Partition-->Unstructured Api: If credentials passed in,\npassed file data to API for partitioning -Unstructured Api->Partition: -Partition->Local Filesystem: Persist results -Partition->Pipeline: Pointers to persisted results -Pipeline-->Chunk: Optionally Pipeline.chunker_step.run(records) -Chunk-->Unstructured Api: If credentials passed in,\npassed file data to API for chunking -Unstructured Api->Chunk: -Chunk->Local Filesystem: Persist results -Chunk->Pipeline: Pointers to persisted results -Pipeline-->Embed: Optionally Pipeline.embed_step.run(records) -Embed-->Embedder Api: Depending on which embedder\nis chosen, make API calls to provider -Embed->Local Filesystem: Persist results -Embed->Pipeline: Pointers to persisted results -Pipeline->Stage: Optionally Pipeline.stager_step.run(records) -Stage->Local Filesystem: manipulate the records to better upload -Stage->Pipeline: Pointers to persisted results -Pipeline->Upload: Pipeline.upload_step.run() -Upload->Data Destination: -Pipeline->Local Filesystem: Cleanup diff --git a/unstructured/ingest/v2/cli/README.md b/unstructured/ingest/v2/cli/README.md deleted file mode 100644 index 4d60d4ccf..000000000 --- a/unstructured/ingest/v2/cli/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Ingest CLI -This package helps map user input via 
a cli to the underlying ingest code to run a small ETL pipeline. - -## Design Reference -[cli.py](./cli.py) is the main entrypoint to run the cli itself. The key point here is the interaction between all -source and destination connectors. - -To manually run the cli: -```shell -PYTHONPATH=. python unstructured/ingest/v2/main.py --help -``` - -The `main.py` file simply wraps the generated Click command created in `cli.py`. - -### Source Commands -All source commands are added as subcommands to the parent ingest Click group. This allows each command to map to -different connectors with shared and unique parameters. - -### Destination Commands -All destination commands are added as subcommands to each parent source command. This allows each invocation of the source -subcommand to display all possible destination subcommands. The code in [utils.py](./utils.py) helps structure the -generated text from the Click library to be more intuitive for this approach (i.e. listing subcommands as `Destinations`). - -### Configs -The configs in [configs/](./configs) and connector-specific ones in [cmds/](./cmds) help surface all user parameters that -are needed to marshal the input dictionary from Click into all the respective configs needed to create a full pipeline run. -Because click returns a flat dictionary of user inputs, the `extract_config` method in `utils.py` helps deserialize this dictionary -into dataclasses that have nested fields (such as access configs). diff --git a/unstructured/ingest/v2/cli/__init__.py b/unstructured/ingest/v2/cli/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/v2/cli/base/__init__.py b/unstructured/ingest/v2/cli/base/__init__.py deleted file mode 100644 index ed07a1684..000000000 --- a/unstructured/ingest/v2/cli/base/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .dest import DestCmd -from .src import SrcCmd - -__all__ = ["SrcCmd", "DestCmd"] diff --git a/unstructured/ingest/v2/cli/base/cmd.py b/unstructured/ingest/v2/cli/base/cmd.py deleted file mode 100644 index 0a5d5c138..000000000 --- a/unstructured/ingest/v2/cli/base/cmd.py +++ /dev/null @@ -1,215 +0,0 @@ -import inspect -from abc import ABC, abstractmethod -from dataclasses import dataclass, field, fields -from typing import Any, Optional, Type, TypeVar - -import click - -from unstructured.ingest.v2.cli.base.importer import import_from_string -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import extract_config -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import Chunker, ChunkerConfig -from unstructured.ingest.v2.processes.connector_registry import ( - DownloaderT, - IndexerT, - UploaderT, - UploadStager, - UploadStagerConfig, - UploadStagerT, - destination_registry, - source_registry, -) -from unstructured.ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig -from unstructured.ingest.v2.processes.embedder import Embedder, EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import Partitioner, PartitionerConfig - -CommandT = TypeVar("CommandT", bound=click.Command) - - -@dataclass -class BaseCmd(ABC): - cmd_name: str - default_configs: list[Type[CliConfig]] = field(default_factory=list) - - @property - def cmd_name_key(self): - return self.cmd_name.replace("-", "_") - - @property - def
cli_cmd_name(self): - return self.cmd_name.replace("_", "-") - - @abstractmethod - def cmd(self, ctx: click.Context, **options) -> None: - pass - - def add_options(self, cmd: CommandT, extras: list[Type[CliConfig]]) -> CommandT: - configs = self.default_configs - # make sure what's unique to this cmd appears first - extras.extend(configs) - for config in extras: - try: - config.add_cli_options(cmd=cmd) - except ValueError as e: - raise ValueError(f"failed to set configs from {config.__name__}: {e}") - return cmd - - def get_pipeline( - self, - src: str, - source_options: dict[str, Any], - dest: Optional[str] = None, - destination_options: Optional[dict[str, Any]] = None, - ) -> Pipeline: - logger.debug( - f"creating pipeline from cli using source {src} with options: {source_options}" - ) - pipeline_kwargs: dict[str, Any] = { - "context": self.get_processor_config(options=source_options), - "downloader": self.get_downloader(src=src, options=source_options), - "indexer": self.get_indexer(src=src, options=source_options), - "partitioner": self.get_partitioner(options=source_options), - } - if chunker := self.get_chunker(options=source_options): - pipeline_kwargs["chunker"] = chunker - if embedder := self.get_embeder(options=source_options): - pipeline_kwargs["embedder"] = embedder - if dest: - logger.debug( - f"setting destination on pipeline {dest} with options: {destination_options}" - ) - if uploader_stager := self.get_upload_stager(dest=dest, options=destination_options): - pipeline_kwargs["stager"] = uploader_stager - pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=destination_options) - else: - # Default to local uploader - # TODO remove after v1 no longer supported - destination_options = destination_options or {} - if "output_dir" not in destination_options: - destination_options["output_dir"] = source_options["output_dir"] - pipeline_kwargs["uploader"] = self.get_default_uploader(options=destination_options) - return Pipeline(**pipeline_kwargs) - - @staticmethod - def get_default_uploader(options: dict[str, Any]) -> UploaderT: - uploader_config = extract_config(flat_data=options, config=LocalUploaderConfig) - return LocalUploader(upload_config=uploader_config) - - @staticmethod - def get_chunker(options: dict[str, Any]) -> Optional[Chunker]: - chunker_config = extract_config(flat_data=options, config=ChunkerConfig) - if not chunker_config.chunking_strategy: - return None - return Chunker(config=chunker_config) - - @staticmethod - def get_embeder(options: dict[str, Any]) -> Optional[Embedder]: - embedder_config = extract_config(flat_data=options, config=EmbedderConfig) - if not embedder_config.embedding_provider: - return None - return Embedder(config=embedder_config) - - @staticmethod - def get_partitioner(options: dict[str, Any]) -> Partitioner: - partitioner_config = extract_config(flat_data=options, config=PartitionerConfig) - return Partitioner(config=partitioner_config) - - @staticmethod - def get_processor_config(options: dict[str, Any]) -> ProcessorConfig: - return extract_config(flat_data=options, config=ProcessorConfig) - - @staticmethod - def get_indexer(src: str, options: dict[str, Any]) -> IndexerT: - source_entry = source_registry[src] - indexer_kwargs: dict[str, Any] = {} - if indexer_config_cls := source_entry.indexer_config: - indexer_kwargs["index_config"] = extract_config( - flat_data=options, config=indexer_config_cls - ) - if connection_config_cls := source_entry.connection_config: - indexer_kwargs["connection_config"] = extract_config( - 
flat_data=options, config=connection_config_cls - ) - indexer_cls = source_entry.indexer - return indexer_cls(**indexer_kwargs) - - @staticmethod - def get_downloader(src: str, options: dict[str, Any]) -> DownloaderT: - source_entry = source_registry[src] - downloader_kwargs: dict[str, Any] = {} - if downloader_config_cls := source_entry.downloader_config: - downloader_kwargs["download_config"] = extract_config( - flat_data=options, config=downloader_config_cls - ) - if connection_config_cls := source_entry.connection_config: - downloader_kwargs["connection_config"] = extract_config( - flat_data=options, config=connection_config_cls - ) - downloader_cls = source_entry.downloader - return downloader_cls(**downloader_kwargs) - - @staticmethod - def get_custom_stager( - stager_reference: str, stager_config_kwargs: Optional[dict] = None - ) -> Optional[UploadStagerT]: - uploader_cls = import_from_string(stager_reference) - if not inspect.isclass(uploader_cls): - raise ValueError( - f"custom stager must be a reference to a python class, got: {type(uploader_cls)}" - ) - if not issubclass(uploader_cls, UploadStager): - raise ValueError( - "custom stager must be an implementation of the UploadStager interface" - ) - fields_dict = {f.name: f.type for f in fields(uploader_cls)} - upload_stager_config_cls = fields_dict["upload_stager_config"] - if not inspect.isclass(upload_stager_config_cls): - raise ValueError( - f"custom stager config must be a class, got: {type(upload_stager_config_cls)}" - ) - if not issubclass(upload_stager_config_cls, UploadStagerConfig): - raise ValueError( - "custom stager config must be an implementation " - "of the UploadStagerConfig interface" - ) - upload_stager_kwargs: dict[str, Any] = {} - if stager_config_kwargs: - upload_stager_kwargs["upload_stager_config"] = upload_stager_config_cls( - **stager_config_kwargs - ) - return uploader_cls(**upload_stager_kwargs) - - @staticmethod - def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStagerT]: - if custom_stager := options.get("custom_stager"): - return BaseCmd.get_custom_stager( - stager_reference=custom_stager, - stager_config_kwargs=options.get("custom_stager_config_kwargs"), - ) - dest_entry = destination_registry[dest] - upload_stager_kwargs: dict[str, Any] = {} - if upload_stager_config_cls := dest_entry.upload_stager_config: - upload_stager_kwargs["upload_stager_config"] = extract_config( - flat_data=options, config=upload_stager_config_cls - ) - if upload_stager_cls := dest_entry.upload_stager: - return upload_stager_cls(**upload_stager_kwargs) - return None - - @staticmethod - def get_uploader(dest, options: dict[str, Any]) -> UploaderT: - dest_entry = destination_registry[dest] - uploader_kwargs: dict[str, Any] = {} - if uploader_config_cls := dest_entry.uploader_config: - uploader_kwargs["upload_config"] = extract_config( - flat_data=options, config=uploader_config_cls - ) - if connection_config_cls := dest_entry.connection_config: - uploader_kwargs["connection_config"] = extract_config( - flat_data=options, config=connection_config_cls - ) - uploader_cls = dest_entry.uploader - return uploader_cls(**uploader_kwargs) diff --git a/unstructured/ingest/v2/cli/base/dest.py b/unstructured/ingest/v2/cli/base/dest.py deleted file mode 100644 index b1703dcc8..000000000 --- a/unstructured/ingest/v2/cli/base/dest.py +++ /dev/null @@ -1,76 +0,0 @@ -import logging -from dataclasses import dataclass -from typing import Optional, Type - -import click - -from
unstructured.ingest.v2.cli.base.cmd import BaseCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import Dict, conform_click_options -from unstructured.ingest.v2.logger import logger - - -@dataclass -class DestCmd(BaseCmd): - connection_config: Optional[Type[CliConfig]] = None - uploader_config: Optional[Type[CliConfig]] = None - upload_stager_config: Optional[Type[CliConfig]] = None - - def cmd(self, ctx: click.Context, **options) -> None: - logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO) - if not ctx.parent: - raise click.ClickException("destination command called without a parent") - if not ctx.parent.info_name: - raise click.ClickException("parent command missing info name") - source_cmd = ctx.parent.info_name.replace("-", "_") - source_options: dict = ctx.parent.params if ctx.parent else {} - conform_click_options(options) - try: - pipeline = self.get_pipeline( - src=source_cmd, - source_options=source_options, - dest=self.cmd_name, - destination_options=options, - ) - pipeline.run() - except Exception as e: - logger.error(f"failed to run destination command {self.cmd_name}: {e}", exc_info=True) - raise click.ClickException(str(e)) from e - - def get_cmd(self) -> click.Command: - # Dynamically create the command without the use of click decorators - fn = self.cmd - fn = click.pass_context(fn) - cmd = click.command(fn) - if not isinstance(cmd, click.core.Command): - raise ValueError(f"generated command was not of expected type Command: {type(cmd)}") - cmd.name = self.cli_cmd_name - cmd.short_help = "v2" - cmd.invoke_without_command = True - extras = [ - x - for x in [self.uploader_config, self.upload_stager_config, self.connection_config] - if x - ] - self.add_options(cmd, extras=extras) - cmd.params.append( - click.Option( - ["--custom-stager"], - required=False, - type=str, - default=None, - help="Pass a pointer to a custom upload stager to use, " - "must be in format 'module_path:class_name'", - ) - ) - cmd.params.append( - click.Option( - ["--custom-stager-config-kwargs"], - required=False, - type=Dict(), - default=None, - help="Any kwargs to instantiate the configuration " - "associated with the custom stager", - ) - ) - return cmd diff --git a/unstructured/ingest/v2/cli/base/importer.py b/unstructured/ingest/v2/cli/base/importer.py deleted file mode 100644 index f77520ee1..000000000 --- a/unstructured/ingest/v2/cli/base/importer.py +++ /dev/null @@ -1,34 +0,0 @@ -import importlib -from typing import Any - - -class ImportFromStringError(Exception): - pass - - -def import_from_string(import_str: Any) -> Any: - if not isinstance(import_str, str): - return import_str - - module_str, _, attrs_str = import_str.partition(":") - if not module_str or not attrs_str: - message = 'Import string "{import_str}" must be in format "module:attribute".' - raise ImportFromStringError(message.format(import_str=import_str)) - - try: - module = importlib.import_module(module_str) - except ModuleNotFoundError as exc: - if exc.name != module_str: - raise exc from None - message = 'Could not import module "{module_str}".' - raise ImportFromStringError(message.format(module_str=module_str)) - - instance = module - try: - for attr_str in attrs_str.split("."): - instance = getattr(instance, attr_str) - except AttributeError: - message = 'Attribute "{attrs_str}" not found in module "{module_str}".'
- raise ImportFromStringError(message.format(attrs_str=attrs_str, module_str=module_str)) - - return instance diff --git a/unstructured/ingest/v2/cli/base/src.py b/unstructured/ingest/v2/cli/base/src.py deleted file mode 100644 index 9ec350cad..000000000 --- a/unstructured/ingest/v2/cli/base/src.py +++ /dev/null @@ -1,70 +0,0 @@ -import logging -from dataclasses import dataclass, field -from typing import Any, Optional, Type - -import click - -from unstructured.ingest.v2.cli.base.cmd import BaseCmd -from unstructured.ingest.v2.cli.configs import ( - ChunkerCliConfig, - EmbedderCliConfig, - PartitionerCliConfig, - ProcessorCliConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import Group, conform_click_options -from unstructured.ingest.v2.logger import logger - - -@dataclass -class SrcCmd(BaseCmd): - indexer_config: Optional[Type[CliConfig]] = None - downloader_config: Optional[Type[CliConfig]] = None - connection_config: Optional[Type[CliConfig]] = None - default_configs: list[CliConfig] = field( - default_factory=lambda: [ - ProcessorCliConfig, - PartitionerCliConfig, - EmbedderCliConfig, - ChunkerCliConfig, - ] - ) - - def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None: - if ctx.invoked_subcommand: - return - - conform_click_options(options) - logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO) - try: - pipeline = self.get_pipeline(src=self.cmd_name, source_options=options) - pipeline.run() - except Exception as e: - logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True) - raise click.ClickException(str(e)) from e - - def get_cmd(self) -> click.Group: - # Dynamically create the command without the use of click decorators - fn = self.cmd - fn = click.pass_context(fn) - cmd = click.group(fn, cls=Group) - if not isinstance(cmd, click.core.Group): - raise ValueError(f"generated src command was not of expected type Group: {type(cmd)}") - cmd.name = self.cli_cmd_name - cmd.short_help = "v2" - cmd.invoke_without_command = True - extras = [ - x for x in [self.indexer_config, self.downloader_config, self.connection_config] if x - ] - self.add_options(cmd, extras=extras) - - # TODO remove after v1 no longer supported - cmd.params.append( - click.Option( - ["--output-dir"], - required=False, - type=str, - help="Local path to write partitioned output to", - ) - ) - return cmd diff --git a/unstructured/ingest/v2/cli/cli.py b/unstructured/ingest/v2/cli/cli.py deleted file mode 100644 index a53c43565..000000000 --- a/unstructured/ingest/v2/cli/cli.py +++ /dev/null @@ -1,24 +0,0 @@ -import click - -from unstructured.ingest.v2.cli.cmds import dest, src - - -@click.group() -def ingest(): - pass - - -def get_cmd() -> click.Command: - """Construct and return a Click command object representing the main command for the CLI. - - This function adds all dest_subcommand(s) to each src_subcommand, and adds all of those - to the main command as nested subcommands. 
- """ - cmd = ingest - # Add all subcommands - for src_subcommand in src: - # Add all destination subcommands - for dest_subcommand in dest: - src_subcommand.add_command(dest_subcommand) - cmd.add_command(src_subcommand) - return cmd diff --git a/unstructured/ingest/v2/cli/cmds/__init__.py b/unstructured/ingest/v2/cli/cmds/__init__.py deleted file mode 100644 index 4a4a74c5d..000000000 --- a/unstructured/ingest/v2/cli/cmds/__init__.py +++ /dev/null @@ -1,87 +0,0 @@ -from collections import Counter - -import click - -from .astradb import astradb_dest_cmd -from .azure_cognitive_search import azure_cognitive_search_dest_cmd -from .chroma import chroma_dest_cmd -from .databricks_volumes import databricks_volumes_dest_cmd -from .elasticsearch import elasticsearch_dest_cmd, elasticsearch_src_cmd -from .fsspec.azure import azure_dest_cmd, azure_src_cmd -from .fsspec.box import box_dest_cmd, box_src_cmd -from .fsspec.dropbox import dropbox_dest_cmd, dropbox_src_cmd -from .fsspec.gcs import gcs_dest_cmd, gcs_src_cmd -from .fsspec.s3 import s3_dest_cmd, s3_src_cmd -from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd -from .google_drive import google_drive_src_cmd -from .local import local_dest_cmd, local_src_cmd -from .mongodb import mongodb_dest_cmd -from .onedrive import onedrive_drive_src_cmd -from .opensearch import opensearch_dest_cmd, opensearch_src_cmd -from .pinecone import pinecone_dest_cmd -from .salesforce import salesforce_src_cmd -from .sharepoint import sharepoint_drive_src_cmd -from .singlestore import singlestore_dest_cmd -from .sql import sql_dest_cmd -from .weaviate import weaviate_dest_cmd - -src_cmds = [ - azure_src_cmd, - box_src_cmd, - dropbox_src_cmd, - elasticsearch_src_cmd, - gcs_src_cmd, - google_drive_src_cmd, - local_src_cmd, - onedrive_drive_src_cmd, - opensearch_src_cmd, - s3_src_cmd, - salesforce_src_cmd, - sharepoint_drive_src_cmd, - sftp_src_cmd, -] -duplicate_src_names = [ - name for name, count in Counter([s.cmd_name for s in src_cmds]).items() if count > 1 -] -if duplicate_src_names: - raise ValueError( - "the following source cmd names were reused, all must be unique: {}".format( - ", ".join(duplicate_src_names) - ) - ) - -dest_cmds = [ - astradb_dest_cmd, - azure_cognitive_search_dest_cmd, - azure_dest_cmd, - box_dest_cmd, - chroma_dest_cmd, - dropbox_dest_cmd, - elasticsearch_dest_cmd, - gcs_dest_cmd, - local_dest_cmd, - opensearch_dest_cmd, - pinecone_dest_cmd, - s3_dest_cmd, - sftp_dest_cmd, - singlestore_dest_cmd, - weaviate_dest_cmd, - mongodb_dest_cmd, - databricks_volumes_dest_cmd, - sql_dest_cmd, -] - -duplicate_dest_names = [ - name for name, count in Counter([d.cmd_name for d in dest_cmds]).items() if count > 1 -] -if duplicate_dest_names: - raise ValueError( - "the following dest cmd names were reused, all must be unique: {}".format( - ", ".join(duplicate_dest_names) - ) - ) - - -src: list[click.Group] = [v.get_cmd() for v in src_cmds] - -dest: list[click.Command] = [v.get_cmd() for v in dest_cmds] diff --git a/unstructured/ingest/v2/cli/cmds/astradb.py b/unstructured/ingest/v2/cli/cmds/astradb.py deleted file mode 100644 index 36de30f70..000000000 --- a/unstructured/ingest/v2/cli/cmds/astradb.py +++ /dev/null @@ -1,85 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import Dict -from unstructured.ingest.v2.processes.connectors.astradb import CONNECTOR_TYPE - - -@dataclass -class 
AstraDBCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--token"], - required=True, - type=str, - help="Astra DB Token with access to the database.", - envvar="ASTRA_DB_APPLICATION_TOKEN", - show_envvar=True, - ), - click.Option( - ["--api-endpoint"], - required=True, - type=str, - help="The API endpoint for the Astra DB.", - envvar="ASTRA_DB_API_ENDPOINT", - show_envvar=True, - ), - ] - return options - - -@dataclass -class AstraDBCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--collection-name"], - required=False, - type=str, - help="The name of the Astra DB collection. " - "Note that the collection name must only include letters, " - "numbers, and underscores.", - ), - click.Option( - ["--embedding-dimension"], - required=True, - default=384, - type=int, - help="The dimensionality of the embeddings", - ), - click.Option( - ["--namespace"], - required=False, - default=None, - type=str, - help="The Astra DB connection namespace.", - ), - click.Option( - ["--requested-indexing-policy"], - required=False, - default=None, - type=Dict(), - help="The indexing policy to use for the collection." - 'example: \'{"deny": ["metadata"]}\' ', - ), - click.Option( - ["--batch-size"], - default=20, - type=int, - help="Number of records per batch", - ), - ] - return options - - -astradb_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=AstraDBCliConnectionConfig, - uploader_config=AstraDBCliUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/azure_cognitive_search.py b/unstructured/ingest/v2/cli/cmds/azure_cognitive_search.py deleted file mode 100644 index 6097606e5..000000000 --- a/unstructured/ingest/v2/cli/cmds/azure_cognitive_search.py +++ /dev/null @@ -1,72 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.azure_cognitive_search import CONNECTOR_TYPE - - -@dataclass -class AzureCognitiveSearchCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--index"], - required=True, - type=str, - help="The name of the Azure AI (Cognitive) Search index to connect to.", - envvar="AZURE_SEARCH_INDEX", - show_envvar=True, - ), - click.Option( - ["--endpoint"], - required=True, - type=str, - help="The URL endpoint of an Azure AI (Cognitive) search service." - "In the form of https://{{service_name}}.search.windows.net", - envvar="AZURE_SEARCH_ENDPOINT", - show_envvar=True, - ), - click.Option( - ["--key"], - required=True, - type=str, - help="Credential that is used for authenticating to an Azure service." 
- "(is an AzureKeyCredential)", - envvar="AZURE_SEARCH_API_KEY", - show_envvar=True, - ), - ] - return options - - -@dataclass -class AzureCognitiveSearchCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ), - ] - return options - - -@dataclass -class AzureCognitiveSearchCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [] - - -azure_cognitive_search_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=AzureCognitiveSearchCliConnectionConfig, - uploader_config=AzureCognitiveSearchCliUploaderConfig, - upload_stager_config=AzureCognitiveSearchCliUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/chroma.py b/unstructured/ingest/v2/cli/cmds/chroma.py deleted file mode 100644 index c13816351..000000000 --- a/unstructured/ingest/v2/cli/cmds/chroma.py +++ /dev/null @@ -1,108 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import Dict -from unstructured.ingest.v2.processes.connectors.chroma import CONNECTOR_TYPE - - -@dataclass -class ChromaCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--path"], - required=False, - type=str, - help="Location where Chroma is persisted," "if not connecting via http.", - ), - click.Option( - ["--settings"], - required=False, - type=Dict(), - help="A dictionary of settings to communicate with the chroma server." - 'example: \'{"persist_directory":"./chroma-persist"}\' ', - ), - click.Option( - ["--tenant"], - required=False, - default="default_tenant", - type=str, - help="The tenant to use for this client. Chroma defaults to 'default_tenant'.", - ), - click.Option( - ["--database"], - required=False, - default="default_database", - type=str, - help="The database to use for this client." - "Chroma defaults to 'default_database'.", - ), - click.Option( - ["--host"], - required=False, - type=str, - help="The hostname of the Chroma server.", - ), - click.Option( - ["--port"], - required=False, - type=int, - help="The port of the Chroma server.", - ), - click.Option( - ["--ssl"], - required=False, - default=False, - is_flag=True, - type=bool, - help="Whether to use SSL to connect to the Chroma server.", - ), - click.Option( - ["--headers"], - required=False, - type=Dict(), - help="A dictionary of headers to send to the Chroma server." 
- 'example: \'{"Authorization":"Basic()"}\' ', - ), - click.Option( - ["--collection-name"], - required=True, - type=str, - help="The name of the Chroma collection to write into.", - ), - ] - return options - - -@dataclass -class ChromaCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ) - ] - return options - - -@dataclass -class ChromaCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [] - - -chroma_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=ChromaCliConnectionConfig, - uploader_config=ChromaCliUploaderConfig, - upload_stager_config=ChromaCliUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/databricks_volumes.py b/unstructured/ingest/v2/cli/cmds/databricks_volumes.py deleted file mode 100644 index e8f8e2486..000000000 --- a/unstructured/ingest/v2/cli/cmds/databricks_volumes.py +++ /dev/null @@ -1,161 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.databricks_volumes import CONNECTOR_TYPE - - -@dataclass -class DatabricksVolumesCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--host"], - type=str, - default=None, - help="The Databricks host URL for either the " - "Databricks workspace endpoint or the " - "Databricks accounts endpoint.", - ), - click.Option( - ["--account-id"], - type=str, - default=None, - help="The Databricks account ID for the Databricks " - "accounts endpoint. Only has effect when Host is " - "either https://accounts.cloud.databricks.com/ (AWS), " - "https://accounts.azuredatabricks.net/ (Azure), " - "or https://accounts.gcp.databricks.com/ (GCP).", - ), - click.Option( - ["--username"], - type=str, - default=None, - help="The Databricks username part of basic authentication. " - "Only possible when Host is *.cloud.databricks.com (AWS).", - ), - click.Option( - ["--password"], - type=str, - default=None, - help="The Databricks password part of basic authentication. " - "Only possible when Host is *.cloud.databricks.com (AWS).", - ), - click.Option(["--client-id"], type=str, default=None), - click.Option(["--client-secret"], type=str, default=None), - click.Option( - ["--token"], - type=str, - default=None, - help="The Databricks personal access token (PAT) (AWS, Azure, and GCP) or " - "Azure Active Directory (Azure AD) token (Azure).", - ), - click.Option( - ["--azure-workspace-resource-id"], - type=str, - default=None, - help="The Azure Resource Manager ID for the Azure Databricks workspace, " - "which is exchanged for a Databricks host URL.", - ), - click.Option( - ["--azure-client-secret"], - type=str, - default=None, - help="The Azure AD service principal’s client secret.", - ), - click.Option( - ["--azure-client-id"], - type=str, - default=None, - help="The Azure AD service principal’s application ID.", - ), - click.Option( - ["--azure-tenant-id"], - type=str, - default=None, - help="The Azure AD service principal’s tenant ID.", - ), - click.Option( - ["--azure-environment"], - type=str, - default=None, - help="The Azure environment type (such as Public, UsGov, China, and Germany) for a " - "specific set of API endpoints. 
Defaults to PUBLIC.", - ), - click.Option( - ["--auth-type"], - type=str, - default=None, - help="When multiple auth attributes are available in the " - "environment, use the auth type specified by this " - "argument. This argument also holds the currently " - "selected auth.", - ), - click.Option(["--cluster-id"], type=str, default=None), - click.Option(["--google-credentials"], type=str, default=None), - click.Option(["--google-service-account"], type=str, default=None), - ] - return options - - -@dataclass -class DatabricksVolumesCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--volume"], type=str, required=True, help="Name of volume in the Unity Catalog" - ), - click.Option( - ["--catalog"], - type=str, - required=True, - help="Name of the catalog in the Databricks Unity Catalog service", - ), - click.Option( - ["--volume-path"], - type=str, - required=False, - default=None, - help="Optional path within the volume to write to", - ), - click.Option( - ["--overwrite"], - type=bool, - is_flag=True, - help="If true, an existing file will be overwritten.", - ), - click.Option( - ["--encoding"], - type=str, - required=True, - default="utf-8", - help="Encoding applied to the data when written to the volume", - ), - click.Option( - ["--schema"], - type=str, - required=True, - default="default", - help="Schema associated with the volume to write to in the Unity Catalog service", - ), - ] - return options - - -@dataclass -class DatabricksVolumesCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [] - - -databricks_volumes_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=DatabricksVolumesCliConnectionConfig, - uploader_config=DatabricksVolumesCliUploaderConfig, - upload_stager_config=DatabricksVolumesCliUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/elasticsearch.py b/unstructured/ingest/v2/cli/cmds/elasticsearch.py deleted file mode 100644 index 8c52c97f7..000000000 --- a/unstructured/ingest/v2/cli/cmds/elasticsearch.py +++ /dev/null @@ -1,159 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString -from unstructured.ingest.v2.processes.connectors.elasticsearch import CONNECTOR_TYPE - - -@dataclass -class ElasticsearchCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--hosts"], - type=DelimitedString(), - help='List of the Elasticsearch hosts to connect to, e.g. 
"http://localhost:9200"', - ), - click.Option( - ["--username"], type=str, default=None, help="username when using basic auth" - ), - click.Option( - ["--password"], - type=str, - default=None, - help="password when using basic auth or connecting to a cloud instance", - ), - click.Option( - ["--cloud-id"], type=str, default=None, help="id used to connect to Elastic Cloud" - ), - click.Option( - ["--es-api-key"], type=str, default=None, help="api key used for authentication" - ), - click.Option( - ["--api-key-id"], - type=str, - default=None, - help="id associated with api key used for authentication: " - "https://www.elastic.co/guide/en/elasticsearch/reference/current/security-api-create-api-key.html", # noqa: E501 - # noqa: E501 - ), - click.Option( - ["--bearer-auth"], - type=str, - default=None, - help="bearer token used for HTTP bearer authentication", - ), - click.Option( - ["--ca-certs"], - type=click.Path(), - default=None, - ), - click.Option( - ["--ssl-assert-fingerprint"], - type=str, - default=None, - help="SHA256 fingerprint value", - ), - ] - return options - - -@dataclass -class ElasticsearchCliDownloadConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to a location at" - "`$HOME/.cache/unstructured/ingest//`.", - ), - click.Option( - ["--fields"], - type=DelimitedString(), - default=[], - help="If provided, will limit the fields returned by Elasticsearch " - "to this comma-delimited list", - ), - ] - return options - - -@dataclass -class ElasticsearchCliIndexerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--index-name"], - required=True, - type=str, - help="Name of the Elasticsearch index to pull data from, or upload data to.", - ), - click.Option( - ["--batch-size"], - default=100, - type=click.IntRange(0), - help="how many records to read at a time per process", - ), - ] - return options - - -@dataclass -class ElasticsearchCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--index-name"], - required=True, - type=str, - help="Name of the Elasticsearch index to pull data from, or upload data to.", - ), - ] - return options - - -@dataclass -class ElasticsearchUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--batch-size-bytes"], - required=False, - default=15_000_000, - type=int, - help="Size limit (in bytes) for each batch of items to be uploaded. 
Check" - " https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html" - "#_how_big_is_too_big for more information.", - ), - click.Option( - ["--num-threads"], - required=False, - default=1, - type=int, - help="Number of threads to be used while uploading content", - ), - ] - return options - - -elasticsearch_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=ElasticsearchCliConnectionConfig, - indexer_config=ElasticsearchCliIndexerConfig, - downloader_config=ElasticsearchCliDownloadConfig, -) - -elasticsearch_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=ElasticsearchCliConnectionConfig, - upload_stager_config=ElasticsearchCliUploadStagerConfig, - uploader_config=ElasticsearchUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/fsspec/__init__.py b/unstructured/ingest/v2/cli/cmds/fsspec/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/v2/cli/cmds/fsspec/azure.py b/unstructured/ingest/v2/cli/cmds/fsspec/azure.py deleted file mode 100644 index c5bdd2ab3..000000000 --- a/unstructured/ingest/v2/cli/cmds/fsspec/azure.py +++ /dev/null @@ -1,84 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.cmds.fsspec.fsspec import ( - FsspecCliDownloadConfig, - FsspecCliIndexerConfig, - FsspecCliUploaderConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.fsspec.azure import ( - CONNECTOR_TYPE, -) - - -@dataclass -class AzureCliDownloadConfig(FsspecCliDownloadConfig): - pass - - -@dataclass -class AzureCliIndexerConfig(FsspecCliIndexerConfig): - pass - - -@dataclass -class AzureCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--account-key"], - default=None, - help="The storage account key. This is used for shared key " - "authentication. If any of account key, sas token or " - "client_id are not specified, anonymous access will be used.", - ), - click.Option( - ["--account-name"], - default=None, - help="The storage account name. This is used to authenticate " - "requests signed with an account key and to construct " - "the storage endpoint. It is required unless a connection " - "string is given, or if a custom domain is used with " - "anonymous authentication.", - ), - click.Option( - ["--connection-string"], - default=None, - help="If specified, this will override all other parameters. See " - "http://azure.microsoft.com/en-us/documentation/articles/storage-configure-connection-string/ " # noqa: E501 - "for the connection string format.", - ), - click.Option( - ["--sas_token"], - default=None, - help="A shared access signature token to use to authenticate " - "requests instead of the account key. If account key and " - "sas token are both specified, account key will be used " - "to sign. 
If any of account key, sas token or client_id " - "are not specified, anonymous access will be used.", - ), - ] - return options - - -@dataclass -class AzureUploaderConfig(FsspecCliUploaderConfig): - pass - - -azure_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - indexer_config=AzureCliIndexerConfig, - connection_config=AzureCliConnectionConfig, - downloader_config=AzureCliDownloadConfig, -) - -azure_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=AzureCliConnectionConfig, - uploader_config=AzureUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/fsspec/box.py b/unstructured/ingest/v2/cli/cmds/fsspec/box.py deleted file mode 100644 index 99241b917..000000000 --- a/unstructured/ingest/v2/cli/cmds/fsspec/box.py +++ /dev/null @@ -1,58 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.cmds.fsspec.fsspec import ( - FsspecCliDownloadConfig, - FsspecCliIndexerConfig, - FsspecCliUploaderConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.fsspec.box import ( - CONNECTOR_TYPE, -) - - -@dataclass -class BoxCliDownloadConfig(FsspecCliDownloadConfig): - pass - - -@dataclass -class BoxCliIndexerConfig(FsspecCliIndexerConfig): - pass - - -@dataclass -class BoxCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--box-app-config"], - default=None, - type=click.Path(), - help="Path to Box app credentials as json file.", - ), - ] - return options - - -@dataclass -class BoxUploaderConfig(FsspecCliUploaderConfig): - pass - - -box_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - indexer_config=BoxCliIndexerConfig, - connection_config=BoxCliConnectionConfig, - downloader_config=BoxCliDownloadConfig, -) - -box_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=BoxCliConnectionConfig, - uploader_config=BoxUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/fsspec/dropbox.py b/unstructured/ingest/v2/cli/cmds/fsspec/dropbox.py deleted file mode 100644 index 7b7c4406d..000000000 --- a/unstructured/ingest/v2/cli/cmds/fsspec/dropbox.py +++ /dev/null @@ -1,58 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.cmds.fsspec.fsspec import ( - FsspecCliDownloadConfig, - FsspecCliIndexerConfig, - FsspecCliUploaderConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.fsspec.dropbox import ( - CONNECTOR_TYPE, -) - - -@dataclass -class DropboxCliDownloadConfig(FsspecCliDownloadConfig): - pass - - -@dataclass -class DropboxCliIndexerConfig(FsspecCliIndexerConfig): - pass - - -@dataclass -class DropboxCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--token"], - required=True, - type=str, - help="Dropbox access token.", - ), - ] - return options - - -@dataclass -class DropboxUploaderConfig(FsspecCliUploaderConfig): - pass - - -dropbox_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - indexer_config=DropboxCliIndexerConfig, - connection_config=DropboxCliConnectionConfig, - downloader_config=DropboxCliDownloadConfig, -) - -dropbox_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=DropboxCliConnectionConfig, - uploader_config=DropboxUploaderConfig, -) diff --git 
a/unstructured/ingest/v2/cli/cmds/fsspec/fsspec.py b/unstructured/ingest/v2/cli/cmds/fsspec/fsspec.py deleted file mode 100644 index 858586c76..000000000 --- a/unstructured/ingest/v2/cli/cmds/fsspec/fsspec.py +++ /dev/null @@ -1,77 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString - - -@dataclass -class FsspecCliDownloadConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [ - click.Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to a location at" - "`$HOME/.cache/unstructured/ingest//`.", - ), - ] - - -@dataclass -class FsspecCliFileConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [ - click.Option( - ["--remote-url"], - required=True, - help="Remote fsspec URL formatted as `protocol://dir/path`", - ) - ] - - -@dataclass -class FsspecCliUploaderConfig(FsspecCliFileConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = super(FsspecCliUploaderConfig, FsspecCliUploaderConfig).get_cli_options() - options.extend( - [ - click.Option( - ["--overwrite"], - is_flag=True, - default=False, - show_default=True, - help="If set, will overwrite content if content already exists", - ) - ] - ) - return options - - -@dataclass -class FsspecCliIndexerConfig(FsspecCliFileConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = super(FsspecCliIndexerConfig, FsspecCliIndexerConfig).get_cli_options() - options.extend( - [ - click.Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders " - "otherwise stop at the files in provided folder level.", - ), - click.Option( - ["--file-glob"], - default=None, - type=DelimitedString(), - help="A comma-separated list of file globs to limit which types of " - "local files are accepted, e.g. '*.html,*.txt'", - ), - ] - ) - return options diff --git a/unstructured/ingest/v2/cli/cmds/fsspec/gcs.py b/unstructured/ingest/v2/cli/cmds/fsspec/gcs.py deleted file mode 100644 index 7464d7769..000000000 --- a/unstructured/ingest/v2/cli/cmds/fsspec/gcs.py +++ /dev/null @@ -1,81 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.cmds.fsspec.fsspec import ( - FsspecCliDownloadConfig, - FsspecCliIndexerConfig, - FsspecCliUploaderConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import FileOrJson -from unstructured.ingest.v2.processes.connectors.fsspec.gcs import ( - CONNECTOR_TYPE, -) - - -@dataclass -class GcsCliDownloadConfig(FsspecCliDownloadConfig): - pass - - -@dataclass -class GcsCliIndexerConfig(FsspecCliIndexerConfig): - pass - - -@dataclass -class GcsCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - help_string = """ - Options: - - ``None``, GCSFS will attempt to guess your credentials in the - following order: gcloud CLI default, gcsfs cached token, google compute - metadata service, anonymous. - - ``'google_default'``, your default gcloud credentials will be used, - which are typically established by doing ``gcloud login`` in a terminal. 
- - ``'cache'``, credentials from previously successful gcsfs - authentication will be used (use this after "browser" auth succeeded) - - ``'anon'``, no authentication is performed, and you can only - access data which is accessible to allUsers (in this case, the project and - access level parameters are meaningless) - - ``'browser'``, you get an access code with which you can - authenticate via a specially provided URL - - if ``'cloud'``, we assume we are running within google compute - or google container engine, and query the internal metadata directly for - a token. - - you may supply a token generated by the - [gcloud](https://cloud.google.com/sdk/docs/) - utility; this is either a python dictionary or the name of a file - containing the JSON returned by logging in with the gcloud CLI tool. - """ - options = [ - click.Option( - ["--service-account-key"], - default=None, - type=FileOrJson(allow_raw_str=True), - help=help_string, - ), - ] - return options - - -@dataclass -class GcsUploaderConfig(FsspecCliUploaderConfig): - pass - - -gcs_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - indexer_config=GcsCliIndexerConfig, - connection_config=GcsCliConnectionConfig, - downloader_config=GcsCliDownloadConfig, -) - -gcs_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=GcsCliConnectionConfig, - uploader_config=GcsUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/fsspec/s3.py b/unstructured/ingest/v2/cli/cmds/fsspec/s3.py deleted file mode 100644 index 4af72d4d4..000000000 --- a/unstructured/ingest/v2/cli/cmds/fsspec/s3.py +++ /dev/null @@ -1,84 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.cmds.fsspec.fsspec import ( - FsspecCliDownloadConfig, - FsspecCliIndexerConfig, - FsspecCliUploaderConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.fsspec.s3 import ( - CONNECTOR_TYPE, -) - - -@dataclass -class S3CliDownloadConfig(FsspecCliDownloadConfig): - pass - - -@dataclass -class S3CliIndexerConfig(FsspecCliIndexerConfig): - pass - - -@dataclass -class S3CliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--anonymous"], - is_flag=True, - default=False, - help="Connect to s3 without local AWS credentials.", - ), - click.Option( - ["--endpoint-url"], - type=str, - default=None, - help="Use this endpoint_url, if specified. Needed for " - "connecting to non-AWS S3 buckets.", - ), - click.Option( - ["--key"], - type=str, - default=None, - help="If not anonymous, use this access key ID, if specified. 
Takes precedence " - "over `aws_access_key_id` in client_kwargs.", - ), - click.Option( - ["--secret"], - type=str, - default=None, - help="If not anonymous, use this secret access key, if specified.", - ), - click.Option( - ["--token"], - type=str, - default=None, - help="If not anonymous, use this security token, if specified.", - ), - ] - return options - - -@dataclass -class S3UploaderConfig(FsspecCliUploaderConfig): - pass - - -s3_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - indexer_config=S3CliIndexerConfig, - connection_config=S3CliConnectionConfig, - downloader_config=S3CliDownloadConfig, -) - -s3_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=S3CliConnectionConfig, - uploader_config=S3UploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/fsspec/sftp.py b/unstructured/ingest/v2/cli/cmds/fsspec/sftp.py deleted file mode 100644 index b4bfcb6c8..000000000 --- a/unstructured/ingest/v2/cli/cmds/fsspec/sftp.py +++ /dev/null @@ -1,80 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.cmds.fsspec.fsspec import ( - FsspecCliDownloadConfig, - FsspecCliIndexerConfig, - FsspecCliUploaderConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.fsspec.sftp import ( - CONNECTOR_TYPE, -) - - -@dataclass -class SftpCliDownloadConfig(FsspecCliDownloadConfig): - pass - - -@dataclass -class SftpCliIndexerConfig(FsspecCliIndexerConfig): - pass - - -@dataclass -class SftpCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--username"], - required=True, - type=str, - help="Username for sftp connection", - ), - click.Option( - ["--password"], - required=True, - type=str, - help="Password for sftp connection", - ), - click.Option( - ["--look-for-keys"], - required=False, - default=False, - is_flag=True, - type=bool, - help="Whether to search for private key files in ~/.ssh/", - ), - click.Option( - ["--allow-agent"], - required=False, - default=False, - is_flag=True, - type=bool, - help="Whether to connect to the SSH agent.", - ), - ] - return options - - -@dataclass -class SftpUploaderConfig(FsspecCliUploaderConfig): - pass - - -sftp_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - indexer_config=SftpCliIndexerConfig, - connection_config=SftpCliConnectionConfig, - downloader_config=SftpCliDownloadConfig, -) - -sftp_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=SftpCliConnectionConfig, - uploader_config=SftpUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/google_drive.py b/unstructured/ingest/v2/cli/cmds/google_drive.py deleted file mode 100644 index 2a8d7960c..000000000 --- a/unstructured/ingest/v2/cli/cmds/google_drive.py +++ /dev/null @@ -1,74 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import SrcCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString, FileOrJson -from unstructured.ingest.v2.processes.connectors.google_drive import CONNECTOR_TYPE - - -@dataclass -class GoogleDriveCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--drive-id"], - required=True, - type=str, - help="Google Drive File or Folder ID.", - ), - click.Option( - ["--service-account-key"], - required=True, - 
type=FileOrJson(), - help="Either the file path of the credentials file to use or a json string of " - "those values to use for authentication", - ), - ] - return options - - -@dataclass -class GoogleDriveCliIndexerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--extensions"], - default=None, - type=DelimitedString(), - help="Filters the files to be processed based on extension e.g. jpg, docx, etc.", - ), - click.Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders " - "otherwise stop at the files in provided folder level.", - ), - ] - return options - - -@dataclass -class GoogleDriveCliDownloadConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to a location at" - "`$HOME/.cache/unstructured/ingest//`.", - ), - ] - return options - - -google_drive_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=GoogleDriveCliConnectionConfig, - indexer_config=GoogleDriveCliIndexerConfig, - downloader_config=GoogleDriveCliDownloadConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/local.py b/unstructured/ingest/v2/cli/cmds/local.py deleted file mode 100644 index f9ab17308..000000000 --- a/unstructured/ingest/v2/cli/cmds/local.py +++ /dev/null @@ -1,60 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString -from unstructured.ingest.v2.processes.connectors.local import CONNECTOR_TYPE - - -@dataclass -class LocalCliIndexerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--input-path"], - required=True, - type=click.Path(file_okay=True, dir_okay=True, exists=True), - help="Path to the location in the local file system that will be processed.", - ), - click.Option( - ["--file-glob"], - default=None, - type=DelimitedString(), - help="A comma-separated list of file globs to limit which types of " - "local files are accepted, e.g. 
'*.html,*.txt'", - ), - click.Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders " - "otherwise stop at the files in provided folder level.", - ), - ] - return options - - -@dataclass -class LocalCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--output-dir"], - required=True, - type=str, - help="Local path to write partitioned output to", - ) - ] - return options - - -local_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - indexer_config=LocalCliIndexerConfig, -) - -local_dest_cmd = DestCmd(cmd_name=CONNECTOR_TYPE, uploader_config=LocalCliUploaderConfig) diff --git a/unstructured/ingest/v2/cli/cmds/mongodb.py b/unstructured/ingest/v2/cli/cmds/mongodb.py deleted file mode 100644 index 49ad3e53d..000000000 --- a/unstructured/ingest/v2/cli/cmds/mongodb.py +++ /dev/null @@ -1,62 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.mongodb import CONNECTOR_TYPE - - -@dataclass -class MongoDBCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--uri"], - help="URI to user when connecting", - ), - click.Option( - ["--host"], - help="hostname or IP address or Unix domain socket path of a single mongod or " - "mongos instance to connect to, or a list of hostnames", - ), - click.Option(["--port"], type=int, default=27017), - click.Option( - ["--database"], type=str, required=True, help="database name to connect to" - ), - click.Option( - ["--collection"], required=True, type=str, help="collection name to connect to" - ), - ] - return options - - -@dataclass -class MongoDBCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ) - ] - return options - - -@dataclass -class MongoDBCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [] - - -mongodb_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=MongoDBCliConnectionConfig, - uploader_config=MongoDBCliUploaderConfig, - upload_stager_config=MongoDBCliUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/onedrive.py b/unstructured/ingest/v2/cli/cmds/onedrive.py deleted file mode 100644 index d9bc7df2c..000000000 --- a/unstructured/ingest/v2/cli/cmds/onedrive.py +++ /dev/null @@ -1,91 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import SrcCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.onedrive import CONNECTOR_TYPE - - -@dataclass -class OnedriveCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--client-id"], - required=True, - type=str, - help="Microsoft app client ID", - ), - click.Option( - ["--client-cred"], - required=True, - type=str, - help="Microsoft App client secret", - ), - click.Option( - ["--user-pname"], - required=True, - type=str, - help="User principal name, usually is your Azure AD email.", - ), - click.Option( - ["--tenant"], - default="common", - type=str, - help="ID or domain name associated with your Azure AD 
instance", - ), - click.Option( - ["--authority-url"], - default="https://login.microsoftonline.com", - type=str, - help="Authentication token provider for Microsoft apps, default is " - "https://login.microsoftonline.com", - ), - ] - return options - - -@dataclass -class OnedriveCliIndexerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--path"], - default=None, - type=str, - help="Folder to start parsing files from.", - ), - click.Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders " - "otherwise stop at the files in provided folder level.", - ), - ] - return options - - -@dataclass -class OnedriveCliDownloadConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to a location at" - "`$HOME/.cache/unstructured/ingest//`.", - ), - ] - return options - - -onedrive_drive_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=OnedriveCliConnectionConfig, - indexer_config=OnedriveCliIndexerConfig, - downloader_config=OnedriveCliDownloadConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/opensearch.py b/unstructured/ingest/v2/cli/cmds/opensearch.py deleted file mode 100644 index 8d93b7be3..000000000 --- a/unstructured/ingest/v2/cli/cmds/opensearch.py +++ /dev/null @@ -1,93 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd, SrcCmd -from unstructured.ingest.v2.cli.cmds.elasticsearch import ( - ElasticsearchCliDownloadConfig, - ElasticsearchCliIndexerConfig, - ElasticsearchCliUploadStagerConfig, - ElasticsearchUploaderConfig, -) -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString -from unstructured.ingest.v2.processes.connectors.opensearch import CONNECTOR_TYPE - - -@dataclass -class OpenSearchCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--hosts"], - type=DelimitedString(), - help='List of the OpenSearch hosts to connect to, e.g. 
"http://localhost:9200"', - ), - click.Option( - ["--username"], type=str, default=None, help="username when using basic auth" - ), - click.Option( - ["--password"], - type=str, - default=None, - help="password when using basic auth", - ), - click.Option( - ["--use-ssl"], - type=bool, - default=False, - is_flag=True, - help="use ssl for the connection", - ), - click.Option( - ["--verify-certs"], - type=bool, - default=False, - is_flag=True, - help="whether to verify SSL certificates", - ), - click.Option( - ["--ssl-show-warn"], - type=bool, - default=False, - is_flag=True, - help="show warning when verify certs is disabled", - ), - click.Option( - ["--ca-certs"], - type=click.Path(), - default=None, - help="path to CA bundle", - ), - click.Option( - ["--client-cert"], - type=click.Path(), - default=None, - help="path to the file containing the private key and the certificate," - " or cert only if using client_key", - ), - click.Option( - ["--client-key"], - type=click.Path(), - default=None, - help="path to the file containing the private key" - " if using separate cert and key files", - ), - ] - return options - - -opensearch_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=OpenSearchCliConnectionConfig, - indexer_config=ElasticsearchCliIndexerConfig, - downloader_config=ElasticsearchCliDownloadConfig, -) - -opensearch_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=OpenSearchCliConnectionConfig, - upload_stager_config=ElasticsearchCliUploadStagerConfig, - uploader_config=ElasticsearchUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/pinecone.py b/unstructured/ingest/v2/cli/cmds/pinecone.py deleted file mode 100644 index 010cc703c..000000000 --- a/unstructured/ingest/v2/cli/cmds/pinecone.py +++ /dev/null @@ -1,62 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.pinecone import CONNECTOR_TYPE - - -@dataclass -class PineconeCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--api-key"], - required=True, - type=str, - help="API key for Pinecone.", - ), - click.Option( - ["--index-name"], - required=True, - type=str, - help="Name of the index to connect to. Example: my-index", - ), - click.Option( - ["--environment"], - required=True, - type=str, - help="Environment to connect to. 
Example: us-east-1", - ), - ] - return options - - -@dataclass -class PineconeCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ), - click.Option( - ["--num-processes"], - default=4, - type=int, - help="Number of processes to use for uploading", - ), - ] - return options - - -pinecone_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=PineconeCliConnectionConfig, - uploader_config=PineconeCliUploaderConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/salesforce.py b/unstructured/ingest/v2/cli/cmds/salesforce.py deleted file mode 100644 index ac910b546..000000000 --- a/unstructured/ingest/v2/cli/cmds/salesforce.py +++ /dev/null @@ -1,79 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import SrcCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString -from unstructured.ingest.v2.processes.connectors.salesforce import ( - ACCEPTED_CATEGORIES, - CONNECTOR_TYPE, -) - - -@dataclass -class SalesforceCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--username"], - required=True, - type=str, - help="Salesforce username usually looks like an email.", - ), - click.Option( - ["--consumer-key"], - required=True, - type=str, - help="For the Salesforce JWT auth. Found in Consumer Details.", - ), - click.Option( - ["--private-key"], - required=True, - type=str, - help="Path to the private key or its contents for the Salesforce JWT auth. " - "Key file is usually named server.key.", - ), - ] - return options - - -@dataclass -class SalesforceCliIndexerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - possible_categories = ACCEPTED_CATEGORIES - options = [ - click.Option( - ["--categories"], - default=None, - required=True, - type=DelimitedString(choices=possible_categories), - help="Comma-delimited salesforce categories to download. 
" - "Currently only {}.".format(", ".join(possible_categories)), - ), - ] - return options - - -@dataclass -class SalesforceCliDownloadConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to a location at" - "`$HOME/.cache/unstructured/ingest//`.", - ), - ] - return options - - -salesforce_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=SalesforceCliConnectionConfig, - indexer_config=SalesforceCliIndexerConfig, - downloader_config=SalesforceCliDownloadConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/sharepoint.py b/unstructured/ingest/v2/cli/cmds/sharepoint.py deleted file mode 100644 index 27d5cf3ed..000000000 --- a/unstructured/ingest/v2/cli/cmds/sharepoint.py +++ /dev/null @@ -1,112 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import SrcCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.sharepoint import CONNECTOR_TYPE - - -@dataclass -class SharepointCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--client-id"], - default=None, - type=str, - help="Sharepoint app client ID", - ), - click.Option( - ["--client-cred"], - default=None, - type=str, - help="Sharepoint app secret", - ), - click.Option( - ["--site"], - default=None, - type=str, - help="Sharepoint site url. Process either base url e.g \ - https://[tenant].sharepoint.com or relative sites \ - https://[tenant].sharepoint.com/sites/. \ - To process all sites within the tenant pass a site url as \ - https://[tenant]-admin.sharepoint.com.\ - This requires the app to be registered at a tenant level", - ), - click.Option( - ["--permissions-application-id"], - type=str, - help="Microsoft Graph API application id", - ), - click.Option( - ["--permissions-client-cred"], - type=str, - help="Microsoft Graph API application credentials", - ), - click.Option( - ["--permissions-tenant"], - type=str, - help="e.g https://contoso.onmicrosoft.com to get permissions data within tenant.", - ), - ] - return options - - -@dataclass -class SharepointCliIndexerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--path"], - default=None, - type=str, - help="Path from which to start parsing files. 
If the connector is to \ - process all sites within the tenant this filter will be applied to \ - all sites document libraries.", - ), - click.Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders " - "otherwise stop at the files in provided folder level.", - ), - click.Option( - ["--omit-files"], - is_flag=True, - default=False, - help="Don't process files.", - ), - click.Option( - ["--omit-pages"], - is_flag=True, - default=False, - help="Don't process site pages.", - ), - ] - return options - - -@dataclass -class SharepointCliDownloadConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to a location at" - "`$HOME/.cache/unstructured/ingest//`.", - ), - ] - return options - - -sharepoint_drive_src_cmd = SrcCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=SharepointCliConnectionConfig, - indexer_config=SharepointCliIndexerConfig, - downloader_config=SharepointCliDownloadConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/singlestore.py b/unstructured/ingest/v2/cli/cmds/singlestore.py deleted file mode 100644 index 1b7809d09..000000000 --- a/unstructured/ingest/v2/cli/cmds/singlestore.py +++ /dev/null @@ -1,96 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.singlestore import CONNECTOR_TYPE - - -@dataclass -class SingleStoreCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--host"], - required=False, - type=str, - default=None, - help="SingleStore host", - ), - click.Option( - ["--port"], - required=False, - type=int, - default=None, - help="SingleStore port", - ), - click.Option( - ["--user"], - required=False, - type=str, - default=None, - help="SingleStore user", - ), - click.Option( - ["--password"], - required=False, - type=str, - default=None, - help="SingleStore password", - ), - click.Option( - ["--database"], - required=False, - type=str, - default=None, - help="SingleStore database", - ), - ] - return options - - -@dataclass -class SingleStoreCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--drop-empty-cols"], - required=False, - type=bool, - is_flag=True, - default=False, - help="Drop any columns that have no data", - ), - ] - return options - - -@dataclass -class SingleStoreCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [ - click.Option( - ["--table-name"], - required=False, - type=str, - help="SingleStore table to write contents to", - ), - click.Option( - ["--batch-size"], - required=False, - type=click.IntRange(min=1), - help="Batch size when writing to SingleStore", - ), - ] - - -singlestore_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=SingleStoreCliConnectionConfig, - uploader_config=SingleStoreCliUploaderConfig, - upload_stager_config=SingleStoreCliUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/sql.py b/unstructured/ingest/v2/cli/cmds/sql.py deleted file mode 100644 index b36f3c3ac..000000000 --- a/unstructured/ingest/v2/cli/cmds/sql.py +++ /dev/null @@ -1,84 +0,0 @@ -from dataclasses import dataclass - -import click - -from 
unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.processes.connectors.sql import CONNECTOR_TYPE - -SQL_DRIVERS = {"postgresql", "sqlite"} - - -@dataclass -class SQLCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--db-type"], - required=True, - type=click.Choice(SQL_DRIVERS), - help="Type of the database backend", - ), - click.Option( - ["--username"], - default=None, - type=str, - help="DB username", - ), - click.Option( - ["--password"], - default=None, - type=str, - help="DB password", - ), - click.Option( - ["--host"], - default=None, - type=str, - help="DB host", - ), - click.Option( - ["--port"], - default=None, - type=int, - help="DB host connection port", - ), - click.Option( - ["--database"], - default=None, - type=str, - help="Database name. For sqlite databases, this is the path to the .db file.", - ), - ] - return options - - -@dataclass -class SQLCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ) - ] - return options - - -@dataclass -class SQLCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [] - - -sql_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=SQLCliConnectionConfig, - uploader_config=SQLCliUploaderConfig, - upload_stager_config=SQLCliUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/cli/cmds/weaviate.py b/unstructured/ingest/v2/cli/cmds/weaviate.py deleted file mode 100644 index aaa051d05..000000000 --- a/unstructured/ingest/v2/cli/cmds/weaviate.py +++ /dev/null @@ -1,100 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.base import DestCmd -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString -from unstructured.ingest.v2.processes.connectors.weaviate import CONNECTOR_TYPE - - -@dataclass -class WeaviateCliConnectionConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--host-url"], - required=True, - help="Weaviate instance url", - ), - click.Option( - ["--class-name"], - default=None, - type=str, - help="Name of the class to push the records into, e.g: Pdf-elements", - ), - click.Option( - ["--access-token"], default=None, type=str, help="Used to create the bearer token." - ), - click.Option( - ["--refresh-token"], - default=None, - type=str, - help="Will tie this value to the bearer token. 
If not provided, " - "the authentication will expire once the lifetime of the access token is up.", - ), - click.Option( - ["--api-key"], - default=None, - type=str, - ), - click.Option( - ["--client-secret"], - default=None, - type=str, - ), - click.Option( - ["--scope"], - default=None, - type=DelimitedString(), - ), - click.Option( - ["--username"], - default=None, - type=str, - ), - click.Option( - ["--password"], - default=None, - type=str, - ), - click.Option( - ["--anonymous"], - is_flag=True, - default=False, - type=bool, - help="if set, all auth values will be ignored", - ), - ] - return options - - -@dataclass -class WeaviateCliUploaderConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--batch-size"], - default=100, - type=int, - help="Number of records per batch", - ) - ] - return options - - -@dataclass -class WeaviateCliUploadStagerConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - return [] - - -weaviate_dest_cmd = DestCmd( - cmd_name=CONNECTOR_TYPE, - connection_config=WeaviateCliConnectionConfig, - uploader_config=WeaviateCliUploaderConfig, - upload_stager_config=WeaviateCliUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/cli/configs/__init__.py b/unstructured/ingest/v2/cli/configs/__init__.py deleted file mode 100644 index 2b3a42192..000000000 --- a/unstructured/ingest/v2/cli/configs/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .chunk import ChunkerCliConfig -from .embed import EmbedderCliConfig -from .partition import PartitionerCliConfig -from .processor import ProcessorCliConfig - -__all__ = ["ChunkerCliConfig", "ProcessorCliConfig", "PartitionerCliConfig", "EmbedderCliConfig"] diff --git a/unstructured/ingest/v2/cli/configs/chunk.py b/unstructured/ingest/v2/cli/configs/chunk.py deleted file mode 100644 index b6f79641d..000000000 --- a/unstructured/ingest/v2/cli/configs/chunk.py +++ /dev/null @@ -1,89 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.chunking import CHUNK_MAX_CHARS_DEFAULT, CHUNK_MULTI_PAGE_DEFAULT -from unstructured.ingest.v2.cli.interfaces import CliConfig - - -@dataclass -class ChunkerCliConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--chunking-strategy"], - type=str, - default=None, - help="The rule-set to use to form chunks. Omit to disable chunking.", - ), - click.Option( - ["--chunk-combine-text-under-n-chars"], - type=int, - help=( - "Combine consecutive chunks when the first does not exceed this length and" - " the second will fit without exceeding the hard-maximum length. Only" - " operative for 'by_title' chunking-strategy." - ), - ), - click.Option( - ["--chunk-include-orig-elements/--chunk-no-include-orig-elements"], - is_flag=True, - default=True, - help=( - "When chunking, add the original elements consolidated to form each chunk to" - " `.metadata.orig_elements` on that chunk." - ), - ), - click.Option( - ["--chunk-max-characters"], - type=int, - default=CHUNK_MAX_CHARS_DEFAULT, - show_default=True, - help=( - "Hard maximum chunk length. No chunk will exceed this length. An oversized" - " element will be divided by text-splitting to fit this window." - ), - ), - click.Option( - ["--chunk-multipage-sections/--chunk-no-multipage-sections"], - is_flag=True, - default=CHUNK_MULTI_PAGE_DEFAULT, - help=( - "Ignore page boundaries when chunking such that elements from two different" - " pages can appear in the same chunk. 
Only operative for 'by_title'" - " chunking-strategy." - ), - ), - click.Option( - ["--chunk-new-after-n-chars"], - type=int, - help=( - "Soft-maximum chunk length. Another element will not be added to a chunk of" - " this length even when it would fit without exceeding the hard-maximum" - " length." - ), - ), - click.Option( - ["--chunk-overlap"], - type=int, - default=0, - show_default=True, - help=( - "Prefix chunk text with last overlap=N characters of prior chunk. Only" - " applies to oversized chunks divided by text-splitting. To apply overlap to" - " non-oversized chunks use the --overlap-all option." - ), - ), - click.Option( - ["--chunk-overlap-all"], - is_flag=True, - default=False, - help=( - "Apply overlap to chunks formed from whole elements as well as those formed" - " by text-splitting oversized elements. Overlap length is take from --overlap" - " option value." - ), - ), - ] - return options diff --git a/unstructured/ingest/v2/cli/configs/embed.py b/unstructured/ingest/v2/cli/configs/embed.py deleted file mode 100644 index 69f6bc657..000000000 --- a/unstructured/ingest/v2/cli/configs/embed.py +++ /dev/null @@ -1,74 +0,0 @@ -from dataclasses import dataclass -from typing import Any - -import click -from dataclasses_json.core import Json - -from unstructured.embed import EMBEDDING_PROVIDER_TO_CLASS_MAP -from unstructured.ingest.v2.cli.interfaces import CliConfig - - -@dataclass -class EmbedderCliConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--embedding-provider"], - help="Type of the embedding class to be used. Can be one of: " - f"{list(EMBEDDING_PROVIDER_TO_CLASS_MAP)}", - type=click.Choice(list(EMBEDDING_PROVIDER_TO_CLASS_MAP)), - ), - click.Option( - ["--embedding-api-key"], - help="API key for the embedding model, for the case an API key is needed.", - type=str, - default=None, - ), - click.Option( - ["--embedding-model-name"], - help="Embedding model name, if needed. " - "Chooses a particular LLM between different options, to embed with it.", - type=str, - default=None, - ), - click.Option( - ["--embedding-aws-access-key-id"], - help="AWS access key used for AWS-based embedders, such as bedrock", - type=str, - default=None, - ), - click.Option( - ["--embedding-aws-secret-access-key"], - help="AWS secret key used for AWS-based embedders, such as bedrock", - type=str, - default=None, - ), - click.Option( - ["--embedding-aws-region"], - help="AWS region used for AWS-based embedders, such as bedrock", - type=str, - default="us-west-2", - ), - ] - return options - - @classmethod - def from_dict(cls, kvs: Json, **kwargs: Any): - """ - Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. 
- This allows CLI arguments to be prepended with embedding_ during CLI invocation but - doesn't require that as part of the field names in this class - """ - if isinstance(kvs, dict): - new_kvs = { - k[len("embedding_") :]: v # noqa: E203 - for k, v in kvs.items() - if k.startswith("embedding_") - } - if len(new_kvs.keys()) == 0: - return None - if not new_kvs.get("provider"): - return None - return super().from_dict(new_kvs, **kwargs) - return super().from_dict(kvs, **kwargs) diff --git a/unstructured/ingest/v2/cli/configs/partition.py b/unstructured/ingest/v2/cli/configs/partition.py deleted file mode 100644 index 5ec5c0dbe..000000000 --- a/unstructured/ingest/v2/cli/configs/partition.py +++ /dev/null @@ -1,99 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.cli.utils import DelimitedString, Dict - - -@dataclass -class PartitionerCliConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--strategy"], - default="auto", - help="The method that will be used to process the documents. " - "Default: auto. Other strategies include `fast` and `hi_res`.", - ), - click.Option( - ["--ocr-languages"], - default=None, - type=DelimitedString(delimiter="+"), - help="A list of language packs to specify which languages to use for OCR, " - "separated by '+' e.g. 'eng+deu' to use the English and German language packs. " - "The appropriate Tesseract " - "language pack needs to be installed.", - ), - click.Option( - ["--encoding"], - default=None, - help="Text encoding to use when reading documents. By default the encoding is " - "detected automatically.", - ), - click.Option( - ["--skip-infer-table-types"], - type=DelimitedString(), - default=None, - help="Optional list of document types to skip table extraction on", - ), - click.Option( - ["--additional-partition-args"], - type=Dict(), - help="A json string representation of values to pass through to partition()", - ), - click.Option( - ["--fields-include"], - type=DelimitedString(), - default=["element_id", "text", "type", "metadata", "embeddings"], - help="Comma-delimited list. If set, include the specified top-level " - "fields in an element.", - ), - click.Option( - ["--flatten-metadata"], - is_flag=True, - default=False, - help="Results in flattened json elements. " - "Specifically, the metadata key values are brought to " - "the top-level of the element, and the `metadata` key itself is removed.", - ), - click.Option( - ["--metadata-include"], - default=[], - type=DelimitedString(), - help="Comma-delimited list. If set, include the specified metadata " - "fields if they exist and drop all other fields. ", - ), - click.Option( - ["--metadata-exclude"], - default=[], - type=DelimitedString(), - help="Comma-delimited list. If set, drop the specified metadata " - "fields if they exist.", - ), - click.Option( - ["--partition-by-api"], - is_flag=True, - default=False, - help="Use a remote API to partition the files." - " Otherwise, use the function from partition.auto", - ), - click.Option( - ["--partition-endpoint"], - default="https://api.unstructured.io/general/v0/general", - help="If partitioning via api, use the following host. 
" - "Default: https://api.unstructured.io/general/v0/general", - ), - click.Option( - ["--api-key"], - default=None, - help="API Key for partition endpoint.", - ), - click.Option( - ["--hi-res-model-name"], - default=None, - help="Model name for hi-res strategy.", - ), - ] - return options diff --git a/unstructured/ingest/v2/cli/configs/processor.py b/unstructured/ingest/v2/cli/configs/processor.py deleted file mode 100644 index b9236fad5..000000000 --- a/unstructured/ingest/v2/cli/configs/processor.py +++ /dev/null @@ -1,88 +0,0 @@ -from dataclasses import dataclass - -import click - -from unstructured.ingest.v2.cli.interfaces import CliConfig -from unstructured.ingest.v2.interfaces.processor import DEFAULT_WORK_DIR - - -@dataclass -class ProcessorCliConfig(CliConfig): - @staticmethod - def get_cli_options() -> list[click.Option]: - options = [ - click.Option( - ["--reprocess"], - is_flag=True, - default=False, - help="Reprocess a downloaded file even if the relevant structured " - "output .json file in output directory already exists.", - ), - click.Option( - ["--work-dir"], - type=str, - default=DEFAULT_WORK_DIR, - show_default=True, - help="Where to place working files when processing each step", - ), - click.Option( - ["--num-processes"], - default=2, - show_default=True, - type=click.IntRange(min=1), - help="Number of parallel processes with which to process docs", - ), - click.Option( - ["--max-connections"], - default=None, - show_default=True, - type=click.IntRange(min=1), - help="Max number of connections allowed when running an async step", - ), - click.Option( - ["--raise-on-error"], - is_flag=True, - default=False, - help="Is set, will raise error if any doc in the pipeline fail. Otherwise will " - "log error and continue with other docs", - ), - click.Option( - ["--re-download"], - is_flag=True, - default=False, - help="Re-download files even if they are already present in download dir.", - ), - click.Option( - ["--preserve-downloads"], - is_flag=True, - default=False, - help="Preserve downloaded files. Otherwise each file is removed " - "after being processed successfully.", - ), - click.Option( - ["--download-only"], - is_flag=True, - default=False, - help="Download any files that are not already present in either --download-dir or " - "the default download ~/.cache/... location in case --download-dir " - "is not specified and " - "skip processing them through unstructured.", - ), - click.Option( - ["--max-docs"], - default=None, - type=int, - help="If specified, process at most the specified number of documents.", - ), - click.Option( - ["--uncompress"], - type=bool, - default=False, - is_flag=True, - help="Uncompress any archived files. 
Currently supporting zip and tar " - "files based on file extension.", - ), - click.Option(["--verbose"], is_flag=True, default=False), - click.Option(["--tqdm"], is_flag=True, default=False, help="Show progress bar"), - ] - return options diff --git a/unstructured/ingest/v2/cli/interfaces.py b/unstructured/ingest/v2/cli/interfaces.py deleted file mode 100644 index 2a8a0e18b..000000000 --- a/unstructured/ingest/v2/cli/interfaces.py +++ /dev/null @@ -1,27 +0,0 @@ -from abc import ABC, abstractmethod - -import click - - -class CliConfig(ABC): - @staticmethod - @abstractmethod - def get_cli_options() -> list[click.Option]: - pass - - @classmethod - def add_cli_options(cls, cmd: click.Command) -> None: - options_to_add = cls.get_cli_options() - CliConfig.add_params(cmd, params=options_to_add) - - @staticmethod - def add_params(cmd: click.Command, params: list[click.Parameter]): - existing_opts = [] - for param in cmd.params: - existing_opts.extend(param.opts) - for param in params: - for opt in param.opts: - if opt in existing_opts: - raise ValueError(f"{opt} is already defined on the command {cmd.name}") - existing_opts.append(opt) - cmd.params.append(param) diff --git a/unstructured/ingest/v2/cli/utils.py b/unstructured/ingest/v2/cli/utils.py deleted file mode 100644 index 66d414f61..000000000 --- a/unstructured/ingest/v2/cli/utils.py +++ /dev/null @@ -1,240 +0,0 @@ -import json -import os.path -import sys -from dataclasses import fields, is_dataclass -from gettext import gettext, ngettext -from gettext import gettext as _ -from pathlib import Path -from typing import Any, ForwardRef, Optional, Type, TypeVar, Union, get_args, get_origin - -import click - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.v2.logger import logger - - -def conform_click_options(options: dict[str, Any]) -> None: - # Click sets all multiple fields as tuple, this needs to be updated to list - for k, v in options.items(): - if isinstance(v, tuple): - options[k] = list(v) - - -class Dict(click.ParamType): - name = "dict" - - def convert( - self, - value: Any, - param: Optional[click.Parameter] = None, - ctx: Optional[click.Context] = None, - ) -> Any: - try: - return json.loads(value) - except json.JSONDecodeError: - self.fail( - gettext( - "{value} is not a valid json value.", - ).format(value=value), - param, - ctx, - ) - - -class FileOrJson(click.ParamType): - name = "file-or-json" - - def __init__(self, allow_raw_str: bool = False): - self.allow_raw_str = allow_raw_str - - def convert( - self, - value: Any, - param: Optional[click.Parameter] = None, - ctx: Optional[click.Context] = None, - ) -> Any: - # check if valid file - full_path = os.path.abspath(os.path.expanduser(value)) - if os.path.isfile(full_path): - return str(Path(full_path).resolve()) - if isinstance(value, str): - try: - return json.loads(value) - except json.JSONDecodeError: - if self.allow_raw_str: - return value - self.fail( - gettext( - "{value} is not a valid json string nor an existing filepath.", - ).format(value=value), - param, - ctx, - ) - - -class DelimitedString(click.ParamType): - name = "delimited-string" - - def __init__(self, delimiter: str = ",", choices: Optional[list[str]] = None): - self.choices = choices if choices else [] - self.delimiter = delimiter - - def convert( - self, - value: Any, - param: Optional[click.Parameter] = None, - ctx: Optional[click.Context] = None, - ) -> Any: - # In case a list is provided as the default, will not break - if isinstance(value, list): 
- split = [str(v).strip() for v in value] - else: - split = [v.strip() for v in value.split(self.delimiter)] - if not self.choices: - return split - choices_str = ", ".join(map(repr, self.choices)) - for s in split: - if s not in self.choices: - self.fail( - ngettext( - "{value!r} is not {choice}.", - "{value!r} is not one of {choices}.", - len(self.choices), - ).format(value=s, choice=choices_str, choices=choices_str), - param, - ctx, - ) - return split - - -EnhancedDataClassJsonMixinT = TypeVar( - "EnhancedDataClassJsonMixinT", bound=EnhancedDataClassJsonMixin -) - - -def extract_config( - flat_data: dict, config: Type[EnhancedDataClassJsonMixinT] -) -> EnhancedDataClassJsonMixinT: - """ - To be able to extract a nested dataclass from a flat dictionary (as in one coming - from a click-based options input), the config class is dynamically looked through for - nested dataclass fields and new nested dictionaries are created to conform to the - shape the overall class expects when parsing from a dict. During the process, this will create - copies of the original dictionary to avoid pruning fields but this isn't a - problem since the `from_dict()` method ignores unneeded values. - - Not handling more complex edge cases for now such as nested types i.e Union[List[List[...]]] - """ - - def conform_dict(inner_d: dict, inner_config: Type[EnhancedDataClassJsonMixinT]): - # Catch edge cases (i.e. Dict[str, ...]) where underlying type is not a concrete Class, - # causing 'issubclass() arg 1 must be a class' errors, return False - def is_subclass(instance, class_type) -> bool: - try: - return issubclass(instance, class_type) - except Exception: - return False - - dd = inner_d.copy() - for field in fields(inner_config): - f_type = field.type - # typing can be defined using a string, in which case it needs to be resolved - # to the actual type. 
following logic is cherry picked from the typing - # get_type_hints() since type resolution can be expensive, only do it - # when the type is a string - if isinstance(f_type, str): - try: - base_globals = sys.modules[inner_config.__module__].__dict__ - for_ref = ForwardRef(f_type, is_argument=False, is_class=True) - f_type = for_ref._evaluate( - globalns=base_globals, localns=None, recursive_guard=frozenset() - ) - except NameError as e: - logger.warning(f"couldn't resolve type {f_type}: {e}") - # Handle the case where the type of a value if a Union (possibly optional) - if get_origin(f_type) is Union: - union_values = get_args(f_type) - # handle List types - union_values = [ - get_args(u)[0] if get_origin(u) is list else u for u in union_values - ] - # Ignore injected NoneType when optional - concrete_union_values = [v for v in union_values if not is_subclass(v, type(None))] - dataclass_union_values = [v for v in concrete_union_values if is_dataclass(v)] - non_dataclass_union_values = [ - v for v in concrete_union_values if not is_dataclass(v) - ] - if not dataclass_union_values: - continue - # Check if the key for this field already exists in the dictionary, - # if so it might map to one of these non dataclass fields and this - # can't be enforced - if non_dataclass_union_values and field.name in dd: - continue - if len(dataclass_union_values) > 1: - logger.warning( - "more than one dataclass type possible for field {}, " - "not extracting: {}".format(field.name, ", ".join(dataclass_union_values)) - ) - continue - f_type = dataclass_union_values[0] - origin = get_origin(f_type) - if origin: - f_type = origin - if is_subclass(f_type, EnhancedDataClassJsonMixin): - dd[field.name] = conform_dict(inner_d=dd, inner_config=f_type) - return dd - - adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config) - return config.from_dict(adjusted_dict, apply_name_overload=False) - - -class Group(click.Group): - def parse_args(self, ctx, args): - """ - This allows for subcommands to be called with the --help flag without breaking - if parent command is missing any of its required parameters - """ - - try: - return super().parse_args(ctx, args) - except click.MissingParameter: - if "--help" not in args: - raise - - # remove the required params so that help can display - for param in self.params: - param.required = False - return super().parse_args(ctx, args) - - def format_commands(self, ctx: click.Context, formatter: click.HelpFormatter) -> None: - """ - Copy of the original click.Group format_commands() method but replacing - 'Commands' -> 'Destinations' - """ - commands = [] - for subcommand in self.list_commands(ctx): - cmd = self.get_command(ctx, subcommand) - # What is this, the tool lied about a command. 
Ignore it - if cmd is None: - continue - if cmd.hidden: - continue - - commands.append((subcommand, cmd)) - - # allow for 3 times the default spacing - if len(commands): - if formatter.width: - limit = formatter.width - 6 - max(len(cmd[0]) for cmd in commands) - else: - limit = -6 - max(len(cmd[0]) for cmd in commands) - - rows = [] - for subcommand, cmd in commands: - help = cmd.get_short_help_str(limit) - rows.append((subcommand, help)) - - if rows: - with formatter.section(_("Destinations")): - formatter.write_dl(rows) diff --git a/unstructured/ingest/v2/example.py b/unstructured/ingest/v2/example.py deleted file mode 100644 index c4545f926..000000000 --- a/unstructured/ingest/v2/example.py +++ /dev/null @@ -1,37 +0,0 @@ -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.fsspec.s3 import ( - S3ConnectionConfig, - S3DownloaderConfig, - S3IndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.local import ( - LocalUploaderConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig( - work_dir=str(work_dir.resolve()), tqdm=True, reprocess=True, verbose=True - ), - indexer_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"), - downloader_config=S3DownloaderConfig(download_dir=download_path), - source_connection_config=S3ConnectionConfig(anonymous=True), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())), - ).run() diff --git a/unstructured/ingest/v2/examples/example_azure_cognitive_search.py b/unstructured/ingest/v2/examples/example_azure_cognitive_search.py deleted file mode 100644 index f3679ad1b..000000000 --- a/unstructured/ingest/v2/examples/example_azure_cognitive_search.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.azure_cognitive_search import ( - AzureCognitiveSearchAccessConfig, - AzureCognitiveSearchConnectionConfig, - AzureCognitiveSearchUploaderConfig, - AzureCognitiveSearchUploadStagerConfig, -) -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = 
base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - index_name = "ingest-test-destination" - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig( - input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt" - ), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig( - chunking_strategy="by_title", chunk_include_orig_elements=False - ), - embedder_config=EmbedderConfig( - embedding_provider="langchain-openai", embedding_api_key=os.getenv("OPENAI_API_KEY") - ), - destination_connection_config=AzureCognitiveSearchConnectionConfig( - access_config=AzureCognitiveSearchAccessConfig(key=os.getenv("AZURE_SEARCH_API_KEY")), - index=os.getenv("AZURE_SEARCH_INDEX"), - endpoint=os.getenv("AZURE_SEARCH_ENDPOINT"), - ), - uploader_config=AzureCognitiveSearchUploaderConfig(batch_size=10), - stager_config=AzureCognitiveSearchUploadStagerConfig(), - ).run() diff --git a/unstructured/ingest/v2/examples/example_chroma.py b/unstructured/ingest/v2/examples/example_chroma.py deleted file mode 100644 index f5773c4d8..000000000 --- a/unstructured/ingest/v2/examples/example_chroma.py +++ /dev/null @@ -1,53 +0,0 @@ -import random -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.chroma import ( - ChromaAccessConfig, - ChromaConnectionConfig, - ChromaUploaderConfig, - ChromaUploadStagerConfig, -) -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig( - chunking_strategy="by_title", - chunk_include_orig_elements=False, - chunk_max_characters=1500, - chunk_multipage_sections=True, - ), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=ChromaConnectionConfig( - access_config=ChromaAccessConfig(settings=None, headers=None), - host="localhost", - port=8047, - collection_name=f"test-collection-{random.randint(1000,9999)}", - tenant="default_tenant", - database="default_database", - ), - stager_config=ChromaUploadStagerConfig(), - uploader_config=ChromaUploaderConfig(batch_size=10), - ).run() diff --git 
a/unstructured/ingest/v2/examples/example_databricks_volumes.py b/unstructured/ingest/v2/examples/example_databricks_volumes.py deleted file mode 100644 index ecc8b6301..000000000 --- a/unstructured/ingest/v2/examples/example_databricks_volumes.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.databricks_volumes import ( - DatabricksVolumesAccessConfig, - DatabricksVolumesConnectionConfig, - DatabricksVolumesUploaderConfig, -) -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig( - chunking_strategy="by_title", - chunk_include_orig_elements=False, - chunk_max_characters=1500, - chunk_multipage_sections=True, - ), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=DatabricksVolumesConnectionConfig( - access_config=DatabricksVolumesAccessConfig( - username=os.environ["DATABRICKS_USERNAME"], - password=os.environ["DATABRICKS_PASSWORD"], - ), - host=os.environ["DATABRICKS_HOST"], - ), - uploader_config=DatabricksVolumesUploaderConfig( - catalog=os.environ["DATABRICKS_CATALOG"], - volume=os.environ["DATABRICKS_VOLUME"], - volume_path=os.environ["DATABRICKS_VOLUME_PATH"], - ), - ).run() diff --git a/unstructured/ingest/v2/examples/example_elasticsearch.py b/unstructured/ingest/v2/examples/example_elasticsearch.py deleted file mode 100644 index 96cdeef24..000000000 --- a/unstructured/ingest/v2/examples/example_elasticsearch.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.elasticsearch import ( - ElasticsearchAccessConfig, - ElasticsearchConnectionConfig, - ElasticsearchUploaderConfig, - ElasticsearchUploadStagerConfig, -) -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / 
"example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - index_name = "ingest-test-destination" - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig( - input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt" - ), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=ElasticsearchConnectionConfig( - access_config=ElasticsearchAccessConfig(password=os.getenv("ELASTIC_PASSWORD")), - username=os.getenv("ELASTIC_USERNAME"), - hosts=["http://localhost:9200"], - ), - uploader_config=ElasticsearchUploaderConfig(index_name=index_name), - stager_config=ElasticsearchUploadStagerConfig(index_name=index_name), - ).run() diff --git a/unstructured/ingest/v2/examples/example_local.py b/unstructured/ingest/v2/examples/example_local.py deleted file mode 100644 index f72334e40..000000000 --- a/unstructured/ingest/v2/examples/example_local.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, - LocalUploaderConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig( - input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt" - ), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())), - ).run() diff --git a/unstructured/ingest/v2/examples/example_mongodb.py b/unstructured/ingest/v2/examples/example_mongodb.py deleted file mode 100644 index 4ef562ae6..000000000 --- a/unstructured/ingest/v2/examples/example_mongodb.py +++ /dev/null @@ -1,52 +0,0 @@ -import random -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - 
LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.mongodb import ( - MongoDBAccessConfig, - MongoDBConnectionConfig, - MongoDBUploaderConfig, - MongoDBUploadStagerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig( - chunking_strategy="by_title", - chunk_include_orig_elements=False, - chunk_max_characters=1500, - chunk_multipage_sections=True, - ), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=MongoDBConnectionConfig( - access_config=MongoDBAccessConfig(uri=None), - host="localhost", - port=27017, - collection=f"test-collection-{random.randint(1000,9999)}", - database="testDatabase", - ), - stager_config=MongoDBUploadStagerConfig(), - uploader_config=MongoDBUploaderConfig(batch_size=10), - ).run() diff --git a/unstructured/ingest/v2/examples/example_opensearch.py b/unstructured/ingest/v2/examples/example_opensearch.py deleted file mode 100644 index a5f654cfe..000000000 --- a/unstructured/ingest/v2/examples/example_opensearch.py +++ /dev/null @@ -1,51 +0,0 @@ -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.opensearch import ( - OpenSearchAccessConfig, - OpenSearchConnectionConfig, - OpenSearchUploaderConfig, - OpenSearchUploadStagerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig( - input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt" - ), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=OpenSearchConnectionConfig( - hosts="http://localhost:9247", 
- username="admin", - access_config=OpenSearchAccessConfig( - password="admin", - use_ssl=True, - ), - ), - stager_config=OpenSearchUploadStagerConfig(index_name="ingest-test-destination"), - uploader_config=OpenSearchUploaderConfig( - index_name="ingest-test-destination", batch_size_bytes=150 - ), - ).run() diff --git a/unstructured/ingest/v2/examples/example_pinecone.py b/unstructured/ingest/v2/examples/example_pinecone.py deleted file mode 100644 index 236a64df2..000000000 --- a/unstructured/ingest/v2/examples/example_pinecone.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.pinecone import ( - PineconeAccessConfig, - PineconeConnectionConfig, - PineconeUploaderConfig, - PineconeUploadStagerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig( - input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt" - ), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=PineconeConnectionConfig( - # You'll need to set PINECONE_API_KEY environment variable to run this example - access_config=PineconeAccessConfig(api_key=os.getenv("PINECONE_API_KEY")), - index_name=os.getenv( - "PINECONE_INDEX", - default="your index name here. e.g. my-index," - "or define in environment variable PINECONE_INDEX", - ), - environment=os.getenv( - "PINECONE_ENVIRONMENT", - default="your environment name here. e.g. 
us-east-1," - "or define in environment variable PINECONE_ENVIRONMENT", - ), - ), - stager_config=PineconeUploadStagerConfig(), - uploader_config=PineconeUploaderConfig(batch_size=10, num_of_processes=2), - ).run() diff --git a/unstructured/ingest/v2/examples/example_s3.py b/unstructured/ingest/v2/examples/example_s3.py deleted file mode 100644 index 2910f526d..000000000 --- a/unstructured/ingest/v2/examples/example_s3.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.fsspec.s3 import ( - S3ConnectionConfig, - S3DownloaderConfig, - S3IndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.local import ( - LocalUploaderConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"), - downloader_config=S3DownloaderConfig(download_dir=download_path), - source_connection_config=S3ConnectionConfig(anonymous=True), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())), - ).run() diff --git a/unstructured/ingest/v2/examples/example_salesforce.py b/unstructured/ingest/v2/examples/example_salesforce.py deleted file mode 100644 index b3439d5aa..000000000 --- a/unstructured/ingest/v2/examples/example_salesforce.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalUploaderConfig, -) -from unstructured.ingest.v2.processes.connectors.salesforce import ( - SalesforceAccessConfig, - SalesforceConnectionConfig, - SalesforceDownloaderConfig, - SalesforceIndexerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=SalesforceIndexerConfig(categories=["Campaign", "EmailMessage"]), - downloader_config=SalesforceDownloaderConfig(download_dir=download_path), - 
source_connection_config=SalesforceConnectionConfig( - SalesforceAccessConfig( - consumer_key=os.getenv("SALESFORCE_CONSUMER_KEY"), - private_key=os.getenv("SALESFORCE_PRIVATE_KEY"), - ), - username=os.getenv("SALESFORCE_USERNAME"), - ), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())), - ).run() diff --git a/unstructured/ingest/v2/examples/example_sharepoint.py b/unstructured/ingest/v2/examples/example_sharepoint.py deleted file mode 100644 index bc9139efc..000000000 --- a/unstructured/ingest/v2/examples/example_sharepoint.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.connectors.local import ( - LocalUploaderConfig, -) -from unstructured.ingest.v2.processes.connectors.sharepoint import ( - SharepointAccessConfig, - SharepointConnectionConfig, - SharepointDownloaderConfig, - SharepointIndexerConfig, - SharepointPermissionsConfig, -) -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True), - indexer_config=SharepointIndexerConfig(), - downloader_config=SharepointDownloaderConfig(download_dir=download_path), - source_connection_config=SharepointConnectionConfig( - client_id=os.getenv("SHAREPOINT_CLIENT_ID"), - site=os.getenv("SHAREPOINT_SITE"), - access_config=SharepointAccessConfig(client_cred=os.getenv("SHAREPOINT_CRED")), - permissions_config=SharepointPermissionsConfig( - permissions_application_id=os.getenv("SHAREPOINT_PERMISSIONS_APP_ID"), - permissions_client_cred=os.getenv("SHAREPOINT_PERMISSIONS_APP_CRED"), - permissions_tenant=os.getenv("SHAREPOINT_PERMISSIONS_TENANT"), - ), - ), - partitioner_config=PartitionerConfig(strategy="fast"), - # chunker_config=ChunkerConfig(chunking_strategy="by_title"), - # embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())), - ).run() diff --git a/unstructured/ingest/v2/examples/example_singlestore.py b/unstructured/ingest/v2/examples/example_singlestore.py deleted file mode 100644 index 47d4494a9..000000000 --- a/unstructured/ingest/v2/examples/example_singlestore.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.singlestore import ( - SingleStoreAccessConfig, - SingleStoreConnectionConfig, - SingleStoreUploaderConfig, - 
SingleStoreUploadStagerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True), - indexer_config=LocalIndexerConfig( - input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt" - ), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=SingleStoreConnectionConfig( - access_config=SingleStoreAccessConfig(password="password"), - host="localhost", - port=3306, - database="ingest_test", - user="root", - ), - stager_config=SingleStoreUploadStagerConfig(), - uploader_config=SingleStoreUploaderConfig(table_name="elements"), - ).run() diff --git a/unstructured/ingest/v2/examples/example_sql.py b/unstructured/ingest/v2/examples/example_sql.py deleted file mode 100644 index 4ed938192..000000000 --- a/unstructured/ingest/v2/examples/example_sql.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -import sqlite3 -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.sql import ( - DatabaseType, - SimpleSqlConfig, - SQLAccessConfig, - SQLUploaderConfig, - SQLUploadStagerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -SQLITE_DB = "test-sql-db.sqlite" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - - configs = { - "context": ProcessorConfig(work_dir=str(work_dir.resolve())), - "indexer_config": LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"), - "downloader_config": LocalDownloaderConfig(download_dir=download_path), - "source_connection_config": LocalConnectionConfig(), - "partitioner_config": PartitionerConfig(strategy="fast"), - "chunker_config": ChunkerConfig( - chunking_strategy="by_title", - chunk_include_orig_elements=False, - chunk_max_characters=1500, - chunk_multipage_sections=True, - ), - "embedder_config": EmbedderConfig(embedding_provider="langchain-huggingface"), - "stager_config": SQLUploadStagerConfig(), - "uploader_config": SQLUploaderConfig(batch_size=10), - } - - if os.path.exists(SQLITE_DB): - os.remove(SQLITE_DB) - - connection = sqlite3.connect(database=SQLITE_DB) - - query = None - 
script_path = ( - Path(__file__).parent.parent.parent.parent.parent - / Path("scripts/sql-test-helpers/create-sqlite-schema.sql") - ).resolve() - with open(script_path) as f: - query = f.read() - cursor = connection.cursor() - cursor.executescript(query) - connection.close() - - # sqlite test first - Pipeline.from_configs( - destination_connection_config=SimpleSqlConfig( - db_type=DatabaseType.SQLITE, - database=SQLITE_DB, - access_config=SQLAccessConfig(), - ), - **configs, - ).run() - - # now, pg with pgvector - Pipeline.from_configs( - destination_connection_config=SimpleSqlConfig( - db_type=DatabaseType.POSTGRESQL, - database="elements", - host="localhost", - port=5433, - access_config=SQLAccessConfig(username="unstructured", password="test"), - ), - **configs, - ).run() diff --git a/unstructured/ingest/v2/examples/example_weaviate.py b/unstructured/ingest/v2/examples/example_weaviate.py deleted file mode 100644 index 5b9e739c5..000000000 --- a/unstructured/ingest/v2/examples/example_weaviate.py +++ /dev/null @@ -1,44 +0,0 @@ -from pathlib import Path - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.pipeline import Pipeline -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connectors.local import ( - LocalConnectionConfig, - LocalDownloaderConfig, - LocalIndexerConfig, -) -from unstructured.ingest.v2.processes.connectors.weaviate import ( - WeaviateConnectionConfig, - WeaviateUploaderConfig, - WeaviateUploadStagerConfig, -) -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - -base_path = Path(__file__).parent.parent.parent.parent.parent -docs_path = base_path / "example-docs" -work_dir = base_path / "tmp_ingest" -output_path = work_dir / "output" -download_path = work_dir / "download" - -if __name__ == "__main__": - logger.info(f"Writing all content in: {work_dir.resolve()}") - Pipeline.from_configs( - context=ProcessorConfig(work_dir=str(work_dir.resolve())), - indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"), - downloader_config=LocalDownloaderConfig(download_dir=download_path), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig(strategy="fast"), - chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"), - destination_connection_config=WeaviateConnectionConfig( - host_url="http://localhost:8080", - class_name="elements", - access_config=None, - anonymous=True, - ), - stager_config=WeaviateUploadStagerConfig(), - uploader_config=WeaviateUploaderConfig(batch_size=10), - ).run() diff --git a/unstructured/ingest/v2/interfaces/__init__.py b/unstructured/ingest/v2/interfaces/__init__.py deleted file mode 100644 index 5aa6240ab..000000000 --- a/unstructured/ingest/v2/interfaces/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from .connector import AccessConfig, BaseConnector, ConnectionConfig -from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses -from .file_data import FileData, SourceIdentifiers -from .indexer import Indexer, IndexerConfig -from .process import BaseProcess -from .processor import ProcessorConfig -from .upload_stager import UploadStager, UploadStagerConfig -from .uploader import UploadContent, Uploader, UploaderConfig - -__all__ = 
[ - "DownloadResponse", - "download_responses", - "Downloader", - "DownloaderConfig", - "FileData", - "Indexer", - "IndexerConfig", - "BaseProcess", - "ProcessorConfig", - "UploadStager", - "UploadStagerConfig", - "Uploader", - "UploaderConfig", - "SourceIdentifiers", - "UploadContent", - "AccessConfig", - "ConnectionConfig", - "BaseConnector", -] diff --git a/unstructured/ingest/v2/interfaces/connector.py b/unstructured/ingest/v2/interfaces/connector.py deleted file mode 100644 index dc700fc94..000000000 --- a/unstructured/ingest/v2/interfaces/connector.py +++ /dev/null @@ -1,32 +0,0 @@ -from abc import ABC -from dataclasses import dataclass -from typing import Any, TypeVar - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin - - -@dataclass -class AccessConfig(EnhancedDataClassJsonMixin): - """Meant to designate holding any sensitive information associated with other configs - and also for access specific configs.""" - - -AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig) - - -@dataclass -class ConnectionConfig(EnhancedDataClassJsonMixin): - access_config: AccessConfigT - - def get_access_config(self) -> dict[str, Any]: - if not self.access_config: - return {} - return self.access_config.to_dict(apply_name_overload=False) - - -ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig) - - -@dataclass -class BaseConnector(ABC): - connection_config: ConnectionConfigT diff --git a/unstructured/ingest/v2/interfaces/downloader.py b/unstructured/ingest/v2/interfaces/downloader.py deleted file mode 100644 index 3a493b017..000000000 --- a/unstructured/ingest/v2/interfaces/downloader.py +++ /dev/null @@ -1,79 +0,0 @@ -import os -from abc import ABC, abstractmethod -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Optional, TypedDict, TypeVar, Union - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.v2.interfaces.connector import BaseConnector -from unstructured.ingest.v2.interfaces.file_data import FileData -from unstructured.ingest.v2.interfaces.process import BaseProcess - - -@dataclass -class DownloaderConfig(EnhancedDataClassJsonMixin): - download_dir: Optional[Path] = None - - -DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig) - - -class DownloadResponse(TypedDict): - file_data: FileData - path: Path - - -download_responses = Union[list[DownloadResponse], DownloadResponse] - - -class Downloader(BaseProcess, BaseConnector, ABC): - connector_type: str - download_config: DownloaderConfigT - - @staticmethod - def is_float(value: str): - try: - float(value) - return True - except ValueError: - return False - - def generate_download_response( - self, file_data: FileData, download_path: Path - ) -> DownloadResponse: - if ( - file_data.metadata.date_modified - and self.is_float(file_data.metadata.date_modified) - and file_data.metadata.date_created - and self.is_float(file_data.metadata.date_created) - ): - date_modified = float(file_data.metadata.date_modified) - date_created = float(file_data.metadata.date_created) - os.utime(download_path, times=(date_created, date_modified)) - return DownloadResponse(file_data=file_data, path=download_path) - - @property - def download_dir(self) -> Path: - if self.download_config.download_dir is None: - self.download_config.download_dir = ( - Path.home() - / ".cache" - / "unstructured" - / "ingest" - / "download" - / self.connector_type - ).resolve() - return self.download_config.download_dir - - 
-    def is_async(self) -> bool:
-        return True
-
-    def get_download_path(self, file_data: FileData) -> Optional[Path]:
-        return None
-
-    @abstractmethod
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
-        pass
-
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
-        return self.run(file_data=file_data, **kwargs)
diff --git a/unstructured/ingest/v2/interfaces/file_data.py b/unstructured/ingest/v2/interfaces/file_data.py
deleted file mode 100644
index 9cccbaff0..000000000
--- a/unstructured/ingest/v2/interfaces/file_data.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import json
-from dataclasses import dataclass, field
-from enum import Enum
-from pathlib import Path
-from typing import Any, Optional
-
-from dataclasses_json import DataClassJsonMixin
-
-from unstructured.documents.elements import DataSourceMetadata
-
-
-class IndexDocType(str, Enum):
-    BATCH = "batch"
-    FILE = "file"
-
-
-@dataclass
-class SourceIdentifiers:
-    filename: str
-    fullpath: str
-    rel_path: Optional[str] = None
-
-    @property
-    def filename_stem(self) -> str:
-        return Path(self.filename).stem
-
-    @property
-    def relative_path(self) -> str:
-        return self.rel_path or self.fullpath
-
-
-@dataclass
-class FileData(DataClassJsonMixin):
-    identifier: str
-    connector_type: str
-    source_identifiers: Optional[SourceIdentifiers] = None
-    doc_type: IndexDocType = field(default=IndexDocType.FILE)
-    metadata: DataSourceMetadata = field(default_factory=DataSourceMetadata)
-    additional_metadata: dict[str, Any] = field(default_factory=dict)
-    reprocess: bool = False
-
-    @classmethod
-    def from_file(cls, path: str) -> "FileData":
-        path = Path(path).resolve()
-        if not path.exists() or not path.is_file():
-            raise ValueError(f"file path not valid: {path}")
-        with open(str(path.resolve()), "rb") as f:
-            file_data_dict = json.load(f)
-        file_data = FileData.from_dict(file_data_dict)
-        return file_data
-
-    def to_file(self, path: str) -> None:
-        path = Path(path).resolve()
-        path.parent.mkdir(parents=True, exist_ok=True)
-        with open(str(path.resolve()), "w") as f:
-            json.dump(self.to_dict(), f, indent=2)
diff --git a/unstructured/ingest/v2/interfaces/indexer.py b/unstructured/ingest/v2/interfaces/indexer.py
deleted file mode 100644
index f3f2490ef..000000000
--- a/unstructured/ingest/v2/interfaces/indexer.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Any, Generator, Optional, TypeVar
-
-from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
-from unstructured.ingest.v2.interfaces.connector import BaseConnector
-from unstructured.ingest.v2.interfaces.file_data import FileData
-from unstructured.ingest.v2.interfaces.process import BaseProcess
-
-
-@dataclass
-class IndexerConfig(EnhancedDataClassJsonMixin):
-    pass
-
-
-IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig)
-
-
-class Indexer(BaseProcess, BaseConnector, ABC):
-    connector_type: str
-    index_config: Optional[IndexerConfigT] = None
-
-    def is_async(self) -> bool:
-        return False
-
-    @abstractmethod
-    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
-        pass
diff --git a/unstructured/ingest/v2/interfaces/process.py b/unstructured/ingest/v2/interfaces/process.py
deleted file mode 100644
index 028356111..000000000
--- a/unstructured/ingest/v2/interfaces/process.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import Any
-
-
-@dataclass
-class BaseProcess(ABC):
-    def is_async(self) -> bool:
-        return False
-
-    @abstractmethod
-    def run(self, **kwargs: Any) -> Any:
-        pass
-
-    async def run_async(self, **kwargs: Any) -> Any:
-        return self.run(**kwargs)
-
-    def check_connection(self):
-        # If the process requires external connections, run a quick check
-        pass
diff --git a/unstructured/ingest/v2/interfaces/processor.py b/unstructured/ingest/v2/interfaces/processor.py
deleted file mode 100644
index 96390e53f..000000000
--- a/unstructured/ingest/v2/interfaces/processor.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-from asyncio import Semaphore
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Optional
-
-from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
-
-DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
-
-
-@dataclass
-class ProcessorConfig(EnhancedDataClassJsonMixin):
-    reprocess: bool = False
-    verbose: bool = False
-    tqdm: bool = False
-    work_dir: str = field(default_factory=lambda: DEFAULT_WORK_DIR)
-    num_processes: int = 2
-    max_connections: Optional[int] = None
-    raise_on_error: bool = False
-    disable_parallelism: bool = field(
-        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
-    )
-    preserve_downloads: bool = False
-    download_only: bool = False
-    max_docs: Optional[int] = None
-    re_download: bool = False
-    uncompress: bool = False
-
-    # Used to keep track of state in pipeline
-    status: dict[str, Any] = field(default_factory=dict)
-    semaphore: Optional[Semaphore] = field(init=False, default=None)
-
-    def __post_init__(self):
-        if self.max_connections is not None:
-            self.semaphore = Semaphore(self.max_connections)
-
-    @property
-    def mp_supported(self) -> bool:
-        return not self.disable_parallelism and self.num_processes > 1
-
-    @property
-    def async_supported(self) -> bool:
-        if self.disable_parallelism:
-            return False
-        if self.max_connections is not None and isinstance(self.max_connections, int):
-            return self.max_connections > 1
-        return True
diff --git a/unstructured/ingest/v2/interfaces/upload_stager.py b/unstructured/ingest/v2/interfaces/upload_stager.py
deleted file mode 100644
index 2aeef2e5d..000000000
--- a/unstructured/ingest/v2/interfaces/upload_stager.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, TypeVar
-
-from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
-from unstructured.ingest.v2.interfaces.file_data import FileData
-from unstructured.ingest.v2.interfaces.process import BaseProcess
-
-
-@dataclass
-class UploadStagerConfig(EnhancedDataClassJsonMixin):
-    pass
-
-
-UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
-
-
-@dataclass
-class UploadStager(BaseProcess, ABC):
-    upload_stager_config: UploadStagerConfigT
-
-    @abstractmethod
-    def run(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any
-    ) -> Path:
-        pass
-
-    async def run_async(
-        self,
-        elements_filepath: Path,
-        file_data: FileData,
-        output_dir: Path,
-        output_filename: str,
-        **kwargs: Any
-    ) -> Path:
-        return self.run(
-            elements_filepath=elements_filepath,
-            output_dir=output_dir,
-            output_filename=output_filename,
-            file_data=file_data,
-            **kwargs
-        )
diff --git a/unstructured/ingest/v2/interfaces/uploader.py b/unstructured/ingest/v2/interfaces/uploader.py
deleted file mode 100644 index b8c282983..000000000 --- a/unstructured/ingest/v2/interfaces/uploader.py +++ /dev/null @@ -1,39 +0,0 @@ -from abc import ABC, abstractmethod -from dataclasses import dataclass -from pathlib import Path -from typing import Any, TypeVar - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.v2.interfaces.connector import BaseConnector -from unstructured.ingest.v2.interfaces.file_data import FileData -from unstructured.ingest.v2.interfaces.process import BaseProcess - - -@dataclass -class UploaderConfig(EnhancedDataClassJsonMixin): - pass - - -UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig) - - -@dataclass -class UploadContent: - path: Path - file_data: FileData - - -@dataclass -class Uploader(BaseProcess, BaseConnector, ABC): - upload_config: UploaderConfigT - connector_type: str - - def is_async(self) -> bool: - return False - - @abstractmethod - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - pass - - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - return self.run(contents=[UploadContent(path=path, file_data=file_data)], **kwargs) diff --git a/unstructured/ingest/v2/logger.py b/unstructured/ingest/v2/logger.py deleted file mode 100644 index 34c5c1df3..000000000 --- a/unstructured/ingest/v2/logger.py +++ /dev/null @@ -1,123 +0,0 @@ -import ast -import json -import os -from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger -from typing import Any, Callable - -log_level = os.getenv("INGEST_LOG_LEVEL", "INFO") -LOGGER_NAME = "unstructured.ingest.v2" - - -def default_is_data_sensitive(k: str, v: Any) -> bool: - sensitive_fields = [ - "account_name", - "client_id", - ] - sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"] - return ( - v - and any([s in k.lower() for s in sensitive_triggers]) # noqa: C419 - or k.lower() in sensitive_fields - ) - - -def hide_sensitive_fields( - data: dict, is_sensitive_fn: Callable[[str, Any], bool] = default_is_data_sensitive -) -> dict: - """ - Will recursively look through every k, v pair in this dict and any nested ones and run - is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if - any string value can be parsed as valid json and process that dict as well and replace - the original string with the json.dumps() version of the redacted dict. - """ - new_data = data.copy() - for k, v in new_data.items(): - if is_sensitive_fn(k, v): - new_data[k] = "*******" - if isinstance(v, dict): - new_data[k] = hide_sensitive_fields(v) - if isinstance(v, str): - # Need to take into account strings generated via json.dumps() or simply printing a dict - try: - json_data = json.loads(v) - if isinstance(json_data, dict): - updated_data = hide_sensitive_fields(json_data) - new_data[k] = json.dumps(updated_data) - except json.JSONDecodeError: - pass - - return new_data - - -def redact_jsons(s: str) -> str: - """ - Takes in a generic string and pulls out all valid json content. Leverages - hide_sensitive_fields() to redact any sensitive information and replaces the - original json with the new redacted format. There can be any number of valid - jsons in a generic string and this will work. Having extra '{' without a - closing '}' will cause this to break though. i.e '{ text, {"a": 3}'. 
-
-    """
-    chars = list(s)
-    if "{" not in chars:
-        return s
-    i = 0
-    jsons = []
-    i = 0
-    while i < len(chars):
-        char = chars[i]
-        if char == "{":
-            stack = [char]
-            current = [char]
-            while len(stack) != 0 and i < len(chars):
-                i += 1
-                char = chars[i]
-                current.append(char)
-                if char == "{":
-                    stack.append(char)
-                if char == "}":
-                    stack.pop(-1)
-            jsons.append("".join(current))
-            continue
-        i += 1
-    for j in jsons:
-        try:
-            formatted_j = json.dumps(json.loads(j))
-        except json.JSONDecodeError:
-            lit = ast.literal_eval(j)
-            formatted_j = json.dumps(lit)
-        hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j)))
-        s = s.replace(j, hidden_j)
-    return s
-
-
-class SensitiveFormatter(Formatter):
-    def format(self, record):
-        s = super().format(record=record)
-        return redact_jsons(s)
-
-
-def remove_root_handlers(logger: Logger) -> None:
-    # NOTE(robinson) - in some environments such as Google Colab, there is a root handler
-    # that does not mask secrets, meaning sensitive info such as api keys appear in logs.
-    # Removing these when they exist prevents this behavior
-    if logger.root.hasHandlers():
-        for handler in logger.root.handlers:
-            logger.root.removeHandler(handler)
-
-
-def make_default_logger(level: int) -> Logger:
-    """Return a custom logger."""
-    logger = getLogger(LOGGER_NAME)
-    handler = StreamHandler()
-    handler.name = "ingest_log_handler"
-    formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
-    handler.setFormatter(formatter)
-    if handler.name not in [h.name for h in logger.handlers]:
-        logger.addHandler(handler)
-    logger.setLevel(level)
-    remove_root_handlers(logger)
-    return logger
-
-
-logger = make_default_logger(level=getLevelName(log_level.upper()))
diff --git a/unstructured/ingest/v2/main.py b/unstructured/ingest/v2/main.py
deleted file mode 100644
index f1b697717..000000000
--- a/unstructured/ingest/v2/main.py
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env python3
-from unstructured.ingest.v2.cli.cli import get_cmd
-
-
-def main():
-    ingest_cmd = get_cmd()
-    ingest_cmd()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/unstructured/ingest/v2/pipeline/__init__.py b/unstructured/ingest/v2/pipeline/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/unstructured/ingest/v2/pipeline/interfaces.py b/unstructured/ingest/v2/pipeline/interfaces.py
deleted file mode 100644
index ae6dd95d7..000000000
--- a/unstructured/ingest/v2/pipeline/interfaces.py
+++ /dev/null
@@ -1,169 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-import logging
-import multiprocessing as mp
-from abc import ABC
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import dataclass
-from functools import wraps
-from pathlib import Path
-from time import time
-from typing import Any, Awaitable, Callable, Optional, TypeVar
-
-from tqdm import tqdm
-from tqdm.asyncio import tqdm as tqdm_asyncio
-
-from unstructured.ingest.v2.interfaces import BaseProcess, ProcessorConfig
-from unstructured.ingest.v2.logger import logger, make_default_logger
-
-BaseProcessT = TypeVar("BaseProcessT", bound=BaseProcess)
-iterable_input = list[dict[str, Any]]
-
-
-def timed(func):
-    @wraps(func)
-    def time_it(self, *args, **kwargs):
-        start = time()
-        try:
-            return func(self, *args, **kwargs)
-        finally:
-            if func.__name__ == "__call__":
-                reported_name = f"{self.__class__.__name__} [cls]"
-            else:
-                reported_name = func.__name__
-            logger.info(f"{reported_name} took {time() - start} seconds")
-
-    return time_it
-
-
-@dataclass -class PipelineStep(ABC): - process: BaseProcessT - context: ProcessorConfig - identifier: str - - def __str__(self): - return self.identifier - - def process_serially(self, iterable: iterable_input) -> Any: - logger.info("processing content serially") - if iterable: - if len(iterable) == 1: - return [self.run(**iterable[0])] - if self.context.tqdm: - return [self.run(**it) for it in tqdm(iterable, desc=self.identifier)] - return [self.run(**it) for it in iterable] - return [self.run()] - - async def _process_async(self, iterable: iterable_input) -> Any: - if iterable: - if len(iterable) == 1: - return [await self.run_async(**iterable[0])] - if self.context.tqdm: - return await tqdm_asyncio.gather( - *[self.run_async(**i) for i in iterable], desc=self.identifier - ) - return await asyncio.gather(*[self.run_async(**i) for i in iterable]) - return [await self.run_async()] - - def process_async(self, iterable: iterable_input) -> Any: - logger.info("processing content async") - return self.asyncio_run(fn=self._process_async, iterable=iterable) - - def asyncio_run( - self, fn: Callable[[Any, Any], Awaitable[Any]], *args: Any, **kwargs: Any - ) -> Any: - current_loop = asyncio._get_running_loop() - if current_loop is None: - return asyncio.run(fn(*args, **kwargs)) - with ThreadPoolExecutor(thread_name_prefix="asyncio") as thread_pool: - logger.warning( - f"async code being run in dedicated thread pool " - f"to not conflict with existing event loop: {current_loop}" - ) - - def wrapped(): - return asyncio.run(fn(*args, **kwargs)) - - future = thread_pool.submit(wrapped) - return future.result() - - def process_multiprocess(self, iterable: iterable_input) -> Any: - logger.info("processing content across processes") - - if iterable: - if len(iterable) == 1: - return [self.process_serially(iterable)] - if self.context.num_processes == 1: - return self.process_serially(iterable) - with mp.Pool( - processes=self.context.num_processes, - initializer=self._init_logger, - initargs=(logging.DEBUG if self.context.verbose else logging.INFO,), - ) as pool: - if self.context.tqdm: - return list( - tqdm( - pool.imap_unordered(func=self._wrap_mp, iterable=iterable), - total=len(iterable), - desc=self.identifier, - ) - ) - return pool.map(self._wrap_mp, iterable) - return [self.run()] - - def _wrap_mp(self, input_kwargs: dict) -> Any: - # Allow mapping of kwargs via multiprocessing map() - return self.run(**input_kwargs) - - def _init_logger(self, log_level: int): - # Init logger for each spawned process when using multiprocessing pool - make_default_logger(level=log_level) - - @timed - def __call__(self, iterable: Optional[iterable_input] = None) -> Any: - iterable = iterable or [] - if iterable: - logger.info( - f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore - ) - if self.context.async_supported and self.process.is_async(): - return self.process_async(iterable=iterable) - if self.context.mp_supported: - return self.process_multiprocess(iterable=iterable) - return self.process_serially(iterable=iterable) - - def _run(self, fn: Callable, **kwargs: Any) -> Optional[Any]: - return self.asyncio_run(fn=self.run_async, _fn=fn, **kwargs) - - async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]: - raise NotImplementedError - - def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]: - try: - fn = _fn or self.process.run - return self._run(fn=fn, **kwargs) - except Exception as e: - logger.error(f"Exception raised while 
running {self.identifier}", exc_info=e) - if "file_data_path" in kwargs: - self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)} - if self.context.raise_on_error: - raise e - return None - - async def run_async(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]: - try: - fn = _fn or self.process.run_async - return await self._run_async(fn=fn, **kwargs) - except Exception as e: - logger.error(f"Exception raised while running {self.identifier}", exc_info=e) - if "file_data_path" in kwargs: - self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)} - if self.context.raise_on_error: - raise e - return None - - @property - def cache_dir(self) -> Path: - return Path(self.context.work_dir) / self.identifier diff --git a/unstructured/ingest/v2/pipeline/pipeline.py b/unstructured/ingest/v2/pipeline/pipeline.py deleted file mode 100644 index 93c77dfa0..000000000 --- a/unstructured/ingest/v2/pipeline/pipeline.py +++ /dev/null @@ -1,286 +0,0 @@ -from __future__ import annotations - -import logging -import multiprocessing as mp -from dataclasses import InitVar, dataclass, field -from time import time -from typing import Any, Optional, Union - -from unstructured.ingest.v2.interfaces import ProcessorConfig -from unstructured.ingest.v2.logger import logger, make_default_logger -from unstructured.ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep -from unstructured.ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep -from unstructured.ingest.v2.pipeline.steps.embed import Embedder, EmbedStep -from unstructured.ingest.v2.pipeline.steps.index import IndexerT, IndexStep -from unstructured.ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep -from unstructured.ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep -from unstructured.ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep -from unstructured.ingest.v2.pipeline.steps.upload import Uploader, UploadStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict -from unstructured.ingest.v2.processes.chunker import ChunkerConfig -from unstructured.ingest.v2.processes.connector_registry import ( - ConnectionConfig, - DownloaderConfigT, - IndexerConfigT, - UploaderConfigT, - UploadStagerConfigT, - destination_registry, - source_registry, -) -from unstructured.ingest.v2.processes.connectors.local import LocalUploader -from unstructured.ingest.v2.processes.embedder import EmbedderConfig -from unstructured.ingest.v2.processes.partitioner import PartitionerConfig - - -class PipelineError(Exception): - pass - - -@dataclass -class Pipeline: - context: ProcessorConfig - indexer: InitVar[IndexerT] - indexer_step: IndexStep = field(init=False) - downloader: InitVar[DownloaderT] - downloader_step: DownloadStep = field(init=False) - partitioner: InitVar[Partitioner] - partitioner_step: PartitionStep = field(init=False) - chunker: InitVar[Optional[Chunker]] = None - chunker_step: ChunkStep | None = field(init=False, default=None) - embedder: InitVar[Optional[Embedder]] = None - embedder_step: EmbedStep | None = field(init=False, default=None) - stager: InitVar[Optional[UploadStager]] = None - stager_step: UploadStageStep | None = field(init=False, default=None) - uploader: InitVar[Uploader] = field(default=LocalUploader()) - uploader_step: UploadStep | None = field(init=False, default=None) - uncompress_step: UncompressStep | None = field(init=False, default=None) - - def __post_init__( - self, - indexer: IndexerT, - downloader: 
DownloaderT, - partitioner: Partitioner, - chunker: Chunker | None = None, - embedder: Embedder | None = None, - stager: UploadStager | None = None, - uploader: Uploader | None = None, - ): - make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO) - self.indexer_step = IndexStep(process=indexer, context=self.context) - self.downloader_step = DownloadStep(process=downloader, context=self.context) - self.partitioner_step = PartitionStep(process=partitioner, context=self.context) - self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None - - self.embedder_step = EmbedStep(process=embedder, context=self.context) if embedder else None - # TODO: support initialize() call from each step process - # Potential long call to download embedder models, run before any fanout: - if embedder and embedder.config: - embedder.config.get_embedder().initialize() - - self.stager_step = UploadStageStep(process=stager, context=self.context) if stager else None - self.uploader_step = UploadStep(process=uploader, context=self.context) - if self.context.uncompress: - process = Uncompressor() - self.uncompress_step = UncompressStep(process=process, context=self.context) - - self.check_destination_connector() - - def check_destination_connector(self): - # Make sure that if the set destination connector expects a stager, one is also set - if not self.uploader_step: - return - uploader_connector_type = self.uploader_step.process.connector_type - registry_entry = destination_registry[uploader_connector_type] - if registry_entry.upload_stager and self.stager_step is None: - raise ValueError( - f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} " - f"expects a stager of type {registry_entry.upload_stager.__name__} " - f"but one was not set" - ) - - def cleanup(self): - pass - - def log_statuses(self): - if status := self.context.status: - logger.error(f"{len(status)} failed documents:") - for k, v in status.items(): - for kk, vv in v.items(): - logger.error(f"{k}: [{kk}] {vv}") - - def run(self): - try: - start_time = time() - self._run() - logger.info(f"Finished ingest process in {time() - start_time}s") - finally: - self.log_statuses() - self.cleanup() - if self.context.status: - raise PipelineError("Pipeline did not run successfully") - - def clean_results(self, results: list[Union[Any, list[Any]]] | None) -> list[Any] | None: - if not results: - return None - results = [r for r in results if r] - flat = [] - for r in results: - if isinstance(r, list): - flat.extend(r) - else: - flat.append(r) - final = [f for f in flat if f] - return final or None - - def _run(self): - logger.info( - f"Running local pipeline: {self} with configs: " - f"{sterilize_dict(self.context.to_dict(redact_sensitive=True))}" - ) - if self.context.mp_supported: - manager = mp.Manager() - self.context.status = manager.dict() - else: - self.context.status = {} - - # Index into data source - indices = self.indexer_step.run() - indices_inputs = [{"file_data_path": i} for i in indices] - if not indices_inputs: - return - - # Download associated content to local file system - downloaded_data = self.downloader_step(indices_inputs) - downloaded_data = self.clean_results(results=downloaded_data) - if not downloaded_data: - return - - # Run uncompress if available - if self.uncompress_step: - downloaded_data = self.uncompress_step(downloaded_data) - # Flatten list of lists - downloaded_data = self.clean_results(results=downloaded_data) - - if not downloaded_data: - return 
- - # Partition content - elements = self.partitioner_step(downloaded_data) - elements = self.clean_results(results=elements) - if not elements: - return - - # Run element specific modifiers - for step in [self.chunker_step, self.embedder_step, self.stager_step]: - elements = step(elements) if step else elements - elements = self.clean_results(results=elements) - if not elements: - return - - # Upload the final result - self.uploader_step(iterable=elements) - - def __str__(self): - s = [str(self.indexer_step), str(self.downloader_step)] - if uncompress_step := self.uncompress_step: - s.append(str(uncompress_step)) - s.append(str(self.partitioner_step)) - if chunker_step := self.chunker_step: - s.append(str(chunker_step)) - if embedder_step := self.embedder_step: - s.append(str(embedder_step)) - if stager_step := self.stager_step: - s.append(str(stager_step)) - s.append(str(self.uploader_step)) - return " -> ".join(s) - - @classmethod - def from_configs( - cls, - context: ProcessorConfig, - indexer_config: IndexerConfigT, - downloader_config: DownloaderConfigT, - source_connection_config: ConnectionConfig, - partitioner_config: PartitionerConfig, - chunker_config: Optional[ChunkerConfig] = None, - embedder_config: Optional[EmbedderConfig] = None, - destination_connection_config: Optional[ConnectionConfig] = None, - stager_config: Optional[UploadStagerConfigT] = None, - uploader_config: Optional[UploaderConfigT] = None, - ) -> "Pipeline": - # Get registry key based on indexer config - source_entry = { - k: v - for k, v in source_registry.items() - if isinstance(indexer_config, v.indexer_config) - and isinstance(downloader_config, v.downloader_config) - and isinstance(source_connection_config, v.connection_config) - } - if len(source_entry) > 1: - raise ValueError( - f"multiple entries found matching provided indexer, " - f"downloader and connection configs: {source_entry}" - ) - if len(source_entry) != 1: - raise ValueError( - "no entry found in source registry with matching indexer, " - "downloader and connection configs" - ) - source = list(source_entry.values())[0] - pipeline_kwargs = { - "context": context, - "indexer": source.indexer( - index_config=indexer_config, connection_config=source_connection_config - ), - "downloader": source.downloader( - download_config=downloader_config, connection_config=source_connection_config - ), - "partitioner": Partitioner(config=partitioner_config), - } - if chunker_config: - pipeline_kwargs["chunker"] = Chunker(config=chunker_config) - if embedder_config: - pipeline_kwargs["embedder"] = Embedder(config=embedder_config) - if not uploader_config: - return Pipeline(**pipeline_kwargs) - - destination_entry = { - k: v - for k, v in destination_registry.items() - if isinstance(uploader_config, v.uploader_config) - } - if destination_connection_config: - destination_entry = { - k: v - for k, v in destination_entry.items() - if isinstance(destination_connection_config, v.connection_config) - } - if stager_config: - destination_entry = { - k: v - for k, v in destination_entry.items() - if isinstance(stager_config, v.upload_stager_config) - } - - if len(destination_entry) > 1: - raise ValueError( - f"multiple entries found matching provided uploader, " - f"stager and connection configs: {destination_entry}" - ) - if len(destination_entry) != 1: - raise ValueError( - "no entry found in source registry with matching uploader, " - "stager and connection configs" - ) - - destination = list(destination_entry.values())[0] - if stager_config: - 
pipeline_kwargs["stager"] = destination.upload_stager( - upload_stager_config=stager_config - ) - if uploader_config: - uploader_kwargs = {"upload_config": uploader_config} - if destination_connection_config: - uploader_kwargs["connection_config"] = destination_connection_config - pipeline_kwargs["uploader"] = destination.uploader(**uploader_kwargs) - return cls(**pipeline_kwargs) diff --git a/unstructured/ingest/v2/pipeline/steps/__init__.py b/unstructured/ingest/v2/pipeline/steps/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/unstructured/ingest/v2/pipeline/steps/chunk.py b/unstructured/ingest/v2/pipeline/steps/chunk.py deleted file mode 100644 index b2e5d14c2..000000000 --- a/unstructured/ingest/v2/pipeline/steps/chunk.py +++ /dev/null @@ -1,84 +0,0 @@ -import asyncio -import hashlib -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Optional, TypedDict - -from unstructured.ingest.v2.interfaces import FileData -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict -from unstructured.ingest.v2.processes.chunker import Chunker -from unstructured.staging.base import elements_to_dicts - -STEP_ID = "chunk" - - -class ChunkStepResponse(TypedDict): - file_data_path: str - path: str - - -@dataclass -class ChunkStep(PipelineStep): - process: Chunker - identifier: str = STEP_ID - - def __str__(self): - return f"{self.identifier} ({self.process.config.chunking_strategy})" - - def __post_init__(self): - config = ( - sterilize_dict(self.process.config.to_dict(redact_sensitive=True)) - if self.process.config - else None - ) - logger.info(f"Created {self.identifier} with configs: {config}") - - def should_chunk(self, filepath: Path, file_data: FileData) -> bool: - if self.context.reprocess or file_data.reprocess: - return True - return not filepath.exists() - - def get_output_filepath(self, filename: Path) -> Path: - hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" - filepath = (self.cache_dir / hashed_output_file).resolve() - filepath.parent.mkdir(parents=True, exist_ok=True) - return filepath - - def _save_output(self, output_filepath: str, chunked_content: list[dict]): - with open(str(output_filepath), "w") as f: - logger.debug(f"Writing chunker output to: {output_filepath}") - json.dump(chunked_content, f, indent=2) - - async def _run_async( - self, fn: Callable, path: str, file_data_path: str, **kwargs - ) -> ChunkStepResponse: - path = Path(path) - file_data = FileData.from_file(path=file_data_path) - output_filepath = self.get_output_filepath(filename=path) - if not self.should_chunk(filepath=output_filepath, file_data=file_data): - logger.debug(f"Skipping chunking, output already exists: {output_filepath}") - return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath)) - fn_kwargs = {"elements_filepath": path} - if not asyncio.iscoroutinefunction(fn): - chunked_content_raw = fn(**fn_kwargs) - elif semaphore := self.context.semaphore: - async with semaphore: - chunked_content_raw = await fn(**fn_kwargs) - else: - chunked_content_raw = await fn(**fn_kwargs) - self._save_output( - output_filepath=str(output_filepath), - chunked_content=elements_to_dicts(chunked_content_raw), - ) - return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath)) - - def get_hash(self, extras: Optional[list[str]]) -> str: - hashable_string = 
json.dumps( - self.process.config.to_dict(), sort_keys=True, ensure_ascii=True - ) - if extras: - hashable_string += "".join(extras) - return hashlib.sha256(hashable_string.encode()).hexdigest()[:12] diff --git a/unstructured/ingest/v2/pipeline/steps/download.py b/unstructured/ingest/v2/pipeline/steps/download.py deleted file mode 100644 index 84d00e35d..000000000 --- a/unstructured/ingest/v2/pipeline/steps/download.py +++ /dev/null @@ -1,124 +0,0 @@ -import asyncio -import hashlib -import json -from dataclasses import dataclass -from typing import Callable, Optional, TypedDict, TypeVar - -from unstructured.ingest.v2.interfaces import FileData, download_responses -from unstructured.ingest.v2.interfaces.downloader import Downloader -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict - -DownloaderT = TypeVar("DownloaderT", bound=Downloader) - -STEP_ID = "download" - - -class DownloadStepResponse(TypedDict): - file_data_path: str - path: str - - -@dataclass -class DownloadStep(PipelineStep): - process: DownloaderT - identifier: str = STEP_ID - - def __str__(self): - return f"{self.identifier} ({self.process.__class__.__name__})" - - def __post_init__(self): - config = ( - sterilize_dict(self.process.download_config.to_dict(redact_sensitive=True)) - if self.process.download_config - else None - ) - connection_config = ( - sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True)) - if self.process.connection_config - else None - ) - logger.info( - f"Created {self.identifier} with configs: {config}, " - f"connection configs: {connection_config}" - ) - - @staticmethod - def is_float(value: str): - try: - float(value) - return True - except ValueError: - return False - - def should_download(self, file_data: FileData, file_data_path: str) -> bool: - if self.context.re_download: - return True - download_path = self.process.get_download_path(file_data=file_data) - if not download_path or not download_path.exists(): - return True - if ( - download_path.is_file() - and file_data.metadata.date_modified - and self.is_float(file_data.metadata.date_modified) - and download_path.stat().st_mtime > float(file_data.metadata.date_modified) - ): - # Also update file data to mark this to reprocess since this won't change the filename - file_data.reprocess = True - file_data.to_file(path=file_data_path) - return True - return False - - async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]: - file_data = FileData.from_file(path=file_data_path) - download_path = self.process.get_download_path(file_data=file_data) - if not self.should_download(file_data=file_data, file_data_path=file_data_path): - logger.debug(f"Skipping download, file already exists locally: {download_path}") - return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))] - fn_kwargs = {"file_data": file_data} - if not asyncio.iscoroutinefunction(fn): - download_results = fn(**fn_kwargs) - elif semaphore := self.context.semaphore: - async with semaphore: - download_results = await fn(**fn_kwargs) - else: - download_results = await fn(**fn_kwargs) - return self.create_step_results( - current_file_data_path=file_data_path, download_results=download_results - ) - - def create_step_results( - self, current_file_data_path: str, download_results: download_responses - ) -> list[DownloadStepResponse]: - if not isinstance(download_results, list): 
- return [ - DownloadStepResponse( - file_data_path=current_file_data_path, path=str(download_results["path"]) - ) - ] - # Supplemental results generated as part of the download process - download_step_results = [] - for res in download_results: - file_data_path = self.persist_new_file_data(file_data=res["file_data"]) - download_step_results.append( - DownloadStepResponse(file_data_path=file_data_path, path=res["path"]) - ) - return download_step_results - - def persist_new_file_data(self, file_data: FileData) -> str: - record_hash = self.get_hash(extras=[file_data.identifier]) - filename = f"{record_hash}.json" - filepath = (self.cache_dir / filename).resolve() - filepath.parent.mkdir(parents=True, exist_ok=True) - with open(str(filepath), "w") as f: - json.dump(file_data.to_dict(), f, indent=2) - return str(filepath) - - def get_hash(self, extras: Optional[list[str]]) -> str: - hashable_string = json.dumps( - sterilize_dict(self.process.download_config.to_dict()), sort_keys=True - ) - if extras: - hashable_string += "".join(extras) - return hashlib.sha256(hashable_string.encode()).hexdigest()[:12] diff --git a/unstructured/ingest/v2/pipeline/steps/embed.py b/unstructured/ingest/v2/pipeline/steps/embed.py deleted file mode 100644 index 94103951c..000000000 --- a/unstructured/ingest/v2/pipeline/steps/embed.py +++ /dev/null @@ -1,83 +0,0 @@ -import asyncio -import hashlib -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Optional, TypedDict - -from unstructured.ingest.v2.interfaces import FileData -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict -from unstructured.ingest.v2.processes.embedder import Embedder -from unstructured.staging.base import elements_to_dicts - -STEP_ID = "embed" - - -class EmbedStepResponse(TypedDict): - file_data_path: str - path: str - - -@dataclass -class EmbedStep(PipelineStep): - process: Embedder - identifier: str = STEP_ID - - def __str__(self): - return f"{self.identifier} ({self.process.config.embedding_provider})" - - def __post_init__(self): - config = ( - sterilize_dict(self.process.config.to_dict(redact_sensitive=True)) - if self.process.config - else None - ) - logger.info(f"Created {self.identifier} with configs: {config}") - - def should_embed(self, filepath: Path, file_data: FileData) -> bool: - if self.context.reprocess or file_data.reprocess: - return True - return not filepath.exists() - - def get_output_filepath(self, filename: Path) -> Path: - hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" - filepath = (self.cache_dir / hashed_output_file).resolve() - filepath.parent.mkdir(parents=True, exist_ok=True) - return filepath - - def _save_output(self, output_filepath: str, embedded_content: list[dict]): - with open(str(output_filepath), "w") as f: - logger.debug(f"Writing embedded output to: {output_filepath}") - json.dump(embedded_content, f, indent=2) - - async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse: - path = Path(path) - file_data = FileData.from_file(path=file_data_path) - output_filepath = self.get_output_filepath(filename=path) - if not self.should_embed(filepath=output_filepath, file_data=file_data): - logger.debug(f"Skipping embedding, output already exists: {output_filepath}") - return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath)) - fn_kwargs = 
{"elements_filepath": path} - if not asyncio.iscoroutinefunction(fn): - embed_content_raw = fn(**fn_kwargs) - elif semaphore := self.context.semaphore: - async with semaphore: - embed_content_raw = await fn(**fn_kwargs) - else: - embed_content_raw = await fn(**fn_kwargs) - - self._save_output( - output_filepath=str(output_filepath), - embedded_content=elements_to_dicts(embed_content_raw), - ) - return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath)) - - def get_hash(self, extras: Optional[list[str]]) -> str: - hashable_string = json.dumps( - self.process.config.to_dict(), sort_keys=True, ensure_ascii=True - ) - if extras: - hashable_string += "".join(extras) - return hashlib.sha256(hashable_string.encode()).hexdigest()[:12] diff --git a/unstructured/ingest/v2/pipeline/steps/index.py b/unstructured/ingest/v2/pipeline/steps/index.py deleted file mode 100644 index d91a035ab..000000000 --- a/unstructured/ingest/v2/pipeline/steps/index.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -import hashlib -import json -from dataclasses import dataclass -from typing import Any, Callable, Generator, Optional, TypeVar - -from unstructured.ingest.v2.interfaces.indexer import Indexer -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict - -IndexerT = TypeVar("IndexerT", bound=Indexer) - -STEP_ID = "index" - - -@dataclass -class IndexStep(PipelineStep): - process: IndexerT - identifier: str = STEP_ID - - def __str__(self): - return f"{self.identifier} ({self.process.__class__.__name__})" - - def __post_init__(self): - config = ( - sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True)) - if self.process.index_config - else None - ) - connection_config = ( - sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True)) - if self.process.connection_config - else None - ) - logger.info( - f"Created {self.identifier} with configs: {config}, " - f"connection configs: {connection_config}" - ) - - def run( - self, _fn: Callable[..., Any] | None = None, **kwargs: Any - ) -> Generator[str, None, None]: - for file_data in self.process.run(): - logger.debug(f"Generated file data: {file_data}") - try: - record_hash = self.get_hash(extras=[file_data.identifier]) - filename = f"{record_hash}.json" - filepath = (self.cache_dir / filename).resolve() - filepath.parent.mkdir(parents=True, exist_ok=True) - with open(str(filepath), "w") as f: - json.dump(file_data.to_dict(), f, indent=2) - yield str(filepath) - except Exception as e: - logger.error(f"failed to create index for file data: {file_data}", exc_info=True) - if self.context.raise_on_error: - raise e - continue - - def get_hash(self, extras: Optional[list[str]]) -> str: - hashable_string = json.dumps(self.process.index_config.to_dict()) - if extras: - hashable_string += "".join(extras) - return hashlib.sha256(hashable_string.encode()).hexdigest()[:12] diff --git a/unstructured/ingest/v2/pipeline/steps/partition.py b/unstructured/ingest/v2/pipeline/steps/partition.py deleted file mode 100644 index 541d2cae9..000000000 --- a/unstructured/ingest/v2/pipeline/steps/partition.py +++ /dev/null @@ -1,78 +0,0 @@ -import asyncio -import hashlib -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Optional, TypedDict - -from unstructured.ingest.v2.interfaces import FileData -from unstructured.ingest.v2.logger import 
logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict -from unstructured.ingest.v2.processes.partitioner import Partitioner - -STEP_ID = "partition" - - -class PartitionStepResponse(TypedDict): - file_data_path: str - path: str - - -@dataclass -class PartitionStep(PipelineStep): - process: Partitioner - identifier: str = STEP_ID - - def __str__(self): - return f"{self.identifier} ({self.process.config.strategy})" - - def __post_init__(self): - config = sterilize_dict(self.process.config.to_dict(redact_sensitive=True)) - logger.info(f"Created {self.identifier} with configs: {config}") - - def should_partition(self, filepath: Path, file_data: FileData) -> bool: - if self.context.reprocess or file_data.reprocess: - return True - return not filepath.exists() - - def get_output_filepath(self, filename: Path) -> Path: - hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json" - filepath = (self.cache_dir / hashed_output_file).resolve() - filepath.parent.mkdir(parents=True, exist_ok=True) - return filepath - - def _save_output(self, output_filepath: str, partitioned_content: list[dict]): - with open(str(output_filepath), "w") as f: - logger.debug(f"Writing partitioned output to: {output_filepath}") - json.dump(partitioned_content, f, indent=2) - - async def _run_async( - self, fn: Callable, path: str, file_data_path: str - ) -> Optional[PartitionStepResponse]: - path = Path(path) - file_data = FileData.from_file(path=file_data_path) - output_filepath = self.get_output_filepath(filename=Path(file_data_path)) - if not self.should_partition(filepath=output_filepath, file_data=file_data): - logger.debug(f"Skipping partitioning, output already exists: {output_filepath}") - return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath)) - fn_kwargs = {"filename": path, "metadata": file_data.metadata} - if not asyncio.iscoroutinefunction(fn): - partitioned_content = fn(**fn_kwargs) - elif semaphore := self.context.semaphore: - async with semaphore: - partitioned_content = await fn(**fn_kwargs) - else: - partitioned_content = await fn(**fn_kwargs) - self._save_output( - output_filepath=str(output_filepath), partitioned_content=partitioned_content - ) - return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath)) - - def get_hash(self, extras: Optional[list[str]]) -> str: - hashable_string = json.dumps( - self.process.config.to_dict(), sort_keys=True, ensure_ascii=True - ) - if extras: - hashable_string += "".join(extras) - return hashlib.sha256(hashable_string.encode()).hexdigest()[:12] diff --git a/unstructured/ingest/v2/pipeline/steps/stage.py b/unstructured/ingest/v2/pipeline/steps/stage.py deleted file mode 100644 index b4c6204ad..000000000 --- a/unstructured/ingest/v2/pipeline/steps/stage.py +++ /dev/null @@ -1,64 +0,0 @@ -import asyncio -import hashlib -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Optional, TypedDict - -from unstructured.ingest.v2.interfaces.file_data import FileData -from unstructured.ingest.v2.interfaces.upload_stager import UploadStager -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict - -STEP_ID = "upload_stage" - - -class UploadStageStepResponse(TypedDict): - file_data_path: str - path: str - - -@dataclass -class 
UploadStageStep(PipelineStep): - process: UploadStager - identifier: str = STEP_ID - - def __str__(self): - return f"{self.identifier} ({self.process.__class__.__name__})" - - def __post_init__(self): - config = ( - sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True)) - if self.process.upload_stager_config - else None - ) - self.cache_dir.mkdir(parents=True, exist_ok=True) - logger.info(f"Created {self.identifier} with configs: {config}") - - async def _run_async( - self, fn: Callable, path: str, file_data_path: str - ) -> UploadStageStepResponse: - path = Path(path) - fn_kwargs = { - "elements_filepath": path, - "file_data": FileData.from_file(path=file_data_path), - "output_dir": self.cache_dir, - "output_filename": self.get_hash(extras=[path.name]), - } - if not asyncio.iscoroutinefunction(fn): - staged_output_path = fn(**fn_kwargs) - elif semaphore := self.context.semaphore: - async with semaphore: - staged_output_path = await fn(**fn_kwargs) - else: - staged_output_path = await fn(**fn_kwargs) - return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path)) - - def get_hash(self, extras: Optional[list[str]]) -> str: - hashable_string = json.dumps( - self.process.upload_stager_config.to_dict(), sort_keys=True, ensure_ascii=True - ) - if extras: - hashable_string += "".join(extras) - return hashlib.sha256(hashable_string.encode()).hexdigest()[:12] diff --git a/unstructured/ingest/v2/pipeline/steps/uncompress.py b/unstructured/ingest/v2/pipeline/steps/uncompress.py deleted file mode 100644 index 987c9d5f6..000000000 --- a/unstructured/ingest/v2/pipeline/steps/uncompress.py +++ /dev/null @@ -1,68 +0,0 @@ -import asyncio -from pathlib import Path -from typing import Callable, TypedDict - -from unstructured.ingest.v2.interfaces.file_data import FileData -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep -from unstructured.ingest.v2.pipeline.utils import sterilize_dict -from unstructured.ingest.v2.processes.uncompress import Uncompressor - -STEP_ID = "uncompress" - - -class UncompressStepResponse(TypedDict): - file_data_path: str - path: str - - -class UncompressStep(PipelineStep): - process: Uncompressor - identifier: str = STEP_ID - - def __post_init__(self): - config = ( - sterilize_dict(self.process.config.to_dict(redact_sensitive=True)) - if self.process.config - else None - ) - logger.info(f"Created {self.identifier} with configs: {config}") - - def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]: - file_data = FileData.from_file(path=file_data_path) - new_file_data = self.process.run(file_data=file_data) - responses = [] - for new_file in new_file_data: - new_file_data_path = Path(file_data_path).parent / f"{new_file.identifier}.json" - new_file.to_file(path=str(new_file_data_path.resolve())) - responses.append( - UncompressStepResponse( - path=new_file.source_identifiers.fullpath, - file_data_path=str(new_file_data_path), - ) - ) - return responses - - async def _run_async( - self, fn: Callable, path: str, file_data_path: str - ) -> list[UncompressStepResponse]: - file_data = FileData.from_file(path=file_data_path) - fn_kwargs = {"file_data": file_data} - if not asyncio.iscoroutinefunction(fn): - new_file_data = fn(**fn_kwargs) - elif semaphore := self.context.semaphore: - async with semaphore: - new_file_data = await fn(**fn_kwargs) - else: - new_file_data = await fn(**fn_kwargs) - responses = [] - for new_file in new_file_data: - 
new_file_data_path = Path(file_data_path).parent / f"{new_file.identifier}.json" - new_file.to_file(path=str(new_file_data_path.resolve())) - responses.append( - UncompressStepResponse( - path=new_file.source_identifiers.fullpath, - file_data_path=str(new_file_data_path), - ) - ) - return responses diff --git a/unstructured/ingest/v2/pipeline/steps/upload.py b/unstructured/ingest/v2/pipeline/steps/upload.py deleted file mode 100644 index dc58d46ac..000000000 --- a/unstructured/ingest/v2/pipeline/steps/upload.py +++ /dev/null @@ -1,73 +0,0 @@ -import asyncio -from dataclasses import dataclass -from pathlib import Path -from typing import Callable, Optional, TypedDict - -from unstructured.ingest.v2.interfaces import FileData -from unstructured.ingest.v2.interfaces.uploader import UploadContent, Uploader -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed -from unstructured.ingest.v2.pipeline.utils import sterilize_dict - -STEP_ID = "upload" - - -class UploadStepContent(TypedDict): - path: str - file_data_path: str - - -@dataclass -class UploadStep(PipelineStep): - process: Uploader - identifier: str = STEP_ID - - def __str__(self): - return f"{self.identifier} ({self.process.__class__.__name__})" - - def __post_init__(self): - config = ( - sterilize_dict(self.process.upload_config.to_dict(redact_sensitive=True)) - if self.process.upload_config - else None - ) - connection_config = ( - sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True)) - if self.process.connection_config - else None - ) - logger.info( - f"Created {self.identifier} with configs: {config}, " - f"connection configs: {connection_config}" - ) - - def process_whole(self, iterable: iterable_input): - self.run(contents=iterable) - - @timed - def __call__(self, iterable: iterable_input): - logger.info( - f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore - ) - if self.process.is_async(): - self.process_async(iterable=iterable) - else: - self.process_whole(iterable=iterable) - - def _run(self, fn: Callable, contents: list[UploadStepContent]): - upload_contents = [ - UploadContent(path=Path(c["path"]), file_data=FileData.from_file(c["file_data_path"])) - for c in contents - ] - fn(contents=upload_contents) - - async def _run_async(self, path: str, file_data_path: str, fn: Optional[Callable] = None): - fn = fn or self.process.run_async - fn_kwargs = {"path": Path(path), "file_data": FileData.from_file(path=file_data_path)} - if not asyncio.iscoroutinefunction(fn): - fn(**fn_kwargs) - elif semaphore := self.context.semaphore: - async with semaphore: - await fn(**fn_kwargs) - else: - await fn(**fn_kwargs) diff --git a/unstructured/ingest/v2/pipeline/utils.py b/unstructured/ingest/v2/pipeline/utils.py deleted file mode 100644 index e684ebb10..000000000 --- a/unstructured/ingest/v2/pipeline/utils.py +++ /dev/null @@ -1,16 +0,0 @@ -import json -from datetime import datetime -from pathlib import Path -from typing import Any - - -def sterilize_dict(data: dict[str, Any]) -> dict[str, Any]: - def json_serial(obj: Any) -> str: - if isinstance(obj, Path): - return obj.as_posix() - if isinstance(obj, datetime): - return obj.isoformat() - raise TypeError("Type %s not serializable" % type(obj)) - - data_s = json.dumps(data, default=json_serial) - return json.loads(data_s) diff --git a/unstructured/ingest/v2/processes/__init__.py b/unstructured/ingest/v2/processes/__init__.py deleted file mode 100644 index 
e69de29bb..000000000 diff --git a/unstructured/ingest/v2/processes/chunker.py b/unstructured/ingest/v2/processes/chunker.py deleted file mode 100644 index 11dffb073..000000000 --- a/unstructured/ingest/v2/processes/chunker.py +++ /dev/null @@ -1,96 +0,0 @@ -from abc import ABC -from dataclasses import dataclass, fields -from pathlib import Path -from typing import Any, Optional - -from unstructured.chunking import dispatch -from unstructured.documents.elements import Element, assign_and_map_hash_ids -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field -from unstructured.ingest.v2.interfaces.process import BaseProcess -from unstructured.ingest.v2.logger import logger -from unstructured.staging.base import dict_to_elements, elements_from_json - - -@dataclass -class ChunkerConfig(EnhancedDataClassJsonMixin): - chunking_strategy: Optional[str] = None - chunking_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general" - chunk_by_api: bool = False - chunk_api_key: Optional[str] = enhanced_field(default=None, sensitive=True) - - chunk_combine_text_under_n_chars: Optional[int] = None - chunk_include_orig_elements: Optional[bool] = None - chunk_max_characters: Optional[int] = None - chunk_multipage_sections: Optional[bool] = None - chunk_new_after_n_chars: Optional[int] = None - chunk_overlap: Optional[int] = None - chunk_overlap_all: Optional[bool] = None - - def to_chunking_kwargs(self) -> dict[str, Any]: - return { - "chunking_strategy": self.chunking_strategy, - "combine_under_n_chars": self.chunk_combine_text_under_n_chars, - "max_characters": self.chunk_max_characters, - "include_orig_elements": self.chunk_include_orig_elements, - "multipage_sections": self.chunk_multipage_sections, - "new_after_n_chars": self.chunk_new_after_n_chars, - "overlap": self.chunk_overlap, - "overlap_all": self.chunk_overlap_all, - } - - -@dataclass -class Chunker(BaseProcess, ABC): - config: ChunkerConfig - - def is_async(self) -> bool: - return self.config.chunk_by_api - - def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]: - elements = elements_from_json(filename=str(elements_filepath)) - if not elements: - return elements - local_chunking_strategies = ("basic", "by_title") - if self.config.chunking_strategy not in local_chunking_strategies: - logger.warning( - "chunking strategy not supported for local chunking: {}, must be one of: {}".format( - self.config.chunking_strategy, ", ".join(local_chunking_strategies) - ) - ) - return elements - chunked_elements = dispatch.chunk(elements=elements, **self.config.to_chunking_kwargs()) - assign_and_map_hash_ids(chunked_elements) - return chunked_elements - - async def run_async(self, elements_filepath: Path, **kwargs: Any) -> list[Element]: - from unstructured_client import UnstructuredClient - from unstructured_client.models.shared import Files, PartitionParameters - - client = UnstructuredClient( - api_key_auth=self.config.chunk_api_key, - server_url=self.config.chunking_endpoint, - ) - partition_request = self.config.to_chunking_kwargs() - possible_fields = [f.name for f in fields(PartitionParameters)] - filtered_partition_request = { - k: v for k, v in partition_request.items() if k in possible_fields - } - if len(filtered_partition_request) != len(partition_request): - logger.debug( - "Following fields were omitted due to not being " - "supported by the currently used unstructured client: {}".format( - ", ".join([v for v in partition_request if v not in filtered_partition_request]) - ) 
- ) - with open(elements_filepath, "rb") as f: - files = Files( - content=f.read(), - file_name=str(elements_filepath.resolve()), - ) - filtered_partition_request["files"] = files - partition_params = PartitionParameters(**filtered_partition_request) - resp = client.general.partition(partition_params) - elements_raw = resp.elements or [] - elements = dict_to_elements(elements_raw) - assign_and_map_hash_ids(elements) - return elements diff --git a/unstructured/ingest/v2/processes/connector_registry.py b/unstructured/ingest/v2/processes/connector_registry.py deleted file mode 100644 index 41abdd4c8..000000000 --- a/unstructured/ingest/v2/processes/connector_registry.py +++ /dev/null @@ -1,63 +0,0 @@ -from dataclasses import dataclass -from typing import Optional, Type, TypeVar - -from unstructured.ingest.v2.interfaces import ( - ConnectionConfig, - Downloader, - DownloaderConfig, - Indexer, - IndexerConfig, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) - -IndexerT = TypeVar("IndexerT", bound=Indexer) -IndexerConfigT = TypeVar("IndexerConfigT", bound=IndexerConfig) -DownloaderT = TypeVar("DownloaderT", bound=Downloader) -DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig) -ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig) -UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig) -UploadStagerT = TypeVar("UploadStagerT", bound=UploadStager) -UploaderConfigT = TypeVar("UploaderConfigT", bound=UploaderConfig) -UploaderT = TypeVar("UploaderT", bound=Uploader) - - -@dataclass -class SourceRegistryEntry: - indexer: Type[IndexerT] - downloader: Type[DownloaderT] - - downloader_config: Optional[Type[DownloaderConfigT]] = None - indexer_config: Optional[Type[IndexerConfigT]] = None - connection_config: Optional[Type[ConnectionConfigT]] = None - - -source_registry: dict[str, SourceRegistryEntry] = {} - - -def add_source_entry(source_type: str, entry: SourceRegistryEntry): - if source_type in source_registry: - raise ValueError(f"source {source_type} has already been registered") - source_registry[source_type] = entry - - -@dataclass -class DestinationRegistryEntry: - uploader: Type[UploaderT] - upload_stager: Optional[Type[UploadStagerT]] = None - - upload_stager_config: Optional[Type[UploadStagerConfigT]] = None - uploader_config: Optional[Type[UploaderConfigT]] = None - - connection_config: Optional[Type[ConnectionConfigT]] = None - - -destination_registry: dict[str, DestinationRegistryEntry] = {} - - -def add_destination_entry(destination_type: str, entry: DestinationRegistryEntry): - if destination_type in destination_registry: - raise ValueError(f"destination {destination_type} has already been registered") - destination_registry[destination_type] = entry diff --git a/unstructured/ingest/v2/processes/connectors/__init__.py b/unstructured/ingest/v2/processes/connectors/__init__.py deleted file mode 100644 index 5e4e2cf13..000000000 --- a/unstructured/ingest/v2/processes/connectors/__init__.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import annotations - -import unstructured.ingest.v2.processes.connectors.fsspec # noqa: F401 -from unstructured.ingest.v2.processes.connector_registry import ( - add_destination_entry, - add_source_entry, -) - -from .astradb import CONNECTOR_TYPE as ASTRADB_CONNECTOR_TYPE -from .astradb import astradb_destination_entry -from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE -from .chroma import chroma_destination_entry -from .databricks_volumes import CONNECTOR_TYPE as 
DATABRICKS_VOLUMES_CONNECTOR_TYPE -from .databricks_volumes import databricks_volumes_destination_entry -from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE -from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry -from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE -from .google_drive import google_drive_source_entry -from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE -from .local import local_destination_entry, local_source_entry -from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE -from .mongodb import mongodb_destination_entry -from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE -from .onedrive import onedrive_source_entry -from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE -from .opensearch import opensearch_destination_entry, opensearch_source_entry -from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE -from .pinecone import pinecone_destination_entry -from .salesforce import CONNECTOR_TYPE as SALESFORCE_CONNECTOR_TYPE -from .salesforce import salesforce_source_entry -from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE -from .sharepoint import sharepoint_source_entry -from .singlestore import CONNECTOR_TYPE as SINGLESTORE_CONNECTOR_TYPE -from .singlestore import singlestore_destination_entry -from .sql import CONNECTOR_TYPE as SQL_CONNECTOR_TYPE -from .sql import sql_destination_entry -from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE -from .weaviate import weaviate_destination_entry - -add_destination_entry(destination_type=ASTRADB_CONNECTOR_TYPE, entry=astradb_destination_entry) - -add_destination_entry(destination_type=CHROMA_CONNECTOR_TYPE, entry=chroma_destination_entry) - -add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry) -add_destination_entry( - destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry -) - -add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry) - -add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry) -add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry) - -add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry) - -add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry) -add_destination_entry( - destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry -) - -add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry) - -add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry) - -add_destination_entry( - destination_type=DATABRICKS_VOLUMES_CONNECTOR_TYPE, entry=databricks_volumes_destination_entry -) - -add_destination_entry(destination_type=SQL_CONNECTOR_TYPE, entry=sql_destination_entry) - -add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry) -add_destination_entry(destination_type=PINECONE_CONNECTOR_TYPE, entry=pinecone_destination_entry) -add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_entry) -add_destination_entry( - destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry -) diff --git a/unstructured/ingest/v2/processes/connectors/astradb.py b/unstructured/ingest/v2/processes/connectors/astradb.py deleted file mode 100644 index dc10862e8..000000000 --- 
a/unstructured/ingest/v2/processes/connectors/astradb.py +++ /dev/null @@ -1,151 +0,0 @@ -import json -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional - -from unstructured import __name__ as integration_name -from unstructured.__version__ import __version__ as integration_version -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - FileData, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, -) -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from astrapy.db import AstraDBCollection - -CONNECTOR_TYPE = "astradb" - - -@dataclass -class AstraDBAccessConfig(AccessConfig): - token: str - api_endpoint: str - - -@dataclass -class AstraDBConnectionConfig(ConnectionConfig): - connection_type: str = CONNECTOR_TYPE - access_config: AstraDBAccessConfig = enhanced_field(sensitive=True) - - -@dataclass -class AstraDBUploadStagerConfig(UploadStagerConfig): - pass - - -@dataclass -class AstraDBUploadStager(UploadStager): - upload_stager_config: AstraDBUploadStagerConfig = field( - default_factory=lambda: AstraDBUploadStagerConfig() - ) - - def conform_dict(self, element_dict: dict) -> dict: - return { - "$vector": element_dict.pop("embeddings", None), - "content": element_dict.pop("text", None), - "metadata": element_dict, - } - - def run( - self, - elements_filepath: Path, - file_data: FileData, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - conformed_elements = [] - for element in elements_contents: - conformed_elements.append(self.conform_dict(element_dict=element)) - output_path = Path(output_dir) / Path(f"{output_filename}.json") - with open(output_path, "w") as output_file: - json.dump(conformed_elements, output_file) - return output_path - - -@dataclass -class AstraDBUploaderConfig(UploaderConfig): - collection_name: str - embedding_dimension: int - namespace: Optional[str] = None - requested_indexing_policy: Optional[dict[str, Any]] = None - batch_size: int = 20 - - -@dataclass -class AstraDBUploader(Uploader): - connection_config: AstraDBConnectionConfig - upload_config: AstraDBUploaderConfig - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["astrapy"], extras="astradb") - def get_collection(self) -> "AstraDBCollection": - from astrapy.db import AstraDB - - # Get the collection_name and embedding dimension - collection_name = self.upload_config.collection_name - embedding_dimension = self.upload_config.embedding_dimension - requested_indexing_policy = self.upload_config.requested_indexing_policy - - # If the user has requested an indexing policy, pass it to the Astra DB - options = {"indexing": requested_indexing_policy} if requested_indexing_policy else None - - # Build the Astra DB object. 
- # caller_name/version for Astra DB tracking - astra_db = AstraDB( - api_endpoint=self.connection_config.access_config.api_endpoint, - token=self.connection_config.access_config.token, - namespace=self.upload_config.namespace, - caller_name=integration_name, - caller_version=integration_version, - ) - - # Create and connect to the newly created collection - astra_db_collection = astra_db.create_collection( - collection_name=collection_name, - dimension=embedding_dimension, - options=options, - ) - return astra_db_collection - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - elements_dict = [] - for content in contents: - with open(content.path) as elements_file: - elements = json.load(elements_file) - elements_dict.extend(elements) - - logger.info( - f"writing {len(elements_dict)} objects to destination " - f"collection {self.upload_config.collection_name}" - ) - - astra_batch_size = self.upload_config.batch_size - collection = self.get_collection() - - for chunk in batch_generator(elements_dict, astra_batch_size): - collection.insert_many(chunk) - - -astradb_destination_entry = DestinationRegistryEntry( - connection_config=AstraDBConnectionConfig, - upload_stager_config=AstraDBUploadStagerConfig, - upload_stager=AstraDBUploadStager, - uploader_config=AstraDBUploaderConfig, - uploader=AstraDBUploader, -) diff --git a/unstructured/ingest/v2/processes/connectors/azure_cognitive_search.py b/unstructured/ingest/v2/processes/connectors/azure_cognitive_search.py deleted file mode 100644 index aab7cfba4..000000000 --- a/unstructured/ingest/v2/processes/connectors/azure_cognitive_search.py +++ /dev/null @@ -1,208 +0,0 @@ -import json -import typing as t -import uuid -from dataclasses import dataclass, field -from pathlib import Path - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError, WriteError -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - add_destination_entry, -) -from unstructured.ingest.v2.processes.connectors.utils import parse_datetime -from unstructured.utils import requires_dependencies - -if t.TYPE_CHECKING: - from azure.search.documents import SearchClient - - -CONNECTOR_TYPE = "azure_cognitive_search" - - -@dataclass -class AzureCognitiveSearchAccessConfig(AccessConfig): - key: t.Optional[str] = enhanced_field(default=None, overload_name="azure_cognitive_search_key") - - -@dataclass -class AzureCognitiveSearchConnectionConfig(ConnectionConfig): - endpoint: str - index: str - access_config: AzureCognitiveSearchAccessConfig = enhanced_field(sensitive=True) - - @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search") - def generate_client(self) -> "SearchClient": - from azure.core.credentials import AzureKeyCredential - from azure.search.documents import SearchClient - - return SearchClient( - endpoint=self.endpoint, - index_name=self.index, - credential=AzureKeyCredential(self.access_config.key), - ) - - -@dataclass -class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig): - pass - - -@dataclass -class AzureCognitiveSearchUploaderConfig(UploaderConfig): - batch_size: int = 100 - - -@dataclass -class 
AzureCognitiveSearchUploadStager(UploadStager): - upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field( - default_factory=lambda: AzureCognitiveSearchUploadStagerConfig() - ) - - @staticmethod - def conform_dict(data: dict) -> dict: - """ - updates the dictionary that is from each Element being converted into a dict/json - into a dictionary that conforms to the schema expected by the - Azure Cognitive Search index - """ - - data["id"] = str(uuid.uuid4()) - - if points := data.get("metadata", {}).get("coordinates", {}).get("points"): - data["metadata"]["coordinates"]["points"] = json.dumps(points) - if version := data.get("metadata", {}).get("data_source", {}).get("version"): - data["metadata"]["data_source"]["version"] = str(version) - if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"): - data["metadata"]["data_source"]["record_locator"] = json.dumps(record_locator) - if permissions_data := ( - data.get("metadata", {}).get("data_source", {}).get("permissions_data") - ): - data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data) - if links := data.get("metadata", {}).get("links"): - data["metadata"]["links"] = [json.dumps(link) for link in links] - if last_modified := data.get("metadata", {}).get("last_modified"): - data["metadata"]["last_modified"] = parse_datetime(last_modified).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ" - ) - if date_created := data.get("metadata", {}).get("data_source", {}).get("date_created"): - data["metadata"]["data_source"]["date_created"] = parse_datetime(date_created).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ" - ) - - if date_modified := data.get("metadata", {}).get("data_source", {}).get("date_modified"): - data["metadata"]["data_source"]["date_modified"] = parse_datetime( - date_modified - ).strftime("%Y-%m-%dT%H:%M:%S.%fZ") - - if date_processed := data.get("metadata", {}).get("data_source", {}).get("date_processed"): - data["metadata"]["data_source"]["date_processed"] = parse_datetime( - date_processed - ).strftime("%Y-%m-%dT%H:%M:%S.%fZ") - - if page_number := data.get("metadata", {}).get("page_number"): - data["metadata"]["page_number"] = str(page_number) - return data - - def run( - self, - elements_filepath: Path, - output_dir: Path, - output_filename: str, - **kwargs: t.Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - - conformed_elements = [self.conform_dict(data=element) for element in elements_contents] - - output_path = Path(output_dir) / Path(f"{output_filename}.json") - with open(output_path, "w") as output_file: - json.dump(conformed_elements, output_file) - return output_path - - -@dataclass -class AzureCognitiveSearchUploader(Uploader): - upload_config: AzureCognitiveSearchUploaderConfig - connection_config: AzureCognitiveSearchConnectionConfig - connector_type: str = CONNECTOR_TYPE - - @DestinationConnectionError.wrap - @requires_dependencies(["azure"], extras="azure-cognitive-search") - def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None: - import azure.core.exceptions - - logger.info( - f"writing {len(elements_dict)} documents to destination " - f"index at {self.connection_config.index}", - ) - try: - results = self.connection_config.generate_client().upload_documents( - documents=elements_dict - ) - - except azure.core.exceptions.HttpResponseError as http_error: - raise WriteError(f"http error: {http_error}") from http_error - errors = [] - success = [] - for result in 
results: - if result.succeeded: - success.append(result) - else: - errors.append(result) - logger.debug(f"results: {len(success)} successes, {len(errors)} failures") - if errors: - raise WriteError( - ", ".join( - [ - f"{error.key}: [{error.status_code}] {error.error_message}" - for error in errors - ], - ), - ) - - def write_dict_wrapper(self, elements_dict): - return self.write_dict(elements_dict=elements_dict) - - def run(self, contents: list[UploadContent], **kwargs: t.Any) -> None: - elements_dict = [] - for content in contents: - with open(content.path) as elements_file: - elements = json.load(elements_file) - elements_dict.extend(elements) - - logger.info( - f"writing document batches to destination" - f" endpoint at {str(self.connection_config.endpoint)}" - f" index at {str(self.connection_config.index)}" - f" with batch size {str(self.upload_config.batch_size)}" - ) - - batch_size = self.upload_config.batch_size - - for chunk in batch_generator(elements_dict, batch_size): - self.write_dict(elements_dict=chunk) # noqa: E203 - - -add_destination_entry( - destination_type=CONNECTOR_TYPE, - entry=DestinationRegistryEntry( - connection_config=AzureCognitiveSearchConnectionConfig, - uploader=AzureCognitiveSearchUploader, - uploader_config=AzureCognitiveSearchUploaderConfig, - upload_stager=AzureCognitiveSearchUploadStager, - upload_stager_config=AzureCognitiveSearchUploadStagerConfig, - ), -) diff --git a/unstructured/ingest/v2/processes/connectors/chroma.py b/unstructured/ingest/v2/processes/connectors/chroma.py deleted file mode 100644 index e28e3d7f7..000000000 --- a/unstructured/ingest/v2/processes/connectors/chroma.py +++ /dev/null @@ -1,208 +0,0 @@ -from __future__ import annotations - -import json -import uuid -from dataclasses import dataclass, field -from datetime import date, datetime -from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, Optional - -from chromadb.config import Settings -from dateutil import parser - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - FileData, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, -) -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from chromadb.api import ClientAPI - -CONNECTOR_TYPE = "chroma" - - -@dataclass -class ChromaAccessConfig(AccessConfig): - settings: Optional[Settings] = None - headers: Optional[Dict[str, str]] = None - - -@dataclass -class ChromaConnectionConfig(ConnectionConfig): - collection_name: str - access_config: ChromaAccessConfig = enhanced_field(sensitive=True) - path: Optional[str] = None - tenant: str = "default_tenant" - database: str = "default_database" - host: Optional[str] = None - port: Optional[int] = None - ssl: bool = False - connector_type: str = CONNECTOR_TYPE - - -@dataclass -class ChromaUploadStagerConfig(UploadStagerConfig): - pass - - -@dataclass -class ChromaUploadStager(UploadStager): - upload_stager_config: ChromaUploadStagerConfig = field( - default_factory=lambda: ChromaUploadStagerConfig() - ) - - @staticmethod - def parse_date_string(date_string: str) -> date: - try: - timestamp = 
float(date_string) - return datetime.fromtimestamp(timestamp) - except Exception as e: - logger.debug(f"date {date_string} string not a timestamp: {e}") - return parser.parse(date_string) - - @staticmethod - def conform_dict(data: dict) -> dict: - """ - Prepares dictionary in the format that Chroma requires - """ - element_id = data.get("element_id", str(uuid.uuid4())) - return { - "id": element_id, - "embedding": data.pop("embeddings", None), - "document": data.pop("text", None), - "metadata": flatten_dict(data, separator="-", flatten_lists=True, remove_none=True), - } - - def run( - self, - elements_filepath: Path, - file_data: FileData, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - conformed_elements = [self.conform_dict(data=element) for element in elements_contents] - output_path = Path(output_dir) / Path(f"{output_filename}.json") - with open(output_path, "w") as output_file: - json.dump(conformed_elements, output_file) - return output_path - - -@dataclass -class ChromaUploaderConfig(UploaderConfig): - batch_size: int = 100 - - -@dataclass -class ChromaUploader(Uploader): - connector_type: str = CONNECTOR_TYPE - upload_config: ChromaUploaderConfig - connection_config: ChromaConnectionConfig - client: Optional[ClientAPI] = field(init=False) - - def __post_init__(self): - self.client = self.create_client() - - @requires_dependencies(["chromadb"], extras="chroma") - def create_client(self) -> ClientAPI: - import chromadb - - if self.connection_config.path: - return chromadb.PersistentClient( - path=self.connection_config.path, - settings=self.connection_config.access_config.settings, - tenant=self.connection_config.tenant, - database=self.connection_config.database, - ) - - elif self.connection_config.host and self.connection_config.port: - return chromadb.HttpClient( - host=self.connection_config.host, - port=self.connection_config.port, - ssl=self.connection_config.ssl, - headers=self.connection_config.access_config.headers, - settings=self.connection_config.access_config.settings, - tenant=self.connection_config.tenant, - database=self.connection_config.database, - ) - else: - raise ValueError("Chroma connector requires either path or host and port to be set.") - - @DestinationConnectionError.wrap - def upsert_batch(self, collection, batch): - - try: - # Chroma wants lists even if there is only one element - # Upserting to prevent duplicates - collection.upsert( - ids=batch["ids"], - documents=batch["documents"], - embeddings=batch["embeddings"], - metadatas=batch["metadatas"], - ) - except Exception as e: - raise ValueError(f"chroma error: {e}") from e - - @staticmethod - def prepare_chroma_list(chunk: tuple[dict[str, Any]]) -> dict[str, list[Any]]: - """Helper function to break a tuple of dicts into list of parallel lists for ChromaDb. 
- ({'id':1}, {'id':2}, {'id':3}) -> {'ids':[1,2,3]}""" - chroma_dict = {} - chroma_dict["ids"] = [x.get("id") for x in chunk] - chroma_dict["documents"] = [x.get("document") for x in chunk] - chroma_dict["embeddings"] = [x.get("embedding") for x in chunk] - chroma_dict["metadatas"] = [x.get("metadata") for x in chunk] - # Make sure all lists are of the same length - assert ( - len(chroma_dict["ids"]) - == len(chroma_dict["documents"]) - == len(chroma_dict["embeddings"]) - == len(chroma_dict["metadatas"]) - ) - return chroma_dict - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - - elements_dict = [] - for content in contents: - with open(content.path) as elements_file: - elements = json.load(elements_file) - elements_dict.extend(elements) - - logger.info( - f"writing {len(elements_dict)} objects to destination " - f"collection {self.connection_config.collection_name} " - f"at {self.connection_config.host}", - ) - - collection = self.client.get_or_create_collection( - name=self.connection_config.collection_name - ) - for chunk in batch_generator(elements_dict, self.upload_config.batch_size): - self.upsert_batch(collection, self.prepare_chroma_list(chunk)) - - -chroma_destination_entry = DestinationRegistryEntry( - connection_config=ChromaConnectionConfig, - uploader=ChromaUploader, - uploader_config=ChromaUploaderConfig, - upload_stager=ChromaUploadStager, - upload_stager_config=ChromaUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/databricks_volumes.py b/unstructured/ingest/v2/processes/connectors/databricks_volumes.py deleted file mode 100644 index e875535c2..000000000 --- a/unstructured/ingest/v2/processes/connectors/databricks_volumes.py +++ /dev/null @@ -1,96 +0,0 @@ -import os -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Optional - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - UploadContent, - Uploader, - UploaderConfig, -) -from unstructured.ingest.v2.processes.connector_registry import DestinationRegistryEntry -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from databricks.sdk import WorkspaceClient - -CONNECTOR_TYPE = "databricks_volumes" - - -@dataclass -class DatabricksVolumesAccessConfig(AccessConfig): - account_id: Optional[str] = None - username: Optional[str] = None - password: Optional[str] = None - client_id: Optional[str] = None - client_secret: Optional[str] = None - token: Optional[str] = None - profile: Optional[str] = None - azure_workspace_resource_id: Optional[str] = None - azure_client_secret: Optional[str] = None - azure_client_id: Optional[str] = None - azure_tenant_id: Optional[str] = None - azure_environment: Optional[str] = None - auth_type: Optional[str] = None - cluster_id: Optional[str] = None - google_credentials: Optional[str] = None - google_service_account: Optional[str] = None - - -@dataclass -class DatabricksVolumesConnectionConfig(ConnectionConfig): - access_config: DatabricksVolumesAccessConfig = enhanced_field( - default_factory=DatabricksVolumesAccessConfig, sensitive=True - ) - host: Optional[str] = None - - -@dataclass -class DatabricksVolumesUploaderConfig(UploaderConfig): - volume: str - catalog: str - volume_path: Optional[str] = None - overwrite: bool = False - schema: str = "default" - - @property - def path(self) -> str: - path = f"/Volumes/{self.catalog}/{self.schema}/{self.volume}" - if self.volume_path: - path = 
f"{path}/{self.volume_path}" - return path - - -@dataclass -class DatabricksVolumesUploader(Uploader): - connector_type: str = CONNECTOR_TYPE - upload_config: DatabricksVolumesUploaderConfig - connection_config: DatabricksVolumesConnectionConfig - client: Optional["WorkspaceClient"] = field(init=False, default=None) - - @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes") - def __post_init__(self) -> "WorkspaceClient": - from databricks.sdk import WorkspaceClient - - self.client = WorkspaceClient( - host=self.connection_config.host, **self.connection_config.access_config.to_dict() - ) - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - for content in contents: - with open(content.path, "rb") as elements_file: - output_path = os.path.join(self.upload_config.path, content.path.name) - self.client.files.upload( - file_path=output_path, - contents=elements_file, - overwrite=self.upload_config.overwrite, - ) - - -databricks_volumes_destination_entry = DestinationRegistryEntry( - connection_config=DatabricksVolumesConnectionConfig, - uploader=DatabricksVolumesUploader, - uploader_config=DatabricksVolumesUploaderConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/elasticsearch.py b/unstructured/ingest/v2/processes/connectors/elasticsearch.py deleted file mode 100644 index 4a45bae1b..000000000 --- a/unstructured/ingest/v2/processes/connectors/elasticsearch.py +++ /dev/null @@ -1,401 +0,0 @@ -import hashlib -import json -import sys -import uuid -from dataclasses import dataclass, field -from pathlib import Path -from time import time -from typing import TYPE_CHECKING, Any, Generator, Optional - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.utils.data_prep import generator_batching_wbytes -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - Downloader, - DownloaderConfig, - DownloadResponse, - FileData, - Indexer, - IndexerConfig, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, - download_responses, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from elasticsearch import Elasticsearch as ElasticsearchClient - -CONNECTOR_TYPE = "elasticsearch" - - -@dataclass -class ElasticsearchAccessConfig(AccessConfig): - password: Optional[str] = None - api_key: Optional[str] = enhanced_field(default=None, overload_name="es_api_key") - bearer_auth: Optional[str] = None - ssl_assert_fingerprint: Optional[str] = None - - -@dataclass -class ElasticsearchClientInput(EnhancedDataClassJsonMixin): - hosts: Optional[list[str]] = None - cloud_id: Optional[str] = None - ca_certs: Optional[str] = None - basic_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None) - api_key: Optional[str] = enhanced_field(sensitive=True, default=None) - - -@dataclass -class ElasticsearchConnectionConfig(ConnectionConfig): - hosts: Optional[list[str]] = None - username: Optional[str] = None - cloud_id: Optional[str] = None - api_key_id: Optional[str] = None - ca_certs: Optional[str] = None - 
access_config: ElasticsearchAccessConfig = enhanced_field(sensitive=True) - - def get_client_kwargs(self) -> dict: - # Update auth related fields to conform to what the SDK expects based on the - # supported methods: - # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html - client_input = ElasticsearchClientInput() - if self.hosts: - client_input.hosts = self.hosts - if self.cloud_id: - client_input.cloud_id = self.cloud_id - if self.ca_certs: - client_input.ca_certs = self.ca_certs - if self.access_config.password and ( - self.cloud_id or self.ca_certs or self.access_config.ssl_assert_fingerprint - ): - client_input.basic_auth = ("elastic", self.access_config.password) - elif not self.cloud_id and self.username and self.access_config.password: - client_input.basic_auth = (self.username, self.access_config.password) - elif self.access_config.api_key and self.api_key_id: - client_input.api_key = (self.api_key_id, self.access_config.api_key) - elif self.access_config.api_key: - client_input.api_key = self.access_config.api_key - logger.debug( - f"Elasticsearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}" - ) - client_kwargs = client_input.to_dict(redact_sensitive=False) - client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None} - return client_kwargs - - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def get_client(self) -> "ElasticsearchClient": - from elasticsearch import Elasticsearch as ElasticsearchClient - - client = ElasticsearchClient(**self.get_client_kwargs()) - self.check_connection(client=client) - return client - - def check_connection(self, client: "ElasticsearchClient"): - try: - client.perform_request("HEAD", "/", headers={"accept": "application/json"}) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - -@dataclass -class ElasticsearchIndexerConfig(IndexerConfig): - index_name: str - batch_size: int = 100 - - -@dataclass -class ElasticsearchIndexer(Indexer): - connection_config: ElasticsearchConnectionConfig - index_config: ElasticsearchIndexerConfig - client: "ElasticsearchClient" = field(init=False) - connector_type: str = CONNECTOR_TYPE - - def __post_init__(self): - self.client = self.connection_config.get_client() - - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def load_scan(self): - from elasticsearch.helpers import scan - - return scan - - def _get_doc_ids(self) -> set[str]: - """Fetches all document ids in an index""" - scan = self.load_scan() - - scan_query: dict = {"stored_fields": [], "query": {"match_all": {}}} - hits = scan( - self.client, - query=scan_query, - scroll="1m", - index=self.index_config.index_name, - ) - - return {hit["_id"] for hit in hits} - - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - all_ids = self._get_doc_ids() - ids = list(all_ids) - id_batches: list[frozenset[str]] = [ - frozenset( - ids[ - i - * self.index_config.batch_size : (i + 1) # noqa - * self.index_config.batch_size - ] - ) - for i in range( - (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size - ) - ] - for batch in id_batches: - # Make sure the hash is always a positive number to create identified - identified = str(hash(batch) + sys.maxsize + 1) - yield FileData( - identifier=identified, - connector_type=CONNECTOR_TYPE, - metadata=DataSourceMetadata( - 
url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}", - date_processed=str(time()), - ), - additional_metadata={ - "ids": list(batch), - "index_name": self.index_config.index_name, - }, - ) - - -@dataclass -class ElasticsearchDownloaderConfig(DownloaderConfig): - fields: list[str] = field(default_factory=list) - - -@dataclass -class ElasticsearchDownloader(Downloader): - connection_config: ElasticsearchConnectionConfig - download_config: ElasticsearchDownloaderConfig - connector_type: str = CONNECTOR_TYPE - - def is_async(self) -> bool: - return True - - def get_identifier(self, index_name: str, record_id: str) -> str: - f = f"{index_name}-{record_id}" - if self.download_config.fields: - f = "{}-{}".format( - f, - hashlib.sha256(",".join(self.download_config.fields).encode()).hexdigest()[:8], - ) - return f - - def map_es_results(self, es_results: dict) -> str: - doc_body = es_results["_source"] - flattened_dict = flatten_dict(dictionary=doc_body) - str_values = [str(value) for value in flattened_dict.values()] - concatenated_values = "\n".join(str_values) - return concatenated_values - - def generate_download_response( - self, result: dict, index_name: str, file_data: FileData - ) -> DownloadResponse: - record_id = result["_id"] - filename_id = self.get_identifier(index_name=index_name, record_id=record_id) - filename = f"{filename_id}.txt" - download_path = self.download_dir / Path(filename) - logger.debug( - f"Downloading results from index {index_name} and id {record_id} to {download_path}" - ) - download_path.parent.mkdir(parents=True, exist_ok=True) - try: - with open(download_path, "w", encoding="utf8") as f: - f.write(self.map_es_results(es_results=result)) - except Exception as e: - logger.error( - f"failed to download from index {index_name} " - f"and id {record_id} to {download_path}: {e}", - exc_info=True, - ) - raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}") - return DownloadResponse( - file_data=FileData( - identifier=filename_id, - connector_type=CONNECTOR_TYPE, - metadata=DataSourceMetadata( - version=str(result["_version"]) if "_version" in result else None, - date_processed=str(time()), - record_locator={ - "hosts": self.connection_config.hosts, - "index_name": index_name, - "document_id": record_id, - }, - ), - ), - path=download_path, - ) - - def run(self, file_data: FileData, **kwargs: Any) -> download_responses: - raise NotImplementedError() - - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def load_async(self): - from elasticsearch import AsyncElasticsearch - from elasticsearch.helpers import async_scan - - return AsyncElasticsearch, async_scan - - async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses: - AsyncClient, async_scan = self.load_async() - - index_name: str = file_data.additional_metadata["index_name"] - ids: list[str] = file_data.additional_metadata["ids"] - - scan_query = { - "_source": self.download_config.fields, - "version": True, - "query": {"ids": {"values": ids}}, - } - - download_responses = [] - async with AsyncClient(**self.connection_config.get_client_kwargs()) as client: - async for result in async_scan( - client, - query=scan_query, - scroll="1m", - index=index_name, - ): - download_responses.append( - self.generate_download_response( - result=result, index_name=index_name, file_data=file_data - ) - ) - return download_responses - - -@dataclass -class ElasticsearchUploadStagerConfig(UploadStagerConfig): - index_name: str - - -@dataclass 
-class ElasticsearchUploadStager(UploadStager): - upload_stager_config: ElasticsearchUploadStagerConfig - - def conform_dict(self, data: dict) -> dict: - resp = { - "_index": self.upload_stager_config.index_name, - "_id": str(uuid.uuid4()), - "_source": { - "element_id": data.pop("element_id", None), - "embeddings": data.pop("embeddings", None), - "text": data.pop("text", None), - "type": data.pop("type", None), - }, - } - if "metadata" in data and isinstance(data["metadata"], dict): - resp["_source"]["metadata"] = flatten_dict(data["metadata"], separator="-") - return resp - - def run( - self, - elements_filepath: Path, - file_data: FileData, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - conformed_elements = [self.conform_dict(data=element) for element in elements_contents] - output_path = Path(output_dir) / Path(f"{output_filename}.json") - with open(output_path, "w") as output_file: - json.dump(conformed_elements, output_file) - return output_path - - -@dataclass -class ElasticsearchUploaderConfig(UploaderConfig): - index_name: str - batch_size_bytes: int = 15_000_000 - num_threads: int = 4 - - -@dataclass -class ElasticsearchUploader(Uploader): - connector_type: str = CONNECTOR_TYPE - upload_config: ElasticsearchUploaderConfig - connection_config: ElasticsearchConnectionConfig - - @requires_dependencies(["elasticsearch"], extras="elasticsearch") - def load_parallel_bulk(self): - from elasticsearch.helpers import parallel_bulk - - return parallel_bulk - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - parallel_bulk = self.load_parallel_bulk() - elements_dict = [] - for content in contents: - with open(content.path) as elements_file: - elements = json.load(elements_file) - elements_dict.extend(elements) - upload_destination = self.connection_config.hosts or self.connection_config.cloud_id - logger.info( - f"writing {len(elements_dict)} elements via document batches to destination " - f"index named {self.upload_config.index_name} at {upload_destination} with " - f"batch size (in bytes) {self.upload_config.batch_size_bytes} with " - f"{self.upload_config.num_threads} (number of) threads" - ) - - client = self.connection_config.get_client() - if not client.indices.exists(index=self.upload_config.index_name): - logger.warning( - f"{(self.__class__.__name__).replace('Uploader', '')} index does not exist: " - f"{self.upload_config.index_name}. " - f"This may cause issues when uploading." 
- ) - for batch in generator_batching_wbytes( - elements_dict, batch_size_limit_bytes=self.upload_config.batch_size_bytes - ): - for success, info in parallel_bulk( - client=client, - actions=batch, - thread_count=self.upload_config.num_threads, - ): - if not success: - logger.error( - "upload failed for a batch in " - f"{(self.__class__.__name__).replace('Uploader', '')} " - "destination connector:", - info, - ) - - -elasticsearch_source_entry = SourceRegistryEntry( - connection_config=ElasticsearchConnectionConfig, - indexer=ElasticsearchIndexer, - indexer_config=ElasticsearchIndexerConfig, - downloader=ElasticsearchDownloader, - downloader_config=ElasticsearchDownloaderConfig, -) - -elasticsearch_destination_entry = DestinationRegistryEntry( - connection_config=ElasticsearchConnectionConfig, - upload_stager_config=ElasticsearchUploadStagerConfig, - upload_stager=ElasticsearchUploadStager, - uploader_config=ElasticsearchUploaderConfig, - uploader=ElasticsearchUploader, -) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/__init__.py b/unstructured/ingest/v2/processes/connectors/fsspec/__init__.py deleted file mode 100644 index eacc0df96..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/__init__.py +++ /dev/null @@ -1,37 +0,0 @@ -from __future__ import annotations - -from unstructured.ingest.v2.processes.connector_registry import ( - add_destination_entry, - add_source_entry, -) - -from .azure import CONNECTOR_TYPE as AZURE_CONNECTOR_TYPE -from .azure import azure_destination_entry, azure_source_entry -from .box import CONNECTOR_TYPE as BOX_CONNECTOR_TYPE -from .box import box_destination_entry, box_source_entry -from .dropbox import CONNECTOR_TYPE as DROPBOX_CONNECTOR_TYPE -from .dropbox import dropbox_destination_entry, dropbox_source_entry -from .gcs import CONNECTOR_TYPE as GCS_CONNECTOR_TYPE -from .gcs import gcs_destination_entry, gcs_source_entry -from .s3 import CONNECTOR_TYPE as S3_CONNECTOR_TYPE -from .s3 import s3_destination_entry, s3_source_entry -from .sftp import CONNECTOR_TYPE as SFTP_CONNECTOR_TYPE -from .sftp import sftp_destination_entry, sftp_source_entry - -add_source_entry(source_type=AZURE_CONNECTOR_TYPE, entry=azure_source_entry) -add_destination_entry(destination_type=AZURE_CONNECTOR_TYPE, entry=azure_destination_entry) - -add_source_entry(source_type=BOX_CONNECTOR_TYPE, entry=box_source_entry) -add_destination_entry(destination_type=BOX_CONNECTOR_TYPE, entry=box_destination_entry) - -add_source_entry(source_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_source_entry) -add_destination_entry(destination_type=DROPBOX_CONNECTOR_TYPE, entry=dropbox_destination_entry) - -add_source_entry(source_type=GCS_CONNECTOR_TYPE, entry=gcs_source_entry) -add_destination_entry(destination_type=GCS_CONNECTOR_TYPE, entry=gcs_destination_entry) - -add_source_entry(source_type=S3_CONNECTOR_TYPE, entry=s3_source_entry) -add_destination_entry(destination_type=S3_CONNECTOR_TYPE, entry=s3_destination_entry) - -add_source_entry(source_type=SFTP_CONNECTOR_TYPE, entry=sftp_source_entry) -add_destination_entry(destination_type=SFTP_CONNECTOR_TYPE, entry=sftp_destination_entry) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/azure.py b/unstructured/ingest/v2/processes/connectors/fsspec/azure.py deleted file mode 100644 index 8dd756600..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/azure.py +++ /dev/null @@ -1,144 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from pathlib import Path 
-from typing import Any, Generator, Optional - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.v2.interfaces import DownloadResponse, FileData, UploadContent -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( - FsspecAccessConfig, - FsspecConnectionConfig, - FsspecDownloader, - FsspecDownloaderConfig, - FsspecIndexer, - FsspecIndexerConfig, - FsspecUploader, - FsspecUploaderConfig, -) -from unstructured.ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "azure" - - -def azure_json_serial(obj): - from azure.storage.blob._models import ContentSettings - - if isinstance(obj, ContentSettings): - return dict(obj) - if isinstance(obj, bytearray): - return str(obj) - return json_serial(obj) - - -@dataclass -class AzureIndexerConfig(FsspecIndexerConfig): - pass - - -@dataclass -class AzureAccessConfig(FsspecAccessConfig): - account_name: Optional[str] = None - account_key: Optional[str] = None - connection_string: Optional[str] = None - sas_token: Optional[str] = None - - def __post_init__(self): - if self.connection_string is None and self.account_name is None: - raise ValueError("either connection_string or account_name must be set") - - -@dataclass -class AzureConnectionConfig(FsspecConnectionConfig): - supported_protocols: list[str] = field(default_factory=lambda: ["az"]) - access_config: AzureAccessConfig = enhanced_field( - sensitive=True, default_factory=lambda: AzureAccessConfig() - ) - connector_type: str = CONNECTOR_TYPE - - def get_access_config(self) -> dict[str, Any]: - # Avoid injecting None by filtering out k,v pairs where the value is None - access_configs: dict[str, Any] = { - k: v for k, v in self.access_config.to_dict().items() if v - } - return access_configs - - -@dataclass -class AzureIndexer(FsspecIndexer): - connection_config: AzureConnectionConfig - index_config: AzureIndexerConfig - connector_type: str = CONNECTOR_TYPE - - def sterilize_info(self, path) -> dict: - info = self.fs.info(path=path) - return sterilize_dict(data=info, default=azure_json_serial) - - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - return super().run(**kwargs) - - -@dataclass -class AzureDownloaderConfig(FsspecDownloaderConfig): - pass - - -@dataclass -class AzureDownloader(FsspecDownloader): - protocol: str = "az" - connection_config: AzureConnectionConfig - connector_type: str = CONNECTOR_TYPE - download_config: Optional[AzureDownloaderConfig] = field(default_factory=AzureDownloaderConfig) - - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return super().run(file_data=file_data, **kwargs) - - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return await super().run_async(file_data=file_data, **kwargs) - - -@dataclass -class AzureUploaderConfig(FsspecUploaderConfig): - pass - - -@dataclass -class AzureUploader(FsspecUploader): - connector_type: str = CONNECTOR_TYPE - connection_config: AzureConnectionConfig - upload_config: AzureUploaderConfig = field(default=None) - - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - def 
__post_init__(self): - super().__post_init__() - - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - return super().run(contents=contents, **kwargs) - - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - return await super().run_async(path=path, file_data=file_data, **kwargs) - - -azure_source_entry = SourceRegistryEntry( - indexer=AzureIndexer, - indexer_config=AzureIndexerConfig, - downloader=AzureDownloader, - downloader_config=AzureDownloaderConfig, - connection_config=AzureConnectionConfig, -) - -azure_destination_entry = DestinationRegistryEntry( - uploader=AzureUploader, - uploader_config=AzureUploaderConfig, - connection_config=AzureConnectionConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/box.py b/unstructured/ingest/v2/processes/connectors/fsspec/box.py deleted file mode 100644 index 77d60c79e..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/box.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Generator, Optional - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.v2.interfaces import DownloadResponse, FileData, UploadContent -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( - FsspecAccessConfig, - FsspecConnectionConfig, - FsspecDownloader, - FsspecDownloaderConfig, - FsspecIndexer, - FsspecIndexerConfig, - FsspecUploader, - FsspecUploaderConfig, -) -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "box" - - -@dataclass -class BoxIndexerConfig(FsspecIndexerConfig): - pass - - -@dataclass -class BoxAccessConfig(FsspecAccessConfig): - box_app_config: Optional[str] = None - - -@dataclass -class BoxConnectionConfig(FsspecConnectionConfig): - supported_protocols: list[str] = field(default_factory=lambda: ["box"]) - access_config: BoxAccessConfig = enhanced_field( - sensitive=True, default_factory=lambda: BoxAccessConfig() - ) - connector_type: str = CONNECTOR_TYPE - - def get_access_config(self) -> dict[str, Any]: - # Return access_kwargs with oauth. The oauth object can not be stored directly in the config - # because it is not serializable. 
- from boxsdk import JWTAuth - - access_kwargs_with_oauth: dict[str, Any] = { - "oauth": JWTAuth.from_settings_file( - self.access_config.box_app_config, - ), - } - access_config: dict[str, Any] = self.access_config.to_dict() - access_config.pop("box_app_config", None) - access_kwargs_with_oauth.update(access_config) - - return access_kwargs_with_oauth - - -@dataclass -class BoxIndexer(FsspecIndexer): - connection_config: BoxConnectionConfig - index_config: BoxIndexerConfig - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["boxfs"], extras="box") - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - return super().run(**kwargs) - - -@dataclass -class BoxDownloaderConfig(FsspecDownloaderConfig): - pass - - -@dataclass -class BoxDownloader(FsspecDownloader): - protocol: str = "box" - connection_config: BoxConnectionConfig - connector_type: str = CONNECTOR_TYPE - download_config: Optional[BoxDownloaderConfig] = field(default_factory=BoxDownloaderConfig) - - @requires_dependencies(["boxfs"], extras="box") - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return super().run(file_data=file_data, **kwargs) - - @requires_dependencies(["boxfs"], extras="box") - async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return await super().run_async(file_data=file_data, **kwargs) - - -@dataclass -class BoxUploaderConfig(FsspecUploaderConfig): - pass - - -@dataclass -class BoxUploader(FsspecUploader): - connector_type: str = CONNECTOR_TYPE - connection_config: BoxConnectionConfig - upload_config: BoxUploaderConfig = field(default=None) - - @requires_dependencies(["boxfs"], extras="box") - def __post_init__(self): - super().__post_init__() - - @requires_dependencies(["boxfs"], extras="box") - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - return super().run(contents=contents, **kwargs) - - @requires_dependencies(["boxfs"], extras="box") - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - return await super().run_async(path=path, file_data=file_data, **kwargs) - - -box_source_entry = SourceRegistryEntry( - indexer=BoxIndexer, - indexer_config=BoxIndexerConfig, - downloader=BoxDownloader, - downloader_config=BoxDownloaderConfig, - connection_config=BoxConnectionConfig, -) - -box_destination_entry = DestinationRegistryEntry( - uploader=BoxUploader, - uploader_config=BoxUploaderConfig, - connection_config=BoxConnectionConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/dropbox.py b/unstructured/ingest/v2/processes/connectors/fsspec/dropbox.py deleted file mode 100644 index 96dc3ba71..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/dropbox.py +++ /dev/null @@ -1,130 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Generator, Optional - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.v2.interfaces import DownloadResponse, FileData, UploadContent -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( - FsspecAccessConfig, - FsspecConnectionConfig, - FsspecDownloader, - FsspecDownloaderConfig, - FsspecIndexer, - FsspecIndexerConfig, - FsspecUploader, - FsspecUploaderConfig, -) -from unstructured.ingest.v2.processes.connectors.fsspec.utils import 
sterilize_dict -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "dropbox" - - -@dataclass -class DropboxIndexerConfig(FsspecIndexerConfig): - pass - - -@dataclass -class DropboxAccessConfig(FsspecAccessConfig): - token: Optional[str] = None - - -@dataclass -class DropboxConnectionConfig(FsspecConnectionConfig): - supported_protocols: list[str] = field(default_factory=lambda: ["dropbox"]) - access_config: DropboxAccessConfig = enhanced_field( - sensitive=True, default_factory=lambda: DropboxAccessConfig() - ) - connector_type: str = CONNECTOR_TYPE - - -@dataclass -class DropboxIndexer(FsspecIndexer): - connection_config: DropboxConnectionConfig - index_config: DropboxIndexerConfig - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - def __post_init__(self): - # dropbox expects the path to start with a / - if not self.index_config.path_without_protocol.startswith("/"): - self.index_config.path_without_protocol = "/" + self.index_config.path_without_protocol - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - return super().run(**kwargs) - - def sterilize_info(self, path) -> dict: - # the fs.info method defined in the dropboxdrivefs library expects a "url" - # kwarg rather than "path"; though both refer to the same thing - info = self.fs.info(url=path) - return sterilize_dict(data=info) - - -@dataclass -class DropboxDownloaderConfig(FsspecDownloaderConfig): - pass - - -@dataclass -class DropboxDownloader(FsspecDownloader): - protocol: str = "dropbox" - connection_config: DropboxConnectionConfig - connector_type: str = CONNECTOR_TYPE - download_config: Optional[DropboxDownloaderConfig] = field( - default_factory=DropboxDownloaderConfig - ) - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return super().run(file_data=file_data, **kwargs) - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return await super().run_async(file_data=file_data, **kwargs) - - -@dataclass -class DropboxUploaderConfig(FsspecUploaderConfig): - pass - - -@dataclass -class DropboxUploader(FsspecUploader): - connector_type: str = CONNECTOR_TYPE - connection_config: DropboxConnectionConfig - upload_config: DropboxUploaderConfig = field(default=None) - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - def __post_init__(self): - super().__post_init__() - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - return super().run(contents=contents, **kwargs) - - @requires_dependencies(["dropboxdrivefs", "fsspec"], extras="dropbox") - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - return await super().run_async(path=path, file_data=file_data, **kwargs) - - -dropbox_source_entry = SourceRegistryEntry( - indexer=DropboxIndexer, - indexer_config=DropboxIndexerConfig, - downloader=DropboxDownloader, - downloader_config=DropboxDownloaderConfig, - connection_config=DropboxConnectionConfig, -) - -dropbox_destination_entry = DestinationRegistryEntry( - uploader=DropboxUploader, - uploader_config=DropboxUploaderConfig, - connection_config=DropboxConnectionConfig, -) diff --git 
a/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py b/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py deleted file mode 100644 index 2adfa99b0..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py +++ /dev/null @@ -1,344 +0,0 @@ -from __future__ import annotations - -import contextlib -import fnmatch -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path -from time import time -from typing import TYPE_CHECKING, Any, Generator, Optional, TypeVar - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - Downloader, - DownloaderConfig, - DownloadResponse, - FileData, - Indexer, - IndexerConfig, - SourceIdentifiers, - UploadContent, - Uploader, - UploaderConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connectors.fsspec.utils import sterilize_dict - -if TYPE_CHECKING: - from fsspec import AbstractFileSystem - -CONNECTOR_TYPE = "fsspec" - - -class Base(object): - def __post_init__(self): - pass - - -@dataclass -class FileConfig(Base): - remote_url: str - protocol: str = field(init=False) - path_without_protocol: str = field(init=False) - supported_protocols: list[str] = field( - default_factory=lambda: [ - "s3", - "s3a", - "abfs", - "az", - "gs", - "gcs", - "box", - "dropbox", - "sftp", - ] - ) - - def __post_init__(self): - super().__post_init__() - self.protocol, self.path_without_protocol = self.remote_url.split("://") - if self.protocol not in self.supported_protocols: - raise ValueError( - "Protocol {} not supported yet, only {} are supported.".format( - self.protocol, ", ".join(self.supported_protocols) - ), - ) - - -@dataclass -class FsspecIndexerConfig(FileConfig, IndexerConfig): - recursive: bool = False - file_glob: Optional[list[str]] = None - - -@dataclass -class FsspecAccessConfig(AccessConfig): - pass - - -FsspecAccessConfigT = TypeVar("FsspecAccessConfigT", bound=FsspecAccessConfig) - - -@dataclass -class FsspecConnectionConfig(ConnectionConfig): - access_config: FsspecAccessConfigT = enhanced_field(sensitive=True, default=None) - connector_type: str = CONNECTOR_TYPE - - -FsspecIndexerConfigT = TypeVar("FsspecIndexerConfigT", bound=FsspecIndexerConfig) -FsspecConnectionConfigT = TypeVar("FsspecConnectionConfigT", bound=FsspecConnectionConfig) - - -@dataclass -class FsspecIndexer(Indexer): - connection_config: FsspecConnectionConfigT - index_config: FsspecIndexerConfigT - connector_type: str = CONNECTOR_TYPE - - @property - def fs(self) -> "AbstractFileSystem": - from fsspec import get_filesystem_class - - return get_filesystem_class(self.index_config.protocol)( - **self.connection_config.get_access_config(), - ) - - def does_path_match_glob(self, path: str) -> bool: - if self.index_config.file_glob is None: - return True - patterns = self.index_config.file_glob - for pattern in patterns: - if fnmatch.filter([path], pattern): - return True - logger.debug(f"The file {path!r} is discarded as it does not match any given glob.") - return False - - def check_connection(self): - from fsspec import get_filesystem_class - - try: - fs = get_filesystem_class(self.index_config.protocol)( - **self.connection_config.get_access_config(), - ) - fs.ls(path=self.index_config.path_without_protocol, 
detail=False) - except Exception as e: - logger.error(f"failed to validate connection: {e}", exc_info=True) - raise SourceConnectionError(f"failed to validate connection: {e}") - - def list_files(self) -> list[str]: - if not self.index_config.recursive: - # fs.ls does not walk directories - # directories that are listed in cloud storage can cause problems - # because they are seen as 0 byte files - found = self.fs.ls(self.index_config.path_without_protocol, detail=True) - if isinstance(found, list): - return [ - x.get("name") for x in found if x.get("size") > 0 and x.get("type") == "file" - ] - else: - raise TypeError(f"unhandled response type from ls: {type(found)}") - else: - # fs.find will recursively walk directories - # "size" is a common key for all the cloud protocols with fs - found = self.fs.find( - self.index_config.path_without_protocol, - detail=True, - ) - if isinstance(found, dict): - return [ - k for k, v in found.items() if v.get("size") > 0 and v.get("type") == "file" - ] - else: - raise TypeError(f"unhandled response type from find: {type(found)}") - - def get_metadata(self, path: str) -> DataSourceMetadata: - date_created = None - date_modified = None - - try: - created: Optional[Any] = self.fs.created(path) - if created: - if isinstance(created, datetime): - date_created = str(created.timestamp()) - else: - date_created = str(created) - except NotImplementedError: - pass - - try: - modified: Optional[Any] = self.fs.modified(path) - if modified: - if isinstance(modified, datetime): - date_modified = str(modified.timestamp()) - else: - date_modified = str(modified) - except NotImplementedError: - pass - - version = self.fs.checksum(path) - metadata: dict[str, str] = {} - with contextlib.suppress(AttributeError): - metadata = self.fs.metadata(path) - record_locator = { - "protocol": self.index_config.protocol, - "remote_file_path": self.index_config.remote_url, - } - file_stat = self.fs.stat(path=path) - if file_id := file_stat.get("id"): - record_locator["file_id"] = file_id - if metadata: - record_locator["metadata"] = metadata - return DataSourceMetadata( - date_created=date_created, - date_modified=date_modified, - date_processed=str(time()), - version=str(version), - url=f"{self.index_config.protocol}://{path}", - record_locator=record_locator, - ) - - def sterilize_info(self, path) -> dict: - info = self.fs.info(path=path) - return sterilize_dict(data=info) - - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - raw_files = self.list_files() - files = [f for f in raw_files if self.does_path_match_glob(f)] - for file in files: - # Note: we remove any remaining leading slashes (Box introduces these) - # to get a valid relative path - rel_path = file.replace(self.index_config.path_without_protocol, "").lstrip("/") - yield FileData( - identifier=file, - connector_type=self.connector_type, - source_identifiers=SourceIdentifiers( - filename=Path(file).name, - rel_path=rel_path or None, - fullpath=file, - ), - metadata=self.get_metadata(path=file), - additional_metadata=self.sterilize_info(path=file), - ) - - -@dataclass -class FsspecDownloaderConfig(DownloaderConfig): - pass - - -FsspecDownloaderConfigT = TypeVar("FsspecDownloaderConfigT", bound=FsspecDownloaderConfig) - - -@dataclass -class FsspecDownloader(Downloader): - protocol: str - connection_config: FsspecConnectionConfigT - connector_type: str = CONNECTOR_TYPE - download_config: Optional[FsspecDownloaderConfigT] = field( - default_factory=lambda: FsspecDownloaderConfig() - ) - - def is_async(self) 
-> bool: - return self.fs.async_impl - - @property - def fs(self) -> "AbstractFileSystem": - from fsspec import get_filesystem_class - - return get_filesystem_class(self.protocol)( - **self.connection_config.get_access_config(), - ) - - def get_download_path(self, file_data: FileData) -> Path: - return ( - self.download_dir / Path(file_data.source_identifiers.relative_path) - if self.download_config - else Path(file_data.source_identifiers.rel_path) - ) - - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - download_path = self.get_download_path(file_data=file_data) - download_path.parent.mkdir(parents=True, exist_ok=True) - try: - self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix()) - except Exception as e: - logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True) - raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}") - return self.generate_download_response(file_data=file_data, download_path=download_path) - - async def async_run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - download_path = self.get_download_path(file_data=file_data) - download_path.parent.mkdir(parents=True, exist_ok=True) - try: - await self.fs.get(rpath=file_data.identifier, lpath=download_path.as_posix()) - except Exception as e: - logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True) - raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}") - return self.generate_download_response(file_data=file_data, download_path=download_path) - - -@dataclass -class FsspecUploaderConfig(FileConfig, UploaderConfig): - overwrite: bool = False - - -FsspecUploaderConfigT = TypeVar("FsspecUploaderConfigT", bound=FsspecUploaderConfig) - - -@dataclass -class FsspecUploader(Uploader): - connector_type: str = CONNECTOR_TYPE - upload_config: FsspecUploaderConfigT = field(default=None) - - @property - def fs(self) -> "AbstractFileSystem": - from fsspec import get_filesystem_class - - fs_kwargs = self.connection_config.get_access_config() if self.connection_config else {} - return get_filesystem_class(self.upload_config.protocol)( - **fs_kwargs, - ) - - def __post_init__(self): - # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove: - if not self.upload_config: - raise TypeError( - f"{self.__class__.__name__}.__init__() " - f"missing 1 required positional argument: 'upload_config'" - ) - - def get_upload_path(self, file_data: FileData) -> Path: - upload_path = ( - Path(self.upload_config.path_without_protocol) - / file_data.source_identifiers.relative_path - ) - updated_upload_path = upload_path.parent / f"{upload_path.name}.json" - return updated_upload_path - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - for content in contents: - self._run(path=content.path, file_data=content.file_data) - - def _run(self, path: Path, file_data: FileData) -> None: - path_str = str(path.resolve()) - upload_path = self.get_upload_path(file_data=file_data) - if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite: - logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists") - return - logger.debug(f"Writing local file {path_str} to {upload_path}") - self.fs.upload(lpath=path_str, rpath=str(upload_path)) - - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - upload_path = self.get_upload_path(file_data=file_data) - path_str = str(path.resolve()) 
- # Odd that fsspec doesn't run exists() as async even when client support async - already_exists = self.fs.exists(path=str(upload_path)) - if already_exists and not self.upload_config.overwrite: - logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists") - return - logger.debug(f"Writing local file {path_str} to {upload_path}") - self.fs.upload(lpath=path_str, rpath=str(upload_path)) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/gcs.py b/unstructured/ingest/v2/processes/connectors/fsspec/gcs.py deleted file mode 100644 index 2c51f1c12..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/gcs.py +++ /dev/null @@ -1,141 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Generator, Optional, Union - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.utils.string_and_date_utils import json_to_dict -from unstructured.ingest.v2.interfaces import DownloadResponse, FileData, UploadContent -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( - FsspecAccessConfig, - FsspecConnectionConfig, - FsspecDownloader, - FsspecDownloaderConfig, - FsspecIndexer, - FsspecIndexerConfig, - FsspecUploader, - FsspecUploaderConfig, -) -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "gcs" - - -@dataclass -class GcsIndexerConfig(FsspecIndexerConfig): - pass - - -@dataclass -class GcsAccessConfig(FsspecAccessConfig): - service_account_key: Optional[str] = None - token: Union[str, dict, None] = field(init=False, default=None) - - def __post_init__(self): - ALLOWED_AUTH_VALUES = "google_default", "cache", "anon", "browser", "cloud" - - # Case: null value - if not self.service_account_key: - return - - # Case: one of auth constants - if self.service_account_key in ALLOWED_AUTH_VALUES: - self.token = self.service_account_key - return - - # Case: token as json - if isinstance(json_to_dict(self.service_account_key), dict): - self.token = json_to_dict(self.service_account_key) - return - - # Case: path to token - if Path(self.service_account_key).is_file(): - self.token = self.service_account_key - return - - raise ValueError("Invalid auth token value") - - -@dataclass -class GcsConnectionConfig(FsspecConnectionConfig): - supported_protocols: list[str] = field(default_factory=lambda: ["gs", "gcs"]) - access_config: GcsAccessConfig = enhanced_field( - sensitive=True, default_factory=lambda: GcsAccessConfig() - ) - connector_type: str = CONNECTOR_TYPE - - -@dataclass -class GcsIndexer(FsspecIndexer): - connection_config: GcsConnectionConfig - index_config: GcsIndexerConfig - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - return super().run(**kwargs) - - -@dataclass -class GcsDownloaderConfig(FsspecDownloaderConfig): - pass - - -@dataclass -class GcsDownloader(FsspecDownloader): - protocol: str = "gcs" - connection_config: GcsConnectionConfig - connector_type: str = CONNECTOR_TYPE - download_config: Optional[GcsDownloaderConfig] = field(default_factory=GcsDownloaderConfig) - - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return super().run(file_data=file_data, **kwargs) - - 
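
    The GcsAccessConfig removed above accepts a single credential field and decides at init time whether it is an fsspec auth constant, inline service-account JSON, or a path to a key file. A self-contained sketch of that resolution logic (illustrative only; the helper name resolve_gcs_token is hypothetical, and a try/except stands in for the project's json_to_dict):

        import json
        from pathlib import Path
        from typing import Optional, Union

        AUTH_CONSTANTS = ("google_default", "cache", "anon", "browser", "cloud")

        def resolve_gcs_token(service_account_key: Optional[str]) -> Union[str, dict, None]:
            if not service_account_key:
                return None  # fall back to ambient credentials
            if service_account_key in AUTH_CONSTANTS:
                return service_account_key  # gcsfs understands these directly
            try:
                parsed = json.loads(service_account_key)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed  # inline service-account JSON
            if Path(service_account_key).is_file():
                return service_account_key  # path to a key file
            raise ValueError("Invalid auth token value")
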
@requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return await super().run_async(file_data=file_data, **kwargs) - - -@dataclass -class GcsUploaderConfig(FsspecUploaderConfig): - pass - - -@dataclass -class GcsUploader(FsspecUploader): - connector_type: str = CONNECTOR_TYPE - connection_config: GcsConnectionConfig - upload_config: GcsUploaderConfig = field(default=None) - - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - def __post_init__(self): - super().__post_init__() - - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - return super().run(contents=contents, **kwargs) - - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - return await super().run_async(path=path, file_data=file_data, **kwargs) - - -gcs_source_entry = SourceRegistryEntry( - indexer=GcsIndexer, - indexer_config=GcsIndexerConfig, - downloader=GcsDownloader, - downloader_config=GcsDownloaderConfig, - connection_config=GcsConnectionConfig, -) - -gcs_destination_entry = DestinationRegistryEntry( - uploader=GcsUploader, - uploader_config=GcsUploaderConfig, - connection_config=GcsConnectionConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/s3.py b/unstructured/ingest/v2/processes/connectors/fsspec/s3.py deleted file mode 100644 index 7f48bdc81..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/s3.py +++ /dev/null @@ -1,163 +0,0 @@ -import contextlib -from dataclasses import dataclass, field -from datetime import datetime -from pathlib import Path -from time import time -from typing import Any, Generator, Optional - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.v2.interfaces import DownloadResponse, FileData, UploadContent -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( - FsspecAccessConfig, - FsspecConnectionConfig, - FsspecDownloader, - FsspecDownloaderConfig, - FsspecIndexer, - FsspecIndexerConfig, - FsspecUploader, - FsspecUploaderConfig, -) -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "s3" - - -@dataclass -class S3IndexerConfig(FsspecIndexerConfig): - pass - - -@dataclass -class S3AccessConfig(FsspecAccessConfig): - key: Optional[str] = None - secret: Optional[str] = None - token: Optional[str] = None - - -@dataclass -class S3ConnectionConfig(FsspecConnectionConfig): - supported_protocols: list[str] = field(default_factory=lambda: ["s3", "s3a"]) - access_config: S3AccessConfig = enhanced_field( - sensitive=True, default_factory=lambda: S3AccessConfig() - ) - endpoint_url: Optional[str] = None - anonymous: bool = False - connector_type: str = CONNECTOR_TYPE - - def get_access_config(self) -> dict[str, Any]: - access_configs: dict[str, Any] = {"anon": self.anonymous} - if self.endpoint_url: - access_configs["endpoint_url"] = self.endpoint_url - - # Avoid injecting None by filtering out k,v pairs where the value is None - access_configs.update({k: v for k, v in self.access_config.to_dict().items() if v}) - return access_configs - - -@dataclass -class S3Indexer(FsspecIndexer): - connection_config: S3ConnectionConfig - index_config: 
S3IndexerConfig - connector_type: str = CONNECTOR_TYPE - - def get_metadata(self, path: str) -> DataSourceMetadata: - date_created = None - date_modified = None - try: - modified: Optional[datetime] = self.fs.modified(path) - if modified: - date_created = str(modified.timestamp()) - date_modified = str(modified.timestamp()) - except NotImplementedError: - pass - - version = None - info: dict[str, Any] = self.fs.info(path) - if etag := info.get("ETag"): - version = str(etag).rstrip('"').lstrip('"') - metadata: dict[str, str] = {} - with contextlib.suppress(AttributeError): - metadata = self.fs.metadata(path) - record_locator = { - "protocol": self.index_config.protocol, - "remote_file_path": self.index_config.remote_url, - } - if metadata: - record_locator["metadata"] = metadata - return DataSourceMetadata( - date_created=date_created, - date_modified=date_modified, - date_processed=str(time()), - version=version, - url=f"{self.index_config.protocol}://{path}", - record_locator=record_locator, - ) - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - return super().run(**kwargs) - - -@dataclass -class S3DownloaderConfig(FsspecDownloaderConfig): - pass - - -@dataclass -class S3Downloader(FsspecDownloader): - protocol: str = "s3" - connection_config: S3ConnectionConfig - connector_type: str = CONNECTOR_TYPE - download_config: Optional[S3DownloaderConfig] = field(default_factory=S3DownloaderConfig) - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return super().run(file_data=file_data, **kwargs) - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return await super().run_async(file_data=file_data, **kwargs) - - -@dataclass -class S3UploaderConfig(FsspecUploaderConfig): - pass - - -@dataclass -class S3Uploader(FsspecUploader): - connector_type: str = CONNECTOR_TYPE - connection_config: S3ConnectionConfig - upload_config: S3UploaderConfig = field(default=None) - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - def __post_init__(self): - super().__post_init__() - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - return super().run(contents=contents, **kwargs) - - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - return await super().run_async(path=path, file_data=file_data, **kwargs) - - -s3_source_entry = SourceRegistryEntry( - indexer=S3Indexer, - indexer_config=S3IndexerConfig, - downloader=S3Downloader, - downloader_config=S3DownloaderConfig, - connection_config=S3ConnectionConfig, -) - -s3_destination_entry = DestinationRegistryEntry( - uploader=S3Uploader, - uploader_config=S3UploaderConfig, - connection_config=S3ConnectionConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/sftp.py b/unstructured/ingest/v2/processes/connectors/fsspec/sftp.py deleted file mode 100644 index d73a22195..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/sftp.py +++ /dev/null @@ -1,166 +0,0 @@ -from __future__ import annotations - -import os -from dataclasses import dataclass, field -from pathlib import Path -from typing import Any, Generator, Optional -from urllib.parse import urlparse - -from unstructured.ingest.enhanced_dataclass import 
enhanced_field -from unstructured.ingest.v2.interfaces import DownloadResponse, FileData, UploadContent -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.ingest.v2.processes.connectors.fsspec.fsspec import ( - FsspecAccessConfig, - FsspecConnectionConfig, - FsspecDownloader, - FsspecDownloaderConfig, - FsspecIndexer, - FsspecIndexerConfig, - FsspecUploader, - FsspecUploaderConfig, -) -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "sftp" - - -@dataclass -class SftpIndexerConfig(FsspecIndexerConfig): - def __post_init__(self): - super().__post_init__() - _, ext = os.path.splitext(self.remote_url) - parsed_url = urlparse(self.remote_url) - if ext: - self.path_without_protocol = Path(parsed_url.path).parent.as_posix().lstrip("/") - else: - self.path_without_protocol = parsed_url.path.lstrip("/") - - -@dataclass -class SftpAccessConfig(FsspecAccessConfig): - password: str - - -@dataclass -class SftpConnectionConfig(FsspecConnectionConfig): - supported_protocols: list[str] = field(default_factory=lambda: ["sftp"]) - access_config: SftpAccessConfig = enhanced_field(sensitive=True) - connector_type: str = CONNECTOR_TYPE - username: Optional[str] = None - host: Optional[str] = None - port: int = 22 - look_for_keys: bool = False - allow_agent: bool = False - - def get_access_config(self) -> dict[str, Any]: - access_config = { - "username": self.username, - "host": self.host, - "port": self.port, - "look_for_keys": self.look_for_keys, - "allow_agent": self.allow_agent, - "password": self.access_config.password, - } - return access_config - - -@dataclass -class SftpIndexer(FsspecIndexer): - connection_config: SftpConnectionConfig - index_config: SftpIndexerConfig - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def __post_init__(self): - parsed_url = urlparse(self.index_config.remote_url) - self.connection_config.host = parsed_url.hostname or self.connection_config.host - self.connection_config.port = parsed_url.port or self.connection_config.port - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - for file in super().run(**kwargs): - new_identifier = ( - f"sftp://" - f"{self.connection_config.host}:" - f"{self.connection_config.port}/" - f"{file.identifier}" - ) - file.identifier = new_identifier - yield file - - -@dataclass -class SftpDownloaderConfig(FsspecDownloaderConfig): - remote_url: Optional[str] = None - - def __post_init__(self): - # TODO once python3.9 no longer supported and kw_only is allowed in dataclasses, remove: - if not self.remote_url: - raise TypeError( - f"{self.__class__.__name__}.__init__() " - f"missing 1 required positional argument: 'remote_url'" - ) - - -@dataclass -class SftpDownloader(FsspecDownloader): - protocol: str = "sftp" - connection_config: SftpConnectionConfig - connector_type: str = CONNECTOR_TYPE - download_config: Optional[SftpDownloaderConfig] = field(default_factory=SftpDownloaderConfig) - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def __post_init__(self): - parsed_url = urlparse(self.download_config.remote_url) - self.connection_config.host = parsed_url.hostname or self.connection_config.host - self.connection_config.port = parsed_url.port or self.connection_config.port - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def run(self, file_data: FileData, **kwargs: 
Any) -> DownloadResponse: - return super().run(file_data=file_data, **kwargs) - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return await super().run_async(file_data=file_data, **kwargs) - - -@dataclass -class SftpUploaderConfig(FsspecUploaderConfig): - pass - - -@dataclass -class SftpUploader(FsspecUploader): - connector_type: str = CONNECTOR_TYPE - connection_config: SftpConnectionConfig - upload_config: SftpUploaderConfig = field(default=None) - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def __post_init__(self): - super().__post_init__() - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - return super().run(contents=contents, **kwargs) - - @requires_dependencies(["paramiko", "fsspec"], extras="sftp") - async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None: - return await super().run_async(path=path, file_data=file_data, **kwargs) - - -sftp_source_entry = SourceRegistryEntry( - indexer=SftpIndexer, - indexer_config=SftpIndexerConfig, - downloader=SftpDownloader, - downloader_config=SftpDownloaderConfig, - connection_config=SftpConnectionConfig, -) - -sftp_destination_entry = DestinationRegistryEntry( - uploader=SftpUploader, - uploader_config=SftpUploaderConfig, - connection_config=SftpConnectionConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/fsspec/utils.py b/unstructured/ingest/v2/processes/connectors/fsspec/utils.py deleted file mode 100644 index e852e21dd..000000000 --- a/unstructured/ingest/v2/processes/connectors/fsspec/utils.py +++ /dev/null @@ -1,17 +0,0 @@ -import json -from datetime import datetime -from pathlib import Path -from typing import Callable - - -def json_serial(obj): - if isinstance(obj, Path): - return obj.as_posix() - if isinstance(obj, datetime): - return obj.isoformat() - raise TypeError("Type %s not serializable" % type(obj)) - - -def sterilize_dict(data: dict, default: Callable = json_serial) -> dict: - data_s = json.dumps(data, default=default) - return json.loads(data_s) diff --git a/unstructured/ingest/v2/processes/connectors/google_drive.py b/unstructured/ingest/v2/processes/connectors/google_drive.py deleted file mode 100644 index 8d61671cf..000000000 --- a/unstructured/ingest/v2/processes/connectors/google_drive.py +++ /dev/null @@ -1,335 +0,0 @@ -import io -import os -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING, Any, Generator, Optional, Union - -from dateutil import parser - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionNetworkError -from unstructured.ingest.utils.string_and_date_utils import json_to_dict -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - Downloader, - DownloaderConfig, - FileData, - Indexer, - IndexerConfig, - SourceIdentifiers, - download_responses, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - SourceRegistryEntry, -) -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "google_drive" - -if TYPE_CHECKING: - from googleapiclient.discovery import Resource as GoogleAPIResource 
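
    The fsspec/utils.py helpers removed above exist because raw filesystem info dicts can carry values (Path, datetime, provider-specific objects) that are not JSON-serializable, yet they get attached to FileData.additional_metadata, which must serialize cleanly. A self-contained sketch of the same round-trip trick, with an inline stand-in for json_serial:

        import json
        from datetime import datetime, timezone
        from pathlib import Path

        def _default(obj):
            # Same idea as the removed json_serial: coerce non-JSON types to strings.
            if isinstance(obj, Path):
                return obj.as_posix()
            if isinstance(obj, datetime):
                return obj.isoformat()
            raise TypeError(f"Type {type(obj)} not serializable")

        info = {
            "name": Path("bucket/prefix/report.pdf"),
            "size": 1024,
            "LastModified": datetime(2024, 5, 1, tzinfo=timezone.utc),
        }
        clean = json.loads(json.dumps(info, default=_default))
        # -> {'name': 'bucket/prefix/report.pdf', 'size': 1024,
        #     'LastModified': '2024-05-01T00:00:00+00:00'}
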
- from googleapiclient.http import MediaIoBaseDownload - - -@dataclass -class GoogleDriveAccessConfig(AccessConfig): - service_account_key: Union[str, dict] - - -@dataclass -class GoogleDriveConnectionConfig(ConnectionConfig): - drive_id: str - access_config: GoogleDriveAccessConfig = enhanced_field(sensitive=True) - - @requires_dependencies(["googleapiclient"], extras="google-drive") - def get_files_service(self) -> "GoogleAPIResource": - from google.auth import default, exceptions - from google.oauth2 import service_account - from googleapiclient.discovery import build - from googleapiclient.errors import HttpError - - # Service account key can be a dict or a file path(str) - # But the dict may come in as a string - if isinstance(self.access_config.service_account_key, str): - key_path = json_to_dict(self.access_config.service_account_key) - elif isinstance(self.access_config.service_account_key, dict): - key_path = self.access_config.service_account_key - else: - raise TypeError( - f"access_config.service_account_key must be " - f"str or dict, got: {type(self.access_config.service_account_key)}" - ) - - try: - if isinstance(key_path, dict): - creds = service_account.Credentials.from_service_account_info(key_path) - elif isinstance(key_path, str): - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path - creds, _ = default() - else: - raise ValueError( - f"key path not recognized as a dictionary or a file path: " - f"[{type(key_path)}] {key_path}", - ) - service = build("drive", "v3", credentials=creds) - return service.files() - - except HttpError as exc: - raise ValueError(f"{exc.reason}") - except exceptions.DefaultCredentialsError: - raise ValueError("The provided API key is invalid.") - - -@dataclass -class GoogleDriveIndexerConfig(IndexerConfig): - extensions: Optional[list[str]] = None - recursive: bool = False - - def __post_init__(self): - # Strip leading period of extension - if self.extensions is not None: - self.extensions = [e[1:] if e.startswith(".") else e for e in self.extensions] - - -@dataclass -class GoogleDriveIndexer(Indexer): - connection_config: GoogleDriveConnectionConfig - index_config: GoogleDriveIndexerConfig - fields: list[str] = field( - default_factory=lambda: [ - "id", - "name", - "mimeType", - "fileExtension", - "md5Checksum", - "sha1Checksum", - "sha256Checksum", - "headRevisionId", - "permissions", - "createdTime", - "modifiedTime", - "version", - "originalFilename", - "capabilities", - "permissionIds", - "webViewLink", - "webContentLink", - ] - ) - - @staticmethod - def is_dir(record: dict) -> bool: - return record.get("mimeType") == "application/vnd.google-apps.folder" - - @staticmethod - def map_file_data(f: dict) -> FileData: - file_id = f["id"] - filename = f.pop("name") - url = f.pop("webContentLink", None) - version = f.pop("version", None) - permissions = f.pop("permissions", None) - date_created_str = f.pop("createdTime", None) - date_created_dt = parser.parse(date_created_str) if date_created_str else None - date_modified_str = f.pop("modifiedTime", None) - parent_path = f.pop("parent_path", None) - parent_root_path = f.pop("parent_root_path", None) - date_modified_dt = parser.parse(date_modified_str) if date_modified_str else None - if ( - parent_path - and isinstance(parent_path, str) - and parent_root_path - and isinstance(parent_root_path, str) - ): - fullpath = f"{parent_path}/{filename}" - rel_path = fullpath.replace(parent_root_path, "") - source_identifiers = SourceIdentifiers( - filename=filename, fullpath=fullpath, rel_path=rel_path 
- ) - else: - source_identifiers = SourceIdentifiers(fullpath=filename, filename=filename) - return FileData( - connector_type=CONNECTOR_TYPE, - identifier=file_id, - source_identifiers=source_identifiers, - metadata=DataSourceMetadata( - url=url, - version=version, - date_created=str(date_created_dt.timestamp()), - date_modified=str(date_modified_dt.timestamp()), - permissions_data=permissions, - record_locator={ - "file_id": file_id, - }, - ), - additional_metadata=f, - ) - - def get_paginated_results( - self, - files_client, - object_id: str, - extensions: Optional[list[str]] = None, - recursive: bool = False, - previous_path: Optional[str] = None, - ) -> list[dict]: - - fields_input = "nextPageToken, files({})".format(",".join(self.fields)) - q = f"'{object_id}' in parents" - # Filter by extension but still include any directories - if extensions: - ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions]) - q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')" - logger.debug(f"Query used when indexing: {q}") - logger.debug("response fields limited to: {}".format(", ".join(self.fields))) - done = False - page_token = None - files_response = [] - while not done: - response: dict = files_client.list( - spaces="drive", - fields=fields_input, - corpora="user", - pageToken=page_token, - q=q, - ).execute() - if files := response.get("files", []): - fs = [f for f in files if not self.is_dir(record=f)] - for r in fs: - r["parent_path"] = previous_path - dirs = [f for f in files if self.is_dir(record=f)] - files_response.extend(fs) - if recursive: - for d in dirs: - dir_id = d["id"] - dir_name = d["name"] - files_response.extend( - self.get_paginated_results( - files_client=files_client, - object_id=dir_id, - extensions=extensions, - recursive=recursive, - previous_path=f"{previous_path}/{dir_name}", - ) - ) - page_token = response.get("nextPageToken") - if page_token is None: - done = True - for r in files_response: - r["parent_root_path"] = previous_path - return files_response - - def get_root_info(self, files_client, object_id: str) -> dict: - return files_client.get(fileId=object_id, fields=",".join(self.fields)).execute() - - def get_files( - self, - files_client, - object_id: str, - recursive: bool = False, - extensions: Optional[list[str]] = None, - ) -> list[FileData]: - root_info = self.get_root_info(files_client=files_client, object_id=object_id) - if not self.is_dir(root_info): - data = [self.map_file_data(root_info)] - else: - - file_contents = self.get_paginated_results( - files_client=files_client, - object_id=object_id, - extensions=extensions, - recursive=recursive, - previous_path=root_info["name"], - ) - data = [self.map_file_data(f=f) for f in file_contents] - for d in data: - d.metadata.record_locator["drive_id"]: object_id - return data - - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - for f in self.get_files( - files_client=self.connection_config.get_files_service(), - object_id=self.connection_config.drive_id, - recursive=self.index_config.recursive, - extensions=self.index_config.extensions, - ): - yield f - - -@dataclass -class GoogleDriveDownloaderConfig(DownloaderConfig): - pass - - -@dataclass -class GoogleDriveDownloader(Downloader): - connection_config: GoogleDriveConnectionConfig - download_config: GoogleDriveDownloaderConfig = field( - default_factory=lambda: GoogleDriveDownloaderConfig() - ) - connector_type: str = CONNECTOR_TYPE - - def get_download_path(self, file_data: FileData) -> Path: - 
rel_path = file_data.source_identifiers.relative_path - rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path - return self.download_dir / Path(rel_path) - - @SourceConnectionNetworkError.wrap - def _get_content(self, downloader: "MediaIoBaseDownload") -> bool: - downloaded = False - while downloaded is False: - _, downloaded = downloader.next_chunk() - return downloaded - - def _write_file(self, file_data: FileData, file_contents: io.BytesIO): - download_path = self.get_download_path(file_data=file_data) - download_path.parent.mkdir(parents=True, exist_ok=True) - logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}") - with open(download_path, "wb") as handler: - handler.write(file_contents.getbuffer()) - return self.generate_download_response(file_data=file_data, download_path=download_path) - - @requires_dependencies(["googleapiclient"], extras="google-drive") - def run(self, file_data: FileData, **kwargs: Any) -> download_responses: - from googleapiclient.http import MediaIoBaseDownload - - logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}") - mime_type = file_data.additional_metadata["mimeType"] - record_id = file_data.identifier - files_client = self.connection_config.get_files_service() - if mime_type.startswith("application/vnd.google-apps"): - export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get( - self.meta.get("mimeType"), # type: ignore - ) - if not export_mime: - raise TypeError( - f"File not supported. Name: {file_data.source_identifiers.filename} " - f"ID: {record_id} " - f"MimeType: {mime_type}" - ) - - request = files_client.export_media( - fileId=record_id, - mimeType=export_mime, - ) - else: - request = files_client.get_media(fileId=record_id) - - file_contents = io.BytesIO() - downloader = MediaIoBaseDownload(file_contents, request) - downloaded = self._get_content(downloader=downloader) - if not downloaded or not file_contents: - return [] - return self._write_file(file_data=file_data, file_contents=file_contents) - - -google_drive_source_entry = SourceRegistryEntry( - connection_config=GoogleDriveConnectionConfig, - indexer_config=GoogleDriveIndexerConfig, - indexer=GoogleDriveIndexer, - downloader_config=GoogleDriveDownloaderConfig, - downloader=GoogleDriveDownloader, -) diff --git a/unstructured/ingest/v2/processes/connectors/local.py b/unstructured/ingest/v2/processes/connectors/local.py deleted file mode 100644 index 811606d79..000000000 --- a/unstructured/ingest/v2/processes/connectors/local.py +++ /dev/null @@ -1,203 +0,0 @@ -import glob -import itertools -import shutil -from dataclasses import dataclass, field -from pathlib import Path -from time import time -from typing import Any, Generator, Optional - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - Downloader, - DownloaderConfig, - DownloadResponse, - FileData, - Indexer, - IndexerConfig, - SourceIdentifiers, - UploadContent, - Uploader, - UploaderConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) - -CONNECTOR_TYPE = "local" - - -@dataclass -class LocalAccessConfig(AccessConfig): - pass - - -@dataclass -class LocalConnectionConfig(ConnectionConfig): - access_config: LocalAccessConfig = field(default_factory=lambda: LocalAccessConfig()) - - -@dataclass -class LocalIndexerConfig(IndexerConfig): - input_path: str - recursive: bool 
= False - file_glob: Optional[list[str]] = None - - @property - def path(self) -> Path: - return Path(self.input_path).resolve() - - -@dataclass -class LocalIndexer(Indexer): - index_config: LocalIndexerConfig - connection_config: LocalConnectionConfig = field( - default_factory=lambda: LocalConnectionConfig() - ) - connector_type: str = CONNECTOR_TYPE - - def list_files(self) -> list[Path]: - input_path = self.index_config.path - if input_path.is_file(): - return [Path(s) for s in glob.glob(f"{self.index_config.path}")] - glob_fn = input_path.rglob if self.index_config.recursive else input_path.glob - if not self.index_config.file_glob: - return list(glob_fn("*")) - return list( - itertools.chain.from_iterable( - glob_fn(pattern) for pattern in self.index_config.file_glob - ) - ) - - def get_file_metadata(self, path: Path) -> DataSourceMetadata: - stats = path.stat() - try: - date_modified = str(stats.st_mtime) - except Exception as e: - logger.warning(f"Couldn't detect date modified: {e}") - date_modified = None - - try: - date_created = str(stats.st_birthtime) - except Exception as e: - logger.warning(f"Couldn't detect date created: {e}") - date_created = None - - try: - mode = stats.st_mode - permissions_data = [{"mode": mode}] - except Exception as e: - logger.warning(f"Couldn't detect file mode: {e}") - permissions_data = None - return DataSourceMetadata( - date_modified=date_modified, - date_created=date_created, - date_processed=str(time()), - permissions_data=permissions_data, - record_locator={"path": str(path.resolve())}, - ) - - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - for file_path in self.list_files(): - file_data = FileData( - identifier=str(file_path.resolve()), - connector_type=CONNECTOR_TYPE, - source_identifiers=SourceIdentifiers( - fullpath=str(file_path.resolve()), - filename=file_path.name, - rel_path=( - str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[ - 1: - ] - if not self.index_config.path.is_file() - else self.index_config.path.name - ), - ), - metadata=self.get_file_metadata(path=file_path), - ) - yield file_data - - -@dataclass -class LocalDownloaderConfig(DownloaderConfig): - pass - - -@dataclass -class LocalDownloader(Downloader): - connector_type: str = CONNECTOR_TYPE - connection_config: LocalConnectionConfig = field( - default_factory=lambda: LocalConnectionConfig() - ) - download_config: LocalDownloaderConfig = field(default_factory=lambda: LocalDownloaderConfig()) - - def get_download_path(self, file_data: FileData) -> Path: - return Path(file_data.source_identifiers.fullpath) - - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - return DownloadResponse( - file_data=file_data, path=Path(file_data.source_identifiers.fullpath) - ) - - -@dataclass -class LocalUploaderConfig(UploaderConfig): - output_dir: str = field(default="structured-output") - - @property - def output_path(self) -> Path: - return Path(self.output_dir).resolve() - - def __post_init__(self): - if self.output_path.exists() and self.output_path.is_file(): - raise ValueError("output path already exists as a file") - - -@dataclass -class LocalUploader(Uploader): - connector_type: str = CONNECTOR_TYPE - upload_config: LocalUploaderConfig = field(default_factory=lambda: LocalUploaderConfig()) - connection_config: LocalConnectionConfig = field( - default_factory=lambda: LocalConnectionConfig() - ) - - def is_async(self) -> bool: - return False - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - 
self.upload_config.output_path.mkdir(parents=True, exist_ok=True) - for content in contents: - if source_identifiers := content.file_data.source_identifiers: - identifiers = source_identifiers - rel_path = ( - identifiers.relative_path[1:] - if identifiers.relative_path.startswith("/") - else identifiers.relative_path - ) - new_path = self.upload_config.output_path / Path(rel_path) - final_path = str(new_path).replace( - identifiers.filename, f"{identifiers.filename}.json" - ) - else: - final_path = self.upload_config.output_path / Path( - f"{content.file_data.identifier}.json" - ) - Path(final_path).parent.mkdir(parents=True, exist_ok=True) - logger.debug(f"copying file from {content.path} to {final_path}") - shutil.copy(src=str(content.path), dst=str(final_path)) - - -local_source_entry = SourceRegistryEntry( - indexer=LocalIndexer, - indexer_config=LocalIndexerConfig, - downloader=LocalDownloader, - downloader_config=LocalDownloaderConfig, - connection_config=LocalConnectionConfig, -) - -local_destination_entry = DestinationRegistryEntry( - uploader=LocalUploader, uploader_config=LocalUploaderConfig -) diff --git a/unstructured/ingest/v2/processes/connectors/mongodb.py b/unstructured/ingest/v2/processes/connectors/mongodb.py deleted file mode 100644 index f5003911c..000000000 --- a/unstructured/ingest/v2/processes/connectors/mongodb.py +++ /dev/null @@ -1,137 +0,0 @@ -import json -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional - -from unstructured.__version__ import __version__ as unstructured_version -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - FileData, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, -) -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from pymongo import MongoClient - -CONNECTOR_TYPE = "mongodb" -SERVER_API_VERSION = "1" - - -@dataclass -class MongoDBAccessConfig(AccessConfig): - uri: Optional[str] = None - - -@dataclass -class MongoDBConnectionConfig(ConnectionConfig): - access_config: MongoDBAccessConfig = enhanced_field( - sensitive=True, default_factory=MongoDBAccessConfig - ) - host: Optional[str] = None - database: Optional[str] = None - collection: Optional[str] = None - port: int = 27017 - batch_size: int = 100 - connector_type: str = CONNECTOR_TYPE - - -@dataclass -class MongoDBUploadStagerConfig(UploadStagerConfig): - pass - - -@dataclass -class MongoDBUploadStager(UploadStager): - upload_stager_config: MongoDBUploadStagerConfig = field( - default_factory=lambda: MongoDBUploadStagerConfig() - ) - - def run( - self, - elements_filepath: Path, - file_data: FileData, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - - output_path = Path(output_dir) / Path(f"{output_filename}.json") - with open(output_path, "w") as output_file: - json.dump(elements_contents, output_file) - return output_path - - -@dataclass -class MongoDBUploaderConfig(UploaderConfig): - batch_size: int = 100 - - -@dataclass -class MongoDBUploader(Uploader): - upload_config: MongoDBUploaderConfig - connection_config: 
MongoDBConnectionConfig - client: Optional["MongoClient"] = field(init=False) - connector_type: str = CONNECTOR_TYPE - - def __post_init__(self): - self.client = self.create_client() - - @requires_dependencies(["pymongo"], extras="mongodb") - def create_client(self) -> "MongoClient": - from pymongo import MongoClient - from pymongo.driver_info import DriverInfo - from pymongo.server_api import ServerApi - - if self.connection_config.access_config.uri: - return MongoClient( - self.connection_config.access_config.uri, - server_api=ServerApi(version=SERVER_API_VERSION), - driver=DriverInfo(name="unstructured", version=unstructured_version), - ) - else: - return MongoClient( - host=self.connection_config.host, - port=self.connection_config.port, - server_api=ServerApi(version=SERVER_API_VERSION), - ) - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - elements_dict = [] - for content in contents: - with open(content.path) as elements_file: - elements = json.load(elements_file) - elements_dict.extend(elements) - - logger.info( - f"writing {len(elements_dict)} objects to destination " - f"db, {self.connection_config.database}, " - f"collection {self.connection_config.collection} " - f"at {self.connection_config.host}", - ) - db = self.client[self.connection_config.database] - collection = db[self.connection_config.collection] - for chunk in batch_generator(elements_dict, self.upload_config.batch_size): - collection.insert_many(chunk) - - -mongodb_destination_entry = DestinationRegistryEntry( - connection_config=MongoDBConnectionConfig, - uploader=MongoDBUploader, - uploader_config=MongoDBUploaderConfig, - upload_stager=MongoDBUploadStager, - upload_stager_config=MongoDBUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/onedrive.py b/unstructured/ingest/v2/processes/connectors/onedrive.py deleted file mode 100644 index 4769cf626..000000000 --- a/unstructured/ingest/v2/processes/connectors/onedrive.py +++ /dev/null @@ -1,218 +0,0 @@ -from __future__ import annotations - -import json -from dataclasses import dataclass, field -from pathlib import Path -from time import time -from typing import TYPE_CHECKING, Any, Generator, Optional - -from dateutil import parser - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionError, SourceConnectionNetworkError -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - Downloader, - DownloaderConfig, - DownloadResponse, - FileData, - Indexer, - IndexerConfig, - SourceIdentifiers, - download_responses, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - SourceRegistryEntry, -) -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from office365.graph_client import GraphClient - from office365.onedrive.driveitems.driveItem import DriveItem - -CONNECTOR_TYPE = "onedrive" -MAX_MB_SIZE = 512_000_000 - - -@dataclass -class OnedriveAccessConfig(AccessConfig): - client_cred: str - - -@dataclass -class OnedriveConnectionConfig(ConnectionConfig): - client_id: str - user_pname: str - tenant: str = field(repr=False) - authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com") - access_config: OnedriveAccessConfig = enhanced_field(sensitive=True) - - @requires_dependencies(["msal"], extras="onedrive") - def get_token(self): - from msal 
import ConfidentialClientApplication - - try: - app = ConfidentialClientApplication( - authority=f"{self.authority_url}/{self.tenant}", - client_id=self.client_id, - client_credential=self.access_config.client_cred, - ) - token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - except ValueError as exc: - logger.error("Couldn't set up credentials for OneDrive") - raise exc - if "error" in token: - raise SourceConnectionNetworkError( - "failed to fetch token, {}: {}".format(token["error"], token["error_description"]) - ) - return token - - @requires_dependencies(["office365"], extras="onedrive") - def get_client(self) -> "GraphClient": - from office365.graph_client import GraphClient - - client = GraphClient(self.get_token) - return client - - -@dataclass -class OnedriveIndexerConfig(IndexerConfig): - path: Optional[str] = field(default="") - recursive: bool = False - - -@dataclass -class OnedriveIndexer(Indexer): - connection_config: OnedriveConnectionConfig - index_config: OnedriveIndexerConfig - - def list_objects(self, folder: DriveItem, recursive: bool) -> list[DriveItem]: - drive_items: list[DriveItem] = list(folder.children.get().execute_query()) - files = [d for d in drive_items if d.is_file] - if not recursive: - return files - folders = [d for d in drive_items if d.is_folder] - for f in folders: - files.extend(self.list_objects(f, recursive)) - return files - - def get_root(self, client: "GraphClient") -> "DriveItem": - root = client.users[self.connection_config.user_pname].drive.get().execute_query().root - if fpath := self.index_config.path: - root = root.get_by_path(fpath).get().execute_query() - if root is None or not root.is_folder: - raise ValueError(f"Unable to find directory, given: {fpath}") - return root - - def get_properties(self, drive_item: "DriveItem") -> dict: - properties = drive_item.properties - filtered_properties = {} - for k, v in properties.items(): - try: - json.dumps(v) - filtered_properties[k] = v - except TypeError: - pass - return filtered_properties - - def drive_item_to_file_data(self, drive_item: "DriveItem") -> FileData: - file_path = drive_item.parent_reference.path.split(":")[-1] - file_path = file_path[1:] if file_path and file_path[0] == "/" else file_path - filename = drive_item.name - server_path = file_path + "/" + filename - rel_path = server_path.replace(self.index_config.path, "").lstrip("/") - date_modified_dt = ( - parser.parse(str(drive_item.last_modified_datetime)) - if drive_item.last_modified_datetime - else None - ) - date_created_at = ( - parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None - ) - return FileData( - identifier=drive_item.id, - connector_type=CONNECTOR_TYPE, - source_identifiers=SourceIdentifiers( - fullpath=server_path, filename=drive_item.name, rel_path=rel_path - ), - metadata=DataSourceMetadata( - url=drive_item.parent_reference.path + "/" + drive_item.name, - version=drive_item.etag, - date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None, - date_created=str(date_created_at.timestamp()) if date_created_at else None, - date_processed=str(time()), - record_locator={ - "user_pname": self.connection_config.user_pname, - "server_relative_path": server_path, - }, - ), - additional_metadata=self.get_properties(drive_item=drive_item), - ) - - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - client = self.connection_config.get_client() - root = self.get_root(client=client) - drive_items = 
self.list_objects(folder=root, recursive=self.index_config.recursive) - for drive_item in drive_items: - file_data = self.drive_item_to_file_data(drive_item=drive_item) - yield file_data - - -@dataclass -class OnedriveDownloaderConfig(DownloaderConfig): - pass - - -@dataclass -class OnedriveDownloader(Downloader): - connection_config: OnedriveConnectionConfig - download_config: OnedriveDownloaderConfig - - @SourceConnectionNetworkError.wrap - def _fetch_file(self, file_data: FileData): - if file_data.source_identifiers is None or not file_data.source_identifiers.fullpath: - raise ValueError( - f"file data doesn't have enough information to get " - f"file content: {file_data.to_dict()}" - ) - - server_relative_path = file_data.source_identifiers.fullpath - client = self.connection_config.get_client() - root = client.users[self.connection_config.user_pname].drive.get().execute_query().root - file = root.get_by_path(server_relative_path).get().execute_query() - if not file: - raise FileNotFoundError(f"file not found: {server_relative_path}") - return file - - def get_download_path(self, file_data: FileData) -> Optional[Path]: - rel_path = file_data.source_identifiers.relative_path - rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path - return self.download_dir / Path(rel_path) - - @SourceConnectionError.wrap - def run(self, file_data: FileData, **kwargs: Any) -> download_responses: - file = self._fetch_file(file_data=file_data) - fsize = file.get_property("size", 0) - download_path = self.get_download_path(file_data=file_data) - download_path.parent.mkdir(parents=True, exist_ok=True) - logger.info(f"Downloading {file_data.source_identifiers.fullpath} to {download_path}") - if fsize > MAX_MB_SIZE: - logger.info(f"Downloading file with size: {fsize} bytes in chunks") - with download_path.open(mode="wb") as f: - file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query() - else: - with download_path.open(mode="wb") as f: - file.download(f).execute_query() - return DownloadResponse(file_data=file_data, path=download_path) - - -onedrive_source_entry = SourceRegistryEntry( - connection_config=OnedriveConnectionConfig, - indexer_config=OnedriveIndexerConfig, - indexer=OnedriveIndexer, - downloader_config=OnedriveDownloaderConfig, - downloader=OnedriveDownloader, -) diff --git a/unstructured/ingest/v2/processes/connectors/opensearch.py b/unstructured/ingest/v2/processes/connectors/opensearch.py deleted file mode 100644 index 0933cd1fa..000000000 --- a/unstructured/ingest/v2/processes/connectors/opensearch.py +++ /dev/null @@ -1,155 +0,0 @@ -from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Optional - -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field -from unstructured.ingest.error import ( - DestinationConnectionError, -) -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, - SourceRegistryEntry, -) -from unstructured.ingest.v2.processes.connectors.elasticsearch import ( - ElasticsearchDownloader, - ElasticsearchDownloaderConfig, - ElasticsearchIndexer, - ElasticsearchIndexerConfig, - ElasticsearchUploader, - ElasticsearchUploaderConfig, - ElasticsearchUploadStager, - ElasticsearchUploadStagerConfig, -) -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from opensearchpy import OpenSearch - 
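# Hedged illustration (example values assumed, not from the original module):
# the connection config defined below maps its fields onto opensearch-py client
# kwargs, dropping None-valued fields and passing credentials as an http_auth
# tuple, so the resulting client construction looks roughly like:
#
#     OpenSearch(
#         hosts=["https://localhost:9200"],   # assumed example host
#         http_auth=("admin", "<password>"),  # only when username and password are set
#         use_ssl=True,
#         verify_certs=False,
#     )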
-CONNECTOR_TYPE = "opensearch" - -"""Since the actual OpenSearch project is a fork of Elasticsearch, we are relying -heavily on the Elasticsearch connector code, inheriting the functionality as much as possible.""" - - -@dataclass -class OpenSearchAccessConfig(AccessConfig): - password: Optional[str] = enhanced_field(default=None, sensitive=True) - use_ssl: bool = False - verify_certs: bool = False - ssl_show_warn: bool = False - ca_certs: Optional[str] = None - client_cert: Optional[str] = None - client_key: Optional[str] = None - - -@dataclass -class OpenSearchClientInput(EnhancedDataClassJsonMixin): - http_auth: Optional[tuple[str, str]] = enhanced_field(sensitive=True, default=None) - hosts: Optional[list[str]] = None - use_ssl: bool = False - verify_certs: bool = False - ssl_show_warn: bool = False - ca_certs: Optional[str] = None - client_cert: Optional[str] = None - client_key: Optional[str] = None - - -@dataclass -class OpenSearchConnectionConfig(ConnectionConfig): - hosts: Optional[list[str]] = None - username: Optional[str] = None - access_config: OpenSearchAccessConfig = enhanced_field(sensitive=True) - - def get_client_kwargs(self) -> dict: - # Update auth related fields to conform to what the SDK expects based on the - # supported methods: - # https://github.com/opensearch-project/opensearch-py/blob/main/opensearchpy/client/__init__.py - client_input = OpenSearchClientInput() - if self.hosts: - client_input.hosts = self.hosts - if self.access_config.use_ssl: - client_input.use_ssl = self.access_config.use_ssl - if self.access_config.verify_certs: - client_input.verify_certs = self.access_config.verify_certs - if self.access_config.ssl_show_warn: - client_input.ssl_show_warn = self.access_config.ssl_show_warn - if self.access_config.ca_certs: - client_input.ca_certs = self.access_config.ca_certs - if self.access_config.client_cert: - client_input.client_cert = self.access_config.client_cert - if self.access_config.client_key: - client_input.client_key = self.access_config.client_key - if self.username and self.access_config.password: - client_input.http_auth = (self.username, self.access_config.password) - logger.debug( - f"OpenSearch client inputs mapped to: {client_input.to_dict(redact_sensitive=True)}" - ) - client_kwargs = client_input.to_dict(redact_sensitive=False) - client_kwargs = {k: v for k, v in client_kwargs.items() if v is not None} - return client_kwargs - - @DestinationConnectionError.wrap - @requires_dependencies(["opensearchpy"], extras="opensearch") - def get_client(self) -> "OpenSearch": - from opensearchpy import OpenSearch - - return OpenSearch(**self.get_client_kwargs()) - - -@dataclass -class OpenSearchIndexer(ElasticsearchIndexer): - connection_config: OpenSearchConnectionConfig - client: "OpenSearch" = field(init=False) - - @requires_dependencies(["opensearchpy"], extras="opensearch") - def load_scan(self): - from opensearchpy.helpers import scan - - return scan - - -@dataclass -class OpenSearchDownloader(ElasticsearchDownloader): - connection_config: OpenSearchConnectionConfig - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["opensearchpy"], extras="opensearch") - def load_async(self): - from opensearchpy import AsyncOpenSearch - from opensearchpy.helpers import async_scan - - return AsyncOpenSearch, async_scan - - -@dataclass -class OpenSearchUploader(ElasticsearchUploader): - connection_config: OpenSearchConnectionConfig - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["opensearchpy"], extras="opensearch") - def 
load_parallel_bulk(self): - from opensearchpy.helpers import parallel_bulk - - return parallel_bulk - - -opensearch_source_entry = SourceRegistryEntry( - connection_config=OpenSearchConnectionConfig, - indexer=OpenSearchIndexer, - indexer_config=ElasticsearchIndexerConfig, - downloader=OpenSearchDownloader, - downloader_config=ElasticsearchDownloaderConfig, -) - - -opensearch_destination_entry = DestinationRegistryEntry( - connection_config=OpenSearchConnectionConfig, - upload_stager_config=ElasticsearchUploadStagerConfig, - upload_stager=ElasticsearchUploadStager, - uploader_config=ElasticsearchUploaderConfig, - uploader=OpenSearchUploader, -) diff --git a/unstructured/ingest/v2/processes/connectors/pinecone.py b/unstructured/ingest/v2/processes/connectors/pinecone.py deleted file mode 100644 index 0cd087b9c..000000000 --- a/unstructured/ingest/v2/processes/connectors/pinecone.py +++ /dev/null @@ -1,178 +0,0 @@ -import json -import multiprocessing as mp -import uuid -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import DestinationConnectionError -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, -) -from unstructured.staging.base import flatten_dict -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from pinecone import Index as PineconeIndex - - -CONNECTOR_TYPE = "pinecone" - - -@dataclass -class PineconeAccessConfig(AccessConfig): - api_key: Optional[str] = enhanced_field(default=None, overload_name="pinecone_api_key") - - -@dataclass -class PineconeConnectionConfig(ConnectionConfig): - index_name: str - environment: str - access_config: PineconeAccessConfig = enhanced_field(sensitive=True) - - @requires_dependencies(["pinecone"], extras="pinecone") - def get_index(self) -> "PineconeIndex": - from pinecone import Pinecone - - from unstructured import __version__ as unstructured_version - - pc = Pinecone( - api_key=self.access_config.api_key, - source_tag=f"unstructured=={unstructured_version}", - ) - - index = pc.Index(self.index_name) - logger.debug(f"Connected to index: {pc.describe_index(self.index_name)}") - return index - - -@dataclass -class PineconeUploadStagerConfig(UploadStagerConfig): - pass - - -@dataclass -class PineconeUploaderConfig(UploaderConfig): - batch_size: int = 100 - num_of_processes: int = 4 - - -@dataclass -class PineconeUploadStager(UploadStager): - upload_stager_config: PineconeUploadStagerConfig = field( - default_factory=lambda: PineconeUploadStagerConfig() - ) - - @staticmethod - def conform_dict(element_dict: dict) -> dict: - # While flatten_dict enables indexing on various fields, - # element_serialized enables easily reloading the element object to memory. - # element_serialized is formed without text/embeddings to avoid data bloating. 
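        # Hedged illustration (assumed element, output abbreviated): an element like
        #     {"element_id": "abc", "text": "Hello world", "embeddings": [0.1, 0.2], "type": "Title"}
        # is staged roughly as
        #     {"id": "<uuid4>",
        #      "values": [0.1, 0.2],
        #      "metadata": {"text": "Hello world",
        #                   "element_serialized": '{"element_id": "abc", "type": "Title"}',
        #                   "element_id": "abc", "type": "Title"}}
        # where the trailing metadata keys come from flatten_dict over the remaining fields.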
- return { - "id": str(uuid.uuid4()), - "values": element_dict.pop("embeddings", None), - "metadata": { - "text": element_dict.pop("text", None), - "element_serialized": json.dumps(element_dict), - **flatten_dict( - element_dict, - separator="-", - flatten_lists=True, - remove_none=True, - ), - }, - } - - def run( - self, - elements_filepath: Path, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - - conformed_elements = [ - self.conform_dict(element_dict=element) for element in elements_contents - ] - - output_path = Path(output_dir) / Path(f"{output_filename}.json") - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, "w") as output_file: - json.dump(conformed_elements, output_file) - return output_path - - -@dataclass -class PineconeUploader(Uploader): - upload_config: PineconeUploaderConfig - connection_config: PineconeConnectionConfig - connector_type: str = CONNECTOR_TYPE - - @DestinationConnectionError.wrap - def check_connection(self): - _ = self.connection_config.get_index() - - @requires_dependencies(["pinecone"], extras="pinecone") - def upsert_batch(self, batch): - from pinecone.exceptions import PineconeApiException - - try: - index = self.connection_config.get_index() - response = index.upsert(batch) - except PineconeApiException as api_error: - raise DestinationConnectionError(f"http error: {api_error}") from api_error - logger.debug(f"results: {response}") - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - - elements_dict = [] - for content in contents: - with open(content.path) as elements_file: - elements = json.load(elements_file) - elements_dict.extend(elements) - - logger.info( - f"writing document batches to destination" - f" index named {self.connection_config.index_name}" - f" environment named {self.connection_config.environment}" - f" with batch size {self.upload_config.batch_size}" - f" with {self.upload_config.num_of_processes} (number of) processes" - ) - - pinecone_batch_size = self.upload_config.batch_size - - if self.upload_config.num_of_processes == 1: - for batch in batch_generator(elements_dict, pinecone_batch_size): - self.upsert_batch(batch) # noqa: E203 - - else: - with mp.Pool( - processes=self.upload_config.num_of_processes, - ) as pool: - pool.map( - self.upsert_batch, list(batch_generator(elements_dict, pinecone_batch_size)) - ) - - -pinecone_destination_entry = DestinationRegistryEntry( - connection_config=PineconeConnectionConfig, - uploader=PineconeUploader, - uploader_config=PineconeUploaderConfig, - upload_stager=PineconeUploadStager, - upload_stager_config=PineconeUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/salesforce.py b/unstructured/ingest/v2/processes/connectors/salesforce.py deleted file mode 100644 index e1f018895..000000000 --- a/unstructured/ingest/v2/processes/connectors/salesforce.py +++ /dev/null @@ -1,293 +0,0 @@ -""" -Salesforce Connector -Able to download Account, Case, Campaign, EmailMessage, Lead -Salesforce returns everything as a list of json. -This saves each entry as a separate file to be partitioned. 
-Using JWT authorization -https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_key_and_cert.htm -https://developer.salesforce.com/docs/atlas.en-us.sfdx_dev.meta/sfdx_dev/sfdx_dev_auth_connected_app.htm -""" - -import json -from collections import OrderedDict -from dataclasses import dataclass, field -from email.utils import formatdate -from pathlib import Path -from string import Template -from textwrap import dedent -from typing import TYPE_CHECKING, Any, Generator, Type - -from dateutil import parser - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.error import SourceConnectionNetworkError -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - Downloader, - DownloaderConfig, - DownloadResponse, - FileData, - Indexer, - IndexerConfig, - SourceIdentifiers, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - SourceRegistryEntry, -) -from unstructured.utils import requires_dependencies - - -class MissingCategoryError(Exception): - """There are no categories with that name.""" - - -CONNECTOR_TYPE = "salesforce" - -if TYPE_CHECKING: - from simple_salesforce import Salesforce - -SALESFORCE_API_VERSION = "57.0" - -# TODO: Add more categories as needed -ACCEPTED_CATEGORIES: list[str] = ["Account", "Case", "Campaign", "EmailMessage", "Lead"] - -# Generic minimal email template used only -# to process EmailMessage records as .eml files -EMAIL_TEMPLATE = Template( - """MIME-Version: 1.0 -Date: $date -Message-ID: $message_identifier -Subject: $subject -From: $from_email -To: $to_email -Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" ---00000000000095c9b205eff92630 -Content-Type: text/plain; charset="UTF-8" -$textbody ---00000000000095c9b205eff92630 -Content-Type: text/html; charset="UTF-8" -$htmlbody ---00000000000095c9b205eff92630-- -""", -) - - -@dataclass -class SalesforceAccessConfig(AccessConfig): - consumer_key: str - private_key: str - - @requires_dependencies(["cryptography"]) - def get_private_key_value_and_type(self) -> tuple[str, Type]: - from cryptography.hazmat.primitives import serialization - - try: - serialization.load_pem_private_key(data=self.private_key.encode("utf-8"), password=None) - except ValueError: - pass - else: - return self.private_key, str - - if Path(self.private_key).is_file(): - return self.private_key, Path - - raise ValueError("private_key does not contain PEM private key or path") - - -@dataclass -class SalesforceConnectionConfig(ConnectionConfig): - username: str - access_config: SalesforceAccessConfig = enhanced_field(sensitive=True) - - @requires_dependencies(["simple_salesforce"], extras="salesforce") - def get_client(self) -> "Salesforce": - from simple_salesforce import Salesforce - - pkey_value, pkey_type = self.access_config.get_private_key_value_and_type() - - return Salesforce( - username=self.username, - consumer_key=self.access_config.consumer_key, - privatekey_file=pkey_value if pkey_type is Path else None, - privatekey=pkey_value if pkey_type is str else None, - version=SALESFORCE_API_VERSION, - ) - - -@dataclass -class SalesforceIndexerConfig(IndexerConfig): - categories: list[str] - - -@dataclass -class SalesforceIndexer(Indexer): - connection_config: SalesforceConnectionConfig - index_config: SalesforceIndexerConfig - - def __post_init__(self): - for record_type in 
self.index_config.categories: - if record_type not in ACCEPTED_CATEGORIES: - raise ValueError(f"{record_type} not currently an accepted Salesforce category") - - def get_file_extension(self, record_type) -> str: - if record_type == "EmailMessage": - extension = ".eml" - elif record_type in ["Account", "Lead", "Case", "Campaign"]: - extension = ".xml" - else: - raise MissingCategoryError( - f"There are no categories with the name: {record_type}", - ) - return extension - - @requires_dependencies(["simple_salesforce"], extras="salesforce") - def list_files(self) -> list[FileData]: - """Get Salesforce Ids for the records. - Send them to next phase where each doc gets downloaded into the - appropriate format for partitioning. - """ - from simple_salesforce.exceptions import SalesforceMalformedRequest - - client = self.connection_config.get_client() - - files_list = [] - for record_type in self.index_config.categories: - try: - # Get ids from Salesforce - records = client.query_all_iter( - f"select Id, SystemModstamp, CreatedDate, LastModifiedDate from {record_type}", - ) - for record in records: - record_with_extension = record["Id"] + self.get_file_extension( - record["attributes"]["type"] - ) - files_list.append( - FileData( - connector_type=CONNECTOR_TYPE, - identifier=record["Id"], - source_identifiers=SourceIdentifiers( - filename=record_with_extension, - fullpath=f"{record['attributes']['type']}/{record_with_extension}", - ), - metadata=DataSourceMetadata( - url=record["attributes"]["url"], - version=str(parser.parse(record["SystemModstamp"]).timestamp()), - date_created=str(parser.parse(record["CreatedDate"]).timestamp()), - date_modified=str( - parser.parse(record["LastModifiedDate"]).timestamp() - ), - record_locator={"id": record["Id"]}, - ), - additional_metadata={"record_type": record["attributes"]["type"]}, - ) - ) - except SalesforceMalformedRequest as e: - raise SalesforceMalformedRequest(f"Problem with Salesforce query: {e}") - - return files_list - - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - for f in self.list_files(): - yield f - - -@dataclass -class SalesforceDownloaderConfig(DownloaderConfig): - pass - - -@dataclass -class SalesforceDownloader(Downloader): - connection_config: SalesforceConnectionConfig - download_config: SalesforceDownloaderConfig = field( - default_factory=lambda: SalesforceDownloaderConfig() - ) - connector_type: str = CONNECTOR_TYPE - - def get_download_path(self, file_data: FileData) -> Path: - rel_path = file_data.source_identifiers.relative_path - rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path - return self.download_dir / Path(rel_path) - - def _xml_for_record(self, record: OrderedDict) -> str: - """Creates partitionable xml file from a record""" - import xml.etree.ElementTree as ET - - def create_xml_doc(data, parent, prefix=""): - for key, value in data.items(): - if isinstance(value, OrderedDict): - create_xml_doc(value, parent, prefix=f"{prefix}{key}.") - else: - item = ET.Element("item") - item.text = f"{prefix}{key}: {value}" - parent.append(item) - - root = ET.Element("root") - create_xml_doc(record, root) - - xml_string = ET.tostring(root, encoding="utf-8", xml_declaration=True).decode() - return xml_string - - def _eml_for_record(self, email_json: dict[str, Any]) -> str: - """Recreates standard expected .eml format using template.""" - eml = EMAIL_TEMPLATE.substitute( - date=formatdate(parser.parse(email_json.get("MessageDate")).timestamp()), - 
message_identifier=email_json.get("MessageIdentifier"), - subject=email_json.get("Subject"), - from_email=email_json.get("FromAddress"), - to_email=email_json.get("ToAddress"), - textbody=email_json.get("TextBody"), - htmlbody=email_json.get("HtmlBody"), - ) - return dedent(eml) - - @SourceConnectionNetworkError.wrap - def _get_response(self, file_data: FileData) -> OrderedDict: - client = self.connection_config.get_client() - return client.query( - f"select FIELDS(STANDARD) from {file_data.additional_metadata['record_type']} where Id='{file_data.identifier}'", # noqa: E501 - ) - - def get_record(self, file_data: FileData) -> OrderedDict: - # Get record from Salesforce based on id - response = self._get_response(file_data) - logger.debug(f"response was returned for salesforce record id: {file_data.identifier}") - records = response["records"] - if not records: - raise ValueError( - f"No record found with record id {file_data.identifier}: {json.dumps(response)}" - ) - record_json = records[0] - return record_json - - def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse: - record = self.get_record(file_data) - - try: - if file_data.additional_metadata["record_type"] == "EmailMessage": - document = self._eml_for_record(record) - else: - document = self._xml_for_record(record) - download_path = self.get_download_path(file_data=file_data) - download_path.parent.mkdir(parents=True, exist_ok=True) - - with open(download_path, "w") as page_file: - page_file.write(document) - - except Exception as e: - logger.error(f"failed to download file {file_data.identifier}: {e}", exc_info=True) - raise SourceConnectionNetworkError(f"failed to download file {file_data.identifier}") - - return self.generate_download_response(file_data=file_data, download_path=download_path) - - -salesforce_source_entry = SourceRegistryEntry( - connection_config=SalesforceConnectionConfig, - indexer_config=SalesforceIndexerConfig, - indexer=SalesforceIndexer, - downloader_config=SalesforceDownloaderConfig, - downloader=SalesforceDownloader, -) diff --git a/unstructured/ingest/v2/processes/connectors/sharepoint.py b/unstructured/ingest/v2/processes/connectors/sharepoint.py deleted file mode 100644 index 696d327ce..000000000 --- a/unstructured/ingest/v2/processes/connectors/sharepoint.py +++ /dev/null @@ -1,411 +0,0 @@ -import json -from dataclasses import dataclass, field -from enum import Enum -from pathlib import Path -from time import time -from typing import TYPE_CHECKING, Any, Generator, Optional -from urllib.parse import quote - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field -from unstructured.ingest.error import SourceConnectionNetworkError -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - Downloader, - DownloaderConfig, - DownloadResponse, - FileData, - Indexer, - IndexerConfig, - SourceIdentifiers, - download_responses, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - SourceRegistryEntry, -) -from unstructured.utils import requires_dependencies - -from .utils import parse_datetime - -if TYPE_CHECKING: - from office365.graph_client import GraphClient - from office365.onedrive.driveitems.driveItem import DriveItem - from office365.onedrive.drives.drive import Drive - from office365.onedrive.permissions.permission import Permission - from office365.onedrive.sites.site import Site - from 
office365.sharepoint.client_context import ClientContext - from office365.sharepoint.files.file import File - from office365.sharepoint.folders.folder import Folder - from office365.sharepoint.publishing.pages.page import SitePage - -CONNECTOR_TYPE = "sharepoint" - -MAX_MB_SIZE = 512_000_000 - -# TODO handle other data types possible from Sharepoint -# exampled: https://github.com/vgrem/Office365-REST-Python-Client/tree/master/examples/sharepoint - - -class SharepointContentType(Enum): - DOCUMENT = "document" - SITEPAGE = "site_page" - LIST = "list" - - -@dataclass -class SharepointAccessConfig(AccessConfig): - client_cred: str - - -@dataclass -class SharepointPermissionsConfig(EnhancedDataClassJsonMixin): - permissions_application_id: str - permissions_tenant: str - permissions_client_cred: str = enhanced_field(sensitive=True) - authority_url: Optional[str] = field(repr=False, default="https://login.microsoftonline.com") - - -@dataclass -class SharepointConnectionConfig(ConnectionConfig): - client_id: str - site: str - access_config: SharepointAccessConfig = enhanced_field(sensitive=True) - permissions_config: Optional[SharepointPermissionsConfig] = None - - @requires_dependencies(["office365"], extras="sharepoint") - def get_client(self) -> "ClientContext": - from office365.runtime.auth.client_credential import ClientCredential - from office365.sharepoint.client_context import ClientContext - - try: - credentials = ClientCredential(self.client_id, self.access_config.client_cred) - site_client = ClientContext(self.site).with_credentials(credentials) - except Exception as e: - logger.error(f"Couldn't set Sharepoint client: {e}") - raise e - return site_client - - @requires_dependencies(["msal"], extras="sharepoint") - def get_permissions_token(self): - from msal import ConfidentialClientApplication - - try: - app = ConfidentialClientApplication( - authority=f"{self.permissions_config.authority_url}/" - f"{self.permissions_config.permissions_tenant}", - client_id=self.permissions_config.permissions_application_id, - client_credential=self.permissions_config.permissions_client_cred, - ) - token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - except ValueError as exc: - logger.error("Couldn't set up credentials for Sharepoint") - raise exc - if "error" in token: - raise SourceConnectionNetworkError( - "failed to fetch token, {}: {}".format(token["error"], token["error_description"]) - ) - return token - - @requires_dependencies(["office365"], extras="sharepoint") - def get_permissions_client(self) -> Optional["GraphClient"]: - from office365.graph_client import GraphClient - - if self.permissions_config is None: - return None - - client = GraphClient(self.get_permissions_token) - return client - - -@dataclass -class SharepointIndexerConfig(IndexerConfig): - path: Optional[str] = None - recursive: bool = False - omit_files: bool = False - omit_pages: bool = False - omit_lists: bool = False - - -@dataclass -class SharepointIndexer(Indexer): - connection_config: SharepointConnectionConfig - index_config: SharepointIndexerConfig = field(default_factory=lambda: SharepointIndexerConfig()) - - def list_files(self, folder: "Folder", recursive: bool = False) -> list["File"]: - if not recursive: - folder.expand(["Files"]).get().execute_query() - return folder.files - - folder.expand(["Files", "Folders"]).get().execute_query() - files: list["File"] = list(folder.files) - folders: list["Folder"] = list(folder.folders) - for f in folders: - if "/Forms" in 
f.serverRelativeUrl: - continue - files.extend(self.list_files(f, recursive)) - return files - - def get_properties(self, raw_properties: dict) -> dict: - raw_properties = {k: v for k, v in raw_properties.items() if v} - filtered_properties = {} - for k, v in raw_properties.items(): - try: - json.dumps(v) - filtered_properties[k] = v - except TypeError: - pass - return filtered_properties - - def list_pages(self, client: "ClientContext") -> list["SitePage"]: - pages = client.site_pages.pages.get().execute_query() - return pages - - def page_to_file_data(self, site_page: "SitePage") -> FileData: - site_page.expand(site_page.properties.keys()).get().execute_query() - version = site_page.properties.get("Version", None) - unique_id = site_page.properties.get("UniqueId", None) - modified_date = site_page.properties.get("Modified", None) - url = site_page.properties.get("AbsoluteUrl", None) - date_modified_dt = parse_datetime(modified_date) if modified_date else None - date_created_at = ( - parse_datetime(site_page.first_published) - if (site_page.first_published and site_page.first_published != "0001-01-01T08:00:00Z") - else None - ) - file_path = site_page.get_property("Url", "") - server_path = file_path if file_path[0] != "/" else file_path[1:] - additional_metadata = self.get_properties(raw_properties=site_page.properties) - additional_metadata["sharepoint_content_type"] = SharepointContentType.SITEPAGE.value - return FileData( - identifier=unique_id, - connector_type=CONNECTOR_TYPE, - source_identifiers=SourceIdentifiers( - filename=site_page.file_name, - fullpath=file_path, - rel_path=file_path.replace(self.index_config.path, ""), - ), - metadata=DataSourceMetadata( - url=url, - version=version, - date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None, - date_created=str(date_created_at.timestamp()) if date_created_at else None, - date_processed=str(time()), - record_locator={ - "server_path": server_path, - }, - ), - additional_metadata=additional_metadata, - ) - - def file_to_file_data(self, client: "ClientContext", file: "File") -> FileData: - file.expand(file.properties.keys()).get().execute_query() - absolute_url = f"{client.base_url}{quote(file.serverRelativeUrl)}" - date_modified_dt = ( - parse_datetime(file.time_last_modified) if file.time_last_modified else None - ) - - date_created_at = parse_datetime(file.time_created) if file.time_created else None - additional_metadata = self.get_properties(raw_properties=file.properties) - additional_metadata["sharepoint_content_type"] = SharepointContentType.DOCUMENT.value - fullpath = str(file.serverRelativeUrl) - rel_path = fullpath.replace(self.index_config.path, "") - while rel_path[0] == "/": - rel_path = rel_path[1:] - return FileData( - identifier=file.unique_id, - connector_type=CONNECTOR_TYPE, - source_identifiers=SourceIdentifiers( - filename=file.name, - fullpath=fullpath, - rel_path=rel_path, - ), - metadata=DataSourceMetadata( - url=absolute_url, - version=f"{file.major_version}.{file.minor_version}", - date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None, - date_created=str(date_created_at.timestamp()) if date_created_at else None, - date_processed=str(time()), - record_locator={"server_path": file.serverRelativeUrl, "site_url": client.base_url}, - ), - additional_metadata=additional_metadata, - ) - - def get_root(self, client: "ClientContext") -> "Folder": - if path := self.index_config.path: - return client.web.get_folder_by_server_relative_path(path) - default_document_library 
= client.web.default_document_library() - root_folder = default_document_library.root_folder - root_folder = root_folder.get().execute_query() - self.index_config.path = root_folder.name - return root_folder - - def get_site_url(self, client: "ClientContext") -> str: - res = client.web.get().execute_query() - return res.url - - def get_site(self, permissions_client: "GraphClient", site_url) -> "Site": - return permissions_client.sites.get_by_url(url=site_url).execute_query() - - def get_permissions_items(self, site: "Site") -> list["DriveItem"]: - # TODO find a way to narrow this search down by name of drive - items: list["DriveItem"] = [] - drives: list["Drive"] = site.drives.get_all().execute_query() - for drive in drives: - items.extend(drive.root.children.get_all().execute_query()) - return items - - def map_permission(self, permission: "Permission") -> dict: - return { - "id": permission.id, - "roles": list(permission.roles), - "share_id": permission.share_id, - "has_password": permission.has_password, - "link": permission.link.to_json(), - "granted_to_identities": permission.granted_to_identities.to_json(), - "granted_to": permission.granted_to.to_json(), - "granted_to_v2": permission.granted_to_v2.to_json(), - "granted_to_identities_v2": permission.granted_to_identities_v2.to_json(), - "invitation": permission.invitation.to_json(), - } - - def enrich_permissions_on_files(self, all_file_data: list[FileData], site_url: str) -> None: - logger.debug("Enriching permissions on files") - permission_client = self.connection_config.get_permissions_client() - if permission_client is None: - return - site = self.get_site(permissions_client=permission_client, site_url=site_url) - existing_items = self.get_permissions_items(site=site) - for file_data in all_file_data: - etag = file_data.additional_metadata.get("ETag") - if not etag: - continue - matching_items = list(filter(lambda x: x.etag == etag, existing_items)) - if not matching_items: - continue - if len(matching_items) > 1: - logger.warning( - "Found multiple drive items with etag matching {}, skipping: {}".format( - etag, ", ".join([i.name for i in matching_items]) - ) - ) - continue - matching_item = matching_items[0] - permissions: list["Permission"] = matching_item.permissions.get_all().execute_query() - permissions_data = [ - self.map_permission(permission=permission) for permission in permissions - ] - file_data.metadata.permissions_data = permissions_data - - @property - def process_permissions(self) -> bool: - return ( - self.connection_config.permissions_config.permissions_tenant - and self.connection_config.permissions_config.permissions_client_cred - and self.connection_config.permissions_config.permissions_application_id - ) - - def run(self, **kwargs: Any) -> Generator[FileData, None, None]: - client = self.connection_config.get_client() - root_folder = self.get_root(client=client) - logger.debug(f"processing content from path: {self.index_config.path}") - if not self.index_config.omit_files: - files = self.list_files(root_folder, recursive=self.index_config.recursive) - file_data = [self.file_to_file_data(file=file, client=client) for file in files] - if self.process_permissions: - self.enrich_permissions_on_files( - all_file_data=file_data, site_url=self.get_site_url(client=client) - ) - for file in file_data: - yield file - if not self.index_config.omit_pages: - pages = self.list_pages(client=client) - for page in pages: - file_data = self.page_to_file_data(site_page=page) - file_data.metadata.record_locator["site_url"] = 
client.base_url - yield file_data - - -@dataclass -class SharepointDownloaderConfig(DownloaderConfig): - pass - - -@dataclass -class SharepointDownloader(Downloader): - connection_config: SharepointConnectionConfig - download_config: SharepointDownloaderConfig - connector_type: str = CONNECTOR_TYPE - - def get_download_path(self, file_data: FileData) -> Path: - content_type = file_data.additional_metadata.get("sharepoint_content_type") - rel_path = file_data.source_identifiers.fullpath - rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path - download_path = self.download_dir / Path(rel_path) - if content_type == SharepointContentType.SITEPAGE.value: - # Update output extension to html if site page - download_path = download_path.with_suffix(".html") - return download_path - - def get_document(self, file_data: FileData) -> DownloadResponse: - client: "ClientContext" = self.connection_config.get_client() - file: "File" = client.web.get_file_by_id(unique_id=file_data.identifier) - download_path = self.get_download_path(file_data=file_data) - download_path.parent.mkdir(parents=True, exist_ok=True) - logger.debug( - f"writing document content {file_data.source_identifiers.fullpath} to {download_path}" - ) - with download_path.open("wb") as f: - file.download(f).execute_query() - return self.generate_download_response(file_data=file_data, download_path=download_path) - - def get_site_page(self, file_data: FileData) -> DownloadResponse: - # TODO fetch comments for site page as well - from lxml import etree, html - - canvas_content_raw = file_data.additional_metadata.get("CanvasContent1") - layout_web_parts_content_raw = file_data.additional_metadata.get("LayoutWebpartsContent") - html_content = [] - if layout_web_parts_content_raw: - layout_web_parts_content = json.loads(layout_web_parts_content_raw) - for web_part in layout_web_parts_content: - properties = web_part.get("properties", {}) - if title := properties.get("title"): - html_content.append(f"{title}") - if canvas_content_raw: - canvas_content = json.loads(canvas_content_raw) - for content in canvas_content: - if inner_html := content.get("innerHTML"): - html_content.append(inner_html) - htmls = "".join(html_content) - content = f"
{htmls}
" - document = html.fromstring(content) - download_path = self.get_download_path(file_data=file_data) - download_path.parent.mkdir(parents=True, exist_ok=True) - logger.debug( - f"writing site page content {file_data.source_identifiers.filename} to {download_path}" - ) - with download_path.open("w") as f: - f.write(etree.tostring(document, encoding="unicode", pretty_print=True)) - return self.generate_download_response(file_data=file_data, download_path=download_path) - - def run(self, file_data: FileData, **kwargs: Any) -> download_responses: - content_type = file_data.additional_metadata.get("sharepoint_content_type") - if not content_type: - raise ValueError( - f"Missing sharepoint_content_type metadata: {file_data.additional_metadata}" - ) - if content_type == SharepointContentType.DOCUMENT.value: - return self.get_document(file_data=file_data) - elif content_type == SharepointContentType.SITEPAGE.value: - return self.get_site_page(file_data=file_data) - - -sharepoint_source_entry = SourceRegistryEntry( - connection_config=SharepointConnectionConfig, - indexer_config=SharepointIndexerConfig, - indexer=SharepointIndexer, - downloader_config=SharepointDownloaderConfig, - downloader=SharepointDownloader, -) diff --git a/unstructured/ingest/v2/processes/connectors/singlestore.py b/unstructured/ingest/v2/processes/connectors/singlestore.py deleted file mode 100644 index 3e2d534e2..000000000 --- a/unstructured/ingest/v2/processes/connectors/singlestore.py +++ /dev/null @@ -1,160 +0,0 @@ -import json -from dataclasses import dataclass -from datetime import date, datetime -from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional - -import numpy as np -import pandas as pd -from dateutil import parser - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.utils.data_prep import batch_generator -from unstructured.ingest.utils.table import convert_to_pandas_dataframe -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - FileData, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, -) -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from singlestoredb.connection import Connection - -CONNECTOR_TYPE = "singlestore" - - -@dataclass -class SingleStoreAccessConfig(AccessConfig): - password: Optional[str] = None - - -@dataclass -class SingleStoreConnectionConfig(ConnectionConfig): - host: Optional[str] = None - port: Optional[int] = None - user: Optional[str] = None - database: Optional[str] = None - access_config: SingleStoreAccessConfig = enhanced_field(sensitive=True) - - @requires_dependencies(["singlestoredb"], extras="singlestore") - def get_connection(self) -> "Connection": - import singlestoredb as s2 - - conn = s2.connect( - host=self.host, - port=self.port, - database=self.database, - user=self.user, - password=self.access_config.password, - ) - return conn - - -@dataclass -class SingleStoreUploadStagerConfig(UploadStagerConfig): - drop_empty_cols: bool = False - - -@dataclass -class SingleStoreUploadStager(UploadStager): - upload_stager_config: SingleStoreUploadStagerConfig - - @staticmethod - def parse_date_string(date_string: str) -> date: - try: - timestamp = float(date_string) - return datetime.fromtimestamp(timestamp) - except Exception as e: - logger.debug(f"date {date_string} string not a 
timestamp: {e}") - return parser.parse(date_string) - - def run( - self, - elements_filepath: Path, - file_data: FileData, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - output_path = Path(output_dir) / Path(f"{output_filename}.csv") - output_path.parent.mkdir(parents=True, exist_ok=True) - - df = convert_to_pandas_dataframe( - elements_dict=elements_contents, - drop_empty_cols=self.upload_stager_config.drop_empty_cols, - ) - datetime_columns = [ - "data_source_date_created", - "data_source_date_modified", - "data_source_date_processed", - ] - for column in filter(lambda x: x in df.columns, datetime_columns): - df[column] = df[column].apply(self.parse_date_string) - if "data_source_record_locator" in df.columns: - df["data_source_record_locator"] = df["data_source_record_locator"].apply( - lambda x: json.dumps(x) if x else None - ) - - with output_path.open("w") as output_file: - df.to_csv(output_file, index=False) - return output_path - - -@dataclass -class SingleStoreUploaderConfig(UploaderConfig): - table_name: str - batch_size: int = 100 - - -@dataclass -class SingleStoreUploader(Uploader): - connection_config: SingleStoreConnectionConfig - upload_config: SingleStoreUploaderConfig - connector_type: str = CONNECTOR_TYPE - - def upload_csv(self, content: UploadContent) -> None: - df = pd.read_csv(content.path) - logger.debug( - f"uploading {len(df)} entries to {self.connection_config.database} " - f"db in table {self.upload_config.table_name}" - ) - stmt = "INSERT INTO {} ({}) VALUES ({})".format( - self.upload_config.table_name, - ", ".join(df.columns), - ", ".join(["%s"] * len(df.columns)), - ) - logger.debug(f"sql statement: {stmt}") - df.replace({np.nan: None}, inplace=True) - data_as_tuples = list(df.itertuples(index=False, name=None)) - with self.connection_config.get_connection() as conn: - with conn.cursor() as cur: - for chunk in batch_generator( - data_as_tuples, batch_size=self.upload_config.batch_size - ): - cur.executemany(stmt, chunk) - conn.commit() - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - for content in contents: - self.upload_csv(content=content) - - -singlestore_destination_entry = DestinationRegistryEntry( - connection_config=SingleStoreConnectionConfig, - uploader=SingleStoreUploader, - uploader_config=SingleStoreUploaderConfig, - upload_stager=SingleStoreUploadStager, - upload_stager_config=SingleStoreUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/sql.py b/unstructured/ingest/v2/processes/connectors/sql.py deleted file mode 100644 index cfec183a1..000000000 --- a/unstructured/ingest/v2/processes/connectors/sql.py +++ /dev/null @@ -1,265 +0,0 @@ -import enum -import json -import uuid -from dataclasses import dataclass, field -from datetime import date, datetime -from pathlib import Path -from typing import Any, Optional, Union - -import numpy as np -import pandas as pd -from dateutil import parser - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - FileData, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import DestinationRegistryEntry -from unstructured.utils import requires_dependencies - -CONNECTOR_TYPE = "sql" 
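# Overview (hedged summary of the code below): the stager flattens each element's
# metadata, data_source and coordinates sub-dicts into the fixed _COLUMNS set and
# writes JSON Lines; the uploader batches those rows into the "elements" table,
# using "?" placeholders for SQLite and "%s" for PostgreSQL. For an assumed
# three-column frame the generated statement is roughly:
#
#     INSERT INTO elements (id,text,type) VALUES(?,?,?)     -- sqlite
#     INSERT INTO elements (id,text,type) VALUES(%s,%s,%s)  -- postgresql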
-ELEMENTS_TABLE_NAME = "elements" - - -@dataclass -class SQLAccessConfig(AccessConfig): - username: Optional[str] = None - password: Optional[str] = None - - -class DatabaseType(str, enum.Enum): - SQLITE = "sqlite" - POSTGRESQL = "postgresql" - - -@dataclass -class SimpleSqlConfig(ConnectionConfig): - db_type: DatabaseType = ( - # required default value here because of parent class - DatabaseType.SQLITE - ) - database: Optional[str] = None - host: Optional[str] = None - port: Optional[int] = 5432 - access_config: Optional[SQLAccessConfig] = enhanced_field(default=None, sensitive=True) - connector_type: str = CONNECTOR_TYPE - - def __post_init__(self): - if (self.db_type == DatabaseType.SQLITE) and (self.database is None): - raise ValueError( - "A sqlite connection requires a path to a *.db file " - "through the `database` argument" - ) - - -@dataclass -class SQLUploadStagerConfig(UploadStagerConfig): - pass - - -_COLUMNS = ( - "id", - "element_id", - "text", - "embeddings", - "type", - "system", - "layout_width", - "layout_height", - "points", - "url", - "version", - "date_created", - "date_modified", - "date_processed", - "permissions_data", - "record_locator", - "category_depth", - "parent_id", - "attached_filename", - "filetype", - "last_modified", - "file_directory", - "filename", - "languages", - "page_number", - "links", - "page_name", - "link_urls", - "link_texts", - "sent_from", - "sent_to", - "subject", - "section", - "header_footer_type", - "emphasized_text_contents", - "emphasized_text_tags", - "text_as_html", - "detection_class_prob", -) - -_DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified") - - -def parse_date_string(date_value: Union[str, int]) -> date: - try: - timestamp = float(date_value) / 1000 if isinstance(date_value, int) else float(date_value) - return datetime.fromtimestamp(timestamp) - except Exception as e: - logger.debug(f"date {date_value} string not a timestamp: {e}") - return parser.parse(date_value) - - -@dataclass -class SQLUploadStager(UploadStager): - upload_stager_config: SQLUploadStagerConfig = field( - default_factory=lambda: SQLUploadStagerConfig() - ) - - def run( - self, - elements_filepath: Path, - file_data: FileData, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - output_path = Path(output_dir) / Path(f"{output_filename}.json") - output_path.parent.mkdir(parents=True, exist_ok=True) - - output = [] - for data in elements_contents: - metadata: dict[str, Any] = data.pop("metadata", {}) - data_source = metadata.pop("data_source", {}) - coordinates = metadata.pop("coordinates", {}) - - data.update(metadata) - data.update(data_source) - data.update(coordinates) - - data["id"] = str(uuid.uuid4()) - - # remove extraneous, not supported columns - [data.pop(column) for column in data if column not in _COLUMNS] - - output.append(data) - - df = pd.DataFrame.from_dict(output) - for column in filter(lambda x: x in df.columns, _DATE_COLUMNS): - df[column] = df[column].apply(parse_date_string) - for column in filter( - lambda x: x in df.columns, - ("permissions_data", "record_locator", "points", "links"), - ): - df[column] = df[column].apply( - lambda x: json.dumps(x) if isinstance(x, (list, dict)) else None - ) - for column in filter(lambda x: x in df.columns, ("version", "page_number")): - df[column] = df[column].apply(str) - - with output_path.open("w") as output_file: - df.to_json(output_file, 
orient="records", lines=True) - return output_path - - -@dataclass -class SQLUploaderConfig(UploaderConfig): - batch_size: int = 50 - - -@dataclass -class SQLUploader(Uploader): - connector_type: str = CONNECTOR_TYPE - upload_config: SQLUploaderConfig - connection_config: SimpleSqlConfig - - @property - def connection(self): - if self.connection_config.db_type == DatabaseType.POSTGRESQL: - return self._make_psycopg_connection - elif self.connection_config.db_type == DatabaseType.SQLITE: - return self._make_sqlite_connection - raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.") - - def _make_sqlite_connection(self): - from sqlite3 import connect - - return connect(database=self.connection_config.database) - - @requires_dependencies(["psycopg2"], extras="postgres") - def _make_psycopg_connection(self): - from psycopg2 import connect - - return connect( - user=self.connection_config.access_config.username, - password=self.connection_config.access_config.password, - dbname=self.connection_config.database, - host=self.connection_config.host, - port=self.connection_config.port, - ) - - def prepare_data( - self, columns: list[str], data: tuple[tuple[Any, ...], ...] - ) -> list[tuple[Any, ...]]: - output = [] - for row in data: - parsed = [] - for column_name, value in zip(columns, row): - if self.connection_config.db_type == DatabaseType.SQLITE and isinstance( - value, (list, dict) - ): - value = json.dumps(value) - if column_name in _DATE_COLUMNS: - if value is None: - parsed.append(None) - else: - parsed.append(parse_date_string(value)) - else: - parsed.append(value) - output.append(tuple(parsed)) - return output - - def upload_contents(self, content: UploadContent) -> None: - df = pd.read_json(content.path, orient="records", lines=True) - logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ") - df.replace({np.nan: None}, inplace=True) - - columns = tuple(df.columns) - stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \ - VALUES({','.join(['?' 
if self.connection_config.db_type==DatabaseType.SQLITE else '%s' for x in columns])})" # noqa E501 - - for rows in pd.read_json( - content.path, orient="records", lines=True, chunksize=self.upload_config.batch_size - ): - with self.connection() as conn: - values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None))) - if self.connection_config.db_type == DatabaseType.SQLITE: - conn.executemany(stmt, values) - else: - with conn.cursor() as cur: - cur.executemany(stmt, values) - - conn.commit() - - def run(self, contents: list[UploadContent], **kwargs: Any) -> None: - for content in contents: - self.upload_contents(content=content) - - -sql_destination_entry = DestinationRegistryEntry( - connection_config=SimpleSqlConfig, - uploader=SQLUploader, - uploader_config=SQLUploaderConfig, - upload_stager=SQLUploadStager, - upload_stager_config=SQLUploadStagerConfig, -) diff --git a/unstructured/ingest/v2/processes/connectors/utils.py b/unstructured/ingest/v2/processes/connectors/utils.py deleted file mode 100644 index 6e6a8e5fc..000000000 --- a/unstructured/ingest/v2/processes/connectors/utils.py +++ /dev/null @@ -1,19 +0,0 @@ -from datetime import datetime -from typing import Union - -from dateutil import parser - - -def parse_datetime(date_value: Union[int, str, float, datetime]) -> datetime: - if isinstance(date_value, datetime): - return date_value - elif isinstance(date_value, float): - return datetime.fromtimestamp(date_value) - elif isinstance(date_value, int): - return datetime.fromtimestamp(date_value / 1000) - - try: - timestamp = float(date_value) - return datetime.fromtimestamp(timestamp) - except ValueError: - return parser.parse(date_value) diff --git a/unstructured/ingest/v2/processes/connectors/weaviate.py b/unstructured/ingest/v2/processes/connectors/weaviate.py deleted file mode 100644 index 67a6c024c..000000000 --- a/unstructured/ingest/v2/processes/connectors/weaviate.py +++ /dev/null @@ -1,232 +0,0 @@ -import json -from dataclasses import dataclass, field -from datetime import date, datetime -from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional - -from dateutil import parser - -from unstructured.ingest.enhanced_dataclass import enhanced_field -from unstructured.ingest.v2.interfaces import ( - AccessConfig, - ConnectionConfig, - FileData, - UploadContent, - Uploader, - UploaderConfig, - UploadStager, - UploadStagerConfig, -) -from unstructured.ingest.v2.logger import logger -from unstructured.ingest.v2.processes.connector_registry import ( - DestinationRegistryEntry, -) -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from weaviate import Client - -CONNECTOR_TYPE = "weaviate" - - -@dataclass -class WeaviateAccessConfig(AccessConfig): - access_token: Optional[str] = None - api_key: Optional[str] = None - client_secret: Optional[str] = None - password: Optional[str] = None - - -@dataclass -class WeaviateConnectionConfig(ConnectionConfig): - host_url: str - class_name: str - access_config: WeaviateAccessConfig = enhanced_field(sensitive=True) - username: Optional[str] = None - anonymous: bool = False - scope: Optional[list[str]] = None - refresh_token: Optional[str] = None - connector_type: str = CONNECTOR_TYPE - - -@dataclass -class WeaviateUploadStagerConfig(UploadStagerConfig): - pass - - -@dataclass -class WeaviateUploadStager(UploadStager): - upload_stager_config: WeaviateUploadStagerConfig = field( - default_factory=lambda: WeaviateUploadStagerConfig() - ) - - @staticmethod - def 
parse_date_string(date_string: str) -> date: - try: - timestamp = float(date_string) - return datetime.fromtimestamp(timestamp) - except Exception as e: - logger.debug(f"date {date_string} string not a timestamp: {e}") - return parser.parse(date_string) - - @classmethod - def conform_dict(cls, data: dict) -> None: - """ - Updates the element dictionary to conform to the Weaviate schema - """ - - # Dict as string formatting - if record_locator := data.get("metadata", {}).get("data_source", {}).get("record_locator"): - # Explicit casting otherwise fails schema type checking - data["metadata"]["data_source"]["record_locator"] = str(json.dumps(record_locator)) - - # Array of items as string formatting - if points := data.get("metadata", {}).get("coordinates", {}).get("points"): - data["metadata"]["coordinates"]["points"] = str(json.dumps(points)) - - if links := data.get("metadata", {}).get("links", {}): - data["metadata"]["links"] = str(json.dumps(links)) - - if permissions_data := ( - data.get("metadata", {}).get("data_source", {}).get("permissions_data") - ): - data["metadata"]["data_source"]["permissions_data"] = json.dumps(permissions_data) - - # Datetime formatting - if date_created := data.get("metadata", {}).get("data_source", {}).get("date_created"): - data["metadata"]["data_source"]["date_created"] = cls.parse_date_string( - date_created - ).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - if date_modified := data.get("metadata", {}).get("data_source", {}).get("date_modified"): - data["metadata"]["data_source"]["date_modified"] = cls.parse_date_string( - date_modified - ).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - if date_processed := data.get("metadata", {}).get("data_source", {}).get("date_processed"): - data["metadata"]["data_source"]["date_processed"] = cls.parse_date_string( - date_processed - ).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - if last_modified := data.get("metadata", {}).get("last_modified"): - data["metadata"]["last_modified"] = cls.parse_date_string(last_modified).strftime( - "%Y-%m-%dT%H:%M:%S.%fZ", - ) - - # String casting - if version := data.get("metadata", {}).get("data_source", {}).get("version"): - data["metadata"]["data_source"]["version"] = str(version) - - if page_number := data.get("metadata", {}).get("page_number"): - data["metadata"]["page_number"] = str(page_number) - - def run( - self, - elements_filepath: Path, - file_data: FileData, - output_dir: Path, - output_filename: str, - **kwargs: Any, - ) -> Path: - with open(elements_filepath) as elements_file: - elements_contents = json.load(elements_file) - for element in elements_contents: - self.conform_dict(data=element) - output_path = Path(output_dir) / Path(f"{output_filename}.json") - with open(output_path, "w") as output_file: - json.dump(elements_contents, output_file) - return output_path - - -@dataclass -class WeaviateUploaderConfig(UploaderConfig): - batch_size: int = 100 - - -@dataclass -class WeaviateUploader(Uploader): - upload_config: WeaviateUploaderConfig - connection_config: WeaviateConnectionConfig - client: Optional["Client"] = field(init=False) - connector_type: str = CONNECTOR_TYPE - - @requires_dependencies(["weaviate"], extras="weaviate") - def __post_init__(self): - from weaviate import Client - - auth = self._resolve_auth_method() - self.client = Client(url=self.connection_config.host_url, auth_client_secret=auth) - - @requires_dependencies(["weaviate"], extras="weaviate") - def _resolve_auth_method(self): - access_configs = self.connection_config.access_config - 
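        # Auth resolution order in the branches below: anonymous -> no auth;
        # access_token -> AuthBearerToken (optionally with a refresh_token);
        # api_key -> AuthApiKey; client_secret -> AuthClientCredentials (with scope);
        # username + password -> AuthClientPassword; otherwise fall back to None.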
-        connection_config = self.connection_config
-        if connection_config.anonymous:
-            return None
-
-        if access_configs.access_token:
-            from weaviate.auth import AuthBearerToken
-
-            return AuthBearerToken(
-                access_token=access_configs.access_token,
-                refresh_token=connection_config.refresh_token,
-            )
-        elif access_configs.api_key:
-            from weaviate.auth import AuthApiKey
-
-            return AuthApiKey(api_key=access_configs.api_key)
-        elif access_configs.client_secret:
-            from weaviate.auth import AuthClientCredentials
-
-            return AuthClientCredentials(
-                client_secret=access_configs.client_secret, scope=connection_config.scope
-            )
-        elif connection_config.username and access_configs.password:
-            from weaviate.auth import AuthClientPassword
-
-            return AuthClientPassword(
-                username=connection_config.username,
-                password=access_configs.password,
-                scope=connection_config.scope,
-            )
-        return None
-
-    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        # TODO update to use async support in weaviate client
-        # once the version can be bumped to include it
-        elements_dict = []
-        for content in contents:
-            with open(content.path) as elements_file:
-                elements = json.load(elements_file)
-                elements_dict.extend(elements)
-
-        logger.info(
-            f"writing {len(elements_dict)} objects to destination "
-            f"class {self.connection_config.class_name} "
-            f"at {self.connection_config.host_url}",
-        )
-
-        self.client.batch.configure(batch_size=self.upload_config.batch_size)
-        with self.client.batch as b:
-            for e in elements_dict:
-                vector = e.pop("embeddings", None)
-                b.add_data_object(
-                    e,
-                    self.connection_config.class_name,
-                    vector=vector,
-                )
-
-
-weaviate_destination_entry = DestinationRegistryEntry(
-    connection_config=WeaviateConnectionConfig,
-    uploader=WeaviateUploader,
-    uploader_config=WeaviateUploaderConfig,
-    upload_stager=WeaviateUploadStager,
-    upload_stager_config=WeaviateUploadStagerConfig,
-)
diff --git a/unstructured/ingest/v2/processes/embedder.py b/unstructured/ingest/v2/processes/embedder.py
deleted file mode 100644
index 6ed1c560c..000000000
--- a/unstructured/ingest/v2/processes/embedder.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from abc import ABC
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Optional
-
-from unstructured.documents.elements import Element
-from unstructured.embed.interfaces import BaseEmbeddingEncoder
-from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
-from unstructured.ingest.v2.interfaces.process import BaseProcess
-from unstructured.staging.base import elements_from_json
-
-
-@dataclass
-class EmbedderConfig(EnhancedDataClassJsonMixin):
-    embedding_provider: Optional[str] = None
-    embedding_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
-    embedding_model_name: Optional[str] = None
-    embedding_aws_access_key_id: Optional[str] = None
-    embedding_aws_secret_access_key: Optional[str] = None
-    embedding_aws_region: Optional[str] = None
-
-    def get_embedder(self) -> BaseEmbeddingEncoder:
-        kwargs: dict[str, Any] = {}
-        if self.embedding_api_key:
-            kwargs["api_key"] = self.embedding_api_key
-        if self.embedding_model_name:
-            kwargs["model_name"] = self.embedding_model_name
-        # TODO make this more dynamic to map to encoder configs
-        if self.embedding_provider == "langchain-openai":
-            from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
-
-            return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
"langchain-huggingface": - from unstructured.embed.huggingface import ( - HuggingFaceEmbeddingConfig, - HuggingFaceEmbeddingEncoder, - ) - - return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs)) - elif self.embedding_provider == "octoai": - from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder - - return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs)) - elif self.embedding_provider == "langchain-aws-bedrock": - from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder - - return BedrockEmbeddingEncoder( - config=BedrockEmbeddingConfig( - aws_access_key_id=self.embedding_aws_access_key_id, - aws_secret_access_key=self.embedding_aws_secret_access_key, - region_name=self.embedding_aws_region, - ) - ) - elif self.embedding_provider == "langchain-vertexai": - from unstructured.embed.vertexai import ( - VertexAIEmbeddingConfig, - VertexAIEmbeddingEncoder, - ) - - return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs)) - elif self.embedding_provider == "mixedbread-ai": - from unstructured.embed.mixedbreadai import ( - MixedbreadAIEmbeddingConfig, - MixedbreadAIEmbeddingEncoder, - ) - - return MixedbreadAIEmbeddingEncoder(config=MixedbreadAIEmbeddingConfig(**kwargs)) - else: - raise ValueError(f"{self.embedding_provider} not a recognized encoder") - - -@dataclass -class Embedder(BaseProcess, ABC): - config: EmbedderConfig - - def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]: - # TODO update base embedder classes to support async - embedder = self.config.get_embedder() - elements = elements_from_json(filename=str(elements_filepath)) - if not elements: - return elements - return embedder.embed_documents(elements=elements) diff --git a/unstructured/ingest/v2/processes/partitioner.py b/unstructured/ingest/v2/processes/partitioner.py deleted file mode 100644 index 71bcd5700..000000000 --- a/unstructured/ingest/v2/processes/partitioner.py +++ /dev/null @@ -1,165 +0,0 @@ -import asyncio -from abc import ABC -from dataclasses import dataclass, field, fields -from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin -from unstructured.ingest.enhanced_dataclass.dataclasses import enhanced_field -from unstructured.ingest.v2.interfaces.process import BaseProcess -from unstructured.ingest.v2.logger import logger -from unstructured.staging.base import elements_to_dicts, flatten_dict - -if TYPE_CHECKING: - from unstructured_client import UnstructuredClient - from unstructured_client.models.shared import PartitionParameters - - -@dataclass -class PartitionerConfig(EnhancedDataClassJsonMixin): - strategy: str = "auto" - ocr_languages: Optional[list[str]] = None - encoding: Optional[str] = None - additional_partition_args: Optional[dict[str, Any]] = None - skip_infer_table_types: Optional[list[str]] = None - fields_include: list[str] = field( - default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"], - ) - flatten_metadata: bool = False - metadata_exclude: list[str] = field(default_factory=list) - metadata_include: list[str] = field(default_factory=list) - partition_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general" - partition_by_api: bool = False - api_key: Optional[str] = enhanced_field(default=None, sensitive=True) - hi_res_model_name: Optional[str] = None - - def 
-    def __post_init__(self):
-        if self.metadata_exclude and self.metadata_include:
-            raise ValueError(
-                "metadata_exclude and metadata_include are "
-                "mutually exclusive with each other. Cannot specify both."
-            )
-
-    def to_partition_kwargs(self) -> dict[str, Any]:
-        partition_kwargs: dict[str, Any] = {
-            "strategy": self.strategy,
-            "languages": self.ocr_languages,
-            "hi_res_model_name": self.hi_res_model_name,
-            "skip_infer_table_types": self.skip_infer_table_types,
-        }
-        # Don't inject information if None and allow default values in method to be used
-        partition_kwargs = {k: v for k, v in partition_kwargs.items() if v is not None}
-        if self.additional_partition_args:
-            partition_kwargs.update(self.additional_partition_args)
-        return partition_kwargs
-
-
-@dataclass
-class Partitioner(BaseProcess, ABC):
-    config: PartitionerConfig
-
-    def is_async(self) -> bool:
-        return self.config.partition_by_api
-
-    def postprocess(self, elements: list[dict]) -> list[dict]:
-        element_dicts = [e.copy() for e in elements]
-        for elem in element_dicts:
-            if self.config.metadata_exclude:
-                ex_list = self.config.metadata_exclude
-                for ex in ex_list:
-                    if "." in ex:  # handle nested fields
-                        nested_fields = ex.split(".")
-                        current_elem = elem
-                        for f in nested_fields[:-1]:
-                            if f in current_elem:
-                                current_elem = current_elem[f]
-                        field_to_exclude = nested_fields[-1]
-                        if field_to_exclude in current_elem:
-                            current_elem.pop(field_to_exclude, None)
-                    else:  # handle top-level fields
-                        elem["metadata"].pop(ex, None)  # type: ignore[attr-defined]
-            elif self.config.metadata_include:
-                in_list = self.config.metadata_include
-                for k in list(elem["metadata"].keys()):  # type: ignore[attr-defined]
-                    if k not in in_list:
-                        elem["metadata"].pop(k, None)  # type: ignore[attr-defined]
-            in_list = self.config.fields_include
-            elem = {k: v for k, v in elem.items() if k in in_list}
-
-            if self.config.flatten_metadata and "metadata" in elem:
-                metadata = elem.pop("metadata")
-                elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
-        return element_dicts
-
-    def partition_locally(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
-    ) -> list[dict]:
-        from unstructured.partition.auto import partition
-
-        logger.debug(f"Using local partition with kwargs: {self.config.to_partition_kwargs()}")
-        logger.debug(f"partitioning file {filename} with metadata {metadata.to_dict()}")
-        elements = partition(
-            filename=str(filename.resolve()),
-            data_source_metadata=metadata,
-            **self.config.to_partition_kwargs(),
-        )
-        return self.postprocess(elements=elements_to_dicts(elements))
-
-    async def call_api(self, client: "UnstructuredClient", request: "PartitionParameters"):
-        # TODO when client supports async, run without using run_in_executor
-        # isolate the IO heavy call
-        loop = asyncio.get_event_loop()
-        return await loop.run_in_executor(None, client.general.partition, request)
-
-    def create_partition_parameters(self, filename: Path) -> "PartitionParameters":
-        from unstructured_client.models.shared import Files, PartitionParameters
-
-        partition_request = self.config.to_partition_kwargs()
-        possible_fields = [f.name for f in fields(PartitionParameters)]
-        filtered_partition_request = {
-            k: v for k, v in partition_request.items() if k in possible_fields
-        }
-        if len(filtered_partition_request) != len(partition_request):
-            logger.debug(
-                "Following fields were omitted due to not being "
-                "supported by the currently used unstructured client: {}".format(
-                    ", ".join([v for v in partition_request if v not in filtered_partition_request])
-                )
-            )
-        logger.debug(f"Using hosted partitioner with kwargs: {partition_request}")
-        with open(filename, "rb") as f:
-            files = Files(
-                content=f.read(),
-                file_name=str(filename.resolve()),
-            )
-        filtered_partition_request["files"] = files
-        partition_params = PartitionParameters(**filtered_partition_request)
-        return partition_params
-
-    async def partition_via_api(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
-    ) -> list[dict]:
-        from unstructured_client import UnstructuredClient
-
-        logger.debug(f"partitioning file {filename} with metadata: {metadata.to_dict()}")
-        client = UnstructuredClient(
-            server_url=self.config.partition_endpoint, api_key_auth=self.config.api_key
-        )
-        partition_params = self.create_partition_parameters(filename=filename)
-        resp = await self.call_api(client=client, request=partition_params)
-        elements = resp.elements or []
-        # Append the data source metadata the auto partition does for you
-        for element in elements:
-            element["metadata"]["data_source"] = metadata.to_dict()
-        return self.postprocess(elements=elements)
-
-    def run(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
-    ) -> list[dict]:
-        return self.partition_locally(filename, metadata=metadata, **kwargs)
-
-    async def run_async(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
-    ) -> list[dict]:
-        return await self.partition_via_api(filename, metadata=metadata, **kwargs)
diff --git a/unstructured/ingest/v2/processes/uncompress.py b/unstructured/ingest/v2/processes/uncompress.py
deleted file mode 100644
index e0b826461..000000000
--- a/unstructured/ingest/v2/processes/uncompress.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from abc import ABC
-from copy import copy
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any
-
-from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
-from unstructured.ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
-from unstructured.ingest.v2.interfaces import FileData
-from unstructured.ingest.v2.interfaces.process import BaseProcess
-
-
-@dataclass
-class UncompressConfig(EnhancedDataClassJsonMixin):
-    pass
-
-
-@dataclass
-class Uncompressor(BaseProcess, ABC):
-    config: UncompressConfig = field(default_factory=UncompressConfig)
-
-    def is_async(self) -> bool:
-        return True
-
-    def run(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
-        local_filepath = Path(file_data.source_identifiers.fullpath)
-        if local_filepath.suffix not in TAR_FILE_EXT + ZIP_FILE_EXT:
-            return [file_data]
-        new_path = uncompress_file(filename=str(local_filepath))
-        new_files = [i for i in Path(new_path).rglob("*") if i.is_file()]
-        responses = []
-        for f in new_files:
-            new_file_data = copy(file_data)
-            new_file_data.source_identifiers.fullpath = str(f)
-            if new_file_data.source_identifiers.rel_path:
-                new_file_data.source_identifiers.rel_path = str(f).replace(
-                    str(local_filepath.parent), ""
-                )[1:]
-            responses.append(new_file_data)
-        return responses
-
-    async def run_async(self, file_data: FileData, **kwargs: Any) -> list[FileData]:
-        return self.run(file_data=file_data, **kwargs)
diff --git a/unstructured/utils.py b/unstructured/utils.py
index 03632e37a..523fcd4a0 100644
--- a/unstructured/utils.py
+++ b/unstructured/utils.py
@@ -10,7 +10,6 @@ import platform
 import subprocess
 import tempfile
 import threading
-from datetime import datetime
 from functools import wraps
 from itertools import combinations
 from typing import (
@@ -238,36 +237,6 @@ def dependency_exists(dependency: str):
     return True
 
 
-def validate_date_args(date: Optional[str] = None) -> bool:
-    """Validate whether the provided date string satisfies any of the supported date formats.
-
-    Used by unstructured/ingest/connector/biomed.py
-
-    Returns `True` if the date string satisfies any of the supported formats, otherwise raises
-    `ValueError`.
-
-    Supported Date Formats:
-        - 'YYYY-MM-DD'
-        - 'YYYY-MM-DDTHH:MM:SS'
-        - 'YYYY-MM-DD+HH:MM:SS'
-        - 'YYYY-MM-DDTHH:MM:SS±HHMM'
-    """
-    if not date:
-        raise ValueError("The argument date is None.")
-
-    for format in DATE_FORMATS:
-        try:
-            datetime.strptime(date, format)
-            return True
-        except ValueError:
-            pass
-
-    raise ValueError(
-        f"The argument {date} does not satisfy the format:"
-        f" YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SS±HHMM",
-    )
-
-
 def _first_and_remaining_iterator(it: Iterable[_T]) -> Tuple[_T, Iterator[_T]]:
     iterator = iter(it)
     try: