From ad69bdcd4e457c5cb61a0b7cabe4a4cf22daa7ea Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 13 Jun 2024 11:59:34 -0400 Subject: [PATCH] build(deps): deltalake bump to `0.18.x` (#3197) ### Summary Closes #3173. Removes the `overwrite_schema` kwarg from the Delta Table connector and bumps the `deltalake` version. Per [this PR](https://github.com/delta-io/delta-rs/pull/2554) in the `deltalake` repo, the `overwrite_schema` kwarg is deprecated as of version `0.18.0`. Users can specify `schema_mode="merge"` to obtain the same behavior. - `schema_mode="merge"` is equivalent to `overwrite_schema=False` - `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` Also adds an `engine` parameter that you can use to set `"rust"` or `"pyarrow"` as the engine. `engine` defaults to `"pyarrow"` and `schema_mode` defaults to `None`, which is consistent with the behavior in `deltalake` documented [here](https://delta-io.github.io/delta-rs/api/delta_writer/). ### Testing The Delta Table ingest tests should pass on this PR. 
--------- Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com> --- CHANGELOG.md | 5 +++-- Dockerfile-amd64 | 12 ++++++------ requirements/dev.txt | 2 +- requirements/extra-pdf-image.txt | 2 +- requirements/ingest/airtable.txt | 2 +- requirements/ingest/azure.txt | 4 ++-- requirements/ingest/chroma.txt | 4 ++-- requirements/ingest/clarifai.txt | 2 +- requirements/ingest/delta-table.in | 2 +- requirements/ingest/delta-table.txt | 2 +- requirements/ingest/embed-aws-bedrock.txt | 6 +++--- requirements/ingest/embed-huggingface.txt | 6 +++--- requirements/ingest/embed-octoai.txt | 2 +- requirements/ingest/embed-openai.txt | 6 +++--- requirements/ingest/embed-vertexai.txt | 12 ++++++------ requirements/ingest/embed-voyageai.txt | 6 +++--- requirements/ingest/gcs.txt | 4 ++-- requirements/ingest/google-drive.txt | 2 +- requirements/ingest/onedrive.txt | 2 +- requirements/ingest/outlook.txt | 2 +- requirements/ingest/qdrant.txt | 2 +- requirements/ingest/sharepoint.txt | 2 +- requirements/ingest/slack.txt | 2 +- requirements/ingest/weaviate.txt | 2 +- requirements/test.txt | 2 +- scripts/docker-dl-packages.sh | 1 + unstructured/__version__.py | 2 +- unstructured/ingest/connector/delta_table.py | 18 +++++++++++------- 28 files changed, 61 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d1785bf9d..ed5880cc8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,13 @@ -## 0.14.6-dev3 +## 0.14.6-dev4 ### Enhancements ### Features ### Fixes -* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api` +* **Remove deprecated `overwrite_schema` kwarg from Delta Table connector.**. The `overwrite_schema` kwarg is deprecated in `deltalake>=0.18.0`. `schema_mode=` should be used now instead. 
`schema_mode="overwrite"` is equivalent to `overwrite_schema=True` and `schema_mode="merge"` is equivalent to `overwrite_schema=False`. `schema_mode` defaults to `None`. You can also now specify `engine`, which defaults to `"pyarrow"`. You need to specify `engine="rust"` to use `"schema_mode"`. +* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api` * **table metric bug fix** get_element_level_alignment()now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string. * **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it. diff --git a/Dockerfile-amd64 b/Dockerfile-amd64 index ac346b827..f2fc3c675 100644 --- a/Dockerfile-amd64 +++ b/Dockerfile-amd64 @@ -15,16 +15,16 @@ RUN apk update && apk add py3.11-pip mesa-gl glib cmake && \ apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \ apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \ apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \ - apk add libreoffice && \ + apk add --allow-untrusted packages/libreoffice-7.6.5-r0.apk && \ apk add bash && \ apk add libmagic && \ mv /share/tessdata/configs /usr/local/share/tessdata/ && \ mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \ - ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \ - ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \ - chmod +x /usr/lib/libreoffice/program/soffice.bin && \ - chmod +x /usr/bin/libreoffice && \ - chmod +x /usr/bin/soffice + ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/libreoffice && \ + ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/soffice && \ + chmod +x 
/usr/local/lib/libreoffice/program/soffice.bin && \ + chmod +x /usr/local/bin/libreoffice && \ + chmod +x /usr/local/bin/soffice RUN chown -R nonroot:nonroot /app diff --git a/requirements/dev.txt b/requirements/dev.txt index 1bfb39ab4..84a541622 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -137,7 +137,7 @@ jinja2==3.1.4 # nbconvert json5==0.9.25 # via jupyterlab-server -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonschema jsonschema[format-nongpl]==4.22.0 # via diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index cab0f89bb..ef1d4852e 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -150,7 +150,7 @@ pdfminer-six==20231228 # via # -r ./extra-pdf-image.in # pdfplumber -pdfplumber==0.11.0 +pdfplumber==0.11.1 # via layoutparser pikepdf==9.0.0 # via -r ./extra-pdf-image.in diff --git a/requirements/ingest/airtable.txt b/requirements/ingest/airtable.txt index 51083d818..4d01845b6 100644 --- a/requirements/ingest/airtable.txt +++ b/requirements/ingest/airtable.txt @@ -23,7 +23,7 @@ inflection==0.5.1 # via pyairtable pyairtable==2.3.3 # via -r ./ingest/airtable.in -pydantic==2.7.3 +pydantic==2.7.4 # via pyairtable pydantic-core==2.18.4 # via pydantic diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt index 4c50eb50e..19af30fb2 100644 --- a/requirements/ingest/azure.txt +++ b/requirements/ingest/azure.txt @@ -21,7 +21,7 @@ azure-core==1.30.2 # azure-storage-blob azure-datalake-store==0.0.53 # via adlfs -azure-identity==1.16.0 +azure-identity==1.16.1 # via adlfs azure-storage-blob==12.20.0 # via adlfs @@ -60,7 +60,7 @@ idna==3.7 # yarl isodate==0.6.1 # via azure-storage-blob -msal==1.28.0 +msal==1.28.1 # via # azure-datalake-store # azure-identity diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index ba2490f75..cd6219723 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -147,7 +147,7 @@ 
opentelemetry-util-http==0.46b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi -orjson==3.10.3 +orjson==3.10.4 # via chromadb overrides==7.7.0 # via chromadb @@ -172,7 +172,7 @@ pyasn1==0.6.0 # rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.3 +pydantic==2.7.4 # via # chromadb # fastapi diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt index e15fa09c6..fa3b83b06 100644 --- a/requirements/ingest/clarifai.txt +++ b/requirements/ingest/clarifai.txt @@ -15,7 +15,7 @@ charset-normalizer==3.3.2 # requests clarifai==10.5.0 # via -r ./ingest/clarifai.in -clarifai-grpc==10.5.0 +clarifai-grpc==10.5.1 # via clarifai contextlib2==21.6.0 # via schema diff --git a/requirements/ingest/delta-table.in b/requirements/ingest/delta-table.in index 7cbe7cb08..a3f7b82b2 100644 --- a/requirements/ingest/delta-table.in +++ b/requirements/ingest/delta-table.in @@ -1,4 +1,4 @@ -c ../deps/constraints.txt -c ../base.txt -deltalake<0.18.0 +deltalake fsspec diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt index 4a2f630f9..58feaae73 100644 --- a/requirements/ingest/delta-table.txt +++ b/requirements/ingest/delta-table.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/delta-table.in # -deltalake==0.17.4 +deltalake==0.18.1 # via -r ./ingest/delta-table.in fsspec==2024.5.0 # via diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index b705688d6..0f9316e59 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -53,7 +53,7 @@ jmespath==1.0.1 # botocore jsonpatch==1.33 # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch langchain==0.2.3 # via langchain-community @@ -90,7 +90,7 @@ numpy==1.26.4 # -c ./ingest/../deps/constraints.txt # langchain # langchain-community -orjson==3.10.3 +orjson==3.10.4 # via langsmith packaging==23.2 # via @@ -98,7 +98,7 @@ packaging==23.2 # -c 
./ingest/../deps/constraints.txt # langchain-core # marshmallow -pydantic==2.7.3 +pydantic==2.7.4 # via # langchain # langchain-core diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 12d7dd88e..44c654cf4 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -65,7 +65,7 @@ joblib==1.4.2 # scikit-learn jsonpatch==1.33 # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch langchain==0.2.3 # via langchain-community @@ -112,7 +112,7 @@ numpy==1.26.4 # scipy # sentence-transformers # transformers -orjson==3.10.3 +orjson==3.10.4 # via langsmith packaging==23.2 # via @@ -124,7 +124,7 @@ packaging==23.2 # transformers pillow==10.3.0 # via sentence-transformers -pydantic==2.7.3 +pydantic==2.7.4 # via # langchain # langchain-core diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index d9fcad949..220b9411a 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -40,7 +40,7 @@ idna==3.7 # requests openai==1.33.0 # via -r ./ingest/embed-octoai.in -pydantic==2.7.3 +pydantic==2.7.4 # via openai pydantic-core==2.18.4 # via pydantic diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 4d6236eec..d0127e634 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -61,7 +61,7 @@ idna==3.7 # yarl jsonpatch==1.33 # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch langchain==0.2.3 # via langchain-community @@ -100,7 +100,7 @@ numpy==1.26.4 # langchain-community openai==1.33.0 # via -r ./ingest/embed-openai.in -orjson==3.10.3 +orjson==3.10.4 # via langsmith packaging==23.2 # via @@ -108,7 +108,7 @@ packaging==23.2 # -c ./ingest/../deps/constraints.txt # langchain-core # marshmallow -pydantic==2.7.3 +pydantic==2.7.4 # via # langchain # langchain-core diff --git 
a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index 049098f42..ab27a885e 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -54,7 +54,7 @@ google-auth==2.30.0 # google-cloud-core # google-cloud-resource-manager # google-cloud-storage -google-cloud-aiplatform==1.54.1 +google-cloud-aiplatform==1.55.0 # via langchain-google-vertexai google-cloud-bigquery==3.24.0 # via google-cloud-aiplatform @@ -64,7 +64,7 @@ google-cloud-core==2.4.1 # google-cloud-storage google-cloud-resource-manager==1.12.3 # via google-cloud-aiplatform -google-cloud-storage==2.16.0 +google-cloud-storage==2.17.0 # via # google-cloud-aiplatform # langchain-google-vertexai @@ -72,7 +72,7 @@ google-crc32c==1.5.0 # via # google-cloud-storage # google-resumable-media -google-resumable-media==2.7.0 +google-resumable-media==2.7.1 # via # google-cloud-bigquery # google-cloud-storage @@ -98,7 +98,7 @@ idna==3.7 # yarl jsonpatch==1.33 # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch langchain==0.2.3 # via @@ -141,7 +141,7 @@ numpy==1.26.4 # langchain # langchain-community # shapely -orjson==3.10.3 +orjson==3.10.4 # via langsmith packaging==23.2 # via @@ -172,7 +172,7 @@ pyasn1==0.6.0 # rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.3 +pydantic==2.7.4 # via # google-cloud-aiplatform # langchain diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index 637dcf73a..d09ae8b72 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -40,7 +40,7 @@ idna==3.7 # yarl jsonpatch==1.33 # via langchain-core -jsonpointer==2.4 +jsonpointer==3.0.0 # via jsonpatch langchain==0.2.3 # via -r ./ingest/embed-voyageai.in @@ -68,14 +68,14 @@ numpy==1.26.4 # -c ./ingest/../deps/constraints.txt # langchain # voyageai -orjson==3.10.3 +orjson==3.10.4 # via langsmith packaging==23.2 # via # -c ./ingest/../base.txt # -c 
./ingest/../deps/constraints.txt # langchain-core -pydantic==2.7.3 +pydantic==2.7.4 # via # langchain # langchain-core diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt index 3e4a97c27..fcab812e3 100644 --- a/requirements/ingest/gcs.txt +++ b/requirements/ingest/gcs.txt @@ -57,13 +57,13 @@ google-auth-oauthlib==1.2.0 # via gcsfs google-cloud-core==2.4.1 # via google-cloud-storage -google-cloud-storage==2.16.0 +google-cloud-storage==2.17.0 # via gcsfs google-crc32c==1.5.0 # via # google-cloud-storage # google-resumable-media -google-resumable-media==2.7.0 +google-resumable-media==2.7.1 # via google-cloud-storage googleapis-common-protos==1.63.1 # via google-api-core diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt index ed0060211..26ac35785 100644 --- a/requirements/ingest/google-drive.txt +++ b/requirements/ingest/google-drive.txt @@ -17,7 +17,7 @@ charset-normalizer==3.3.2 # requests google-api-core==2.19.0 # via google-api-python-client -google-api-python-client==2.132.0 +google-api-python-client==2.133.0 # via -r ./ingest/google-drive.in google-auth==2.30.0 # via diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt index 394f8864f..36ca6171e 100644 --- a/requirements/ingest/onedrive.txt +++ b/requirements/ingest/onedrive.txt @@ -29,7 +29,7 @@ idna==3.7 # via # -c ./ingest/../base.txt # requests -msal==1.28.0 +msal==1.28.1 # via # -r ./ingest/onedrive.in # office365-rest-python-client diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt index 8b407de46..ef4ab87e9 100644 --- a/requirements/ingest/outlook.txt +++ b/requirements/ingest/outlook.txt @@ -23,7 +23,7 @@ idna==3.7 # via # -c ./ingest/../base.txt # requests -msal==1.28.0 +msal==1.28.1 # via # -r ./ingest/outlook.in # office365-rest-python-client diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index bdaf55f4d..d15d6d4dd 100644 --- a/requirements/ingest/qdrant.txt 
+++ b/requirements/ingest/qdrant.txt @@ -52,7 +52,7 @@ protobuf==4.23.4 # via # -c ./ingest/../deps/constraints.txt # grpcio-tools -pydantic==2.7.3 +pydantic==2.7.4 # via qdrant-client pydantic-core==2.18.4 # via pydantic diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt index dcee66a97..ce2d291ee 100644 --- a/requirements/ingest/sharepoint.txt +++ b/requirements/ingest/sharepoint.txt @@ -23,7 +23,7 @@ idna==3.7 # via # -c ./ingest/../base.txt # requests -msal==1.28.0 +msal==1.28.1 # via # -r ./ingest/sharepoint.in # office365-rest-python-client diff --git a/requirements/ingest/slack.txt b/requirements/ingest/slack.txt index db20507da..03a744153 100644 --- a/requirements/ingest/slack.txt +++ b/requirements/ingest/slack.txt @@ -4,5 +4,5 @@ # # pip-compile ./ingest/slack.in # -slack-sdk==3.27.2 +slack-sdk==3.28.0 # via -r ./ingest/slack.in diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index 016c7ac56..92da7ef20 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -57,7 +57,7 @@ protobuf==4.23.4 # grpcio-tools pycparser==2.22 # via cffi -pydantic==2.7.3 +pydantic==2.7.4 # via weaviate-client pydantic-core==2.18.4 # via pydantic diff --git a/requirements/test.txt b/requirements/test.txt index 32d107607..23b8b0a4c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -96,7 +96,7 @@ pycodestyle==2.11.1 # via # flake8 # flake8-print -pydantic==2.7.3 +pydantic==2.7.4 # via # -r ./test.in # label-studio-sdk diff --git a/scripts/docker-dl-packages.sh b/scripts/docker-dl-packages.sh index 8d5c6e5a4..b50b400e2 100755 --- a/scripts/docker-dl-packages.sh +++ b/scripts/docker-dl-packages.sh @@ -2,6 +2,7 @@ files=( "libreoffice-7.6.5-r0.apk" + "libreoffice-24-24.2.4.1-r0.67f8e014.apk" "openjpeg-2.5.0-r0.apk" "poppler-23.09.0-r0.apk" "leptonica-1.83.0-r0.apk" diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 01fc89e97..042f03fd9 100644 --- 
a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.6-dev3" # pragma: no cover +__version__ = "0.14.6-dev4" # pragma: no cover diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py index c4640b76c..1382ed05d 100644 --- a/unstructured/ingest/connector/delta_table.py +++ b/unstructured/ingest/connector/delta_table.py @@ -152,8 +152,9 @@ class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector @dataclass class DeltaTableWriteConfig(WriteConfig): drop_empty_cols: bool = False - overwrite_schema: bool = False mode: t.Literal["error", "append", "overwrite", "ignore"] = "error" + schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None + engine: t.Literal["pyarrow", "rust"] = "pyarrow" @dataclass @@ -182,18 +183,21 @@ class DeltaTableDestinationConnector(BaseDestinationConnector): f"writing {len(df)} rows to destination table " f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}", ) + writer_kwargs = { + "table_or_uri": self.connector_config.table_uri, + "data": df, + "mode": self.write_config.mode, + "engine": self.write_config.engine, + } + if self.write_config.schema_mode is not None: + writer_kwargs["schema_mode"] = self.write_config.schema_mode # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause # ingest to fail, even though all tasks are completed normally. Putting the writer into a # process mitigates this issue by ensuring python interpreter waits properly for deltalake's # rust backend to finish writer = Process( target=write_deltalake, - kwargs={ - "table_or_uri": self.connector_config.table_uri, - "data": df, - "mode": self.write_config.mode, - "overwrite_schema": self.write_config.overwrite_schema, - }, + kwargs=writer_kwargs, ) writer.start() writer.join()