build(deps): deltalake bump to 0.18.x (#3197)

### Summary

Closes #3173. Removes the `overwrite_schema` kwarg from the Delta Table
connector and bumps the `deltalake` version. Per [this
PR](https://github.com/delta-io/delta-rs/pull/2554) in the `deltalake`
repo, the `overwrite_schema` kwarg is deprecated as of version `0.18.0`.
Users should pass `schema_mode` instead; `schema_mode="merge"` reproduces
the previous default behavior (`overwrite_schema=False`).

- `schema_mode="merge"` is equivalent to `overwrite_schema=False`
- `schema_mode="overwrite"` is equivalent to `overwrite_schema=True`

Also adds an `engine` parameter that you can use to set `"rust"` or
`"pyarrow"` as the engine. `engine` defaults to `"pyarrow"` and
`schema_mode` defaults to `None`, which is consistent with the behavior
in `deltalake` documented
[here](https://delta-io.github.io/delta-rs/api/delta_writer/).

### Testing

The Delta Table ingest tests should pass on this PR.

---------

Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
This commit is contained in:
Matt Robinson 2024-06-13 11:59:34 -04:00 committed by GitHub
parent 5f582f1716
commit ad69bdcd4e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
28 changed files with 61 additions and 55 deletions

View File

@ -1,12 +1,13 @@
## 0.14.6-dev3 ## 0.14.6-dev4
### Enhancements ### Enhancements
### Features ### Features
### Fixes ### Fixes
* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api`
* **Remove deprecated `overwrite_schema` kwarg from Delta Table connector.** The `overwrite_schema` kwarg is deprecated in `deltalake>=0.18.0`. `schema_mode=` should be used now instead. `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` and `schema_mode="merge"` is equivalent to `overwrite_schema=False`. `schema_mode` defaults to `None`. You can also now specify `engine`, which defaults to `"pyarrow"`. You need to specify `engine="rust"` to use `schema_mode`.
* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api`
* **table metric bug fix** get_element_level_alignment() now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string. * **table metric bug fix** get_element_level_alignment() now will find all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it. * **fsspec connector path/permissions bug** V2 fsspec connectors were failing when defined relative filepaths had leading slash. This strips that slash to guarantee the relative path never has it.

View File

@ -15,16 +15,16 @@ RUN apk update && apk add py3.11-pip mesa-gl glib cmake && \
apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \ apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \
apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \ apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \
apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \ apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \
apk add libreoffice && \ apk add --allow-untrusted packages/libreoffice-7.6.5-r0.apk && \
apk add bash && \ apk add bash && \
apk add libmagic && \ apk add libmagic && \
mv /share/tessdata/configs /usr/local/share/tessdata/ && \ mv /share/tessdata/configs /usr/local/share/tessdata/ && \
mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \ mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \ ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/libreoffice && \
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \ ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/soffice && \
chmod +x /usr/lib/libreoffice/program/soffice.bin && \ chmod +x /usr/local/lib/libreoffice/program/soffice.bin && \
chmod +x /usr/bin/libreoffice && \ chmod +x /usr/local/bin/libreoffice && \
chmod +x /usr/bin/soffice chmod +x /usr/local/bin/soffice
RUN chown -R nonroot:nonroot /app RUN chown -R nonroot:nonroot /app

View File

@ -137,7 +137,7 @@ jinja2==3.1.4
# nbconvert # nbconvert
json5==0.9.25 json5==0.9.25
# via jupyterlab-server # via jupyterlab-server
jsonpointer==2.4 jsonpointer==3.0.0
# via jsonschema # via jsonschema
jsonschema[format-nongpl]==4.22.0 jsonschema[format-nongpl]==4.22.0
# via # via

View File

@ -150,7 +150,7 @@ pdfminer-six==20231228
# via # via
# -r ./extra-pdf-image.in # -r ./extra-pdf-image.in
# pdfplumber # pdfplumber
pdfplumber==0.11.0 pdfplumber==0.11.1
# via layoutparser # via layoutparser
pikepdf==9.0.0 pikepdf==9.0.0
# via -r ./extra-pdf-image.in # via -r ./extra-pdf-image.in

View File

@ -23,7 +23,7 @@ inflection==0.5.1
# via pyairtable # via pyairtable
pyairtable==2.3.3 pyairtable==2.3.3
# via -r ./ingest/airtable.in # via -r ./ingest/airtable.in
pydantic==2.7.3 pydantic==2.7.4
# via pyairtable # via pyairtable
pydantic-core==2.18.4 pydantic-core==2.18.4
# via pydantic # via pydantic

View File

@ -21,7 +21,7 @@ azure-core==1.30.2
# azure-storage-blob # azure-storage-blob
azure-datalake-store==0.0.53 azure-datalake-store==0.0.53
# via adlfs # via adlfs
azure-identity==1.16.0 azure-identity==1.16.1
# via adlfs # via adlfs
azure-storage-blob==12.20.0 azure-storage-blob==12.20.0
# via adlfs # via adlfs
@ -60,7 +60,7 @@ idna==3.7
# yarl # yarl
isodate==0.6.1 isodate==0.6.1
# via azure-storage-blob # via azure-storage-blob
msal==1.28.0 msal==1.28.1
# via # via
# azure-datalake-store # azure-datalake-store
# azure-identity # azure-identity

View File

@ -147,7 +147,7 @@ opentelemetry-util-http==0.46b0
# via # via
# opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asgi
# opentelemetry-instrumentation-fastapi # opentelemetry-instrumentation-fastapi
orjson==3.10.3 orjson==3.10.4
# via chromadb # via chromadb
overrides==7.7.0 overrides==7.7.0
# via chromadb # via chromadb
@ -172,7 +172,7 @@ pyasn1==0.6.0
# rsa # rsa
pyasn1-modules==0.4.0 pyasn1-modules==0.4.0
# via google-auth # via google-auth
pydantic==2.7.3 pydantic==2.7.4
# via # via
# chromadb # chromadb
# fastapi # fastapi

View File

@ -15,7 +15,7 @@ charset-normalizer==3.3.2
# requests # requests
clarifai==10.5.0 clarifai==10.5.0
# via -r ./ingest/clarifai.in # via -r ./ingest/clarifai.in
clarifai-grpc==10.5.0 clarifai-grpc==10.5.1
# via clarifai # via clarifai
contextlib2==21.6.0 contextlib2==21.6.0
# via schema # via schema

View File

@ -1,4 +1,4 @@
-c ../deps/constraints.txt -c ../deps/constraints.txt
-c ../base.txt -c ../base.txt
deltalake<0.18.0 deltalake
fsspec fsspec

View File

@ -4,7 +4,7 @@
# #
# pip-compile ./ingest/delta-table.in # pip-compile ./ingest/delta-table.in
# #
deltalake==0.17.4 deltalake==0.18.1
# via -r ./ingest/delta-table.in # via -r ./ingest/delta-table.in
fsspec==2024.5.0 fsspec==2024.5.0
# via # via

View File

@ -53,7 +53,7 @@ jmespath==1.0.1
# botocore # botocore
jsonpatch==1.33 jsonpatch==1.33
# via langchain-core # via langchain-core
jsonpointer==2.4 jsonpointer==3.0.0
# via jsonpatch # via jsonpatch
langchain==0.2.3 langchain==0.2.3
# via langchain-community # via langchain-community
@ -90,7 +90,7 @@ numpy==1.26.4
# -c ./ingest/../deps/constraints.txt # -c ./ingest/../deps/constraints.txt
# langchain # langchain
# langchain-community # langchain-community
orjson==3.10.3 orjson==3.10.4
# via langsmith # via langsmith
packaging==23.2 packaging==23.2
# via # via
@ -98,7 +98,7 @@ packaging==23.2
# -c ./ingest/../deps/constraints.txt # -c ./ingest/../deps/constraints.txt
# langchain-core # langchain-core
# marshmallow # marshmallow
pydantic==2.7.3 pydantic==2.7.4
# via # via
# langchain # langchain
# langchain-core # langchain-core

View File

@ -65,7 +65,7 @@ joblib==1.4.2
# scikit-learn # scikit-learn
jsonpatch==1.33 jsonpatch==1.33
# via langchain-core # via langchain-core
jsonpointer==2.4 jsonpointer==3.0.0
# via jsonpatch # via jsonpatch
langchain==0.2.3 langchain==0.2.3
# via langchain-community # via langchain-community
@ -112,7 +112,7 @@ numpy==1.26.4
# scipy # scipy
# sentence-transformers # sentence-transformers
# transformers # transformers
orjson==3.10.3 orjson==3.10.4
# via langsmith # via langsmith
packaging==23.2 packaging==23.2
# via # via
@ -124,7 +124,7 @@ packaging==23.2
# transformers # transformers
pillow==10.3.0 pillow==10.3.0
# via sentence-transformers # via sentence-transformers
pydantic==2.7.3 pydantic==2.7.4
# via # via
# langchain # langchain
# langchain-core # langchain-core

View File

@ -40,7 +40,7 @@ idna==3.7
# requests # requests
openai==1.33.0 openai==1.33.0
# via -r ./ingest/embed-octoai.in # via -r ./ingest/embed-octoai.in
pydantic==2.7.3 pydantic==2.7.4
# via openai # via openai
pydantic-core==2.18.4 pydantic-core==2.18.4
# via pydantic # via pydantic

View File

@ -61,7 +61,7 @@ idna==3.7
# yarl # yarl
jsonpatch==1.33 jsonpatch==1.33
# via langchain-core # via langchain-core
jsonpointer==2.4 jsonpointer==3.0.0
# via jsonpatch # via jsonpatch
langchain==0.2.3 langchain==0.2.3
# via langchain-community # via langchain-community
@ -100,7 +100,7 @@ numpy==1.26.4
# langchain-community # langchain-community
openai==1.33.0 openai==1.33.0
# via -r ./ingest/embed-openai.in # via -r ./ingest/embed-openai.in
orjson==3.10.3 orjson==3.10.4
# via langsmith # via langsmith
packaging==23.2 packaging==23.2
# via # via
@ -108,7 +108,7 @@ packaging==23.2
# -c ./ingest/../deps/constraints.txt # -c ./ingest/../deps/constraints.txt
# langchain-core # langchain-core
# marshmallow # marshmallow
pydantic==2.7.3 pydantic==2.7.4
# via # via
# langchain # langchain
# langchain-core # langchain-core

View File

@ -54,7 +54,7 @@ google-auth==2.30.0
# google-cloud-core # google-cloud-core
# google-cloud-resource-manager # google-cloud-resource-manager
# google-cloud-storage # google-cloud-storage
google-cloud-aiplatform==1.54.1 google-cloud-aiplatform==1.55.0
# via langchain-google-vertexai # via langchain-google-vertexai
google-cloud-bigquery==3.24.0 google-cloud-bigquery==3.24.0
# via google-cloud-aiplatform # via google-cloud-aiplatform
@ -64,7 +64,7 @@ google-cloud-core==2.4.1
# google-cloud-storage # google-cloud-storage
google-cloud-resource-manager==1.12.3 google-cloud-resource-manager==1.12.3
# via google-cloud-aiplatform # via google-cloud-aiplatform
google-cloud-storage==2.16.0 google-cloud-storage==2.17.0
# via # via
# google-cloud-aiplatform # google-cloud-aiplatform
# langchain-google-vertexai # langchain-google-vertexai
@ -72,7 +72,7 @@ google-crc32c==1.5.0
# via # via
# google-cloud-storage # google-cloud-storage
# google-resumable-media # google-resumable-media
google-resumable-media==2.7.0 google-resumable-media==2.7.1
# via # via
# google-cloud-bigquery # google-cloud-bigquery
# google-cloud-storage # google-cloud-storage
@ -98,7 +98,7 @@ idna==3.7
# yarl # yarl
jsonpatch==1.33 jsonpatch==1.33
# via langchain-core # via langchain-core
jsonpointer==2.4 jsonpointer==3.0.0
# via jsonpatch # via jsonpatch
langchain==0.2.3 langchain==0.2.3
# via # via
@ -141,7 +141,7 @@ numpy==1.26.4
# langchain # langchain
# langchain-community # langchain-community
# shapely # shapely
orjson==3.10.3 orjson==3.10.4
# via langsmith # via langsmith
packaging==23.2 packaging==23.2
# via # via
@ -172,7 +172,7 @@ pyasn1==0.6.0
# rsa # rsa
pyasn1-modules==0.4.0 pyasn1-modules==0.4.0
# via google-auth # via google-auth
pydantic==2.7.3 pydantic==2.7.4
# via # via
# google-cloud-aiplatform # google-cloud-aiplatform
# langchain # langchain

View File

@ -40,7 +40,7 @@ idna==3.7
# yarl # yarl
jsonpatch==1.33 jsonpatch==1.33
# via langchain-core # via langchain-core
jsonpointer==2.4 jsonpointer==3.0.0
# via jsonpatch # via jsonpatch
langchain==0.2.3 langchain==0.2.3
# via -r ./ingest/embed-voyageai.in # via -r ./ingest/embed-voyageai.in
@ -68,14 +68,14 @@ numpy==1.26.4
# -c ./ingest/../deps/constraints.txt # -c ./ingest/../deps/constraints.txt
# langchain # langchain
# voyageai # voyageai
orjson==3.10.3 orjson==3.10.4
# via langsmith # via langsmith
packaging==23.2 packaging==23.2
# via # via
# -c ./ingest/../base.txt # -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt # -c ./ingest/../deps/constraints.txt
# langchain-core # langchain-core
pydantic==2.7.3 pydantic==2.7.4
# via # via
# langchain # langchain
# langchain-core # langchain-core

View File

@ -57,13 +57,13 @@ google-auth-oauthlib==1.2.0
# via gcsfs # via gcsfs
google-cloud-core==2.4.1 google-cloud-core==2.4.1
# via google-cloud-storage # via google-cloud-storage
google-cloud-storage==2.16.0 google-cloud-storage==2.17.0
# via gcsfs # via gcsfs
google-crc32c==1.5.0 google-crc32c==1.5.0
# via # via
# google-cloud-storage # google-cloud-storage
# google-resumable-media # google-resumable-media
google-resumable-media==2.7.0 google-resumable-media==2.7.1
# via google-cloud-storage # via google-cloud-storage
googleapis-common-protos==1.63.1 googleapis-common-protos==1.63.1
# via google-api-core # via google-api-core

View File

@ -17,7 +17,7 @@ charset-normalizer==3.3.2
# requests # requests
google-api-core==2.19.0 google-api-core==2.19.0
# via google-api-python-client # via google-api-python-client
google-api-python-client==2.132.0 google-api-python-client==2.133.0
# via -r ./ingest/google-drive.in # via -r ./ingest/google-drive.in
google-auth==2.30.0 google-auth==2.30.0
# via # via

View File

@ -29,7 +29,7 @@ idna==3.7
# via # via
# -c ./ingest/../base.txt # -c ./ingest/../base.txt
# requests # requests
msal==1.28.0 msal==1.28.1
# via # via
# -r ./ingest/onedrive.in # -r ./ingest/onedrive.in
# office365-rest-python-client # office365-rest-python-client

View File

@ -23,7 +23,7 @@ idna==3.7
# via # via
# -c ./ingest/../base.txt # -c ./ingest/../base.txt
# requests # requests
msal==1.28.0 msal==1.28.1
# via # via
# -r ./ingest/outlook.in # -r ./ingest/outlook.in
# office365-rest-python-client # office365-rest-python-client

View File

@ -52,7 +52,7 @@ protobuf==4.23.4
# via # via
# -c ./ingest/../deps/constraints.txt # -c ./ingest/../deps/constraints.txt
# grpcio-tools # grpcio-tools
pydantic==2.7.3 pydantic==2.7.4
# via qdrant-client # via qdrant-client
pydantic-core==2.18.4 pydantic-core==2.18.4
# via pydantic # via pydantic

View File

@ -23,7 +23,7 @@ idna==3.7
# via # via
# -c ./ingest/../base.txt # -c ./ingest/../base.txt
# requests # requests
msal==1.28.0 msal==1.28.1
# via # via
# -r ./ingest/sharepoint.in # -r ./ingest/sharepoint.in
# office365-rest-python-client # office365-rest-python-client

View File

@ -4,5 +4,5 @@
# #
# pip-compile ./ingest/slack.in # pip-compile ./ingest/slack.in
# #
slack-sdk==3.27.2 slack-sdk==3.28.0
# via -r ./ingest/slack.in # via -r ./ingest/slack.in

View File

@ -57,7 +57,7 @@ protobuf==4.23.4
# grpcio-tools # grpcio-tools
pycparser==2.22 pycparser==2.22
# via cffi # via cffi
pydantic==2.7.3 pydantic==2.7.4
# via weaviate-client # via weaviate-client
pydantic-core==2.18.4 pydantic-core==2.18.4
# via pydantic # via pydantic

View File

@ -96,7 +96,7 @@ pycodestyle==2.11.1
# via # via
# flake8 # flake8
# flake8-print # flake8-print
pydantic==2.7.3 pydantic==2.7.4
# via # via
# -r ./test.in # -r ./test.in
# label-studio-sdk # label-studio-sdk

View File

@ -2,6 +2,7 @@
files=( files=(
"libreoffice-7.6.5-r0.apk" "libreoffice-7.6.5-r0.apk"
"libreoffice-24-24.2.4.1-r0.67f8e014.apk"
"openjpeg-2.5.0-r0.apk" "openjpeg-2.5.0-r0.apk"
"poppler-23.09.0-r0.apk" "poppler-23.09.0-r0.apk"
"leptonica-1.83.0-r0.apk" "leptonica-1.83.0-r0.apk"

View File

@ -1 +1 @@
__version__ = "0.14.6-dev3" # pragma: no cover __version__ = "0.14.6-dev4" # pragma: no cover

View File

@ -152,8 +152,9 @@ class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
@dataclass @dataclass
class DeltaTableWriteConfig(WriteConfig): class DeltaTableWriteConfig(WriteConfig):
drop_empty_cols: bool = False drop_empty_cols: bool = False
overwrite_schema: bool = False
mode: t.Literal["error", "append", "overwrite", "ignore"] = "error" mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None
engine: t.Literal["pyarrow", "rust"] = "pyarrow"
@dataclass @dataclass
@ -182,18 +183,21 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
f"writing {len(df)} rows to destination table " f"writing {len(df)} rows to destination table "
f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}", f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
) )
writer_kwargs = {
"table_or_uri": self.connector_config.table_uri,
"data": df,
"mode": self.write_config.mode,
"engine": self.write_config.engine,
}
if self.write_config.schema_mode is not None:
writer_kwargs["schema_mode"] = self.write_config.schema_mode
# NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
# ingest to fail, even though all tasks are completed normally. Putting the writer into a # ingest to fail, even though all tasks are completed normally. Putting the writer into a
# process mitigates this issue by ensuring python interpreter waits properly for deltalake's # process mitigates this issue by ensuring python interpreter waits properly for deltalake's
# rust backend to finish # rust backend to finish
writer = Process( writer = Process(
target=write_deltalake, target=write_deltalake,
kwargs={ kwargs=writer_kwargs,
"table_or_uri": self.connector_config.table_uri,
"data": df,
"mode": self.write_config.mode,
"overwrite_schema": self.write_config.overwrite_schema,
},
) )
writer.start() writer.start()
writer.join() writer.join()