mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-07 05:38:38 +00:00
build(deps): deltalake bump to 0.18.x (#3197)
### Summary Closes #3173. Removes the `overwrite_schema` kwarg from the Delta Table connector and bumps the `deltalake` version. Per [this PR](https://github.com/delta-io/delta-rs/pull/2554) in the `deltalake` repo, the `overwrite_schema` kwarg is deprecated as of version `0.18.0`. Users can specify `schema_mode="merge"` to obtain the same behavior. - `schema_mode="merge"` is equivalent to `overwrite_schema=False` - `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` Also adds an `engine` parameter that you can use to set `"rust"` or `"pyarrow"` as the engine. `engine` defaults to `"pyarrow"` and `schema_mode` defaults to `None`, which is consistent with the behavior in `deltalake` documented [here](https://delta-io.github.io/delta-rs/api/delta_writer/). ### Testing The Delta Table ingest tests should pass on this PR. --------- Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
This commit is contained in:
parent
5f582f1716
commit
ad69bdcd4e
@ -1,12 +1,13 @@
|
|||||||
## 0.14.6-dev3
|
## 0.14.6-dev4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api`
|
|
||||||
|
|
||||||
|
* **Remove deprecated `overwrite_schema` kwarg from Delta Table connector.** The `overwrite_schema` kwarg is deprecated in `deltalake>=0.18.0`. `schema_mode=` should be used now instead. `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` and `schema_mode="merge"` is equivalent to `overwrite_schema=False`. `schema_mode` defaults to `None`. You can also now specify `engine`, which defaults to `"pyarrow"`. You need to specify `engine="rust"` to use `schema_mode`.
|
||||||
|
* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api`
|
||||||
* **table metric bug fix** `get_element_level_alignment()` now finds all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
|
* **table metric bug fix** `get_element_level_alignment()` now finds all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
|
||||||
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when the defined relative filepaths had a leading slash. This strips that slash to guarantee the relative path never has it.
|
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when the defined relative filepaths had a leading slash. This strips that slash to guarantee the relative path never has it.
|
||||||
|
|
||||||
|
|||||||
@ -15,16 +15,16 @@ RUN apk update && apk add py3.11-pip mesa-gl glib cmake && \
|
|||||||
apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \
|
apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \
|
||||||
apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \
|
apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \
|
||||||
apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \
|
apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \
|
||||||
apk add libreoffice && \
|
apk add --allow-untrusted packages/libreoffice-7.6.5-r0.apk && \
|
||||||
apk add bash && \
|
apk add bash && \
|
||||||
apk add libmagic && \
|
apk add libmagic && \
|
||||||
mv /share/tessdata/configs /usr/local/share/tessdata/ && \
|
mv /share/tessdata/configs /usr/local/share/tessdata/ && \
|
||||||
mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \
|
mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \
|
||||||
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \
|
ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/libreoffice && \
|
||||||
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \
|
ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/soffice && \
|
||||||
chmod +x /usr/lib/libreoffice/program/soffice.bin && \
|
chmod +x /usr/local/lib/libreoffice/program/soffice.bin && \
|
||||||
chmod +x /usr/bin/libreoffice && \
|
chmod +x /usr/local/bin/libreoffice && \
|
||||||
chmod +x /usr/bin/soffice
|
chmod +x /usr/local/bin/soffice
|
||||||
|
|
||||||
RUN chown -R nonroot:nonroot /app
|
RUN chown -R nonroot:nonroot /app
|
||||||
|
|
||||||
|
|||||||
@ -137,7 +137,7 @@ jinja2==3.1.4
|
|||||||
# nbconvert
|
# nbconvert
|
||||||
json5==0.9.25
|
json5==0.9.25
|
||||||
# via jupyterlab-server
|
# via jupyterlab-server
|
||||||
jsonpointer==2.4
|
jsonpointer==3.0.0
|
||||||
# via jsonschema
|
# via jsonschema
|
||||||
jsonschema[format-nongpl]==4.22.0
|
jsonschema[format-nongpl]==4.22.0
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -150,7 +150,7 @@ pdfminer-six==20231228
|
|||||||
# via
|
# via
|
||||||
# -r ./extra-pdf-image.in
|
# -r ./extra-pdf-image.in
|
||||||
# pdfplumber
|
# pdfplumber
|
||||||
pdfplumber==0.11.0
|
pdfplumber==0.11.1
|
||||||
# via layoutparser
|
# via layoutparser
|
||||||
pikepdf==9.0.0
|
pikepdf==9.0.0
|
||||||
# via -r ./extra-pdf-image.in
|
# via -r ./extra-pdf-image.in
|
||||||
|
|||||||
@ -23,7 +23,7 @@ inflection==0.5.1
|
|||||||
# via pyairtable
|
# via pyairtable
|
||||||
pyairtable==2.3.3
|
pyairtable==2.3.3
|
||||||
# via -r ./ingest/airtable.in
|
# via -r ./ingest/airtable.in
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via pyairtable
|
# via pyairtable
|
||||||
pydantic-core==2.18.4
|
pydantic-core==2.18.4
|
||||||
# via pydantic
|
# via pydantic
|
||||||
|
|||||||
@ -21,7 +21,7 @@ azure-core==1.30.2
|
|||||||
# azure-storage-blob
|
# azure-storage-blob
|
||||||
azure-datalake-store==0.0.53
|
azure-datalake-store==0.0.53
|
||||||
# via adlfs
|
# via adlfs
|
||||||
azure-identity==1.16.0
|
azure-identity==1.16.1
|
||||||
# via adlfs
|
# via adlfs
|
||||||
azure-storage-blob==12.20.0
|
azure-storage-blob==12.20.0
|
||||||
# via adlfs
|
# via adlfs
|
||||||
@ -60,7 +60,7 @@ idna==3.7
|
|||||||
# yarl
|
# yarl
|
||||||
isodate==0.6.1
|
isodate==0.6.1
|
||||||
# via azure-storage-blob
|
# via azure-storage-blob
|
||||||
msal==1.28.0
|
msal==1.28.1
|
||||||
# via
|
# via
|
||||||
# azure-datalake-store
|
# azure-datalake-store
|
||||||
# azure-identity
|
# azure-identity
|
||||||
|
|||||||
@ -147,7 +147,7 @@ opentelemetry-util-http==0.46b0
|
|||||||
# via
|
# via
|
||||||
# opentelemetry-instrumentation-asgi
|
# opentelemetry-instrumentation-asgi
|
||||||
# opentelemetry-instrumentation-fastapi
|
# opentelemetry-instrumentation-fastapi
|
||||||
orjson==3.10.3
|
orjson==3.10.4
|
||||||
# via chromadb
|
# via chromadb
|
||||||
overrides==7.7.0
|
overrides==7.7.0
|
||||||
# via chromadb
|
# via chromadb
|
||||||
@ -172,7 +172,7 @@ pyasn1==0.6.0
|
|||||||
# rsa
|
# rsa
|
||||||
pyasn1-modules==0.4.0
|
pyasn1-modules==0.4.0
|
||||||
# via google-auth
|
# via google-auth
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via
|
# via
|
||||||
# chromadb
|
# chromadb
|
||||||
# fastapi
|
# fastapi
|
||||||
|
|||||||
@ -15,7 +15,7 @@ charset-normalizer==3.3.2
|
|||||||
# requests
|
# requests
|
||||||
clarifai==10.5.0
|
clarifai==10.5.0
|
||||||
# via -r ./ingest/clarifai.in
|
# via -r ./ingest/clarifai.in
|
||||||
clarifai-grpc==10.5.0
|
clarifai-grpc==10.5.1
|
||||||
# via clarifai
|
# via clarifai
|
||||||
contextlib2==21.6.0
|
contextlib2==21.6.0
|
||||||
# via schema
|
# via schema
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
-c ../deps/constraints.txt
|
-c ../deps/constraints.txt
|
||||||
-c ../base.txt
|
-c ../base.txt
|
||||||
deltalake<0.18.0
|
deltalake
|
||||||
fsspec
|
fsspec
|
||||||
|
|||||||
@ -4,7 +4,7 @@
|
|||||||
#
|
#
|
||||||
# pip-compile ./ingest/delta-table.in
|
# pip-compile ./ingest/delta-table.in
|
||||||
#
|
#
|
||||||
deltalake==0.17.4
|
deltalake==0.18.1
|
||||||
# via -r ./ingest/delta-table.in
|
# via -r ./ingest/delta-table.in
|
||||||
fsspec==2024.5.0
|
fsspec==2024.5.0
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -53,7 +53,7 @@ jmespath==1.0.1
|
|||||||
# botocore
|
# botocore
|
||||||
jsonpatch==1.33
|
jsonpatch==1.33
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==3.0.0
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.3
|
langchain==0.2.3
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
@ -90,7 +90,7 @@ numpy==1.26.4
|
|||||||
# -c ./ingest/../deps/constraints.txt
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
orjson==3.10.3
|
orjson==3.10.4
|
||||||
# via langsmith
|
# via langsmith
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
@ -98,7 +98,7 @@ packaging==23.2
|
|||||||
# -c ./ingest/../deps/constraints.txt
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain-core
|
# langchain-core
|
||||||
# marshmallow
|
# marshmallow
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via
|
# via
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-core
|
# langchain-core
|
||||||
|
|||||||
@ -65,7 +65,7 @@ joblib==1.4.2
|
|||||||
# scikit-learn
|
# scikit-learn
|
||||||
jsonpatch==1.33
|
jsonpatch==1.33
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==3.0.0
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.3
|
langchain==0.2.3
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
@ -112,7 +112,7 @@ numpy==1.26.4
|
|||||||
# scipy
|
# scipy
|
||||||
# sentence-transformers
|
# sentence-transformers
|
||||||
# transformers
|
# transformers
|
||||||
orjson==3.10.3
|
orjson==3.10.4
|
||||||
# via langsmith
|
# via langsmith
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
@ -124,7 +124,7 @@ packaging==23.2
|
|||||||
# transformers
|
# transformers
|
||||||
pillow==10.3.0
|
pillow==10.3.0
|
||||||
# via sentence-transformers
|
# via sentence-transformers
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via
|
# via
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-core
|
# langchain-core
|
||||||
|
|||||||
@ -40,7 +40,7 @@ idna==3.7
|
|||||||
# requests
|
# requests
|
||||||
openai==1.33.0
|
openai==1.33.0
|
||||||
# via -r ./ingest/embed-octoai.in
|
# via -r ./ingest/embed-octoai.in
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via openai
|
# via openai
|
||||||
pydantic-core==2.18.4
|
pydantic-core==2.18.4
|
||||||
# via pydantic
|
# via pydantic
|
||||||
|
|||||||
@ -61,7 +61,7 @@ idna==3.7
|
|||||||
# yarl
|
# yarl
|
||||||
jsonpatch==1.33
|
jsonpatch==1.33
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==3.0.0
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.3
|
langchain==0.2.3
|
||||||
# via langchain-community
|
# via langchain-community
|
||||||
@ -100,7 +100,7 @@ numpy==1.26.4
|
|||||||
# langchain-community
|
# langchain-community
|
||||||
openai==1.33.0
|
openai==1.33.0
|
||||||
# via -r ./ingest/embed-openai.in
|
# via -r ./ingest/embed-openai.in
|
||||||
orjson==3.10.3
|
orjson==3.10.4
|
||||||
# via langsmith
|
# via langsmith
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
@ -108,7 +108,7 @@ packaging==23.2
|
|||||||
# -c ./ingest/../deps/constraints.txt
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain-core
|
# langchain-core
|
||||||
# marshmallow
|
# marshmallow
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via
|
# via
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-core
|
# langchain-core
|
||||||
|
|||||||
@ -54,7 +54,7 @@ google-auth==2.30.0
|
|||||||
# google-cloud-core
|
# google-cloud-core
|
||||||
# google-cloud-resource-manager
|
# google-cloud-resource-manager
|
||||||
# google-cloud-storage
|
# google-cloud-storage
|
||||||
google-cloud-aiplatform==1.54.1
|
google-cloud-aiplatform==1.55.0
|
||||||
# via langchain-google-vertexai
|
# via langchain-google-vertexai
|
||||||
google-cloud-bigquery==3.24.0
|
google-cloud-bigquery==3.24.0
|
||||||
# via google-cloud-aiplatform
|
# via google-cloud-aiplatform
|
||||||
@ -64,7 +64,7 @@ google-cloud-core==2.4.1
|
|||||||
# google-cloud-storage
|
# google-cloud-storage
|
||||||
google-cloud-resource-manager==1.12.3
|
google-cloud-resource-manager==1.12.3
|
||||||
# via google-cloud-aiplatform
|
# via google-cloud-aiplatform
|
||||||
google-cloud-storage==2.16.0
|
google-cloud-storage==2.17.0
|
||||||
# via
|
# via
|
||||||
# google-cloud-aiplatform
|
# google-cloud-aiplatform
|
||||||
# langchain-google-vertexai
|
# langchain-google-vertexai
|
||||||
@ -72,7 +72,7 @@ google-crc32c==1.5.0
|
|||||||
# via
|
# via
|
||||||
# google-cloud-storage
|
# google-cloud-storage
|
||||||
# google-resumable-media
|
# google-resumable-media
|
||||||
google-resumable-media==2.7.0
|
google-resumable-media==2.7.1
|
||||||
# via
|
# via
|
||||||
# google-cloud-bigquery
|
# google-cloud-bigquery
|
||||||
# google-cloud-storage
|
# google-cloud-storage
|
||||||
@ -98,7 +98,7 @@ idna==3.7
|
|||||||
# yarl
|
# yarl
|
||||||
jsonpatch==1.33
|
jsonpatch==1.33
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==3.0.0
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.3
|
langchain==0.2.3
|
||||||
# via
|
# via
|
||||||
@ -141,7 +141,7 @@ numpy==1.26.4
|
|||||||
# langchain
|
# langchain
|
||||||
# langchain-community
|
# langchain-community
|
||||||
# shapely
|
# shapely
|
||||||
orjson==3.10.3
|
orjson==3.10.4
|
||||||
# via langsmith
|
# via langsmith
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
@ -172,7 +172,7 @@ pyasn1==0.6.0
|
|||||||
# rsa
|
# rsa
|
||||||
pyasn1-modules==0.4.0
|
pyasn1-modules==0.4.0
|
||||||
# via google-auth
|
# via google-auth
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via
|
# via
|
||||||
# google-cloud-aiplatform
|
# google-cloud-aiplatform
|
||||||
# langchain
|
# langchain
|
||||||
|
|||||||
@ -40,7 +40,7 @@ idna==3.7
|
|||||||
# yarl
|
# yarl
|
||||||
jsonpatch==1.33
|
jsonpatch==1.33
|
||||||
# via langchain-core
|
# via langchain-core
|
||||||
jsonpointer==2.4
|
jsonpointer==3.0.0
|
||||||
# via jsonpatch
|
# via jsonpatch
|
||||||
langchain==0.2.3
|
langchain==0.2.3
|
||||||
# via -r ./ingest/embed-voyageai.in
|
# via -r ./ingest/embed-voyageai.in
|
||||||
@ -68,14 +68,14 @@ numpy==1.26.4
|
|||||||
# -c ./ingest/../deps/constraints.txt
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain
|
# langchain
|
||||||
# voyageai
|
# voyageai
|
||||||
orjson==3.10.3
|
orjson==3.10.4
|
||||||
# via langsmith
|
# via langsmith
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# -c ./ingest/../deps/constraints.txt
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# langchain-core
|
# langchain-core
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via
|
# via
|
||||||
# langchain
|
# langchain
|
||||||
# langchain-core
|
# langchain-core
|
||||||
|
|||||||
@ -57,13 +57,13 @@ google-auth-oauthlib==1.2.0
|
|||||||
# via gcsfs
|
# via gcsfs
|
||||||
google-cloud-core==2.4.1
|
google-cloud-core==2.4.1
|
||||||
# via google-cloud-storage
|
# via google-cloud-storage
|
||||||
google-cloud-storage==2.16.0
|
google-cloud-storage==2.17.0
|
||||||
# via gcsfs
|
# via gcsfs
|
||||||
google-crc32c==1.5.0
|
google-crc32c==1.5.0
|
||||||
# via
|
# via
|
||||||
# google-cloud-storage
|
# google-cloud-storage
|
||||||
# google-resumable-media
|
# google-resumable-media
|
||||||
google-resumable-media==2.7.0
|
google-resumable-media==2.7.1
|
||||||
# via google-cloud-storage
|
# via google-cloud-storage
|
||||||
googleapis-common-protos==1.63.1
|
googleapis-common-protos==1.63.1
|
||||||
# via google-api-core
|
# via google-api-core
|
||||||
|
|||||||
@ -17,7 +17,7 @@ charset-normalizer==3.3.2
|
|||||||
# requests
|
# requests
|
||||||
google-api-core==2.19.0
|
google-api-core==2.19.0
|
||||||
# via google-api-python-client
|
# via google-api-python-client
|
||||||
google-api-python-client==2.132.0
|
google-api-python-client==2.133.0
|
||||||
# via -r ./ingest/google-drive.in
|
# via -r ./ingest/google-drive.in
|
||||||
google-auth==2.30.0
|
google-auth==2.30.0
|
||||||
# via
|
# via
|
||||||
|
|||||||
@ -29,7 +29,7 @@ idna==3.7
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# requests
|
# requests
|
||||||
msal==1.28.0
|
msal==1.28.1
|
||||||
# via
|
# via
|
||||||
# -r ./ingest/onedrive.in
|
# -r ./ingest/onedrive.in
|
||||||
# office365-rest-python-client
|
# office365-rest-python-client
|
||||||
|
|||||||
@ -23,7 +23,7 @@ idna==3.7
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# requests
|
# requests
|
||||||
msal==1.28.0
|
msal==1.28.1
|
||||||
# via
|
# via
|
||||||
# -r ./ingest/outlook.in
|
# -r ./ingest/outlook.in
|
||||||
# office365-rest-python-client
|
# office365-rest-python-client
|
||||||
|
|||||||
@ -52,7 +52,7 @@ protobuf==4.23.4
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../deps/constraints.txt
|
# -c ./ingest/../deps/constraints.txt
|
||||||
# grpcio-tools
|
# grpcio-tools
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via qdrant-client
|
# via qdrant-client
|
||||||
pydantic-core==2.18.4
|
pydantic-core==2.18.4
|
||||||
# via pydantic
|
# via pydantic
|
||||||
|
|||||||
@ -23,7 +23,7 @@ idna==3.7
|
|||||||
# via
|
# via
|
||||||
# -c ./ingest/../base.txt
|
# -c ./ingest/../base.txt
|
||||||
# requests
|
# requests
|
||||||
msal==1.28.0
|
msal==1.28.1
|
||||||
# via
|
# via
|
||||||
# -r ./ingest/sharepoint.in
|
# -r ./ingest/sharepoint.in
|
||||||
# office365-rest-python-client
|
# office365-rest-python-client
|
||||||
|
|||||||
@ -4,5 +4,5 @@
|
|||||||
#
|
#
|
||||||
# pip-compile ./ingest/slack.in
|
# pip-compile ./ingest/slack.in
|
||||||
#
|
#
|
||||||
slack-sdk==3.27.2
|
slack-sdk==3.28.0
|
||||||
# via -r ./ingest/slack.in
|
# via -r ./ingest/slack.in
|
||||||
|
|||||||
@ -57,7 +57,7 @@ protobuf==4.23.4
|
|||||||
# grpcio-tools
|
# grpcio-tools
|
||||||
pycparser==2.22
|
pycparser==2.22
|
||||||
# via cffi
|
# via cffi
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via weaviate-client
|
# via weaviate-client
|
||||||
pydantic-core==2.18.4
|
pydantic-core==2.18.4
|
||||||
# via pydantic
|
# via pydantic
|
||||||
|
|||||||
@ -96,7 +96,7 @@ pycodestyle==2.11.1
|
|||||||
# via
|
# via
|
||||||
# flake8
|
# flake8
|
||||||
# flake8-print
|
# flake8-print
|
||||||
pydantic==2.7.3
|
pydantic==2.7.4
|
||||||
# via
|
# via
|
||||||
# -r ./test.in
|
# -r ./test.in
|
||||||
# label-studio-sdk
|
# label-studio-sdk
|
||||||
|
|||||||
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
files=(
|
files=(
|
||||||
"libreoffice-7.6.5-r0.apk"
|
"libreoffice-7.6.5-r0.apk"
|
||||||
|
"libreoffice-24-24.2.4.1-r0.67f8e014.apk"
|
||||||
"openjpeg-2.5.0-r0.apk"
|
"openjpeg-2.5.0-r0.apk"
|
||||||
"poppler-23.09.0-r0.apk"
|
"poppler-23.09.0-r0.apk"
|
||||||
"leptonica-1.83.0-r0.apk"
|
"leptonica-1.83.0-r0.apk"
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.14.6-dev3" # pragma: no cover
|
__version__ = "0.14.6-dev4" # pragma: no cover
|
||||||
|
|||||||
@ -152,8 +152,9 @@ class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
|
|||||||
@dataclass
|
@dataclass
|
||||||
class DeltaTableWriteConfig(WriteConfig):
|
class DeltaTableWriteConfig(WriteConfig):
|
||||||
drop_empty_cols: bool = False
|
drop_empty_cols: bool = False
|
||||||
overwrite_schema: bool = False
|
|
||||||
mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
|
mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
|
||||||
|
schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None
|
||||||
|
engine: t.Literal["pyarrow", "rust"] = "pyarrow"
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -182,18 +183,21 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
|
|||||||
f"writing {len(df)} rows to destination table "
|
f"writing {len(df)} rows to destination table "
|
||||||
f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
|
f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
|
||||||
)
|
)
|
||||||
|
writer_kwargs = {
|
||||||
|
"table_or_uri": self.connector_config.table_uri,
|
||||||
|
"data": df,
|
||||||
|
"mode": self.write_config.mode,
|
||||||
|
"engine": self.write_config.engine,
|
||||||
|
}
|
||||||
|
if self.write_config.schema_mode is not None:
|
||||||
|
writer_kwargs["schema_mode"] = self.write_config.schema_mode
|
||||||
# NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
|
# NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
|
||||||
# ingest to fail, even though all tasks are completed normally. Putting the writer into a
|
# ingest to fail, even though all tasks are completed normally. Putting the writer into a
|
||||||
# process mitigates this issue by ensuring python interpreter waits properly for deltalake's
|
# process mitigates this issue by ensuring python interpreter waits properly for deltalake's
|
||||||
# rust backend to finish
|
# rust backend to finish
|
||||||
writer = Process(
|
writer = Process(
|
||||||
target=write_deltalake,
|
target=write_deltalake,
|
||||||
kwargs={
|
kwargs=writer_kwargs,
|
||||||
"table_or_uri": self.connector_config.table_uri,
|
|
||||||
"data": df,
|
|
||||||
"mode": self.write_config.mode,
|
|
||||||
"overwrite_schema": self.write_config.overwrite_schema,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
writer.start()
|
writer.start()
|
||||||
writer.join()
|
writer.join()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user