mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 11:03:38 +00:00
build(deps): deltalake bump to 0.18.x (#3197)
### Summary Closes #3173. Removes the `overwrite_schema` kwarg from the Delta Table connector and bumps the `deltalake` version. Per [this PR](https://github.com/delta-io/delta-rs/pull/2554) in the `deltalake` repo, the `overwrite_schema` kwarg is deprecated as of version `0.18.0`. Users can specify `schema_mode="merge"` to obtain the same behavior. - `schema_mode="merge"` is equivalent to `overwrite_schema=False` - `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` Also adds an `engine` parameter that you can use to set `"rust"` or `"pyarrow"` as the engine. `engine` defaults to `"pyarrow"` and `schema_mode` defaults to `None`, which is consistent with the behavior in `deltalake` documented [here](https://delta-io.github.io/delta-rs/api/delta_writer/). ### Testing The Delta Table ingest tests should pass on this PR. --------- Co-authored-by: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com>
This commit is contained in:
parent
5f582f1716
commit
ad69bdcd4e
@ -1,12 +1,13 @@
|
||||
## 0.14.6-dev3
|
||||
## 0.14.6-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api`
|
||||
|
||||
* **Remove deprecated `overwrite_schema` kwarg from Delta Table connector.** The `overwrite_schema` kwarg is deprecated in `deltalake>=0.18.0`. `schema_mode=` should be used now instead. `schema_mode="overwrite"` is equivalent to `overwrite_schema=True` and `schema_mode="merge"` is equivalent to `overwrite_schema=False`. `schema_mode` defaults to `None`. You can also now specify `engine`, which defaults to `"pyarrow"`. You need to specify `engine="rust"` to use `schema_mode`.
|
||||
* **Fix passing parameters to python-client** - Remove parsing list arguments to strings in passing arguments to python-client in Ingest workflow and `partition_via_api`
|
||||
* **table metric bug fix** `get_element_level_alignment()` now finds all the matched indices in predicted table data instead of only returning the first match in the case of multiple matches for the same gt string.
|
||||
* **fsspec connector path/permissions bug** V2 fsspec connectors were failing when the defined relative filepaths had a leading slash. This strips that slash to guarantee the relative path never has it.
|
||||
|
||||
|
||||
@ -15,16 +15,16 @@ RUN apk update && apk add py3.11-pip mesa-gl glib cmake && \
|
||||
apk add --allow-untrusted packages/poppler-23.09.0-r0.apk && \
|
||||
apk add --allow-untrusted packages/leptonica-1.83.0-r0.apk && \
|
||||
apk add --allow-untrusted packages/tesseract-5.3.2-r0.apk && \
|
||||
apk add libreoffice && \
|
||||
apk add --allow-untrusted packages/libreoffice-7.6.5-r0.apk && \
|
||||
apk add bash && \
|
||||
apk add libmagic && \
|
||||
mv /share/tessdata/configs /usr/local/share/tessdata/ && \
|
||||
mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \
|
||||
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/libreoffice && \
|
||||
ln -s /usr/lib/libreoffice/program/soffice.bin /usr/bin/soffice && \
|
||||
chmod +x /usr/lib/libreoffice/program/soffice.bin && \
|
||||
chmod +x /usr/bin/libreoffice && \
|
||||
chmod +x /usr/bin/soffice
|
||||
ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/libreoffice && \
|
||||
ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/soffice && \
|
||||
chmod +x /usr/local/lib/libreoffice/program/soffice.bin && \
|
||||
chmod +x /usr/local/bin/libreoffice && \
|
||||
chmod +x /usr/local/bin/soffice
|
||||
|
||||
RUN chown -R nonroot:nonroot /app
|
||||
|
||||
|
||||
@ -137,7 +137,7 @@ jinja2==3.1.4
|
||||
# nbconvert
|
||||
json5==0.9.25
|
||||
# via jupyterlab-server
|
||||
jsonpointer==2.4
|
||||
jsonpointer==3.0.0
|
||||
# via jsonschema
|
||||
jsonschema[format-nongpl]==4.22.0
|
||||
# via
|
||||
|
||||
@ -150,7 +150,7 @@ pdfminer-six==20231228
|
||||
# via
|
||||
# -r ./extra-pdf-image.in
|
||||
# pdfplumber
|
||||
pdfplumber==0.11.0
|
||||
pdfplumber==0.11.1
|
||||
# via layoutparser
|
||||
pikepdf==9.0.0
|
||||
# via -r ./extra-pdf-image.in
|
||||
|
||||
@ -23,7 +23,7 @@ inflection==0.5.1
|
||||
# via pyairtable
|
||||
pyairtable==2.3.3
|
||||
# via -r ./ingest/airtable.in
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via pyairtable
|
||||
pydantic-core==2.18.4
|
||||
# via pydantic
|
||||
|
||||
@ -21,7 +21,7 @@ azure-core==1.30.2
|
||||
# azure-storage-blob
|
||||
azure-datalake-store==0.0.53
|
||||
# via adlfs
|
||||
azure-identity==1.16.0
|
||||
azure-identity==1.16.1
|
||||
# via adlfs
|
||||
azure-storage-blob==12.20.0
|
||||
# via adlfs
|
||||
@ -60,7 +60,7 @@ idna==3.7
|
||||
# yarl
|
||||
isodate==0.6.1
|
||||
# via azure-storage-blob
|
||||
msal==1.28.0
|
||||
msal==1.28.1
|
||||
# via
|
||||
# azure-datalake-store
|
||||
# azure-identity
|
||||
|
||||
@ -147,7 +147,7 @@ opentelemetry-util-http==0.46b0
|
||||
# via
|
||||
# opentelemetry-instrumentation-asgi
|
||||
# opentelemetry-instrumentation-fastapi
|
||||
orjson==3.10.3
|
||||
orjson==3.10.4
|
||||
# via chromadb
|
||||
overrides==7.7.0
|
||||
# via chromadb
|
||||
@ -172,7 +172,7 @@ pyasn1==0.6.0
|
||||
# rsa
|
||||
pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# chromadb
|
||||
# fastapi
|
||||
|
||||
@ -15,7 +15,7 @@ charset-normalizer==3.3.2
|
||||
# requests
|
||||
clarifai==10.5.0
|
||||
# via -r ./ingest/clarifai.in
|
||||
clarifai-grpc==10.5.0
|
||||
clarifai-grpc==10.5.1
|
||||
# via clarifai
|
||||
contextlib2==21.6.0
|
||||
# via schema
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
-c ../deps/constraints.txt
|
||||
-c ../base.txt
|
||||
deltalake<0.18.0
|
||||
deltalake
|
||||
fsspec
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile ./ingest/delta-table.in
|
||||
#
|
||||
deltalake==0.17.4
|
||||
deltalake==0.18.1
|
||||
# via -r ./ingest/delta-table.in
|
||||
fsspec==2024.5.0
|
||||
# via
|
||||
|
||||
@ -53,7 +53,7 @@ jmespath==1.0.1
|
||||
# botocore
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.3
|
||||
# via langchain-community
|
||||
@ -90,7 +90,7 @@ numpy==1.26.4
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
orjson==3.10.3
|
||||
orjson==3.10.4
|
||||
# via langsmith
|
||||
packaging==23.2
|
||||
# via
|
||||
@ -98,7 +98,7 @@ packaging==23.2
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain-core
|
||||
# marshmallow
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
|
||||
@ -65,7 +65,7 @@ joblib==1.4.2
|
||||
# scikit-learn
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.3
|
||||
# via langchain-community
|
||||
@ -112,7 +112,7 @@ numpy==1.26.4
|
||||
# scipy
|
||||
# sentence-transformers
|
||||
# transformers
|
||||
orjson==3.10.3
|
||||
orjson==3.10.4
|
||||
# via langsmith
|
||||
packaging==23.2
|
||||
# via
|
||||
@ -124,7 +124,7 @@ packaging==23.2
|
||||
# transformers
|
||||
pillow==10.3.0
|
||||
# via sentence-transformers
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
|
||||
@ -40,7 +40,7 @@ idna==3.7
|
||||
# requests
|
||||
openai==1.33.0
|
||||
# via -r ./ingest/embed-octoai.in
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via openai
|
||||
pydantic-core==2.18.4
|
||||
# via pydantic
|
||||
|
||||
@ -61,7 +61,7 @@ idna==3.7
|
||||
# yarl
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.3
|
||||
# via langchain-community
|
||||
@ -100,7 +100,7 @@ numpy==1.26.4
|
||||
# langchain-community
|
||||
openai==1.33.0
|
||||
# via -r ./ingest/embed-openai.in
|
||||
orjson==3.10.3
|
||||
orjson==3.10.4
|
||||
# via langsmith
|
||||
packaging==23.2
|
||||
# via
|
||||
@ -108,7 +108,7 @@ packaging==23.2
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain-core
|
||||
# marshmallow
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
|
||||
@ -54,7 +54,7 @@ google-auth==2.30.0
|
||||
# google-cloud-core
|
||||
# google-cloud-resource-manager
|
||||
# google-cloud-storage
|
||||
google-cloud-aiplatform==1.54.1
|
||||
google-cloud-aiplatform==1.55.0
|
||||
# via langchain-google-vertexai
|
||||
google-cloud-bigquery==3.24.0
|
||||
# via google-cloud-aiplatform
|
||||
@ -64,7 +64,7 @@ google-cloud-core==2.4.1
|
||||
# google-cloud-storage
|
||||
google-cloud-resource-manager==1.12.3
|
||||
# via google-cloud-aiplatform
|
||||
google-cloud-storage==2.16.0
|
||||
google-cloud-storage==2.17.0
|
||||
# via
|
||||
# google-cloud-aiplatform
|
||||
# langchain-google-vertexai
|
||||
@ -72,7 +72,7 @@ google-crc32c==1.5.0
|
||||
# via
|
||||
# google-cloud-storage
|
||||
# google-resumable-media
|
||||
google-resumable-media==2.7.0
|
||||
google-resumable-media==2.7.1
|
||||
# via
|
||||
# google-cloud-bigquery
|
||||
# google-cloud-storage
|
||||
@ -98,7 +98,7 @@ idna==3.7
|
||||
# yarl
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.3
|
||||
# via
|
||||
@ -141,7 +141,7 @@ numpy==1.26.4
|
||||
# langchain
|
||||
# langchain-community
|
||||
# shapely
|
||||
orjson==3.10.3
|
||||
orjson==3.10.4
|
||||
# via langsmith
|
||||
packaging==23.2
|
||||
# via
|
||||
@ -172,7 +172,7 @@ pyasn1==0.6.0
|
||||
# rsa
|
||||
pyasn1-modules==0.4.0
|
||||
# via google-auth
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# google-cloud-aiplatform
|
||||
# langchain
|
||||
|
||||
@ -40,7 +40,7 @@ idna==3.7
|
||||
# yarl
|
||||
jsonpatch==1.33
|
||||
# via langchain-core
|
||||
jsonpointer==2.4
|
||||
jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.3
|
||||
# via -r ./ingest/embed-voyageai.in
|
||||
@ -68,14 +68,14 @@ numpy==1.26.4
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain
|
||||
# voyageai
|
||||
orjson==3.10.3
|
||||
orjson==3.10.4
|
||||
# via langsmith
|
||||
packaging==23.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# langchain-core
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
|
||||
@ -57,13 +57,13 @@ google-auth-oauthlib==1.2.0
|
||||
# via gcsfs
|
||||
google-cloud-core==2.4.1
|
||||
# via google-cloud-storage
|
||||
google-cloud-storage==2.16.0
|
||||
google-cloud-storage==2.17.0
|
||||
# via gcsfs
|
||||
google-crc32c==1.5.0
|
||||
# via
|
||||
# google-cloud-storage
|
||||
# google-resumable-media
|
||||
google-resumable-media==2.7.0
|
||||
google-resumable-media==2.7.1
|
||||
# via google-cloud-storage
|
||||
googleapis-common-protos==1.63.1
|
||||
# via google-api-core
|
||||
|
||||
@ -17,7 +17,7 @@ charset-normalizer==3.3.2
|
||||
# requests
|
||||
google-api-core==2.19.0
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.132.0
|
||||
google-api-python-client==2.133.0
|
||||
# via -r ./ingest/google-drive.in
|
||||
google-auth==2.30.0
|
||||
# via
|
||||
|
||||
@ -29,7 +29,7 @@ idna==3.7
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
msal==1.28.0
|
||||
msal==1.28.1
|
||||
# via
|
||||
# -r ./ingest/onedrive.in
|
||||
# office365-rest-python-client
|
||||
|
||||
@ -23,7 +23,7 @@ idna==3.7
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
msal==1.28.0
|
||||
msal==1.28.1
|
||||
# via
|
||||
# -r ./ingest/outlook.in
|
||||
# office365-rest-python-client
|
||||
|
||||
@ -52,7 +52,7 @@ protobuf==4.23.4
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# grpcio-tools
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via qdrant-client
|
||||
pydantic-core==2.18.4
|
||||
# via pydantic
|
||||
|
||||
@ -23,7 +23,7 @@ idna==3.7
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
msal==1.28.0
|
||||
msal==1.28.1
|
||||
# via
|
||||
# -r ./ingest/sharepoint.in
|
||||
# office365-rest-python-client
|
||||
|
||||
@ -4,5 +4,5 @@
|
||||
#
|
||||
# pip-compile ./ingest/slack.in
|
||||
#
|
||||
slack-sdk==3.27.2
|
||||
slack-sdk==3.28.0
|
||||
# via -r ./ingest/slack.in
|
||||
|
||||
@ -57,7 +57,7 @@ protobuf==4.23.4
|
||||
# grpcio-tools
|
||||
pycparser==2.22
|
||||
# via cffi
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via weaviate-client
|
||||
pydantic-core==2.18.4
|
||||
# via pydantic
|
||||
|
||||
@ -96,7 +96,7 @@ pycodestyle==2.11.1
|
||||
# via
|
||||
# flake8
|
||||
# flake8-print
|
||||
pydantic==2.7.3
|
||||
pydantic==2.7.4
|
||||
# via
|
||||
# -r ./test.in
|
||||
# label-studio-sdk
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
|
||||
files=(
|
||||
"libreoffice-7.6.5-r0.apk"
|
||||
"libreoffice-24-24.2.4.1-r0.67f8e014.apk"
|
||||
"openjpeg-2.5.0-r0.apk"
|
||||
"poppler-23.09.0-r0.apk"
|
||||
"leptonica-1.83.0-r0.apk"
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.14.6-dev3" # pragma: no cover
|
||||
__version__ = "0.14.6-dev4" # pragma: no cover
|
||||
|
||||
@ -152,8 +152,9 @@ class DeltaTableSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector
|
||||
@dataclass
|
||||
class DeltaTableWriteConfig(WriteConfig):
|
||||
drop_empty_cols: bool = False
|
||||
overwrite_schema: bool = False
|
||||
mode: t.Literal["error", "append", "overwrite", "ignore"] = "error"
|
||||
schema_mode: t.Optional[t.Literal["merge", "overwrite"]] = None
|
||||
engine: t.Literal["pyarrow", "rust"] = "pyarrow"
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -182,18 +183,21 @@ class DeltaTableDestinationConnector(BaseDestinationConnector):
|
||||
f"writing {len(df)} rows to destination table "
|
||||
f"at {self.connector_config.table_uri}\ndtypes: {df.dtypes}",
|
||||
)
|
||||
writer_kwargs = {
|
||||
"table_or_uri": self.connector_config.table_uri,
|
||||
"data": df,
|
||||
"mode": self.write_config.mode,
|
||||
"engine": self.write_config.engine,
|
||||
}
|
||||
if self.write_config.schema_mode is not None:
|
||||
writer_kwargs["schema_mode"] = self.write_config.schema_mode
|
||||
# NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and cause
|
||||
# ingest to fail, even though all tasks are completed normally. Putting the writer into a
|
||||
# process mitigates this issue by ensuring python interpreter waits properly for deltalake's
|
||||
# rust backend to finish
|
||||
writer = Process(
|
||||
target=write_deltalake,
|
||||
kwargs={
|
||||
"table_or_uri": self.connector_config.table_uri,
|
||||
"data": df,
|
||||
"mode": self.write_config.mode,
|
||||
"overwrite_schema": self.write_config.overwrite_schema,
|
||||
},
|
||||
kwargs=writer_kwargs,
|
||||
)
|
||||
writer.start()
|
||||
writer.join()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user