feat(ingest): use uv for python package installs (#9885)

This commit is contained in:
Harshal Sheth 2024-02-26 15:02:47 -08:00 committed by GitHub
parent a1f2216da7
commit 02f41b74b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 245 additions and 237 deletions

View File

@ -1,7 +1,11 @@
**/node_modules/
*/build/
*/*/build/
*/venv/
**/venv/
**/.tox/
**/.mypy_cache/
**/.pytest_cache/
**/__pycache__/
out
**/*.class
# Have to copy gradle/wrapper/gradle-wrapper.jar, can't exclude ALL jars

View File

@ -24,7 +24,7 @@ function get_tag_full {
}
function get_python_docker_release_v {
echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},0.0.0+docker.${SHORT_SHA},g" -e 's,refs/tags/v\(.*\),\1+docker,g' -e 's,refs/pull/\([0-9]*\).*,0.0.0+docker.pr\1,g')
echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},1!0.0.0+docker.${SHORT_SHA},g" -e 's,refs/tags/v\(.*\),1!\1+docker,g' -e 's,refs/pull/\([0-9]*\).*,1!0.0.0+docker.pr\1,g')
}
function get_unique_tag {
@ -37,4 +37,4 @@ function get_unique_tag_slim {
function get_unique_tag_full {
echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA}-full,g" -e 's,refs/tags/,,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g')
}
}

View File

@ -38,11 +38,17 @@ jobs:
with:
python-version: "3.10"
cache: "pip"
- uses: actions/cache@v4
with:
path: |
~/.cache/uv
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
- name: Set up JDK 17
uses: actions/setup-java@v3
with:
distribution: "zulu"
java-version: 17
- uses: gradle/gradle-build-action@v2
- name: Ensure packages are correct
run: |
python ./.github/scripts/check_python_package.py
@ -978,14 +984,14 @@ jobs:
if: failure()
run: |
docker ps -a
docker logs datahub-gms >& gms-${{ matrix.test_strategy }}.log || true
docker logs datahub-actions >& actions-${{ matrix.test_strategy }}.log || true
docker logs datahub-mae-consumer >& mae-${{ matrix.test_strategy }}.log || true
docker logs datahub-mce-consumer >& mce-${{ matrix.test_strategy }}.log || true
docker logs broker >& broker-${{ matrix.test_strategy }}.log || true
docker logs mysql >& mysql-${{ matrix.test_strategy }}.log || true
docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
docker logs datahub-datahub-gms-1 >& gms-${{ matrix.test_strategy }}.log || true
docker logs datahub-datahub-actions-1 >& actions-${{ matrix.test_strategy }}.log || true
docker logs datahub-datahub-mae-consumer-1 >& mae-${{ matrix.test_strategy }}.log || true
docker logs datahub-datahub-mce-consumer-1 >& mce-${{ matrix.test_strategy }}.log || true
docker logs datahub-broker-1 >& broker-${{ matrix.test_strategy }}.log || true
docker logs datahub-mysql-1 >& mysql-${{ matrix.test_strategy }}.log || true
docker logs datahub-elasticsearch-1 >& elasticsearch-${{ matrix.test_strategy }}.log || true
docker logs datahub-datahub-frontend-react-1 >& frontend-${{ matrix.test_strategy }}.log || true
- name: Upload logs
uses: actions/upload-artifact@v3
if: failure()

View File

@ -51,6 +51,15 @@ jobs:
java-version: 17
- uses: gradle/gradle-build-action@v2
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: "pip"
- uses: actions/cache@v4
with:
path: |
~/.cache/uv
key: ${{ runner.os }}-uv-${{ hashFiles('**/requirements.txt') }}
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

View File

@ -50,7 +50,7 @@ RUN apt-get update && apt-get install -y -qq \
ldap-utils \
unixodbc \
libodbc2 \
&& python -m pip install --no-cache --upgrade pip wheel setuptools \
# The uv specifier must be quoted: an unquoted `>` is treated by the shell as an output redirection.
&& python -m pip install --no-cache --upgrade pip 'uv>=0.1.10' wheel setuptools \
&& rm -rf /var/lib/apt/lists/* /var/cache/apk/*
# compiled against newer golang for security fixes
@ -59,16 +59,22 @@ COPY --from=dockerize-binary /go/bin/dockerize /usr/local/bin
COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt
COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh
RUN pip install --no-cache -r requirements.txt && \
pip uninstall -y acryl-datahub && \
chmod +x /entrypoint.sh && \
addgroup --gid 1000 datahub && \
adduser --disabled-password --uid 1000 --gid 1000 --home /datahub-ingestion datahub
RUN addgroup --gid 1000 datahub && \
adduser --disabled-password --uid 1000 --gid 1000 --home /datahub-ingestion datahub && \
chmod +x /entrypoint.sh
USER datahub
ENV VIRTUAL_ENV=/datahub-ingestion/.venv
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
RUN python3 -m venv $VIRTUAL_ENV && \
uv pip install --no-cache -r requirements.txt && \
pip uninstall -y acryl-datahub
ENTRYPOINT [ "/entrypoint.sh" ]
FROM ${BASE_IMAGE} as full-install
USER 0
RUN apt-get update && apt-get install -y -qq \
default-jre-headless \
&& rm -rf /var/lib/apt/lists/* /var/cache/apk/*
@ -91,10 +97,11 @@ RUN if [ $(arch) = "x86_64" ]; then \
ldconfig; \
fi;
USER datahub
FROM ${BASE_IMAGE} as slim-install
# Do nothing else on top of base
FROM ${APP_ENV}-install
USER datahub
ENV PATH="/datahub-ingestion/.local/bin:$PATH"

View File

@ -3,22 +3,20 @@ acryl-datahub-classify==0.0.9
acryl-PyHive==0.6.16
acryl-sqlglot==20.4.1.dev14
aenum==3.1.15
aiohttp==3.9.1
aiohttp==3.9.3
aiosignal==1.3.1
alembic==1.13.1
altair==4.2.0
annotated-types==0.6.0
anyio==3.7.1
anyio==4.3.0
apache-airflow==2.7.3
apache-airflow-providers-common-sql==1.9.0
apache-airflow-providers-common-sql==1.11.0
apache-airflow-providers-ftp==3.7.0
apache-airflow-providers-http==4.8.0
apache-airflow-providers-http==4.9.1
apache-airflow-providers-imap==3.5.0
apache-airflow-providers-sqlite==3.6.0
apispec==6.3.1
appdirs==1.4.4
appnope==0.1.3
argcomplete==3.2.1
apache-airflow-providers-sqlite==3.7.1
apispec==6.4.0
appnope==0.1.4
argcomplete==3.2.2
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
asgiref==3.7.2
@ -26,25 +24,24 @@ asn1crypto==1.5.1
asttokens==2.4.1
async-timeout==4.0.3
asynch==0.2.3
attrs==23.1.0
attrs==23.2.0
avro==1.11.3
avro-gen3==0.7.11
Babel==2.14.0
backoff==2.2.1
beautifulsoup4==4.12.2
beautifulsoup4==4.12.3
bleach==6.1.0
blinker==1.7.0
blis==0.7.11
boto3==1.34.8
botocore==1.34.8
bowler==0.9.0
boto3==1.34.49
botocore==1.34.49
bracex==2.4
cached-property==1.5.2
cachelib==0.9.0
cachetools==5.3.2
catalogue==2.0.10
cattrs==23.2.3
certifi==2023.11.17
certifi==2024.2.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
@ -53,33 +50,31 @@ click==8.1.7
click-default-group==1.2.4
click-spinner==0.1.10
clickclick==20.10.2
clickhouse-driver==0.2.6
clickhouse-driver==0.2.7
clickhouse-sqlalchemy==0.2.4
cloudpickle==3.0.0
colorama==0.4.6
colorlog==4.8.0
comm==0.2.0
comm==0.2.1
confection==0.1.4
ConfigUpdater==3.2
confluent-kafka==2.3.0
connexion==2.14.2
cron-descriptor==1.4.0
connexion==2.14.1
cron-descriptor==1.4.3
croniter==2.0.1
cryptography==41.0.7
cx-Oracle==8.3.0
cryptography==42.0.4
cx_Oracle==8.3.0
cymem==2.0.8
dask==2023.12.1
databricks-cli==0.18.0
databricks-dbapi==0.6.0
databricks-sdk==0.15.0
databricks-sql-connector==2.9.3
debugpy==1.8.0
databricks-sdk==0.20.0
databricks-sql-connector==2.9.4
debugpy==1.8.1
decorator==5.1.1
defusedxml==0.7.1
deltalake==0.14.0
deltalake==0.15.3
Deprecated==1.2.14
dill==0.3.7
dnspython==2.4.2
dill==0.3.8
dnspython==2.6.1
docker==7.0.0
docutils==0.20.1
ecdsa==0.18.0
@ -90,25 +85,22 @@ et-xmlfile==1.1.0
exceptiongroup==1.2.0
executing==2.0.1
expandvars==0.12.0
fastapi==0.109.1
fastavro==1.9.2
fastjsonschema==2.19.0
feast==0.31.1
fastavro==1.9.4
fastjsonschema==2.19.1
filelock==3.13.1
fissix==21.11.13
Flask==2.2.5
flatdict==4.0.1
frozenlist==1.4.1
fsspec==2023.12.2
future==0.18.3
GeoAlchemy2==0.14.3
future==1.0.0
GeoAlchemy2==0.14.4
gitdb==4.0.11
GitPython==3.1.41
google-api-core==2.15.0
google-auth==2.25.2
google-cloud-appengine-logging==1.4.0
GitPython==3.1.42
google-api-core==2.17.1
google-auth==2.28.1
google-cloud-appengine-logging==1.4.2
google-cloud-audit-log==0.2.5
google-cloud-bigquery==3.14.1
google-cloud-bigquery==3.17.2
google-cloud-core==2.4.1
google-cloud-datacatalog-lineage==0.2.2
google-cloud-logging==3.5.0
@ -116,98 +108,92 @@ google-crc32c==1.5.0
google-re2==1.1
google-resumable-media==2.7.0
googleapis-common-protos==1.62.0
gql==3.4.1
gql==3.5.0
graphql-core==3.2.3
graphviz==0.20.1
great-expectations==0.15.50
greenlet==3.0.3
grpc-google-iam-v1==0.13.0
grpcio==1.60.0
grpcio-reflection==1.60.0
grpcio-status==1.60.0
grpcio-tools==1.60.0
grpcio==1.62.0
grpcio-status==1.62.0
grpcio-tools==1.62.0
gssapi==1.8.3
gunicorn==21.2.0
h11==0.14.0
hdbcli==2.19.20
httpcore==1.0.2
httptools==0.6.1
httpx==0.26.0
hdbcli==2.19.21
httpcore==1.0.4
httpx==0.27.0
humanfriendly==10.0
idna==3.6
ijson==3.2.3
importlib-metadata==6.11.0
importlib-metadata==7.0.1
importlib-resources==6.1.1
inflection==0.5.1
ipaddress==1.0.23
ipykernel==6.17.1
ipython==8.19.0
ipython==8.21.0
ipython-genutils==0.2.0
ipywidgets==8.1.1
ipywidgets==8.1.2
iso3166==2.1.1
isodate==0.6.1
itsdangerous==2.1.2
jedi==0.19.1
Jinja2==3.1.2
Jinja2==3.1.3
jmespath==1.0.1
JPype1==1.5.0
jsonlines==4.0.0
jsonpatch==1.33
jsonpointer==2.4
jsonref==1.1.0
jsonschema==4.20.0
jsonschema==4.21.1
jsonschema-specifications==2023.12.1
jupyter-server==1.24.0
jupyter-server==1.16.0
jupyter_client==7.4.9
jupyter_core==4.12.0
jupyterlab-widgets==3.0.9
jupyter_core==5.0.0
jupyterlab_pygments==0.3.0
jupyterlab_widgets==3.0.10
langcodes==3.3.0
lark==1.1.4
lazy-object-proxy==1.10.0
leb128==1.0.5
limits==3.7.0
limits==3.9.0
linear-tsv==1.1.0
linkify-it-py==2.0.2
lkml==1.3.3
locket==1.0.0
linkify-it-py==2.0.3
lkml==1.3.4
lockfile==0.12.2
looker-sdk==23.0.0
lxml==4.9.4
lz4==4.3.2
lxml==5.1.0
lz4==4.3.3
makefun==1.15.2
Mako==1.3.0
Markdown==3.5.1
Mako==1.3.2
Markdown==3.5.2
markdown-it-py==3.0.0
MarkupSafe==2.1.3
marshmallow==3.20.1
marshmallow-oneofschema==3.0.1
MarkupSafe==2.1.5
marshmallow==3.20.2
marshmallow-oneofschema==3.1.1
marshmallow-sqlalchemy==0.26.1
matplotlib-inline==0.1.6
mdit-py-plugins==0.4.0
mdurl==0.1.2
mistune==3.0.2
mixpanel==4.10.0
mlflow-skinny==2.9.2
mmh3==4.0.1
mlflow-skinny==2.10.2
mmhash3==3.0.1
more-itertools==10.1.0
moreorless==0.4.0
moto==4.2.12
more-itertools==10.2.0
moto==4.2.14
msal==1.22.0
multidict==6.0.4
multidict==6.0.5
murmurhash==1.0.10
mypy==1.8.0
mypy-extensions==1.0.0
nbclassic==1.0.0
nbclient==0.6.3
nbconvert==7.13.1
nbformat==5.9.1
nest-asyncio==1.5.8
nbconvert==7.16.1
nbformat==5.9.2
nest-asyncio==1.6.0
networkx==3.2.1
notebook==6.5.6
notebook_shim==0.2.3
numpy==1.26.2
notebook_shim==0.2.4
numpy==1.26.4
oauthlib==3.2.2
okta==1.7.0
openlineage-airflow==1.2.0
@ -215,133 +201,127 @@ openlineage-integration-common==1.2.0
openlineage-python==1.2.0
openlineage_sql==1.2.0
openpyxl==3.1.2
opentelemetry-api==1.22.0
opentelemetry-exporter-otlp==1.22.0
opentelemetry-exporter-otlp-proto-common==1.22.0
opentelemetry-exporter-otlp-proto-grpc==1.22.0
opentelemetry-exporter-otlp-proto-http==1.22.0
opentelemetry-proto==1.22.0
opentelemetry-sdk==1.22.0
opentelemetry-semantic-conventions==0.43b0
opentelemetry-api==1.16.0
opentelemetry-exporter-otlp==1.16.0
opentelemetry-exporter-otlp-proto-grpc==1.16.0
opentelemetry-exporter-otlp-proto-http==1.16.0
opentelemetry-proto==1.16.0
opentelemetry-sdk==1.16.0
opentelemetry-semantic-conventions==0.37b0
ordered-set==4.1.0
packaging==23.2
pandas==1.5.3
pandavro==1.5.2
pandocfilters==1.5.0
parse==1.20.0
pandas==2.2.1
pandocfilters==1.5.1
parse==1.20.1
parso==0.8.3
partd==1.4.1
pathlib_abc==0.1.1
pathspec==0.12.1
pathy==0.10.3
pathy==0.11.0
pendulum==2.1.2
pexpect==4.9.0
phonenumbers==8.13.0
platformdirs==3.11.0
pluggy==1.3.0
pluggy==1.4.0
preshed==3.0.9
prison==0.2.1
progressbar2==4.3.2
prometheus-client==0.19.0
prometheus_client==0.20.0
prompt-toolkit==3.0.43
proto-plus==1.23.0
protobuf==4.25.1
psutil==5.9.7
protobuf==4.25.3
psutil==5.9.8
psycopg2-binary==2.9.9
ptyprocess==0.7.0
pure-eval==0.2.2
pure-sasl==0.6.2
py-partiql-parser==0.5.0
pyarrow==11.0.0
pyarrow==12.0.1
pyarrow-hotfix==0.6
pyasn1==0.5.1
pyasn1-modules==0.3.0
pyathena==2.25.2
pycountry==23.12.11
pycparser==2.21
pycryptodome==3.19.0
pydantic==1.10.13
pydantic_core==2.14.6
pydash==7.0.6
pycryptodome==3.20.0
pydantic==1.10.14
pydash==7.0.7
pydruid==0.6.6
Pygments==2.17.2
pyiceberg==0.4.0
pymongo==4.6.1
pymongo==4.6.2
PyMySQL==1.1.0
pyOpenSSL==23.3.0
pyOpenSSL==24.0.0
pyparsing==3.0.9
pyspnego==0.10.2
python-daemon==3.0.1
python-dateutil==2.8.2
python-dotenv==1.0.0
python-jose==3.3.0
python-ldap==3.4.4
python-nvd3==0.15.0
python-slugify==8.0.1
python-slugify==8.0.4
python-stdnum==1.19
python-tds==1.14.0
python-utils==3.8.1
python3-openid==3.2.0
pytz==2023.3.post1
python-tds==1.15.0
python-utils==3.8.2
pytz==2023.4
pytzdata==2020.1
PyYAML==6.0.1
pyzmq==24.0.1
redash-toolbelt==0.1.9
redshift-connector==2.0.918
referencing==0.32.0
redshift-connector==2.1.0
referencing==0.33.0
regex==2023.12.25
requests==2.31.0
requests-file==1.5.1
requests-gssapi==1.2.3
requests-file==2.0.0
requests-gssapi==1.3.0
requests-ntlm==1.2.0
requests-toolbelt==0.10.1
responses==0.24.1
requests-toolbelt==1.0.0
responses==0.25.0
rfc3339-validator==0.1.4
rfc3986==2.0.0
rich==13.7.0
rich-argparse==1.4.0
rpds-py==0.15.2
rpds-py==0.18.0
rsa==4.9
ruamel.yaml==0.17.17
ruamel.yaml.clib==0.2.8
s3transfer==0.10.0
schwifty==2023.11.2
scipy==1.11.4
schwifty==2024.1.1.post0
scipy==1.12.0
scramp==1.4.4
Send2Trash==1.8.2
sentry-sdk==1.39.1
sentry-sdk==1.40.5
setproctitle==1.3.3
simple-salesforce==1.12.5
six==1.16.0
slack-sdk==3.18.1
smart-open==6.4.0
smmap==5.0.1
sniffio==1.3.0
snowflake-connector-python==3.6.0
snowflake-connector-python==3.7.1
snowflake-sqlalchemy==1.5.1
sortedcontainers==2.4.0
soupsieve==2.5
spacy==3.4.3
spacy==3.5.0
spacy-legacy==3.0.12
spacy-loggers==1.0.5
sql-metadata==2.2.2
SQLAlchemy==1.4.44
sqlalchemy-bigquery==1.9.0
sqlalchemy-hana==1.1.1
sqlalchemy-hana==1.3.0
SQLAlchemy-JSONField==1.0.2
sqlalchemy-pytds==0.3.5
sqlalchemy-redshift==0.8.14
SQLAlchemy-Utils==0.41.1
sqlalchemy2-stubs==0.0.2a37
sqllineage==1.3.8
sqlparse==0.4.4
srsly==2.4.8
stack-data==0.6.3
starlette==0.32.0.post1
strictyaml==1.7.3
tableauserverclient==0.25
tableschema==1.20.2
tabulate==0.9.0
tabulator==1.53.5
tenacity==8.2.3
teradatasql==20.0.0.2
teradatasql==20.0.0.7
teradatasqlalchemy==17.20.0.0
termcolor==2.4.0
terminado==0.18.0
@ -351,38 +331,31 @@ thrift==0.16.0
thrift-sasl==0.4.3
tinycss2==1.2.1
toml==0.10.2
tomli==2.0.1
tomlkit==0.12.3
toolz==0.12.0
toolz==0.12.1
tornado==6.4
tqdm==4.66.1
tqdm==4.66.2
traitlets==5.2.1.post0
trino==0.327.0
typeguard==2.13.3
trino==0.328.0
typer==0.7.0
typing-inspect==0.9.0
typing_extensions==4.9.0
tzdata==2024.1
tzlocal==5.2
uc-micro-py==1.0.2
uc-micro-py==1.0.3
ujson==5.9.0
unicodecsv==0.14.1
universal-pathlib==0.1.4
urllib3==1.26.18
uvicorn==0.25.0
uvloop==0.19.0
vertica-python==1.3.8
vertica-sqlalchemy-dialect==0.0.8.1
vininfo==1.7.0
volatile==2.1.0
wasabi==0.10.1
watchfiles==0.21.0
wcmatch==8.5
wcwidth==0.2.12
vininfo==1.8.0
wasabi==1.1.2
wcmatch==8.5.1
wcwidth==0.2.13
webencodings==0.5.1
websocket-client==1.7.0
websockets==12.0
Werkzeug==2.2.3
widgetsnbextension==4.0.9
Werkzeug==2.3.8
widgetsnbextension==4.0.10
wrapt==1.16.0
WTForms==3.0.1
xlrd==2.0.1

View File

@ -1,10 +1,10 @@
#!/usr/bin/bash
if [ ! -z "$ACTIONS_EXTRA_PACKAGES" ]; then
pip install --user $ACTIONS_EXTRA_PACKAGES
if [ -n "$ACTIONS_EXTRA_PACKAGES" ]; then
uv pip install $ACTIONS_EXTRA_PACKAGES
fi
if [[ ! -z "$ACTIONS_CONFIG" && ! -z "$ACTIONS_EXTRA_PACKAGES" ]]; then
if [[ -n "$ACTIONS_CONFIG" && -n "$ACTIONS_EXTRA_PACKAGES" ]]; then
mkdir -p /tmp/datahub/logs
curl -q "$ACTIONS_CONFIG" -o config.yaml
exec dockerize -wait ${DATAHUB_GMS_PROTOCOL:-http}://$DATAHUB_GMS_HOST:$DATAHUB_GMS_PORT/health -timeout 240s \

View File

@ -13,14 +13,12 @@ VENV_DIR=$(mktemp -d)
python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9 or higher is required.'"
python -m venv $VENV_DIR
source $VENV_DIR/bin/activate
pip install --upgrade pip setuptools wheel
pip install --upgrade pip uv setuptools wheel
echo "Using virtualenv at $VENV_DIR"
# Install stuff.
pushd $DATAHUB_DIR/metadata-ingestion
pip install -e .
pip install -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]'
pip install -e '.[all]'
uv pip install -e '.[all]' -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]'
popd
# Generate the requirements file.
@ -31,6 +29,7 @@ popd
echo "# Generated requirements file. Run ./$SCRIPT_NAME to regenerate." > base-requirements.txt
pip freeze \
| grep -v -E "^-e" \
| grep -v -E "^uv==" \
| grep -v "Flask-" \
| grep -v -E "(py4j|PyJWT)==" \
| grep -v -E "(pyspark|pydeequ)==" \

View File

@ -6,26 +6,23 @@ ARG PIP_MIRROR_URL=null
ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
FROM $BASE_IMAGE:$DOCKER_VERSION as base
USER 0
USER datahub
COPY ./metadata-ingestion /datahub-ingestion
COPY ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin
COPY --chown=datahub ./metadata-ingestion /datahub-ingestion
COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin
ARG RELEASE_VERSION
WORKDIR /datahub-ingestion
RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \
sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/__init__.py && \
cat src/datahub/__init__.py && \
chown -R datahub /datahub-ingestion
USER datahub
ENV PATH="/datahub-ingestion/.local/bin:$PATH"
cat src/datahub/__init__.py | grep __version__ && \
cat airflow-plugin/src/datahub_airflow_plugin/__init__.py | grep __version__
FROM base as slim-install
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN pip install --no-cache --user ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]"
# uv does not read pip's configuration file, so the mirror set via `pip config` is
# passed to uv explicitly through UV_INDEX_URL when a mirror was requested.
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then export UV_INDEX_URL="${PIP_MIRROR_URL}" ; fi && \
    uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ."
FROM base as full-install-build
ARG PIP_MIRROR_URL
@ -39,14 +36,13 @@ USER datahub
COPY ./docker/datahub-ingestion/pyspark_jars.sh .
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN pip install --no-cache --user ".[base]" && \
pip install --no-cache --user "./airflow-plugin[acryl-datahub-airflow-plugin]" && \
pip install --no-cache --user ".[all]"
# uv does not read pip's configuration file, so the mirror set via `pip config` is
# passed to uv explicitly through UV_INDEX_URL when a mirror was requested.
# `datahub --version` is run afterwards as a smoke test of the installed CLI.
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then export UV_INDEX_URL="${PIP_MIRROR_URL}" ; fi && \
    uv pip install --no-cache "acryl-datahub[base,all] @ ." "acryl-datahub-airflow-plugin[plugin-v2] @ ./airflow-plugin" && \
    datahub --version
RUN ./pyspark_jars.sh
FROM base as full-install
COPY --from=full-install-build /datahub-ingestion/.local /datahub-ingestion/.local
COPY --from=full-install-build ${VIRTUAL_ENV} ${VIRTUAL_ENV}
FROM base as dev-install
# Dummy stage for development. Assumes code is built on your machine and mounted to this image.
@ -55,4 +51,3 @@ FROM base as dev-install
FROM ${APP_ENV}-install as final
USER datahub
ENV PATH="/datahub-ingestion/.local/bin:$PATH"

View File

@ -15,16 +15,15 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS
chown -R datahub /datahub-ingestion
USER datahub
ENV PATH="/datahub-ingestion/.local/bin:$PATH"
FROM base as slim-install
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN pip install --no-cache --user ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]"
# uv does not read pip's configuration file, so the mirror set via `pip config` is
# passed to uv explicitly through UV_INDEX_URL when a mirror was requested.
# `datahub --version` is run afterwards as a smoke test of the installed CLI.
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then export UV_INDEX_URL="${PIP_MIRROR_URL}" ; fi && \
    uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ." && \
    datahub --version
FROM slim-install as final
USER datahub
ENV PATH="/datahub-ingestion/.local/bin:$PATH"

View File

@ -2,7 +2,7 @@
set -ex
PYSPARK_JARS="$(python -m site --user-site)/pyspark/jars"
PYSPARK_JARS="$(python -c 'import site; print(site.getsitepackages()[0])')/pyspark/jars"
function replace_jar {
JAR_PREFIX=$1

View File

@ -18,7 +18,7 @@ if (extra_pip_extras != "") {
ext.extra_pip_extras = "," + extra_pip_extras
}
def pip_install_command = "${venv_name}/bin/pip install -e ../../metadata-ingestion"
def pip_install_command = "VIRTUAL_ENV=${venv_name} ${venv_name}/bin/uv pip install -e ../../metadata-ingestion"
task checkPythonVersion(type: Exec) {
commandLine python_executable, '-c', 'import sys; assert sys.version_info >= (3, 7)'
@ -29,8 +29,8 @@ task environmentSetup(type: Exec, dependsOn: checkPythonVersion) {
inputs.file file('setup.py')
outputs.file(sentinel_file)
commandLine 'bash', '-c',
"${python_executable} -m venv ${venv_name} &&" +
"${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " +
"${python_executable} -m venv ${venv_name} && set -x && " +
"${venv_name}/bin/python -m pip install --upgrade pip uv wheel 'setuptools>=63.0.0' && " +
"touch ${sentinel_file}"
}
@ -41,8 +41,9 @@ task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingesti
// Workaround for https://github.com/yaml/pyyaml/issues/601.
// See https://github.com/yaml/pyyaml/issues/601#issuecomment-1638509577.
// and https://github.com/datahub-project/datahub/pull/8435.
commandLine 'bash', '-x', '-c',
"${pip_install_command} install 'Cython<3.0' 'PyYAML<6' --no-build-isolation && " +
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"pip install 'Cython<3.0' 'PyYAML<6' --no-build-isolation && " +
"${pip_install_command} -e .[ignore${extra_pip_extras}] ${extra_pip_requirements} &&" +
"touch ${sentinel_file}"
}
@ -53,7 +54,8 @@ task installDev(type: Exec, dependsOn: [install]) {
def sentinel_file = "${venv_name}/.build_install_dev_sentinel"
inputs.file file('setup.py')
outputs.file("${sentinel_file}")
commandLine 'bash', '-x', '-c',
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"${pip_install_command} -e .[dev${extra_pip_extras}] ${extra_pip_requirements} && " +
"touch ${sentinel_file}"
}
@ -72,8 +74,8 @@ task lint(type: Exec, dependsOn: installDev) {
"mypy --show-traceback --show-error-codes src/ tests/"
}
task lintFix(type: Exec, dependsOn: installDev) {
commandLine 'bash', '-x', '-c',
"source ${venv_name}/bin/activate && " +
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"black src/ tests/ && " +
"isort src/ tests/ && " +
"flake8 src/ tests/ && " +
@ -85,30 +87,18 @@ task installDevTest(type: Exec, dependsOn: [installDev]) {
inputs.file file('setup.py')
outputs.dir("${venv_name}")
outputs.file("${sentinel_file}")
commandLine 'bash', '-x', '-c',
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"${pip_install_command} -e .[dev,integration-tests${extra_pip_extras}] ${extra_pip_requirements} && " +
"touch ${sentinel_file}"
}
def testFile = hasProperty('testFile') ? testFile : 'unknown'
task testSingle(dependsOn: [installDevTest]) {
doLast {
if (testFile != 'unknown') {
exec {
commandLine 'bash', '-x', '-c',
"source ${venv_name}/bin/activate && pytest ${testFile}"
}
} else {
throw new GradleException("No file provided. Use -PtestFile=<test_file>")
}
}
}
task testQuick(type: Exec, dependsOn: installDevTest) {
inputs.files(project.fileTree(dir: "src/", include: "**/*.py"))
inputs.files(project.fileTree(dir: "tests/"))
commandLine 'bash', '-x', '-c',
"source ${venv_name}/bin/activate && pytest --cov-config=setup.cfg --cov-report xml:coverage_quick.xml -vv --continue-on-collection-errors --junit-xml=junit.quick.xml"
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"pytest --cov-config=setup.cfg --cov-report xml:coverage_quick.xml -vv --continue-on-collection-errors --junit-xml=junit.quick.xml"
}
@ -117,7 +107,9 @@ task cleanPythonCache(type: Exec) {
"find src -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete"
}
task buildWheel(type: Exec, dependsOn: [install, cleanPythonCache]) {
commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh'
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
'pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh'
}
build.dependsOn install

View File

@ -15,7 +15,9 @@ def get_long_description():
_version: str = package_metadata["__version__"]
_self_pin = f"=={_version}" if not _version.endswith("dev0") else ""
_self_pin = (
f"=={_version}" if not (_version.endswith("dev0") or "docker" in _version) else ""
)
base_requirements = {
@ -163,6 +165,7 @@ setuptools.setup(
# Dependencies.
install_requires=list(base_requirements),
extras_require={
"ignore": [], # This is a dummy extra to allow for trailing commas in the list.
**{plugin: list(dependencies) for plugin, dependencies in plugins.items()},
"dev": list(dev_requirements),
"integration-tests": list(integration_test_requirements),

View File

@ -26,7 +26,7 @@ task environmentSetup(type: Exec, dependsOn: checkPythonVersion) {
outputs.file(sentinel_file)
commandLine 'bash', '-c',
"${python_executable} -m venv ${venv_name} && " +
"${venv_name}/bin/python -m pip install --upgrade pip wheel 'setuptools>=63.0.0' && " +
"${venv_name}/bin/python -m pip install --upgrade pip uv wheel 'setuptools>=63.0.0' && " +
"touch ${sentinel_file}"
}
@ -43,7 +43,8 @@ task installPackageOnly(type: Exec, dependsOn: runPreFlightScript) {
inputs.file file('setup.py')
outputs.file(sentinel_file)
commandLine 'bash', '-x', '-c',
"${venv_name}/bin/pip install -e . &&" +
"source ${venv_name}/bin/activate && " +
"uv pip install -e . &&" +
"touch ${sentinel_file}"
}
@ -52,7 +53,8 @@ task installPackage(type: Exec, dependsOn: installPackageOnly) {
inputs.file file('setup.py')
outputs.file(sentinel_file)
commandLine 'bash', '-x', '-c',
"${venv_name}/bin/pip install -e . ${extra_pip_requirements} && " +
"source ${venv_name}/bin/activate && " +
"uv pip install -e . ${extra_pip_requirements} && " +
"touch ${sentinel_file}"
}
@ -70,7 +72,7 @@ task customPackageGenerate(type: Exec, dependsOn: [environmentSetup, installPack
def package_version = project.findProperty('package_version')
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && " +
"pip install build && " +
"uv pip install build && " +
"./scripts/custom_package_codegen.sh '${package_name}' '${package_version}'"
}
@ -82,7 +84,7 @@ task installDev(type: Exec, dependsOn: [install]) {
outputs.file(sentinel_file)
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"${venv_name}/bin/pip install -e .[dev] ${extra_pip_requirements} && " +
"uv pip install -e .[dev] ${extra_pip_requirements} && " +
"touch ${sentinel_file}"
}
@ -92,7 +94,7 @@ task installAll(type: Exec, dependsOn: [install]) {
outputs.file(sentinel_file)
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"${venv_name}/bin/pip install -e .[all] ${extra_pip_requirements} && " +
"uv pip install -e .[all] ${extra_pip_requirements} && " +
"touch ${sentinel_file}"
}
@ -142,7 +144,8 @@ task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJso
outputs.dir("${venv_name}")
def cvg_arg = get_coverage_arg("quick")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && pytest ${cvg_arg} tests/unit --random-order --durations=20 -m 'not integration' -vv --continue-on-collection-errors --junit-xml=junit.quick.xml"
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} tests/unit --random-order --durations=20 -m 'not integration' -vv --continue-on-collection-errors --junit-xml=junit.quick.xml"
}
task installDevTest(type: Exec, dependsOn: [install]) {
@ -151,7 +154,9 @@ task installDevTest(type: Exec, dependsOn: [install]) {
outputs.dir("${venv_name}")
outputs.file(sentinel_file)
commandLine 'bash', '-c',
"${venv_name}/bin/pip install -e .[dev,integration-tests] ${extra_pip_requirements} && touch ${sentinel_file}"
"source ${venv_name}/bin/activate && set -x && " +
"uv pip install -e .[dev,integration-tests] ${extra_pip_requirements} && " +
"touch ${sentinel_file}"
}
def testFile = hasProperty('testFile') ? testFile : 'unknown'
@ -171,22 +176,26 @@ task testSingle(dependsOn: [installDevTest]) {
task testIntegrationBatch0(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_arg("intBatch0")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=50 -m 'integration_batch_0' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch0.xml"
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} --durations=50 -m 'integration_batch_0' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch0.xml"
}
task testIntegrationBatch1(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_arg("intBatch1")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=50 -m 'integration_batch_1' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch1.xml"
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} --durations=50 -m 'integration_batch_1' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch1.xml"
}
task testIntegrationBatch2(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_arg("intBatch2")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && pytest ${cvg_arg} --durations=20 -m 'integration_batch_2' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch2.xml"
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} --durations=20 -m 'integration_batch_2' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch2.xml"
}
task testFull(type: Exec, dependsOn: [installDevTest]) {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && pytest --durations=50 -vv --continue-on-collection-errors --junit-xml=junit.full.xml"
"source ${venv_name}/bin/activate && set -x && " +
"pytest --durations=50 -vv --continue-on-collection-errors --junit-xml=junit.full.xml"
}
task specGen(type: Exec, dependsOn: [codegen, installDevTest]) {
@ -203,7 +212,7 @@ task cleanPythonCache(type: Exec) {
"find src tests -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete"
}
task buildWheel(type: Exec, dependsOn: [install, codegen, cleanPythonCache]) {
commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh'
commandLine 'bash', '-c', "source ${venv_name}/bin/activate && " + 'uv pip install build && RELEASE_VERSION="\${RELEASE_VERSION:-0.0.0.dev1}" RELEASE_SKIP_TEST=1 RELEASE_SKIP_UPLOAD=1 ./scripts/release.sh'
}
build.dependsOn install

View File

@ -6,6 +6,10 @@ package_metadata: dict = {}
with open("./src/datahub/__init__.py") as fp:
exec(fp.read(), package_metadata)
_version: str = package_metadata["__version__"]
_self_pin = (
f"=={_version}" if not (_version.endswith("dev0") or "docker" in _version) else ""
)
base_requirements = {
# Typing extension should be >=3.10.0.2 ideally but we can't restrict due to an Airflow 2.1 dependency conflict.
@ -17,7 +21,7 @@ base_requirements = {
# pydantic 1.10.3 is incompatible with typing-extensions 4.1.1 - https://github.com/pydantic/pydantic/issues/4885
"pydantic>=1.10.0,!=1.10.3",
"mixpanel>=4.9.0",
"sentry-sdk",
"sentry-sdk>=1.40.5",
}
framework_common = {
@ -272,7 +276,7 @@ plugins: Dict[str, Set[str]] = {
},
# Integrations.
"airflow": {
f"acryl-datahub-airflow-plugin == {package_metadata['__version__']}",
f"acryl-datahub-airflow-plugin{_self_pin}",
},
"circuit-breaker": {
"gql>=3.3.0",
@ -398,12 +402,18 @@ plugins: Dict[str, Set[str]] = {
# This is mainly used to exclude plugins from the Docker image.
all_exclude_plugins: Set[str] = {
# The Airflow extra is only retained for compatibility, but new users should
# be using the datahub-airflow-plugin package instead.
"airflow",
# SQL Server ODBC requires additional drivers, and so we don't want to keep
# it included in the default "all" installation.
"mssql-odbc",
# duckdb doesn't have a prebuilt wheel for Linux arm7l or aarch64, so we
# simply exclude it.
"datahub-lite",
# Feast tends to have overly restrictive dependencies and hence doesn't
# play nice with the "all" installation.
"feast",
}
mypy_stubs = {
@ -678,7 +688,7 @@ entry_points = {
setuptools.setup(
# Package metadata.
name=package_metadata["__package_name__"],
version=package_metadata["__version__"],
version=_version,
url="https://datahubproject.io/",
project_urls={
"Documentation": "https://datahubproject.io/docs/",

View File

@ -49,10 +49,12 @@ task installDev(type: Exec) {
inputs.file file('pyproject.toml')
inputs.file file('requirements.txt')
outputs.file("${venv_name}/.build_install_dev_sentinel")
commandLine 'bash', '-x', '-c',
commandLine 'bash', '-c',
"set -x && " +
"${python_executable} -m venv ${venv_name} && " +
"${venv_name}/bin/pip install --upgrade pip wheel setuptools && " +
"${venv_name}/bin/pip install -r requirements.txt && " +
"${venv_name}/bin/python -m pip install --upgrade pip uv wheel setuptools && " +
"set +x && source ${venv_name}/bin/activate && set -x && " +
"uv pip install -r requirements.txt && " +
"touch ${venv_name}/.build_install_dev_sentinel"
}