feat(build): remove base-requirements.txt (#11238)

Co-authored-by: David Leifker <david.leifker@acryl.io>
Harshal Sheth 2024-08-29 20:34:00 -07:00 committed by GitHub
parent 9d84872c47
commit c4bc34f9ff
10 changed files with 119 additions and 481 deletions

View File

@@ -57,7 +57,8 @@ runs:
- "metadata-ingestion-modules/**"
- "metadata-ingestion/**"
- "metadata-models/**"
- "docker/datahub-ingestion**"
- "docker/datahub-ingestion-base/**"
- "docker/datahub-ingestion/**"
ingestion-base:
- "docker/datahub-ingestion-base/**"
docker:

View File

@@ -31,16 +31,21 @@ inputs:
description: "Main tag to use for the Docker image"
required: true
flavor:
description: 'Image flavor (e.g., slim, full)'
description: "Image flavor (e.g., slim, full)"
required: false
target:
description: "Sets the target stage to build"
required: false
depot-project:
# Setting this will use native arm64 docker builds instead of QEMU emulation.
# This speeds up builds by 2-3x.
description: "Depot project id"
required: false
outputs:
image_tag:
description: "Docker image tags"
value: ${{ steps.docker_meta.outputs.tags }}
# image_name: ${{ env.DATAHUB_GMS_IMAGE }}
runs:
using: "composite"
@@ -58,9 +63,22 @@ runs:
type=raw,value=head,suffix=${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }},enable={{is_default_branch}}
type=sha,prefix=,format=short,suffix=${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}
- name: Single Tag
id: single_tag
shell: bash
run: |
IMAGES="""
${{ inputs.images }}
"""
TAGS="""
${{ inputs.image_tag }}
"""
echo "SINGLE_IMAGE=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> "$GITHUB_OUTPUT"
echo "SINGLE_TAG=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }'):$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> "$GITHUB_OUTPUT"
# Code for testing the build when not pushing to Docker Hub.
- name: Build and Load image for testing (if not publishing)
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
if: ${{ inputs.publish != 'true' }}
with:
context: ${{ inputs.context }}
@@ -73,20 +91,11 @@ runs:
target: ${{ inputs.target }}
load: true
push: false
cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }}
cache-to: type=inline
- name: Single Tag
if: ${{ inputs.publish != 'true' }}
shell: bash
run: |
IMAGES="""
${{ inputs.images }}
"""
TAGS="""
${{ inputs.image_tag }}
"""
echo "SINGLE_TAG=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }'):$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> $GITHUB_OUTPUT
id: single_tag
cache-from: |
type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}
type=registry,ref=${{ steps.docker_meta.outputs.tags }}
cache-to: |
type=inline
- name: Upload image locally for testing (if not publishing)
uses: ishworkh/docker-image-artifact-upload@v1
if: ${{ inputs.publish != 'true' }}
@@ -96,19 +105,42 @@ runs:
# Code for building multi-platform images and pushing to Docker Hub.
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
if: ${{ inputs.publish == 'true' }}
if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
if: ${{ inputs.publish == 'true' }}
if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }}
- name: Setup Depot CLI
uses: depot/setup-action@v1
if: ${{ inputs.publish == 'true' && inputs.depot-project != '' }}
- name: Login to DockerHub
uses: docker/login-action@v3
if: ${{ inputs.publish == 'true' }}
with:
username: ${{ inputs.username }}
password: ${{ inputs.password }}
# Depot variant.
- name: Build and Push Multi-Platform image
uses: docker/build-push-action@v5
if: ${{ inputs.publish == 'true' }}
uses: depot/build-push-action@v1
if: ${{ inputs.publish == 'true' && inputs.depot-project != '' }}
with:
project: ${{ inputs.depot-project }}
context: ${{ inputs.context }}
file: ${{ inputs.file }}
platforms: ${{ inputs.platforms }}
build-args: ${{ inputs.build-args }}
tags: ${{ steps.docker_meta.outputs.tags }}
target: ${{ inputs.target }}
push: true
cache-from: |
type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}
type=registry,ref=${{ steps.docker_meta.outputs.tags }}
cache-to: |
type=inline
- name: Build and Push Multi-Platform image
uses: docker/build-push-action@v6
if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }}
with:
context: ${{ inputs.context }}
file: ${{ inputs.file }}
@@ -117,7 +149,10 @@ runs:
tags: ${{ steps.docker_meta.outputs.tags }}
target: ${{ inputs.target }}
push: true
cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }}
cache-to: type=inline
cache-from: |
type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}
type=registry,ref=${{ steps.docker_meta.outputs.tags }}
cache-to: |
type=inline
# TODO add code for vuln scanning?
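For reference, a workflow opts into Depot's native arm64 builders simply by passing the new input; a sketch of a calling step (the action path is an assumption here, while the vars.DEPOT_PROJECT_ID wiring matches the workflow changes below):

- name: Build and push via Depot when a project id is configured
  uses: ./.github/actions/docker-custom-build-and-push   # assumed action path
  with:
    images: acryldata/datahub-ingestion-base
    image_tag: head
    platforms: linux/amd64,linux/arm64/v8
    publish: "true"
    depot-project: ${{ vars.DEPOT_PROJECT_ID }}   # empty value falls back to QEMU + buildx

With either builder, the registry cache is now seeded from the stable head image in addition to the current tags.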

View File

@@ -33,6 +33,10 @@ env:
DATAHUB_INGESTION_BASE_IMAGE: "acryldata/datahub-ingestion-base"
DATAHUB_INGESTION_IMAGE: "acryldata/datahub-ingestion"
permissions:
contents: read
id-token: write
jobs:
setup:
runs-on: ubuntu-latest
@@ -68,23 +72,23 @@ jobs:
id: tag
run: |
source .github/scripts/docker_helpers.sh
echo "short_sha=${SHORT_SHA}" >> $GITHUB_OUTPUT
echo "tag=$(get_tag)" >> $GITHUB_OUTPUT
echo "slim_tag=$(get_tag_slim)" >> $GITHUB_OUTPUT
echo "full_tag=$(get_tag_full)" >> $GITHUB_OUTPUT
echo "unique_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT
echo "unique_slim_tag=$(get_unique_tag_slim)" >> $GITHUB_OUTPUT
echo "unique_full_tag=$(get_unique_tag_full)" >> $GITHUB_OUTPUT
echo "python_release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT
echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT
echo "repository_name=${GITHUB_REPOSITORY#*/}" >> $GITHUB_OUTPUT
echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
echo "tag=$(get_tag)" >> "$GITHUB_OUTPUT"
echo "slim_tag=$(get_tag_slim)" >> "$GITHUB_OUTPUT"
echo "full_tag=$(get_tag_full)" >> "$GITHUB_OUTPUT"
echo "unique_tag=$(get_unique_tag)" >> "$GITHUB_OUTPUT"
echo "unique_slim_tag=$(get_unique_tag_slim)" >> "$GITHUB_OUTPUT"
echo "unique_full_tag=$(get_unique_tag_full)" >> "$GITHUB_OUTPUT"
echo "python_release_version=$(get_python_docker_release_v)" >> "$GITHUB_OUTPUT"
echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> "$GITHUB_OUTPUT"
echo "repository_name=${GITHUB_REPOSITORY#*/}" >> "$GITHUB_OUTPUT"
- name: Check whether docker login is possible
id: docker-login
env:
ENABLE_DOCKER_LOGIN: ${{ secrets.ACRYL_DOCKER_PASSWORD != '' }}
run: |
echo "Enable Docker Login: ${{ env.ENABLE_DOCKER_LOGIN }}"
echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> $GITHUB_OUTPUT
echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> "$GITHUB_OUTPUT"
- name: Check whether publishing enabled
id: publish
env:
@@ -95,7 +99,7 @@ jobs:
}}
run: |
echo "Enable publish: ${{ env.ENABLE_PUBLISH }}"
echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT
echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
- name: Check whether PR publishing enabled
id: pr-publish
env:
@@ -106,7 +110,7 @@ jobs:
}}
run: |
echo "Enable PR publish: ${{ env.ENABLE_PUBLISH }}"
echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT
echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
- uses: ./.github/actions/ci-optimization
id: ci-optimize
- uses: actions/setup-python@v5
@@ -543,9 +547,10 @@ jobs:
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
platforms: linux/amd64,linux/arm64/v8
depot-project: ${{ vars.DEPOT_PROJECT_ID }}
- name: Compute DataHub Ingestion (Base) Tag
id: tag
run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> $GITHUB_OUTPUT
run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> "$GITHUB_OUTPUT"
datahub_ingestion_base_slim_build:
name: Build and Push DataHub Ingestion (Base-Slim) Docker Image
runs-on: ubuntu-latest
@@ -585,9 +590,10 @@ jobs:
context: .
file: ./docker/datahub-ingestion-base/Dockerfile
platforms: linux/amd64,linux/arm64/v8
depot-project: ${{ vars.DEPOT_PROJECT_ID }}
- name: Compute DataHub Ingestion (Base-Slim) Tag
id: tag
run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT
run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> "$GITHUB_OUTPUT"
datahub_ingestion_base_full_build:
name: Build and Push DataHub Ingestion (Base-Full) Docker Image
runs-on: ubuntu-latest
@@ -628,7 +634,7 @@ jobs:
platforms: linux/amd64,linux/arm64/v8
- name: Compute DataHub Ingestion (Base-Full) Tag
id: tag
run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT
run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> "$GITHUB_OUTPUT"
datahub_ingestion_slim_build:
name: Build and Push DataHub Ingestion Docker Images
@@ -681,9 +687,10 @@ jobs:
context: .
file: ./docker/datahub-ingestion/Dockerfile
platforms: linux/amd64,linux/arm64/v8
depot-project: ${{ vars.DEPOT_PROJECT_ID }}
- name: Compute Tag
id: tag
run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT
run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> "$GITHUB_OUTPUT"
datahub_ingestion_slim_scan:
permissions:
contents: read # for actions/checkout to fetch code
@@ -713,6 +720,7 @@ jobs:
severity: "CRITICAL,HIGH"
ignore-unfixed: true
vuln-type: "os,library"
timeout: 15m
- name: Upload Trivy scan results to GitHub Security tab
uses: github/codeql-action/upload-sarif@v2
with:
@@ -767,9 +775,10 @@ jobs:
context: .
file: ./docker/datahub-ingestion/Dockerfile
platforms: linux/amd64,linux/arm64/v8
depot-project: ${{ vars.DEPOT_PROJECT_ID }}
- name: Compute Tag (Full)
id: tag
run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> $GITHUB_OUTPUT
run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> "$GITHUB_OUTPUT"
datahub_ingestion_full_scan:
permissions:
contents: read # for actions/checkout to fetch code
@@ -799,6 +808,7 @@ jobs:
severity: "CRITICAL,HIGH"
ignore-unfixed: true
vuln-type: "os,library"
timeout: 15m
- name: Upload Trivy scan results to GitHub Security tab
uses: github/codeql-action/upload-sarif@v2
with:
@@ -813,13 +823,13 @@ jobs:
- id: set-matrix
run: |
if [ '${{ needs.setup.outputs.frontend_only }}' == 'true' ]; then
echo 'matrix=["cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT
echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT"
elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then
echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> $GITHUB_OUTPUT
echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> "$GITHUB_OUTPUT"
elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then
echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT
echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT"
else
echo 'matrix=[]' >> $GITHUB_OUTPUT
echo 'matrix=[]' >> "$GITHUB_OUTPUT"
fi
smoke_test:
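The JSON matrix emitted above is presumably consumed by the smoke_test job via fromJson; a sketch of that pattern (the needs path and matrix key are assumptions):

strategy:
  fail-fast: false
  matrix:
    batch: ${{ fromJson(needs.setup_matrix.outputs.matrix) }}   # hypothetical job/output names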

View File

@@ -53,9 +53,8 @@ RUN apt-get update && apt-get upgrade -y \
&& apt-get clean \
&& rm -rf /var/lib/{apt,dpkg,cache,log}/
COPY --from=dockerize-binary /usr/local/bin/dockerize /usr/local/bin
COPY --from=powerman/dockerize:0.19 /usr/local/bin/dockerize /usr/local/bin
COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt
COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh
RUN addgroup --gid 1000 datahub && \
@@ -67,7 +66,14 @@ ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
ENV VIRTUAL_ENV=/datahub-ingestion/.venv
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
RUN python3 -m venv $VIRTUAL_ENV && \
uv pip install --no-cache -r requirements.txt
uv pip install --no-cache --upgrade pip setuptools wheel
# Note: Normally uv will create hardlinks from the cache directory to the venv.
# In our docker files, we normally use `RUN --mount=type=cache,... uv pip install ...`,
# which means the cache directory is on a separate filesystem. uv will emit a warning:
# Failed to hardlink files; falling back to full copy. This may lead to degraded performance.
# If the cache and target directories are on different filesystems, hardlinking may not be supported.
# If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.
ENTRYPOINT [ "/entrypoint.sh" ]

View File

@@ -1,385 +0,0 @@
# Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate.
acryl-datahub-classify==0.0.11
acryl-PyHive==0.6.16
acryl-sqlglot==25.3.1.dev3
aenum==3.1.15
aiohappyeyeballs==2.3.2
aiohttp==3.10.0
aiosignal==1.3.1
alembic==1.13.2
altair==4.2.0
anyio==4.4.0
apache-airflow==2.9.3
apache-airflow-providers-common-io==1.3.2
apache-airflow-providers-common-sql==1.14.2
apache-airflow-providers-fab==1.2.2
apache-airflow-providers-ftp==3.10.0
apache-airflow-providers-http==4.12.0
apache-airflow-providers-imap==3.6.1
apache-airflow-providers-smtp==1.7.1
apache-airflow-providers-sqlite==3.8.1
apispec==6.6.1
appnope==0.1.4
argcomplete==3.4.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
asgiref==3.8.1
asn1crypto==1.5.1
asttokens==2.4.1
async-timeout==4.0.3
asynch==0.2.4
attrs==23.2.0
avro==1.11.3
avro-gen3==0.7.13
azure-common==1.1.28
azure-core==1.29.4
azure-identity==1.14.1
azure-storage-blob==12.21.0
azure-storage-file-datalake==12.16.0
Babel==2.15.0
backoff==2.2.1
beautifulsoup4==4.12.3
bleach==6.1.0
blinker==1.8.2
blis==0.7.11
boto3==1.34.151
botocore==1.34.151
bracex==2.4
cached-property==1.5.2
cachelib==0.9.0
cachetools==5.4.0
catalogue==2.0.10
cattrs==23.2.3
certifi==2024.7.4
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
ciso8601==2.3.1
click==8.1.7
click-default-group==1.2.4
click-spinner==0.1.10
clickclick==20.10.2
clickhouse-driver==0.2.8
clickhouse-sqlalchemy==0.2.4
cloudpathlib==0.18.1
cloudpickle==3.0.0
colorama==0.4.6
colorlog==4.8.0
comm==0.2.2
confection==0.1.5
ConfigUpdater==3.2
confluent-kafka==2.5.0
connexion==2.14.2
cron-descriptor==1.4.3
croniter==3.0.3
cryptography==42.0.8
cx_Oracle==8.3.0
cymem==2.0.8
databricks-dbapi==0.6.0
databricks-sdk==0.29.0
databricks-sql-connector==2.9.6
dataflows-tabulator==1.54.3
db-dtypes==1.2.0
debugpy==1.8.2
decorator==5.1.1
defusedxml==0.7.1
deltalake==0.17.4
Deprecated==1.2.14
dill==0.3.8
dnspython==2.6.1
docker==7.1.0
docutils==0.21.2
ecdsa==0.19.0
elasticsearch==7.13.4
email_validator==2.2.0
entrypoints==0.4
et-xmlfile==1.1.0
exceptiongroup==1.2.2
executing==2.0.1
expandvars==0.12.0
fastavro==1.9.5
fastjsonschema==2.20.0
filelock==3.15.4
Flask==2.2.5
flatdict==4.0.1
frozenlist==1.4.1
fsspec==2023.12.2
future==1.0.0
GeoAlchemy2==0.15.2
gitdb==4.0.11
GitPython==3.1.43
google-api-core==2.19.1
google-auth==2.32.0
google-cloud-appengine-logging==1.4.5
google-cloud-audit-log==0.2.5
google-cloud-bigquery==3.25.0
google-cloud-core==2.4.1
google-cloud-datacatalog==3.20.0
google-cloud-datacatalog-lineage==0.2.2
google-cloud-logging==3.5.0
google-crc32c==1.5.0
google-re2==1.1.20240702
google-resumable-media==2.7.1
googleapis-common-protos==1.63.2
gql==3.5.0
graphql-core==3.2.3
great-expectations==0.15.50
greenlet==3.0.3
grpc-google-iam-v1==0.13.1
grpcio==1.65.2
grpcio-status==1.62.2
grpcio-tools==1.62.2
gssapi==1.8.3
gunicorn==22.0.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
humanfriendly==10.0
idna==3.7
ijson==3.3.0
importlib_metadata==7.2.1
importlib_resources==6.4.0
inflection==0.5.1
ipaddress==1.0.23
ipykernel==6.17.1
ipython==8.21.0
ipython-genutils==0.2.0
ipywidgets==8.1.3
isodate==0.6.1
itsdangerous==2.2.0
jedi==0.19.1
Jinja2==3.1.4
jmespath==1.0.1
JPype1==1.5.0
jsonlines==4.0.0
jsonpatch==1.33
jsonpointer==3.0.0
jsonref==1.1.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter-server==1.16.0
jupyter_client==7.4.9
jupyter_core==4.12.0
jupyterlab_pygments==0.3.0
jupyterlab_widgets==3.0.11
langcodes==3.4.0
language_data==1.2.0
lark==1.1.4
lazy-object-proxy==1.10.0
leb128==1.0.8
limits==3.13.0
linear-tsv==1.1.0
linkify-it-py==2.0.3
lkml==1.3.5
lockfile==0.12.2
looker-sdk==23.0.0
lxml==5.2.2
lz4==4.3.3
makefun==1.15.4
Mako==1.3.5
marisa-trie==1.2.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.21.3
marshmallow-oneofschema==3.1.1
marshmallow-sqlalchemy==0.28.2
matplotlib-inline==0.1.7
mdit-py-plugins==0.4.1
mdurl==0.1.2
methodtools==0.4.7
mistune==3.0.2
mixpanel==4.10.1
mlflow-skinny==2.15.0
mmhash3==3.0.1
more-itertools==10.3.0
moto==4.2.14
msal==1.22.0
msal-extensions==1.1.0
multidict==6.0.5
murmurhash==1.0.10
mypy-extensions==1.0.0
nbclassic==1.1.0
nbclient==0.6.3
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==3.3
notebook==6.5.7
notebook_shim==0.2.4
numpy==1.26.4
oauthlib==3.2.2
okta==1.7.0
openlineage-airflow==1.18.0
openlineage-integration-common==1.18.0
openlineage-python==1.18.0
openlineage_sql==1.18.0
openpyxl==3.1.5
opentelemetry-api==1.26.0
opentelemetry-exporter-otlp==1.26.0
opentelemetry-exporter-otlp-proto-common==1.26.0
opentelemetry-exporter-otlp-proto-grpc==1.26.0
opentelemetry-exporter-otlp-proto-http==1.26.0
opentelemetry-proto==1.26.0
opentelemetry-sdk==1.26.0
opentelemetry-semantic-conventions==0.47b0
ordered-set==4.1.0
packaging==24.1
pandas==2.1.4
pandocfilters==1.5.1
parse==1.20.2
parso==0.8.4
pathspec==0.12.1
pendulum==3.0.0
pexpect==4.9.0
phonenumbers==8.13.0
platformdirs==4.2.2
pluggy==1.5.0
portalocker==2.10.1
preshed==3.0.9
prison==0.2.1
progressbar2==4.4.2
prometheus_client==0.20.0
prompt_toolkit==3.0.47
proto-plus==1.24.0
protobuf==4.25.4
psutil==6.0.0
psycopg2-binary==2.9.9
ptyprocess==0.7.0
pure-sasl==0.6.2
pure_eval==0.2.3
py-partiql-parser==0.5.0
pyarrow==17.0.0
pyarrow-hotfix==0.6
pyasn1==0.6.0
pyasn1_modules==0.4.0
pyathena==2.25.2
pycountry==24.6.1
pycparser==2.22
pycryptodome==3.20.0
pydantic==1.10.17
pydash==8.0.3
pydruid==0.6.9
Pygments==2.18.0
pyiceberg==0.4.0
pymongo==4.8.0
PyMySQL==1.1.1
pyOpenSSL==24.2.1
pyparsing==3.0.9
pyspnego==0.11.1
python-daemon==3.0.1
python-dateutil==2.9.0.post0
python-jose==3.3.0
python-ldap==3.4.4
python-liquid==1.12.1
python-nvd3==0.16.0
python-slugify==8.0.4
python-stdnum==1.20
python-tds==1.15.0
python-utils==3.8.2
pytz==2024.1
PyYAML==6.0.1
pyzmq==26.0.3
redash-toolbelt==0.1.9
redshift-connector==2.1.2
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
requests-file==2.1.0
requests-gssapi==1.3.0
requests-toolbelt==1.0.0
requests_ntlm==1.3.0
responses==0.25.3
rfc3339-validator==0.1.4
rfc3986==2.0.0
rich==13.7.1
rich-argparse==1.5.2
rpds-py==0.19.1
rsa==4.9
rstr==3.2.2
ruamel.yaml==0.17.17
s3transfer==0.10.2
schwifty==2024.6.1
scipy==1.14.0
scramp==1.4.5
Send2Trash==1.8.3
sentry-sdk==2.12.0
setproctitle==1.3.3
shellingham==1.5.4
simple-salesforce==1.12.6
six==1.16.0
slack-sdk==3.18.1
smart-open==7.0.4
smmap==5.0.1
sniffio==1.3.1
snowflake-connector-python==3.12.0
snowflake-sqlalchemy==1.6.1
sortedcontainers==2.4.0
soupsieve==2.5
spacy==3.7.5
spacy-legacy==3.0.12
spacy-loggers==1.0.5
sql_metadata==2.12.0
SQLAlchemy==1.4.44
sqlalchemy-bigquery==1.11.0
sqlalchemy-cockroachdb==1.4.4
SQLAlchemy-JSONField==1.0.2
sqlalchemy-pytds==0.3.5
sqlalchemy-redshift==0.8.14
SQLAlchemy-Utils==0.41.2
sqlglotrs==0.2.7
sqllineage==1.3.8
sqlparse==0.4.4
srsly==2.4.8
stack-data==0.6.3
strictyaml==1.7.3
tableauserverclient==0.25
tableschema==1.20.11
tabulate==0.9.0
tenacity==9.0.0
teradatasql==20.0.0.14
teradatasqlalchemy==20.0.0.1
termcolor==2.4.0
terminado==0.18.1
text-unidecode==1.3
thinc==8.2.5
thrift==0.16.0
thrift-sasl==0.4.3
time-machine==2.14.2
tinycss2==1.3.0
toml==0.10.2
tomlkit==0.13.0
toolz==0.12.1
tornado==6.4.1
tqdm==4.66.4
traitlets==5.2.1.post0
trino==0.329.0
typer==0.12.3
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2024.1
tzlocal==5.2
uc-micro-py==1.0.3
ujson==5.10.0
unicodecsv==0.14.1
universal_pathlib==0.2.2
urllib3==1.26.19
vertica-python==1.4.0
vertica-sqlalchemy-dialect==0.0.8.2
vininfo==1.8.0
wasabi==1.1.3
wcmatch==8.5.2
wcwidth==0.2.13
weasel==0.4.1
webencodings==0.5.1
websocket-client==1.8.0
Werkzeug==2.2.3
widgetsnbextension==4.0.11
wirerope==0.4.7
wrapt==1.16.0
WTForms==3.1.2
xlrd==2.0.1
xmltodict==0.13.0
yarl==1.9.4
zeep==4.2.1
zipp==3.19.2
zstd==1.5.5.1

View File

@@ -12,7 +12,7 @@ ext {
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
revision = 6 // increment to trigger rebuild
revision = 7 // increment to trigger rebuild
}
docker {

View File

@@ -1,36 +0,0 @@
#!/bin/bash
# This script is used to regenerate the base-requirements.txt file
set -euxo pipefail
cd "$( dirname "${BASH_SOURCE[0]}" )"
SCRIPT_NAME=$(basename "$0")
DATAHUB_DIR=$(pwd)/../..
# Create a virtualenv.
VENV_DIR=$(mktemp -d)
python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9 or higher is required.'"
python -m venv $VENV_DIR
source $VENV_DIR/bin/activate
pip install --upgrade pip uv setuptools wheel
echo "Using virtualenv at $VENV_DIR"
# Install stuff.
pushd $DATAHUB_DIR/metadata-ingestion
uv pip install -e '.[all]' -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]'
popd
# Generate the requirements file.
# Removing Flask deps as per https://github.com/datahub-project/datahub/pull/6867/files
# Removing py4j and PyJWT due to https://github.com/datahub-project/datahub/pull/6868/files
# Removing pyspark and pydeequ because we don't want them in the slim image, so they can be added separately.
# TODO: It's unclear if these removals are still actually needed.
echo "# Generated requirements file. Run ./$SCRIPT_NAME to regenerate." > base-requirements.txt
pip freeze \
| grep -v -E "^-e" \
| grep -v -E "^uv==" \
| grep -v "Flask-" \
| grep -v -E "(py4j|PyJWT)==" \
| grep -v -E "(pyspark|pydeequ)==" \
>> base-requirements.txt

View File

@@ -21,11 +21,11 @@ ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}
COPY --chown=datahub ./metadata-ingestion /datahub-ingestion
COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin
COPY --chown=datahub ./metadata-ingestion /metadata-ingestion
COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /metadata-ingestion/airflow-plugin
ARG RELEASE_VERSION
WORKDIR /datahub-ingestion
WORKDIR /metadata-ingestion
RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \
sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/__init__.py && \
cat src/datahub/__init__.py | grep __version__ && \
@@ -33,7 +33,8 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS
FROM base AS slim-install
RUN uv pip install --no-cache -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]"
RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]"
FROM base AS full-install-build
@@ -43,9 +44,10 @@ RUN apt-get update && apt-get install -y -qq maven
USER datahub
COPY ./docker/datahub-ingestion/pyspark_jars.sh .
RUN uv pip install --no-cache -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \
RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
UV_LINK_MODE=copy uv pip install -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \
./pyspark_jars.sh && \
datahub --version
RUN ./pyspark_jars.sh
FROM base AS full-install
@@ -57,4 +59,6 @@ FROM base AS dev-install
FROM ${APP_ENV}-install AS final
WORKDIR /datahub-ingestion
USER datahub

View File

@@ -11,18 +11,21 @@ ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}
COPY --chown=datahub ./metadata-ingestion /datahub-ingestion
COPY --chown=datahub ./metadata-ingestion /metadata-ingestion
ARG RELEASE_VERSION
WORKDIR /datahub-ingestion
WORKDIR /metadata-ingestion
RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \
cat src/datahub/__init__.py
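The sed call stamps the real release version over the dev placeholder, turning its first hyphen into a plus so the result reads as a PEP 440-style local version; for example, with an illustrative value:

RELEASE_VERSION="0.14.0-rc2"          # illustrative
echo $RELEASE_VERSION | sed s/-/+/    # prints 0.14.0+rc2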
FROM base as slim-install
RUN uv pip install --no-cache -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \
RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \
datahub --version
FROM slim-install as final
WORKDIR /datahub-ingestion
USER datahub

View File

@@ -12,7 +12,7 @@ ext {
docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"
revision = 6 // increment to trigger rebuild
revision = 8 // increment to trigger rebuild
}
dependencies {