Mirror of https://github.com/datahub-project/datahub.git, synced 2025-12-27 09:58:14 +00:00

feat(build): remove base-requirements.txt (#11238)

Co-authored-by: David Leifker <david.leifker@acryl.io>

commit c4bc34f9ff (parent 9d84872c47)
.github/actions/ci-optimization/action.yml (vendored, 3 lines changed)
@@ -57,7 +57,8 @@ runs:
       - "metadata-ingestion-modules/**"
       - "metadata-ingestion/**"
       - "metadata-models/**"
-      - "docker/datahub-ingestion**"
+      - "docker/datahub-ingestion-base/**"
+      - "docker/datahub-ingestion/**"
     ingestion-base:
       - "docker/datahub-ingestion-base/**"
     docker:

@@ -31,16 +31,21 @@ inputs:
     description: "Main tag to use for the Docker image"
     required: true
   flavor:
-    description: 'Image flavor (e.g., slim, full)'
+    description: "Image flavor (e.g., slim, full)"
     required: false
   target:
     description: "Sets the target stage to build"
     required: false
+  depot-project:
+    # Setting this will use native arm64 docker builds instead of QEMU emulation.
+    # This speeds up builds by 2-3x.
+    description: "Depot project id"
+    required: false

 outputs:
   image_tag:
     description: "Docker image tags"
     value: ${{ steps.docker_meta.outputs.tags }}
     # image_name: ${{ env.DATAHUB_GMS_IMAGE }}

 runs:
   using: "composite"
@@ -58,9 +63,22 @@ runs:
           type=raw,value=head,suffix=${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }},enable={{is_default_branch}}
           type=sha,prefix=,format=short,suffix=${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}

+    - name: Single Tag
+      id: single_tag
+      shell: bash
+      run: |
+        IMAGES="""
+        ${{ inputs.images }}
+        """
+        TAGS="""
+        ${{ inputs.image_tag }}
+        """
+        echo "SINGLE_IMAGE=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> "$GITHUB_OUTPUT"
+        echo "SINGLE_TAG=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }'):$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> "$GITHUB_OUTPUT"
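Note: the new Single Tag step just picks the first entry out of the (possibly multi-line, comma-separated) image and tag lists. A minimal sketch of the same extraction, with hypothetical input values:

    IMAGES="acryldata/datahub-gms,acryldata/datahub-gms-debug"   # hypothetical input list
    TAGS="head,v0.14.0"                                          # hypothetical input list
    SINGLE_IMAGE=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }')            # -> acryldata/datahub-gms
    echo "$SINGLE_IMAGE:$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')"      # -> acryldata/datahub-gms:head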
    # Code for testing the build when not pushing to Docker Hub.
    - name: Build and Load image for testing (if not publishing)
-     uses: docker/build-push-action@v5
+     uses: docker/build-push-action@v6
      if: ${{ inputs.publish != 'true' }}
      with:
        context: ${{ inputs.context }}
@@ -73,20 +91,11 @@ runs:
        target: ${{ inputs.target }}
        load: true
        push: false
-       cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }}
-       cache-to: type=inline
-   - name: Single Tag
-     if: ${{ inputs.publish != 'true' }}
-     shell: bash
-     run: |
-       IMAGES="""
-       ${{ inputs.images }}
-       """
-       TAGS="""
-       ${{ inputs.image_tag }}
-       """
-       echo "SINGLE_TAG=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }'):$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> $GITHUB_OUTPUT
-     id: single_tag
+       cache-from: |
+         type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}
+         type=registry,ref=${{ steps.docker_meta.outputs.tags }}
+       cache-to: |
+         type=inline
    - name: Upload image locally for testing (if not publishing)
      uses: ishworkh/docker-image-artifact-upload@v1
      if: ${{ inputs.publish != 'true' }}
@@ -96,19 +105,42 @@ runs:
    # Code for building multi-platform images and pushing to Docker Hub.
    - name: Set up QEMU
      uses: docker/setup-qemu-action@v3
-     if: ${{ inputs.publish == 'true' }}
+     if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }}
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
-     if: ${{ inputs.publish == 'true' }}
+     if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }}
+   - name: Setup Depot CLI
+     uses: depot/setup-action@v1
+     if: ${{ inputs.publish == 'true' && inputs.depot-project != '' }}
    - name: Login to DockerHub
      uses: docker/login-action@v3
      if: ${{ inputs.publish == 'true' }}
      with:
        username: ${{ inputs.username }}
        password: ${{ inputs.password }}

+   # Depot variant.
    - name: Build and Push Multi-Platform image
-     uses: docker/build-push-action@v5
-     if: ${{ inputs.publish == 'true' }}
+     uses: depot/build-push-action@v1
+     if: ${{ inputs.publish == 'true' && inputs.depot-project != '' }}
      with:
+       project: ${{ inputs.depot-project }}
        context: ${{ inputs.context }}
        file: ${{ inputs.file }}
        platforms: ${{ inputs.platforms }}
        build-args: ${{ inputs.build-args }}
        tags: ${{ steps.docker_meta.outputs.tags }}
        target: ${{ inputs.target }}
        push: true
+       cache-from: |
+         type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}
+         type=registry,ref=${{ steps.docker_meta.outputs.tags }}
+       cache-to: |
+         type=inline

+   - name: Build and Push Multi-Platform image
+     uses: docker/build-push-action@v6
+     if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }}
+     with:
+       context: ${{ inputs.context }}
+       file: ${{ inputs.file }}
@@ -117,7 +149,10 @@ runs:
        tags: ${{ steps.docker_meta.outputs.tags }}
        target: ${{ inputs.target }}
        push: true
-       cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }}
-       cache-to: type=inline
+       cache-from: |
+         type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }}
+         type=registry,ref=${{ steps.docker_meta.outputs.tags }}
+       cache-to: |
+         type=inline

    # TODO add code for vuln scanning?
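Note: after this change the action has two mutually exclusive publish paths, selected by whether the depot-project input is set. A rough bash sketch of the gating logic (variable names here are illustrative, not part of the action):

    publish=true
    depot_project="abc123"   # hypothetical Depot project id; empty string means "not configured"
    if [ "$publish" = "true" ] && [ -n "$depot_project" ]; then
      echo "publish via depot/build-push-action (native arm64 builders)"
    elif [ "$publish" = "true" ]; then
      echo "publish via docker/build-push-action@v6 (QEMU emulation for arm64)"
    fi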
.github/workflows/docker-unified.yml (vendored, 54 lines changed)
@@ -33,6 +33,10 @@ env:
  DATAHUB_INGESTION_BASE_IMAGE: "acryldata/datahub-ingestion-base"
  DATAHUB_INGESTION_IMAGE: "acryldata/datahub-ingestion"

+permissions:
+  contents: read
+  id-token: write
+
 jobs:
   setup:
     runs-on: ubuntu-latest
@@ -68,23 +72,23 @@ jobs:
      id: tag
      run: |
        source .github/scripts/docker_helpers.sh
-       echo "short_sha=${SHORT_SHA}" >> $GITHUB_OUTPUT
-       echo "tag=$(get_tag)" >> $GITHUB_OUTPUT
-       echo "slim_tag=$(get_tag_slim)" >> $GITHUB_OUTPUT
-       echo "full_tag=$(get_tag_full)" >> $GITHUB_OUTPUT
-       echo "unique_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT
-       echo "unique_slim_tag=$(get_unique_tag_slim)" >> $GITHUB_OUTPUT
-       echo "unique_full_tag=$(get_unique_tag_full)" >> $GITHUB_OUTPUT
-       echo "python_release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT
-       echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT
-       echo "repository_name=${GITHUB_REPOSITORY#*/}" >> $GITHUB_OUTPUT
+       echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
+       echo "tag=$(get_tag)" >> "$GITHUB_OUTPUT"
+       echo "slim_tag=$(get_tag_slim)" >> "$GITHUB_OUTPUT"
+       echo "full_tag=$(get_tag_full)" >> "$GITHUB_OUTPUT"
+       echo "unique_tag=$(get_unique_tag)" >> "$GITHUB_OUTPUT"
+       echo "unique_slim_tag=$(get_unique_tag_slim)" >> "$GITHUB_OUTPUT"
+       echo "unique_full_tag=$(get_unique_tag_full)" >> "$GITHUB_OUTPUT"
+       echo "python_release_version=$(get_python_docker_release_v)" >> "$GITHUB_OUTPUT"
+       echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> "$GITHUB_OUTPUT"
+       echo "repository_name=${GITHUB_REPOSITORY#*/}" >> "$GITHUB_OUTPUT"
    - name: Check whether docker login is possible
      id: docker-login
      env:
        ENABLE_DOCKER_LOGIN: ${{ secrets.ACRYL_DOCKER_PASSWORD != '' }}
      run: |
        echo "Enable Docker Login: ${{ env.ENABLE_DOCKER_LOGIN }}"
-       echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> $GITHUB_OUTPUT
+       echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> "$GITHUB_OUTPUT"
    - name: Check whether publishing enabled
      id: publish
      env:
@@ -95,7 +99,7 @@ jobs:
        }}
      run: |
        echo "Enable publish: ${{ env.ENABLE_PUBLISH }}"
-       echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT
+       echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
    - name: Check whether PR publishing enabled
      id: pr-publish
      env:
@@ -106,7 +110,7 @@ jobs:
        }}
      run: |
        echo "Enable PR publish: ${{ env.ENABLE_PUBLISH }}"
-       echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT
+       echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT"
    - uses: ./.github/actions/ci-optimization
      id: ci-optimize
    - uses: actions/setup-python@v5
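Note: the repeated $GITHUB_OUTPUT -> "$GITHUB_OUTPUT" changes throughout this workflow are the standard shell-quoting fix: the runner provides GITHUB_OUTPUT as a file path, and an unquoted expansion breaks if that path ever contains whitespace. A small illustration with a hypothetical path:

    GITHUB_OUTPUT='/tmp/run 1/output'        # hypothetical path containing a space
    echo "tag=v1" >> $GITHUB_OUTPUT          # bash: ambiguous redirect
    echo "tag=v1" >> "$GITHUB_OUTPUT"        # appends to the intended file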
@@ -543,9 +547,10 @@ jobs:
          context: .
          file: ./docker/datahub-ingestion-base/Dockerfile
          platforms: linux/amd64,linux/arm64/v8
+         depot-project: ${{ vars.DEPOT_PROJECT_ID }}
      - name: Compute DataHub Ingestion (Base) Tag
        id: tag
-       run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> $GITHUB_OUTPUT
+       run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> "$GITHUB_OUTPUT"
  datahub_ingestion_base_slim_build:
    name: Build and Push DataHub Ingestion (Base-Slim) Docker Image
    runs-on: ubuntu-latest
@@ -585,9 +590,10 @@ jobs:
          context: .
          file: ./docker/datahub-ingestion-base/Dockerfile
          platforms: linux/amd64,linux/arm64/v8
+         depot-project: ${{ vars.DEPOT_PROJECT_ID }}
      - name: Compute DataHub Ingestion (Base-Slim) Tag
        id: tag
-       run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT
+       run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> "$GITHUB_OUTPUT"
  datahub_ingestion_base_full_build:
    name: Build and Push DataHub Ingestion (Base-Full) Docker Image
    runs-on: ubuntu-latest
@@ -628,7 +634,7 @@ jobs:
          platforms: linux/amd64,linux/arm64/v8
      - name: Compute DataHub Ingestion (Base-Full) Tag
        id: tag
-       run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT
+       run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> "$GITHUB_OUTPUT"

  datahub_ingestion_slim_build:
    name: Build and Push DataHub Ingestion Docker Images
@@ -681,9 +687,10 @@ jobs:
          context: .
          file: ./docker/datahub-ingestion/Dockerfile
          platforms: linux/amd64,linux/arm64/v8
+         depot-project: ${{ vars.DEPOT_PROJECT_ID }}
      - name: Compute Tag
        id: tag
-       run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT
+       run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> "$GITHUB_OUTPUT"
  datahub_ingestion_slim_scan:
    permissions:
      contents: read # for actions/checkout to fetch code
@@ -713,6 +720,7 @@ jobs:
          severity: "CRITICAL,HIGH"
          ignore-unfixed: true
          vuln-type: "os,library"
+         timeout: 15m
      - name: Upload Trivy scan results to GitHub Security tab
        uses: github/codeql-action/upload-sarif@v2
        with:
@@ -767,9 +775,10 @@ jobs:
          context: .
          file: ./docker/datahub-ingestion/Dockerfile
          platforms: linux/amd64,linux/arm64/v8
+         depot-project: ${{ vars.DEPOT_PROJECT_ID }}
      - name: Compute Tag (Full)
        id: tag
-       run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> $GITHUB_OUTPUT
+       run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> "$GITHUB_OUTPUT"
  datahub_ingestion_full_scan:
    permissions:
      contents: read # for actions/checkout to fetch code
@@ -799,6 +808,7 @@ jobs:
          severity: "CRITICAL,HIGH"
          ignore-unfixed: true
          vuln-type: "os,library"
+         timeout: 15m
      - name: Upload Trivy scan results to GitHub Security tab
        uses: github/codeql-action/upload-sarif@v2
        with:
@@ -813,13 +823,13 @@ jobs:
      - id: set-matrix
        run: |
          if [ '${{ needs.setup.outputs.frontend_only }}' == 'true' ]; then
-           echo 'matrix=["cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT
+           echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT"
          elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then
-           echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> $GITHUB_OUTPUT
+           echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> "$GITHUB_OUTPUT"
          elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then
-           echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT
+           echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT"
          else
-           echo 'matrix=[]' >> $GITHUB_OUTPUT
+           echo 'matrix=[]' >> "$GITHUB_OUTPUT"
          fi

  smoke_test:
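Note: the set-matrix step emits a JSON array that downstream jobs consume (typically via fromJson in a matrix strategy). A quick way to sanity-check the emitted value locally, assuming jq is available:

    GITHUB_OUTPUT=$(mktemp)                  # stand-in for the runner-provided file
    echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT"
    grep '^matrix=' "$GITHUB_OUTPUT" | cut -d= -f2- | jq -e 'type == "array"'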
@@ -53,9 +53,8 @@ RUN apt-get update && apt-get upgrade -y \
    && apt-get clean \
    && rm -rf /var/lib/{apt,dpkg,cache,log}/

-COPY --from=dockerize-binary /usr/local/bin/dockerize /usr/local/bin
+COPY --from=powerman/dockerize:0.19 /usr/local/bin/dockerize /usr/local/bin

-COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt
 COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh

 RUN addgroup --gid 1000 datahub && \
@@ -67,7 +66,14 @@ ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
 ENV VIRTUAL_ENV=/datahub-ingestion/.venv
 ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
 RUN python3 -m venv $VIRTUAL_ENV && \
-    uv pip install --no-cache -r requirements.txt
+    uv pip install --no-cache --upgrade pip setuptools wheel
+
+# Note: Normally uv will create hardlinks from the cache directory to the venv.
+# In our docker files, we normally use `RUN --mount=type=cache,... uv pip install ...`,
+# which means the cache directory is on a separate filesystem. uv will emit a warning:
+#   Failed to hardlink files; falling back to full copy. This may lead to degraded performance.
+#   If the cache and target directories are on different filesystems, hardlinking may not be supported.
+#   If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.

 ENTRYPOINT [ "/entrypoint.sh" ]
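Note: the comment block added above describes the cache-mount install pattern this PR uses in the ingestion Dockerfiles. A shell-level sketch of the equivalent invocation (paths follow the Dockerfile above; the package list is illustrative):

    export VIRTUAL_ENV=/datahub-ingestion/.venv   # venv created earlier in the Dockerfile
    export UV_LINK_MODE=copy                      # cache mount is a separate filesystem, so hardlinking would fail
    uv pip install --cache-dir /datahub-ingestion/.cache/uv --upgrade pip setuptools wheel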
@@ -1,385 +0,0 @@
-# Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate.
-acryl-datahub-classify==0.0.11
-acryl-PyHive==0.6.16
-acryl-sqlglot==25.3.1.dev3
-aenum==3.1.15
-aiohappyeyeballs==2.3.2
-aiohttp==3.10.0
-aiosignal==1.3.1
-alembic==1.13.2
-altair==4.2.0
-anyio==4.4.0
-apache-airflow==2.9.3
-apache-airflow-providers-common-io==1.3.2
-apache-airflow-providers-common-sql==1.14.2
-apache-airflow-providers-fab==1.2.2
-apache-airflow-providers-ftp==3.10.0
-apache-airflow-providers-http==4.12.0
-apache-airflow-providers-imap==3.6.1
-apache-airflow-providers-smtp==1.7.1
-apache-airflow-providers-sqlite==3.8.1
-apispec==6.6.1
-appnope==0.1.4
-argcomplete==3.4.0
-argon2-cffi==23.1.0
-argon2-cffi-bindings==21.2.0
-asgiref==3.8.1
-asn1crypto==1.5.1
-asttokens==2.4.1
-async-timeout==4.0.3
-asynch==0.2.4
-attrs==23.2.0
-avro==1.11.3
-avro-gen3==0.7.13
-azure-common==1.1.28
-azure-core==1.29.4
-azure-identity==1.14.1
-azure-storage-blob==12.21.0
-azure-storage-file-datalake==12.16.0
-Babel==2.15.0
-backoff==2.2.1
-beautifulsoup4==4.12.3
-bleach==6.1.0
-blinker==1.8.2
-blis==0.7.11
-boto3==1.34.151
-botocore==1.34.151
-bracex==2.4
-cached-property==1.5.2
-cachelib==0.9.0
-cachetools==5.4.0
-catalogue==2.0.10
-cattrs==23.2.3
-certifi==2024.7.4
-cffi==1.16.0
-chardet==5.2.0
-charset-normalizer==3.3.2
-ciso8601==2.3.1
-click==8.1.7
-click-default-group==1.2.4
-click-spinner==0.1.10
-clickclick==20.10.2
-clickhouse-driver==0.2.8
-clickhouse-sqlalchemy==0.2.4
-cloudpathlib==0.18.1
-cloudpickle==3.0.0
-colorama==0.4.6
-colorlog==4.8.0
-comm==0.2.2
-confection==0.1.5
-ConfigUpdater==3.2
-confluent-kafka==2.5.0
-connexion==2.14.2
-cron-descriptor==1.4.3
-croniter==3.0.3
-cryptography==42.0.8
-cx_Oracle==8.3.0
-cymem==2.0.8
-databricks-dbapi==0.6.0
-databricks-sdk==0.29.0
-databricks-sql-connector==2.9.6
-dataflows-tabulator==1.54.3
-db-dtypes==1.2.0
-debugpy==1.8.2
-decorator==5.1.1
-defusedxml==0.7.1
-deltalake==0.17.4
-Deprecated==1.2.14
-dill==0.3.8
-dnspython==2.6.1
-docker==7.1.0
-docutils==0.21.2
-ecdsa==0.19.0
-elasticsearch==7.13.4
-email_validator==2.2.0
-entrypoints==0.4
-et-xmlfile==1.1.0
-exceptiongroup==1.2.2
-executing==2.0.1
-expandvars==0.12.0
-fastavro==1.9.5
-fastjsonschema==2.20.0
-filelock==3.15.4
-Flask==2.2.5
-flatdict==4.0.1
-frozenlist==1.4.1
-fsspec==2023.12.2
-future==1.0.0
-GeoAlchemy2==0.15.2
-gitdb==4.0.11
-GitPython==3.1.43
-google-api-core==2.19.1
-google-auth==2.32.0
-google-cloud-appengine-logging==1.4.5
-google-cloud-audit-log==0.2.5
-google-cloud-bigquery==3.25.0
-google-cloud-core==2.4.1
-google-cloud-datacatalog==3.20.0
-google-cloud-datacatalog-lineage==0.2.2
-google-cloud-logging==3.5.0
-google-crc32c==1.5.0
-google-re2==1.1.20240702
-google-resumable-media==2.7.1
-googleapis-common-protos==1.63.2
-gql==3.5.0
-graphql-core==3.2.3
-great-expectations==0.15.50
-greenlet==3.0.3
-grpc-google-iam-v1==0.13.1
-grpcio==1.65.2
-grpcio-status==1.62.2
-grpcio-tools==1.62.2
-gssapi==1.8.3
-gunicorn==22.0.0
-h11==0.14.0
-httpcore==1.0.5
-httpx==0.27.0
-humanfriendly==10.0
-idna==3.7
-ijson==3.3.0
-importlib_metadata==7.2.1
-importlib_resources==6.4.0
-inflection==0.5.1
-ipaddress==1.0.23
-ipykernel==6.17.1
-ipython==8.21.0
-ipython-genutils==0.2.0
-ipywidgets==8.1.3
-isodate==0.6.1
-itsdangerous==2.2.0
-jedi==0.19.1
-Jinja2==3.1.4
-jmespath==1.0.1
-JPype1==1.5.0
-jsonlines==4.0.0
-jsonpatch==1.33
-jsonpointer==3.0.0
-jsonref==1.1.0
-jsonschema==4.23.0
-jsonschema-specifications==2023.12.1
-jupyter-server==1.16.0
-jupyter_client==7.4.9
-jupyter_core==4.12.0
-jupyterlab_pygments==0.3.0
-jupyterlab_widgets==3.0.11
-langcodes==3.4.0
-language_data==1.2.0
-lark==1.1.4
-lazy-object-proxy==1.10.0
-leb128==1.0.8
-limits==3.13.0
-linear-tsv==1.1.0
-linkify-it-py==2.0.3
-lkml==1.3.5
-lockfile==0.12.2
-looker-sdk==23.0.0
-lxml==5.2.2
-lz4==4.3.3
-makefun==1.15.4
-Mako==1.3.5
-marisa-trie==1.2.0
-markdown-it-py==3.0.0
-MarkupSafe==2.1.5
-marshmallow==3.21.3
-marshmallow-oneofschema==3.1.1
-marshmallow-sqlalchemy==0.28.2
-matplotlib-inline==0.1.7
-mdit-py-plugins==0.4.1
-mdurl==0.1.2
-methodtools==0.4.7
-mistune==3.0.2
-mixpanel==4.10.1
-mlflow-skinny==2.15.0
-mmhash3==3.0.1
-more-itertools==10.3.0
-moto==4.2.14
-msal==1.22.0
-msal-extensions==1.1.0
-multidict==6.0.5
-murmurhash==1.0.10
-mypy-extensions==1.0.0
-nbclassic==1.1.0
-nbclient==0.6.3
-nbconvert==7.16.4
-nbformat==5.10.4
-nest-asyncio==1.6.0
-networkx==3.3
-notebook==6.5.7
-notebook_shim==0.2.4
-numpy==1.26.4
-oauthlib==3.2.2
-okta==1.7.0
-openlineage-airflow==1.18.0
-openlineage-integration-common==1.18.0
-openlineage-python==1.18.0
-openlineage_sql==1.18.0
-openpyxl==3.1.5
-opentelemetry-api==1.26.0
-opentelemetry-exporter-otlp==1.26.0
-opentelemetry-exporter-otlp-proto-common==1.26.0
-opentelemetry-exporter-otlp-proto-grpc==1.26.0
-opentelemetry-exporter-otlp-proto-http==1.26.0
-opentelemetry-proto==1.26.0
-opentelemetry-sdk==1.26.0
-opentelemetry-semantic-conventions==0.47b0
-ordered-set==4.1.0
-packaging==24.1
-pandas==2.1.4
-pandocfilters==1.5.1
-parse==1.20.2
-parso==0.8.4
-pathspec==0.12.1
-pendulum==3.0.0
-pexpect==4.9.0
-phonenumbers==8.13.0
-platformdirs==4.2.2
-pluggy==1.5.0
-portalocker==2.10.1
-preshed==3.0.9
-prison==0.2.1
-progressbar2==4.4.2
-prometheus_client==0.20.0
-prompt_toolkit==3.0.47
-proto-plus==1.24.0
-protobuf==4.25.4
-psutil==6.0.0
-psycopg2-binary==2.9.9
-ptyprocess==0.7.0
-pure-sasl==0.6.2
-pure_eval==0.2.3
-py-partiql-parser==0.5.0
-pyarrow==17.0.0
-pyarrow-hotfix==0.6
-pyasn1==0.6.0
-pyasn1_modules==0.4.0
-pyathena==2.25.2
-pycountry==24.6.1
-pycparser==2.22
-pycryptodome==3.20.0
-pydantic==1.10.17
-pydash==8.0.3
-pydruid==0.6.9
-Pygments==2.18.0
-pyiceberg==0.4.0
-pymongo==4.8.0
-PyMySQL==1.1.1
-pyOpenSSL==24.2.1
-pyparsing==3.0.9
-pyspnego==0.11.1
-python-daemon==3.0.1
-python-dateutil==2.9.0.post0
-python-jose==3.3.0
-python-ldap==3.4.4
-python-liquid==1.12.1
-python-nvd3==0.16.0
-python-slugify==8.0.4
-python-stdnum==1.20
-python-tds==1.15.0
-python-utils==3.8.2
-pytz==2024.1
-PyYAML==6.0.1
-pyzmq==26.0.3
-redash-toolbelt==0.1.9
-redshift-connector==2.1.2
-referencing==0.35.1
-regex==2024.7.24
-requests==2.32.3
-requests-file==2.1.0
-requests-gssapi==1.3.0
-requests-toolbelt==1.0.0
-requests_ntlm==1.3.0
-responses==0.25.3
-rfc3339-validator==0.1.4
-rfc3986==2.0.0
-rich==13.7.1
-rich-argparse==1.5.2
-rpds-py==0.19.1
-rsa==4.9
-rstr==3.2.2
-ruamel.yaml==0.17.17
-s3transfer==0.10.2
-schwifty==2024.6.1
-scipy==1.14.0
-scramp==1.4.5
-Send2Trash==1.8.3
-sentry-sdk==2.12.0
-setproctitle==1.3.3
-shellingham==1.5.4
-simple-salesforce==1.12.6
-six==1.16.0
-slack-sdk==3.18.1
-smart-open==7.0.4
-smmap==5.0.1
-sniffio==1.3.1
-snowflake-connector-python==3.12.0
-snowflake-sqlalchemy==1.6.1
-sortedcontainers==2.4.0
-soupsieve==2.5
-spacy==3.7.5
-spacy-legacy==3.0.12
-spacy-loggers==1.0.5
-sql_metadata==2.12.0
-SQLAlchemy==1.4.44
-sqlalchemy-bigquery==1.11.0
-sqlalchemy-cockroachdb==1.4.4
-SQLAlchemy-JSONField==1.0.2
-sqlalchemy-pytds==0.3.5
-sqlalchemy-redshift==0.8.14
-SQLAlchemy-Utils==0.41.2
-sqlglotrs==0.2.7
-sqllineage==1.3.8
-sqlparse==0.4.4
-srsly==2.4.8
-stack-data==0.6.3
-strictyaml==1.7.3
-tableauserverclient==0.25
-tableschema==1.20.11
-tabulate==0.9.0
-tenacity==9.0.0
-teradatasql==20.0.0.14
-teradatasqlalchemy==20.0.0.1
-termcolor==2.4.0
-terminado==0.18.1
-text-unidecode==1.3
-thinc==8.2.5
-thrift==0.16.0
-thrift-sasl==0.4.3
-time-machine==2.14.2
-tinycss2==1.3.0
-toml==0.10.2
-tomlkit==0.13.0
-toolz==0.12.1
-tornado==6.4.1
-tqdm==4.66.4
-traitlets==5.2.1.post0
-trino==0.329.0
-typer==0.12.3
-typing-inspect==0.9.0
-typing_extensions==4.12.2
-tzdata==2024.1
-tzlocal==5.2
-uc-micro-py==1.0.3
-ujson==5.10.0
-unicodecsv==0.14.1
-universal_pathlib==0.2.2
-urllib3==1.26.19
-vertica-python==1.4.0
-vertica-sqlalchemy-dialect==0.0.8.2
-vininfo==1.8.0
-wasabi==1.1.3
-wcmatch==8.5.2
-wcwidth==0.2.13
-weasel==0.4.1
-webencodings==0.5.1
-websocket-client==1.8.0
-Werkzeug==2.2.3
-widgetsnbextension==4.0.11
-wirerope==0.4.7
-wrapt==1.16.0
-WTForms==3.1.2
-xlrd==2.0.1
-xmltodict==0.13.0
-yarl==1.9.4
-zeep==4.2.1
-zipp==3.19.2
-zstd==1.5.5.1
@@ -12,7 +12,7 @@ ext {
    docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
    docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"

-   revision = 6 // increment to trigger rebuild
+   revision = 7 // increment to trigger rebuild
 }

 docker {
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-# This script is used to regenerate the base-requirements.txt file
-
-set -euxo pipefail
-cd "$( dirname "${BASH_SOURCE[0]}" )"
-
-SCRIPT_NAME=$(basename "$0")
-DATAHUB_DIR=$(pwd)/../..
-
-# Create a virtualenv.
-VENV_DIR=$(mktemp -d)
-python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9 or higher is required.'"
-python -m venv $VENV_DIR
-source $VENV_DIR/bin/activate
-pip install --upgrade pip uv setuptools wheel
-echo "Using virtualenv at $VENV_DIR"
-
-# Install stuff.
-pushd $DATAHUB_DIR/metadata-ingestion
-uv pip install -e '.[all]' -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]'
-popd
-
-# Generate the requirements file.
-# Removing Flask deps due as per https://github.com/datahub-project/datahub/pull/6867/files
-# Removing py4j and PyJWT due to https://github.com/datahub-project/datahub/pull/6868/files
-# Removing pyspark and pydeequ because we don't want them in the slim image, so they can be added separately.
-# TODO: It's unclear if these removals are still actually needed.
-echo "# Generated requirements file. Run ./$SCRIPT_NAME to regenerate." > base-requirements.txt
-pip freeze \
-  | grep -v -E "^-e" \
-  | grep -v -E "^uv==" \
-  | grep -v "Flask-" \
-  | grep -v -E "(py4j|PyJWT)==" \
-  | grep -v -E "(pyspark|pydeequ)==" \
-  >> base-requirements.txt
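Note: the deleted script's filter chain dropped editable installs and a few known-problematic packages from the pip freeze output. A tiny illustration of the same filters on hypothetical freeze output:

    printf '%s\n' '-e git+https://example.invalid/datahub#egg=acryl-datahub' 'uv==0.2.0' 'Flask-Login==0.6.3' 'py4j==0.10.9.7' 'requests==2.32.3' \
      | grep -v -E "^-e" | grep -v -E "^uv==" | grep -v "Flask-" \
      | grep -v -E "(py4j|PyJWT)==" | grep -v -E "(pyspark|pydeequ)=="
    # only requests==2.32.3 survives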
@@ -21,11 +21,11 @@ ARG PIP_MIRROR_URL
 RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
 ENV UV_INDEX_URL=${PIP_MIRROR_URL}

-COPY --chown=datahub ./metadata-ingestion /datahub-ingestion
-COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin
+COPY --chown=datahub ./metadata-ingestion /metadata-ingestion
+COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /metadata-ingestion/airflow-plugin

 ARG RELEASE_VERSION
-WORKDIR /datahub-ingestion
+WORKDIR /metadata-ingestion
 RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \
     sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/__init__.py && \
     cat src/datahub/__init__.py | grep __version__ && \
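Note: the sed step above swaps the dev placeholder version for the real release version, turning the first "-" into "+" so the result is a valid PEP 440 local version. With a hypothetical RELEASE_VERSION:

    RELEASE_VERSION="0.14.0-rc1"
    echo "$RELEASE_VERSION" | sed s/-/+/   # prints 0.14.0+rc1, which is what gets written into __version__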
@@ -33,7 +33,8 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \

 FROM base AS slim-install

-RUN uv pip install --no-cache -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]"
+RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
+    UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]"

 FROM base AS full-install-build

@@ -43,9 +44,10 @@ RUN apt-get update && apt-get install -y -qq maven
 USER datahub
 COPY ./docker/datahub-ingestion/pyspark_jars.sh .

-RUN uv pip install --no-cache -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \
-    ./pyspark_jars.sh && \
-    datahub --version
+RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
+    UV_LINK_MODE=copy uv pip install -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \
+    datahub --version
+RUN ./pyspark_jars.sh

 FROM base AS full-install

@@ -57,4 +59,6 @@ FROM ${APP_ENV}-install AS final

+WORKDIR /datahub-ingestion
+
 USER datahub
@@ -11,18 +11,21 @@ ARG PIP_MIRROR_URL
 RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
 ENV UV_INDEX_URL=${PIP_MIRROR_URL}

-COPY --chown=datahub ./metadata-ingestion /datahub-ingestion
+COPY --chown=datahub ./metadata-ingestion /metadata-ingestion

 ARG RELEASE_VERSION
-WORKDIR /datahub-ingestion
+WORKDIR /metadata-ingestion
 RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \
     cat src/datahub/__init__.py

 FROM base as slim-install

-RUN uv pip install --no-cache -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \
+RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \
+    UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \
     datahub --version

 FROM slim-install as final

+WORKDIR /datahub-ingestion
+
 USER datahub
@@ -12,7 +12,7 @@ ext {
    docker_target = project.getProperties().getOrDefault("dockerTarget", "slim")
    docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}"

-   revision = 6 // increment to trigger rebuild
+   revision = 8 // increment to trigger rebuild
 }

 dependencies {