diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5fa65525..30861fd11 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,6 +55,27 @@ jobs: - name: Install all extras run: make check-extras + check-licenses: + strategy: + matrix: + python-version: [ "3.12" ] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + # NOTE(robinson) - dependencies are installed first because liccheck + # produces an error if there is a mismatch between the dep version + # in the requirements file and the dep version in site packages + - name: Install all doc and test dependencies + run: | + make install-ci + make install-paddleocr + make install-all-ingest + make check-licenses + lint: strategy: matrix: diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d16c6f0d..41ac4e725 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * **Bump unstructured.paddleocr to 2.8.0.** * **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `

`, `

`) nested inside a phrasing element (e.g. `` or ``). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation. * **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner. +* **CI check for dependency licenses.** Adds a CI check to ensure dependencies are appropriately licensed. ### Features diff --git a/Makefile b/Makefile index f4176b5bd..c1d02a0c6 100644 --- a/Makefile +++ b/Makefile @@ -395,6 +395,10 @@ check-black: check-flake8: flake8 . +.PHONY: check-licenses +check-licenses: + @scripts/check-licenses.sh + # Check for print statements in ingest since anything going to console should be using the ingest logger # as it has a built in filter to redact sensitive information .PHONY: check-flake8-print diff --git a/liccheck.ini b/liccheck.ini new file mode 100644 index 000000000..0b101ee6f --- /dev/null +++ b/liccheck.ini @@ -0,0 +1,79 @@ +# Authorized and unauthorized licenses in LOWER CASE +[Licenses] +authorized_licenses: + ###################### + # Permissive Licenses + ###################### + + # Apache-2.0 + apache + apache 2.0 + apache-2.0 + apache software license + apache software + apache license v2.0 + apache license 2.0 + apache license, version 2.0 + + # BSD + bsd + new bsd + bsd license + new bsd license + simplified bsd + 3-clause bsd + freebsd + bsd 3-clause + + # MIT + mit + mit license + + # ISC + isc license + isc license (iscl) + + # The Unlicense + the unlicense (unlicense) + + # HPND + historical permission notice and disclaimer (hpnd) + + ######################### + # Weak Copy Left Licenses + ######################### + + # MPL-2.0 + mozilla public license 2.0 
(mpl 2.0) + + # LGPL + gnu lesser general public license v2 or later (lgplv2+) + gnu lgpl + lgpl with exceptions or zpl + gnu library or lesser general public license (lgpl) + gnu lesser general public license v3 (lgplv3) + gnu general public license v2 (gplv2) + + # PSF-2.0 + python software foundation + python software foundation license + + +unauthorized_licenses: + ########################### + # Strong Copy Left Licenses + ########################### + gpl v3 + +[Authorized Packages] +# Apache-2.0 https://github.com/chroma-core/hnswlib#Apache-2.0-1-ov-file +chroma-hnswlib: >=0.7.3 +# MIT https://github.com/facebookresearch/iopath?tab=MIT-1-ov-file#readme +iopath: >=0.1.10 +# BSD https://github.com/PDFium/PDFium?tab=BSD-3-Clause-1-ov-file#readme +pypdfium2: >=4.30.0 +# MIT https://github.com/voyage-ai/voyageai-python?tab=MIT-1-ov-file#readme +voyageai: >=0.2.3 +# OpenLDAP Public License, which is a permissive BSD style license +# https://github.com/jnwatson/py-lmdb/?tab=License-1-ov-file#readme +lmdb: >=1.5.1 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index d4f53a850..187bca54c 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -58,7 +58,7 @@ googleapis-common-protos==1.63.2 # via # google-api-core # grpcio-status -grpcio==1.64.1 +grpcio==1.65.0 # via # google-api-core # grpcio-status diff --git a/requirements/ingest/astra.txt b/requirements/ingest/astra.txt index ffbfec5fd..65d09f3c5 100644 --- a/requirements/ingest/astra.txt +++ b/requirements/ingest/astra.txt @@ -9,7 +9,7 @@ anyio==3.7.1 # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx -astrapy==1.3.1 +astrapy==1.4.0 # via -r ./ingest/astra.in bson==0.5.10 # via astrapy diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index e3733e761..271928d33 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -37,9 +37,9 @@ charset-normalizer==3.3.2 # via # -c 
./ingest/../base.txt # requests -chroma-hnswlib==0.7.3 +chroma-hnswlib==0.7.5 # via chromadb -chromadb==0.5.3 +chromadb==0.5.4 # via -r ./ingest/chroma.in click==8.1.7 # via @@ -70,7 +70,7 @@ google-auth==2.32.0 # via kubernetes googleapis-common-protos==1.63.2 # via opentelemetry-exporter-otlp-proto-grpc -grpcio==1.64.1 +grpcio==1.65.0 # via # chromadb # opentelemetry-exporter-otlp-proto-grpc @@ -216,7 +216,6 @@ pyyaml==6.0.1 requests==2.32.3 # via # -c ./ingest/../base.txt - # chromadb # huggingface-hub # kubernetes # posthog diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt index 135f6eea4..628b057bd 100644 --- a/requirements/ingest/clarifai.txt +++ b/requirements/ingest/clarifai.txt @@ -21,7 +21,7 @@ contextlib2==21.6.0 # via schema googleapis-common-protos==1.63.2 # via clarifai-grpc -grpcio==1.64.1 +grpcio==1.65.0 # via clarifai-grpc idna==3.7 # via diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index 2e3202f3d..85a71df31 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -61,14 +61,14 @@ langchain-community==0.2.7 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-aws-bedrock.in -langchain-core==0.2.12 +langchain-core==0.2.13 # via # langchain # langchain-community # langchain-text-splitters langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.84 +langsmith==0.1.85 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index c303d15f4..1f7c63bbd 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -73,14 +73,14 @@ langchain-community==0.2.7 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-huggingface.in -langchain-core==0.2.12 +langchain-core==0.2.13 # via # langchain # langchain-community # langchain-text-splitters langchain-text-splitters==0.2.2 # 
via langchain -langsmith==0.1.84 +langsmith==0.1.85 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index 50f36a6f3..780b4a914 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -47,7 +47,7 @@ idna==3.7 # anyio # httpx # requests -openai==1.35.12 +openai==1.35.13 # via -r ./ingest/embed-octoai.in pydantic==2.8.2 # via openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 56f9cee28..0e664c265 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -78,14 +78,14 @@ langchain-community==0.2.7 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-openai.in -langchain-core==0.2.12 +langchain-core==0.2.13 # via # langchain # langchain-community # langchain-text-splitters langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.84 +langsmith==0.1.85 # via # langchain # langchain-community @@ -107,7 +107,7 @@ numpy==1.26.4 # -c ./ingest/../base.txt # langchain # langchain-community -openai==1.35.12 +openai==1.35.13 # via -r ./ingest/embed-openai.in orjson==3.10.6 # via langsmith diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index 09a8a7d1a..fa0039cd9 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -54,7 +54,7 @@ google-auth==2.32.0 # google-cloud-core # google-cloud-resource-manager # google-cloud-storage -google-cloud-aiplatform==1.58.0 +google-cloud-aiplatform==1.59.0 # via langchain-google-vertexai google-cloud-bigquery==3.25.0 # via google-cloud-aiplatform @@ -83,7 +83,7 @@ googleapis-common-protos[grpc]==1.63.2 # grpcio-status grpc-google-iam-v1==0.13.1 # via google-cloud-resource-manager -grpcio==1.64.1 +grpcio==1.65.0 # via # google-api-core # googleapis-common-protos @@ -108,7 +108,7 @@ langchain-community==0.2.7 # via # -c 
./ingest/../deps/constraints.txt # -r ./ingest/embed-vertexai.in -langchain-core==0.2.12 +langchain-core==0.2.13 # via # langchain # langchain-community @@ -118,7 +118,7 @@ langchain-google-vertexai==1.0.6 # via -r ./ingest/embed-vertexai.in langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.84 +langsmith==0.1.85 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index 00ce11bfb..264dbcca5 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -44,7 +44,7 @@ jsonpointer==3.0.0 # via jsonpatch langchain==0.2.7 # via -r ./ingest/embed-voyageai.in -langchain-core==0.2.12 +langchain-core==0.2.13 # via # langchain # langchain-text-splitters @@ -53,7 +53,7 @@ langchain-text-splitters==0.2.2 # via langchain langchain-voyageai==0.1.1 # via -r ./ingest/embed-voyageai.in -langsmith==0.1.84 +langsmith==0.1.85 # via # langchain # langchain-core diff --git a/requirements/ingest/kafka.txt b/requirements/ingest/kafka.txt index fa5486cc4..adc8a9b30 100644 --- a/requirements/ingest/kafka.txt +++ b/requirements/ingest/kafka.txt @@ -4,5 +4,5 @@ # # pip-compile ./ingest/kafka.in # -confluent-kafka==2.4.0 +confluent-kafka==2.5.0 # via -r ./ingest/kafka.in diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index 16dfcaef5..9347ad252 100644 --- a/requirements/ingest/qdrant.txt +++ b/requirements/ingest/qdrant.txt @@ -21,7 +21,7 @@ exceptiongroup==1.2.1 # via # -c ./ingest/../base.txt # anyio -grpcio==1.64.1 +grpcio==1.65.0 # via # grpcio-tools # qdrant-client diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt index c924cad3d..76854d98b 100644 --- a/requirements/ingest/singlestore.txt +++ b/requirements/ingest/singlestore.txt @@ -42,7 +42,7 @@ requests==2.32.3 # via # -c ./ingest/../base.txt # singlestoredb -singlestoredb==1.4.2 +singlestoredb==1.4.3 # via -r ./ingest/singlestore.in 
sqlparams==6.0.1 # via singlestoredb diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index 5b90049e5..09e5f9aac 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -32,7 +32,7 @@ exceptiongroup==1.2.1 # via # -c ./ingest/../base.txt # anyio -grpcio==1.64.1 +grpcio==1.65.0 # via # grpcio-health-checking # grpcio-tools diff --git a/requirements/test.in b/requirements/test.in index 312206730..c763c091d 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -20,3 +20,4 @@ types-tabulate vcrpy grpcio autoflake +liccheck diff --git a/requirements/test.txt b/requirements/test.txt index e50f5bd8f..ff110818e 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -47,7 +47,7 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.1 # via -r ./test.in -grpcio==1.64.1 +grpcio==1.65.0 # via -r ./test.in idna==3.7 # via @@ -64,6 +64,8 @@ label-studio-sdk==0.0.34 # via -r ./test.in label-studio-tools==0.0.4 # via label-studio-sdk +liccheck==0.9.2 + # via -r ./test.in lxml==5.2.2 # via # -c ./base.txt @@ -136,10 +138,14 @@ rpds-py==0.19.0 # referencing ruff==0.4.10 # via -r ./test.in +semantic-version==2.10.0 + # via liccheck six==1.16.0 # via # -c ./base.txt # python-dateutil +toml==0.10.2 + # via liccheck tomli==2.0.1 # via # autoflake diff --git a/scripts/check-licenses.sh b/scripts/check-licenses.sh new file mode 100755 index 000000000..a7e7a2eef --- /dev/null +++ b/scripts/check-licenses.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Runs liccheck on each requirements file; the two -name tests are OR-ed inside \( \) because find ANDs adjacent tests, which previously matched no files at all. +REQUIREMENTS_FILES=$(find requirements -type f \( -name "*.txt" \ + -o -name "extra-pdf-image.in" \) \ + ! -name "extra-pdf-image.txt" \ + ! -name "constraints.txt" \ + ! -name "dev.txt") + +for REQUIREMENTS_FILE in $REQUIREMENTS_FILES; do + echo "Checking $REQUIREMENTS_FILE" + liccheck -r "$REQUIREMENTS_FILE" + EXIT_CODE=$? + if [ "$EXIT_CODE" -eq 0 ]; then + echo "All dependencies have authorized licenses." 
+ else + echo "There are dependencies with unauthorized or unknown licenses." + exit 1 + fi +done + +exit 0