diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b5fa65525..30861fd11 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -55,6 +55,27 @@ jobs:
- name: Install all extras
run: make check-extras
+ check-licenses:
+ strategy:
+ matrix:
+ python-version: [ "3.12" ]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ # NOTE(robinson) - dependencies are installed first because liccheck
+ # produces an error if there is a mismatch between the dep version
+ # in the requirements file and the dep version in site packages
+ - name: Install all doc and test dependencies
+ run: |
+ make install-ci
+ make install-paddleocr
+ make install-all-ingest
+ make check-licenses
+
lint:
strategy:
matrix:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d16c6f0d..41ac4e725 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
* **Bump unstructured.paddleocr to 2.8.0.**
* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `
`, `
`) nested inside a phrasing element (e.g. `` or ``). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
* **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner.
+* **CI check for dependency licenses.** Adds a CI check to ensure dependencies are appropriately licensed.
### Features
diff --git a/Makefile b/Makefile
index f4176b5bd..c1d02a0c6 100644
--- a/Makefile
+++ b/Makefile
@@ -395,6 +395,10 @@ check-black:
check-flake8:
flake8 .
+.PHONY: check-licenses
+check-licenses:
+ @scripts/check-licenses.sh
+
# Check for print statements in ingest since anything going to console should be using the ingest logger
# as it has a built in filter to redact sensitive information
.PHONY: check-flake8-print
diff --git a/liccheck.ini b/liccheck.ini
new file mode 100644
index 000000000..0b101ee6f
--- /dev/null
+++ b/liccheck.ini
@@ -0,0 +1,79 @@
+# Authorized and unauthorized licenses in LOWER CASE
+[Licenses]
+authorized_licenses:
+ ######################
+ # Permissive Licenses
+ ######################
+
+ # Apache-2.0
+ apache
+ apache 2.0
+ apache-2.0
+ apache software license
+ apache software
+ apache license v2.0
+ apache license 2.0
+ apache license, version 2.0
+
+ # BSD
+ bsd
+ new bsd
+ bsd license
+ new bsd license
+ simplified bsd
+ 3-clause bsd
+ freebsd
+ bsd 3-clause
+
+ # MIT
+ mit
+ mit license
+
+ # ISC
+ isc license
+ isc license (iscl)
+
+ # The Unlicense
+ the unlicense (unlicense)
+
+ # HPND
+ historical permission notice and disclaimer (hpnd)
+
+ #########################
+ # Weak Copy Left Licenses
+ #########################
+
+ # MPL-2.0
+ mozilla public license 2.0 (mpl 2.0)
+
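+ # LGPL (NOTE(review): the gplv2 entry in this group is GPL, not LGPL - confirm it is intentionally authorized)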
+ gnu lesser general public license v2 or later (lgplv2+)
+ gnu lgpl
+ lgpl with exceptions or zpl
+ gnu library or lesser general public license (lgpl)
+ gnu lesser general public license v3 (lgplv3)
+ gnu general public license v2 (gplv2)
+
+ # PSF-2.0
+ python software foundation
+ python software foundation license
+
+
+unauthorized_licenses:
+ ###########################
+ # Strong Copy Left Licenses
+ ###########################
+ gpl v3
+
+[Authorized Packages]
+# Apache-2.0 https://github.com/chroma-core/hnswlib#Apache-2.0-1-ov-file
+chroma-hnswlib: >=0.7.3
+# MIT https://github.com/facebookresearch/iopath?tab=MIT-1-ov-file#readme
+iopath: >=0.1.10
+# BSD https://github.com/PDFium/PDFium?tab=BSD-3-Clause-1-ov-file#readme
+pypdfium2: >=4.30.0
+# MIT https://github.com/voyage-ai/voyageai-python?tab=MIT-1-ov-file#readme
+voyageai: >=0.2.3
+# OpenLDAP Public License, which is a permissive BSD style license
+# https://github.com/jnwatson/py-lmdb/?tab=License-1-ov-file#readme
+lmdb: >=1.5.1
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index d4f53a850..187bca54c 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -58,7 +58,7 @@ googleapis-common-protos==1.63.2
# via
# google-api-core
# grpcio-status
-grpcio==1.64.1
+grpcio==1.65.0
# via
# google-api-core
# grpcio-status
diff --git a/requirements/ingest/astra.txt b/requirements/ingest/astra.txt
index ffbfec5fd..65d09f3c5 100644
--- a/requirements/ingest/astra.txt
+++ b/requirements/ingest/astra.txt
@@ -9,7 +9,7 @@ anyio==3.7.1
# -c ./ingest/../base.txt
# -c ./ingest/../deps/constraints.txt
# httpx
-astrapy==1.3.1
+astrapy==1.4.0
# via -r ./ingest/astra.in
bson==0.5.10
# via astrapy
diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt
index e3733e761..271928d33 100644
--- a/requirements/ingest/chroma.txt
+++ b/requirements/ingest/chroma.txt
@@ -37,9 +37,9 @@ charset-normalizer==3.3.2
# via
# -c ./ingest/../base.txt
# requests
-chroma-hnswlib==0.7.3
+chroma-hnswlib==0.7.5
# via chromadb
-chromadb==0.5.3
+chromadb==0.5.4
# via -r ./ingest/chroma.in
click==8.1.7
# via
@@ -70,7 +70,7 @@ google-auth==2.32.0
# via kubernetes
googleapis-common-protos==1.63.2
# via opentelemetry-exporter-otlp-proto-grpc
-grpcio==1.64.1
+grpcio==1.65.0
# via
# chromadb
# opentelemetry-exporter-otlp-proto-grpc
@@ -216,7 +216,6 @@ pyyaml==6.0.1
requests==2.32.3
# via
# -c ./ingest/../base.txt
- # chromadb
# huggingface-hub
# kubernetes
# posthog
diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt
index 135f6eea4..628b057bd 100644
--- a/requirements/ingest/clarifai.txt
+++ b/requirements/ingest/clarifai.txt
@@ -21,7 +21,7 @@ contextlib2==21.6.0
# via schema
googleapis-common-protos==1.63.2
# via clarifai-grpc
-grpcio==1.64.1
+grpcio==1.65.0
# via clarifai-grpc
idna==3.7
# via
diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt
index 2e3202f3d..85a71df31 100644
--- a/requirements/ingest/embed-aws-bedrock.txt
+++ b/requirements/ingest/embed-aws-bedrock.txt
@@ -61,14 +61,14 @@ langchain-community==0.2.7
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-aws-bedrock.in
-langchain-core==0.2.12
+langchain-core==0.2.13
# via
# langchain
# langchain-community
# langchain-text-splitters
langchain-text-splitters==0.2.2
# via langchain
-langsmith==0.1.84
+langsmith==0.1.85
# via
# langchain
# langchain-community
diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
index c303d15f4..1f7c63bbd 100644
--- a/requirements/ingest/embed-huggingface.txt
+++ b/requirements/ingest/embed-huggingface.txt
@@ -73,14 +73,14 @@ langchain-community==0.2.7
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-huggingface.in
-langchain-core==0.2.12
+langchain-core==0.2.13
# via
# langchain
# langchain-community
# langchain-text-splitters
langchain-text-splitters==0.2.2
# via langchain
-langsmith==0.1.84
+langsmith==0.1.85
# via
# langchain
# langchain-community
diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt
index 50f36a6f3..780b4a914 100644
--- a/requirements/ingest/embed-octoai.txt
+++ b/requirements/ingest/embed-octoai.txt
@@ -47,7 +47,7 @@ idna==3.7
# anyio
# httpx
# requests
-openai==1.35.12
+openai==1.35.13
# via -r ./ingest/embed-octoai.in
pydantic==2.8.2
# via openai
diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt
index 56f9cee28..0e664c265 100644
--- a/requirements/ingest/embed-openai.txt
+++ b/requirements/ingest/embed-openai.txt
@@ -78,14 +78,14 @@ langchain-community==0.2.7
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-openai.in
-langchain-core==0.2.12
+langchain-core==0.2.13
# via
# langchain
# langchain-community
# langchain-text-splitters
langchain-text-splitters==0.2.2
# via langchain
-langsmith==0.1.84
+langsmith==0.1.85
# via
# langchain
# langchain-community
@@ -107,7 +107,7 @@ numpy==1.26.4
# -c ./ingest/../base.txt
# langchain
# langchain-community
-openai==1.35.12
+openai==1.35.13
# via -r ./ingest/embed-openai.in
orjson==3.10.6
# via langsmith
diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt
index 09a8a7d1a..fa0039cd9 100644
--- a/requirements/ingest/embed-vertexai.txt
+++ b/requirements/ingest/embed-vertexai.txt
@@ -54,7 +54,7 @@ google-auth==2.32.0
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
-google-cloud-aiplatform==1.58.0
+google-cloud-aiplatform==1.59.0
# via langchain-google-vertexai
google-cloud-bigquery==3.25.0
# via google-cloud-aiplatform
@@ -83,7 +83,7 @@ googleapis-common-protos[grpc]==1.63.2
# grpcio-status
grpc-google-iam-v1==0.13.1
# via google-cloud-resource-manager
-grpcio==1.64.1
+grpcio==1.65.0
# via
# google-api-core
# googleapis-common-protos
@@ -108,7 +108,7 @@ langchain-community==0.2.7
# via
# -c ./ingest/../deps/constraints.txt
# -r ./ingest/embed-vertexai.in
-langchain-core==0.2.12
+langchain-core==0.2.13
# via
# langchain
# langchain-community
@@ -118,7 +118,7 @@ langchain-google-vertexai==1.0.6
# via -r ./ingest/embed-vertexai.in
langchain-text-splitters==0.2.2
# via langchain
-langsmith==0.1.84
+langsmith==0.1.85
# via
# langchain
# langchain-community
diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt
index 00ce11bfb..264dbcca5 100644
--- a/requirements/ingest/embed-voyageai.txt
+++ b/requirements/ingest/embed-voyageai.txt
@@ -44,7 +44,7 @@ jsonpointer==3.0.0
# via jsonpatch
langchain==0.2.7
# via -r ./ingest/embed-voyageai.in
-langchain-core==0.2.12
+langchain-core==0.2.13
# via
# langchain
# langchain-text-splitters
@@ -53,7 +53,7 @@ langchain-text-splitters==0.2.2
# via langchain
langchain-voyageai==0.1.1
# via -r ./ingest/embed-voyageai.in
-langsmith==0.1.84
+langsmith==0.1.85
# via
# langchain
# langchain-core
diff --git a/requirements/ingest/kafka.txt b/requirements/ingest/kafka.txt
index fa5486cc4..adc8a9b30 100644
--- a/requirements/ingest/kafka.txt
+++ b/requirements/ingest/kafka.txt
@@ -4,5 +4,5 @@
#
# pip-compile ./ingest/kafka.in
#
-confluent-kafka==2.4.0
+confluent-kafka==2.5.0
# via -r ./ingest/kafka.in
diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt
index 16dfcaef5..9347ad252 100644
--- a/requirements/ingest/qdrant.txt
+++ b/requirements/ingest/qdrant.txt
@@ -21,7 +21,7 @@ exceptiongroup==1.2.1
# via
# -c ./ingest/../base.txt
# anyio
-grpcio==1.64.1
+grpcio==1.65.0
# via
# grpcio-tools
# qdrant-client
diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt
index c924cad3d..76854d98b 100644
--- a/requirements/ingest/singlestore.txt
+++ b/requirements/ingest/singlestore.txt
@@ -42,7 +42,7 @@ requests==2.32.3
# via
# -c ./ingest/../base.txt
# singlestoredb
-singlestoredb==1.4.2
+singlestoredb==1.4.3
# via -r ./ingest/singlestore.in
sqlparams==6.0.1
# via singlestoredb
diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt
index 5b90049e5..09e5f9aac 100644
--- a/requirements/ingest/weaviate.txt
+++ b/requirements/ingest/weaviate.txt
@@ -32,7 +32,7 @@ exceptiongroup==1.2.1
# via
# -c ./ingest/../base.txt
# anyio
-grpcio==1.64.1
+grpcio==1.65.0
# via
# grpcio-health-checking
# grpcio-tools
diff --git a/requirements/test.in b/requirements/test.in
index 312206730..c763c091d 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -20,3 +20,4 @@ types-tabulate
vcrpy
grpcio
autoflake
+liccheck
diff --git a/requirements/test.txt b/requirements/test.txt
index e50f5bd8f..ff110818e 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -47,7 +47,7 @@ flake8-print==5.0.0
# via -r ./test.in
freezegun==1.5.1
# via -r ./test.in
-grpcio==1.64.1
+grpcio==1.65.0
# via -r ./test.in
idna==3.7
# via
@@ -64,6 +64,8 @@ label-studio-sdk==0.0.34
# via -r ./test.in
label-studio-tools==0.0.4
# via label-studio-sdk
+liccheck==0.9.2
+ # via -r ./test.in
lxml==5.2.2
# via
# -c ./base.txt
@@ -136,10 +138,14 @@ rpds-py==0.19.0
# referencing
ruff==0.4.10
# via -r ./test.in
+semantic-version==2.10.0
+ # via liccheck
six==1.16.0
# via
# -c ./base.txt
# python-dateutil
+toml==0.10.2
+ # via liccheck
tomli==2.0.1
# via
# autoflake
diff --git a/scripts/check-licenses.sh b/scripts/check-licenses.sh
new file mode 100755
index 000000000..a7e7a2eef
--- /dev/null
+++ b/scripts/check-licenses.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Runs liccheck against every requirements file that should be license-checked
+# and exits non-zero as soon as one of them fails.
+#
+# Files checked (searched recursively under ./requirements):
+#   - every *.txt file, except extra-pdf-image.txt, constraints.txt and dev.txt
+#   - extra-pdf-image.in
+#
+# NOTE: find ANDs adjacent tests, so the two positive -name patterns must be
+# grouped with -o; `-name "*.txt" -name "extra-pdf-image.in"` can never match
+# anything and would make this check pass without inspecting a single file.
+REQUIREMENTS_FILES=$(find requirements -type f \
+  \( -name "*.txt" -o -name "extra-pdf-image.in" \) \
+  ! -name "extra-pdf-image.txt" \
+  ! -name "constraints.txt" \
+  ! -name "dev.txt")
+
+# An empty list means nothing would be checked; fail rather than pass vacuously.
+if [ -z "$REQUIREMENTS_FILES" ]; then
+  echo "No requirements files found to check." >&2
+  exit 1
+fi
+
+# Word splitting on the unquoted list is intentional: one path per iteration.
+for REQUIREMENTS_FILE in $REQUIREMENTS_FILES; do
+  echo "Checking $REQUIREMENTS_FILE"
+  liccheck -r "$REQUIREMENTS_FILE"
+  EXIT_CODE=$?
+  if [ "$EXIT_CODE" -eq 0 ]; then
+    echo "All dependencies have authorized licenses."
+  else
+    echo "There are dependencies with unauthorized or unknown licenses."
+    exit 1
+  fi
+done
+
+exit 0