mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
build: check dependency licenses in CI (#3349)
### Summary Adds a CI check to ensure that packages added as dependencies are appropriately licensed. All of the `.txt` files in the `requirements` directory are checked with the exception of: - `constraints.txt`, since those are not installed and are instead conditions on the other dependency files - `dev.txt`, since those are for local development and not shipped as part of the `unstructured` package - `extra-pdf-image.txt`, for which `extra-pdf-image.in` is checked instead, since checking `extra-pdf-image.txt` pulls in NVIDIA GPU related packages with an `Other/Proprietary` license type, and there's not a good way to exclude those without adding `Other/Proprietary` to the allowed licenses list. ### Testing The new `check-licenses` job should pass in CI.
This commit is contained in:
parent
3d6e30a1f7
commit
ee2b247297
21
.github/workflows/ci.yml
vendored
21
.github/workflows/ci.yml
vendored
@ -55,6 +55,27 @@ jobs:
|
||||
- name: Install all extras
|
||||
run: make check-extras
|
||||
|
||||
check-licenses:
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [ "3.12" ]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
# NOTE(robinson) - dependencies are installed first because liccheck
|
||||
# produces an error if there is a mismatch between the dep version
|
||||
# in the requirements file and the dep version in site packages
|
||||
- name: Install all doc and test dependencies
|
||||
run: |
|
||||
make install-ci
|
||||
make install-paddleocr
|
||||
make install-all-ingest
|
||||
make check-licenses
|
||||
|
||||
lint:
|
||||
strategy:
|
||||
matrix:
|
||||
|
@ -5,6 +5,7 @@
|
||||
* **Bump unstructured.paddleocr to 2.8.0.**
|
||||
* **Refine HTML parser to accommodate block element nested in phrasing.** HTML parser no longer raises on a block element (e.g. `<p>`, `<div>`) nested inside a phrasing element (e.g. `<strong>` or `<cite>`). Instead it breaks the phrasing run (and therefore element) at the block-item start and begins a new phrasing run after the block-item. This is consistent with how the browser determines element boundaries in this situation.
|
||||
* **Install rewritten HTML parser to fix 12 existing bugs and provide headroom for refinement and growth.** A rewritten HTML parser resolves a collection of outstanding bugs with HTML partitioning and provides a firm foundation for further elaborating that important partitioner.
|
||||
* **CI check for dependency licenses.** Adds a CI check to ensure dependencies are appropriately licensed.
|
||||
|
||||
### Features
|
||||
|
||||
|
4
Makefile
4
Makefile
@ -395,6 +395,10 @@ check-black:
|
||||
check-flake8:
|
||||
flake8 .
|
||||
|
||||
.PHONY: check-licenses
|
||||
check-licenses:
|
||||
@scripts/check-licenses.sh
|
||||
|
||||
# Check for print statements in ingest since anything going to console should be using the ingest logger
|
||||
# as it has a built in filter to redact sensitive information
|
||||
.PHONY: check-flake8-print
|
||||
|
79
liccheck.ini
Normal file
79
liccheck.ini
Normal file
@ -0,0 +1,79 @@
|
||||
# Authorized and unauthorized licenses in LOWER CASE
|
||||
[Licenses]
|
||||
authorized_licenses:
|
||||
######################
|
||||
# Permissive Licenses
|
||||
######################
|
||||
|
||||
# Apache-2.0
|
||||
apache
|
||||
apache 2.0
|
||||
apache-2.0
|
||||
apache software license
|
||||
apache software
|
||||
apache license v2.0
|
||||
apache license 2.0
|
||||
apache license, version 2.0
|
||||
|
||||
# BSD
|
||||
bsd
|
||||
new bsd
|
||||
bsd license
|
||||
new bsd license
|
||||
simplified bsd
|
||||
3-clause bsd
|
||||
freebsd
|
||||
bsd 3-clause
|
||||
|
||||
# MIT
|
||||
mit
|
||||
mit license
|
||||
|
||||
# ISC
|
||||
isc license
|
||||
isc license (iscl)
|
||||
|
||||
# The Unlicense
|
||||
the unlicense (unlicense)
|
||||
|
||||
# HPND
|
||||
historical permission notice and disclaimer (hpnd)
|
||||
|
||||
#########################
|
||||
# Weak Copy Left Licenses
|
||||
#########################
|
||||
|
||||
# MPL-2.0
|
||||
mozilla public license 2.0 (mpl 2.0)
|
||||
|
||||
# LGPL (NOTE(review): the "gplv2" entry below is GPL, not LGPL - confirm it is meant to be authorized)
|
||||
gnu lesser general public license v2 or later (lgplv2+)
|
||||
gnu lgpl
|
||||
lgpl with exceptions or zpl
|
||||
gnu library or lesser general public license (lgpl)
|
||||
gnu lesser general public license v3 (lgplv3)
|
||||
gnu general public license v2 (gplv2)
|
||||
|
||||
# PSF-2.0
|
||||
python software foundation
|
||||
python software foundation license
|
||||
|
||||
|
||||
unauthorized_licenses:
|
||||
###########################
|
||||
# Strong Copy Left Licenses
|
||||
###########################
|
||||
gpl v3
|
||||
|
||||
[Authorized Packages]
|
||||
# Apache-2.0 https://github.com/chroma-core/hnswlib#Apache-2.0-1-ov-file
|
||||
chroma-hnswlib: >=0.7.3
|
||||
# MIT https://github.com/facebookresearch/iopath?tab=MIT-1-ov-file#readme
|
||||
iopath: >=0.1.10
|
||||
# BSD https://github.com/PDFium/PDFium?tab=BSD-3-Clause-1-ov-file#readme
|
||||
pypdfium2: >=4.30.0
|
||||
# MIT https://github.com/voyage-ai/voyageai-python?tab=MIT-1-ov-file#readme
|
||||
voyageai: >=0.2.3
|
||||
# OpenLDAP Public License, which is a permissive BSD style license
|
||||
# https://github.com/jnwatson/py-lmdb/?tab=License-1-ov-file#readme
|
||||
lmdb: >=1.5.1
|
@ -58,7 +58,7 @@ googleapis-common-protos==1.63.2
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
grpcio==1.64.1
|
||||
grpcio==1.65.0
|
||||
# via
|
||||
# google-api-core
|
||||
# grpcio-status
|
||||
|
@ -9,7 +9,7 @@ anyio==3.7.1
|
||||
# -c ./ingest/../base.txt
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# httpx
|
||||
astrapy==1.3.1
|
||||
astrapy==1.4.0
|
||||
# via -r ./ingest/astra.in
|
||||
bson==0.5.10
|
||||
# via astrapy
|
||||
|
@ -37,9 +37,9 @@ charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# requests
|
||||
chroma-hnswlib==0.7.3
|
||||
chroma-hnswlib==0.7.5
|
||||
# via chromadb
|
||||
chromadb==0.5.3
|
||||
chromadb==0.5.4
|
||||
# via -r ./ingest/chroma.in
|
||||
click==8.1.7
|
||||
# via
|
||||
@ -70,7 +70,7 @@ google-auth==2.32.0
|
||||
# via kubernetes
|
||||
googleapis-common-protos==1.63.2
|
||||
# via opentelemetry-exporter-otlp-proto-grpc
|
||||
grpcio==1.64.1
|
||||
grpcio==1.65.0
|
||||
# via
|
||||
# chromadb
|
||||
# opentelemetry-exporter-otlp-proto-grpc
|
||||
@ -216,7 +216,6 @@ pyyaml==6.0.1
|
||||
requests==2.32.3
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# chromadb
|
||||
# huggingface-hub
|
||||
# kubernetes
|
||||
# posthog
|
||||
|
@ -21,7 +21,7 @@ contextlib2==21.6.0
|
||||
# via schema
|
||||
googleapis-common-protos==1.63.2
|
||||
# via clarifai-grpc
|
||||
grpcio==1.64.1
|
||||
grpcio==1.65.0
|
||||
# via clarifai-grpc
|
||||
idna==3.7
|
||||
# via
|
||||
|
@ -61,14 +61,14 @@ langchain-community==0.2.7
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/embed-aws-bedrock.in
|
||||
langchain-core==0.2.12
|
||||
langchain-core==0.2.13
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-text-splitters
|
||||
langchain-text-splitters==0.2.2
|
||||
# via langchain
|
||||
langsmith==0.1.84
|
||||
langsmith==0.1.85
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
|
@ -73,14 +73,14 @@ langchain-community==0.2.7
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/embed-huggingface.in
|
||||
langchain-core==0.2.12
|
||||
langchain-core==0.2.13
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-text-splitters
|
||||
langchain-text-splitters==0.2.2
|
||||
# via langchain
|
||||
langsmith==0.1.84
|
||||
langsmith==0.1.85
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
|
@ -47,7 +47,7 @@ idna==3.7
|
||||
# anyio
|
||||
# httpx
|
||||
# requests
|
||||
openai==1.35.12
|
||||
openai==1.35.13
|
||||
# via -r ./ingest/embed-octoai.in
|
||||
pydantic==2.8.2
|
||||
# via openai
|
||||
|
@ -78,14 +78,14 @@ langchain-community==0.2.7
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/embed-openai.in
|
||||
langchain-core==0.2.12
|
||||
langchain-core==0.2.13
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
# langchain-text-splitters
|
||||
langchain-text-splitters==0.2.2
|
||||
# via langchain
|
||||
langsmith==0.1.84
|
||||
langsmith==0.1.85
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@ -107,7 +107,7 @@ numpy==1.26.4
|
||||
# -c ./ingest/../base.txt
|
||||
# langchain
|
||||
# langchain-community
|
||||
openai==1.35.12
|
||||
openai==1.35.13
|
||||
# via -r ./ingest/embed-openai.in
|
||||
orjson==3.10.6
|
||||
# via langsmith
|
||||
|
@ -54,7 +54,7 @@ google-auth==2.32.0
|
||||
# google-cloud-core
|
||||
# google-cloud-resource-manager
|
||||
# google-cloud-storage
|
||||
google-cloud-aiplatform==1.58.0
|
||||
google-cloud-aiplatform==1.59.0
|
||||
# via langchain-google-vertexai
|
||||
google-cloud-bigquery==3.25.0
|
||||
# via google-cloud-aiplatform
|
||||
@ -83,7 +83,7 @@ googleapis-common-protos[grpc]==1.63.2
|
||||
# grpcio-status
|
||||
grpc-google-iam-v1==0.13.1
|
||||
# via google-cloud-resource-manager
|
||||
grpcio==1.64.1
|
||||
grpcio==1.65.0
|
||||
# via
|
||||
# google-api-core
|
||||
# googleapis-common-protos
|
||||
@ -108,7 +108,7 @@ langchain-community==0.2.7
|
||||
# via
|
||||
# -c ./ingest/../deps/constraints.txt
|
||||
# -r ./ingest/embed-vertexai.in
|
||||
langchain-core==0.2.12
|
||||
langchain-core==0.2.13
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
@ -118,7 +118,7 @@ langchain-google-vertexai==1.0.6
|
||||
# via -r ./ingest/embed-vertexai.in
|
||||
langchain-text-splitters==0.2.2
|
||||
# via langchain
|
||||
langsmith==0.1.84
|
||||
langsmith==0.1.85
|
||||
# via
|
||||
# langchain
|
||||
# langchain-community
|
||||
|
@ -44,7 +44,7 @@ jsonpointer==3.0.0
|
||||
# via jsonpatch
|
||||
langchain==0.2.7
|
||||
# via -r ./ingest/embed-voyageai.in
|
||||
langchain-core==0.2.12
|
||||
langchain-core==0.2.13
|
||||
# via
|
||||
# langchain
|
||||
# langchain-text-splitters
|
||||
@ -53,7 +53,7 @@ langchain-text-splitters==0.2.2
|
||||
# via langchain
|
||||
langchain-voyageai==0.1.1
|
||||
# via -r ./ingest/embed-voyageai.in
|
||||
langsmith==0.1.84
|
||||
langsmith==0.1.85
|
||||
# via
|
||||
# langchain
|
||||
# langchain-core
|
||||
|
@ -4,5 +4,5 @@
|
||||
#
|
||||
# pip-compile ./ingest/kafka.in
|
||||
#
|
||||
confluent-kafka==2.4.0
|
||||
confluent-kafka==2.5.0
|
||||
# via -r ./ingest/kafka.in
|
||||
|
@ -21,7 +21,7 @@ exceptiongroup==1.2.1
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# anyio
|
||||
grpcio==1.64.1
|
||||
grpcio==1.65.0
|
||||
# via
|
||||
# grpcio-tools
|
||||
# qdrant-client
|
||||
|
@ -42,7 +42,7 @@ requests==2.32.3
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# singlestoredb
|
||||
singlestoredb==1.4.2
|
||||
singlestoredb==1.4.3
|
||||
# via -r ./ingest/singlestore.in
|
||||
sqlparams==6.0.1
|
||||
# via singlestoredb
|
||||
|
@ -32,7 +32,7 @@ exceptiongroup==1.2.1
|
||||
# via
|
||||
# -c ./ingest/../base.txt
|
||||
# anyio
|
||||
grpcio==1.64.1
|
||||
grpcio==1.65.0
|
||||
# via
|
||||
# grpcio-health-checking
|
||||
# grpcio-tools
|
||||
|
@ -20,3 +20,4 @@ types-tabulate
|
||||
vcrpy
|
||||
grpcio
|
||||
autoflake
|
||||
liccheck
|
||||
|
@ -47,7 +47,7 @@ flake8-print==5.0.0
|
||||
# via -r ./test.in
|
||||
freezegun==1.5.1
|
||||
# via -r ./test.in
|
||||
grpcio==1.64.1
|
||||
grpcio==1.65.0
|
||||
# via -r ./test.in
|
||||
idna==3.7
|
||||
# via
|
||||
@ -64,6 +64,8 @@ label-studio-sdk==0.0.34
|
||||
# via -r ./test.in
|
||||
label-studio-tools==0.0.4
|
||||
# via label-studio-sdk
|
||||
liccheck==0.9.2
|
||||
# via -r ./test.in
|
||||
lxml==5.2.2
|
||||
# via
|
||||
# -c ./base.txt
|
||||
@ -136,10 +138,14 @@ rpds-py==0.19.0
|
||||
# referencing
|
||||
ruff==0.4.10
|
||||
# via -r ./test.in
|
||||
semantic-version==2.10.0
|
||||
# via liccheck
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c ./base.txt
|
||||
# python-dateutil
|
||||
toml==0.10.2
|
||||
# via liccheck
|
||||
tomli==2.0.1
|
||||
# via
|
||||
# autoflake
|
||||
|
21
scripts/check-licenses.sh
Executable file
21
scripts/check-licenses.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env bash
#
# Checks the licenses of the project's dependencies with liccheck.
#
# Usage: scripts/check-licenses.sh   (run from the repository root, after the
#        dependencies have been installed)
#
# Every `*.txt` file under `requirements/` is checked, except:
#   - constraints.txt      constraints on other files, never installed itself
#   - dev.txt              local development only, not shipped
#   - extra-pdf-image.txt  pulls in packages with an `Other/Proprietary`
#                          license; `extra-pdf-image.in` is checked instead
#
# Exit status: 0 when every file passes; 1 on the first file that fails, or
# when no requirements file is found at all.

set -u

# The two name tests must be OR-ed inside a group. `find` ANDs adjacent tests,
# and no file name matches both "*.txt" and "extra-pdf-image.in", so an
# un-grouped expression selects nothing and the check would pass vacuously.
REQUIREMENTS_FILES=()
while IFS= read -r -d '' REQUIREMENTS_FILE; do
  REQUIREMENTS_FILES+=("$REQUIREMENTS_FILE")
done < <(find requirements -type f \
  \( -name "*.txt" -o -name "extra-pdf-image.in" \) \
  ! -name "extra-pdf-image.txt" \
  ! -name "constraints.txt" \
  ! -name "dev.txt" \
  -print0)

# An empty list means the script ran from the wrong directory or the filter is
# broken; treat it as a failure rather than as "everything is licensed".
if [[ "${#REQUIREMENTS_FILES[@]}" -eq 0 ]]; then
  echo "No requirements files found to check." >&2
  exit 1
fi

for REQUIREMENTS_FILE in "${REQUIREMENTS_FILES[@]}"; do
  echo "Checking $REQUIREMENTS_FILE"
  if liccheck -r "$REQUIREMENTS_FILE"; then
    echo "All dependencies have authorized licenses."
  else
    echo "There are dependencies with unauthorized or unknown licenses."
    exit 1
  fi
done

exit 0
|
Loading…
x
Reference in New Issue
Block a user