diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..bf1e2fe3d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,37 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: "v4.3.0" + hooks: + - id: check-added-large-files + - id: check-toml + - id: check-yaml + - id: check-json + - id: check-xml + - id: end-of-file-fixer + files: \.py$  # pre-commit's path filter key is `files`; `include` is not a recognized hook option + - id: trailing-whitespace + - id: mixed-line-ending + + - repo: https://github.com/psf/black + rev: "22.10.0" + hooks: + - id: black + args: ["--line-length=100"] + language_version: python3 + + - repo: https://github.com/charliermarsh/ruff-pre-commit + rev: "v0.0.230" + hooks: + - id: ruff + args: + [ + "--fix", + "--select=I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402", + "--ignore=PT011,PT012,SIM117", + ] + + - repo: https://github.com/pycqa/flake8 + rev: "4.0.1" + hooks: + - id: flake8 + language_version: python3 diff --git a/README.md b/README.md index af0a64cfe..f46aa3578 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,15 @@ locally if you are planning to contribute to the project. * For processing image files, `tesseract` is required. See [here](https://tesseract-ocr.github.io/tessdoc/Installation.html) for installation instructions. * For processing PDF files, `tesseract` and `poppler` are required. The [pdf2image docs](https://pdf2image.readthedocs.io/en/latest/installation.html) have instructions on installing `poppler` across various platforms. +Additionally, if you're planning to contribute to `unstructured`, we provide you an optional `pre-commit` configuration +file to ensure your code matches the formatting and linting standards used in `unstructured`. +If you'd prefer not having code changes auto-tidied before every commit, you can use `make check` to see +whether any linting or formatting changes should be applied, and `make tidy` to apply them.
+ +If you use the optional `pre-commit` configuration, you only need to install the hooks with `pre-commit install`, since the +`pre-commit` package is already installed as part of the `make install` step mentioned above. If you later decide to stop using `pre-commit`, +you can remove the hooks with `pre-commit uninstall`. + ## :clap: Quick Tour You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below. diff --git a/docs/requirements.txt b/docs/requirements.txt index b24835d4b..ed71f3901 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -6,7 +6,7 @@ # alabaster==0.7.13 # via sphinx -babel==2.11.0 +babel==2.12.1 # via sphinx beautifulsoup4==4.11.2 # via furo diff --git a/requirements/build.txt b/requirements/build.txt index b24835d4b..ed71f3901 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -6,7 +6,7 @@ # alabaster==0.7.13 # via sphinx -babel==2.11.0 +babel==2.12.1 # via sphinx beautifulsoup4==4.11.2 # via furo diff --git a/requirements/dev.in b/requirements/dev.in index 72d9cd1d0..f2a7ebf0f 100644 --- a/requirements/dev.in +++ b/requirements/dev.in @@ -1,7 +1,7 @@ jupyter ipython pip-tools - +pre-commit # NOTE(robinson) - Required pins for security scans jupyter-core>=4.11.2 wheel>=0.38.1 diff --git a/requirements/dev.txt b/requirements/dev.txt index 437271ed3..ecac308b9 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -33,6 +33,8 @@ build==0.10.0 # via pip-tools cffi==1.15.1 # via argon2-cffi-bindings +cfgv==3.3.1 + # via pre-commit click==8.1.3 # via pip-tools comm==0.1.2 @@ -43,12 +45,18 @@ decorator==5.1.1 # via ipython defusedxml==0.7.1 # via nbconvert +distlib==0.3.6 + # via virtualenv executing==1.2.0 # via stack-data fastjsonschema==2.16.3 # via nbformat +filelock==3.9.0 + # via virtualenv fqdn==1.5.1 # via jsonschema +identify==2.5.18 + # via pre-commit idna==3.4 # via # anyio @@ -67,7 +75,7 @@ ipykernel==6.21.2 # nbclassic # notebook # qtconsole
-ipython==8.10.0 +ipython==8.11.0 # via # -r requirements/dev.in # ipykernel @@ -166,6 +174,8 @@ nest-asyncio==1.5.6 # ipykernel # nbclassic # notebook +nodeenv==1.7.0 + # via pre-commit notebook==6.5.2 # via jupyter notebook-shim==0.2.2 @@ -185,18 +195,22 @@ pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pip-tools==6.12.2 +pip-tools==6.12.3 # via -r requirements/dev.in pkgutil-resolve-name==1.3.10 # via jsonschema platformdirs==3.0.0 - # via jupyter-core + # via + # jupyter-core + # virtualenv +pre-commit==3.1.1 + # via -r requirements/dev.in prometheus-client==0.16.0 # via # jupyter-server # nbclassic # notebook -prompt-toolkit==3.0.37 +prompt-toolkit==3.0.38 # via # ipython # jupyter-console @@ -227,7 +241,9 @@ python-dateutil==2.8.2 python-json-logger==2.0.7 # via jupyter-events pyyaml==6.0 - # via jupyter-events + # via + # jupyter-events + # pre-commit pyzmq==25.0.0 # via # ipykernel @@ -306,6 +322,8 @@ traitlets==5.9.0 # qtconsole uri-template==1.2.0 # via jsonschema +virtualenv==20.20.0 + # via pre-commit wcwidth==0.2.6 # via prompt-toolkit webcolors==1.12 diff --git a/requirements/ingest-s3.txt b/requirements/ingest-s3.txt index 82e0057c5..428ff5bb9 100644 --- a/requirements/ingest-s3.txt +++ b/requirements/ingest-s3.txt @@ -16,9 +16,9 @@ backoff==2.2.1 # via # -r requirements/base.txt # argilla -boto3==1.26.80 +boto3==1.26.82 # via unstructured (setup.py) -botocore==1.29.80 +botocore==1.29.82 # via # boto3 # s3transfer diff --git a/requirements/ingest-wikipedia.txt b/requirements/ingest-wikipedia.txt index 40c3e6873..1865f3734 100644 --- a/requirements/ingest-wikipedia.txt +++ b/requirements/ingest-wikipedia.txt @@ -8,7 +8,7 @@ anyio==3.6.2 # via # -r requirements/base.txt # httpcore -argilla==1.3.0 +argilla==1.3.1 # via # -r requirements/base.txt # unstructured (setup.py) @@ -33,10 +33,6 @@ click==8.1.3 # via # -r requirements/base.txt # nltk -colorama==0.4.6 - # via - # click - # tqdm deprecated==1.2.13 # via # -r requirements/base.txt 
@@ -63,6 +59,10 @@ idna==3.4 # anyio # requests # rfc3986 +importlib-metadata==6.0.0 + # via + # -r requirements/base.txt + # markdown joblib==1.2.0 # via # -r requirements/base.txt @@ -73,6 +73,10 @@ lxml==4.9.2 # python-docx # python-pptx # unstructured (setup.py) +markdown==3.4.1 + # via + # -r requirements/base.txt + # unstructured (setup.py) monotonic==1.6 # via # -r requirements/base.txt @@ -104,7 +108,7 @@ pillow==9.4.0 # -r requirements/base.txt # python-pptx # unstructured (setup.py) -pydantic==1.10.4 +pydantic==1.10.5 # via # -r requirements/base.txt # argilla @@ -158,7 +162,7 @@ tqdm==4.64.1 # -r requirements/base.txt # argilla # nltk -typing-extensions==4.4.0 +typing-extensions==4.5.0 # via # -r requirements/base.txt # pydantic @@ -177,3 +181,7 @@ xlsxwriter==3.0.8 # via # -r requirements/base.txt # python-pptx +zipp==3.15.0 + # via + # -r requirements/base.txt + # importlib-metadata