# Mirror of https://github.com/docling-project/docling.git
# Synced 2025-06-27 05:20:05 +00:00
# 272 lines, 7.4 KiB, TOML
# Core metadata for the `docling` distribution.
[project]
name = "docling"
version = "2.38.1" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
    "docling",
    "convert",
    "document",
    "pdf",
    "docx",
    "html",
    "markdown",
    "layout model",
    "segmentation",
    "table structure",
    "table former",
]
classifiers = [
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Operating System :: Microsoft :: Windows",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
readme = "README.md"
authors = [
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
    { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
    { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
requires-python = '>=3.9,<4.0'
# Runtime requirements as PEP 508 strings. Most carry an explicit upper
# bound; `certifi` only has a lower bound.
dependencies = [
    'pydantic (>=2.0.0,<3.0.0)',
    'docling-core[chunking] (>=2.29.0,<3.0.0)',
    'docling-ibm-models (>=3.4.4,<4.0.0)',
    'docling-parse (>=4.0.0,<5.0.0)',
    'filetype (>=1.2.0,<2.0.0)',
    'pypdfium2 (>=4.30.0,<5.0.0)',
    'pydantic-settings (>=2.3.0,<3.0.0)',
    'huggingface_hub (>=0.23,<1)',
    'requests (>=2.32.2,<3.0.0)',
    'easyocr (>=1.7,<2.0)',
    'certifi (>=2024.7.4)',
    'rtree (>=1.3.0,<2.0.0)',
    'typer (>=0.12.5,<0.17.0)',
    'python-docx (>=1.1.2,<2.0.0)',
    'python-pptx (>=1.0.2,<2.0.0)',
    'beautifulsoup4 (>=4.12.3,<5.0.0)',
    'pandas (>=2.1.4,<3.0.0)',
    'marko (>=2.1.2,<3.0.0)',
    'openpyxl (>=3.1.5,<4.0.0)',
    'lxml (>=4.0.0,<6.0.0)',
    'pillow (>=10.0.0,<12.0.0)',
    'tqdm (>=4.65.0,<5.0.0)',
    'pluggy (>=1.0.0,<2.0.0)',
    'pylatexenc (>=2.10,<3.0)',
    'scipy (>=1.6.0,<2.0.0)',
    # Per-Python-version alternative for scipy, currently disabled:
    # 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
    # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
]

# Project links published with the package metadata.
[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"

# Entry point registered in the `docling` group; it points at the
# `docling.models.plugins.defaults` module.
[project.entry-points.docling]
docling_defaults = "docling.models.plugins.defaults"

# Console-script names mapped to their `module:object` entry points.
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"

# Optional extras, one key per extra.
[project.optional-dependencies]
tesserocr = ['tesserocr (>=2.7.1,<3.0.0)']
# Restricted to macOS by the environment marker.
ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
vlm = [
    'transformers (>=4.46.0,<5.0.0)',
    'accelerate (>=1.2.1,<2.0.0)',
    # Restricted by marker to Python >= 3.10 on macOS arm64.
    'mlx-vlm >=0.1.22 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
]
rapidocr = [
    'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
    'onnxruntime (>=1.7.0,<2.0.0)',
    # Per-Python-version bounds are disabled here; the same two specifiers
    # appear in the `constraints` dependency group.
    # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
    # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]
asr = [
    "openai-whisper>=20240930",
]

# Dependency groups: dev, docs, examples and constraints.
[dependency-groups]
# Linting, type stubs, testing, notebooks and release tooling.
dev = [
    "pre-commit~=3.7",
    "mypy~=1.10",
    "types-setuptools~=70.3",
    "pandas-stubs~=2.1",
    "types-openpyxl~=3.1",
    "types-requests~=2.31",
    "boto3-stubs~=1.37",
    "types-urllib3~=1.26",
    "types-tqdm~=4.67",
    "coverage~=7.6",
    "pytest~=8.3",
    "pytest-cov>=6.1.1",
    "pytest-dependency~=0.6",
    "pytest-xdist~=3.3",
    "ipykernel~=6.29",
    "ipywidgets~=8.1",
    "nbqa~=1.9",
    "python-semantic-release~=7.32",
]
# Documentation site tooling (MkDocs and plugins).
docs = [
    "mkdocs-material~=9.5",
    "mkdocs-jupyter~=0.25",
    "mkdocs-click~=0.8",
    "mkdocstrings[python]~=0.27",
    "griffe-pydantic~=1.1",
]
examples = [
    "datasets~=2.21",
    "python-dotenv~=1.0",
    "langchain-huggingface>=0.0.3",
    "langchain-milvus~=0.1",
    "langchain-text-splitters~=0.2",
]
# onnxruntime bounds split by Python version: capped below 1.20.0 on
# Python < 3.10, below 2.0.0 otherwise.
constraints = [
    'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
    'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]

# uv settings: the project is marked as a package, and `default-groups`
# is set to "all".
[tool.uv]
package = true
default-groups = "all"

# Source override: openai-whisper comes from the Git repository below,
# pinned to a specific commit.
[tool.uv.sources]
openai-whisper = { git = "https://github.com/openai/whisper.git", rev = "dd985ac4b90cafeef8712f2998d62c59c3e62d22" }

# Package discovery: include only packages matching `docling*`.
[tool.setuptools.packages.find]
include = ["docling*"]

# Ruff settings shared by the linter and the formatter.
[tool.ruff]
target-version = "py39"
line-length = 88
respect-gitignore = true

# Disabled: extra exclusion of the test suite.
# extend-exclude = [
#     "tests",
# ]

# Ruff formatter options.
[tool.ruff.format]
skip-magic-trailing-comma = false

# Ruff lint rule selection. Entries that are commented out are rule
# families that are currently not enabled.
[tool.ruff.lint]
select = [
    # "B", # flake8-bugbear
    "C", # flake8-comprehensions
    "C9", # mccabe
    # "D", # flake8-docstrings
    "E", # pycodestyle errors (default)
    "F", # pyflakes (default)
    "I", # isort
    "PD", # pandas-vet
    "PIE", # pie
    # "PTH", # pathlib
    "Q", # flake8-quotes
    # "RET", # return
    "RUF", # Enable all ruff-specific checks
    # "SIM", # simplify
    "S307", # eval
    # "T20", # (disallow print statements) keep debugging statements out of the codebase
    "W", # pycodestyle warnings
    "ASYNC", # async
    "UP", # pyupgrade
]

ignore = [
    "C408", # Unnecessary `dict()` call (rewrite as a literal)
    "E501", # Line too long, handled by ruff formatter
    "D107", # Missing docstring in `__init__`
    "F401", # imported but unused; consider using `importlib.util.find_spec` to test for availability
    "F811", # redefinition of the same function
    "PL", # Pylint
    "RUF012", # Mutable Class Attributes
    "UP006", # List vs list, etc
    "UP007", # Option and Union
    "UP035", # `typing.Set` is deprecated, use `set` instead
]

# extend-select = []

# NOTE(review): `select` under [tool.ruff.lint] lists no `N` (pep8-naming)
# codes, so this setting presumably has no effect at present; confirm.
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
    # Allow Pydantic's `@validator` decorator to trigger class method treatment.
    "pydantic.validator",
]

# Rule codes ignored for files matching each pattern.
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests

# Complexity threshold for the mccabe rules enabled via "C9" in `select`.
[tool.ruff.lint.mccabe]
max-complexity = 20

# Disabled: a custom "docling" import section and the matching section order.
# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]

[tool.ruff.lint.isort]
combine-as-imports = true
# section-order = [
#     "future",
#     "standard-library",
#     "third-party",
#     "docling",
#     "first-party",
#     "local-folder",
# ]

# mypy settings; the Pydantic plugin is enabled.
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
# NOTE(review): mypy is pinned to Python 3.10 while `requires-python` is
# '>=3.9,<4.0' and Ruff targets py39; confirm the difference is intentional.
python_version = "3.10"

# Modules for which `ignore_missing_imports` is turned on.
[[tool.mypy.overrides]]
module = [
    "docling_parse.*",
    "pypdfium2.*",
    "networkx.*",
    "scipy.*",
    "filetype.*",
    "tesserocr.*",
    "docling_ibm_models.*",
    "easyocr.*",
    "ocrmac.*",
    "mlx_vlm.*",
    "lxml.*",
    "huggingface_hub.*",
    "transformers.*",
    "pylatexenc.*",
]
ignore_missing_imports = true

[tool.semantic_release]
# For default values, see:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg

version_source = "tag_only"
branch = "main"

# Configure the commit types that trigger minor and patch version bumps,
# respectively (they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"