# Package metadata for the `docling` distribution.
[project]
name = "docling"
version = "2.38.1"  # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
    "docling",
    "convert",
    "document",
    "pdf",
    "docx",
    "html",
    "markdown",
    "layout model",
    "segmentation",
    "table structure",
    "table former",
]
classifiers = [
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Operating System :: Microsoft :: Windows",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
readme = "README.md"
authors = [
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
    { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
    { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
requires-python = '>=3.9,<4.0'
# Runtime requirements, installed with every `docling` install.
dependencies = [
    'pydantic (>=2.0.0,<3.0.0)',
    'docling-core[chunking] (>=2.29.0,<3.0.0)',
    'docling-ibm-models (>=3.4.4,<4.0.0)',
    'docling-parse (>=4.0.0,<5.0.0)',
    'filetype (>=1.2.0,<2.0.0)',
    'pypdfium2 (>=4.30.0,<5.0.0)',
    'pydantic-settings (>=2.3.0,<3.0.0)',
    'huggingface_hub (>=0.23,<1)',
    'requests (>=2.32.2,<3.0.0)',
    'easyocr (>=1.7,<2.0)',
    'certifi (>=2024.7.4)',
    'rtree (>=1.3.0,<2.0.0)',
    'typer (>=0.12.5,<0.17.0)',
    'python-docx (>=1.1.2,<2.0.0)',
    'python-pptx (>=1.0.2,<2.0.0)',
    'beautifulsoup4 (>=4.12.3,<5.0.0)',
    'pandas (>=2.1.4,<3.0.0)',
    'marko (>=2.1.2,<3.0.0)',
    'openpyxl (>=3.1.5,<4.0.0)',
    'lxml (>=4.0.0,<6.0.0)',
    'pillow (>=10.0.0,<12.0.0)',
    'tqdm (>=4.65.0,<5.0.0)',
    'pluggy (>=1.0.0,<2.0.0)',
    'pylatexenc (>=2.10,<3.0)',
    'scipy (>=1.6.0,<2.0.0)',
    # Disabled alternative: per-Python-version scipy bounds.
    # 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
    # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
]

[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"

# Entry point in the `docling` group, pointing at the defaults plugin module.
[project.entry-points.docling]
docling_defaults = "docling.models.plugins.defaults"

# Console commands installed with the package.
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"

# Optional extras, e.g. `pip install "docling[vlm]"`.
[project.optional-dependencies]
tesserocr = ['tesserocr (>=2.7.1,<3.0.0)']
# The environment marker restricts ocrmac to macOS.
ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
vlm = [
    'transformers (>=4.46.0,<5.0.0)',
    'accelerate (>=1.2.1,<2.0.0)',
    # The marker restricts mlx-vlm to arm64 macOS on Python >= 3.10.
    'mlx-vlm >=0.1.22 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
]
rapidocr = [
    # The marker excludes rapidocr-onnxruntime on Python 3.13 and newer.
    'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
    'onnxruntime (>=1.7.0,<2.0.0)',
    # Disabled alternative: per-Python-version onnxruntime bounds
    # (the same two bounds are active in the `constraints` group below).
    # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
    # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]
asr = [
    "openai-whisper>=20240930",
]

# Development-only dependency groups; not part of the published metadata.
[dependency-groups]
dev = [
    "pre-commit~=3.7",
    "mypy~=1.10",
    "types-setuptools~=70.3",
    "pandas-stubs~=2.1",
    "types-openpyxl~=3.1",
    "types-requests~=2.31",
    "boto3-stubs~=1.37",
    "types-urllib3~=1.26",
    "types-tqdm~=4.67",
    "coverage~=7.6",
    "pytest~=8.3",
    "pytest-cov>=6.1.1",
    "pytest-dependency~=0.6",
    "pytest-xdist~=3.3",
    "ipykernel~=6.29",
    "ipywidgets~=8.1",
    "nbqa~=1.9",
    "python-semantic-release~=7.32",
]
docs = [
    "mkdocs-material~=9.5",
    "mkdocs-jupyter~=0.25",
    "mkdocs-click~=0.8",
    "mkdocstrings[python]~=0.27",
    "griffe-pydantic~=1.1",
]
examples = [
    "datasets~=2.21",
    "python-dotenv~=1.0",
    "langchain-huggingface>=0.0.3",
    "langchain-milvus~=0.1",
    "langchain-text-splitters~=0.2",
]
# onnxruntime bounds split by Python version: < 1.20 on Python < 3.10.
constraints = [
    'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
    'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]

[tool.uv]
package = true
# Enable every group from [dependency-groups] by default.
default-groups = "all"

# openai-whisper is resolved from a pinned git revision.
[tool.uv.sources]
openai-whisper = { git = "https://github.com/openai/whisper.git", rev = "dd985ac4b90cafeef8712f2998d62c59c3e62d22" }

[tool.setuptools.packages.find]
include = ["docling*"]

[tool.ruff]
target-version = "py39"
line-length = 88
respect-gitignore = true
# extend-exclude = [
#     "tests",
# ]

[tool.ruff.format]
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = [
    # "B",  # flake8-bugbear
    "C",  # flake8-comprehensions
    "C9",  # mccabe
    # "D",  # flake8-docstrings
    "E",  # pycodestyle errors (default)
    "F",  # pyflakes (default)
    "I",  # isort
    "PD",  # pandas-vet
    "PIE",  # pie
    # "PTH",  # pathlib
    "Q",  # flake8-quotes
    # "RET",  # return
    "RUF",  # Enable all ruff-specific checks
    # "SIM",  # simplify
    "S307",  # eval
    # "T20",  # (disallow print statements) keep debugging statements out of the codebase
    "W",  # pycodestyle warnings
    "ASYNC",  # async
    "UP",  # pyupgrade
]
ignore = [
    "C408",  # Unnecessary `dict()` call (rewrite as a literal)
    "E501",  # Line too long, handled by ruff formatter
    "D107",  # Missing docstring in `__init__`
    "F401",  # Imported but unused; consider using `importlib.util.find_spec` to test for availability
    "F811",  # Redefinition of the same function
    "PL",  # Pylint
    "RUF012",  # Mutable class attributes
    "UP006",  # `List` vs `list`, etc.
    "UP007",  # `Optional` and `Union`
    "UP035",  # `typing.Set` is deprecated, use `set` instead
]
# extend-select = []

# NOTE(review): the pep8-naming rules (`N`) are not in `select` above, so this
# setting looks inert at the moment; confirm whether `N` should be enabled.
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
    # Allow Pydantic's `@validator` decorator to trigger class method treatment.
    "pydantic.validator",
]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"]  # Disable ASYNC check for tests

[tool.ruff.lint.mccabe]
max-complexity = 20

# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]

[tool.ruff.lint.isort]
combine-as-imports = true
# section-order = [
#     "future",
#     "standard-library",
#     "third-party",
#     "docling",
#     "first-party",
#     "local-folder",
# ]

[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
# NOTE(review): `requires-python` allows 3.9 and ruff targets py39, but mypy
# type-checks as 3.10; confirm this difference is intentional.
python_version = "3.10"

# Modules in this list are type-checked with missing imports ignored.
[[tool.mypy.overrides]]
module = [
    "docling_parse.*",
    "pypdfium2.*",
    "networkx.*",
    "scipy.*",
    "filetype.*",
    "tesserocr.*",
    "docling_ibm_models.*",
    "easyocr.*",
    "ocrmac.*",
    "mlx_vlm.*",
    "lxml.*",
    "huggingface_hub.*",
    "transformers.*",
    "pylatexenc.*",
]
ignore_missing_imports = true

[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
version_source = "tag_only"
branch = "main"
# configure types which should trigger minor and patch version bumps respectively
# (note that they must be a subset of the configured allowed types):
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"