2025-06-03 15:18:54 +02:00
# Core package metadata: identity, classifiers, authors and runtime dependencies.
[project]
name = "docling"
version = "2.38.1"  # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
    "docling",
    "convert",
    "document",
    "pdf",
    "docx",
    "html",
    "markdown",
    "layout model",
    "segmentation",
    "table structure",
    "table former",
]
classifiers = [
    "Operating System :: MacOS :: MacOS X",
    "Operating System :: POSIX :: Linux",
    "Operating System :: Microsoft :: Windows",
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
readme = "README.md"
authors = [
    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
    { name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
    { name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
    { name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
    { name = "Peter Staar", email = "taa@zurich.ibm.com" },
]
requires-python = '>=3.9,<4.0'
# Runtime requirements, one requirement string per entry.
dependencies = [
    'pydantic (>=2.0.0,<3.0.0)',
    'docling-core[chunking] (>=2.29.0,<3.0.0)',
    'docling-ibm-models (>=3.4.4,<4.0.0)',
    'docling-parse (>=4.0.0,<5.0.0)',
    'filetype (>=1.2.0,<2.0.0)',
    'pypdfium2 (>=4.30.0,<5.0.0)',
    'pydantic-settings (>=2.3.0,<3.0.0)',
    'huggingface_hub (>=0.23,<1)',
    'requests (>=2.32.2,<3.0.0)',
    'easyocr (>=1.7,<2.0)',
    'certifi (>=2024.7.4)',
    'rtree (>=1.3.0,<2.0.0)',
    'typer (>=0.12.5,<0.17.0)',
    'python-docx (>=1.1.2,<2.0.0)',
    'python-pptx (>=1.0.2,<2.0.0)',
    'beautifulsoup4 (>=4.12.3,<5.0.0)',
    'pandas (>=2.1.4,<3.0.0)',
    'marko (>=2.1.2,<3.0.0)',
    'openpyxl (>=3.1.5,<4.0.0)',
    'lxml (>=4.0.0,<6.0.0)',
    'pillow (>=10.0.0,<12.0.0)',
    'tqdm (>=4.65.0,<5.0.0)',
    'pluggy (>=1.0.0,<2.0.0)',
    'pylatexenc (>=2.10,<3.0)',
    'scipy (>=1.6.0,<2.0.0)',
    # Disabled alternative: scipy pin split by Python version.
    # 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
    # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
]
feat: [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model (#1054)
* Skeleton for SmolDocling model and VLM Pipeline
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* wip smolDocling inference and vlm pipeline
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* WIP, first working code for inference of SmolDocling, and vlm pipeline assembly code, example included.
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Fixes to preserve page image and demo export to html
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Enabled figure support in vlm_pipeline
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Fix for table span compute in vlm_pipeline
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Properly propagating image data per page, together with predicted tags in VLM pipeline. This enables correct figure extraction and page numbers in provenances
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Cleaned up logs, added pages to vlm_pipeline, basic timing per page measurement in smol_docling models
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Replaced hardcoded otsl tokens with the ones from docling-core tokens.py enum
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Added tokens/sec measurement, improved example
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Added capability for vlm_pipeline to grab text from preconfigured backend
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Exposed "force_backend_text" as pipeline parameter
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Flipped keep_backend to True for vlm_pipeline assembly to work
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Updated vlm pipeline assembly and smol docling model code to support updated doctags
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Fixing doctags starting tag, that broke elements on first line during assembly
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Introduced SmolDoclingOptions to configure model parameters (such as query and artifacts path) via client code, see example in minimal_smol_docling. Provisioning for other potential vlm all-in-one models.
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Moved artifacts_path for SmolDocling into vlm_options instead of global pipeline option
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* New assembly code for latest model revision, updated prompt and parsing of doctags, updated logging
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Updated example of Smol Docling usage
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Added captions for the images for SmolDocling assembly code, improved provenance definition for all elements
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Update minimal smoldocling example
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix repo id
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Cleaned up unnecessary logging
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* More elegant solution in removing the input prompt
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* removed minimal_smol_docling example from CI checks
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Removed special html code wrapping when exporting to docling document, cleaned up comments
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Addressing PR comments, added enabled property to SmolDocling, and related VLM pipeline option, few other minor things
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Moved keep_backend = True to vlm pipeline
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* removed pipeline_options.generate_table_images from vlm_pipeline (deprecated in the pipelines)
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Added example on how to get original predicted doctags in minimal_smol_docling
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* removing changes from base_pipeline
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Replaced remaining strings to appropriate enums
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Updated poetry.lock
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* re-built poetry.lock
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
* Generalize and refactor VLM pipeline and models
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Rename example
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Move imports
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Expose control over using flash_attention_2
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Fix VLM example exclusion in CI
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Add back device_map and accelerate
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Make drawing code resilient against bad bboxes
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* chore: clean up code and comments
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* chore: more cleanup
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* chore: fix leftover .to(device)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* fix: add proper table provenance
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
2025-02-26 14:43:26 +01:00
2025-06-03 15:18:54 +02:00
# Project links; all point at the docling-project/docling GitHub repository.
[project.urls]
homepage = "https://github.com/docling-project/docling"
repository = "https://github.com/docling-project/docling"
issues = "https://github.com/docling-project/docling/issues"
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"
2024-11-20 15:21:40 +01:00
2025-06-03 15:18:54 +02:00
# Entry-point group "docling": maps the name `docling_defaults` to the
# module `docling.models.plugins.defaults`.
[project.entry-points.docling]
docling_defaults = "docling.models.plugins.defaults"
2024-10-30 18:44:08 +01:00
2025-06-03 15:18:54 +02:00
# Console commands, each bound to an `app` object in the docling.cli package.
[project.scripts]
docling = "docling.cli.main:app"
docling-tools = "docling.cli.tools:app"
# Optional extras; several entries carry environment markers that restrict
# them to specific platforms or Python versions.
[project.optional-dependencies]
tesserocr = ['tesserocr (>=2.7.1,<3.0.0)']
ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
vlm = [
    'transformers (>=4.46.0,<5.0.0)',
    'accelerate (>=1.2.1,<2.0.0)',
    'mlx-vlm >=0.1.22 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
]
rapidocr = [
    'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
    'onnxruntime (>=1.7.0,<2.0.0)',
    # Disabled alternative: onnxruntime pin split by Python version.
    # 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
    # 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]
asr = [
    "openai-whisper>=20240930",
]
2024-10-30 18:44:08 +01:00
2025-06-03 15:18:54 +02:00
# Development-only dependency groups (not part of the published runtime requirements).
[dependency-groups]
# Linting, typing stubs, testing, notebooks and release tooling.
dev = [
    "pre-commit~=3.7",
    "mypy~=1.10",
    "types-setuptools~=70.3",
    "pandas-stubs~=2.1",
    "types-openpyxl~=3.1",
    "types-requests~=2.31",
    "boto3-stubs~=1.37",
    "types-urllib3~=1.26",
    "types-tqdm~=4.67",
    "coverage~=7.6",
    "pytest~=8.3",
    "pytest-cov>=6.1.1",
    "pytest-dependency~=0.6",
    "pytest-xdist~=3.3",
    "ipykernel~=6.29",
    "ipywidgets~=8.1",
    "nbqa~=1.9",
    "python-semantic-release~=7.32",
]
# Documentation site tooling (mkdocs and plugins).
docs = [
    "mkdocs-material~=9.5",
    "mkdocs-jupyter~=0.25",
    "mkdocs-click~=0.8",
    "mkdocstrings[python]~=0.27",
    "griffe-pydantic~=1.1",
]
# Extra packages used by the example code.
examples = [
    "datasets~=2.21",
    "python-dotenv~=1.0",
    "langchain-huggingface>=0.0.3",
    "langchain-milvus~=0.1",
    "langchain-text-splitters~=0.2",
]
# onnxruntime upper bound differs by Python version (<1.20.0 below 3.10).
constraints = [
    'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
    'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]
2025-06-02 17:01:06 +02:00
2024-09-13 14:03:09 +02:00
2025-06-03 15:18:54 +02:00
# uv project settings: build/install the project as a package and select
# every dependency group by default.
[tool.uv]
package = true
default-groups = "all"
2025-03-18 13:58:05 +01:00
2025-06-23 14:47:26 +02:00
# Source overrides for uv: openai-whisper comes from its git repository,
# pinned to a fixed commit.
[tool.uv.sources]
openai-whisper = { git = "https://github.com/openai/whisper.git", rev = "dd985ac4b90cafeef8712f2998d62c59c3e62d22" }
2025-06-03 15:18:54 +02:00
# setuptools package discovery: only packages matching `docling*` are included.
[tool.setuptools.packages.find]
include = ["docling*"]
2024-07-15 09:42:42 +02:00
2025-04-14 18:01:26 +02:00
# Ruff settings shared by the linter and the formatter.
[tool.ruff]
target-version = "py39"
line-length = 88
respect-gitignore = true
# Disabled: excluding the test tree from ruff.
# extend-exclude = [
#     "tests",
# ]
# Ruff formatter options.
[tool.ruff.format]
skip-magic-trailing-comma = false
# Ruff linter rule selection. Commented-out entries are rule families that
# are currently not enabled.
[tool.ruff.lint]
select = [
    # "B",    # flake8-bugbear
    "C",      # flake8-comprehensions
    "C9",     # mccabe
    # "D",    # flake8-docstrings
    "E",      # pycodestyle errors (default)
    "F",      # pyflakes (default)
    "I",      # isort
    "PD",     # pandas-vet
    "PIE",    # pie
    # "PTH",  # pathlib
    "Q",      # flake8-quotes
    # "RET",  # return
    "RUF",    # Enable all ruff-specific checks
    # "SIM",  # simplify
    "S307",   # eval
    # "T20",  # (disallow print statements) keep debugging statements out of the codebase
    "W",      # pycodestyle warnings
    "ASYNC",  # async
    "UP",     # pyupgrade
]
ignore = [
    "C408",    # Unnecessary `dict()` call (rewrite as a literal)
    "E501",    # Line too long, handled by ruff formatter
    "D107",    # Missing docstring in `__init__`
    "F401",    # Imported but unused; consider using `importlib.util.find_spec` to test for availability
    "F811",    # Redefinition of the same function
    "PL",      # Pylint
    "RUF012",  # Mutable class attributes
    "UP006",   # List vs list, etc
    "UP007",   # Optional and Union
    "UP035",   # `typing.Set` is deprecated, use `set` instead
]
# extend-select = []
# pep8-naming options.
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
    # Allow Pydantic's `@validator` decorator to trigger class method treatment.
    "pydantic.validator",
]
# Rules switched off for files matching the given patterns.
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"]  # Disable ASYNC check for tests
2025-04-14 18:01:26 +02:00
# Complexity threshold for the mccabe (C9) rule family.
[tool.ruff.lint.mccabe]
max-complexity = 20
# Disabled: a custom "docling" import section for isort.
# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]

# Import-sorting options.
[tool.ruff.lint.isort]
combine-as-imports = true
# Disabled: explicit section ordering that would use the custom "docling" section.
# section-order = [
#     "future",
#     "standard-library",
#     "third-party",
#     "docling",
#     "first-party",
#     "local-folder",
# ]
2024-07-15 09:42:42 +02:00
# mypy type-checker settings; the pydantic plugin is enabled.
[tool.mypy]
pretty = true
# strict = true
no_implicit_optional = true
plugins = "pydantic.mypy"
# NOTE(review): type checks target 3.10 while the project declares
# requires-python >=3.9 and ruff targets py39 — confirm this is intentional.
python_version = "3.10"
2024-07-15 09:42:42 +02:00
2024-09-12 15:56:29 +02:00
# Modules for which mypy is told to ignore missing imports.
[[tool.mypy.overrides]]
module = [
    "docling_parse.*",
    "pypdfium2.*",
    "networkx.*",
    "scipy.*",
    "filetype.*",
    "tesserocr.*",
    "docling_ibm_models.*",
    "easyocr.*",
    "ocrmac.*",
    "mlx_vlm.*",
    "lxml.*",
    "huggingface_hub.*",
    "transformers.*",
    "pylatexenc.*",
]
ignore_missing_imports = true
2024-07-16 13:05:04 +02:00
# python-semantic-release configuration.
[tool.semantic_release]
# Default values are listed in:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
version_source = "tag_only"
branch = "main"
# Commit types that trigger minor and patch version bumps, respectively.
# Both lists must be subsets of the allowed types.
parser_angular_allowed_types = "build,chore,ci,docs,feat,fix,perf,style,refactor,test"
parser_angular_minor_types = "feat"
parser_angular_patch_types = "fix,perf"