[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
# See https://setuptools.pypa.io/en/latest/userguide/quickstart.html for more project configuration options.
name = "olmocr"
description = "Fast, efficient, and high quality OCR powered by open visual language models"
# The version is not hard-coded here; it is resolved via [tool.setuptools.dynamic] below.
dynamic = ["version"]
readme = "README.md"
classifiers = [
    "Intended Audience :: Science/Research",
    "Development Status :: 3 - Alpha",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
authors = [
    { name = "Allen Institute for Artificial Intelligence", email = "jakep@allenai.org" },
]
requires-python = ">=3.11"
# Core runtime dependencies, installed with a plain `pip install olmocr`.
dependencies = [
    "cached-path",
    "smart_open",
    "pypdf>=5.2.0",
    "pypdfium2",
    "cryptography",
    "lingua-language-detector",
    "Pillow",
    "ftfy",
    "bleach",
    "markdown2",
    "filelock",
    "orjson",
    "requests",
    "zstandard",
    "boto3",
    "httpx",
    "torch>=2.7.0",
    "transformers>=4.51.1",
    "img2pdf",
    "beaker-py",
]
license = { file = "LICENSE" }

[project.urls]
Homepage = "https://github.com/allenai/olmocr"
Repository = "https://github.com/allenai/olmocr"
Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md"
# Documentation = "https://olmocr.readthedocs.io/"

# Optional extras, installed with e.g. `pip install olmocr[gpu]`.
[project.optional-dependencies]
gpu = [
    "vllm==0.9.1",
]
dev = [
    "ruff",
    "mypy",
    "black",
    "isort",
    "pytest",
    "pytest-sphinx",
    "pytest-cov",
    "twine>=1.11.0",
    "build",
    "setuptools",
    "wheel",
    "Sphinx>=4.3.0,<7.1.0",
    "furo==2023.7.26",
    "myst-parser>=1.0,<2.1",
    "sphinx-copybutton==0.5.2",
    "sphinx-autobuild==2021.3.14",
    "sphinx-autodoc-typehints==1.23.3",
    "packaging",
    "necessary",
    "peft",
    "datasets",
    "omegaconf",
    "spacy",
]
bench = [
    "tinyhost",
    "fuzzysearch",
    "rapidfuzz",
    "sequence_align",
    "syntok",
    "openai",
    "google-genai",
    "playwright",
    "mistralai",
    "lxml",
    "flask",
]
train = [
    "torch",
    "torchvision",
    "accelerate",
    "datasets",
    "peft",
    "wandb",
    "omegaconf",
    "s3fs",
    "necessary",
    "einops",
    "transformers>=4.45.1",
]
elo = [
    "numpy",
    "scipy",
    "pandas",
    "matplotlib",
]

[tool.setuptools]
include-package-data = true

[tool.setuptools.packages.find]
exclude = [
    "*.tests",
    "*.tests.*",
    "tests.*",
    "tests",
    "docs*",
    "scripts*",
]

[tool.setuptools.package-data]
olmocr = [
    "py.typed",
    "viewer/*.html",
    "eval/*.html",
]

[tool.setuptools.dynamic]
version = { attr = "olmocr.version.VERSION" }

[tool.black]
line-length = 160
include = '\.pyi?$'
# Multi-line pattern: Black compiles it as a verbose regex, so the
# whitespace and line breaks below are layout only.
exclude = '''
(
      __pycache__
    | \.git
    | \.mypy_cache
    | \.pytest_cache
    | \.vscode
    | \.venv
    | \bdist\b
    | \bdoc\b
)
'''

[tool.isort]
profile = "black"
multi_line_output = 3

# You can override these pyright settings by adding a personal pyrightconfig.json file.
[tool.pyright]
reportPrivateImportUsage = false

[tool.ruff]
line-length = 160
target-version = "py311"
exclude = ["olmocr/train/molmo", "tests/*"]

# Linter rule settings live in the `lint` namespace; the top-level
# `ignore` / `per-file-ignores` spellings are deprecated by Ruff.
[tool.ruff.lint]
ignore = ["E722"]  # ignore bare except

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]

[tool.mypy]
ignore_missing_imports = true
no_site_packages = true
check_untyped_defs = true
exclude = ["olmocr/train/molmo/", "tests/*"]

[[tool.mypy.overrides]]
module = "tests.*"
strict_optional = false

[tool.pytest.ini_options]
testpaths = "tests/"
python_classes = [
    "Test*",
    "*Test",
]
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
log_level = "DEBUG"
markers = [
    "nonci: mark test as not intended for CI runs",
]