# Mirror of https://github.com/allenai/olmocr.git
# Synced 2025-06-27 04:00:02 +00:00 (195 lines, 3.6 KiB, TOML)

# Build configuration: the package is built with the setuptools backend.
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
# See https://setuptools.pypa.io/en/latest/userguide/quickstart.html for more project configuration options.
name = "olmocr"
description = "Fast, efficient, and high quality OCR powered by open visual language models"
# The version is not hard-coded here; it is declared dynamic and resolved
# by the build backend (see [tool.setuptools.dynamic]).
dynamic = ["version"]
readme = "README.md"
classifiers = [
    "Intended Audience :: Science/Research",
    "Development Status :: 3 - Alpha",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
authors = [
    { name = "Allen Institute for Artificial Intelligence", email = "jakep@allenai.org" },
]
requires-python = ">=3.11"
# Core runtime dependencies (PEP 508 requirement strings).
dependencies = [
    "cached-path",
    "smart_open",
    "pypdf>=5.2.0",
    "pypdfium2",
    "cryptography",
    "lingua-language-detector",
    "Pillow",
    "ftfy",
    "bleach",
    "markdown2",
    "filelock",
    "orjson",
    "requests",
    "zstandard",
    "boto3",
    "httpx",
    "torch>=2.7.0",
    "transformers>=4.51.1",
    "img2pdf",
    "beaker-py",
]
license = { file = "LICENSE" }

# Links shown in the package metadata.
[project.urls]
Homepage = "https://github.com/allenai/olmocr"
Repository = "https://github.com/allenai/olmocr"
Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md"
# Documentation = "https://olmocr.readthedocs.io/"

# Optional extras; each key below is an extra name that can be requested
# at install time, e.g. `olmocr[gpu]`.
[project.optional-dependencies]
# GPU extra: vllm is pinned to an exact version.
gpu = [
    "vllm==0.9.1",
]

# Development extra: linters/formatters, test runners, packaging tools,
# and the documentation toolchain.
dev = [
    "ruff",
    "mypy",
    "black",
    "isort",
    "pytest",
    "pytest-sphinx",
    "pytest-cov",
    "twine>=1.11.0",
    "build",
    "setuptools",
    "wheel",
    "Sphinx>=4.3.0,<7.1.0",
    "furo==2023.7.26",
    "myst-parser>=1.0,<2.1",
    "sphinx-copybutton==0.5.2",
    "sphinx-autobuild==2021.3.14",
    "sphinx-autodoc-typehints==1.23.3",
    "packaging",
    "necessary",
    "peft",
    "datasets",
    "omegaconf",
    "spacy",
]

# Benchmark extra.
bench = [
    "tinyhost",
    "fuzzysearch",
    "rapidfuzz",
    "sequence_align",
    "syntok",
    "openai",
    "google-genai",
    "playwright",
    "mistralai",
    "lxml",
    "flask",
]

# Training extra.
# NOTE(review): `torch` and `transformers` are also core dependencies, where
# they carry stricter lower bounds (`torch>=2.7.0`, `transformers>=4.51.1`).
# The looser `transformers>=4.45.1` here is therefore redundant - confirm
# whether it should be aligned or dropped.
train = [
    "torch",
    "torchvision",
    "accelerate",
    "datasets",
    "peft",
    "wandb",
    "omegaconf",
    "s3fs",
    "necessary",
    "einops",
    "transformers>=4.45.1",
]

# Elo extra: numerical and plotting libraries.
elo = [
    "numpy",
    "scipy",
    "pandas",
    "matplotlib",
]

# Automatic package discovery: packages matching these patterns are left out.
[tool.setuptools.packages.find]
exclude = [
    "*.tests",
    "*.tests.*",
    "tests.*",
    "tests",
    "docs*",
    "scripts*",
]

[tool.setuptools]
include-package-data = true

# Non-Python files shipped inside the `olmocr` package.
[tool.setuptools.package-data]
olmocr = [
    "py.typed",
    "viewer/*.html",
    "eval/*.html",
]

# The package version is read from the `VERSION` attribute of `olmocr.version`.
[tool.setuptools.dynamic]
version = { attr = "olmocr.version.VERSION" }

[tool.black]
line-length = 160
include = '\.pyi?$'
# Regex of paths Black should skip. Each alternative starts with `|`;
# the string is a literal (''') so backslashes need no escaping.
exclude = '''
(
__pycache__
| \.git
| \.mypy_cache
| \.pytest_cache
| \.vscode
| \.venv
| \bdist\b
| \bdoc\b
)
'''

# Import sorting, configured to agree with Black's formatting.
[tool.isort]
profile = "black"
multi_line_output = 3

# You can override these pyright settings by adding a personal pyrightconfig.json file.
[tool.pyright]
reportPrivateImportUsage = false

[tool.ruff]
line-length = 160
target-version = "py311"
exclude = ["olmocr/train/molmo", "tests/*"]

# Linter rule selection lives under `lint`; the equivalent top-level
# `ignore` / `per-file-ignores` settings are deprecated by Ruff.
[tool.ruff.lint]
# E722: ignore bare `except`.
ignore = ["E722"]

[tool.ruff.lint.per-file-ignores]
# F401 (unused import) is not reported in `__init__.py` files.
"__init__.py" = ["F401"]

[tool.mypy]
ignore_missing_imports = true
no_site_packages = true
check_untyped_defs = true
exclude = ["olmocr/train/molmo/", "tests/*"]

# Per-module override: relax Optional checking for modules under `tests`.
[[tool.mypy.overrides]]
module = "tests.*"
strict_optional = false

[tool.pytest.ini_options]
testpaths = "tests/"
# Classes are collected as tests when their name starts or ends with "Test".
python_classes = [
    "Test*",
    "*Test",
]
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
log_level = "DEBUG"
# Custom markers, registered here so pytest recognises them.
markers = [
    "nonci: mark test as not intended for CI runs",
]