# Build backend declaration: the package is built with setuptools.
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
# See https://setuptools.pypa.io/en/latest/userguide/quickstart.html for more project configuration options.
name = "olmocr"
description = "Fast, efficient, and high quality OCR powered by open visual language models"
# The version is not hard-coded here; it is supplied via [tool.setuptools.dynamic].
dynamic = ["version"]
readme = "README.md"
classifiers = [
    "Intended Audience :: Science/Research",
    "Development Status :: 3 - Alpha",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
authors = [
    {name = "Allen Institute for Artificial Intelligence", email = "jakep@allenai.org"}
]
requires-python = ">=3.11"
# Core runtime requirements, installed for every user of the package.
dependencies = [
    "cached-path",
    "smart_open",
    "pypdf>=5.2.0",
    "pypdfium2",
    "cryptography",
    "lingua-language-detector",
    "Pillow",
    "ftfy",
    "bleach",
    "markdown2",
    "filelock",
    "orjson",
    "requests",
    "zstandard",
    "boto3",
    "httpx",
    "torch>=2.7.0",
    "transformers>=4.51.1",
    "img2pdf",
    "beaker-py",
]
license = {file = "LICENSE"}

[project.urls]
Homepage = "https://github.com/allenai/olmocr"
Repository = "https://github.com/allenai/olmocr"
Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md"
# Documentation = "https://olmocr.readthedocs.io/"

# Optional extras; each key below is installable as `olmocr[<key>]`.
[project.optional-dependencies]
gpu = [
    "vllm==0.9.1",
]

dev = [
    "ruff",
    "mypy",
    "black",
    "isort",
    "pytest",
    "pytest-sphinx",
    "pytest-cov",
    "twine>=1.11.0",
    "build",
    "setuptools",
    "wheel",
    "Sphinx>=4.3.0,<7.1.0",
    "furo==2023.7.26",
    "myst-parser>=1.0,<2.1",
    "sphinx-copybutton==0.5.2",
    "sphinx-autobuild==2021.3.14",
    "sphinx-autodoc-typehints==1.23.3",
    "packaging",
    "necessary",
    "peft",
    "datasets",
    "omegaconf",
    "spacy",
]

bench = [
    "tinyhost",
    "fuzzysearch",
    "rapidfuzz",
    "sequence_align",
    "syntok",
    "openai",
    "google-genai",
    "playwright",
    "mistralai",
    "lxml",
    "flask",
]

train = [
    "torch",
    "torchvision",
    "accelerate",
    "datasets",
    "peft",
    "wandb",
    "omegaconf",
    "s3fs",
    "necessary",
    "einops",
    # NOTE(review): the core `dependencies` list requires transformers>=4.51.1,
    # so this lower bound is weaker than the one already in force.
    "transformers>=4.45.1",
]

elo = [
    "numpy",
    "scipy",
    "pandas",
    "matplotlib",
]

# Package discovery: patterns listed here are left out of the built distribution.
[tool.setuptools.packages.find]
exclude = [
    "*.tests",
    "*.tests.*",
    "tests.*",
    "tests",
    "docs*",
    "scripts*",
]

[tool.setuptools]
include-package-data = true

# Non-Python files shipped inside the `olmocr` package.
[tool.setuptools.package-data]
olmocr = [
    "py.typed",
    "viewer/*.html",
    "eval/*.html",
]

# Source of the `version` field that [project] declares as dynamic.
[tool.setuptools.dynamic]
version = {attr = "olmocr.version.VERSION"}

[tool.black]
line-length = 160
include = '\.pyi?$'
# NOTE(review): `\bdoc\b` below does not match a directory named `docs`
# (the packaging config excludes "docs*") — confirm which name is intended.
exclude = '''
(
__pycache__
| \.git
| \.mypy_cache
| \.pytest_cache
| \.vscode
| \.venv
| \bdist\b
| \bdoc\b
)
'''

[tool.isort]
profile = "black"
multi_line_output = 3

# You can override these pyright settings by adding a personal pyrightconfig.json file.
[tool.pyright]
reportPrivateImportUsage = false

[tool.ruff]
line-length = 160
target-version = "py311"
exclude = ["olmocr/train/molmo", "tests/*"]

# Rule selection belongs under `lint`; the same keys placed directly in
# [tool.ruff] are deprecated by ruff.
[tool.ruff.lint]
ignore = ["E722"]  # ignore bare except

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]

[tool.mypy]
ignore_missing_imports = true
no_site_packages = true
check_untyped_defs = true
# NOTE(review): mypy treats each `exclude` entry as a regular expression, not
# a glob, so "tests/*" matches any path containing "tests" — confirm intended.
exclude = ["olmocr/train/molmo/", "tests/*"]

# Per-module override: relax optional checking for the test modules.
[[tool.mypy.overrides]]
module = "tests.*"
strict_optional = false

[tool.pytest.ini_options]
testpaths = "tests/"
python_classes = [
    "Test*",
    "*Test",
]
log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
log_level = "DEBUG"
# Custom markers are registered here.
markers = [
    "nonci: mark test as not intended for CI runs",
]