diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index e28b1da..6ae102d 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -6,10 +6,10 @@ Thanks for considering contributing! Please read this document to learn the vari ### Did you find a bug? -First, do [a quick search](https://github.com/allenai/pdelfin/issues) to see whether your issue has already been reported. +First, do [a quick search](https://github.com/allenai/olmocr/issues) to see whether your issue has already been reported. If your issue has already been reported, please comment on the existing issue. -Otherwise, open [a new GitHub issue](https://github.com/allenai/pdelfin/issues). Be sure to include a clear title +Otherwise, open [a new GitHub issue](https://github.com/allenai/olmocr/issues). Be sure to include a clear title and description. The description should include as much relevant information as possible. The description should explain how to reproduce the erroneous behavior as well as the behavior you expect to see. Ideally you would include a code sample or an executable test case demonstrating the expected behavior. @@ -21,7 +21,7 @@ We use GitHub issues to track feature requests. Before you create a feature requ * Make sure you have a clear idea of the enhancement you would like. If you have a vague idea, consider discussing it first on a GitHub issue. * Check the documentation to make sure your feature does not already exist. -* Do [a quick search](https://github.com/allenai/pdelfin/issues) to see whether your feature has already been suggested. +* Do [a quick search](https://github.com/allenai/olmocr/issues) to see whether your feature has already been suggested. 
When creating your request, please: @@ -41,31 +41,31 @@ When you're ready to contribute code to address an open issue, please follow the Then clone your fork locally with - git clone https://github.com/USERNAME/pdelfin.git + git clone https://github.com/USERNAME/olmocr.git or - git clone git@github.com:USERNAME/pdelfin.git + git clone git@github.com:USERNAME/olmocr.git - At this point the local clone of your fork only knows that it came from *your* repo, github.com/USERNAME/pdelfin.git, but doesn't know anything the *main* repo, [https://github.com/allenai/pdelfin.git](https://github.com/allenai/pdelfin). You can see this by running + At this point the local clone of your fork only knows that it came from *your* repo, github.com/USERNAME/olmocr.git, but doesn't know anything the *main* repo, [https://github.com/allenai/olmocr.git](https://github.com/allenai/olmocr). You can see this by running git remote -v which will output something like this: - origin https://github.com/USERNAME/pdelfin.git (fetch) - origin https://github.com/USERNAME/pdelfin.git (push) + origin https://github.com/USERNAME/olmocr.git (fetch) + origin https://github.com/USERNAME/olmocr.git (push) - This means that your local clone can only track changes from your fork, but not from the main repo, and so you won't be able to keep your fork up-to-date with the main repo over time. Therefore you'll need to add another "remote" to your clone that points to [https://github.com/allenai/pdelfin.git](https://github.com/allenai/pdelfin). To do this, run the following: + This means that your local clone can only track changes from your fork, but not from the main repo, and so you won't be able to keep your fork up-to-date with the main repo over time. 
Therefore you'll need to add another "remote" to your clone that points to [https://github.com/allenai/olmocr.git](https://github.com/allenai/olmocr). To do this, run the following: - git remote add upstream https://github.com/allenai/pdelfin.git + git remote add upstream https://github.com/allenai/olmocr.git Now if you do `git remote -v` again, you'll see - origin https://github.com/USERNAME/pdelfin.git (fetch) - origin https://github.com/USERNAME/pdelfin.git (push) - upstream https://github.com/allenai/pdelfin.git (fetch) - upstream https://github.com/allenai/pdelfin.git (push) + origin https://github.com/USERNAME/olmocr.git (fetch) + origin https://github.com/USERNAME/olmocr.git (push) + upstream https://github.com/allenai/olmocr.git (fetch) + upstream https://github.com/allenai/olmocr.git (push) Finally, you'll need to create a Python 3 virtual environment suitable for working on this project. There a number of tools out there that making working with virtual environments easier. The most direct way is with the [`venv` module](https://docs.python.org/3.7/library/venv.html) in the standard library, but if you're new to Python or you don't already have a recent Python 3 version installed on your machine, @@ -77,8 +77,8 @@ When you're ready to contribute code to address an open issue, please follow the Then you can create and activate a new Python environment by running: - conda create -n pdelfin python=3.9 - conda activate pdelfin + conda create -n olmocr python=3.9 + conda activate olmocr Once your virtual environment is activated, you can install your local clone in "editable mode" with @@ -93,7 +93,7 @@ When you're ready to contribute code to address an open issue, please follow the
Expand details 👇
- Once you've added an "upstream" remote pointing to [https://github.com/allenai/python-package-temlate.git](https://github.com/allenai/pdelfin), keeping your fork up-to-date is easy: + Once you've added an "upstream" remote pointing to [https://github.com/allenai/python-package-temlate.git](https://github.com/allenai/olmocr), keeping your fork up-to-date is easy: git checkout main # if not already on main git pull --rebase upstream main @@ -119,7 +119,7 @@ When you're ready to contribute code to address an open issue, please follow the
Expand details 👇
- Our continuous integration (CI) testing runs [a number of checks](https://github.com/allenai/pdelfin/actions) for each pull request on [GitHub Actions](https://github.com/features/actions). You can run most of these tests locally, which is something you should do *before* opening a PR to help speed up the review process and make it easier for us. + Our continuous integration (CI) testing runs [a number of checks](https://github.com/allenai/olmocr/actions) for each pull request on [GitHub Actions](https://github.com/features/actions). You can run most of these tests locally, which is something you should do *before* opening a PR to help speed up the review process and make it easier for us. First, you should run [`isort`](https://github.com/PyCQA/isort) and [`black`](https://github.com/psf/black) to make sure you code is formatted consistently. Many IDEs support code formatters as plugins, so you may be able to setup isort and black to run automatically everytime you save. @@ -137,9 +137,9 @@ When you're ready to contribute code to address an open issue, please follow the mypy . - We also strive to maintain high test coverage, so most contributions should include additions to [the unit tests](https://github.com/allenai/pdelfin/tree/main/tests). These tests are run with [`pytest`](https://docs.pytest.org/en/latest/), which you can use to locally run any test modules that you've added or changed. + We also strive to maintain high test coverage, so most contributions should include additions to [the unit tests](https://github.com/allenai/olmocr/tree/main/tests). These tests are run with [`pytest`](https://docs.pytest.org/en/latest/), which you can use to locally run any test modules that you've added or changed. 
- For example, if you've fixed a bug in `pdelfin/a/b.py`, you can run the tests specific to that module with + For example, if you've fixed a bug in `olmocr/a/b.py`, you can run the tests specific to that module with pytest -v tests/a/b_test.py @@ -152,9 +152,9 @@ When you're ready to contribute code to address an open issue, please follow the If the build fails, it's most likely due to small formatting issues. If the error message isn't clear, feel free to comment on this in your pull request. - And finally, please update the [CHANGELOG](https://github.com/allenai/pdelfin/blob/main/CHANGELOG.md) with notes on your contribution in the "Unreleased" section at the top. + And finally, please update the [CHANGELOG](https://github.com/allenai/olmocr/blob/main/CHANGELOG.md) with notes on your contribution in the "Unreleased" section at the top. - After all of the above checks have passed, you can now open [a new GitHub pull request](https://github.com/allenai/pdelfin/pulls). + After all of the above checks have passed, you can now open [a new GitHub pull request](https://github.com/allenai/olmocr/pulls). Make sure you have a clear description of the problem and the solution, and include a link to relevant issues. We look forward to reviewing your PR! diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index bf5c62a..9c9983d 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -6,7 +6,7 @@ body: - type: markdown attributes: value: > - #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/allenai/pdelfin/issues?q=is%3Aissue+sort%3Acreated-desc+). + #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/allenai/olmocr/issues?q=is%3Aissue+sort%3Acreated-desc+). 
- type: textarea attributes: label: 🐛 Describe the bug @@ -17,7 +17,7 @@ body: ```python # All necessary imports at the beginning - import pdelfin + import olmocr # A succinct reproducing example trimmed down to the essential parts: assert False is True, "Oh no!" diff --git a/.github/ISSUE_TEMPLATE/documentation.yml b/.github/ISSUE_TEMPLATE/documentation.yml index 08575cd..70b8c44 100644 --- a/.github/ISSUE_TEMPLATE/documentation.yml +++ b/.github/ISSUE_TEMPLATE/documentation.yml @@ -1,5 +1,5 @@ name: 📚 Documentation -description: Report an issue related to https://pdelfin.readthedocs.io/latest +description: Report an issue related to https://olmocr.readthedocs.io/latest labels: 'documentation' body: @@ -7,7 +7,7 @@ body: attributes: label: 📚 The doc issue description: > - A clear and concise description of what content in https://pdelfin.readthedocs.io/latest is an issue. + A clear and concise description of what content in https://olmocr.readthedocs.io/latest is an issue. validations: required: true - type: textarea diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 171e504..87fc9d4 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,9 +10,9 @@ Changes proposed in this pull request: ## Before submitting -- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/allenai/pdelfin/blob/main/.github/CONTRIBUTING.md#making-a-pull-request) +- [ ] I've read and followed all steps in the [Making a pull request](https://github.com/allenai/olmocr/blob/main/.github/CONTRIBUTING.md#making-a-pull-request) section of the `CONTRIBUTING` docs. - [ ] I've updated or added any relevant docstrings following the syntax described in the - [Writing docstrings](https://github.com/allenai/pdelfin/blob/main/.github/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs. 
+ [Writing docstrings](https://github.com/allenai/olmocr/blob/main/.github/CONTRIBUTING.md#writing-docstrings) section of the `CONTRIBUTING` docs. - [ ] If this PR fixes a bug, I've added a test that will fail without my fix. - [ ] If this PR adds a new feature, I've added tests that sufficiently cover my new functionality. diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a01d6c0..ed84a4a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -97,7 +97,7 @@ jobs: if: always() run: | . .venv/bin/activate - pip uninstall -y pdelfin + pip uninstall -y olmocr release: name: Release diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml index aaca500..9e76d1b 100644 --- a/.github/workflows/pr_checks.yml +++ b/.github/workflows/pr_checks.yml @@ -9,7 +9,7 @@ on: branches: - main paths: - - 'pdelfin/**' + - 'olmocr/**' jobs: changelog: diff --git a/README.md b/README.md index 0a3314f..0246d71 100644 --- a/README.md +++ b/README.md @@ -7,12 +7,12 @@ Toolkit for training language models to work with PDF documents in the wild. 
What is included: - - A prompting strategy to get really good natural text parsing using ChatGPT 4o - [buildsilver.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/data/buildsilver.py) - - An eval toolkit for comparing different pipeline versions - [runeval.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/eval/runeval.py) - - Basic filtering by language and SEO spam removal - [filter.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/filter/filter.py) - - Finetuning code for Qwen2-VL (and soon other VLMs) - [train.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/train/train.py) - - Processing millions of PDFs through a finetuned model using Sglang - [beakerpipeline.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/beakerpipeline.py) - - Viewing Dolma Docs created from PDFs - [dolmaviewer.py](https://github.com/allenai/pdelfin/blob/main/pdelfin/viewer/dolmaviewer.py) + - A prompting strategy to get really good natural text parsing using ChatGPT 4o - [buildsilver.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/buildsilver.py) + - An eval toolkit for comparing different pipeline versions - [runeval.py](https://github.com/allenai/olmocr/blob/main/olmocr/eval/runeval.py) + - Basic filtering by language and SEO spam removal - [filter.py](https://github.com/allenai/olmocr/blob/main/olmocr/filter/filter.py) + - Finetuning code for Qwen2-VL (and soon other VLMs) - [train.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/train.py) + - Processing millions of PDFs through a finetuned model using Sglang - [beakerpipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/beakerpipeline.py) + - Viewing Dolma Docs created from PDFs - [dolmaviewer.py](https://github.com/allenai/olmocr/blob/main/olmocr/viewer/dolmaviewer.py) ### Installation @@ -22,10 +22,10 @@ You will need to install poppler-utils and then also some fonts on your computer sudo apt-get install poppler-utils ttf-mscorefonts-installer msttcorefonts 
fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools ``` -Then, clone and install the pdelfin package +Then, clone and install the olmocr package ```bash -git clone https://github.com/allenai/pdelfin.git -cd pdelfin +git clone https://github.com/allenai/olmocr.git +cd olmocr pip install -e . ``` @@ -43,7 +43,7 @@ It also runs at 2,800+ tokens per second per H100 GPU. For example: ```bash -python -m pdelfin.beakerpipeline s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename] --pdfs s3://ai2-oe-data/jakep/gnarly_pdfs/*.pdf --beaker +python -m olmocr.beakerpipeline s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename] --pdfs s3://ai2-oe-data/jakep/gnarly_pdfs/*.pdf --beaker ``` This will convert all the pdfs at `s3://ai2-oe-data/jakep/gnarly_pdfs/*.pdf` and output dolma formatted documents at `s3://ai2-oe-data/[your username]/pdfworkspaces/[workspacename]/results` @@ -53,7 +53,7 @@ With default settings, it should work fine on any available GPUs. ```bash -python -m pdelfin.beakerpipeline --help +python -m olmocr.beakerpipeline --help usage: beakerpipeline.py [-h] [--pdfs PDFS] [--workspace_profile WORKSPACE_PROFILE] [--pdf_profile PDF_PROFILE] [--pages_per_group PAGES_PER_GROUP] [--max_page_retries MAX_PAGE_RETRIES] [--max_page_error_rate MAX_PAGE_ERROR_RATE] [--workers WORKERS] [--stats] [--model MODEL] [--model_max_context MODEL_MAX_CONTEXT] [--model_chat_template MODEL_CHAT_TEMPLATE] diff --git a/RELEASE_PROCESS.md b/RELEASE_PROCESS.md index a8d8b8d..3ba9eb8 100644 --- a/RELEASE_PROCESS.md +++ b/RELEASE_PROCESS.md @@ -2,7 +2,7 @@ ## Steps -1. Update the version in `pdelfin/version.py`. +1. Update the version in `olmocr/version.py`. 3. 
Run the release script: diff --git a/docs/source/conf.py b/docs/source/conf.py index 39428d1..0880186 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ from datetime import datetime sys.path.insert(0, os.path.abspath("../../")) -from pdelfin import VERSION, VERSION_SHORT # noqa: E402 +from olmocr import VERSION, VERSION_SHORT # noqa: E402 # -- Project information ----------------------------------------------------- -project = "pdelfin" +project = "olmocr" copyright = f"{datetime.today().year}, Allen Institute for Artificial Intelligence" author = "Allen Institute for Artificial Intelligence" version = VERSION_SHORT @@ -82,7 +82,7 @@ typehints_defaults = "comma" # html_theme = "furo" -html_title = f"pdelfin v{VERSION}" +html_title = f"olmocr v{VERSION}" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -97,7 +97,7 @@ html_theme_options = { "footer_icons": [ { "name": "GitHub", - "url": "https://github.com/allenai/pdelfin", + "url": "https://github.com/allenai/olmocr", "html": """ diff --git a/docs/source/index.md b/docs/source/index.md index 563ed5a..7e86124 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,4 +1,4 @@ -# **pdelfin** +# **olmocr** ```{toctree} :maxdepth: 2 @@ -15,8 +15,8 @@ overview CHANGELOG CONTRIBUTING -License -GitHub Repository +License +GitHub Repository ``` ## Indices and tables diff --git a/docs/source/installation.md b/docs/source/installation.md index 4dba26b..9c67ecc 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -1,23 +1,23 @@ Installation ============ -**pdelfin** supports Python >= 3.8. +**olmocr** supports Python >= 3.8. ## Installing with `pip` -**pdelfin** is available [on PyPI](https://pypi.org/project/pdelfin/). Just run +**olmocr** is available [on PyPI](https://pypi.org/project/olmocr/). 
Just run ```bash -pip install pdelfin +pip install olmocr ``` ## Installing from source -To install **pdelfin** from source, first clone [the repository](https://github.com/allenai/pdelfin): +To install **olmocr** from source, first clone [the repository](https://github.com/allenai/olmocr): ```bash -git clone https://github.com/allenai/pdelfin.git -cd pdelfin +git clone https://github.com/allenai/olmocr.git +cd olmocr ``` Then run diff --git a/pdelfin/__init__.py b/olmocr/__init__.py similarity index 100% rename from pdelfin/__init__.py rename to olmocr/__init__.py diff --git a/pdelfin/beakerpipeline.py b/olmocr/beakerpipeline.py similarity index 97% rename from pdelfin/beakerpipeline.py rename to olmocr/beakerpipeline.py index a1b87d0..ece4473 100644 --- a/pdelfin/beakerpipeline.py +++ b/olmocr/beakerpipeline.py @@ -31,15 +31,15 @@ from typing import Optional, Tuple, List, Dict, Set from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed from concurrent.futures.process import BrokenProcessPool -from pdelfin.s3_queue import S3WorkQueue, WorkItem -from pdelfin.s3_utils import expand_s3_glob, get_s3_bytes, get_s3_bytes_with_backoff, parse_s3_path, download_zstd_csv, upload_zstd_csv, download_directory -from pdelfin.data.renderpdf import render_pdf_to_base64png -from pdelfin.filter.filter import PdfFilter, Language -from pdelfin.prompts import build_finetuning_prompt, PageResponse -from pdelfin.prompts.anchor import get_anchor_text -from pdelfin.check import check_poppler_version -from pdelfin.metrics import MetricsKeeper, WorkerTracker -from pdelfin.version import VERSION +from olmocr.s3_queue import S3WorkQueue, WorkItem +from olmocr.s3_utils import expand_s3_glob, get_s3_bytes, get_s3_bytes_with_backoff, parse_s3_path, download_zstd_csv, upload_zstd_csv, download_directory +from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.filter.filter import PdfFilter, Language +from olmocr.prompts import build_finetuning_prompt, 
PageResponse +from olmocr.prompts.anchor import get_anchor_text +from olmocr.check import check_poppler_version +from olmocr.metrics import MetricsKeeper, WorkerTracker +from olmocr.version import VERSION # Initialize logger logger = logging.getLogger(__name__) @@ -380,7 +380,7 @@ def build_dolma_document(pdf_s3_path, page_results): # Build the Dolma document metadata = { "Source-File": pdf_s3_path, - "pdelfin-version": VERSION, + "olmocr-version": VERSION, "pdf-total-pages": len(page_results), "total-input-tokens": sum(page.input_tokens for page in page_results), "total-output-tokens": sum(page.output_tokens for page in page_results), @@ -392,7 +392,7 @@ def build_dolma_document(pdf_s3_path, page_results): dolma_doc = { "id": id_, "text": document_text, - "source": "pdelfin", + "source": "olmocr", "added": datetime.datetime.now().strftime("%Y-%m-%d"), "created": datetime.datetime.now().strftime("%Y-%m-%d"), "metadata": metadata, @@ -463,7 +463,7 @@ async def worker(args, work_queue: S3WorkQueue, semaphore, worker_id): async def sglang_server_task(args, semaphore): - model_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'pdelfin', 'model') + model_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'olmocr', 'model') download_directory(args.model, model_cache_dir) # Check the rope config and make sure it's got the proper key @@ -642,7 +642,7 @@ def submit_beaker_job(args): owner = account.name beaker_image = f"jakep/pdelfin-inference-{VERSION}" - task_name = f"pdelfin-{os.path.basename(args.workspace.rstrip('/'))}" + task_name = f"olmocr-{os.path.basename(args.workspace.rstrip('/'))}" # Take out --beaker flag so the workers will just run things args_list = [arg for arg in sys.argv[1:] if arg != "--beaker"] @@ -695,7 +695,7 @@ def submit_beaker_job(args): preemptible=True, ), image=ImageSource(beaker=beaker_image), - command=["python", "-m", "pdelfin.beakerpipeline"] + args_list, + command=["python", "-m", "olmocr.beakerpipeline"] + args_list, 
env_vars=[ EnvVar(name="BEAKER_JOB_NAME", value=task_name), EnvVar(name="OWNER", value=owner), @@ -857,7 +857,7 @@ async def main(): # Beaker/job running stuff parser.add_argument('--beaker', action='store_true', help='Submit this job to beaker instead of running locally') - parser.add_argument('--beaker_workspace', help='Beaker workspace to submit to', default='ai2/pdelfin') + parser.add_argument('--beaker_workspace', help='Beaker workspace to submit to', default='ai2/olmocr') parser.add_argument('--beaker_cluster', help='Beaker clusters you want to run on', default=["ai2/jupiter-cirrascale-2", "ai2/ceres-cirrascale", "ai2/neptune-cirrascale", "ai2/saturn-cirrascale", "ai2/augusta-google-1"]) parser.add_argument('--beaker_gpus', type=int, default=1, help="Number of gpu replicas to run") parser.add_argument('--beaker_priority', type=str, default="normal", help="Beaker priority level for the job") diff --git a/pdelfin/birrpipeline.py b/olmocr/birrpipeline.py similarity index 98% rename from pdelfin/birrpipeline.py rename to olmocr/birrpipeline.py index 44ce1d8..73885fe 100644 --- a/pdelfin/birrpipeline.py +++ b/olmocr/birrpipeline.py @@ -25,11 +25,11 @@ from urllib.parse import urlparse import concurrent.futures from concurrent.futures import ProcessPoolExecutor, as_completed -from pdelfin.data.renderpdf import render_pdf_to_base64png -from pdelfin.prompts import build_finetuning_prompt, PageResponse -from pdelfin.prompts.anchor import get_anchor_text -from pdelfin.s3_utils import parse_custom_id, expand_s3_glob, get_s3_bytes, parse_s3_path -from pdelfin.check import check_poppler_version +from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.prompts import build_finetuning_prompt, PageResponse +from olmocr.prompts.anchor import get_anchor_text +from olmocr.s3_utils import parse_custom_id, expand_s3_glob, get_s3_bytes, parse_s3_path +from olmocr.check import check_poppler_version # Initialize logger logger = logging.getLogger(__name__) @@ -79,7 
+79,7 @@ class DatabaseManager: def __init__(self, s3_workspace: str, skip_init: bool=False): cache_key = hashlib.sha256(s3_workspace.strip().lower().encode('utf-8')).hexdigest() - home_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'pdelfin', cache_key) + home_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'olmocr', cache_key) os.makedirs(home_cache_dir, exist_ok=True) self.db_path = os.path.join(home_cache_dir, 'index.db') @@ -618,7 +618,7 @@ def build_dolma_doc(s3_workspace: str, pdf: DatabaseManager.PDFRecord) -> Option dolma_doc = { "id": id_, "text": document_text, - "source": "pdelfin", + "source": "olmocr", "added": datetime.datetime.now().strftime("%Y-%m-%d"), "created": datetime.datetime.now().strftime("%Y-%m-%d"), "metadata": metadata, diff --git a/pdelfin/cappedpool.py b/olmocr/cappedpool.py similarity index 100% rename from pdelfin/cappedpool.py rename to olmocr/cappedpool.py diff --git a/pdelfin/check.py b/olmocr/check.py similarity index 91% rename from pdelfin/check.py rename to olmocr/check.py index 12030b0..e08a57c 100644 --- a/pdelfin/check.py +++ b/olmocr/check.py @@ -14,7 +14,7 @@ def check_poppler_version(): sys.exit(1) except FileNotFoundError: logger.error("pdftoppm is not installed.") - logger.error("Check the README in the https://github.com/allenai/pdelfin/blob/main/README.md for installation instructions") + logger.error("Check the README in the https://github.com/allenai/olmocr/blob/main/README.md for installation instructions") sys.exit(1) if __name__ == "__main__": diff --git a/pdelfin/data/__init__.py b/olmocr/data/__init__.py similarity index 100% rename from pdelfin/data/__init__.py rename to olmocr/data/__init__.py diff --git a/pdelfin/data/buildsilver.py b/olmocr/data/buildsilver.py similarity index 97% rename from pdelfin/data/buildsilver.py rename to olmocr/data/buildsilver.py index b99622f..fd2269f 100644 --- a/pdelfin/data/buildsilver.py +++ b/olmocr/data/buildsilver.py @@ -12,10 +12,10 @@ from 
typing import Generator from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed from urllib.parse import urlparse -from pdelfin.data.renderpdf import render_pdf_to_base64png -from pdelfin.prompts import build_openai_silver_data_prompt, openai_response_format_schema -from pdelfin.prompts.anchor import get_anchor_text -from pdelfin.filter import PdfFilter +from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.prompts import build_openai_silver_data_prompt, openai_response_format_schema +from olmocr.prompts.anchor import get_anchor_text +from olmocr.filter import PdfFilter TARGET_IMAGE_DIM = 2048 diff --git a/pdelfin/data/buildsilverdatasummary.py b/olmocr/data/buildsilverdatasummary.py similarity index 100% rename from pdelfin/data/buildsilverdatasummary.py rename to olmocr/data/buildsilverdatasummary.py diff --git a/pdelfin/data/buildtestset.py b/olmocr/data/buildtestset.py similarity index 98% rename from pdelfin/data/buildtestset.py rename to olmocr/data/buildtestset.py index 4eb973a..3cf2128 100644 --- a/pdelfin/data/buildtestset.py +++ b/olmocr/data/buildtestset.py @@ -10,8 +10,8 @@ from concurrent.futures import ProcessPoolExecutor, as_completed from urllib.parse import urlparse from typing import List -from pdelfin.data.renderpdf import render_pdf_to_base64png -from pdelfin.filter import PdfFilter +from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.filter import PdfFilter pdf_filter = PdfFilter() diff --git a/pdelfin/data/convertsilver_birr.py b/olmocr/data/convertsilver_birr.py similarity index 98% rename from pdelfin/data/convertsilver_birr.py rename to olmocr/data/convertsilver_birr.py index 197de02..83b56e7 100644 --- a/pdelfin/data/convertsilver_birr.py +++ b/olmocr/data/convertsilver_birr.py @@ -10,9 +10,9 @@ import os import smart_open import boto3 -from pdelfin.prompts import build_finetuning_prompt -from pdelfin.prompts.anchor import get_anchor_text -from pdelfin.data.renderpdf import 
render_pdf_to_base64png +from olmocr.prompts import build_finetuning_prompt +from olmocr.prompts.anchor import get_anchor_text +from olmocr.data.renderpdf import render_pdf_to_base64png # Import Plotly for plotting import plotly.express as px diff --git a/pdelfin/data/convertsilver_openai.py b/olmocr/data/convertsilver_openai.py similarity index 96% rename from pdelfin/data/convertsilver_openai.py rename to olmocr/data/convertsilver_openai.py index fa10a5b..4873a39 100644 --- a/pdelfin/data/convertsilver_openai.py +++ b/olmocr/data/convertsilver_openai.py @@ -9,7 +9,7 @@ import logging import smart_open from cached_path import cached_path -from pdelfin.prompts import build_finetuning_prompt +from olmocr.prompts import build_finetuning_prompt def setup_logging(): @@ -73,12 +73,12 @@ def process_file(input_file: str, output_file: str, rewrite_prompt_str: bool): # Save the pdf to a temporary cache folder local_pdf_path = cached_path(s3_path, quiet=True) - from pdelfin.prompts.anchor import get_anchor_text - from pdelfin.data.buildsilver import build_page_query + from olmocr.prompts.anchor import get_anchor_text + from olmocr.data.buildsilver import build_page_query obj = build_page_query(local_pdf_path, s3_path, page) # raw_page_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport") - # from pdelfin.prompts import build_openai_silver_data_prompt + # from olmocr.prompts import build_openai_silver_data_prompt # obj["body"]["messages"][0]["content"][0]["text"] = build_openai_silver_data_prompt(raw_page_text) if obj is not None: diff --git a/pdelfin/data/renderpdf.py b/olmocr/data/renderpdf.py similarity index 100% rename from pdelfin/data/renderpdf.py rename to olmocr/data/renderpdf.py diff --git a/pdelfin/data/runopenaibatch.py b/olmocr/data/runopenaibatch.py similarity index 100% rename from pdelfin/data/runopenaibatch.py rename to olmocr/data/runopenaibatch.py diff --git a/pdelfin/datatypes.py b/olmocr/datatypes.py similarity index 100% rename from 
pdelfin/datatypes.py rename to olmocr/datatypes.py diff --git a/pdelfin/eval/__init__.py b/olmocr/eval/__init__.py similarity index 100% rename from pdelfin/eval/__init__.py rename to olmocr/eval/__init__.py diff --git a/pdelfin/eval/buildelo.py b/olmocr/eval/buildelo.py similarity index 97% rename from pdelfin/eval/buildelo.py rename to olmocr/eval/buildelo.py index f49c8f0..370e58e 100644 --- a/pdelfin/eval/buildelo.py +++ b/olmocr/eval/buildelo.py @@ -8,12 +8,12 @@ import functools from tqdm import tqdm from itertools import combinations -from pdelfin.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes +from olmocr.s3_utils import parse_s3_path, expand_s3_glob, get_s3_bytes from dolma_refine.evaluate.metrics import DocumentEditSimilarity from dolma_refine.evaluate.segmenters import SpacySegmenter from dolma_refine.evaluate.aligners import HirschbergAligner -from pdelfin.eval.evalhtml import create_review_html +from olmocr.eval.evalhtml import create_review_html @dataclasses.dataclass class Comparison: diff --git a/pdelfin/eval/evalhtml.py b/olmocr/eval/evalhtml.py similarity index 98% rename from pdelfin/eval/evalhtml.py rename to olmocr/eval/evalhtml.py index 91cb548..0b5dffd 100644 --- a/pdelfin/eval/evalhtml.py +++ b/olmocr/eval/evalhtml.py @@ -7,7 +7,7 @@ from jinja2 import Template from urllib.parse import urlparse from difflib import SequenceMatcher from tqdm import tqdm -from pdelfin.data.renderpdf import render_pdf_to_base64png +from olmocr.data.renderpdf import render_pdf_to_base64png session = boto3.Session(profile_name='s2') s3_client = session.client('s3') diff --git a/pdelfin/eval/evalhtml_template.html b/olmocr/eval/evalhtml_template.html similarity index 100% rename from pdelfin/eval/evalhtml_template.html rename to olmocr/eval/evalhtml_template.html diff --git a/pdelfin/eval/runeval.py b/olmocr/eval/runeval.py similarity index 100% rename from pdelfin/eval/runeval.py rename to olmocr/eval/runeval.py diff --git a/pdelfin/eval/scoreelo.py 
b/olmocr/eval/scoreelo.py similarity index 100% rename from pdelfin/eval/scoreelo.py rename to olmocr/eval/scoreelo.py diff --git a/pdelfin/filter/__init__.py b/olmocr/filter/__init__.py similarity index 100% rename from pdelfin/filter/__init__.py rename to olmocr/filter/__init__.py diff --git a/pdelfin/filter/coherency.py b/olmocr/filter/coherency.py similarity index 100% rename from pdelfin/filter/coherency.py rename to olmocr/filter/coherency.py diff --git a/pdelfin/filter/filter.py b/olmocr/filter/filter.py similarity index 99% rename from pdelfin/filter/filter.py rename to olmocr/filter/filter.py index b3f3ac3..35f05e9 100644 --- a/pdelfin/filter/filter.py +++ b/olmocr/filter/filter.py @@ -125,7 +125,7 @@ class PdfFilter: if __name__ == "__main__": import tempfile import boto3 - from pdelfin.s3_utils import parse_s3_path + from olmocr.s3_utils import parse_s3_path from concurrent.futures import ProcessPoolExecutor, wait, FIRST_COMPLETED from tqdm import tqdm diff --git a/pdelfin/metrics.py b/olmocr/metrics.py similarity index 100% rename from pdelfin/metrics.py rename to olmocr/metrics.py diff --git a/pdelfin/prompts/__init__.py b/olmocr/prompts/__init__.py similarity index 100% rename from pdelfin/prompts/__init__.py rename to olmocr/prompts/__init__.py diff --git a/pdelfin/prompts/_adv_anchor.py b/olmocr/prompts/_adv_anchor.py similarity index 100% rename from pdelfin/prompts/_adv_anchor.py rename to olmocr/prompts/_adv_anchor.py diff --git a/pdelfin/prompts/anchor.py b/olmocr/prompts/anchor.py similarity index 99% rename from pdelfin/prompts/anchor.py rename to olmocr/prompts/anchor.py index c0b1b68..dfb0935 100644 --- a/pdelfin/prompts/anchor.py +++ b/olmocr/prompts/anchor.py @@ -19,11 +19,11 @@ from functools import lru_cache import pypdfium2 as pdfium import pymupdf -from pdelfin.filter.coherency import get_document_coherency +from olmocr.filter.coherency import get_document_coherency from pypdf import PdfReader from pypdf.generic import RectangleObject 
-from pdelfin.prompts._adv_anchor import mult +from olmocr.prompts._adv_anchor import mult def get_anchor_text(local_pdf_path: str, page: int, pdf_engine: Literal["pdftotext", "pdfium", "pymupdf", "pypdf", "topcoherency", "pdfreport"], target_length: int=4000) -> str: diff --git a/pdelfin/prompts/prompts.py b/olmocr/prompts/prompts.py similarity index 100% rename from pdelfin/prompts/prompts.py rename to olmocr/prompts/prompts.py diff --git a/pdelfin/py.typed b/olmocr/py.typed similarity index 100% rename from pdelfin/py.typed rename to olmocr/py.typed diff --git a/pdelfin/repeatdetect.py b/olmocr/repeatdetect.py similarity index 100% rename from pdelfin/repeatdetect.py rename to olmocr/repeatdetect.py diff --git a/pdelfin/s3_queue.py b/olmocr/s3_queue.py similarity index 99% rename from pdelfin/s3_queue.py rename to olmocr/s3_queue.py index 5c91ed5..c4e3a26 100644 --- a/pdelfin/s3_queue.py +++ b/olmocr/s3_queue.py @@ -9,7 +9,7 @@ from dataclasses import dataclass import asyncio from functools import partial -from pdelfin.s3_utils import ( +from olmocr.s3_utils import ( expand_s3_glob, download_zstd_csv, upload_zstd_csv, diff --git a/pdelfin/s3_utils.py b/olmocr/s3_utils.py similarity index 100% rename from pdelfin/s3_utils.py rename to olmocr/s3_utils.py diff --git a/pdelfin/train/__init__.py b/olmocr/train/__init__.py similarity index 100% rename from pdelfin/train/__init__.py rename to olmocr/train/__init__.py diff --git a/pdelfin/train/buildparquetdataset.py b/olmocr/train/buildparquetdataset.py similarity index 97% rename from pdelfin/train/buildparquetdataset.py rename to olmocr/train/buildparquetdataset.py index 7fabbe7..ecf10cb 100644 --- a/pdelfin/train/buildparquetdataset.py +++ b/olmocr/train/buildparquetdataset.py @@ -5,7 +5,7 @@ import os import boto3 from datasets import Dataset from botocore.exceptions import NoCredentialsError, PartialCredentialsError -from pdelfin.train.dataloader import build_batch_query_response_vision_dataset +from 
olmocr.train.dataloader import build_batch_query_response_vision_dataset def save_dataset_in_parquet(dataset: Dataset, output_dir: str, rows_per_file: int = 10000, s3_endpoint_url: str = None): diff --git a/pdelfin/train/config/molmo-o-lora-8192.yaml b/olmocr/train/config/molmo-o-lora-8192.yaml similarity index 100% rename from pdelfin/train/config/molmo-o-lora-8192.yaml rename to olmocr/train/config/molmo-o-lora-8192.yaml diff --git a/pdelfin/train/config/molmo-o-lora.yaml b/olmocr/train/config/molmo-o-lora.yaml similarity index 100% rename from pdelfin/train/config/molmo-o-lora.yaml rename to olmocr/train/config/molmo-o-lora.yaml diff --git a/pdelfin/train/config/qwen2vl-2b-lora.yaml b/olmocr/train/config/qwen2vl-2b-lora.yaml similarity index 100% rename from pdelfin/train/config/qwen2vl-2b-lora.yaml rename to olmocr/train/config/qwen2vl-2b-lora.yaml diff --git a/pdelfin/train/config/qwen2vl-2b.yaml b/olmocr/train/config/qwen2vl-2b.yaml similarity index 100% rename from pdelfin/train/config/qwen2vl-2b.yaml rename to olmocr/train/config/qwen2vl-2b.yaml diff --git a/pdelfin/train/config/qwen2vl-7b-lora.yaml b/olmocr/train/config/qwen2vl-7b-lora.yaml similarity index 100% rename from pdelfin/train/config/qwen2vl-7b-lora.yaml rename to olmocr/train/config/qwen2vl-7b-lora.yaml diff --git a/pdelfin/train/config/qwen2vl-7b.yaml b/olmocr/train/config/qwen2vl-7b.yaml similarity index 100% rename from pdelfin/train/config/qwen2vl-7b.yaml rename to olmocr/train/config/qwen2vl-7b.yaml diff --git a/pdelfin/train/core/__init__.py b/olmocr/train/core/__init__.py similarity index 100% rename from pdelfin/train/core/__init__.py rename to olmocr/train/core/__init__.py diff --git a/pdelfin/train/core/adapters.py b/olmocr/train/core/adapters.py similarity index 100% rename from pdelfin/train/core/adapters.py rename to olmocr/train/core/adapters.py diff --git a/pdelfin/train/core/cli.py b/olmocr/train/core/cli.py similarity index 100% rename from pdelfin/train/core/cli.py rename to 
olmocr/train/core/cli.py diff --git a/pdelfin/train/core/compression.py b/olmocr/train/core/compression.py similarity index 100% rename from pdelfin/train/core/compression.py rename to olmocr/train/core/compression.py diff --git a/pdelfin/train/core/config.py b/olmocr/train/core/config.py similarity index 100% rename from pdelfin/train/core/config.py rename to olmocr/train/core/config.py diff --git a/pdelfin/train/core/errors.py b/olmocr/train/core/errors.py similarity index 100% rename from pdelfin/train/core/errors.py rename to olmocr/train/core/errors.py diff --git a/pdelfin/train/core/loggers.py b/olmocr/train/core/loggers.py similarity index 100% rename from pdelfin/train/core/loggers.py rename to olmocr/train/core/loggers.py diff --git a/pdelfin/train/core/paths.py b/olmocr/train/core/paths.py similarity index 100% rename from pdelfin/train/core/paths.py rename to olmocr/train/core/paths.py diff --git a/pdelfin/train/core/state.py b/olmocr/train/core/state.py similarity index 100% rename from pdelfin/train/core/state.py rename to olmocr/train/core/state.py diff --git a/pdelfin/train/dataloader.py b/olmocr/train/dataloader.py similarity index 96% rename from pdelfin/train/dataloader.py rename to olmocr/train/dataloader.py index 5d9a386..4f6857d 100644 --- a/pdelfin/train/dataloader.py +++ b/olmocr/train/dataloader.py @@ -16,9 +16,9 @@ import boto3 from datasets import Dataset, Features, Value, load_dataset, concatenate_datasets, DatasetDict from .core.config import DataConfig, SourceConfig -from pdelfin.prompts.anchor import get_anchor_text -from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path -from pdelfin.data.renderpdf import get_pdf_media_box_width_height +from olmocr.prompts.anchor import get_anchor_text +from olmocr.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path +from olmocr.data.renderpdf import get_pdf_media_box_width_height # Configure logging logging.basicConfig(level=logging.INFO) @@ -143,7 +143,7 @@ def 
cache_s3_files(dataset: Dataset, pdf_cache_location: str, num_proc: int = 32 def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Optional[str]=None, num_proc: int=32) -> Dataset: if pdf_cache_location is None: - pdf_cache_location = os.path.join(os.path.expanduser('~'), '.cache', 'pdelfin_pdfs') + pdf_cache_location = os.path.join(os.path.expanduser('~'), '.cache', 'olmocr_pdfs') logger.info("Loading fine tuning dataset from OpenAI style batch responses") response_data = load_jsonl_into_ds(response_glob_path) diff --git a/pdelfin/train/dataprep.py b/olmocr/train/dataprep.py similarity index 97% rename from pdelfin/train/dataprep.py rename to olmocr/train/dataprep.py index e541a9a..46b181c 100644 --- a/pdelfin/train/dataprep.py +++ b/olmocr/train/dataprep.py @@ -6,9 +6,9 @@ import base64 import random import torch # Make sure to import torch as it's used in the DataCollator -from pdelfin.prompts.anchor import get_anchor_text -from pdelfin.prompts import build_finetuning_prompt -from pdelfin.data.renderpdf import render_pdf_to_base64png +from olmocr.prompts.anchor import get_anchor_text +from olmocr.prompts import build_finetuning_prompt +from olmocr.data.renderpdf import render_pdf_to_base64png def prepare_data_for_qwen2_training(example, processor, target_longest_image_dim: Union[int, list[int]], target_anchor_text_len: Union[int, list[int]]): diff --git a/pdelfin/train/fixqwen2vlcheckpoint.py b/olmocr/train/fixqwen2vlcheckpoint.py similarity index 99% rename from pdelfin/train/fixqwen2vlcheckpoint.py rename to olmocr/train/fixqwen2vlcheckpoint.py index 417d2c4..584e793 100644 --- a/pdelfin/train/fixqwen2vlcheckpoint.py +++ b/olmocr/train/fixqwen2vlcheckpoint.py @@ -9,7 +9,7 @@ import concurrent.futures from smart_open import smart_open from transformers import Qwen2VLForConditionalGeneration -from pdelfin.s3_utils import parse_s3_path +from olmocr.s3_utils import parse_s3_path s3_client = boto3.client('s3') diff --git 
a/pdelfin/train/inference.py b/olmocr/train/inference.py similarity index 90% rename from pdelfin/train/inference.py rename to olmocr/train/inference.py index 84b59e3..9e516f9 100644 --- a/pdelfin/train/inference.py +++ b/olmocr/train/inference.py @@ -29,11 +29,11 @@ from transformers import ( ) -from pdelfin.data.renderpdf import render_pdf_to_base64png -from pdelfin.prompts.anchor import get_anchor_text -from pdelfin.prompts.prompts import build_finetuning_prompt +from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.prompts.anchor import get_anchor_text +from olmocr.prompts.prompts import build_finetuning_prompt -from pdelfin.train.dataprep import prepare_data_for_qwen2_inference +from olmocr.train.dataprep import prepare_data_for_qwen2_inference def build_page_query(local_pdf_path: str, page: int) -> dict: image_base64 = render_pdf_to_base64png(local_pdf_path, page, 1024) diff --git a/pdelfin/train/loaddataset.py b/olmocr/train/loaddataset.py similarity index 94% rename from pdelfin/train/loaddataset.py rename to olmocr/train/loaddataset.py index 279e932..c3c296f 100644 --- a/pdelfin/train/loaddataset.py +++ b/olmocr/train/loaddataset.py @@ -3,8 +3,8 @@ from transformers import ( DataCollatorForSeq2Seq ) -from pdelfin.train.core.cli import make_cli -from pdelfin.train.core.config import TrainConfig +from olmocr.train.core.cli import make_cli +from olmocr.train.core.config import TrainConfig from tqdm import tqdm from .utils import ( make_dataset, TruncatingCollator diff --git a/pdelfin/train/molmo/__init__.py b/olmocr/train/molmo/__init__.py similarity index 100% rename from pdelfin/train/molmo/__init__.py rename to olmocr/train/molmo/__init__.py diff --git a/pdelfin/train/molmo/config_molmo.py b/olmocr/train/molmo/config_molmo.py similarity index 100% rename from pdelfin/train/molmo/config_molmo.py rename to olmocr/train/molmo/config_molmo.py diff --git a/pdelfin/train/molmo/image_processing_molmo.py 
b/olmocr/train/molmo/image_processing_molmo.py similarity index 100% rename from pdelfin/train/molmo/image_processing_molmo.py rename to olmocr/train/molmo/image_processing_molmo.py diff --git a/pdelfin/train/molmo/modeling_molmo.py b/olmocr/train/molmo/modeling_molmo.py similarity index 100% rename from pdelfin/train/molmo/modeling_molmo.py rename to olmocr/train/molmo/modeling_molmo.py diff --git a/pdelfin/train/molmo/preprocessing_molmo.py b/olmocr/train/molmo/preprocessing_molmo.py similarity index 100% rename from pdelfin/train/molmo/preprocessing_molmo.py rename to olmocr/train/molmo/preprocessing_molmo.py diff --git a/pdelfin/train/train.py b/olmocr/train/train.py similarity index 97% rename from pdelfin/train/train.py rename to olmocr/train/train.py index fafe13b..cdac627 100644 --- a/pdelfin/train/train.py +++ b/olmocr/train/train.py @@ -36,11 +36,11 @@ from torch.utils.data import DataLoader import wandb -from pdelfin.train.core.cli import make_cli, save_config, to_native_types -from pdelfin.train.core.config import TrainConfig -from pdelfin.train.core.loggers import get_logger -from pdelfin.train.core.paths import copy_dir, join_path -from pdelfin.train.core.state import BeakerState +from olmocr.train.core.cli import make_cli, save_config, to_native_types +from olmocr.train.core.config import TrainConfig +from olmocr.train.core.loggers import get_logger +from olmocr.train.core.paths import copy_dir, join_path +from olmocr.train.core.state import BeakerState from .utils import ( RunName, diff --git a/pdelfin/train/utils.py b/olmocr/train/utils.py similarity index 97% rename from pdelfin/train/utils.py rename to olmocr/train/utils.py index 5d96101..0ab9a2c 100644 --- a/pdelfin/train/utils.py +++ b/olmocr/train/utils.py @@ -28,8 +28,8 @@ from .core.state import BeakerState T = TypeVar("T") -from pdelfin.train.dataloader import build_finetuning_dataset, list_dataset_files -from pdelfin.train.dataprep import batch_prepare_data_for_qwen2_training, 
batch_prepare_data_for_molmo_training +from olmocr.train.dataloader import build_finetuning_dataset, list_dataset_files +from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training, batch_prepare_data_for_molmo_training def accelerator_to_dtype(accelerator: Accelerator) -> torch.dtype: diff --git a/pdelfin/version.py b/olmocr/version.py similarity index 100% rename from pdelfin/version.py rename to olmocr/version.py diff --git a/pdelfin/viewer/dolmaviewer.py b/olmocr/viewer/dolmaviewer.py similarity index 97% rename from pdelfin/viewer/dolmaviewer.py rename to olmocr/viewer/dolmaviewer.py index 340d48a..96c1905 100644 --- a/pdelfin/viewer/dolmaviewer.py +++ b/olmocr/viewer/dolmaviewer.py @@ -11,8 +11,8 @@ from tqdm import tqdm from concurrent.futures import ThreadPoolExecutor, as_completed import markdown2 -from pdelfin.s3_utils import get_s3_bytes -from pdelfin.data.renderpdf import render_pdf_to_base64webp +from olmocr.s3_utils import get_s3_bytes +from olmocr.data.renderpdf import render_pdf_to_base64webp def read_jsonl(path): with smart_open.smart_open(path, 'r', encoding='utf-8') as f: diff --git a/pdelfin/viewer/dolmaviewer_template.html b/olmocr/viewer/dolmaviewer_template.html similarity index 100% rename from pdelfin/viewer/dolmaviewer_template.html rename to olmocr/viewer/dolmaviewer_template.html diff --git a/pyproject.toml b/pyproject.toml index 6d33c6e..4648534 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] # See https://setuptools.pypa.io/en/latest/userguide/quickstart.html for more project configuration options. 
-name = "pdelfin" +name = "olmocr" dynamic = ["version"] readme = "README.md" classifiers = [ @@ -42,10 +42,10 @@ dependencies = [ license = {file = "LICENSE"} [project.urls] -Homepage = "https://github.com/allenai/pdelfin" -Repository = "https://github.com/allenai/pdelfin" -Changelog = "https://github.com/allenai/pdelfin/blob/main/CHANGELOG.md" -# Documentation = "https://pdelfin.readthedocs.io/" +Homepage = "https://github.com/allenai/olmocr" +Repository = "https://github.com/allenai/olmocr" +Changelog = "https://github.com/allenai/olmocr/blob/main/CHANGELOG.md" +# Documentation = "https://olmocr.readthedocs.io/" [project.optional-dependencies] dev = [ @@ -103,10 +103,10 @@ exclude = [ include-package-data = true [tool.setuptools.package-data] -pdelfin = ["py.typed"] +olmocr = ["py.typed"] [tool.setuptools.dynamic] -version = {attr = "pdelfin.version.VERSION"} +version = {attr = "olmocr.version.VERSION"} [tool.black] line-length = 120 diff --git a/scripts/beaker/Dockerfile-inference b/scripts/beaker/Dockerfile-inference index 8f8fa84..6ae0069 100644 --- a/scripts/beaker/Dockerfile-inference +++ b/scripts/beaker/Dockerfile-inference @@ -36,17 +36,17 @@ RUN /root/.local/bin/uv pip install --system flashinfer -i https://flashinfer.ai ENV PYTHONUNBUFFERED=1 WORKDIR /root COPY pyproject.toml pyproject.toml -COPY pdelfin/version.py pdelfin/version.py +COPY olmocr/version.py olmocr/version.py RUN /root/.local/bin/uv pip install --system --no-cache -e .[inference] -COPY pdelfin pdelfin +COPY olmocr olmocr # TODO You can remove this once pypdf > 5.10 comes out RUN /root/.local/bin/uv pip install --system --no-cache git+https://github.com/py-pdf/pypdf.git@c6e43374ab002d76811ec85333fdc2c82c268251 WORKDIR /root -COPY pdelfin pdelfin +COPY olmocr olmocr RUN python3 -m sglang.launch_server --help -RUN python3 -m pdelfin.beakerpipeline --help \ No newline at end of file +RUN python3 -m olmocr.beakerpipeline --help \ No newline at end of file diff --git a/scripts/build-docker.sh 
b/scripts/build-docker.sh index 219a21a..d427c0f 100644 --- a/scripts/build-docker.sh +++ b/scripts/build-docker.sh @@ -2,8 +2,8 @@ set -e -VERSION=$(python -c 'import pdelfin.version; print(pdelfin.version.VERSION)') +VERSION=$(python -c 'import olmocr.version; print(olmocr.version.VERSION)') echo "$VERSION" -docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t pdelfin-inference-$VERSION . -beaker image create --workspace ai2/oe-data-pdf --name pdelfin-inference-$VERSION pdelfin-inference-$VERSION \ No newline at end of file +docker build --platform linux/amd64 -f ./scripts/beaker/Dockerfile-inference -t olmocr-inference-$VERSION . +beaker image create --workspace ai2/oe-data-pdf --name olmocr-inference-$VERSION olmocr-inference-$VERSION \ No newline at end of file diff --git a/scripts/molmo-7b-lora-gantry.sh b/scripts/molmo-7b-lora-gantry.sh index 5072c62..2640feb 100755 --- a/scripts/molmo-7b-lora-gantry.sh +++ b/scripts/molmo-7b-lora-gantry.sh @@ -10,7 +10,7 @@ then fi -EXTRA_ARGS="-c pdelfin/train/config/molmo-o-lora-8192.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/molmo-pdf/v1/models/\${BEAKER_USER_ID}\"" +EXTRA_ARGS="-c olmocr/train/config/molmo-o-lora-8192.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/molmo-pdf/v1/models/\${BEAKER_USER_ID}\"" run_name=$(basename "$0" .sh) @@ -45,4 +45,4 @@ gantry run \ --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ - -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m pdelfin.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}" \ No newline at end of file + -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train 
${EXTRA_ARGS}" \ No newline at end of file diff --git a/scripts/prepare_changelog.py b/scripts/prepare_changelog.py index e6d99fa..bc493a3 100644 --- a/scripts/prepare_changelog.py +++ b/scripts/prepare_changelog.py @@ -1,7 +1,7 @@ from datetime import datetime from pathlib import Path -from pdelfin.version import VERSION +from olmocr.version import VERSION def main(): @@ -27,7 +27,7 @@ def main(): lines.insert(insert_index, "\n") lines.insert( insert_index + 1, - f"## [v{VERSION}](https://github.com/allenai/pdelfin/releases/tag/v{VERSION}) - " + f"## [v{VERSION}](https://github.com/allenai/olmocr/releases/tag/v{VERSION}) - " f"{datetime.now().strftime('%Y-%m-%d')}\n", ) diff --git a/scripts/qwen2vl-2b-gantry.sh b/scripts/qwen2vl-2b-gantry.sh index 2858431..4b06292 100755 --- a/scripts/qwen2vl-2b-gantry.sh +++ b/scripts/qwen2vl-2b-gantry.sh @@ -10,7 +10,7 @@ then fi -EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" +EXTRA_ARGS="-c olmocr/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" run_name=$(basename "$0" .sh) @@ -43,4 +43,4 @@ gantry run \ --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ - -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}" \ No newline at end of file + -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" \ No newline at end of file diff --git a/scripts/qwen2vl-7b-gantry.sh b/scripts/qwen2vl-7b-gantry.sh index 9400aba..8c2d3ae 100755 --- a/scripts/qwen2vl-7b-gantry.sh +++ b/scripts/qwen2vl-7b-gantry.sh @@ -10,7 +10,7 @@ then fi 
-EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" +EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" run_name=$(basename "$0" .sh) @@ -44,4 +44,4 @@ gantry run \ --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ - -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m pdelfin.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}" \ No newline at end of file + -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" \ No newline at end of file diff --git a/scripts/qwen2vl-7b-lora-gantry.sh b/scripts/qwen2vl-7b-lora-gantry.sh index bec3815..afd9fc7 100755 --- a/scripts/qwen2vl-7b-lora-gantry.sh +++ b/scripts/qwen2vl-7b-lora-gantry.sh @@ -10,7 +10,7 @@ then fi -EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" +EXTRA_ARGS="-c olmocr/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\"" run_name=$(basename "$0" .sh) @@ -46,4 +46,4 @@ gantry run \ --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \ --shared-memory 10GiB \ --yes \ - -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m 
pdelfin.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}" \ No newline at end of file + -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}" \ No newline at end of file diff --git a/scripts/release.sh b/scripts/release.sh index 54d25c1..16447bf 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -2,7 +2,7 @@ set -e -TAG=$(python -c 'from pdelfin.version import VERSION; print("v" + VERSION)') +TAG=$(python -c 'from olmocr.version import VERSION; print("v" + VERSION)') read -p "Creating new release for $TAG. Do you want to continue? [Y/n] " prompt diff --git a/tests/test_anchor.py b/tests/test_anchor.py index b5e5a81..04795ee 100644 --- a/tests/test_anchor.py +++ b/tests/test_anchor.py @@ -6,8 +6,8 @@ import glob from pypdf import PdfReader -from pdelfin.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text -from pdelfin.data.renderpdf import get_pdf_media_box_width_height +from olmocr.prompts.anchor import _pdf_report, _linearize_pdf_report, get_anchor_text +from olmocr.data.renderpdf import get_pdf_media_box_width_height class AnchorTest(unittest.TestCase): def testExtractText(self): @@ -168,11 +168,11 @@ class BuildSilverTest(unittest.TestCase): def testSmallPage(self): local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf") - from pdelfin.data.buildsilver import build_page_query + from olmocr.data.buildsilver import build_page_query result = build_page_query(local_pdf_path, "s3://test.pdf", 1) - from pdelfin.data.renderpdf import get_png_dimensions_from_base64 + from olmocr.data.renderpdf import get_png_dimensions_from_base64 base64data = 
result["body"]["messages"][0]["content"][1]["image_url"]["url"] diff --git a/tests/test_birr_outputs.py b/tests/test_birr_outputs.py index 346c9b2..9513905 100644 --- a/tests/test_birr_outputs.py +++ b/tests/test_birr_outputs.py @@ -4,15 +4,15 @@ from io import BytesIO from PIL import Image from transformers import AutoProcessor -from pdelfin.data.renderpdf import render_pdf_to_base64png -from pdelfin.train.dataprep import ( +from olmocr.data.renderpdf import render_pdf_to_base64png +from olmocr.train.dataprep import ( prepare_data_for_qwen2_training, build_finetuning_prompt ) import numpy as np from tqdm import tqdm from torch.utils.data import DataLoader -from pdelfin.train.utils import make_dataset -from pdelfin.train.core.config import TrainConfig, DataConfig, SourceConfig +from olmocr.train.utils import make_dataset +from olmocr.train.core.config import TrainConfig, DataConfig, SourceConfig import math @@ -138,7 +138,7 @@ class TestBirrTokenization(unittest.TestCase): from birr.core.config import FormatConfig, LLMModelConfig from birr.batch_inference.data_models import RawInputItem - from pdelfin.birrpipeline import build_page_query + from olmocr.birrpipeline import build_page_query original_query = build_page_query(os.path.join( os.path.dirname(__file__), diff --git a/tests/test_birrpipeline.py b/tests/test_birrpipeline.py index 26a7d8a..03c56f9 100644 --- a/tests/test_birrpipeline.py +++ b/tests/test_birrpipeline.py @@ -7,11 +7,11 @@ import base64 from PIL import Image # Adjust the import path to match where your code resides -from pdelfin.birrpipeline import build_dolma_doc, DatabaseManager, build_finetuning_prompt, build_page_query +from olmocr.birrpipeline import build_dolma_doc, DatabaseManager, build_finetuning_prompt, build_page_query class TestBuildDolmaDoc(unittest.TestCase): - @patch('pdelfin.birrpipeline.DatabaseManager') - @patch('pdelfin.birrpipeline.get_s3_bytes') + @patch('olmocr.birrpipeline.DatabaseManager') + 
@patch('olmocr.birrpipeline.get_s3_bytes') def test_build_dolma_doc_with_multiple_page_entries(self, mock_get_s3_bytes, mock_DatabaseManager): # Mock DatabaseManager instance mock_db_instance = MagicMock() diff --git a/tests/test_cappedpool.py b/tests/test_cappedpool.py index 05c8b77..3f7912d 100644 --- a/tests/test_cappedpool.py +++ b/tests/test_cappedpool.py @@ -4,7 +4,7 @@ import concurrent.futures from concurrent.futures import TimeoutError # Assuming the CappedProcessPoolExecutor code is in a module named 'capped_executor' -from pdelfin.cappedpool import CappedProcessPoolExecutor +from olmocr.cappedpool import CappedProcessPoolExecutor # Define functions at the top level to ensure they are picklable by multiprocessing diff --git a/tests/test_coherency.py b/tests/test_coherency.py index 4edde00..d411d89 100644 --- a/tests/test_coherency.py +++ b/tests/test_coherency.py @@ -5,9 +5,9 @@ import time import unittest -from pdelfin.filter.coherency import get_document_coherency +from olmocr.filter.coherency import get_document_coherency -from pdelfin.prompts.anchor import get_anchor_text +from olmocr.prompts.anchor import get_anchor_text class TestCoherencyScores(unittest.TestCase): def testBadOcr1(self): diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index a74aee6..21396f3 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -5,14 +5,14 @@ from functools import partial from transformers import AutoProcessor -from pdelfin.train.dataloader import ( +from olmocr.train.dataloader import ( build_finetuning_dataset, extract_openai_batch_response, load_jsonl_into_ds, list_dataset_files ) -from pdelfin.train.dataprep import batch_prepare_data_for_qwen2_training +from olmocr.train.dataprep import batch_prepare_data_for_qwen2_training class TestBatchQueryResponseDataset(unittest.TestCase): diff --git a/tests/test_dataprep.py b/tests/test_dataprep.py index 5f0b272..8f0d01a 100644 --- a/tests/test_dataprep.py +++ b/tests/test_dataprep.py @@ 
-10,19 +10,19 @@ from PIL import Image from transformers import AutoProcessor from unittest.mock import patch -from pdelfin.train.dataloader import ( +from olmocr.train.dataloader import ( build_finetuning_dataset, ) -from pdelfin.train.dataprep import ( +from olmocr.train.dataprep import ( prepare_data_for_qwen2_training, build_finetuning_prompt, prepare_data_for_molmo_training, batch_prepare_data_for_molmo_training ) import numpy as np from tqdm import tqdm from torch.utils.data import DataLoader -from pdelfin.train.utils import make_dataset -from pdelfin.train.core.config import TrainConfig, DataConfig, SourceConfig +from olmocr.train.utils import make_dataset +from olmocr.train.core.config import TrainConfig, DataConfig, SourceConfig class TestDataprep(unittest.TestCase): def testFullDataloader(self): @@ -215,8 +215,8 @@ class TestMolmoDataPrep(unittest.TestCase): random.seed(42) # Mock the functions that require actual PDF files - with patch('pdelfin.prompts.anchor.get_anchor_text') as mock_get_anchor_text, \ - patch('pdelfin.data.renderpdf.render_pdf_to_base64png') as mock_render_pdf_to_base64png: + with patch('olmocr.prompts.anchor.get_anchor_text') as mock_get_anchor_text, \ + patch('olmocr.data.renderpdf.render_pdf_to_base64png') as mock_render_pdf_to_base64png: # Set return values for the mocked functions mock_get_anchor_text.return_value = "This is the anchor text." @@ -323,8 +323,8 @@ class TestMolmoDataPrep(unittest.TestCase): target_anchor_text_len = [0, 6000] # Mock the necessary functions - with patch('pdelfin.prompts.anchor.get_anchor_text') as mock_get_anchor_text, \ - patch('pdelfin.data.renderpdf.render_pdf_to_base64png') as mock_render_pdf_to_base64png: + with patch('olmocr.prompts.anchor.get_anchor_text') as mock_get_anchor_text, \ + patch('olmocr.data.renderpdf.render_pdf_to_base64png') as mock_render_pdf_to_base64png: mock_get_anchor_text.return_value = "This is the anchor text." 
img = Image.new('RGB', (100, 100), color='red') diff --git a/tests/test_filter.py b/tests/test_filter.py index 590c2de..f0f4d46 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -3,7 +3,7 @@ import unittest from pypdf import PdfReader -from pdelfin.filter import PdfFilter +from olmocr.filter import PdfFilter class PdfFilterTest(unittest.TestCase): diff --git a/tests/test_s3_work_queue.py b/tests/test_s3_work_queue.py index c814e0a..ecfd949 100644 --- a/tests/test_s3_work_queue.py +++ b/tests/test_s3_work_queue.py @@ -7,7 +7,7 @@ import hashlib from typing import List, Dict # Import the classes we're testing -from pdelfin.s3_queue import S3WorkQueue, WorkItem +from olmocr.s3_queue import S3WorkQueue, WorkItem class TestS3WorkQueue(unittest.TestCase): def setUp(self): @@ -70,8 +70,8 @@ class TestS3WorkQueue(unittest.TestCase): async def test_populate_queue_new_items(self): """Test populating queue with new items""" # Mock empty existing index - with patch('pdelfin.s3_queue.download_zstd_csv', return_value=[]): - with patch('pdelfin.s3_queue.upload_zstd_csv') as mock_upload: + with patch('olmocr.s3_queue.download_zstd_csv', return_value=[]): + with patch('olmocr.s3_queue.upload_zstd_csv') as mock_upload: await self.work_queue.populate_queue(self.sample_paths, items_per_group=2) # Verify upload was called with correct data @@ -97,8 +97,8 @@ class TestS3WorkQueue(unittest.TestCase): existing_hash = S3WorkQueue._compute_workgroup_hash(existing_paths) existing_line = f"{existing_hash},{existing_paths[0]}" - with patch('pdelfin.s3_queue.download_zstd_csv', return_value=[existing_line]): - with patch('pdelfin.s3_queue.upload_zstd_csv') as mock_upload: + with patch('olmocr.s3_queue.download_zstd_csv', return_value=[existing_line]): + with patch('olmocr.s3_queue.upload_zstd_csv') as mock_upload: await self.work_queue.populate_queue(existing_paths + new_paths, items_per_group=1) # Verify upload called with both existing and new items @@ -116,8 +116,8 @@ class 
TestS3WorkQueue(unittest.TestCase): completed_items = [f"s3://test-bucket/workspace/results/output_{work_hash}.jsonl"] - with patch('pdelfin.s3_queue.download_zstd_csv', return_value=[work_line]): - with patch('pdelfin.s3_queue.expand_s3_glob', return_value=completed_items): + with patch('olmocr.s3_queue.download_zstd_csv', return_value=[work_line]): + with patch('olmocr.s3_queue.expand_s3_glob', return_value=completed_items): await self.work_queue.initialize_queue() # Queue should be empty since all work is completed diff --git a/tests/test_sglang.py b/tests/test_sglang.py index e110579..5ca5b85 100644 --- a/tests/test_sglang.py +++ b/tests/test_sglang.py @@ -17,8 +17,8 @@ from io import BytesIO from PIL import Image from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration from pathlib import Path -from pdelfin.beakerpipeline import sglang_server_task, sglang_server_ready, build_page_query, SGLANG_SERVER_PORT, render_pdf_to_base64png, get_anchor_text, download_directory -from pdelfin.prompts import PageResponse +from olmocr.beakerpipeline import sglang_server_task, sglang_server_ready, build_page_query, SGLANG_SERVER_PORT, render_pdf_to_base64png, get_anchor_text, download_directory +from olmocr.prompts import PageResponse from httpx import AsyncClient import torch.nn.functional as F MODEL_FINETUNED_PATH = "s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/jakep/Qwen_Qwen2-VL-7B-Instruct-e4ecf8-01JAH8GMWHTJ376S2N7ETXRXH4/checkpoint-9500/bf16/" @@ -103,7 +103,7 @@ class TestSglangServer(unittest.IsolatedAsyncioTestCase): class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase): async def asyncSetUp(self): # Set up the Hugging Face model and tokenizer - model_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'pdelfin', 'model') + model_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'olmocr', 'model') download_directory([MODEL_FINETUNED_PATH], model_cache_dir) # Check the rope config and make sure it's 
got the proper key @@ -249,7 +249,7 @@ class TestHuggingFaceModel(unittest.IsolatedAsyncioTestCase): class RawSGLangTest(unittest.IsolatedAsyncioTestCase): def setUp(self): # Set up the Hugging Face model and tokenizer - model_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'pdelfin', 'model') + model_cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'olmocr', 'model') download_directory([MODEL_FINETUNED_PATH], model_cache_dir) # Check the rope config and make sure it's got the proper key