This commit is contained in:
Jake Poznanski 2025-10-09 22:12:19 +00:00
parent 702c42f8e7
commit 875337f962
11 changed files with 8 additions and 25 deletions

View File

@ -22,7 +22,6 @@ import threading
import unittest
import weakref
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from dataclasses import dataclass
from typing import List, Optional

View File

@ -16,7 +16,6 @@ import argparse
import json
import os
import random
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional

View File

@ -1,6 +1,5 @@
from threading import Lock
import torch
from paddleocr import PPStructureV3
# Runs PaddlePaddle as in the docs here: https://huggingface.co/PaddlePaddle/PP-OCRv5_server_det

View File

@ -20,7 +20,7 @@ from playwright.async_api import async_playwright
from syntok.segmenter import process
from tqdm import tqdm
from olmocr.bench.tests import TableTest, TestType, load_single_test, parse_html_tables
from olmocr.bench.tests import TableTest, TestType, parse_html_tables
from olmocr.data.renderpdf import (
get_png_dimensions_from_base64,
render_pdf_to_base64png,

View File

@ -13,17 +13,12 @@ The script:
"""
import argparse
import json
import os
import random
import re
import shutil
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from typing import List, Optional
import pypdf
from pypdf import PageObject, Transformation
from tqdm import tqdm

View File

@ -11,7 +11,7 @@ import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Tuple
from openai import OpenAI
from pydantic import BaseModel, Field

View File

@ -9,12 +9,11 @@
import argparse
import csv
import hashlib
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from typing import Dict, Optional, Set, Tuple
import img2pdf
import requests

View File

@ -16,10 +16,9 @@ import json
import tarfile
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple
from typing import Dict, Iterator, List, Optional
import pandas as pd
import yaml
from tqdm import tqdm
from olmocr.prompts import PageResponse

View File

@ -32,14 +32,12 @@ from pypdf import PdfReader
from torch.utils.data import Dataset
from tqdm import tqdm
from olmocr.bench.katex.render import render_equation
from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import (
PageResponse,
build_finetuning_prompt,
build_no_anchoring_v4_yaml_prompt,
build_no_anchoring_yaml_prompt,
)
# Type alias for samples

View File

@ -3,27 +3,23 @@ GRPO (Group Relative Policy Optimization) training script for OlmOCR.
"""
import argparse
import asyncio
import base64
import glob
import json
import logging
import os
import random
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache, partial
from functools import lru_cache
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import torch
import torch.distributed as dist
import wandb
from PIL import Image
from rapidfuzz import fuzz
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import Dataset
from transformers import (
AutoProcessor,
Qwen2_5_VLForConditionalGeneration,

View File

@ -25,7 +25,6 @@
import argparse
import json
import os
import re
import sqlite3
from pathlib import Path