This commit is contained in:
Jake Poznanski 2025-10-09 22:12:19 +00:00
parent 702c42f8e7
commit 875337f962
11 changed files with 8 additions and 25 deletions

View File

@ -22,7 +22,6 @@ import threading
import unittest import unittest
import weakref import weakref
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional from typing import List, Optional

View File

@ -16,7 +16,6 @@ import argparse
import json import json
import os import os
import random import random
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional from typing import Optional

View File

@ -1,6 +1,5 @@
from threading import Lock from threading import Lock
import torch
from paddleocr import PPStructureV3 from paddleocr import PPStructureV3
# Run's paddle paddle as in the docs here: https://huggingface.co/PaddlePaddle/PP-OCRv5_server_det # Run's paddle paddle as in the docs here: https://huggingface.co/PaddlePaddle/PP-OCRv5_server_det

View File

@ -20,7 +20,7 @@ from playwright.async_api import async_playwright
from syntok.segmenter import process from syntok.segmenter import process
from tqdm import tqdm from tqdm import tqdm
from olmocr.bench.tests import TableTest, TestType, load_single_test, parse_html_tables from olmocr.bench.tests import TableTest, TestType, parse_html_tables
from olmocr.data.renderpdf import ( from olmocr.data.renderpdf import (
get_png_dimensions_from_base64, get_png_dimensions_from_base64,
render_pdf_to_base64png, render_pdf_to_base64png,

View File

@ -13,17 +13,12 @@ The script:
""" """
import argparse import argparse
import json
import os import os
import random import random
import re
import shutil import shutil
import subprocess from typing import List, Optional
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pypdf import pypdf
from pypdf import PageObject, Transformation
from tqdm import tqdm from tqdm import tqdm

View File

@ -11,7 +11,7 @@ import sys
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Tuple
from openai import OpenAI from openai import OpenAI
from pydantic import BaseModel, Field from pydantic import BaseModel, Field

View File

@ -9,12 +9,11 @@
import argparse import argparse
import csv import csv
import hashlib import hashlib
import os
import threading import threading
import time import time
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple from typing import Dict, Optional, Set, Tuple
import img2pdf import img2pdf
import requests import requests

View File

@ -16,10 +16,9 @@ import json
import tarfile import tarfile
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple from typing import Dict, Iterator, List, Optional
import pandas as pd import pandas as pd
import yaml
from tqdm import tqdm from tqdm import tqdm
from olmocr.prompts import PageResponse from olmocr.prompts import PageResponse

View File

@ -32,14 +32,12 @@ from pypdf import PdfReader
from torch.utils.data import Dataset from torch.utils.data import Dataset
from tqdm import tqdm from tqdm import tqdm
from olmocr.bench.katex.render import render_equation
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png
from olmocr.prompts.anchor import get_anchor_text from olmocr.prompts.anchor import get_anchor_text
from olmocr.prompts.prompts import ( from olmocr.prompts.prompts import (
PageResponse, PageResponse,
build_finetuning_prompt, build_finetuning_prompt,
build_no_anchoring_v4_yaml_prompt, build_no_anchoring_v4_yaml_prompt,
build_no_anchoring_yaml_prompt,
) )
# Type alias for samples # Type alias for samples

View File

@ -3,27 +3,23 @@ GRPO (Generative Reward-based Policy Optimization) training script for OlmOCR.
""" """
import argparse import argparse
import asyncio
import base64 import base64
import glob import glob
import json import json
import logging import logging
import os import os
import random
import sys import sys
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache, partial from functools import lru_cache
from io import BytesIO from io import BytesIO
from pathlib import Path from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Set, Tuple
import numpy as np
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import wandb import wandb
from PIL import Image from PIL import Image
from rapidfuzz import fuzz from rapidfuzz import fuzz
from torch.utils.data import DataLoader, Dataset from torch.utils.data import Dataset
from transformers import ( from transformers import (
AutoProcessor, AutoProcessor,
Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration,

View File

@ -25,7 +25,6 @@
import argparse import argparse
import json import json
import os
import re import re
import sqlite3 import sqlite3
from pathlib import Path from pathlib import Path