mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-08 14:40:24 +00:00
Lints
This commit is contained in:
parent
702c42f8e7
commit
875337f962
@ -22,7 +22,6 @@ import threading
|
|||||||
import unittest
|
import unittest
|
||||||
import weakref
|
import weakref
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from contextlib import contextmanager
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,6 @@ import argparse
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import shutil
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
from threading import Lock
|
from threading import Lock
|
||||||
|
|
||||||
import torch
|
|
||||||
from paddleocr import PPStructureV3
|
from paddleocr import PPStructureV3
|
||||||
|
|
||||||
# Run's paddle paddle as in the docs here: https://huggingface.co/PaddlePaddle/PP-OCRv5_server_det
|
# Run's paddle paddle as in the docs here: https://huggingface.co/PaddlePaddle/PP-OCRv5_server_det
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from playwright.async_api import async_playwright
|
|||||||
from syntok.segmenter import process
|
from syntok.segmenter import process
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from olmocr.bench.tests import TableTest, TestType, load_single_test, parse_html_tables
|
from olmocr.bench.tests import TableTest, TestType, parse_html_tables
|
||||||
from olmocr.data.renderpdf import (
|
from olmocr.data.renderpdf import (
|
||||||
get_png_dimensions_from_base64,
|
get_png_dimensions_from_base64,
|
||||||
render_pdf_to_base64png,
|
render_pdf_to_base64png,
|
||||||
|
|||||||
@ -13,17 +13,12 @@ The script:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
from typing import List, Optional
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, List, Optional, Tuple
|
|
||||||
|
|
||||||
import pypdf
|
import pypdf
|
||||||
from pypdf import PageObject, Transformation
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import sys
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|||||||
@ -9,12 +9,11 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import csv
|
import csv
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, List, Optional, Set, Tuple
|
from typing import Dict, Optional, Set, Tuple
|
||||||
|
|
||||||
import img2pdf
|
import img2pdf
|
||||||
import requests
|
import requests
|
||||||
|
|||||||
@ -16,10 +16,9 @@ import json
|
|||||||
import tarfile
|
import tarfile
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Iterator, List, Optional, Tuple
|
from typing import Dict, Iterator, List, Optional
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import yaml
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from olmocr.prompts import PageResponse
|
from olmocr.prompts import PageResponse
|
||||||
|
|||||||
@ -32,14 +32,12 @@ from pypdf import PdfReader
|
|||||||
from torch.utils.data import Dataset
|
from torch.utils.data import Dataset
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from olmocr.bench.katex.render import render_equation
|
|
||||||
from olmocr.data.renderpdf import render_pdf_to_base64png
|
from olmocr.data.renderpdf import render_pdf_to_base64png
|
||||||
from olmocr.prompts.anchor import get_anchor_text
|
from olmocr.prompts.anchor import get_anchor_text
|
||||||
from olmocr.prompts.prompts import (
|
from olmocr.prompts.prompts import (
|
||||||
PageResponse,
|
PageResponse,
|
||||||
build_finetuning_prompt,
|
build_finetuning_prompt,
|
||||||
build_no_anchoring_v4_yaml_prompt,
|
build_no_anchoring_v4_yaml_prompt,
|
||||||
build_no_anchoring_yaml_prompt,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Type alias for samples
|
# Type alias for samples
|
||||||
|
|||||||
@ -3,27 +3,23 @@ GRPO (Generative Reward-based Policy Optimization) training script for OlmOCR.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
|
||||||
import base64
|
import base64
|
||||||
import glob
|
import glob
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
|
||||||
import sys
|
import sys
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from functools import lru_cache, partial
|
from functools import lru_cache
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
import wandb
|
import wandb
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from rapidfuzz import fuzz
|
from rapidfuzz import fuzz
|
||||||
from torch.utils.data import DataLoader, Dataset
|
from torch.utils.data import Dataset
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AutoProcessor,
|
AutoProcessor,
|
||||||
Qwen2_5_VLForConditionalGeneration,
|
Qwen2_5_VLForConditionalGeneration,
|
||||||
|
|||||||
@ -25,7 +25,6 @@
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user