mirror of
https://github.com/allenai/olmocr.git
synced 2025-06-27 04:00:02 +00:00
Ruff fixes to CI
This commit is contained in:
parent
1348a29ce8
commit
dc7cb5c8b5
@ -2,7 +2,6 @@ import argparse
|
|||||||
import asyncio
|
import asyncio
|
||||||
import glob
|
import glob
|
||||||
import importlib
|
import importlib
|
||||||
import inspect
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
@ -83,7 +82,7 @@ if __name__ == "__main__":
|
|||||||
"methods",
|
"methods",
|
||||||
nargs="+",
|
nargs="+",
|
||||||
help="Methods to run in the format method[:key=value ...]. "
|
help="Methods to run in the format method[:key=value ...]. "
|
||||||
"Example: gotocr mineru:temperature=2 marker:runs=3. "
|
"Example: gotocr mineru:temperature=2 marker:u=3. "
|
||||||
"Use 'name=folder_name' to specify a custom output folder name.",
|
"Use 'name=folder_name' to specify a custom output folder name.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
|
parser.add_argument("--repeats", type=int, default=1, help="Number of times to repeat the conversion for each PDF.")
|
||||||
|
@ -7,7 +7,6 @@ from collections import Counter
|
|||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
|
|
||||||
import syntok.segmenter as segmenter
|
import syntok.segmenter as segmenter
|
||||||
import syntok.tokenizer as tokenizer
|
|
||||||
from google import genai
|
from google import genai
|
||||||
from google.genai import types
|
from google.genai import types
|
||||||
|
|
||||||
|
@ -1,12 +1,7 @@
|
|||||||
import argparse
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import tempfile
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from functools import partial
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
# Import necessary components from olmocr
|
# Import necessary components from olmocr
|
||||||
from olmocr.pipeline import (
|
from olmocr.pipeline import (
|
||||||
@ -58,13 +53,13 @@ async def run_olmocr(pdf_path: str, page_num: int = 1, temperature: float = 0.8)
|
|||||||
semaphore = asyncio.Semaphore(1)
|
semaphore = asyncio.Semaphore(1)
|
||||||
|
|
||||||
# Ensure server is running
|
# Ensure server is running
|
||||||
server_task = None
|
_server_task = None
|
||||||
try:
|
try:
|
||||||
await asyncio.wait_for(sglang_server_ready(), timeout=5)
|
await asyncio.wait_for(sglang_server_ready(), timeout=5)
|
||||||
print("Using existing sglang server")
|
print("Using existing sglang server")
|
||||||
except Exception:
|
except Exception:
|
||||||
print("Starting new sglang server")
|
print("Starting new sglang server")
|
||||||
server_task = asyncio.create_task(sglang_server_host(args, semaphore))
|
_server_task = asyncio.create_task(sglang_server_host(args, semaphore))
|
||||||
await sglang_server_ready()
|
await sglang_server_ready()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -4,7 +4,7 @@ from typing import Type
|
|||||||
import regex as re
|
import regex as re
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from .aligners import BaseAligner
|
from .aligners import BaseAligner, AlignerRegistry
|
||||||
from .registry import BaseRegistry
|
from .registry import BaseRegistry
|
||||||
from .segmenters import BaseSegmenter, SegmenterRegistry
|
from .segmenters import BaseSegmenter, SegmenterRegistry
|
||||||
|
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
import re
|
|
||||||
from typing import Type
|
from typing import Type
|
||||||
|
|
||||||
import torch
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
from .registry import BaseRegistry
|
from .registry import BaseRegistry
|
||||||
|
@ -50,7 +50,7 @@ def query_infinigram(ngram, index="v4_rpj_llama_s4", retries=3):
|
|||||||
result = response.json()
|
result = response.json()
|
||||||
if "count" in result:
|
if "count" in result:
|
||||||
return result["count"]
|
return result["count"]
|
||||||
except Exception as e: # type: ignore
|
except Exception: # type: ignore
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user