mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-02 02:54:53 +00:00
Adding paddlepaddle v5 runner for benchmarking
This commit is contained in:
parent
1c703917df
commit
e9ab2fd1bb
@ -231,6 +231,7 @@ if __name__ == "__main__":
|
||||
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
|
||||
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
|
||||
"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
|
||||
"paddlepaddle": ("olmocr.bench.runners.run_paddlepaddle", "run_paddlepaddle"),
|
||||
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
|
||||
"server": ("olmocr.bench.runners.run_server", "run_server"),
|
||||
}
|
||||
|
||||
32
olmocr/bench/runners/run_paddlepaddle.py
Normal file
32
olmocr/bench/runners/run_paddlepaddle.py
Normal file
@ -0,0 +1,32 @@
|
||||
import json
|
||||
import os
|
||||
from typing import Literal
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
|
||||
# Runs PaddlePaddle as in the docs here: https://huggingface.co/PaddlePaddle/PP-OCRv5_server_det
|
||||
# text_detection_model_name="PP-OCRv5_server_det",
|
||||
# and using the PP-StructureV3 pipeline to create markdown
|
||||
|
||||
|
||||
def run_paddlepaddle(
    pdf_path: str,
    page_num: int = 1,
    **kwargs,
) -> str:
    """Convert one page of a PDF to markdown using PaddleOCR's PP-StructureV3 pipeline.

    Args:
        pdf_path: Path of the PDF passed to ``PPStructureV3.predict``.
        page_num: 1-indexed number of the page whose markdown is returned.
        **kwargs: Accepted but ignored by this runner.

    Returns:
        The markdown text produced for the requested page.

    Raises:
        ValueError: If ``page_num`` is below 1, or the pipeline yields no
            result for that page.
    """
    # Pages are 1-indexed, so anything below 1 can never match. Fail before
    # paying for model construction and a full prediction pass.
    if page_num < 1:
        raise ValueError(f"Did not get markdown for page {page_num}")

    pipeline = PPStructureV3(
        text_detection_model_name="PP-OCRv5_server_det",
        use_doc_orientation_classify=False,  # document orientation classification model disabled
        use_doc_unwarping=False,  # document unwarping module disabled
        use_textline_orientation=False,  # textline orientation classification model disabled
        device="gpu:0",  # run model inference on the first GPU
    )

    # The pipeline yields one result per page, in page order.
    for current_page, res in enumerate(pipeline.predict(pdf_path), start=1):
        if current_page != page_num:
            continue

        md_info = res.markdown
        # NOTE(review): PaddleOCR's PP-StructureV3 examples treat ``res.markdown``
        # as a dict holding the text under "markdown_texts" (next to
        # "markdown_images") -- confirm against the installed version. Unwrap
        # it so the declared ``str`` return type holds; any other shape is
        # returned unchanged.
        if isinstance(md_info, dict) and "markdown_texts" in md_info:
            return md_info["markdown_texts"]
        return md_info

    raise ValueError(f"Did not get markdown for page {page_num}")
|
||||
Loading…
x
Reference in New Issue
Block a user