mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-15 09:59:09 +00:00
Adding paddlepaddle v5 runner for benchmarking
This commit is contained in:
parent
1c703917df
commit
e9ab2fd1bb
@ -231,6 +231,7 @@ if __name__ == "__main__":
|
|||||||
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
|
"mistral": ("olmocr.bench.runners.run_mistral", "run_mistral"),
|
||||||
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
|
"docling": ("olmocr.bench.runners.run_docling", "run_docling"),
|
||||||
"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
|
"rolmocr": ("olmocr.bench.runners.run_rolmocr", "run_rolmocr"),
|
||||||
|
"paddlepaddle": ("olmocr.bench.runners.run_paddlepaddle", "run_paddlepaddle"),
|
||||||
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
|
"transformers": ("olmocr.bench.runners.run_transformers", "run_transformers"),
|
||||||
"server": ("olmocr.bench.runners.run_server", "run_server"),
|
"server": ("olmocr.bench.runners.run_server", "run_server"),
|
||||||
}
|
}
|
||||||
|
|||||||
32
olmocr/bench/runners/run_paddlepaddle.py
Normal file
32
olmocr/bench/runners/run_paddlepaddle.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
from paddleocr import PPStructureV3
|
||||||
|
|
||||||
|
|
||||||
|
# Runs PaddlePaddle as in the docs here: https://huggingface.co/PaddlePaddle/PP-OCRv5_server_det
|
||||||
|
# text_detection_model_name="PP-OCRv5_server_det",
|
||||||
|
# and using the PP-StructureV3 pipeline to create markdown
|
||||||
|
|
||||||
|
|
||||||
|
def run_paddlepaddle(
    pdf_path: str,
    page_num: int = 1,
    **kwargs
) -> str:
    """Convert one page of a PDF to markdown using the PP-StructureV3 pipeline.

    Args:
        pdf_path: Path to the PDF; handed unchanged to ``pipeline.predict``.
        page_num: 1-indexed number of the page whose markdown is returned.
        **kwargs: Ignored by this runner.

    Returns:
        The markdown produced for the requested page.

    Raises:
        ValueError: If ``page_num`` is less than 1, or if the pipeline does
            not yield a result for that page.
    """
    if page_num < 1:
        # Pages are 1-indexed, so this can never match a result. Fail before
        # paying for model loading and a full OCR pass over the document.
        raise ValueError(f"Did not get markdown for page {page_num}")

    pipeline = PPStructureV3(
        text_detection_model_name="PP-OCRv5_server_det",
        use_doc_orientation_classify=False,  # document orientation classification model off
        use_doc_unwarping=False,  # document unwarping module off
        use_textline_orientation=False,  # textline orientation classification model off
        device="gpu:0",  # run model inference on the first GPU
    )

    # Results are enumerated in order and matched against the 0-indexed
    # equivalent of page_num.
    for cur_page_0_indexed, res in enumerate(pipeline.predict(pdf_path)):
        if cur_page_0_indexed != page_num - 1:
            continue

        markdown = res.markdown
        # NOTE(review): the PP-StructureV3 docs show `markdown` as a dict with
        # the page text under "markdown_texts" - confirm against the installed
        # paddleocr version. A plain string is still returned unchanged.
        if isinstance(markdown, dict):
            return markdown["markdown_texts"]
        return markdown

    raise ValueError(f"Did not get markdown for page {page_num}")
|
||||||
Loading…
x
Reference in New Issue
Block a user