mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-09 08:40:28 +00:00
Fixes to prevent errors later in dataloading
This commit is contained in:
parent
f13bcad943
commit
1c42a08d06
@ -14,7 +14,7 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[
|
||||
:return: A dictionary containing MediaBox dimensions or None if not found
|
||||
"""
|
||||
# Construct the pdfinfo command to extract info for the specific page
|
||||
command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path]
|
||||
command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', '-enc', 'UTF-8', local_pdf_path]
|
||||
|
||||
# Run the command using subprocess
|
||||
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
|
@ -18,6 +18,7 @@ from .core.config import DataConfig, SourceConfig
|
||||
|
||||
from pdelfin.prompts.anchor import get_anchor_text
|
||||
from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path
|
||||
from pdelfin.data.renderpdf import get_pdf_media_box_width_height
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@ -161,6 +162,7 @@ def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Option
|
||||
def _can_create_anchor_text(example):
    """Return True when anchor text can be produced for ``example``.

    Used as a predicate over dataset rows: a row is kept only if both the
    anchor-text extraction and the MediaBox lookup succeed for its page.

    :param example: Mapping with at least the keys ``"local_pdf_path"`` and
        ``"page_num"``, which are forwarded to the PDF helpers.
    :return: ``True`` if ``get_anchor_text`` returned a non-None value and
        ``get_pdf_media_box_width_height`` did not raise; ``False`` if either
        call raised an ordinary exception or the anchor text was ``None``.
    """
    try:
        anchor_text = get_anchor_text(
            example["local_pdf_path"],
            example["page_num"],
            pdf_engine="pdfreport",
            target_length=4000,
        )
        # Called only so that a failure surfaces here; the returned dimensions
        # are not needed. NOTE(review): presumably this pre-screens pages whose
        # MediaBox lookup would otherwise fail later in dataloading - confirm.
        get_pdf_media_box_width_height(example["local_pdf_path"], example["page_num"])
    except Exception:
        # Deliberately best-effort: any ordinary failure marks the example as
        # unusable. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate so a long-running filter
        # can still be interrupted.
        return False

    return anchor_text is not None
|
||||
|
Loading…
x
Reference in New Issue
Block a user