mirror of
https://github.com/allenai/olmocr.git
synced 2025-09-11 09:40:11 +00:00
Fixes to prevent errors later in dataloading
This commit is contained in:
parent
f13bcad943
commit
1c42a08d06
@ -14,7 +14,7 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[
|
|||||||
:return: A dictionary containing MediaBox dimensions or None if not found
|
:return: A dictionary containing MediaBox dimensions or None if not found
|
||||||
"""
|
"""
|
||||||
# Construct the pdfinfo command to extract info for the specific page
|
# Construct the pdfinfo command to extract info for the specific page
|
||||||
command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path]
|
command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', '-enc', 'UTF-8', local_pdf_path]
|
||||||
|
|
||||||
# Run the command using subprocess
|
# Run the command using subprocess
|
||||||
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||||
|
@ -18,6 +18,7 @@ from .core.config import DataConfig, SourceConfig
|
|||||||
|
|
||||||
from pdelfin.prompts.anchor import get_anchor_text
|
from pdelfin.prompts.anchor import get_anchor_text
|
||||||
from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path
|
from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path
|
||||||
|
from pdelfin.data.renderpdf import get_pdf_media_box_width_height
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
@ -161,6 +162,7 @@ def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Option
|
|||||||
def _can_create_anchor_text(example):
|
def _can_create_anchor_text(example):
|
||||||
try:
|
try:
|
||||||
anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000)
|
anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000)
|
||||||
|
_ = get_pdf_media_box_width_height(example["local_pdf_path"], example["page_num"])
|
||||||
return anchor_text is not None
|
return anchor_text is not None
|
||||||
except:
|
except:
|
||||||
return False
|
return False
|
||||||
|
Loading…
x
Reference in New Issue
Block a user