Fixes to prevent errors later in dataloading

This commit is contained in:
Jake Poznanski 2024-10-17 02:28:43 +00:00
parent f13bcad943
commit 1c42a08d06
2 changed files with 3 additions and 1 deletions

View File

@ -14,7 +14,7 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[
:return: A dictionary containing MediaBox dimensions or None if not found
"""
# Construct the pdfinfo command to extract info for the specific page
command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path]
command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', '-enc', 'UTF-8', local_pdf_path]
# Run the command using subprocess
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

View File

@ -18,6 +18,7 @@ from .core.config import DataConfig, SourceConfig
from pdelfin.prompts.anchor import get_anchor_text
from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path
from pdelfin.data.renderpdf import get_pdf_media_box_width_height
# Configure logging: set up the root logger at import time so that INFO-level
# (and higher) messages are emitted. Note that logging.basicConfig has no
# effect if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)
@ -161,6 +162,7 @@ def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Option
def _can_create_anchor_text(example):
    """
    Report whether anchor text can be produced for the PDF page in ``example``.

    Besides generating the anchor text, this also calls
    get_pdf_media_box_width_height on the same page so that a page for which
    that call raises is rejected here rather than failing later.

    :param example: Mapping providing the "local_pdf_path" and "page_num" keys
    :return: True if neither call raised and the anchor text is not None,
             False otherwise
    """
    try:
        anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000)
        # The result is intentionally discarded; the call only matters if it raises.
        get_pdf_media_box_width_height(example["local_pdf_path"], example["page_num"])
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate and the run can be
        # interrupted, while any ordinary failure just rejects the example.
        return False
    return anchor_text is not None