diff --git a/pdelfin/data/renderpdf.py b/pdelfin/data/renderpdf.py index 2987411..b3bbec3 100644 --- a/pdelfin/data/renderpdf.py +++ b/pdelfin/data/renderpdf.py @@ -14,7 +14,7 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[ :return: A dictionary containing MediaBox dimensions or None if not found """ # Construct the pdfinfo command to extract info for the specific page - command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path] + command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', '-enc', 'UTF-8', local_pdf_path] # Run the command using subprocess result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) diff --git a/pdelfin/train/dataloader.py b/pdelfin/train/dataloader.py index c6dfbca..f5f4364 100644 --- a/pdelfin/train/dataloader.py +++ b/pdelfin/train/dataloader.py @@ -18,6 +18,7 @@ from .core.config import DataConfig, SourceConfig from pdelfin.prompts.anchor import get_anchor_text from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path +from pdelfin.data.renderpdf import get_pdf_media_box_width_height # Configure logging logging.basicConfig(level=logging.INFO) @@ -161,6 +162,7 @@ def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Option def _can_create_anchor_text(example): try: anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000) + _ = get_pdf_media_box_width_height(example["local_pdf_path"], example["page_num"]) return anchor_text is not None except: return False