mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-11-04 03:56:16 +00:00 
			
		
		
		
	Fixes to prevent errors later in dataloading
This commit is contained in:
		
							parent
							
								
									f13bcad943
								
							
						
					
					
						commit
						1c42a08d06
					
				@ -14,7 +14,7 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[
 | 
				
			|||||||
    :return: A dictionary containing MediaBox dimensions or None if not found
 | 
					    :return: A dictionary containing MediaBox dimensions or None if not found
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    # Construct the pdfinfo command to extract info for the specific page
 | 
					    # Construct the pdfinfo command to extract info for the specific page
 | 
				
			||||||
    command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path]
 | 
					    command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', '-enc', 'UTF-8', local_pdf_path]
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    # Run the command using subprocess
 | 
					    # Run the command using subprocess
 | 
				
			||||||
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
 | 
					    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
 | 
				
			||||||
 | 
				
			|||||||
@ -18,6 +18,7 @@ from .core.config import DataConfig, SourceConfig
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from pdelfin.prompts.anchor import get_anchor_text
 | 
					from pdelfin.prompts.anchor import get_anchor_text
 | 
				
			||||||
from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path
 | 
					from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path
 | 
				
			||||||
 | 
					from pdelfin.data.renderpdf import get_pdf_media_box_width_height
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Configure logging
 | 
					# Configure logging
 | 
				
			||||||
logging.basicConfig(level=logging.INFO)
 | 
					logging.basicConfig(level=logging.INFO)
 | 
				
			||||||
@ -161,6 +162,7 @@ def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Option
 | 
				
			|||||||
    def _can_create_anchor_text(example):
 | 
					    def _can_create_anchor_text(example):
 | 
				
			||||||
        try:
 | 
					        try:
 | 
				
			||||||
            anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000)
 | 
					            anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000)
 | 
				
			||||||
 | 
					            _ = get_pdf_media_box_width_height(example["local_pdf_path"], example["page_num"])
 | 
				
			||||||
            return anchor_text is not None
 | 
					            return anchor_text is not None
 | 
				
			||||||
        except:
 | 
					        except:
 | 
				
			||||||
            return False
 | 
					            return False
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user