mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-31 01:55:06 +00:00 
			
		
		
		
	Fixes to prevent errors later in dataloading
This commit is contained in:
		
							parent
							
								
									f13bcad943
								
							
						
					
					
						commit
						1c42a08d06
					
				| @ -14,7 +14,7 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[ | ||||
|     :return: A dictionary containing MediaBox dimensions or None if not found | ||||
|     """ | ||||
|     # Construct the pdfinfo command to extract info for the specific page | ||||
|     command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path] | ||||
|     command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', '-enc', 'UTF-8', local_pdf_path] | ||||
|      | ||||
|     # Run the command using subprocess | ||||
|     result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) | ||||
|  | ||||
| @ -18,6 +18,7 @@ from .core.config import DataConfig, SourceConfig | ||||
| 
 | ||||
| from pdelfin.prompts.anchor import get_anchor_text | ||||
| from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path | ||||
| from pdelfin.data.renderpdf import get_pdf_media_box_width_height | ||||
| 
 | ||||
| # Configure logging | ||||
| logging.basicConfig(level=logging.INFO) | ||||
| @ -161,6 +162,7 @@ def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Option | ||||
|     def _can_create_anchor_text(example): | ||||
|         try: | ||||
|             anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000) | ||||
|             _ = get_pdf_media_box_width_height(example["local_pdf_path"], example["page_num"]) | ||||
|             return anchor_text is not None | ||||
|         except: | ||||
|             return False | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Jake Poznanski
						Jake Poznanski