mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-31 10:04:26 +00:00 
			
		
		
		
	Fixes to prevent errors later in dataloading
This commit is contained in:
		
							parent
							
								
									f13bcad943
								
							
						
					
					
						commit
						1c42a08d06
					
				| @ -14,7 +14,7 @@ def get_pdf_media_box_width_height(local_pdf_path: str, page_num: int) -> tuple[ | |||||||
|     :return: A dictionary containing MediaBox dimensions or None if not found |     :return: A dictionary containing MediaBox dimensions or None if not found | ||||||
|     """ |     """ | ||||||
|     # Construct the pdfinfo command to extract info for the specific page |     # Construct the pdfinfo command to extract info for the specific page | ||||||
|     command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', local_pdf_path] |     command = ['pdfinfo', '-f', str(page_num), '-l', str(page_num), '-box', '-enc', 'UTF-8', local_pdf_path] | ||||||
|      |      | ||||||
|     # Run the command using subprocess |     # Run the command using subprocess | ||||||
|     result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) |     result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) | ||||||
|  | |||||||
| @ -18,6 +18,7 @@ from .core.config import DataConfig, SourceConfig | |||||||
| 
 | 
 | ||||||
| from pdelfin.prompts.anchor import get_anchor_text | from pdelfin.prompts.anchor import get_anchor_text | ||||||
| from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path | from pdelfin.s3_utils import parse_custom_id, get_s3_bytes, parse_s3_path | ||||||
|  | from pdelfin.data.renderpdf import get_pdf_media_box_width_height | ||||||
| 
 | 
 | ||||||
| # Configure logging | # Configure logging | ||||||
| logging.basicConfig(level=logging.INFO) | logging.basicConfig(level=logging.INFO) | ||||||
| @ -161,6 +162,7 @@ def build_finetuning_dataset(response_glob_path: str, pdf_cache_location: Option | |||||||
|     def _can_create_anchor_text(example): |     def _can_create_anchor_text(example): | ||||||
|         try: |         try: | ||||||
|             anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000) |             anchor_text = get_anchor_text(example["local_pdf_path"], example["page_num"], pdf_engine="pdfreport", target_length=4000) | ||||||
|  |             _ = get_pdf_media_box_width_height(example["local_pdf_path"], example["page_num"]) | ||||||
|             return anchor_text is not None |             return anchor_text is not None | ||||||
|         except: |         except: | ||||||
|             return False |             return False | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Jake Poznanski
						Jake Poznanski