Validating that we get single pages

This commit is contained in:
Jake Poznanski 2025-06-11 18:14:36 +00:00
parent 0689676026
commit 9a390e3d58

View File

@@ -5,6 +5,8 @@ import base64
from io import BytesIO from io import BytesIO
from PIL import Image from PIL import Image
from torch.utils.data import Dataset from torch.utils.data import Dataset
from pypdf import PdfReader
from tqdm import tqdm
from olmocr.data.renderpdf import render_pdf_to_base64png from olmocr.data.renderpdf import render_pdf_to_base64png
@@ -27,7 +29,11 @@ class MarkdownPDFDocumentDataset(Dataset):
md_files = list(self.root_dir.rglob("*.md")) md_files = list(self.root_dir.rglob("*.md"))
# Verify each markdown file has a corresponding PDF # Verify each markdown file has a corresponding PDF
for md_path in md_files: valid_count = 0
invalid_pdfs = []
print(f"Validating {len(md_files)} markdown-PDF pairs...")
for md_path in tqdm(md_files, desc="Validating PDFs"):
# Look for PDF with same stem (filename without extension) # Look for PDF with same stem (filename without extension)
pdf_path = md_path.with_suffix('.pdf') pdf_path = md_path.with_suffix('.pdf')
@@ -38,12 +44,32 @@ class MarkdownPDFDocumentDataset(Dataset):
# Verify the resolved path exists # Verify the resolved path exists
if pdf_path.exists(): if pdf_path.exists():
self.samples.append({ # Validate PDF - check it loads and has exactly one page
'markdown_path': md_path, try:
'pdf_path': pdf_path reader = PdfReader(str(pdf_path))
}) num_pages = len(reader.pages)
if num_pages != 1:
invalid_pdfs.append((pdf_path, f"Expected 1 page, found {num_pages}"))
continue
self.samples.append({
'markdown_path': md_path,
'pdf_path': pdf_path
})
valid_count += 1
except Exception as e:
invalid_pdfs.append((pdf_path, f"Failed to load: {str(e)}"))
print(f"Found {len(self.samples)} valid markdown-PDF pairs") print(f"Found {valid_count} valid markdown-PDF pairs")
if invalid_pdfs:
print(f"\nWarning: {len(invalid_pdfs)} invalid PDFs found:")
for pdf_path, reason in invalid_pdfs[:5]: # Show first 5
print(f" - {pdf_path.name}: {reason}")
if len(invalid_pdfs) > 5:
print(f" ... and {len(invalid_pdfs) - 5} more")
def __len__(self) -> int: def __len__(self) -> int:
return len(self.samples) return len(self.samples)