Viewer and gitignore

This commit is contained in:
Jake Poznanski 2025-01-29 11:46:46 -08:00
parent 86267d865f
commit b574766977
2 changed files with 121 additions and 47 deletions

1
.gitignore vendored
View File

@ -10,6 +10,7 @@ s2orc_previews_3200/*
sample200_vllm/* sample200_vllm/*
sample200_sglang/* sample200_sglang/*
pdelfin_testset/* pdelfin_testset/*
localworkspace/*
/*.html /*.html
scoreelo.csv scoreelo.csv
debug.log debug.log

View File

@ -4,6 +4,7 @@ import html
import argparse import argparse
import boto3 import boto3
import tempfile import tempfile
import glob
from botocore.exceptions import NoCredentialsError, PartialCredentialsError from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from jinja2 import Template from jinja2 import Template
import smart_open import smart_open
@ -14,19 +15,29 @@ import markdown2
from olmocr.s3_utils import get_s3_bytes, parse_s3_path from olmocr.s3_utils import get_s3_bytes, parse_s3_path
from olmocr.data.renderpdf import render_pdf_to_base64webp from olmocr.data.renderpdf import render_pdf_to_base64webp
def read_jsonl(paths):
    """
    Generator that yields whitespace-stripped lines from one or more JSONL files.

    Each path is opened through ``smart_open`` (the original docstring states
    that both local and S3 paths are supported).

    Args:
        paths: An iterable of path strings. A single string is also accepted
            and treated as a one-element list.

    Yields:
        Each line of each file, with leading/trailing whitespace removed.
        Blank lines are yielded as empty strings; callers filter them.

    A file that cannot be opened or read is reported on stdout and skipped,
    so one bad input does not stop the remaining files from being processed.
    """
    if isinstance(paths, str):
        # Iterating a bare string would walk it character by character and
        # try to open each character as a file.
        paths = [paths]

    for path in paths:
        try:
            with smart_open.smart_open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    yield line.strip()
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error reading {path}: {e}")
def generate_presigned_url(s3_client, bucket_name, key_name, expires_in=3600 * 24 * 7 - 100):
    """
    Return a pre-signed GET URL for an S3 object, or None on credential errors.

    Args:
        s3_client: Object exposing ``generate_presigned_url`` (a boto3 S3 client
            in this file).
        bucket_name: Name of the bucket holding the object.
        key_name: Key of the object inside the bucket.
        expires_in: Link lifetime in seconds. Defaults to one week minus 100
            seconds (presumably to stay just under a 7-day maximum for
            pre-signed URLs; confirm against the S3 documentation).

    Returns:
        The URL string produced by the client, or None when AWS credentials
        are missing or incomplete (the error is printed to stdout).
    """
    try:
        return s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucket_name, 'Key': key_name},
            ExpiresIn=expires_in,
        )
    except (NoCredentialsError, PartialCredentialsError) as e:
        print(f"Error generating presigned URL: {e}")
        return None
def process_document(data, s3_client, template, output_dir): def process_document(data, s3_client, template, output_dir):
@ -38,8 +49,13 @@ def process_document(data, s3_client, template, output_dir):
source_file = metadata.get('Source-File') source_file = metadata.get('Source-File')
# Generate base64 image of the corresponding PDF page # Generate base64 image of the corresponding PDF page
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf") local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False)
local_pdf.write(get_s3_bytes(s3_client, source_file)) try:
pdf_bytes = get_s3_bytes(s3_client, source_file)
if pdf_bytes is None:
print(f"Failed to retrieve PDF from {source_file}")
return
local_pdf.write(pdf_bytes)
local_pdf.flush() local_pdf.flush()
pages = [] pages = []
@ -55,7 +71,12 @@ def process_document(data, s3_client, template, output_dir):
pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image}) pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image})
except Exception as e:
print(f"Error processing document ID {id_}: {e}")
return
finally:
local_pdf.close() local_pdf.close()
os.unlink(local_pdf.name)
# Generate pre-signed URL if source_file is an S3 path # Generate pre-signed URL if source_file is an S3 path
s3_link = None s3_link = None
@ -64,49 +85,101 @@ def process_document(data, s3_client, template, output_dir):
s3_link = generate_presigned_url(s3_client, bucket_name, key_name) s3_link = generate_presigned_url(s3_client, bucket_name, key_name)
# Render the HTML using the Jinja template # Render the HTML using the Jinja template
try:
html_content = template.render(id=id_, pages=pages, s3_link=s3_link) html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
except Exception as e:
print(f"Error rendering HTML for document ID {id_}: {e}")
return
# Write the HTML content to a file # Write the HTML content to a file
filename = f'{source_file.replace("s3://", "").replace("/", "_").replace(".", "_")}.html' try:
safe_source = source_file.replace("s3://", "").replace("/", "_").replace(".", "_") if source_file else f"id_{id_}"
filename = f'{safe_source}.html'
filepath = os.path.join(output_dir, filename) filepath = os.path.join(output_dir, filename)
with open(filepath, 'w', encoding='utf-8') as f: with open(filepath, 'w', encoding='utf-8') as f:
f.write(html_content) f.write(html_content)
except Exception as e:
print(f"Error writing HTML file for document ID {id_}: {e}")
def _expand_jsonl_paths(jsonl_paths):
    """Expand local glob patterns into file paths; keep ``s3://`` paths as given."""
    expanded_paths = []
    for path in jsonl_paths:
        if path.startswith('s3://'):
            # Globbing only applies to the local filesystem.
            expanded_paths.append(path)
            continue
        matched = glob.glob(path)
        if not matched:
            print(f"No files matched the pattern: {path}")
        expanded_paths.extend(matched)
    return expanded_paths


def main(jsonl_paths, output_dir, template_path, s3_profile_name=None):
    """
    Render one HTML preview page per JSONL record into ``output_dir``.

    Steps: create the output directory, expand the input paths, load the
    Jinja template, create an S3 client, then submit every parsed JSONL
    record to ``process_document`` on a thread pool.

    Args:
        jsonl_paths: Iterable of JSONL paths; local entries may be glob
            patterns, ``s3://`` entries are used verbatim.
        output_dir: Directory that receives the generated HTML files
            (created if missing).
        template_path: Template file path, resolved relative to the
            directory containing this module.
        s3_profile_name: Profile name passed to ``boto3.Session``; None
            lets boto3 pick its default credentials.

    Returns:
        None. Setup failures (no inputs, template or S3 client errors) are
        printed and end the run early instead of raising.
    """
    os.makedirs(output_dir, exist_ok=True)

    expanded_paths = _expand_jsonl_paths(jsonl_paths)
    if not expanded_paths:
        print("No JSONL files to process.")
        return

    # Load the Jinja template
    try:
        template_file_path = os.path.join(os.path.dirname(__file__), template_path)
        with open(template_file_path, 'r', encoding='utf-8') as template_file:
            template = Template(template_file.read())
    except Exception as e:
        print(f"Error loading template: {e}")
        return

    # Initialize S3 client for generating presigned URLs
    try:
        workspace_session = boto3.Session(profile_name=s3_profile_name)
        s3_client = workspace_session.client("s3")
    except Exception as e:
        print(f"Error initializing S3 client: {e}")
        return

    with ThreadPoolExecutor() as executor:
        futures = []
        for line in read_jsonl(expanded_paths):
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Invalid JSON line: {e}")
                continue
            futures.append(executor.submit(process_document, data, s3_client, template, output_dir))

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"):
            try:
                # Surface exceptions that escaped the worker instead of
                # dropping them silently; keep going with the other documents.
                future.result()
            except Exception as e:
                print(f"An error occurred: {e}")

    # Report the directory this call wrote to (the function's own parameter,
    # not a module-level variable).
    print(f"Output HTML-viewable pages to directory: {output_dir}")
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and hand them to main().
    parser = argparse.ArgumentParser(
        description='Generate HTML pages from one or more JSONL files with pre-signed S3 links.'
    )
    parser.add_argument(
        'jsonl_paths',
        nargs='+',
        help='Path(s) to the JSONL file(s) (local or s3://). Supports glob patterns for local paths.'
    )
    parser.add_argument(
        '--output_dir',
        default='dolma_previews',
        help='Directory to save HTML files'
    )
    parser.add_argument(
        '--template_path',
        default='dolmaviewer_template.html',
        help='Path to the Jinja2 template file'
    )
    parser.add_argument(
        '--s3_profile',
        default=None,
        help='S3 profile to use for accessing the source documents to render them in the viewer.'
    )
    args = parser.parse_args()

    main(args.jsonl_paths, args.output_dir, args.template_path, args.s3_profile)