mirror of https://github.com/allenai/olmocr.git
synced 2025-12-03 18:50:42 +00:00

Viewer and gitignore

This commit is contained in:
parent 86267d865f
commit b574766977
.gitignore (vendored): 1 addition
@@ -10,6 +10,7 @@ s2orc_previews_3200/*
 sample200_vllm/*
 sample200_sglang/*
 pdelfin_testset/*
 localworkspace/*
 /*.html
 scoreelo.csv
+debug.log
@@ -4,6 +4,7 @@ import html
 import argparse
 import boto3
 import tempfile
+import glob
 from botocore.exceptions import NoCredentialsError, PartialCredentialsError
 from jinja2 import Template
 import smart_open
@@ -14,19 +15,29 @@ import markdown2
 from olmocr.s3_utils import get_s3_bytes, parse_s3_path
 from olmocr.data.renderpdf import render_pdf_to_base64webp

-def read_jsonl(path):
-    with smart_open.smart_open(path, 'r', encoding='utf-8') as f:
-        for line in f:
-            yield line.strip()
+def read_jsonl(paths):
+    """
+    Generator that yields lines from multiple JSONL files.
+    Supports both local and S3 paths.
+    """
+    for path in paths:
+        try:
+            with smart_open.smart_open(path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    yield line.strip()
+        except Exception as e:
+            print(f"Error reading {path}: {e}")

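For context, a minimal sketch of how the reworked generator is consumed: it lazily yields stripped lines across all inputs and skips files it cannot open. The paths below are hypothetical.

    import json

    # Hypothetical inputs: one local JSONL file and one S3 object.
    paths = ["sample200_vllm/output.jsonl", "s3://my-bucket/previews/output.jsonl"]

    for line in read_jsonl(paths):
        if not line:
            continue
        doc = json.loads(line)
        print(doc.get("id"))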
 def generate_presigned_url(s3_client, bucket_name, key_name):
     try:
-        response = s3_client.generate_presigned_url('get_object',
-                                                    Params={'Bucket': bucket_name, 'Key': key_name},
-                                                    ExpiresIn=3600 * 24 * 7 - 100)  # Link expires in 1 week
+        response = s3_client.generate_presigned_url(
+            'get_object',
+            Params={'Bucket': bucket_name, 'Key': key_name},
+            ExpiresIn=3600 * 24 * 7 - 100  # Link expires in 1 week
+        )
         return response
-    except (NoCredentialsError, PartialCredentialsError):
-        print("Error: AWS credentials not found or incomplete.")
+    except (NoCredentialsError, PartialCredentialsError) as e:
+        print(f"Error generating presigned URL: {e}")
     return None


 def process_document(data, s3_client, template, output_dir):
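The ExpiresIn value sits just under AWS's hard cap for SigV4 presigned URLs (7 days, i.e. 604800 seconds); the trailing -100 keeps the request safely inside that limit. A standalone sketch of the same call, with a hypothetical profile, bucket, and key:

    import boto3

    s3_client = boto3.Session(profile_name="my-profile").client("s3")  # hypothetical profile
    url = s3_client.generate_presigned_url(
        'get_object',
        Params={'Bucket': 'my-bucket', 'Key': 'docs/example.pdf'},  # hypothetical object
        ExpiresIn=3600 * 24 * 7 - 100  # stay under the 7-day SigV4 maximum
    )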
@@ -38,24 +49,34 @@ def process_document(data, s3_client, template, output_dir):
     source_file = metadata.get('Source-File')

     # Generate base64 image of the corresponding PDF page
-    local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
-    local_pdf.write(get_s3_bytes(s3_client, source_file))
-    local_pdf.flush()
+    local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False)
+    try:
+        pdf_bytes = get_s3_bytes(s3_client, source_file)
+        if pdf_bytes is None:
+            print(f"Failed to retrieve PDF from {source_file}")
+            return
+        local_pdf.write(pdf_bytes)
+        local_pdf.flush()

-    pages = []
-    for span in pdf_page_numbers:
-        start_index, end_index, page_num = span
-        page_text = text[start_index:end_index]
-
-        # Detect and convert Markdown to HTML
-        page_text = html.escape(page_text, quote=True).replace('&lt;br&gt;', '<br>')
-        page_text = markdown2.markdown(page_text, extras=["tables"])
+        pages = []
+        for span in pdf_page_numbers:
+            start_index, end_index, page_num = span
+            page_text = text[start_index:end_index]
+
+            # Detect and convert Markdown to HTML
+            page_text = html.escape(page_text, quote=True).replace('&lt;br&gt;', '<br>')
+            page_text = markdown2.markdown(page_text, extras=["tables"])

-        base64_image = render_pdf_to_base64webp(local_pdf.name, page_num)
+            base64_image = render_pdf_to_base64webp(local_pdf.name, page_num)

-        pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image})
+            pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image})

-    local_pdf.close()
+    except Exception as e:
+        print(f"Error processing document ID {id_}: {e}")
+        return
+    finally:
+        local_pdf.close()
+        os.unlink(local_pdf.name)

     # Generate pre-signed URL if source_file is an S3 path
     s3_link = None
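Switching to delete=False with an explicit os.unlink in the finally block keeps the temporary PDF readable by name (e.g. for the renderer) while still cleaning it up on every exit path. A small standalone sketch of the same pattern, using only the standard library:

    import os
    import tempfile

    tmp = tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False)
    try:
        tmp.write(b"%PDF-1.4")   # placeholder bytes for illustration
        tmp.flush()
        # tmp.name can now be handed to other code that opens the file by path
    finally:
        tmp.close()
        os.unlink(tmp.name)      # manual cleanup, since delete=False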
@@ -64,49 +85,101 @@ def process_document(data, s3_client, template, output_dir):
         s3_link = generate_presigned_url(s3_client, bucket_name, key_name)

     # Render the HTML using the Jinja template
-    html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
+    try:
+        html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
+    except Exception as e:
+        print(f"Error rendering HTML for document ID {id_}: {e}")
+        return

     # Write the HTML content to a file
-    filename = f'{source_file.replace("s3://", "").replace("/", "_").replace(".", "_")}.html'
-    filepath = os.path.join(output_dir, filename)
-    with open(filepath, 'w', encoding='utf-8') as f:
-        f.write(html_content)
+    try:
+        safe_source = source_file.replace("s3://", "").replace("/", "_").replace(".", "_") if source_file else f"id_{id_}"
+        filename = f'{safe_source}.html'
+        filepath = os.path.join(output_dir, filename)
+        with open(filepath, 'w', encoding='utf-8') as f:
+            f.write(html_content)
+    except Exception as e:
+        print(f"Error writing HTML file for document ID {id_}: {e}")

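The sanitization flattens an S3 URI into a single safe filename segment. A worked example with a hypothetical source path:

    source_file = "s3://my-bucket/papers/0001.pdf"  # hypothetical
    safe_source = source_file.replace("s3://", "").replace("/", "_").replace(".", "_")
    print(f"{safe_source}.html")  # -> my-bucket_papers_0001_pdf.html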
-def main(jsonl_path, output_dir, template_path):
+def main(jsonl_paths, output_dir, template_path, s3_profile_name):
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)

+    # Expand glob patterns for local paths
+    expanded_paths = []
+    for path in jsonl_paths:
+        if path.startswith('s3://'):
+            expanded_paths.append(path)
+        else:
+            matched = glob.glob(path)
+            if not matched:
+                print(f"No files matched the pattern: {path}")
+            expanded_paths.extend(matched)
+
+    if not expanded_paths:
+        print("No JSONL files to process.")
+        return
+
     # Load the Jinja template
-    with open(os.path.join(os.path.dirname(__file__), template_path), 'r', encoding='utf-8') as template_file:
-        template_content = template_file.read()
-    template = Template(template_content)
+    try:
+        with open(os.path.join(os.path.dirname(__file__), template_path), 'r', encoding='utf-8') as template_file:
+            template_content = template_file.read()
+        template = Template(template_content)
+    except Exception as e:
+        print(f"Error loading template: {e}")
+        return

     # Initialize S3 client for generating presigned URLs
-    workspace_session = boto3.Session(profile_name="s2")
-    s3_client = workspace_session.client("s3")
+    try:
+        workspace_session = boto3.Session(profile_name=s3_profile_name)
+        s3_client = workspace_session.client("s3")
+    except Exception as e:
+        print(f"Error initializing S3 client: {e}")
+        return

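A quick illustration of the expansion step: shell-style patterns are resolved only for local paths, while s3:// URIs pass through verbatim (inputs hypothetical):

    import glob

    jsonl_paths = ["previews/*.jsonl", "s3://my-bucket/out.jsonl"]  # hypothetical
    expanded = []
    for p in jsonl_paths:
        expanded.extend([p] if p.startswith("s3://") else glob.glob(p))
    print(expanded)  # local matches, plus the S3 URI unchanged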
     # Create ThreadPoolExecutor
     with ThreadPoolExecutor() as executor:
         futures = []
-        for line in read_jsonl(jsonl_path):
-            data = json.loads(line)
+        for line in read_jsonl(expanded_paths):
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError as e:
+                print(f"Invalid JSON line: {e}")
+                continue
             future = executor.submit(process_document, data, s3_client, template, output_dir)
             futures.append(future)

-        for future in tqdm(as_completed(futures), total=len(futures)):
-            try:
-                future.result()
-            except Exception as e:
-                print(f"An error occurred: {e}")
-                raise
+        for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"):
+            pass  # Progress bar updates automatically

     print(f"Output HTML-viewable pages to directory: {args.output_dir}")

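One trade-off in the new progress loop: since it never calls future.result(), any exception that escapes process_document is dropped silently. A sketch of a variant that keeps the progress bar but still reports failures (assuming the same futures list as above):

    from concurrent.futures import as_completed
    from tqdm import tqdm

    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"):
        exc = future.exception()  # non-blocking: the future is already done here
        if exc is not None:
            print(f"Worker failed: {exc}")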
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Generate HTML pages from a JSONL file with pre-signed S3 links.')
-    parser.add_argument('jsonl_path', help='Path to the JSONL file (local or s3://)')
-    parser.add_argument('--output_dir', default='dolma_previews', help='Directory to save HTML files')
-    parser.add_argument('--template_path', default='dolmaviewer_template.html', help='Path to the Jinja2 template file')
+    parser = argparse.ArgumentParser(
+        description='Generate HTML pages from one or more JSONL files with pre-signed S3 links.'
+    )
+    parser.add_argument(
+        'jsonl_paths',
+        nargs='+',
+        help='Path(s) to the JSONL file(s) (local or s3://). Supports glob patterns for local paths.'
+    )
+    parser.add_argument(
+        '--output_dir',
+        default='dolma_previews',
+        help='Directory to save HTML files'
+    )
+    parser.add_argument(
+        '--template_path',
+        default='dolmaviewer_template.html',
+        help='Path to the Jinja2 template file'
+    )
+    parser.add_argument(
+        '--s3_profile',
+        default=None,
+        help='S3 profile to use for accessing the source documents to render them in the viewer.'
+    )
     args = parser.parse_args()

-    main(args.jsonl_path, args.output_dir, args.template_path)
+    main(args.jsonl_paths, args.output_dir, args.template_path, args.s3_profile)
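With nargs='+', the viewer now accepts any mix of local globs and S3 URIs in one run. An illustrative invocation (script name and paths are hypothetical):

    python dolmaviewer.py previews/*.jsonl s3://my-bucket/out.jsonl --output_dir dolma_previews --s3_profile my-profile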