Viewer and gitignore

2025-12-03 18:50:42 +00:00 · 2025-01-29 11:46:46 -08:00 · 2025-01-29 11:46:46 -08:00 · b574766977
commit b574766977
parent 86267d865f
2 changed files with 121 additions and 47 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ s2orc_previews_3200/*
 sample200_vllm/*
 sample200_sglang/*
 pdelfin_testset/*
+localworkspace/*
 /*.html
 scoreelo.csv
 debug.log
--- a/olmocr/viewer/dolmaviewer.py
+++ b/olmocr/viewer/dolmaviewer.py
@@ -4,6 +4,7 @@ import html
 import argparse
 import boto3
 import tempfile
+import glob
 from botocore.exceptions import NoCredentialsError, PartialCredentialsError
 from jinja2 import Template
 import smart_open
@@ -14,19 +15,29 @@ import markdown2
 from olmocr.s3_utils import get_s3_bytes, parse_s3_path
 from olmocr.data.renderpdf import render_pdf_to_base64webp

-def read_jsonl(path):
-    with smart_open.smart_open(path, 'r', encoding='utf-8') as f:
-        for line in f:
-            yield line.strip()
+def read_jsonl(paths):
+    """
+    Generator that yields lines from multiple JSONL files.
+    Supports both local and S3 paths.
+    """
+    for path in paths:
+        try:
+            with smart_open.smart_open(path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    yield line.strip()
+        except Exception as e:
+            print(f"Error reading {path}: {e}")

 def generate_presigned_url(s3_client, bucket_name, key_name):
    try:
-        response = s3_client.generate_presigned_url('get_object',
-                                                    Params={'Bucket': bucket_name, 'Key': key_name},
-                                                    ExpiresIn=3600 * 24 * 7 - 100)  # Link expires in 1 week
+        response = s3_client.generate_presigned_url(
+            'get_object',
+            Params={'Bucket': bucket_name, 'Key': key_name},
+            ExpiresIn=3600 * 24 * 7 - 100  # Link expires in 1 week
+        )
        return response
-    except (NoCredentialsError, PartialCredentialsError):
-        print("Error: AWS credentials not found or incomplete.")
+    except (NoCredentialsError, PartialCredentialsError) as e:
+        print(f"Error generating presigned URL: {e}")
        return None

 def process_document(data, s3_client, template, output_dir):
@@ -38,24 +49,34 @@ def process_document(data, s3_client, template, output_dir):
    source_file = metadata.get('Source-File')

    # Generate base64 image of the corresponding PDF page
-    local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
-    local_pdf.write(get_s3_bytes(s3_client, source_file))
-    local_pdf.flush()
+    local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False)
+    try:
+        pdf_bytes = get_s3_bytes(s3_client, source_file)
+        if pdf_bytes is None:
+            print(f"Failed to retrieve PDF from {source_file}")
+            return
+        local_pdf.write(pdf_bytes)
+        local_pdf.flush()

-    pages = []
-    for span in pdf_page_numbers:
-        start_index, end_index, page_num = span
-        page_text = text[start_index:end_index]
-        
-        # Detect and convert Markdown to HTML
-        page_text = html.escape(page_text, quote=True).replace('&lt;br&gt;', '<br>')
-        page_text = markdown2.markdown(page_text, extras=["tables"])
+        pages = []
+        for span in pdf_page_numbers:
+            start_index, end_index, page_num = span
+            page_text = text[start_index:end_index]
+            
+            # Detect and convert Markdown to HTML
+            page_text = html.escape(page_text, quote=True).replace('&lt;br&gt;', '<br>')
+            page_text = markdown2.markdown(page_text, extras=["tables"])

-        base64_image = render_pdf_to_base64webp(local_pdf.name, page_num)
+            base64_image = render_pdf_to_base64webp(local_pdf.name, page_num)

-        pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image})
+            pages.append({'page_num': page_num, 'text': page_text, 'image': base64_image})

-    local_pdf.close()
+    except Exception as e:
+        print(f"Error processing document ID {id_}: {e}")
+        return
+    finally:
+        local_pdf.close()
+        os.unlink(local_pdf.name)

    # Generate pre-signed URL if source_file is an S3 path
    s3_link = None
@@ -64,49 +85,101 @@ def process_document(data, s3_client, template, output_dir):
        s3_link = generate_presigned_url(s3_client, bucket_name, key_name)

    # Render the HTML using the Jinja template
-    html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
+    try:
+        html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
+    except Exception as e:
+        print(f"Error rendering HTML for document ID {id_}: {e}")
+        return

    # Write the HTML content to a file
-    filename = f'{source_file.replace("s3://", "").replace("/", "_").replace(".", "_")}.html'
-    filepath = os.path.join(output_dir, filename)
-    with open(filepath, 'w', encoding='utf-8') as f:
-        f.write(html_content)
+    try:
+        safe_source = source_file.replace("s3://", "").replace("/", "_").replace(".", "_") if source_file else f"id_{id_}"
+        filename = f'{safe_source}.html'
+        filepath = os.path.join(output_dir, filename)
+        with open(filepath, 'w', encoding='utf-8') as f:
+            f.write(html_content)
+    except Exception as e:
+        print(f"Error writing HTML file for document ID {id_}: {e}")

-def main(jsonl_path, output_dir, template_path):
+def main(jsonl_paths, output_dir, template_path, s3_profile_name):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

+    # Expand glob patterns for local paths
+    expanded_paths = []
+    for path in jsonl_paths:
+        if path.startswith('s3://'):
+            expanded_paths.append(path)
+        else:
+            matched = glob.glob(path)
+            if not matched:
+                print(f"No files matched the pattern: {path}")
+            expanded_paths.extend(matched)
+
+    if not expanded_paths:
+        print("No JSONL files to process.")
+        return
+
    # Load the Jinja template
-    with open(os.path.join(os.path.dirname(__file__), template_path), 'r', encoding='utf-8') as template_file:
-        template_content = template_file.read()
-        template = Template(template_content)
+    try:
+        with open(os.path.join(os.path.dirname(__file__), template_path), 'r', encoding='utf-8') as template_file:
+            template_content = template_file.read()
+            template = Template(template_content)
+    except Exception as e:
+        print(f"Error loading template: {e}")
+        return

    # Initialize S3 client for generating presigned URLs
-    workspace_session = boto3.Session(profile_name="s2")
-    s3_client = workspace_session.client("s3")
+    try:
+        workspace_session = boto3.Session(profile_name=s3_profile_name)
+        s3_client = workspace_session.client("s3")
+    except Exception as e:
+        print(f"Error initializing S3 client: {e}")
+        return

    # Create ThreadPoolExecutor
    with ThreadPoolExecutor() as executor:
        futures = []
-        for line in read_jsonl(jsonl_path):
+        for line in read_jsonl(expanded_paths):
            if not line:
                continue
-            data = json.loads(line)
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError as e:
+                print(f"Invalid JSON line: {e}")
+                continue
            future = executor.submit(process_document, data, s3_client, template, output_dir)
            futures.append(future)

-        for future in tqdm(as_completed(futures), total=len(futures)):
-            try:
-                future.result()
-            except Exception as e:
-                print(f"An error occurred: {e}")
-                raise
+        for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing documents"):
+            pass  # Progress bar updates automatically
+
+    print(f"Output HTML-viewable pages to directory: {output_dir}")  # use the parameter: `args` only exists as a global when run as a script

 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Generate HTML pages from a JSONL file with pre-signed S3 links.')
-    parser.add_argument('jsonl_path', help='Path to the JSONL file (local or s3://)')
-    parser.add_argument('--output_dir', default='dolma_previews', help='Directory to save HTML files')
-    parser.add_argument('--template_path', default='dolmaviewer_template.html', help='Path to the Jinja2 template file')
+    parser = argparse.ArgumentParser(
+        description='Generate HTML pages from one or more JSONL files with pre-signed S3 links.'
+    )
+    parser.add_argument(
+        'jsonl_paths',
+        nargs='+',
+        help='Path(s) to the JSONL file(s) (local or s3://). Supports glob patterns for local paths.'
+    )
+    parser.add_argument(
+        '--output_dir',
+        default='dolma_previews',
+        help='Directory to save HTML files'
+    )
+    parser.add_argument(
+        '--template_path',
+        default='dolmaviewer_template.html',
+        help='Path to the Jinja2 template file'
+    )
+    parser.add_argument(
+        '--s3_profile',
+        default=None,
+        help='S3 profile to use for accessing the source documents to render them in the viewer.'
+    )
    args = parser.parse_args()

-    main(args.jsonl_path, args.output_dir, args.template_path)
+    main(args.jsonl_paths, args.output_dir, args.template_path, args.s3_profile)