mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-17 03:02:14 +00:00
More viewer updates
This commit is contained in:
parent
0ffcdc0272
commit
0516ff035f
@ -1,6 +1,5 @@
|
||||
import argparse
|
||||
import glob
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
@ -47,6 +46,11 @@ def process_document(data, s3_client, template, output_dir):
|
||||
attributes = data.get("attributes", {})
|
||||
pdf_page_numbers = attributes.get("pdf_page_numbers", [])
|
||||
metadata = data.get("metadata", {})
|
||||
|
||||
# Extract additional fields for display
|
||||
source = data.get("source", "")
|
||||
added = data.get("added", "")
|
||||
created = data.get("created", "")
|
||||
source_file = metadata.get("Source-File")
|
||||
|
||||
# Generate base64 image of the corresponding PDF page
|
||||
@ -64,8 +68,13 @@ def process_document(data, s3_client, template, output_dir):
|
||||
start_index, end_index, page_num = span
|
||||
page_text = text[start_index:end_index]
|
||||
|
||||
# Just escape HTML for safe rendering, markdown conversion will happen client-side
|
||||
page_text = html.escape(page_text, quote=False)
|
||||
# Escape only dangerous HTML characters, preserving curly braces for LaTeX
|
||||
# Don't escape curly braces {} as they're needed for LaTeX
|
||||
page_text = page_text.replace('&', '&amp;')
|
||||
page_text = page_text.replace('<', '&lt;')
|
||||
page_text = page_text.replace('>', '&gt;')
|
||||
page_text = page_text.replace('"', '&quot;')
|
||||
page_text = page_text.replace("'", '&#39;')
|
||||
|
||||
base64_image = render_pdf_to_base64webp(local_pdf.name, page_num)
|
||||
|
||||
@ -84,9 +93,28 @@ def process_document(data, s3_client, template, output_dir):
|
||||
bucket_name, key_name = parse_s3_path(source_file)
|
||||
s3_link = generate_presigned_url(s3_client, bucket_name, key_name)
|
||||
|
||||
# Prepare metadata for display
|
||||
display_metadata = {
|
||||
"id": id_,
|
||||
"source": source,
|
||||
"added": added,
|
||||
"created": created,
|
||||
"pdf_pages": metadata.get("pdf-total-pages", ""),
|
||||
"tokens_in": metadata.get("total-input-tokens", ""),
|
||||
"tokens_out": metadata.get("total-output-tokens", ""),
|
||||
"olmocr_version": metadata.get("olmocr-version", ""),
|
||||
"source_file": source_file
|
||||
}
|
||||
|
||||
# Render the HTML using the Jinja template
|
||||
try:
|
||||
html_content = template.render(id=id_, pages=pages, s3_link=s3_link)
|
||||
html_content = template.render(
|
||||
id=id_,
|
||||
pages=pages,
|
||||
s3_link=s3_link,
|
||||
metadata=display_metadata,
|
||||
attributes=attributes
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error rendering HTML for document ID {id_}: {e}")
|
||||
return
|
||||
|
@ -200,6 +200,47 @@
|
||||
.katex-display {
|
||||
margin: 1em 0;
|
||||
}
|
||||
|
||||
/* Metadata styles */
|
||||
.metadata-container {
|
||||
background: #f8f9fa;
|
||||
border: 1px solid #dee2e6;
|
||||
border-radius: 8px;
|
||||
padding: 15px;
|
||||
margin-bottom: 20px;
|
||||
font-size: 13px;
|
||||
color: #495057;
|
||||
}
|
||||
|
||||
.metadata-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.metadata-item {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
}
|
||||
|
||||
.metadata-label {
|
||||
font-weight: 600;
|
||||
color: #6c757d;
|
||||
margin-right: 5px;
|
||||
min-width: fit-content;
|
||||
}
|
||||
|
||||
.metadata-value {
|
||||
color: #212529;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.metadata-source-file {
|
||||
grid-column: 1 / -1;
|
||||
margin-top: 5px;
|
||||
padding-top: 10px;
|
||||
border-top: 1px solid #dee2e6;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
@ -207,8 +248,7 @@
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
|
||||
<!-- KaTeX JavaScript -->
|
||||
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.js" integrity="sha384-XjKyOOlGwcjNTAIQHIpgOno0Hl1YQqzUOEleOLALmuqehneUG+vnGctmUb0ZY0l8" crossorigin="anonymous"></script>
|
||||
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/contrib/auto-render.min.js" integrity="sha384-+VBxd3r6XgURycqtZ117nYw44OOcIax56Z4dCRWbxyPt0Koah1uHoK0o4+/RRE05" crossorigin="anonymous"></script>
|
||||
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.js" integrity="sha384-XjKyOOlGwcjNTAIQHIpgOno0Hl1YQqzUOEleOLALmuqehneUG+vnGctmUb0ZY0l8" crossorigin="anonymous"></script>
|
||||
</head>
|
||||
<body>
|
||||
<div class="document">
|
||||
@ -218,6 +258,66 @@
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Metadata Section -->
|
||||
<div class="metadata-container">
|
||||
<div class="metadata-grid">
|
||||
{% if metadata.source %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">Source:</span>
|
||||
<span class="metadata-value">{{ metadata.source }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if metadata.olmocr_version %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">OlmOCR:</span>
|
||||
<span class="metadata-value">v{{ metadata.olmocr_version }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if metadata.created %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">Created:</span>
|
||||
<span class="metadata-value">{{ metadata.created }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if metadata.pdf_pages %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">Pages:</span>
|
||||
<span class="metadata-value">{{ metadata.pdf_pages }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if metadata.tokens_in %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">Tokens In:</span>
|
||||
<span class="metadata-value">{{ metadata.tokens_in }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if metadata.tokens_out %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">Tokens Out:</span>
|
||||
<span class="metadata-value">{{ metadata.tokens_out }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if attributes.primary_language %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">Language:</span>
|
||||
<span class="metadata-value">{{ attributes.primary_language[0] }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if attributes.rotation_correction %}
|
||||
<div class="metadata-item">
|
||||
<span class="metadata-label">Rotation:</span>
|
||||
<span class="metadata-value">{{ attributes.rotation_correction[0] }}°</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if metadata.source_file %}
|
||||
<div class="metadata-item metadata-source-file">
|
||||
<span class="metadata-label">File:</span>
|
||||
<span class="metadata-value">{{ metadata.source_file }}</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for page in pages %}
|
||||
<div class="page-section" id="page-{{ page.page_num }}">
|
||||
<div class="text">
|
||||
@ -249,65 +349,41 @@
|
||||
mangle: false
|
||||
});
|
||||
|
||||
// Function to render LaTeX in an element
|
||||
function renderLatex(element) {
|
||||
renderMathInElement(element, {
|
||||
delimiters: [
|
||||
{left: '$$', right: '$$', display: true},
|
||||
{left: '$', right: '$', display: false},
|
||||
{left: '\\[', right: '\\]', display: true},
|
||||
{left: '\\(', right: '\\)', display: false}
|
||||
],
|
||||
throwOnError: false,
|
||||
errorColor: '#cc0000',
|
||||
strict: false,
|
||||
trust: true
|
||||
/**
 * Replace LaTeX math spans in `text` with KaTeX-rendered HTML.
 *
 * Recognised delimiters:
 *   - `$$...$$` and `\[...\]`  -> display mode (may span several lines)
 *   - `$...$`                  -> inline mode (single line, no nested `$`)
 *   - `\(...\)`                -> inline mode (single line)
 *
 * All delimiters are matched in ONE left-to-right pass. Running a separate
 * `replace` per delimiter would rescan the markup produced by earlier passes,
 * so a `$` or `\(` echoed inside already-rendered output (for example an
 * escaped `\$` inside a display formula) would be picked up as a new math
 * span and corrupt the result.
 *
 * @param {string} text - Source text that may contain LaTeX spans.
 * @returns {string} `text` with every math span replaced by the string
 *   returned from `katex.renderToString`; a span whose rendering throws is
 *   left unchanged. Returns `''` when `text` is not a non-empty string.
 */
function renderLatexToHtml(text) {
  // A missing attribute yields null; treat anything that is not a string as empty.
  if (typeof text !== 'string' || text === '') {
    return '';
  }

  // Alternation order matters: `$$` must be tried before `$` at the same position.
  const mathPattern =
    /\$\$([\s\S]+?)\$\$|\\\[([\s\S]+?)\\\]|\$([^$\n]+?)\$|\\\((.+?)\\\)/g;

  return text.replace(
    mathPattern,
    (match, displayDollar, displayBracket, inlineDollar, inlineParen) => {
      const displayMode =
        displayDollar !== undefined || displayBracket !== undefined;
      // Exactly one capture group is defined for any given match.
      const latex = [displayDollar, displayBracket, inlineDollar, inlineParen]
        .find((group) => group !== undefined);

      try {
        return katex.renderToString(latex, {
          displayMode,
          throwOnError: false,
        });
      } catch (err) {
        // Best effort: keep the original source visible, but do not hide the failure.
        console.warn('KaTeX failed to render expression:', latex, err);
        return match;
      }
    },
  );
}
|
||||
|
||||
// Function to render markdown and LaTeX
|
||||
function renderMarkdown() {
|
||||
const textElements = document.querySelectorAll('.text-content');
|
||||
|
||||
textElements.forEach(element => {
|
||||
document.querySelectorAll('.text-content').forEach(element => {
|
||||
const rawText = element.getAttribute('data-raw-text');
|
||||
|
||||
if (isMarkdownView) {
|
||||
// Render as markdown
|
||||
element.className = 'text-content markdown';
|
||||
|
||||
// Parse markdown but protect LaTeX delimiters
|
||||
let processedText = rawText;
|
||||
|
||||
// Temporarily replace LaTeX delimiters to protect them from markdown parsing
|
||||
const latexPatterns = [
|
||||
{pattern: /\$\$(.+?)\$\$/gs, placeholder: '%%%DISPLAY_LATEX_$1%%%'},
|
||||
{pattern: /\$(.+?)\$/g, placeholder: '%%%INLINE_LATEX_$1%%%'},
|
||||
{pattern: /\\\[(.+?)\\\]/gs, placeholder: '%%%DISPLAY_LATEX2_$1%%%'},
|
||||
{pattern: /\\\((.+?)\\\)/g, placeholder: '%%%INLINE_LATEX2_$1%%%'}
|
||||
];
|
||||
|
||||
latexPatterns.forEach(({pattern, placeholder}) => {
|
||||
processedText = processedText.replace(pattern, (match, p1) => {
|
||||
return placeholder.replace('$1', p1);
|
||||
});
|
||||
});
|
||||
|
||||
// Parse markdown
|
||||
let html = marked.parse(processedText);
|
||||
|
||||
// Restore LaTeX delimiters
|
||||
html = html.replace(/%%%DISPLAY_LATEX_(.+?)%%%/gs, '$$$$1$$');
|
||||
html = html.replace(/%%%INLINE_LATEX_(.+?)%%%/g, '$$1$');
|
||||
html = html.replace(/%%%DISPLAY_LATEX2_(.+?)%%%/gs, '\\[$1\\]');
|
||||
html = html.replace(/%%%INLINE_LATEX2_(.+?)%%%/g, '\\($1\\)');
|
||||
|
||||
element.innerHTML = html;
|
||||
|
||||
// Render LaTeX with KaTeX
|
||||
renderLatex(element);
|
||||
// Render LaTeX first, then markdown
|
||||
element.innerHTML = marked.parse(renderLatexToHtml(rawText));
|
||||
} else {
|
||||
// Display raw text
|
||||
element.className = 'text-content raw';
|
||||
element.innerHTML = '<pre>' + rawText + '</pre>';
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user