viewer fix

This commit is contained in:
Jake Poznanski 2025-01-29 11:27:55 -08:00
parent 4c35105bd4
commit dbf647790a
2 changed files with 7 additions and 8 deletions

View File

@@ -69,10 +69,12 @@ cat localworkspace/results/output_*.jsonl
You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command. You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command.
```python ```bash
python -m olmocr.viewer.dolmaviewer localworkspace/results/output_*.jsonl
``` ```
Now open `./dolma_previews/tests_gnarly_pdfs_horribleocr_pdf.html` in your favorite browser.
### Multi-node / Cluster Usage ### Multi-node / Cluster Usage

View File

@@ -44,12 +44,9 @@ def process_document(data, s3_client, template, output_dir):
source_file = metadata.get('Source-File') source_file = metadata.get('Source-File')
# Generate base64 image of the corresponding PDF page # Generate base64 image of the corresponding PDF page
if source_file and source_file.startswith('s3://'): local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf") local_pdf.write(get_s3_bytes(s3_client, source_file))
local_pdf.write(get_s3_bytes(s3_client, source_file)) local_pdf.flush()
local_pdf.flush()
else:
raise ValueError("Expecting s3 files only")
pages = [] pages = []
for span in pdf_page_numbers: for span in pdf_page_numbers: