viewer fix

This commit is contained in:
Jake Poznanski 2025-01-29 11:27:55 -08:00
parent 4c35105bd4
commit dbf647790a
2 changed files with 7 additions and 8 deletions

View File

@@ -69,10 +69,12 @@ cat localworkspace/results/output_*.jsonl
You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command.
-```python
+```bash
python -m olmocr.viewer.dolmaviewer localworkspace/results/output_*.jsonl
```
Now open `./dolma_previews/tests_gnarly_pdfs_horribleocr_pdf.html` in your favorite browser.
### Multi-node / Cluster Usage

View File

@@ -44,12 +44,9 @@ def process_document(data, s3_client, template, output_dir):
source_file = metadata.get('Source-File')
# Generate base64 image of the corresponding PDF page
-if source_file and source_file.startswith('s3://'):
-local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
-local_pdf.write(get_s3_bytes(s3_client, source_file))
-local_pdf.flush()
-else:
-raise ValueError("Expecting s3 files only")
+local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
+local_pdf.write(get_s3_bytes(s3_client, source_file))
+local_pdf.flush()
pages = []
for span in pdf_page_numbers: