mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-07 14:28:44 +00:00
viewer fix
This commit is contained in:
parent
4c35105bd4
commit
dbf647790a
@ -69,10 +69,12 @@ cat localworkspace/results/output_*.jsonl
|
||||
|
||||
You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command.
|
||||
|
||||
```python
|
||||
|
||||
```bash
|
||||
python -m olmocr.viewer.dolmaviewer localworkspace/results/output_*.jsonl
|
||||
```
|
||||
|
||||
Now open `./dolma_previews/tests_gnarly_pdfs_horribleocr_pdf.html` in your favorite browser.
|
||||
|
||||
|
||||
### Multi-node / Cluster Usage
|
||||
|
||||
|
@ -44,12 +44,9 @@ def process_document(data, s3_client, template, output_dir):
|
||||
source_file = metadata.get('Source-File')
|
||||
|
||||
# Generate base64 image of the corresponding PDF page
|
||||
if source_file and source_file.startswith('s3://'):
|
||||
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
|
||||
local_pdf.write(get_s3_bytes(s3_client, source_file))
|
||||
local_pdf.flush()
|
||||
else:
|
||||
raise ValueError("Expecting s3 files only")
|
||||
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
|
||||
local_pdf.write(get_s3_bytes(s3_client, source_file))
|
||||
local_pdf.flush()
|
||||
|
||||
pages = []
|
||||
for span in pdf_page_numbers:
|
||||
|
Loading…
x
Reference in New Issue
Block a user