mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-07 22:32:23 +00:00
viewer fix
This commit is contained in:
parent
4c35105bd4
commit
dbf647790a
@ -69,10 +69,12 @@ cat localworkspace/results/output_*.jsonl
|
|||||||
|
|
||||||
You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command.
|
You can view your documents side-by-side with the original PDF renders using the `dolmaviewer` command.
|
||||||
|
|
||||||
```python
|
```bash
|
||||||
|
python -m olmocr.viewer.dolmaviewer localworkspace/results/output_*.jsonl
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Now open `./dolma_previews/tests_gnarly_pdfs_horribleocr_pdf.html` in your favorite browser.
|
||||||
|
|
||||||
|
|
||||||
### Multi-node / Cluster Usage
|
### Multi-node / Cluster Usage
|
||||||
|
|
||||||
|
@ -44,12 +44,9 @@ def process_document(data, s3_client, template, output_dir):
|
|||||||
source_file = metadata.get('Source-File')
|
source_file = metadata.get('Source-File')
|
||||||
|
|
||||||
# Generate base64 image of the corresponding PDF page
|
# Generate base64 image of the corresponding PDF page
|
||||||
if source_file and source_file.startswith('s3://'):
|
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
|
||||||
local_pdf = tempfile.NamedTemporaryFile("wb+", suffix=".pdf")
|
local_pdf.write(get_s3_bytes(s3_client, source_file))
|
||||||
local_pdf.write(get_s3_bytes(s3_client, source_file))
|
local_pdf.flush()
|
||||||
local_pdf.flush()
|
|
||||||
else:
|
|
||||||
raise ValueError("Expecting s3 files only")
|
|
||||||
|
|
||||||
pages = []
|
pages = []
|
||||||
for span in pdf_page_numbers:
|
for span in pdf_page_numbers:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user