Viewer cleanup

This commit is contained in:
Jake Poznanski 2025-01-29 11:38:53 -08:00
parent a243c8923d
commit 86267d865f
2 changed files with 3 additions and 7 deletions

View File

@ -61,6 +61,8 @@ You can also bulk convert many PDFs with a glob pattern:
python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/*.pdf
```
#### Viewing Results
Once the pipeline finishes, the output is stored as [Dolma](https://github.com/allenai/dolma)-style JSONL in the `./localworkspace/results` directory.
```bash

View File

@ -11,7 +11,7 @@ from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import markdown2
from olmocr.s3_utils import get_s3_bytes
from olmocr.s3_utils import get_s3_bytes, parse_s3_path
from olmocr.data.renderpdf import render_pdf_to_base64webp
def read_jsonl(path):
@ -19,12 +19,6 @@ def read_jsonl(path):
for line in f:
yield line.strip()
def parse_s3_path(path):
    """Split an ``s3://bucket_name/key_name`` URI into bucket and key.

    Args:
        path: S3 URI of the form ``s3://bucket_name/key_name``.

    Returns:
        A ``(bucket_name, key_name)`` tuple. Only the first ``/`` after the
        bucket name is treated as the separator, so the key may itself
        contain slashes (and may be empty for ``s3://bucket/``).

    Raises:
        ValueError: If ``path`` does not start with ``s3://``, or if there
            is no ``/`` separating the bucket name from the key.
    """
    prefix = "s3://"
    # Verify the scheme rather than blindly dropping five characters, which
    # would silently mis-split any string that is not an S3 URI.
    if not path.startswith(prefix):
        raise ValueError(f"Expected a path starting with 's3://', got: {path!r}")

    bucket_name, separator, key_name = path[len(prefix):].partition("/")
    # A missing separator means there is no key at all; report that clearly
    # instead of failing with an opaque tuple-unpacking error.
    if not separator:
        raise ValueError(f"S3 path is missing a key component: {path!r}")
    return bucket_name, key_name
def generate_presigned_url(s3_client, bucket_name, key_name):
try:
response = s3_client.generate_presigned_url('get_object',