mirror of
https://github.com/allenai/olmocr.git
synced 2025-12-26 14:47:13 +00:00
Viewer cleanup
This commit is contained in:
parent
a243c8923d
commit
86267d865f
@ -61,6 +61,8 @@ You can also bulk convert many PDFs with a glob pattern:
|
||||
python -m olmocr.pipeline ./localworkspace --pdfs tests/gnarly_pdfs/*.pdf
|
||||
```
|
||||
|
||||
#### Viewing Results
|
||||
|
||||
Once that finishes, output is stored as [Dolma](https://github.com/allenai/dolma)-style JSONL inside the `./localworkspace/results` directory.
|
||||
|
||||
```bash
|
||||
|
||||
@ -11,7 +11,7 @@ from tqdm import tqdm
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import markdown2
|
||||
|
||||
from olmocr.s3_utils import get_s3_bytes
|
||||
from olmocr.s3_utils import get_s3_bytes, parse_s3_path
|
||||
from olmocr.data.renderpdf import render_pdf_to_base64webp
|
||||
|
||||
def read_jsonl(path):
|
||||
@ -19,12 +19,6 @@ def read_jsonl(path):
|
||||
for line in f:
|
||||
yield line.strip()
|
||||
|
||||
def parse_s3_path(path):
    """Split an ``s3://bucket_name/key_name`` URI into its bucket and key.

    Args:
        path: A string of the form ``s3://bucket_name/key_name``.

    Returns:
        A ``(bucket_name, key_name)`` tuple. Only the first ``/`` after the
        bucket separates the two, so the key may itself contain slashes.
        A bucket-only URI such as ``s3://bucket_name`` yields an empty key.

    Raises:
        ValueError: If ``path`` does not start with ``s3://``.
    """
    prefix = "s3://"
    # Check the scheme instead of blindly dropping the first five characters,
    # which would turn any non-S3 path into a bogus bucket/key pair.
    if not path.startswith(prefix):
        raise ValueError(f"Expected a path starting with 's3://', got: {path!r}")
    # partition() never fails to unpack: when there is no '/', the key is ''.
    bucket_name, _, key_name = path[len(prefix):].partition("/")
    return bucket_name, key_name
|
||||
|
||||
def generate_presigned_url(s3_client, bucket_name, key_name):
|
||||
try:
|
||||
response = s3_client.generate_presigned_url('get_object',
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user