mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-12 00:32:45 +00:00
Fix for iabooks
This commit is contained in:
parent
0a9c82927f
commit
cffbb82b0b
@ -9,6 +9,7 @@ that mirrors the original structure with side-by-side PDF and MD files.
|
|||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import shutil
|
import shutil
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Any, Optional
|
from typing import Dict, Any, Optional
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
@ -111,6 +112,15 @@ def process_single_result(
|
|||||||
|
|
||||||
if not original_pdf_path.exists():
|
if not original_pdf_path.exists():
|
||||||
print(f"Warning: Original PDF not found: {original_pdf_path}")
|
print(f"Warning: Original PDF not found: {original_pdf_path}")
|
||||||
|
|
||||||
|
original_pdf_path = str(original_pdf_path)
|
||||||
|
pattern = r'(.+?)(-\d+)\.pdf$'
|
||||||
|
replacement = r'\1.pdf\2.pdf'
|
||||||
|
|
||||||
|
original_pdf_path = Path(re.sub(pattern, replacement, original_pdf_path))
|
||||||
|
|
||||||
|
if not original_pdf_path.exists():
|
||||||
|
print(f"Error: Original PDF not found: {original_pdf_path}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Create output paths
|
# Create output paths
|
||||||
|
Loading…
x
Reference in New Issue
Block a user