mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 16:22:29 +00:00
Fix for iabooks
This commit is contained in:
parent
0a9c82927f
commit
cffbb82b0b
@ -9,6 +9,7 @@ that mirrors the original structure with side-by-side PDF and MD files.
|
||||
import argparse
|
||||
import json
|
||||
import shutil
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
@ -111,6 +112,15 @@ def process_single_result(
|
||||
|
||||
if not original_pdf_path.exists():
|
||||
print(f"Warning: Original PDF not found: {original_pdf_path}")
|
||||
|
||||
original_pdf_path = str(original_pdf_path)
|
||||
pattern = r'(.+?)(-\d+)\.pdf$'
|
||||
replacement = r'\1.pdf\2.pdf'
|
||||
|
||||
original_pdf_path = Path(re.sub(pattern, replacement, original_pdf_path))
|
||||
|
||||
if not original_pdf_path.exists():
|
||||
print(f"Error: Original PDF not found: {original_pdf_path}")
|
||||
return False
|
||||
|
||||
# Create output paths
|
||||
|
Loading…
x
Reference in New Issue
Block a user