mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-19 20:18:46 +00:00
Cleaning up dependencies
This commit is contained in:
parent
569311c461
commit
e2a5d9f8f3
@ -4,7 +4,6 @@
|
|||||||
# This script will go through each CSV file, convert each page to PDF format, clean up the transcription using a grounded prompt in chatgpt-4o
|
# This script will go through each CSV file, convert each page to PDF format, clean up the transcription using a grounded prompt in chatgpt-4o
|
||||||
# and then output data in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
|
# and then output data in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
|
||||||
# each initial CSV
|
# each initial CSV
|
||||||
# We use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly.
|
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import csv
|
import csv
|
||||||
@ -14,8 +13,8 @@ import time
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Optional, Set, Tuple
|
from typing import Dict, Optional, Set, Tuple
|
||||||
|
from olmocr.image_utils import convert_image_to_pdf_bytes
|
||||||
|
|
||||||
import img2pdf
|
|
||||||
import requests
|
import requests
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
@ -56,10 +55,10 @@ def download_image(url: str, output_path: Path, max_retries: int = 3) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
|
def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
|
||||||
"""Convert image to PDF using img2pdf."""
|
"""Convert image to PDF."""
|
||||||
try:
|
try:
|
||||||
with open(pdf_path, "wb") as f:
|
with open(pdf_path, "wb") as f:
|
||||||
f.write(img2pdf.convert(str(image_path)))
|
f.write(convert_image_to_pdf_bytes(str(image_path)))
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Failed to convert {image_path} to PDF: {e}")
|
print(f"Failed to convert {image_path} to PDF: {e}")
|
||||||
|
@ -82,7 +82,6 @@
|
|||||||
# Then, for each image, which is typically a scanned document page, we create a dataset in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
|
# Then, for each image, which is typically a scanned document page, we create a dataset in olmocr-format, where you have a .md file and a .pdf file named with the ItemID in a folder structure for
|
||||||
# each initial jsonl file. E.g., if you had rg_341/rg_341-53.jsonl, then you'd make rg_341/object_id.md and rg_341/object_id.pdf
|
# each initial jsonl file. E.g., if you had rg_341/rg_341-53.jsonl, then you'd make rg_341/object_id.md and rg_341/object_id.pdf
|
||||||
# If you have a TIFF file, you can compress it to jpg at 98% quality, targeting around 1-2MB in size.
|
# If you have a TIFF file, you can compress it to jpg at 98% quality, targeting around 1-2MB in size.
|
||||||
# Then use https://pypi.org/project/img2pdf/ to convert the images to PDFs losslessly.
|
|
||||||
# Output files are named as naId-objectId-page-pageNum.{md,pdf} based on the target object from transcriptions.
|
# Output files are named as naId-objectId-page-pageNum.{md,pdf} based on the target object from transcriptions.
|
||||||
# Each JSONL file gets its own subfolder for organization.
|
# Each JSONL file gets its own subfolder for organization.
|
||||||
|
|
||||||
@ -94,11 +93,11 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Optional, Set, Tuple
|
from typing import Dict, Optional, Set, Tuple
|
||||||
|
|
||||||
import img2pdf
|
|
||||||
import requests
|
import requests
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from olmocr.image_utils import convert_image_to_pdf_bytes
|
||||||
|
|
||||||
def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
|
def download_image(url: str, output_path: Path, max_retries: int = 5) -> bool:
|
||||||
"""Download image from URL with exponential backoff retry logic."""
|
"""Download image from URL with exponential backoff retry logic."""
|
||||||
@ -152,7 +151,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa
|
|||||||
|
|
||||||
# Convert JPEG to PDF
|
# Convert JPEG to PDF
|
||||||
with open(output_path, "wb") as f:
|
with open(output_path, "wb") as f:
|
||||||
f.write(img2pdf.convert(str(temp_jpg)))
|
f.write(convert_image_to_pdf_bytes(str(temp_jpg)))
|
||||||
|
|
||||||
# Clean up temp file
|
# Clean up temp file
|
||||||
temp_jpg.unlink(missing_ok=True)
|
temp_jpg.unlink(missing_ok=True)
|
||||||
@ -160,7 +159,7 @@ def process_image_file(image_path: Path, output_path: Path, target_size_mb: floa
|
|||||||
else:
|
else:
|
||||||
# For other formats, convert directly to PDF
|
# For other formats, convert directly to PDF
|
||||||
with open(output_path, "wb") as f:
|
with open(output_path, "wb") as f:
|
||||||
f.write(img2pdf.convert(str(image_path)))
|
f.write(convert_image_to_pdf_bytes(str(image_path)))
|
||||||
|
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -38,7 +38,6 @@ dependencies = [
|
|||||||
"httpx",
|
"httpx",
|
||||||
"torch>=2.7.0",
|
"torch>=2.7.0",
|
||||||
"transformers==4.55.2",
|
"transformers==4.55.2",
|
||||||
"img2pdf",
|
|
||||||
"beaker-py",
|
"beaker-py",
|
||||||
]
|
]
|
||||||
license = {file = "LICENSE"}
|
license = {file = "LICENSE"}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user