mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-14 09:29:32 +00:00
Ok, looks like we have a nice extractor script for the dataset
This commit is contained in:
parent
f19f7c1271
commit
0e17b50583
@ -72,34 +72,39 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
|
|||||||
extracted_dir = pdf_tarballs_dir / "extracted"
|
extracted_dir = pdf_tarballs_dir / "extracted"
|
||||||
extracted_dir.mkdir(exist_ok=True)
|
extracted_dir.mkdir(exist_ok=True)
|
||||||
|
|
||||||
# Find all tarball files
|
# Check if PDFs are already extracted
|
||||||
tarball_files = list(pdf_tarballs_dir.glob("*.tar*")) + list(pdf_tarballs_dir.glob("*.tgz"))
|
existing_pdfs = list(extracted_dir.glob("*.pdf"))
|
||||||
|
if existing_pdfs:
|
||||||
|
print(f"Found {len(existing_pdfs)} already extracted PDFs in {extracted_dir}, skipping extraction step")
|
||||||
|
else:
|
||||||
|
# Find all tarball files
|
||||||
|
tarball_files = list(pdf_tarballs_dir.glob("*.tar*")) + list(pdf_tarballs_dir.glob("*.tgz"))
|
||||||
|
|
||||||
if tarball_files:
|
if tarball_files:
|
||||||
print(f"\nFound {len(tarball_files)} PDF tarballs to extract...")
|
print(f"\nFound {len(tarball_files)} PDF tarballs to extract...")
|
||||||
|
|
||||||
# Use ProcessPoolExecutor for parallel extraction
|
# Use ProcessPoolExecutor for parallel extraction
|
||||||
with ProcessPoolExecutor() as executor:
|
with ProcessPoolExecutor() as executor:
|
||||||
# Submit all tasks
|
# Submit all tasks
|
||||||
future_to_tarball = {}
|
future_to_tarball = {}
|
||||||
for tarball in tarball_files:
|
for tarball in tarball_files:
|
||||||
future = executor.submit(extract_tarball, tarball, extracted_dir)
|
future = executor.submit(extract_tarball, tarball, extracted_dir)
|
||||||
future_to_tarball[future] = tarball
|
future_to_tarball[future] = tarball
|
||||||
|
|
||||||
# Process results as they complete with progress bar
|
# Process results as they complete with progress bar
|
||||||
total_files_extracted = 0
|
total_files_extracted = 0
|
||||||
with tqdm(total=len(tarball_files), desc="Extracting tarballs") as pbar:
|
with tqdm(total=len(tarball_files), desc="Extracting tarballs") as pbar:
|
||||||
for future in as_completed(future_to_tarball):
|
for future in as_completed(future_to_tarball):
|
||||||
tarball = future_to_tarball[future]
|
tarball = future_to_tarball[future]
|
||||||
try:
|
try:
|
||||||
files_extracted = future.result()
|
files_extracted = future.result()
|
||||||
total_files_extracted += files_extracted
|
total_files_extracted += files_extracted
|
||||||
pbar.set_postfix({"files": total_files_extracted})
|
pbar.set_postfix({"files": total_files_extracted})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"\nError with {tarball.name}: {e}")
|
print(f"\nError with {tarball.name}: {e}")
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
|
|
||||||
print(f"Extracted {total_files_extracted} files from tarballs to {extracted_dir}")
|
print(f"Extracted {total_files_extracted} files from tarballs to {extracted_dir}")
|
||||||
else:
|
else:
|
||||||
print(f"No PDF tarballs directory found at {pdf_tarballs_dir}")
|
print(f"No PDF tarballs directory found at {pdf_tarballs_dir}")
|
||||||
|
|
||||||
@ -155,6 +160,19 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
|
|||||||
# Write natural text
|
# Write natural text
|
||||||
f.write(natural_text)
|
f.write(natural_text)
|
||||||
|
|
||||||
|
# Look for matching PDF in extracted directory and create symlinks
|
||||||
|
extracted_pdfs_dir = dest_path / "hugging_face" / "pdf_tarballs" / "extracted"
|
||||||
|
|
||||||
|
# Build the expected PDF path directly from the document ID (no directory search is performed)
|
||||||
|
matched_pdf_path = extracted_pdfs_dir / f"{doc_id}.pdf"
|
||||||
|
assert matched_pdf_path.exists(), "Matching PDF not found"
|
||||||
|
|
||||||
|
symlink_path = output_dir / f"{doc_id[4:]}.pdf"
|
||||||
|
|
||||||
|
# Create a symlink to the PDF (the target is matched_pdf_path as-is; it is not rewritten relative to the link's directory)
|
||||||
|
if not symlink_path.exists():
|
||||||
|
symlink_path.symlink_to(matched_pdf_path)
|
||||||
|
|
||||||
total_processed += 1
|
total_processed += 1
|
||||||
if total_processed % 1000 == 0:
|
if total_processed % 1000 == 0:
|
||||||
print(f"Processed {total_processed} examples...")
|
print(f"Processed {total_processed} examples...")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user