olmocr/tests/test_olmocrmix.py
Jake Poznanski aa239eb34c Lints
2025-10-13 21:15:19 +00:00

95 lines
3.7 KiB
Python

# Test that prepare_olmocrmix.py and repackage_olmocrmix.py work correctly
# by packaging the sample dataset, unpacking it, and verifying contents are preserved
import os
import subprocess
import sys
import tempfile
from pathlib import Path
def test_repackage_and_prepare_olmocrmix():
    """Round-trip the sample dataset through the repackage and prepare scripts.

    The test runs ``olmocr/data/repackage_olmocrmix.py`` on
    ``tests/sample_dataset`` and then ``olmocr/data/prepare_olmocrmix.py`` on
    the packaged output, and asserts that the unpacked tree contains the same
    relative file paths with the same contents as the sample dataset.

    Both scripts and the sample dataset are addressed by relative path, so the
    test must be run from the directory those paths are relative to.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        sample_dataset = Path("tests/sample_dataset")

        # Step 1: Repackage the sample dataset into parquet + tarballs
        packaged_dir = temp_path / "packaged"
        packaged_dir.mkdir(parents=True, exist_ok=True)

        repackage_result = subprocess.run(
            [
                # Use the interpreter running the tests so the script sees the
                # same environment, instead of whatever "python" is on PATH.
                sys.executable,
                "olmocr/data/repackage_olmocrmix.py",
                "--processed-dir",
                str(sample_dataset),
                "--subset",
                "test_subset",
                "--split",
                "test_split",
                "--output-dir",
                str(packaged_dir),
            ],
            capture_output=True,
            text=True,
        )
        assert repackage_result.returncode == 0, f"Repackage script failed with stderr: {repackage_result.stderr}\nstdout: {repackage_result.stdout}"

        # Verify the packaged output exists
        parquet_file = packaged_dir / "test_subset_test_split.parquet"
        assert parquet_file.exists(), f"Expected parquet file not found: {parquet_file}"

        # Step 2: Unpack the packaged dataset with the prepare script
        unpackaged_dir = temp_path / "unpackaged"

        prepare_result = subprocess.run(
            [
                sys.executable,
                "olmocr/data/prepare_olmocrmix.py",
                "--dataset-path",
                str(packaged_dir),
                "--subset",
                "test_subset",
                "--split",
                "test_split",
                "--destination",
                str(unpackaged_dir),
            ],
            capture_output=True,
            text=True,
        )
        # Surface the captured output on failure, as for the repackage step.
        assert prepare_result.returncode == 0, f"Prepare script failed with stderr: {prepare_result.stderr}\nstdout: {prepare_result.stdout}"

        # List every file produced under the temp dir; useful for debugging
        # when one of the assertions below fails.
        for root, _, files in os.walk(temp_path):
            for file_name in files:
                print(Path(root) / file_name)

        unpacked_processed = unpackaged_dir / "processed_test_subset_test_split"
        assert unpacked_processed.exists(), f"Unpacked processed dir missing: {unpacked_processed}"

        def relative_files(root: Path):
            """Return the sorted paths, relative to *root*, of all files under it."""
            return sorted(path.relative_to(root) for path in root.rglob("*") if path.is_file())

        def jsonl_lines(path: Path):
            """Return the set of non-blank lines in the UTF-8 text file *path*."""
            return {line for line in path.read_text(encoding="utf-8").strip().split("\n") if line.strip()}

        sample_files = relative_files(sample_dataset)
        unpacked_files = relative_files(unpacked_processed)
        assert sample_files == unpacked_files, "Mismatch in files between sample dataset and unpacked output"

        for relative_path in sample_files:
            sample_file = sample_dataset / relative_path
            unpacked_file = unpacked_processed / relative_path

            if relative_path.suffix == ".jsonl":
                # For JSONL files, compare as sets of lines (order doesn't matter)
                # Empty lines are filtered out. NOTE: a set comparison also
                # ignores how many times a given line is repeated.
                sample_lines = jsonl_lines(sample_file)
                unpacked_lines = jsonl_lines(unpacked_file)
                assert sample_lines == unpacked_lines, f"JSONL file contents differ for {relative_path}"
            else:
                # For other files, compare as bytes
                sample_contents = sample_file.read_bytes()
                unpacked_contents = unpacked_file.read_bytes()
                assert sample_contents == unpacked_contents, f"File contents differ for {relative_path}"