This commit is contained in:
Jake Poznanski 2025-10-13 21:15:19 +00:00
parent 369fd4d23a
commit aa239eb34c
2 changed files with 3 additions and 3 deletions

View File

@@ -18,7 +18,7 @@ import torch
import torch.distributed as dist
import wandb
from PIL import Image
from rapidfuzz import fuzz, distance
from rapidfuzz import distance, fuzz
from torch.utils.data import Dataset
from transformers import (
AutoProcessor,

View File

@@ -84,8 +84,8 @@ def test_repackage_and_prepare_olmocrmix():
if relative_path.suffix == ".jsonl":
# For JSONL files, compare as sets of lines (order doesn't matter)
# Filter out empty lines
sample_lines = set(line for line in sample_file.read_text().strip().split('\n') if line.strip())
unpacked_lines = set(line for line in unpacked_file.read_text().strip().split('\n') if line.strip())
sample_lines = set(line for line in sample_file.read_text().strip().split("\n") if line.strip())
unpacked_lines = set(line for line in unpacked_file.read_text().strip().split("\n") if line.strip())
assert sample_lines == unpacked_lines, f"JSONL file contents differ for {relative_path}"
else:
# For other files, compare as bytes