This commit is contained in:
Jake Poznanski 2025-10-13 21:15:19 +00:00
parent 369fd4d23a
commit aa239eb34c
2 changed files with 3 additions and 3 deletions

View File

@@ -18,7 +18,7 @@ import torch
import torch.distributed as dist import torch.distributed as dist
import wandb import wandb
from PIL import Image from PIL import Image
from rapidfuzz import fuzz, distance from rapidfuzz import distance, fuzz
from torch.utils.data import Dataset from torch.utils.data import Dataset
from transformers import ( from transformers import (
AutoProcessor, AutoProcessor,

View File

@@ -84,8 +84,8 @@ def test_repackage_and_prepare_olmocrmix():
if relative_path.suffix == ".jsonl": if relative_path.suffix == ".jsonl":
# For JSONL files, compare as sets of lines (order doesn't matter) # For JSONL files, compare as sets of lines (order doesn't matter)
# Filter out empty lines # Filter out empty lines
sample_lines = set(line for line in sample_file.read_text().strip().split('\n') if line.strip()) sample_lines = set(line for line in sample_file.read_text().strip().split("\n") if line.strip())
unpacked_lines = set(line for line in unpacked_file.read_text().strip().split('\n') if line.strip()) unpacked_lines = set(line for line in unpacked_file.read_text().strip().split("\n") if line.strip())
assert sample_lines == unpacked_lines, f"JSONL file contents differ for {relative_path}" assert sample_lines == unpacked_lines, f"JSONL file contents differ for {relative_path}"
else: else:
# For other files, compare as bytes # For other files, compare as bytes