From 557bb9a5e96e33de2044bfe8b4c755d014b1c202 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 9 Oct 2025 22:01:01 +0000 Subject: [PATCH] Repackager is still not working right --- olmocr/data/repackage_olmocrmix.py | 2 +- .../{0000 => empty_document}/blanktext.md | 0 .../{0000 => empty_document}/blanktext.pdf | Bin .../{0000 => simple_document}/edgar.md | 0 .../{0000 => simple_document}/edgar.pdf | Bin 5 files changed, 1 insertion(+), 1 deletion(-) rename tests/sample_dataset/{0000 => empty_document}/blanktext.md (100%) rename tests/sample_dataset/{0000 => empty_document}/blanktext.pdf (100%) rename tests/sample_dataset/{0000 => simple_document}/edgar.md (100%) rename tests/sample_dataset/{0000 => simple_document}/edgar.pdf (100%) diff --git a/olmocr/data/repackage_olmocrmix.py b/olmocr/data/repackage_olmocrmix.py index 86ebadb..512435d 100644 --- a/olmocr/data/repackage_olmocrmix.py +++ b/olmocr/data/repackage_olmocrmix.py @@ -54,7 +54,7 @@ def infer_doc_id(md_path: Path, processed_root: Path) -> str: rel = md_path.relative_to(processed_root) if len(rel.parts) < 2: stem = rel.stem - prefix = rel.stem[:4] + prefix = rel.stem else: prefix = rel.parts[0] stem = Path(rel.parts[-1]).stem diff --git a/tests/sample_dataset/0000/blanktext.md b/tests/sample_dataset/empty_document/blanktext.md similarity index 100% rename from tests/sample_dataset/0000/blanktext.md rename to tests/sample_dataset/empty_document/blanktext.md diff --git a/tests/sample_dataset/0000/blanktext.pdf b/tests/sample_dataset/empty_document/blanktext.pdf similarity index 100% rename from tests/sample_dataset/0000/blanktext.pdf rename to tests/sample_dataset/empty_document/blanktext.pdf diff --git a/tests/sample_dataset/0000/edgar.md b/tests/sample_dataset/simple_document/edgar.md similarity index 100% rename from tests/sample_dataset/0000/edgar.md rename to tests/sample_dataset/simple_document/edgar.md diff --git a/tests/sample_dataset/0000/edgar.pdf b/tests/sample_dataset/simple_document/edgar.pdf similarity index 100% rename from tests/sample_dataset/0000/edgar.pdf rename to tests/sample_dataset/simple_document/edgar.pdf