diff --git a/olmocr/data/repackage_olmocrmix.py b/olmocr/data/repackage_olmocrmix.py index 86ebadb..512435d 100644 --- a/olmocr/data/repackage_olmocrmix.py +++ b/olmocr/data/repackage_olmocrmix.py @@ -54,7 +54,7 @@ def infer_doc_id(md_path: Path, processed_root: Path) -> str: rel = md_path.relative_to(processed_root) if len(rel.parts) < 2: stem = rel.stem - prefix = rel.stem[:4] + prefix = rel.stem else: prefix = rel.parts[0] stem = Path(rel.parts[-1]).stem diff --git a/tests/sample_dataset/0000/blanktext.md b/tests/sample_dataset/empty_document/blanktext.md similarity index 100% rename from tests/sample_dataset/0000/blanktext.md rename to tests/sample_dataset/empty_document/blanktext.md diff --git a/tests/sample_dataset/0000/blanktext.pdf b/tests/sample_dataset/empty_document/blanktext.pdf similarity index 100% rename from tests/sample_dataset/0000/blanktext.pdf rename to tests/sample_dataset/empty_document/blanktext.pdf diff --git a/tests/sample_dataset/0000/edgar.md b/tests/sample_dataset/simple_document/edgar.md similarity index 100% rename from tests/sample_dataset/0000/edgar.md rename to tests/sample_dataset/simple_document/edgar.md diff --git a/tests/sample_dataset/0000/edgar.pdf b/tests/sample_dataset/simple_document/edgar.pdf similarity index 100% rename from tests/sample_dataset/0000/edgar.pdf rename to tests/sample_dataset/simple_document/edgar.pdf