Mirror of https://github.com/allenai/olmocr.git, synced 2025-12-26 14:47:13 +00:00
URL packaging

commit fc4934c9b4
parent 87a2b8a9a3
@@ -213,6 +213,10 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
     total_processed = 0
     total_errors = 0
 
+    # Create urls.jsonl file for id-to-url mappings
+    urls_file_path = processed_dir / "urls.jsonl"
+    urls_file = open(urls_file_path, "w", encoding="utf-8")
+
     for parquet_file in parquet_files:
         print(f"Processing {parquet_file.name}...")
         df = pd.read_parquet(parquet_file)
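Opening the handle once, before the shard loop, lets every parquet file append to the same urls.jsonl. A with-block around the loop would additionally guarantee the handle is closed on an early error; a minimal sketch of that variant (write_url_mappings is a hypothetical helper, not part of this commit):

import json
from pathlib import Path

def write_url_mappings(processed_dir: Path, rows) -> None:
    # Hypothetical helper: rows is any iterable of dicts carrying "id" and,
    # optionally, "url". The with-block closes the file even if iteration fails.
    with open(processed_dir / "urls.jsonl", "w", encoding="utf-8") as urls_file:
        for row in rows:
            if row.get("url"):
                urls_file.write(json.dumps({"id": row["id"], "url": row["url"]}) + "\n")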
@@ -229,6 +233,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
 
             assert len(doc_id) > 4
 
+            # Extract URL from row and write to urls.jsonl
+            url = row.get("url", None)
+            if url:
+                url_entry = {"id": doc_id, "url": url}
+                urls_file.write(json.dumps(url_entry) + "\n")
+
             # Create folder structure
             # For allenai/olmOCR-mix-0225: use first 4 characters as folder
             # For other datasets: preserve the existing structure
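Each row with a url becomes exactly one JSON object per line, so the file round-trips with plain json.loads. For example, using an entry from the sample fixture added later in this commit:

import json

line = '{"id": "empty_document/blanktext", "url": "https://www.allenai.org"}'
entry = json.loads(line)
assert entry["id"] == "empty_document/blanktext"
assert entry["url"] == "https://www.allenai.org"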
@@ -305,6 +315,10 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
         if max_examples and total_processed >= max_examples:
             break
 
+    # Close the urls.jsonl file
+    urls_file.close()
+    print(f"Created urls.jsonl with {total_processed} id-to-url mappings")
+
     print(f"Completed! Processed {total_processed} examples to {processed_dir}")
     print(f"Total errors: {total_errors}")
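One caveat: the summary line reports total_processed, which counts every processed example, while a mapping is written only for rows that actually carry a url, so the two figures can diverge. A runnable sketch of the discrepancy, using a hypothetical urls_written counter and illustrative rows:

import json

rows = [{"id": "a/1", "url": "https://example.org"}, {"id": "b/2"}]  # illustrative rows
urls_written = 0  # hypothetical counter, not part of the committed diff
with open("urls.jsonl", "w", encoding="utf-8") as urls_file:
    for row in rows:
        url = row.get("url")
        if url:
            urls_file.write(json.dumps({"id": row["id"], "url": url}) + "\n")
            urls_written += 1
print(f"Created urls.jsonl with {urls_written} id-to-url mappings")  # 1, not len(rows)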
@@ -83,6 +83,22 @@ def normalize_response_payload(front_matter: Dict[str, object], body_text: str)
     return payload
 
 
+def load_url_mappings(processed_dir: Path) -> Dict[str, str]:
+    """Load URL mappings from urls.jsonl if it exists."""
+    urls_file = processed_dir / "urls.jsonl"
+    url_map = {}
+
+    if urls_file.exists():
+        print(f"Loading URL mappings from {urls_file}")
+        with open(urls_file, "r", encoding="utf-8") as f:
+            for line in f:
+                entry = json.loads(line.strip())
+                url_map[entry["id"]] = entry["url"]
+        print(f"Loaded {len(url_map)} URL mappings")
+
+    return url_map
+
+
 def guess_url(front_matter: Dict[str, object], doc_id: str, source_url_template: Optional[str]) -> Optional[str]:
     # TODO, we will have to add some better support for this
     return None
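load_url_mappings degrades gracefully: when urls.jsonl is absent it returns an empty dict, so downstream lookups simply fall through to guess_url. Against the sample fixture added in this commit, usage would look like this (assuming load_url_mappings from the hunk above is importable):

from pathlib import Path

url_map = load_url_mappings(Path("tests/sample_dataset"))
print(url_map.get("simple_document/edgar"))
# -> https://en.wikipedia.org/wiki/Edgar,_King_of_England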
@@ -124,6 +140,9 @@ def collect_documents(
         "natural_text",
     }
 
+    # Load URL mappings from urls.jsonl if it exists
+    url_map = load_url_mappings(processed_dir)
+
     parser = FrontMatterParser(front_matter_class=PageResponse)
 
     for md_path in tqdm(md_files, desc="Scanning markdown files"):
@@ -135,7 +154,18 @@ def collect_documents(
         response_payload = normalize_response_payload(front_matter, body_text)
         pdf_size = pdf_path.stat().st_size
         page_number = parse_page_number(doc_id, front_matter)
-        url = guess_url(front_matter, doc_id, url_template)
+
+        # Try to get URL from the loaded url_map
+        # Handle both formats: "0001/234567" and "0001234567"
+        url = url_map.get(doc_id)
+        if not url and "/" in doc_id:
+            # Try combining the parts (e.g., "0001/234567" -> "0001234567")
+            combined_id = doc_id.replace("/", "")
+            url = url_map.get(combined_id)
+        if not url:
+            # Fall back to guess_url if URL not found in map
+            url = guess_url(front_matter, doc_id, url_template)
 
         extras = {k: v for k, v in response_payload.items() if k not in canonical_keys}
         extras_json = json.dumps(extras, ensure_ascii=False) if extras else None
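The lookup tolerates both ID spellings by stripping the path separator and retrying before falling back. A quick demonstration with a hypothetical mapping:

url_map = {"0001234567": "https://example.org/doc"}  # hypothetical mapping

for doc_id in ("0001234567", "0001/234567"):
    url = url_map.get(doc_id)
    if not url and "/" in doc_id:
        url = url_map.get(doc_id.replace("/", ""))
    assert url == "https://example.org/doc"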
tests/sample_dataset/urls.jsonl (new file, 2 lines)

@@ -0,0 +1,2 @@
+{"id": "empty_document/blanktext", "url": "https://www.allenai.org"}
+{"id": "simple_document/edgar", "url": "https://en.wikipedia.org/wiki/Edgar,_King_of_England"}
@@ -78,6 +78,17 @@ def test_repackage_and_prepare_olmocrmix():
     assert sample_files == unpacked_files, "Mismatch in files between sample dataset and unpacked output"
 
     for relative_path in sample_files:
-        sample_contents = (sample_dataset / relative_path).read_bytes()
-        unpacked_contents = (unpacked_processed / relative_path).read_bytes()
-        assert sample_contents == unpacked_contents, f"File contents differ for {relative_path}"
+        sample_file = sample_dataset / relative_path
+        unpacked_file = unpacked_processed / relative_path
+
+        if relative_path.suffix == ".jsonl":
+            # For JSONL files, compare as sets of lines (order doesn't matter)
+            # Filter out empty lines
+            sample_lines = set(line for line in sample_file.read_text().strip().split('\n') if line.strip())
+            unpacked_lines = set(line for line in unpacked_file.read_text().strip().split('\n') if line.strip())
+            assert sample_lines == unpacked_lines, f"JSONL file contents differ for {relative_path}"
+        else:
+            # For other files, compare as bytes
+            sample_contents = sample_file.read_bytes()
+            unpacked_contents = unpacked_file.read_bytes()
+            assert sample_contents == unpacked_contents, f"File contents differ for {relative_path}"
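Comparing raw lines as sets is enough here because the packer emits each record with json.dumps and a stable key order. A stricter variant, robust to key order and whitespace as well, would canonicalize each line before comparing; a sketch (jsonl_records is a hypothetical helper):

import json
from pathlib import Path

def jsonl_records(path: Path) -> set:
    # Canonicalize each record so key-order or whitespace differences
    # do not cause spurious mismatches.
    with open(path, encoding="utf-8") as f:
        return {json.dumps(json.loads(line), sort_keys=True) for line in f if line.strip()}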