URL packaging

Jake Poznanski 2025-10-10 16:52:42 +00:00
parent 87a2b8a9a3
commit fc4934c9b4
4 changed files with 61 additions and 4 deletions

View File

@@ -213,6 +213,10 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
     total_processed = 0
     total_errors = 0
 
+    # Create urls.jsonl file for id-to-url mappings
+    urls_file_path = processed_dir / "urls.jsonl"
+    urls_file = open(urls_file_path, "w", encoding="utf-8")
+
     for parquet_file in parquet_files:
         print(f"Processing {parquet_file.name}...")
         df = pd.read_parquet(parquet_file)
@@ -229,6 +233,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
             assert len(doc_id) > 4
 
+            # Extract URL from row and write to urls.jsonl
+            url = row.get("url", None)
+            if url:
+                url_entry = {"id": doc_id, "url": url}
+                urls_file.write(json.dumps(url_entry) + "\n")
+
             # Create folder structure
             # For allenai/olmOCR-mix-0225: use first 4 characters as folder
             # For other datasets: preserve the existing structure
@@ -305,6 +315,10 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
         if max_examples and total_processed >= max_examples:
             break
 
+    # Close the urls.jsonl file
+    urls_file.close()
+    print(f"Created urls.jsonl with {total_processed} id-to-url mappings")
+
     print(f"Completed! Processed {total_processed} examples to {processed_dir}")
     print(f"Total errors: {total_errors}")

View File

@@ -83,6 +83,22 @@ def normalize_response_payload(front_matter: Dict[str, object], body_text: str)
     return payload
 
 
+def load_url_mappings(processed_dir: Path) -> Dict[str, str]:
+    """Load URL mappings from urls.jsonl if it exists."""
+    urls_file = processed_dir / "urls.jsonl"
+    url_map = {}
+    if urls_file.exists():
+        print(f"Loading URL mappings from {urls_file}")
+        with open(urls_file, "r", encoding="utf-8") as f:
+            for line in f:
+                entry = json.loads(line.strip())
+                url_map[entry["id"]] = entry["url"]
+        print(f"Loaded {len(url_map)} URL mappings")
+    return url_map
+
+
 def guess_url(front_matter: Dict[str, object], doc_id: str, source_url_template: Optional[str]) -> Optional[str]:
     # TODO, we will have to add some better support for this
     return None
@@ -124,6 +140,9 @@ def collect_documents(
         "natural_text",
     }
 
+    # Load URL mappings from urls.jsonl if it exists
+    url_map = load_url_mappings(processed_dir)
+
     parser = FrontMatterParser(front_matter_class=PageResponse)
 
     for md_path in tqdm(md_files, desc="Scanning markdown files"):
@@ -135,7 +154,18 @@ def collect_documents(
         response_payload = normalize_response_payload(front_matter, body_text)
         pdf_size = pdf_path.stat().st_size
         page_number = parse_page_number(doc_id, front_matter)
-        url = guess_url(front_matter, doc_id, url_template)
+
+        # Try to get URL from the loaded url_map
+        # Handle both formats: "0001/234567" and "0001234567"
+        url = url_map.get(doc_id)
+        if not url and "/" in doc_id:
+            # Try combining the parts (e.g., "0001/234567" -> "0001234567")
+            combined_id = doc_id.replace("/", "")
+            url = url_map.get(combined_id)
+        if not url:
+            # Fall back to guess_url if URL not found in map
+            url = guess_url(front_matter, doc_id, url_template)
+
         extras = {k: v for k, v in response_payload.items() if k not in canonical_keys}
         extras_json = json.dumps(extras, ensure_ascii=False) if extras else None
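
The id normalization is the subtle part of this hunk: collect_documents sees folder-style ids like "0001/234567", while urls.jsonl may key the same document without the separator. A standalone sketch of that lookup (lookup_url is a hypothetical name, not in the commit, and the guess_url fallback is omitted):

from typing import Dict, Optional

def lookup_url(url_map: Dict[str, str], doc_id: str) -> Optional[str]:
    # Exact match first
    url = url_map.get(doc_id)
    if url is None and "/" in doc_id:
        # Folder-style id: drop the separator ("0001/234567" -> "0001234567")
        url = url_map.get(doc_id.replace("/", ""))
    return url

# Both spellings of the id resolve to the same URL
url_map = {"0001234567": "https://example.com/doc.pdf"}
assert lookup_url(url_map, "0001/234567") == "https://example.com/doc.pdf"
assert lookup_url(url_map, "0001234567") == "https://example.com/doc.pdf"
assert lookup_url(url_map, "missing") is None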

View File

@@ -0,0 +1,2 @@
+{"id": "empty_document/blanktext", "url": "https://www.allenai.org"}
+{"id": "simple_document/edgar", "url": "https://en.wikipedia.org/wiki/Edgar,_King_of_England"}

View File

@@ -78,6 +78,17 @@ def test_repackage_and_prepare_olmocrmix():
     assert sample_files == unpacked_files, "Mismatch in files between sample dataset and unpacked output"
 
     for relative_path in sample_files:
-        sample_contents = (sample_dataset / relative_path).read_bytes()
-        unpacked_contents = (unpacked_processed / relative_path).read_bytes()
-        assert sample_contents == unpacked_contents, f"File contents differ for {relative_path}"
+        sample_file = sample_dataset / relative_path
+        unpacked_file = unpacked_processed / relative_path
+
+        if relative_path.suffix == ".jsonl":
+            # For JSONL files, compare as sets of lines (order doesn't matter)
+            # Filter out empty lines
+            sample_lines = set(line for line in sample_file.read_text().strip().split('\n') if line.strip())
+            unpacked_lines = set(line for line in unpacked_file.read_text().strip().split('\n') if line.strip())
+            assert sample_lines == unpacked_lines, f"JSONL file contents differ for {relative_path}"
+        else:
+            # For other files, compare as bytes
+            sample_contents = sample_file.read_bytes()
+            unpacked_contents = unpacked_file.read_bytes()
+            assert sample_contents == unpacked_contents, f"File contents differ for {relative_path}"
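
The test relaxes the byte-for-byte check for .jsonl files, presumably because urls.jsonl is rebuilt during repackaging and the same mappings can come out in a different line order. A minimal sketch of that order-insensitive comparison, using hypothetical files a.jsonl and b.jsonl:

from pathlib import Path

def jsonl_lines(path: Path) -> set:
    # Order-insensitive view of a JSONL file: the set of its non-empty lines
    return {line for line in path.read_text().splitlines() if line.strip()}

# Same entries in a different order still compare equal
a, b = Path("a.jsonl"), Path("b.jsonl")
a.write_text('{"id": "x", "url": "https://example.com/1"}\n{"id": "y", "url": "https://example.com/2"}\n')
b.write_text('{"id": "y", "url": "https://example.com/2"}\n{"id": "x", "url": "https://example.com/1"}\n')
assert jsonl_lines(a) == jsonl_lines(b)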