Mirror of https://github.com/allenai/olmocr.git, synced 2025-12-26 14:47:13 +00:00
URL packaging

commit fc4934c9b4
parent 87a2b8a9a3
@@ -213,6 +213,10 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
     total_processed = 0
     total_errors = 0
 
+    # Create urls.jsonl file for id-to-url mappings
+    urls_file_path = processed_dir / "urls.jsonl"
+    urls_file = open(urls_file_path, "w", encoding="utf-8")
+
     for parquet_file in parquet_files:
         print(f"Processing {parquet_file.name}...")
         df = pd.read_parquet(parquet_file)
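Opening the handle once, before the shard loop, lets every parquet file append to the same urls.jsonl. A with-block around the loop would additionally guarantee the handle is closed on an early error; a minimal sketch of that variant (write_url_mappings is a hypothetical helper, not part of this commit):

import json
from pathlib import Path

def write_url_mappings(processed_dir: Path, rows) -> None:
    # Hypothetical helper: rows is any iterable of dicts carrying "id" and,
    # optionally, "url". The with-block closes the file even if iteration fails.
    with open(processed_dir / "urls.jsonl", "w", encoding="utf-8") as urls_file:
        for row in rows:
            if row.get("url"):
                urls_file.write(json.dumps({"id": row["id"], "url": row["url"]}) + "\n")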
@@ -229,6 +233,12 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
 
             assert len(doc_id) > 4
 
+            # Extract URL from row and write to urls.jsonl
+            url = row.get("url", None)
+            if url:
+                url_entry = {"id": doc_id, "url": url}
+                urls_file.write(json.dumps(url_entry) + "\n")
+
             # Create folder structure
             # For allenai/olmOCR-mix-0225: use first 4 characters as folder
             # For other datasets: preserve the existing structure
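Each row with a url becomes exactly one JSON object per line, so the file round-trips with plain json.loads. For example, using an entry from the sample fixture added later in this commit:

import json

line = '{"id": "empty_document/blanktext", "url": "https://www.allenai.org"}'
entry = json.loads(line)
assert entry["id"] == "empty_document/blanktext"
assert entry["url"] == "https://www.allenai.org"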
@@ -305,6 +315,10 @@ def prepare_olmocr_mix(dataset_path: str, subset: str, split: str, destination:
         if max_examples and total_processed >= max_examples:
             break
 
+    # Close the urls.jsonl file
+    urls_file.close()
+    print(f"Created urls.jsonl with {total_processed} id-to-url mappings")
+
     print(f"Completed! Processed {total_processed} examples to {processed_dir}")
     print(f"Total errors: {total_errors}")
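One caveat: the summary line reports total_processed, which counts every processed example, while a mapping is written only for rows that actually carry a url, so the two figures can diverge. A runnable sketch of the discrepancy, using a hypothetical urls_written counter and illustrative rows:

import json

rows = [{"id": "a/1", "url": "https://example.org"}, {"id": "b/2"}]  # illustrative rows
urls_written = 0  # hypothetical counter, not part of the committed diff
with open("urls.jsonl", "w", encoding="utf-8") as urls_file:
    for row in rows:
        url = row.get("url")
        if url:
            urls_file.write(json.dumps({"id": row["id"], "url": url}) + "\n")
            urls_written += 1
print(f"Created urls.jsonl with {urls_written} id-to-url mappings")  # 1, not len(rows)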
@@ -83,6 +83,22 @@ def normalize_response_payload(front_matter: Dict[str, object], body_text: str)
     return payload
 
 
+def load_url_mappings(processed_dir: Path) -> Dict[str, str]:
+    """Load URL mappings from urls.jsonl if it exists."""
+    urls_file = processed_dir / "urls.jsonl"
+    url_map = {}
+
+    if urls_file.exists():
+        print(f"Loading URL mappings from {urls_file}")
+        with open(urls_file, "r", encoding="utf-8") as f:
+            for line in f:
+                entry = json.loads(line.strip())
+                url_map[entry["id"]] = entry["url"]
+        print(f"Loaded {len(url_map)} URL mappings")
+
+    return url_map
+
+
 def guess_url(front_matter: Dict[str, object], doc_id: str, source_url_template: Optional[str]) -> Optional[str]:
     # TODO, we will have to add some better support for this
     return None
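load_url_mappings degrades gracefully: when urls.jsonl is absent it returns an empty dict, so downstream lookups simply fall through to guess_url. Against the sample fixture added in this commit, usage would look like this (assuming load_url_mappings from the hunk above is importable):

from pathlib import Path

url_map = load_url_mappings(Path("tests/sample_dataset"))
print(url_map.get("simple_document/edgar"))
# -> https://en.wikipedia.org/wiki/Edgar,_King_of_England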
@@ -124,6 +140,9 @@ def collect_documents(
         "natural_text",
     }
 
+    # Load URL mappings from urls.jsonl if it exists
+    url_map = load_url_mappings(processed_dir)
+
     parser = FrontMatterParser(front_matter_class=PageResponse)
 
     for md_path in tqdm(md_files, desc="Scanning markdown files"):
@@ -135,7 +154,18 @@ def collect_documents(
         response_payload = normalize_response_payload(front_matter, body_text)
         pdf_size = pdf_path.stat().st_size
         page_number = parse_page_number(doc_id, front_matter)
-        url = guess_url(front_matter, doc_id, url_template)
+
+        # Try to get URL from the loaded url_map
+        # Handle both formats: "0001/234567" and "0001234567"
+        url = url_map.get(doc_id)
+        if not url and "/" in doc_id:
+            # Try combining the parts (e.g., "0001/234567" -> "0001234567")
+            combined_id = doc_id.replace("/", "")
+            url = url_map.get(combined_id)
+        if not url:
+            # Fall back to guess_url if URL not found in map
+            url = guess_url(front_matter, doc_id, url_template)
 
         extras = {k: v for k, v in response_payload.items() if k not in canonical_keys}
         extras_json = json.dumps(extras, ensure_ascii=False) if extras else None
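The lookup tolerates both ID spellings by stripping the path separator and retrying before falling back. A quick demonstration with a hypothetical mapping:

url_map = {"0001234567": "https://example.org/doc"}  # hypothetical mapping

for doc_id in ("0001234567", "0001/234567"):
    url = url_map.get(doc_id)
    if not url and "/" in doc_id:
        url = url_map.get(doc_id.replace("/", ""))
    assert url == "https://example.org/doc"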
tests/sample_dataset/urls.jsonl (new file, 2 lines)

@@ -0,0 +1,2 @@
+{"id": "empty_document/blanktext", "url": "https://www.allenai.org"}
+{"id": "simple_document/edgar", "url": "https://en.wikipedia.org/wiki/Edgar,_King_of_England"}
@@ -78,6 +78,17 @@ def test_repackage_and_prepare_olmocrmix():
     assert sample_files == unpacked_files, "Mismatch in files between sample dataset and unpacked output"
 
     for relative_path in sample_files:
-        sample_contents = (sample_dataset / relative_path).read_bytes()
-        unpacked_contents = (unpacked_processed / relative_path).read_bytes()
-        assert sample_contents == unpacked_contents, f"File contents differ for {relative_path}"
+        sample_file = sample_dataset / relative_path
+        unpacked_file = unpacked_processed / relative_path
+
+        if relative_path.suffix == ".jsonl":
+            # For JSONL files, compare as sets of lines (order doesn't matter)
+            # Filter out empty lines
+            sample_lines = set(line for line in sample_file.read_text().strip().split('\n') if line.strip())
+            unpacked_lines = set(line for line in unpacked_file.read_text().strip().split('\n') if line.strip())
+            assert sample_lines == unpacked_lines, f"JSONL file contents differ for {relative_path}"
+        else:
+            # For other files, compare as bytes
+            sample_contents = sample_file.read_bytes()
+            unpacked_contents = unpacked_file.read_bytes()
+            assert sample_contents == unpacked_contents, f"File contents differ for {relative_path}"
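Comparing raw lines as sets is enough here because the packer emits each record with json.dumps and a stable key order. A stricter variant, robust to key order and whitespace as well, would canonicalize each line before comparing; a sketch (jsonl_records is a hypothetical helper):

import json
from pathlib import Path

def jsonl_records(path: Path) -> set:
    # Canonicalize each record so key-order or whitespace differences
    # do not cause spurious mismatches.
    with open(path, encoding="utf-8") as f:
        return {json.dumps(json.loads(line), sort_keys=True) for line in f if line.strip()}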