Mirror of https://github.com/allenai/olmocr.git (synced 2025-08-18 13:52:17 +00:00)
ELO stuff
This commit is contained in:
parent 18f72b4e1b
commit c74e3d1440
pdelfin/data/buildtestset.py (Normal file, 217 lines added)
@@ -0,0 +1,217 @@
import os
import glob
import random
import argparse
import boto3
import base64
from pypdf import PdfReader, PdfWriter
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from urllib.parse import urlparse
from typing import List

from pdelfin.data.renderpdf import render_pdf_to_base64png
from pdelfin.filter import PdfFilter

pdf_filter = PdfFilter()

def sample_pdf_pages(num_pages: int, first_n_pages: int, max_sample_pages: int) -> List[int]:
    """
    Returns a list of sampled page indices (1-based).
    - Always include the first_n_pages (or all pages if num_pages < first_n_pages).
    - Randomly sample the remaining pages up to a total of max_sample_pages.
    """
    if num_pages <= first_n_pages:
        return list(range(1, num_pages + 1))
    sample_pages = list(range(1, first_n_pages + 1))
    remaining_pages = list(range(first_n_pages + 1, num_pages + 1))
    if remaining_pages:
        # How many random pages to pick beyond the first_n_pages
        random_pick = min(max_sample_pages - first_n_pages, len(remaining_pages))
        sample_pages += random.sample(remaining_pages, random_pick)
    return sample_pages

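# A quick illustration of the sampling behavior (the specific page numbers below are hypothetical):
#   sample_pdf_pages(num_pages=3, first_n_pages=5, max_sample_pages=5)  -> [1, 2, 3]         (short doc: keep all pages)
#   sample_pdf_pages(num_pages=50, first_n_pages=1, max_sample_pages=3) -> e.g. [1, 17, 42]  (page 1 plus two random picks)
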
def fetch_s3_file(s3_url: str, local_path: str) -> str:
    """
    Download a file from an S3 URI (s3://bucket/key) to local_path.
    """
    parsed = urlparse(s3_url)
    bucket_name = parsed.netloc
    key = parsed.path.lstrip('/')
    s3 = boto3.client('s3')
    s3.download_file(bucket_name, key, local_path)
    return local_path

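# Usage sketch (bucket and key are hypothetical); boto3 resolves credentials from its
# standard environment/config chain:
#   fetch_s3_file("s3://my-bucket/pdfs/report.pdf", "/tmp/report.pdf")
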
def extract_single_page_pdf(input_pdf_path: str, page_number: int, output_pdf_path: str) -> None:
    """
    Extracts exactly one page (page_number, 1-based) from input_pdf_path
    and writes to output_pdf_path.
    """
    reader = PdfReader(input_pdf_path)
    writer = PdfWriter()
    # Page numbers in PdfReader are 0-based
    writer.add_page(reader.pages[page_number - 1])
    with open(output_pdf_path, "wb") as f:
        writer.write(f)


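# For example (hypothetical paths), this writes a new PDF containing only page 3 of the input:
#   extract_single_page_pdf("/tmp/report.pdf", 3, "/tmp/report_page3.pdf")
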
def process_pdf(
    pdf_path: str,
    first_n_pages: int,
    max_sample_pages: int,
    no_filter: bool,
    output_dir: str
):
    """
    - Download the PDF locally if it's in S3.
    - Optionally filter the PDF (if no_filter=False).
    - Sample the pages.
    - For each sampled page, extract a one-page PDF and also render it to PNG.
    """
    if pdf_path.startswith("s3://"):
        local_pdf_path = os.path.join("/tmp", os.path.basename(pdf_path))
        fetch_s3_file(pdf_path, local_pdf_path)
    else:
        local_pdf_path = pdf_path

    if (not no_filter) and pdf_filter.filter_out_pdf(local_pdf_path):
        print(f"Skipping {local_pdf_path} due to filter.")
        return False

    # Make sure we have an absolute path for the PDF name
    base_pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    reader = PdfReader(local_pdf_path)
    num_pages = len(reader.pages)

    sampled_pages = sample_pdf_pages(num_pages, first_n_pages, max_sample_pages)

    # For each sampled page, produce a single-page PDF and a PNG
    for page_num in sampled_pages:
        single_pdf_name = f"{base_pdf_name}_page{page_num}.pdf"
        single_png_name = f"{base_pdf_name}_page{page_num}.png"

        single_pdf_path = os.path.join(output_dir, single_pdf_name)
        single_png_path = os.path.join(output_dir, single_png_name)

        try:
            # 1) Extract single-page PDF
            extract_single_page_pdf(local_pdf_path, page_num, single_pdf_path)

            # 2) Render that single-page PDF to a PNG
            b64png = render_pdf_to_base64png(single_pdf_path, page_num=0, target_longest_image_dim=1024)

            with open(single_png_path, "wb") as pngf:
                pngf.write(base64.b64decode(b64png))

        except Exception as e:
            print(f"Error while processing {pdf_path}, page {page_num}: {e}")

    return True

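# Note: process_pdf returns False only when the PDF is filtered out; per-page extraction or
# rendering errors are caught and logged above, so the function still returns True in that case.
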
def main():
    parser = argparse.ArgumentParser(description="Sample PDFs, extract single-page PDFs, and render them as PNG.")
    parser.add_argument("--glob_path", type=str, help="Local or S3 path glob (e.g., *.pdf or s3://bucket/pdfs/*.pdf).")
    parser.add_argument("--path_list", type=str, help="Path to a file containing paths to PDFs, one per line.")
    parser.add_argument("--no_filter", action="store_true", help="Disables filtering so that ALL PDFs are processed.")
    parser.add_argument("--num_sample_docs", type=int, default=2000, help="Number of PDF documents to sample.")
    parser.add_argument("--first_n_pages", type=int, default=0, help="Always sample the first N pages of each PDF.")
    parser.add_argument("--max_sample_pages", type=int, default=1, help="Max number of pages to sample per PDF.")
    parser.add_argument("--output_dir", type=str, default="sampled_pages_output", help="Output directory for the extracted PDFs and PNGs.")
    parser.add_argument("--reservoir_size", type=int, default=None,
                        help="Size of the reservoir for sampling paths. Defaults to 10x num_sample_docs.")
    args = parser.parse_args()

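    # Example invocation (assuming pdelfin is importable; the bucket and paths are hypothetical):
    #   python -m pdelfin.data.buildtestset --glob_path "s3://my-bucket/pdfs/*.pdf" \
    #       --num_sample_docs 500 --max_sample_pages 2 --output_dir sampled_pages_output
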
    # Set default reservoir_size if not provided
    if args.reservoir_size is None:
        args.reservoir_size = 10 * args.num_sample_docs

    os.makedirs(args.output_dir, exist_ok=True)

    # Reservoir sample for PDF paths
    pdf_paths = []
    n = 0  # total number of items seen

    # Either load from glob or from path_list
    if args.glob_path:
        if args.glob_path.startswith("s3://"):
            # Handle S3 globbing
            parsed = urlparse(args.glob_path)
            s3 = boto3.client('s3')
            bucket_name = parsed.netloc
            prefix = os.path.dirname(parsed.path.lstrip('/')) + "/"
            paginator = s3.get_paginator('list_objects_v2')
            page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

            for page in page_iterator:
                for obj in page.get('Contents', []):
                    if obj['Key'].endswith('.pdf'):
                        n += 1
                        path = f"s3://{bucket_name}/{obj['Key']}"
                        if len(pdf_paths) < args.reservoir_size:
                            pdf_paths.append(path)
                        else:
                            s = random.randint(1, n)
                            if s <= args.reservoir_size:
                                pdf_paths[s - 1] = path
        else:
            # Handle local globbing
            for path in glob.iglob(args.glob_path, recursive=True):
                n += 1
                if len(pdf_paths) < args.reservoir_size:
                    pdf_paths.append(path)
                else:
                    s = random.randint(1, n)
                    if s <= args.reservoir_size:
                        pdf_paths[s - 1] = path
    elif args.path_list:
        with open(args.path_list, 'r') as f:
            for line in f:
                path = line.strip()
                if not path:
                    continue
                n += 1
                if len(pdf_paths) < args.reservoir_size:
                    pdf_paths.append(path)
                else:
                    s = random.randint(1, n)
                    if s <= args.reservoir_size:
                        pdf_paths[s - 1] = path

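    # Note on the reservoir update above: once the reservoir is full, the n-th path seen
    # replaces a uniformly chosen slot with probability reservoir_size / n (classic
    # Algorithm R), so every path scanned has an equal chance of ending up in pdf_paths
    # without keeping the full listing in memory.
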
    # Shuffle the reservoir so we don't always pick from the front
    random.shuffle(pdf_paths)
    print(f"Loaded and shuffled {len(pdf_paths)} PDF paths. Will process up to {args.num_sample_docs} of them.")

    pdfs_with_output = 0

    # Use a ProcessPoolExecutor to parallelize PDF processing
    # You may reduce max_workers if you have memory/CPU constraints
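    # (e.g., ProcessPoolExecutor(max_workers=8); that value is only illustrative)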
    with ProcessPoolExecutor() as executor:
        futures = {}
        # Submit tasks
        for pdf_path in pdf_paths:
            future = executor.submit(
                process_pdf,
                pdf_path,
                args.first_n_pages,
                args.max_sample_pages,
                args.no_filter,
                args.output_dir
            )
            futures[future] = pdf_path

        # Track completion
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing PDFs"):
            if future.result():
                pdfs_with_output += 1
            if pdfs_with_output >= args.num_sample_docs:
                # Cancel remaining tasks
                executor.shutdown(cancel_futures=True)
                break

    print(f"Done. Processed or attempted to process {pdfs_with_output} PDFs. Output is in: {args.output_dir}")


if __name__ == "__main__":
    main()
@ -164,6 +164,9 @@ if __name__ == "__main__":
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing {pdf_path}: {str(e)}")
|
print(f"Error processing {pdf_path}: {str(e)}")
|
||||||
|
|
||||||
|
# Remove all results where the alignment is > 0.96 as these are just too similar to be useful
|
||||||
|
all_comps = [c for c in all_comps if c.alignment < 0.96]
|
||||||
|
|
||||||
# Shuffle the results
|
# Shuffle the results
|
||||||
random.shuffle(all_comps)
|
random.shuffle(all_comps)
|
||||||
|
|
||||||
|
@@ -18,7 +18,7 @@
 }
 .container {
     width: 100%;
-    max-width: 1200px;
+    max-width: 1600px;
     margin: 0 auto;
 }
 .entry {
@@ -280,7 +280,25 @@ def make_report(urls):
 if __name__ == "__main__":
     # Example usage
     urls = [
-        "https://jakep-tinyhost.s3.amazonaws.com/review_page-681aae527593.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=BR1nqCUKQLBlh3HIsHjeyRVQumI%3D&Expires=1737500018",
-        # Add more URLs here...
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_0-e09ebadf34a7.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=JEQpJxSaMIHuc9DFHyfHuxx0dEU%3D&Expires=1737654586",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_1-c2d267f97a73.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=KMiOTQiFEvgxU94ZrlJRFAgSQZA%3D&Expires=1737654587",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_10-b806c811fb67.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NaoHNU2ZmEGrgMsxg2JHK%2Fv5zd0%3D&Expires=1737654587",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_11-19c1936b4372.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=BjkVydyKjzzH3uZiZ1GkWAk6cbk%3D&Expires=1737654588",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_12-cd41808a7974.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jsk8TzJTKJwHi1Ru4%2Bw%2BiHZG638%3D&Expires=1737654589",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_13-8b055079b5eb.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=SE7kkobEBip44O8JY5axoMTV2Bs%3D&Expires=1737654590",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_14-1126e0da563c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jLSEWpDUzpmS8P9mNXbBoDYDOwU%3D&Expires=1737654590",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_15-05704e3d000d.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=TaCbyv2%2FDGCnCOgTzUvfEXdO%2Fmo%3D&Expires=1737654591",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_16-e57f795a89da.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=stqm1etAfDIpAQGNvZwe9c%2BYUbA%3D&Expires=1737654592",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_17-041a6d042764.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=rOTroBcSqCh3oM65bOJHEfaeal8%3D&Expires=1737654592",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_18-7a29697cee63.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=abmYM9KtzjicmdacRykPWXCdQr0%3D&Expires=1737654593",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_19-d32f14c067f8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=iVg3nxrZXVpYybkLJIgOEJ3v37E%3D&Expires=1737654594",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_2-43c553548e69.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=IK27gl7b6NY05YNnnsimMVJc99I%3D&Expires=1737654595",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_3-fb42a458ecd5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=d1qevJe8ZQONnu7zezYSJe3cbBw%3D&Expires=1737654595",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_4-76a50eed331a.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=qwZu2q1H4Y%2Bf3Kw7DNSYcTxwI7A%3D&Expires=1737654596",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_5-150b4d3583de.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=c%2FeqjnDSIRirgQviFWRLWVowKmA%3D&Expires=1737654597",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_6-6ca285526fd3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=tkWDDuRinY77BLQCqumtlMiFJU8%3D&Expires=1737654598",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_7-01d711ee8bf7.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=eQtFo6CHJYHGu85wK0YG5khlE5U%3D&Expires=1737654598",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_8-0f36b852f274.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=weI3WB8vhjBYjk6t85DmyLdP97k%3D&Expires=1737654599",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_9-115e33463fd2.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=b4CpkHprCUtZoL0u%2FFYzsu%2BB1yU%3D&Expires=1737654600",
     ]
     make_report(urls)