diff --git a/pdelfin/data/buildtestset.py b/pdelfin/data/buildtestset.py
new file mode 100644
index 0000000..4eb973a
--- /dev/null
+++ b/pdelfin/data/buildtestset.py
@@ -0,0 +1,217 @@
+import os
+import glob
+import random
+import argparse
+import boto3
+import base64
+from pypdf import PdfReader, PdfWriter
+from tqdm import tqdm
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from urllib.parse import urlparse
+from typing import List
+
+from pdelfin.data.renderpdf import render_pdf_to_base64png
+from pdelfin.filter import PdfFilter
+
+pdf_filter = PdfFilter()
+
+def sample_pdf_pages(num_pages: int, first_n_pages: int, max_sample_pages: int) -> List[int]:
+    """
+    Returns a list of sampled page indices (1-based).
+    - Always include the first_n_pages (or all pages if num_pages < first_n_pages).
+    - Randomly sample the remaining pages up to a total of max_sample_pages.
+    """
+    if num_pages <= first_n_pages:
+        return list(range(1, num_pages + 1))
+    sample_pages = list(range(1, first_n_pages + 1))
+    remaining_pages = list(range(first_n_pages + 1, num_pages + 1))
+    if remaining_pages:
+        # How many random pages to pick beyond the first_n_pages
+        random_pick = min(max_sample_pages - first_n_pages, len(remaining_pages))
+        sample_pages += random.sample(remaining_pages, random_pick)
+    return sample_pages
+
+def fetch_s3_file(s3_url: str, local_path: str) -> str:
+    """
+    Download a file from an S3 URI (s3://bucket/key) to local_path.
+    """
+    parsed = urlparse(s3_url)
+    bucket_name = parsed.netloc
+    key = parsed.path.lstrip('/')
+    s3 = boto3.client('s3')
+    s3.download_file(bucket_name, key, local_path)
+    return local_path
+
+def extract_single_page_pdf(input_pdf_path: str, page_number: int, output_pdf_path: str) -> None:
+    """
+    Extracts exactly one page (page_number, 1-based) from input_pdf_path
+    and writes to output_pdf_path.
+    """
+    reader = PdfReader(input_pdf_path)
+    writer = PdfWriter()
+    # Page numbers in PdfReader are 0-based
+    writer.add_page(reader.pages[page_number - 1])
+    with open(output_pdf_path, "wb") as f:
+        writer.write(f)
+
+
+def process_pdf(
+    pdf_path: str,
+    first_n_pages: int,
+    max_sample_pages: int,
+    no_filter: bool,
+    output_dir: str
+):
+    """
+    - Download the PDF locally if it's in S3.
+    - Optionally filter the PDF (if no_filter=False).
+    - Sample the pages.
+    - For each sampled page, extract a one-page PDF and also render it to PNG.
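+    Returns True if the PDF was processed (individual pages may still fail and are skipped),
+    False if it was filtered out.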
+ """ + if pdf_path.startswith("s3://"): + local_pdf_path = os.path.join("/tmp", os.path.basename(pdf_path)) + fetch_s3_file(pdf_path, local_pdf_path) + else: + local_pdf_path = pdf_path + + if (not no_filter) and pdf_filter.filter_out_pdf(local_pdf_path): + print(f"Skipping {local_pdf_path} due to filter.") + return False + + # Make sure we have an absolute path for the PDF name + base_pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] + + reader = PdfReader(local_pdf_path) + num_pages = len(reader.pages) + + sampled_pages = sample_pdf_pages(num_pages, first_n_pages, max_sample_pages) + + # For each sampled page, produce a single-page PDF and a PNG + for page_num in sampled_pages: + single_pdf_name = f"{base_pdf_name}_page{page_num}.pdf" + single_png_name = f"{base_pdf_name}_page{page_num}.png" + + single_pdf_path = os.path.join(output_dir, single_pdf_name) + single_png_path = os.path.join(output_dir, single_png_name) + + + try: + # 1) Extract single-page PDF + extract_single_page_pdf(local_pdf_path, page_num, single_pdf_path) + + # 2) Render that single-page PDF to a PNG + b64png = render_pdf_to_base64png(single_pdf_path, page_num=0, target_longest_image_dim=1024) + + with open(single_png_path, "wb") as pngf: + pngf.write(base64.b64decode(b64png)) + + except Exception as e: + print(f"Error while processing {pdf_path}, page {page_num}: {e}") + + return True + +def main(): + parser = argparse.ArgumentParser(description="Sample PDFs, extract single-page PDFs, and render them as PNG.") + parser.add_argument("--glob_path", type=str, help="Local or S3 path glob (e.g., *.pdf or s3://bucket/pdfs/*.pdf).") + parser.add_argument("--path_list", type=str, help="Path to a file containing paths to PDFs, one per line.") + parser.add_argument("--no_filter", action="store_true", help="Disables filtering so that ALL PDFs are processed.") + parser.add_argument("--num_sample_docs", type=int, default=2000, help="Number of PDF documents to sample.") + parser.add_argument("--first_n_pages", type=int, default=0, help="Always sample the first N pages of each PDF.") + parser.add_argument("--max_sample_pages", type=int, default=1, help="Max number of pages to sample per PDF.") + parser.add_argument("--output_dir", type=str, default="sampled_pages_output", help="Output directory for the extracted PDFs and PNGs.") + parser.add_argument("--reservoir_size", type=int, default=None, + help="Size of the reservoir for sampling paths. 
+    pdf_paths = []
+    n = 0  # total number of items seen
+
+    # Either load from glob or from path_list
+    if args.glob_path:
+        if args.glob_path.startswith("s3://"):
+            # Handle S3 globbing
+            parsed = urlparse(args.glob_path)
+            s3 = boto3.client('s3')
+            bucket_name = parsed.netloc
+            prefix = os.path.dirname(parsed.path.lstrip('/')) + "/"
+            paginator = s3.get_paginator('list_objects_v2')
+            page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
+
+            for page in page_iterator:
+                for obj in page.get('Contents', []):
+                    if obj['Key'].endswith('.pdf'):
+                        n += 1
+                        path = f"s3://{bucket_name}/{obj['Key']}"
+                        if len(pdf_paths) < args.reservoir_size:
+                            pdf_paths.append(path)
+                        else:
+                            s = random.randint(1, n)
+                            if s <= args.reservoir_size:
+                                pdf_paths[s - 1] = path
+        else:
+            # Handle local globbing
+            for path in glob.iglob(args.glob_path, recursive=True):
+                n += 1
+                if len(pdf_paths) < args.reservoir_size:
+                    pdf_paths.append(path)
+                else:
+                    s = random.randint(1, n)
+                    if s <= args.reservoir_size:
+                        pdf_paths[s - 1] = path
+    elif args.path_list:
+        with open(args.path_list, 'r') as f:
+            for line in f:
+                path = line.strip()
+                if not path:
+                    continue
+                n += 1
+                if len(pdf_paths) < args.reservoir_size:
+                    pdf_paths.append(path)
+                else:
+                    s = random.randint(1, n)
+                    if s <= args.reservoir_size:
+                        pdf_paths[s - 1] = path
+
+    # Shuffle the reservoir so we don't always pick from the front
+    random.shuffle(pdf_paths)
+    print(f"Loaded and shuffled {len(pdf_paths)} PDF paths. Will process up to {args.num_sample_docs} of them.")
+
+    pdfs_with_output = 0
+
+    # Use a ProcessPoolExecutor to parallelize PDF processing
+    # You may reduce max_workers if you have memory/CPU constraints
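+    # All reservoir paths are submitted up front; once num_sample_docs PDFs have produced
+    # output, shutdown(cancel_futures=True) drops the futures that have not started yet
+    # (tasks already running are left to finish).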
+    with ProcessPoolExecutor() as executor:
+        futures = {}
+        # Submit tasks
+        for pdf_path in pdf_paths:
+            future = executor.submit(
+                process_pdf,
+                pdf_path,
+                args.first_n_pages,
+                args.max_sample_pages,
+                args.no_filter,
+                args.output_dir
+            )
+            futures[future] = pdf_path
+
+        # Track completion
+        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing PDFs"):
+            if future.result():
+                pdfs_with_output += 1
+            if pdfs_with_output >= args.num_sample_docs:
+                # Cancel remaining tasks
+                executor.shutdown(cancel_futures=True)
+                break
+
+    print(f"Done. Processed or attempted to process {pdfs_with_output} PDFs. Output is in: {args.output_dir}")
+
+if __name__ == "__main__":
+    main()
diff --git a/pdelfin/eval/buildelo.py b/pdelfin/eval/buildelo.py
index 151c4c1..f49c8f0 100644
--- a/pdelfin/eval/buildelo.py
+++ b/pdelfin/eval/buildelo.py
@@ -164,6 +164,9 @@ if __name__ == "__main__":
         except Exception as e:
             print(f"Error processing {pdf_path}: {str(e)}")
 
+    # Remove all results where the alignment is > 0.96 as these are just too similar to be useful
+    all_comps = [c for c in all_comps if c.alignment < 0.96]
+
     # Shuffle the results
     random.shuffle(all_comps)
 
diff --git a/pdelfin/eval/evalhtml_template.html b/pdelfin/eval/evalhtml_template.html
index 9e6fc4d..0bc8730 100644
--- a/pdelfin/eval/evalhtml_template.html
+++ b/pdelfin/eval/evalhtml_template.html
@@ -18,7 +18,7 @@
         }
         .container {
             width: 100%;
-            max-width: 1200px;
+            max-width: 1600px;
             margin: 0 auto;
         }
         .entry {
diff --git a/pdelfin/eval/scoreelo.py b/pdelfin/eval/scoreelo.py
index 24d1f03..639c9f3 100644
--- a/pdelfin/eval/scoreelo.py
+++ b/pdelfin/eval/scoreelo.py
@@ -280,7 +280,25 @@ def make_report(urls):
 if __name__ == "__main__":
     # Example usage
     urls = [
-        "https://jakep-tinyhost.s3.amazonaws.com/review_page-681aae527593.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=BR1nqCUKQLBlh3HIsHjeyRVQumI%3D&Expires=1737500018",
-        # Add more URLs here...
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_0-e09ebadf34a7.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=JEQpJxSaMIHuc9DFHyfHuxx0dEU%3D&Expires=1737654586",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_1-c2d267f97a73.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=KMiOTQiFEvgxU94ZrlJRFAgSQZA%3D&Expires=1737654587",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_10-b806c811fb67.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=NaoHNU2ZmEGrgMsxg2JHK%2Fv5zd0%3D&Expires=1737654587",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_11-19c1936b4372.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=BjkVydyKjzzH3uZiZ1GkWAk6cbk%3D&Expires=1737654588",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_12-cd41808a7974.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jsk8TzJTKJwHi1Ru4%2Bw%2BiHZG638%3D&Expires=1737654589",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_13-8b055079b5eb.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=SE7kkobEBip44O8JY5axoMTV2Bs%3D&Expires=1737654590",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_14-1126e0da563c.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=jLSEWpDUzpmS8P9mNXbBoDYDOwU%3D&Expires=1737654590",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_15-05704e3d000d.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=TaCbyv2%2FDGCnCOgTzUvfEXdO%2Fmo%3D&Expires=1737654591",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_16-e57f795a89da.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=stqm1etAfDIpAQGNvZwe9c%2BYUbA%3D&Expires=1737654592",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_17-041a6d042764.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=rOTroBcSqCh3oM65bOJHEfaeal8%3D&Expires=1737654592",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_18-7a29697cee63.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=abmYM9KtzjicmdacRykPWXCdQr0%3D&Expires=1737654593",
+        "https://jakep-tinyhost.s3.amazonaws.com/review_page_19-d32f14c067f8.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=iVg3nxrZXVpYybkLJIgOEJ3v37E%3D&Expires=1737654594",
"https://jakep-tinyhost.s3.amazonaws.com/review_page_2-43c553548e69.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=IK27gl7b6NY05YNnnsimMVJc99I%3D&Expires=1737654595", + "https://jakep-tinyhost.s3.amazonaws.com/review_page_3-fb42a458ecd5.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=d1qevJe8ZQONnu7zezYSJe3cbBw%3D&Expires=1737654595", + "https://jakep-tinyhost.s3.amazonaws.com/review_page_4-76a50eed331a.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=qwZu2q1H4Y%2Bf3Kw7DNSYcTxwI7A%3D&Expires=1737654596", + "https://jakep-tinyhost.s3.amazonaws.com/review_page_5-150b4d3583de.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=c%2FeqjnDSIRirgQviFWRLWVowKmA%3D&Expires=1737654597", + "https://jakep-tinyhost.s3.amazonaws.com/review_page_6-6ca285526fd3.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=tkWDDuRinY77BLQCqumtlMiFJU8%3D&Expires=1737654598", + "https://jakep-tinyhost.s3.amazonaws.com/review_page_7-01d711ee8bf7.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=eQtFo6CHJYHGu85wK0YG5khlE5U%3D&Expires=1737654598", + "https://jakep-tinyhost.s3.amazonaws.com/review_page_8-0f36b852f274.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=weI3WB8vhjBYjk6t85DmyLdP97k%3D&Expires=1737654599", + "https://jakep-tinyhost.s3.amazonaws.com/review_page_9-115e33463fd2.html?AWSAccessKeyId=AKIASHLPW4FEVZOPGK46&Signature=b4CpkHprCUtZoL0u%2FFYzsu%2BB1yU%3D&Expires=1737654600", ] make_report(urls)