From 492a3f6bef939dd425135490c7d9df3f3e8ad851 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Fri, 18 Oct 2024 21:47:30 +0000
Subject: [PATCH] Adding parameters for target image and anchor text sizes

---
 pdelfin/birrpipeline.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/pdelfin/birrpipeline.py b/pdelfin/birrpipeline.py
index 96e448a..cc34ceb 100644
--- a/pdelfin/birrpipeline.py
+++ b/pdelfin/birrpipeline.py
@@ -321,9 +321,9 @@ class BatchWriter:
         thread.join()
 
 
-def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
-    image_base64 = render_pdf_to_base64png(local_pdf_path, page, 1024)
-    anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
+def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int, target_longest_image_dim: int, target_anchor_text_len: int) -> dict:
+    image_base64 = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
+    anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)
 
     return {
         "custom_id": f"{pretty_pdf_path}-{page}",
@@ -423,7 +423,7 @@ def get_pdf_num_pages(s3_path: str) -> Optional[int]:
     return None
 
 
-def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord, cur_round: int) -> list[dict]:
+def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord, cur_round: int, target_longest_image_dim: int, target_anchor_text_len: int) -> list[dict]:
     db = DatabaseManager(s3_workspace)
     existing_pages = db.get_index_entries(pdf.s3_path)
 
@@ -447,13 +447,13 @@ def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord, cur_rou
 
                 if has_errored_previously:
                     # Retry the page at least one more time regularly
-                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num, target_longest_image_dim, target_anchor_text_len), "round": cur_round})
 
                     # TODO: If the rotation was previously invalid, then apply a rotation
                     # TODO: Try to provide a smaller prompt hint
                 else:
-                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+                    new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num, target_longest_image_dim, target_anchor_text_len), "round": cur_round})
 
     except Exception as ex:
         print(f"Warning, could not get batch inferences lines for {pdf.s3_path} due to {ex}")
 
@@ -550,9 +550,8 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Manager for running millions of PDFs through a batch inference pipeline')
     parser.add_argument('workspace', help='The S3 path where work will be done e.g., s3://bucket/prefix/)')
     parser.add_argument('--add_pdfs', help='Path to add pdfs stored in s3 to the workspace, can be a glob path s3://bucket/prefix/*.pdf or path to file containing list of pdf paths', default=None)
-    parser.add_argument('--prefilter_lang', help='If set, tries to detect the language of the pdf and only accepts it if it matches (ex. ENGLISH)')
-    parser.add_argument('--prefilter_spam', help='If set, tries to detect spammy pdfs and not include them')
-
+    parser.add_argument('--target_longest_image_dim', type=int, help='Dimension to use for rendering image', default=1024)
+    parser.add_argument('--target_anchor_text_len', type=int, help='Maximum amount of anchor text to use', default=6000)
     parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None)
     parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
     parser.add_argument('--max_size_mb', type=int, default=250, help='Max file size in MB')
@@ -631,7 +630,7 @@ if __name__ == '__main__':
             potentially_done_pdfs = db.get_pdfs_by_status("pending")
         else:
             print(f"\nCreating batch inference files for new PDFs")
-            future_to_path = {executor.submit(build_pdf_queries, args.workspace, pdf, current_round): pdf for pdf in db.get_pdfs_by_status("pending")}
+            future_to_path = {executor.submit(build_pdf_queries, args.workspace, pdf, current_round, args.target_longest_image_dim, args.target_anchor_text_len): pdf for pdf in db.get_pdfs_by_status("pending")}
             potentially_done_pdfs = []
             lines_written = 0
             new_inference_writer = BatchWriter(f"{args.workspace}/inference_inputs/round_{current_round}", args.max_size_mb)
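
A minimal usage sketch (not part of the patch) of how the reworked build_page_query
could be smoke-tested with the new parameters. The local path, S3 path, and page
number are hypothetical; the two size values mirror the new argparse defaults
(--target_longest_image_dim=1024, --target_anchor_text_len=6000).

    # Assumes pdelfin is importable and sample.pdf exists locally (hypothetical file).
    from pdelfin.birrpipeline import build_page_query

    query = build_page_query(
        local_pdf_path="sample.pdf",                      # hypothetical local file
        pretty_pdf_path="s3://bucket/prefix/sample.pdf",  # hypothetical S3 path
        page=1,
        target_longest_image_dim=1024,  # matches the --target_longest_image_dim default
        target_anchor_text_len=6000,    # matches the --target_anchor_text_len default
    )
    # custom_id is f"{pretty_pdf_path}-{page}" per the function body above:
    print(query["custom_id"])  # "s3://bucket/prefix/sample.pdf-1"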