From 1c8602c0ff275d8a0a951c766f30bc4da1065b98 Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Thu, 17 Oct 2024 22:41:44 +0000 Subject: [PATCH] Removing rotation invalid ones to see what happens --- pdelfin/birrpipeline.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pdelfin/birrpipeline.py b/pdelfin/birrpipeline.py index 4d84220..96e448a 100644 --- a/pdelfin/birrpipeline.py +++ b/pdelfin/birrpipeline.py @@ -374,6 +374,9 @@ def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchI last_error = data.get("completion_error", None) + if not model_response_json["is_rotation_valid"]: + last_error = "rotation_invalid" + index_entries.append(DatabaseManager.BatchInferenceRecord( inference_s3_path=inference_s3_path, pdf_s3_path=pdf_s3_path, @@ -547,6 +550,9 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description='Manager for running millions of PDFs through a batch inference pipeline') parser.add_argument('workspace', help='The S3 path where work will be done e.g., s3://bucket/prefix/)') parser.add_argument('--add_pdfs', help='Path to add pdfs stored in s3 to the workspace, can be a glob path s3://bucket/prefix/*.pdf or path to file containing list of pdf paths', default=None) + parser.add_argument('--prefilter_lang', help='If set, tries to detect the language of the pdf and only accepts it if it matches (ex. ENGLISH)') + parser.add_argument('--prefilter_spam', help='If set, tries to detect spammy pdfs and not include them') + parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None) parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None) parser.add_argument('--max_size_mb', type=int, default=250, help='Max file size in MB')