Better tracking of completion_errors

2025-12-09 05:52:16 +00:00 · 2024-10-15 22:43:31 +00:00 · 2024-10-15 22:43:31 +00:00 · 9eb252f8f6
commit 9eb252f8f6
parent 4ef14ec813
1 changed file with 30 additions and 18 deletions
--- a/pdelfin/birrpipeline.py
+++ b/pdelfin/birrpipeline.py
@@ -422,12 +422,7 @@ def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchI
            data = json.loads(line_str)
            pdf_s3_path, page_num = parse_custom_id(data["custom_id"])
-            assert "outputs" in data and len(data["outputs"]) > 0, "No outputs from model detected"
+            if data.get("completion_error", None) is not None:
            # Try to parse the actual model response JSON
            try:
                model_response_json = json.loads(data["outputs"][0]["text"])
                index_entries.append(DatabaseManager.BatchInferenceRecord(
                    inference_s3_path=inference_s3_path,
                    pdf_s3_path=pdf_s3_path,
@@ -435,20 +430,37 @@ def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchI
                    round=data["round"],
                    start_index=start_index,  # Byte offset in the original file
                    length=line_length,       # Length in bytes
-                    finish_reason=data["outputs"][0]["finish_reason"],
+                    finish_reason="completion_error",
                    error=data.get("completion_error", None)
                ))
-            except json.JSONDecodeError:
+            else:
-                index_entries.append(DatabaseManager.BatchInferenceRecord(
+                # Try to parse the actual model response JSON
-                    inference_s3_path=inference_s3_path,
+                assert "outputs" in data and len(data["outputs"]) > 0, "No outputs from model detected"
-                    pdf_s3_path=pdf_s3_path,
+
-                    page_num=page_num,
+                try:
-                    round=data["round"],
+                    model_response_json = json.loads(data["outputs"][0]["text"])
-                    start_index=start_index,  # Byte offset in the original file
+
-                    length=line_length,       # Length in bytes
+                    index_entries.append(DatabaseManager.BatchInferenceRecord(
-                    finish_reason=data["outputs"][0]["finish_reason"],
+                        inference_s3_path=inference_s3_path,
-                    error="Could not parse model JSON output",
+                        pdf_s3_path=pdf_s3_path,
-                ))
+                        page_num=page_num,
                        round=data["round"],
                        start_index=start_index,  # Byte offset in the original file
                        length=line_length,       # Length in bytes
                        finish_reason=data["outputs"][0]["finish_reason"],
                        error=data.get("completion_error", None)
                    ))
                except json.JSONDecodeError:
                    index_entries.append(DatabaseManager.BatchInferenceRecord(
                        inference_s3_path=inference_s3_path,
                        pdf_s3_path=pdf_s3_path,
                        page_num=page_num,
                        round=data["round"],
                        start_index=start_index,  # Byte offset in the original file
                        length=line_length,       # Length in bytes
                        finish_reason=data["outputs"][0]["finish_reason"],
                        error="Could not parse model JSON output",
                    ))
        except json.JSONDecodeError:
            print(f"Error with JSON Decoding of inference in {inference_s3_path}")