Robustness

This commit is contained in:
Jake Poznanski 2024-10-14 20:31:37 +00:00
parent 1ed9e4c947
commit 194af5ff52

View File

@ -426,6 +426,7 @@ def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchI
assert "outputs" in data and len(data["outputs"]) > 0, "No outputs from model detected" assert "outputs" in data and len(data["outputs"]) > 0, "No outputs from model detected"
# Try to parse the actual model response JSON # Try to parse the actual model response JSON
try:
model_response_json = json.loads(data["outputs"][0]["text"]) model_response_json = json.loads(data["outputs"][0]["text"])
index_entries.append(DatabaseManager.BatchInferenceRecord( index_entries.append(DatabaseManager.BatchInferenceRecord(
@ -438,8 +439,21 @@ def process_jsonl_content(inference_s3_path: str) -> List[DatabaseManager.BatchI
finish_reason=data["outputs"][0]["finish_reason"], finish_reason=data["outputs"][0]["finish_reason"],
error=data.get("completion_error", None) error=data.get("completion_error", None)
)) ))
except json.JSONDecodeError:
index_entries.append(DatabaseManager.BatchInferenceRecord(
inference_s3_path=inference_s3_path,
pdf_s3_path=pdf_s3_path,
page_num=page_num,
round=data["round"],
start_index=start_index, # Byte offset in the original file
length=line_length, # Length in bytes
finish_reason="error",
error="Could not parse model JSON output",
))
except json.JSONDecodeError: except json.JSONDecodeError:
print(f"Error with JSON Decoding of infrence in {inference_s3_path}") print(f"Error with JSON Decoding of infrence in {inference_s3_path}")
# TODO: Consider appending an error record to the index here too, so that a malformed JSON line is tracked rather than only printed
except Exception as e: except Exception as e:
print(f"Error processing line: {e}") print(f"Error processing line: {e}")
@ -645,6 +659,7 @@ if __name__ == '__main__':
print("\nFinal statistics:") print("\nFinal statistics:")
# Output the number of documents in each status "pending" and "completed" # Output the number of documents in each status "pending" and "completed"
# For each round, output a report of how many pages were processed and how many had errors
print("\nWork finished, waiting for all workers to finish cleaning up") print("\nWork finished, waiting for all workers to finish cleaning up")