Don't retry AccessDenied errors

This commit is contained in:
Jake Poznanski 2024-11-15 13:02:38 -08:00
parent 2c52664301
commit 65763de178

View File

@ -4,6 +4,7 @@ import posixpath
import logging import logging
import tempfile import tempfile
import boto3 import boto3
import time
import requests import requests
import concurrent.futures import concurrent.futures
import hashlib # Added for MD5 hash computation import hashlib # Added for MD5 hash computation
@ -13,7 +14,7 @@ from pathlib import Path
from google.auth import compute_engine from google.auth import compute_engine
from google.cloud import storage from google.cloud import storage
from botocore.config import Config from botocore.config import Config
from botocore.exceptions import NoCredentialsError from botocore.exceptions import NoCredentialsError, ClientError
from boto3.s3.transfer import TransferConfig from boto3.s3.transfer import TransferConfig
from typing import Optional, List from typing import Optional, List
from urllib.parse import urlparse from urllib.parse import urlparse
@ -79,16 +80,28 @@ def get_s3_bytes(s3_client, s3_path: str, start_index: Optional[int] = None, end
return obj['Body'].read() return obj['Body'].read()
def get_s3_bytes_with_backoff(s3_client, pdf_s3_path, max_retries: int = 8, backoff_factor: int = 2):
    """Fetch an S3 object via ``get_s3_bytes``, retrying with exponential backoff.

    Args:
        s3_client: Client object passed straight through to ``get_s3_bytes``.
        pdf_s3_path: S3 path of the object to fetch.
        max_retries: Maximum number of attempts before giving up.
        backoff_factor: Base of the exponential wait; the sleep after the
            attempt with zero-based index ``i`` is ``backoff_factor ** i`` seconds.

    Returns:
        Whatever ``get_s3_bytes`` returns for ``pdf_s3_path``.

    Raises:
        ClientError: Re-raised immediately, without retrying, when the error
            code is ``AccessDenied``.
        Exception: When every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    last_error = None

    for attempt in range(max_retries):
        try:
            return get_s3_bytes(s3_client, pdf_s3_path)
        except ClientError as e:
            # Use .get() so a malformed error response cannot raise KeyError
            # from inside this handler and mask the real failure.
            if e.response.get('Error', {}).get('Code') == 'AccessDenied':
                # Retrying cannot fix a permissions problem, so fail fast.
                logger.error(f"AccessDenied error when trying to access {pdf_s3_path}: {e}")
                raise
            last_error = e
        except Exception as e:
            # `e` is unbound once the except block exits, so keep a reference.
            last_error = e

        if attempt + 1 >= max_retries:
            # Final attempt: nothing left to retry, so do not sleep.
            logger.warning(f"Attempt {attempt+1} failed to get_s3_bytes for {pdf_s3_path}: {last_error}.")
            break

        wait_time = backoff_factor ** attempt
        logger.warning(f"Attempt {attempt+1} failed to get_s3_bytes for {pdf_s3_path}: {last_error}. Retrying in {wait_time} seconds...")
        time.sleep(wait_time)

    logger.error(f"Failed to get_s3_bytes for {pdf_s3_path} after {max_retries} retries.")
    # Chain the last failure so callers can see why the retries were exhausted.
    raise Exception("Failed to get_s3_bytes after retries") from last_error