mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-28 00:24:16 +00:00
Don't retry accessdenied errors
This commit is contained in:
parent
2c52664301
commit
65763de178
@ -4,6 +4,7 @@ import posixpath
|
|||||||
import logging
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
import boto3
|
import boto3
|
||||||
|
import time
|
||||||
import requests
|
import requests
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import hashlib # Added for MD5 hash computation
|
import hashlib # Added for MD5 hash computation
|
||||||
@ -13,7 +14,7 @@ from pathlib import Path
|
|||||||
from google.auth import compute_engine
|
from google.auth import compute_engine
|
||||||
from google.cloud import storage
|
from google.cloud import storage
|
||||||
from botocore.config import Config
|
from botocore.config import Config
|
||||||
from botocore.exceptions import NoCredentialsError
|
from botocore.exceptions import NoCredentialsError, ClientError
|
||||||
from boto3.s3.transfer import TransferConfig
|
from boto3.s3.transfer import TransferConfig
|
||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@ -79,16 +80,28 @@ def get_s3_bytes(s3_client, s3_path: str, start_index: Optional[int] = None, end
|
|||||||
|
|
||||||
return obj['Body'].read()
|
return obj['Body'].read()
|
||||||
|
|
||||||
def get_s3_bytes_with_backoff(s3_client, pdf_s3_path, max_retries: int=8, backoff_factor: int=2):
|
def get_s3_bytes_with_backoff(s3_client, pdf_s3_path, max_retries: int = 8, backoff_factor: int = 2):
|
||||||
attempt = 0
|
attempt = 0
|
||||||
|
|
||||||
while attempt < max_retries:
|
while attempt < max_retries:
|
||||||
try:
|
try:
|
||||||
return get_s3_bytes(s3_client, pdf_s3_path)
|
return get_s3_bytes(s3_client, pdf_s3_path)
|
||||||
|
except ClientError as e:
|
||||||
|
# Check for AccessDenied error and raise immediately
|
||||||
|
if e.response['Error']['Code'] == 'AccessDenied':
|
||||||
|
logger.error(f"AccessDenied error when trying to access {pdf_s3_path}: {e}")
|
||||||
|
raise
|
||||||
|
else:
|
||||||
|
wait_time = backoff_factor ** attempt
|
||||||
|
logger.warning(f"Attempt {attempt+1} failed to get_s3_bytes for {pdf_s3_path}: {e}. Retrying in {wait_time} seconds...")
|
||||||
|
time.sleep(wait_time)
|
||||||
|
attempt += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
wait_time = backoff_factor ** attempt
|
wait_time = backoff_factor ** attempt
|
||||||
logger.warning(f"Attempt {attempt+1} failed to get_s3_bytes for {pdf_s3_path}: {e}. Retrying in {wait_time} seconds...")
|
logger.warning(f"Attempt {attempt+1} failed to get_s3_bytes for {pdf_s3_path}: {e}. Retrying in {wait_time} seconds...")
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
attempt += 1
|
attempt += 1
|
||||||
|
|
||||||
logger.error(f"Failed to get_s3_bytes for {pdf_s3_path} after {max_retries} retries.")
|
logger.error(f"Failed to get_s3_bytes for {pdf_s3_path} after {max_retries} retries.")
|
||||||
raise Exception("Failed to get_s3_bytes after retries")
|
raise Exception("Failed to get_s3_bytes after retries")
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user