fix(ingest): detect malformed Glue S3 script paths (#3037)

This commit is contained in:
Kevin Hu 2021-08-05 23:06:32 -04:00 committed by GitHub
parent f1bea875b2
commit 3d0534be4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -111,6 +111,20 @@ class GlueSource(Source):
S3 path to the job's Python script.
"""
# handle a bug in AWS where script path has duplicate prefixes
if script_path.lower().startswith("s3://s3://"):
script_path = script_path[5:]
# catch any other cases where the script path is invalid
if not script_path.startswith("s3://"):
self.report.report_warning(
script_path,
f"Error parsing DAG for Glue job. The script {script_path} is not a valid S3 path.",
)
return None
# extract the script's bucket and key
url = urlparse(script_path, allow_fragments=False)
bucket = url.netloc