Fixes #14215: Add missing decode stage to gz/zip files in json ingestion reader. (#14375)

* add decoding stage to gz/zip files.

Files that were zip/gz were not being decoded.
This was leading to an error when we wanted them to be.

* remove unnecessary comment

---------

Co-authored-by: Carl Kristensen <carl.johan.coelho.kristensen@schibsted.com>
This commit is contained in:
C. Kris 2023-12-14 12:47:58 +01:00 committed by GitHub
parent 512625c809
commit 74df616679
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -28,14 +28,15 @@ logger = ingestion_logger()
def _get_json_text(key: str, text: bytes, decode: bool) -> Union[str, bytes]:
    """Return the JSON payload held in ``text``, unpacking it when compressed.

    The compression format is chosen from the suffix of ``key``:

    * ``.gz``  -- ``text`` is gunzipped; the result is bytes.
    * ``.zip`` -- the first archive member is read and decoded with ``UTF_8``,
      so the result is already a ``str`` regardless of ``decode``.
    * anything else -- ``text`` is used unchanged.

    Args:
        key: Object key / file name; only its suffix is inspected.
        text: Raw bytes of the file.
        decode: When true, a bytes payload is decoded with ``UTF_8`` before
            being returned.

    Returns:
        The unpacked payload: ``str`` when it was decoded, ``bytes`` otherwise.

    Raises:
        IndexError: If a ``.zip`` archive contains no members.
    """
    processed_text: Union[str, bytes] = text
    if key.endswith(".gz"):
        processed_text = gzip.decompress(text)
    if key.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(text)) as zip_file:
            processed_text = zip_file.read(zip_file.infolist()[0]).decode(UTF_8)
    # Check the *processed* payload, not the raw input: the zip branch has
    # already produced a str, and calling .decode() on it would raise.
    if decode and isinstance(processed_text, bytes):
        return processed_text.decode(UTF_8)
    return processed_text
class JSONDataFrameReader(DataFrameReader):