OpenMetadata/ingestion/src/metadata/utils/s3_utils.py

#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
import json
from io import BytesIO, StringIO
from typing import Any

import pandas as pd
from pandas import DataFrame


def read_csv_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:
    """Download a CSV object from S3 and parse it into a DataFrame.

    The whole object body is read into memory, decoded as UTF-8 and handed
    to ``pandas.read_csv`` with its default options.

    Args:
        client: Object exposing ``get_object(Bucket=..., Key=...)`` that
            returns a mapping whose ``"Body"`` entry has a ``read()`` method.
        key: Key of the object inside the bucket.
        bucket_name: Name of the bucket holding the object.

    Returns:
        The parsed CSV content.

    Raises:
        UnicodeDecodeError: If the object body is not valid UTF-8.
    """
    response = client.get_object(Bucket=bucket_name, Key=key)
    text = response["Body"].read().decode("utf-8")
    return pd.read_csv(StringIO(text))


def read_tsv_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:
    """Download a tab-separated object from S3 and parse it into a DataFrame.

    The whole object body is read into memory, decoded as UTF-8 and handed
    to ``pandas.read_csv`` with a tab as the field separator.

    Args:
        client: Object exposing ``get_object(Bucket=..., Key=...)`` that
            returns a mapping whose ``"Body"`` entry has a ``read()`` method.
        key: Key of the object inside the bucket.
        bucket_name: Name of the bucket holding the object.

    Returns:
        The parsed TSV content.

    Raises:
        UnicodeDecodeError: If the object body is not valid UTF-8.
    """
    response = client.get_object(Bucket=bucket_name, Key=key)
    raw_bytes = response["Body"].read()
    text_buffer = StringIO(raw_bytes.decode("utf-8"))
    return pd.read_csv(text_buffer, sep="\t")


def read_json_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:
    """Download a JSON document from S3 and turn it into a DataFrame.

    The whole object body is read into memory, decoded as UTF-8 and parsed
    with ``json.loads``. Two top-level shapes are handled:

    * a JSON array is passed to the DataFrame constructor as-is;
    * a JSON object becomes one column per key.

    Args:
        client: Object exposing ``get_object(Bucket=..., Key=...)`` that
            returns a mapping whose ``"Body"`` entry has a ``read()`` method.
        key: Key of the object inside the bucket.
        bucket_name: Name of the bucket holding the object.

    Returns:
        The DataFrame built from the parsed document.

    Raises:
        UnicodeDecodeError: If the object body is not valid UTF-8.
        json.JSONDecodeError: If the decoded text is not a single valid
            JSON document.
        AttributeError: If the top-level JSON value is neither an array
            nor an object (it has no ``items()``).
    """
    response = client.get_object(Bucket=bucket_name, Key=key)
    data = json.loads(response["Body"].read().decode("utf-8"))
    if isinstance(data, list):
        return pd.DataFrame(data)
    # Wrapping every value in a Series lets pandas align columns of different
    # lengths (shorter ones are padded with NaN) instead of raising
    # "arrays must all be same length".
    return pd.DataFrame({name: pd.Series(values) for name, values in data.items()})


def read_parquet_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:
    """Download a Parquet object from S3 and load it into a DataFrame.

    The whole object body is read into memory and passed to
    ``pandas.read_parquet`` through an in-memory binary buffer.

    Args:
        client: Object exposing ``get_object(Bucket=..., Key=...)`` that
            returns a mapping whose ``"Body"`` entry has a ``read()`` method.
        key: Key of the object inside the bucket.
        bucket_name: Name of the bucket holding the object.

    Returns:
        The DataFrame read from the Parquet payload.
    """
    response = client.get_object(Bucket=bucket_name, Key=key)
    payload = response["Body"].read()
    return pd.read_parquet(BytesIO(payload))
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30			`# Copyright 2021 Collate`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
Fix #6091: Fix Datalake arrays must be of the same length (#6092) 2022-07-17 21:56:54 +05:30			`import json`
			`from io import BytesIO, StringIO`
			`from typing import Any`
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30
Fix #6091: Fix Datalake arrays must be of the same length (#6092) 2022-07-17 21:56:54 +05:30			`import pandas as pd`
			`from pandas import DataFrame`
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30

Refactored Datalake and Deltalake for Topology (#6034) * rebasing with main * refactored deltalake for topology * using requests instead of urllib * formatting fixes Co-authored-by: Onkar Ravgan <onkarravgan@Onkars-MacBook-Pro.local> 2022-07-19 10:07:27 +05:30			`def read_csv_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:`
			`csv_obj = client.get_object(Bucket=bucket_name, Key=key)`
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30			`body = csv_obj["Body"]`
			`csv_string = body.read().decode("utf-8")`
			`df = pd.read_csv(StringIO(csv_string))`
			`return df`


Refactored Datalake and Deltalake for Topology (#6034) * rebasing with main * refactored deltalake for topology * using requests instead of urllib * formatting fixes Co-authored-by: Onkar Ravgan <onkarravgan@Onkars-MacBook-Pro.local> 2022-07-19 10:07:27 +05:30			`def read_tsv_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:`
			`tsv_obj = client.get_object(Bucket=bucket_name, Key=key)`
Fix #6091: Fix Datalake arrays must be of the same length (#6092) 2022-07-17 21:56:54 +05:30			`body = tsv_obj["Body"]`
			`tsv_string = body.read().decode("utf-8")`
			`df = pd.read_csv(StringIO(tsv_string), sep="\t")`
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30			`return df`


Refactored Datalake and Deltalake for Topology (#6034) * rebasing with main * refactored deltalake for topology * using requests instead of urllib * formatting fixes Co-authored-by: Onkar Ravgan <onkarravgan@Onkars-MacBook-Pro.local> 2022-07-19 10:07:27 +05:30			`def read_json_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:`
			`obj = client.get_object(Bucket=bucket_name, Key=key)`
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30			`json_text = obj["Body"].read().decode("utf-8")`
			`data = json.loads(json_text)`
Fix #6091: Fix Datalake arrays must be of the same length (#6092) 2022-07-17 21:56:54 +05:30			`if isinstance(data, list):`
			`df = pd.DataFrame.from_dict(data)`
			`else:`
			`df = pd.DataFrame.from_dict(dict([(k, pd.Series(v)) for k, v in data.items()]))`
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30			`return df`


Refactored Datalake and Deltalake for Topology (#6034) * rebasing with main * refactored deltalake for topology * using requests instead of urllib * formatting fixes Co-authored-by: Onkar Ravgan <onkarravgan@Onkars-MacBook-Pro.local> 2022-07-19 10:07:27 +05:30			`def read_parquet_from_s3(client: Any, key: str, bucket_name: str) -> DataFrame:`
			`obj = client.get_object(Bucket=bucket_name, Key=key)`
Fix #6091: Fix Datalake arrays must be of the same length (#6092) 2022-07-17 21:56:54 +05:30			`df = pd.read_parquet(BytesIO(obj["Body"].read()))`
datalake-csv-files-ingestion-added (#5343) datalake-csv-files-ingestion-added (#5343) 2022-06-15 12:27:21 +05:30			`return df`