feat(ingest): bigquery sample data (#2178)

2025-11-03 12:16:10 +00:00 · 2021-03-05 16:39:34 -08:00 · 2021-03-05 16:39:34 -08:00 · 1f082b114b
commit 1f082b114b
parent d47cc31157
10 changed files with 63945 additions and 1 deletions
--- a/metadata-ingestion/.gitignore
+++ b/metadata-ingestion/.gitignore
@ -3,6 +3,7 @@
 output
 src/datahub/metadata/
 pvenv36/
+bq_credentials.json

 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/metadata-ingestion/examples/demo_data/.gitignore
+++ b/metadata-ingestion/examples/demo_data/.gitignore
@ -0,0 +1 @@
+/all_covid19_datasets.json
--- a/metadata-ingestion/examples/demo_data/bigquery_covid19_to_file.yml
+++ b/metadata-ingestion/examples/demo_data/bigquery_covid19_to_file.yml
@ -0,0 +1,19 @@
+---
+source:
+  type: bigquery
+  config:
+    project_id: "bigquery-public-data"
+    options:
+      credentials_path: "./bq_credentials.json"
+    table_pattern:
+      allow:
+      # Allow anything that starts with "covid19"
+      - "bigquery-public-data\\.covid19.*"
+      deny:
+      # Except for tables that end with an underscore.
+      - ".*_$"
+
+sink:
+  type: "file"
+  config:
+    filename: "./examples/demo_data/all_covid19_datasets.json"
--- a/metadata-ingestion/examples/demo_data/demo_data.json
+++ b/metadata-ingestion/examples/demo_data/demo_data.json
--- a/metadata-ingestion/examples/demo_data/demo_data_to_datahub_rest.yml
+++ b/metadata-ingestion/examples/demo_data/demo_data_to_datahub_rest.yml
@ -0,0 +1,9 @@
+source:
+  type: "file"
+  config:
+    filename: "./examples/demo_data/demo_data.json"
+
+sink:
+  type: "datahub-rest"
+  config:
+    server: 'http://localhost:8080'
--- a/metadata-ingestion/examples/demo_data/directives.csv
+++ b/metadata-ingestion/examples/demo_data/directives.csv
@ -0,0 +1,95 @@
+table,drop,owners,depends_on
+bigquery-public-data.covid19_aha.hospital_beds,,American Heart Association,
+bigquery-public-data.covid19_aha.staffing,,American Heart Association,
+bigquery-public-data.covid19_ecdc.covid_19_geographic_distribution_worldwide,,European Centre for Disease Prevention and Control,
+bigquery-public-data.covid19_ecdc_eu.covid_19_geographic_distribution_worldwide,x,European Centre for Disease Prevention and Control,
+bigquery-public-data.covid19_geotab_mobility_impact.airport_traffic,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact.city_congestion,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic,,Geotab,bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic_by_industry
+bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic_by_industry,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_daily_fillups,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_weekly_fillups,,Geotab,bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_daily_fillups
+bigquery-public-data.covid19_geotab_mobility_impact.lookup_region,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact.port_traffic,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact.us_border_volumes,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact.us_border_wait_times,,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.airport_traffic,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.city_congestion,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.commercial_traffic,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.commercial_traffic_by_industry,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.fuel_station_daily_fillups,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.fuel_station_weekly_fillups,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.lookup_region,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.port_traffic,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.us_border_volumes,x,Geotab,
+bigquery-public-data.covid19_geotab_mobility_impact_eu.us_border_wait_times,x,Geotab,
+bigquery-public-data.covid19_google_mobility.mobility_report,,Google,
+bigquery-public-data.covid19_google_mobility_eu.mobility_report,x,Google,
+bigquery-public-data.covid19_govt_response.oxford_policy_tracker,,Oxford,
+bigquery-public-data.covid19_italy.data_by_province,,Italian National Institute of Health,
+bigquery-public-data.covid19_italy.data_by_region,,Italian National Institute of Health,bigquery-public-data.covid19_italy.data_by_province
+bigquery-public-data.covid19_italy.national_trends,,Italian National Institute of Health,bigquery-public-data.covid19_italy.data_by_region
+bigquery-public-data.covid19_italy_eu.data_by_province,x,,
+bigquery-public-data.covid19_italy_eu.data_by_region,x,,
+bigquery-public-data.covid19_italy_eu.national_trends,x,,
+bigquery-public-data.covid19_jhu_csse.confirmed_cases,x,,
+bigquery-public-data.covid19_jhu_csse.deaths,x,,
+bigquery-public-data.covid19_jhu_csse.recovered_cases,x,,
+bigquery-public-data.covid19_jhu_csse.summary,x,,
+bigquery-public-data.covid19_jhu_csse_eu.confirmed_cases,x,,
+bigquery-public-data.covid19_jhu_csse_eu.deaths,x,,
+bigquery-public-data.covid19_jhu_csse_eu.recovered_cases,x,,
+bigquery-public-data.covid19_jhu_csse_eu.summary,x,,
+bigquery-public-data.covid19_nyt.excess_deaths,,New York Times,
+bigquery-public-data.covid19_nyt.mask_use_by_county,,New York Times,
+bigquery-public-data.covid19_nyt.us_counties,,New York Times,
+bigquery-public-data.covid19_nyt.us_states,,New York Times,bigquery-public-data.covid19_nyt.us_counties
+bigquery-public-data.covid19_open_data.covid19_open_data,,Google,
+bigquery-public-data.covid19_open_data_eu.covid19_open_data,x,,
+bigquery-public-data.covid19_public_forecasts.county_14d,,"Harvard Global Health Institute, Google",
+bigquery-public-data.covid19_public_forecasts.county_14d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_14d
+bigquery-public-data.covid19_public_forecasts.county_14d_historical_,x,"Harvard Global Health Institute, Google",
+bigquery-public-data.covid19_public_forecasts.county_28d,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_14d
+bigquery-public-data.covid19_public_forecasts.county_28d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_28d
+bigquery-public-data.covid19_public_forecasts.county_28d_historical_,x,"Harvard Global Health Institute, Google",
+bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d,,"Harvard Global Health Institute, Google",
+bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d
+bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d_historical_,x,"Harvard Global Health Institute, Google",
+bigquery-public-data.covid19_public_forecasts.state_14d,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_14d
+bigquery-public-data.covid19_public_forecasts.state_14d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.state_14d
+bigquery-public-data.covid19_public_forecasts.state_14d_historical_,x,"Harvard Global Health Institute, Google",
+bigquery-public-data.covid19_public_forecasts.state_28d,,"Harvard Global Health Institute, Google","bigquery-public-data.covid19_public_forecasts.state_14d, bigquery-public-data.covid19_public_forecasts.county_28d"
+bigquery-public-data.covid19_public_forecasts.state_28d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.state_28d
+bigquery-public-data.covid19_public_forecasts.state_28d_historical_,x,"Harvard Global Health Institute, Google",
+bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d_historical,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d_historical_,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d_historical,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d_historical_,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d_historical,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d_historical_,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d_historical,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d_historical_,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d_historical,x,,
+bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d_historical_,x,,
+bigquery-public-data.covid19_rxrx19.rxrx19a_embeddings,x,,
+bigquery-public-data.covid19_rxrx19.rxrx19a_metadata,x,,
+bigquery-public-data.covid19_rxrx19.rxrx19b_embeddings,x,,
+bigquery-public-data.covid19_rxrx19.rxrx19b_metadata,x,,
+bigquery-public-data.covid19_symptom_search.symptom_search_country_daily,,Google,"bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily, bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily"
+bigquery-public-data.covid19_symptom_search.symptom_search_country_weekly,,Google,"bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_weekly, bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly"
+bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily,,Google,
+bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_weekly,,Google,bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily
+bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily,,Google,
+bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly,,Google,bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily
+bigquery-public-data.covid19_usafacts.confirmed_cases,,USAFacts,
+bigquery-public-data.covid19_usafacts.deaths,,USAFacts,
+bigquery-public-data.covid19_usafacts.summary,,USAFacts,"bigquery-public-data.covid19_usafacts.confirmed_cases, bigquery-public-data.covid19_usafacts.deaths"
+bigquery-public-data.covid19_weathersource_com.county_day_forecast,,Weather Source,bigquery-public-data.covid19_weathersource_com.county_day_history
+bigquery-public-data.covid19_weathersource_com.county_day_history,,Weather Source,bigquery-public-data.covid19_weathersource_com.postal_code_day_history
+bigquery-public-data.covid19_weathersource_com.postal_code_day_forecast,,Weather Source,bigquery-public-data.covid19_weathersource_com.postal_code_day_history
+bigquery-public-data.covid19_weathersource_com.postal_code_day_history,,Weather Source,
--- a/metadata-ingestion/examples/demo_data/enrich.py
+++ b/metadata-ingestion/examples/demo_data/enrich.py
@ -0,0 +1,187 @@
+"""
+After running the recipe in this directory, we get a large JSON file called all_covid19_datasets.json.
+This script reads that JSON file, adds to it using the directives pull from a Google sheet, and
+produces a new JSON file called demo_data.json.
+"""
+
+from typing import List
+
+import pathlib
+import json
+import csv
+import dataclasses
+import time
+from datahub.metadata.schema_classes import (
+    MetadataChangeEventClass,
+    DatasetSnapshotClass,
+    OwnershipClass,
+    OwnerClass,
+    OwnershipTypeClass,
+    CorpUserSnapshotClass,
+    CorpUserInfoClass,
+    AuditStampClass,
+    UpstreamLineageClass,
+    UpstreamClass,
+    DatasetLineageTypeClass,
+)
+
+DEMO_DATA_DIR = pathlib.Path("./examples/demo_data")
+INPUT_ALL_DATASETS = DEMO_DATA_DIR / "all_covid19_datasets.json"
+OUTPUT_ENRICHED = DEMO_DATA_DIR / "demo_data.json"
+DIRECTIVES_CSV = DEMO_DATA_DIR / "directives.csv"
+
+
+@dataclasses.dataclass
+class Directive:
+    table: str
+    drop: bool
+    owners: List[str]
+    depends_on: List[str]
+
+
+def read_mces(path) -> List[MetadataChangeEventClass]:
+    with open(path) as f:
+        objs = json.load(f)
+        mces = [MetadataChangeEventClass.from_obj(obj) for obj in objs]
+    return mces
+
+
+def write_mces(path, mces: List[MetadataChangeEventClass]) -> None:
+    objs = [mce.to_obj() for mce in mces]
+    with open(path, "w") as f:
+        json.dump(objs, f, indent=4)
+
+
+def parse_directive(row) -> Directive:
+    return Directive(
+        table=row["table"],
+        drop=bool(row["drop"]),
+        owners=[x.strip() for x in row["owners"].split(",") if x],
+        depends_on=[x.strip() for x in row["depends_on"].split(",") if x],
+    )
+
+
+def fetch_directives() -> List[Directive]:
+    with open(DIRECTIVES_CSV, "r") as f:
+        reader = csv.DictReader(f)
+        rows = [parse_directive(row) for row in reader]
+        return rows
+
+
+def dataset_name_to_urn(name: str) -> str:
+    return f"urn:li:dataset:(urn:li:dataPlatform:bigquery,{name},PROD)"
+
+
+def clean_owner_name(name: str) -> str:
+    clean = "".join(c for c in name if c.isalpha())
+    return clean
+
+
+def owner_name_to_urn(name: str) -> str:
+    return f"urn:li:corpuser:{name}"
+
+
+def create_owner_entity_mce(owner: str) -> MetadataChangeEventClass:
+    clean_name = clean_owner_name(owner)
+    return MetadataChangeEventClass(
+        proposedSnapshot=CorpUserSnapshotClass(
+            urn=owner_name_to_urn(clean_name),
+            aspects=[
+                CorpUserInfoClass(
+                    active=True,
+                    displayName=owner,
+                    fullName=owner,
+                    email=f"{clean_name}-demo@example.com",
+                )
+            ],
+        )
+    )
+
+
+def create_ownership_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
+    return MetadataChangeEventClass(
+        proposedSnapshot=DatasetSnapshotClass(
+            urn=dataset_name_to_urn(directive.table),
+            aspects=[
+                OwnershipClass(
+                    owners=[
+                        OwnerClass(
+                            owner=owner_name_to_urn(clean_owner_name(owner)),
+                            type=OwnershipTypeClass.DATAOWNER,  # type: ignore
+                        )
+                        for owner in directive.owners
+                    ],
+                    lastModified=AuditStampClass(
+                        time=int(time.time() * 1000),
+                        actor="urn:li:corpuser:datahub",
+                    ),
+                )
+            ],
+        )
+    )
+
+
+def create_lineage_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
+    return MetadataChangeEventClass(
+        proposedSnapshot=DatasetSnapshotClass(
+            urn=dataset_name_to_urn(directive.table),
+            aspects=[
+                UpstreamLineageClass(
+                    upstreams=[
+                        UpstreamClass(
+                            dataset=dataset_name_to_urn(upstream),
+                            type=DatasetLineageTypeClass.TRANSFORMED,  # type: ignore
+                            auditStamp=AuditStampClass(
+                                time=int(time.time() * 1000),
+                                actor="urn:li:corpuser:datahub",
+                            ),
+                        )
+                        for upstream in directive.depends_on
+                    ]
+                )
+            ],
+        )
+    )
+
+
+if __name__ == "__main__":
+    datasets = read_mces(INPUT_ALL_DATASETS)
+    all_directives = fetch_directives()
+    directives = [directive for directive in all_directives if not directive.drop]
+
+    all_dataset_urns = {
+        dataset_name_to_urn(directive.table) for directive in all_directives
+    }
+    allowed_urns = {
+        dataset_name_to_urn(directive.table)
+        for directive in all_directives
+        if not directive.drop
+    }
+
+    assert all(dataset.proposedSnapshot.urn in all_dataset_urns for dataset in datasets)
+    filtered_dataset_mces = [
+        dataset for dataset in datasets if dataset.proposedSnapshot.urn in allowed_urns
+    ]
+
+    owner_names = {owner for directive in directives for owner in directive.owners}
+    owner_entity_mces = [create_owner_entity_mce(owner) for owner in owner_names]
+
+    ownership_aspect_mces = [
+        create_ownership_aspect_mce(directive)
+        for directive in directives
+        if directive.owners
+    ]
+
+    lineage_aspect_mces = [
+        create_lineage_aspect_mce(directive)
+        for directive in directives
+        if directive.depends_on
+    ]
+
+    enriched_mces = (
+        filtered_dataset_mces
+        + owner_entity_mces
+        + ownership_aspect_mces
+        + lineage_aspect_mces
+    )
+    write_mces(OUTPUT_ENRICHED, enriched_mces)
--- a/metadata-ingestion/examples/demo_data/generate_demo_data.sh
+++ b/metadata-ingestion/examples/demo_data/generate_demo_data.sh
@ -0,0 +1,18 @@
+#!/bin/bash
+
+# This script will use the YML files in examples/demo_data to generate
+# all_covid19_datasets.json, directives.csv, and finally demo_data.json.
+
+set -euxo pipefail
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# Fetch public COVID-19 datasets from BigQuery.
+datahub ingest -c $DIR/bigquery_covid19_to_file.yml
+
+# Pull the directives CSV from Google sheets.
+# See https://docs.google.com/spreadsheets/d/17c5SBiXEw5PuV7oEkC2uQnX55C6TPZTnr6XRQ6X-Qy0/edit#gid=0.
+DIRECTIVES_URL="https://docs.google.com/spreadsheets/d/e/2PACX-1vSUtBW2wEb3AO0fk8XsZRauVzdFpXb3Jj_G3L3ngmNPUsnB-12KW_JRXIqpXpZYeYMuaiQQrM8Huu3f/pub?gid=0&single=true&output=csv"
+curl -L "${DIRECTIVES_URL}" --output $DIR/directives.csv
+
+# Enrich the COVID-19 datasets using the directives.
+python $DIR/enrich.py
--- a/metadata-ingestion/src/datahub/ingestion/api/report.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/report.py
@ -9,7 +9,7 @@ class Report:
        return self.__dict__

    def as_string(self) -> str:
-        return pprint.pformat(self.as_obj(), width=120)
+        return pprint.pformat(self.as_obj(), width=150)

    def as_json(self) -> str:
        return json.dumps(self.as_obj())
--- a/metadata-ingestion/src/datahub/ingestion/source/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql_common.py
@ -97,6 +97,7 @@ _field_type_mapping = {
    types.Boolean: BooleanTypeClass,
    types.Enum: EnumTypeClass,
    types._Binary: BytesTypeClass,
+    types.LargeBinary: BytesTypeClass,
    types.PickleType: BytesTypeClass,
    types.ARRAY: ArrayTypeClass,
    types.String: StringTypeClass,
@ -104,6 +105,15 @@ _field_type_mapping = {
    # assigns the NullType by default. We want to carry this warning through.
    types.NullType: NullTypeClass,
 }
+_known_unknown_field_types = {
+    types.Date,
+    types.Time,
+    types.DateTime,
+    types.Interval,
+    types.DATE,
+    types.DATETIME,
+    types.TIMESTAMP,
+}


 def get_column_type(
@ -118,6 +128,11 @@ def get_column_type(
        if isinstance(column_type, sql_type):
            TypeClass = _field_type_mapping[sql_type]
            break
+    if TypeClass is None:
+        for sql_type in _known_unknown_field_types:
+            if isinstance(column_type, sql_type):
+                TypeClass = NullTypeClass
+                break

    if TypeClass is None:
        sql_report.report_warning(