feat(ingest): bigquery sample data (#2178)

This commit is contained in:
Harshal Sheth 2021-03-05 16:39:34 -08:00 committed by GitHub
parent d47cc31157
commit 1f082b114b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 63945 additions and 1 deletions

View File

@ -3,6 +3,7 @@
output
src/datahub/metadata/
pvenv36/
bq_credentials.json
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@ -0,0 +1 @@
/all_covid19_datasets.json

View File

@ -0,0 +1,19 @@
---
source:
type: bigquery
config:
project_id: "bigquery-public-data"
options:
credentials_path: "./bq_credentials.json"
table_pattern:
allow:
# Allow anything that starts with "covid19"
- "bigquery-public-data\\.covid19.*"
deny:
# Except for tables that end with an underscore.
- ".*_$"
sink:
type: "file"
config:
filename: "./examples/demo_data/all_covid19_datasets.json"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,9 @@
source:
type: "file"
config:
filename: "./examples/demo_data/demo_data.json"
sink:
type: "datahub-rest"
config:
server: 'http://localhost:8080'

View File

@ -0,0 +1,95 @@
table,drop,owners,depends_on
bigquery-public-data.covid19_aha.hospital_beds,,American Heart Association,
bigquery-public-data.covid19_aha.staffing,,American Heart Association,
bigquery-public-data.covid19_ecdc.covid_19_geographic_distribution_worldwide,,European Centre for Disease Prevention and Control,
bigquery-public-data.covid19_ecdc_eu.covid_19_geographic_distribution_worldwide,x,European Centre for Disease Prevention and Control,
bigquery-public-data.covid19_geotab_mobility_impact.airport_traffic,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact.city_congestion,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic,,Geotab,bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic_by_industry
bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic_by_industry,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_daily_fillups,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_weekly_fillups,,Geotab,bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_daily_fillups
bigquery-public-data.covid19_geotab_mobility_impact.lookup_region,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact.port_traffic,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact.us_border_volumes,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact.us_border_wait_times,,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.airport_traffic,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.city_congestion,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.commercial_traffic,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.commercial_traffic_by_industry,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.fuel_station_daily_fillups,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.fuel_station_weekly_fillups,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.lookup_region,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.port_traffic,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.us_border_volumes,x,Geotab,
bigquery-public-data.covid19_geotab_mobility_impact_eu.us_border_wait_times,x,Geotab,
bigquery-public-data.covid19_google_mobility.mobility_report,,Google,
bigquery-public-data.covid19_google_mobility_eu.mobility_report,x,Google,
bigquery-public-data.covid19_govt_response.oxford_policy_tracker,,Oxford,
bigquery-public-data.covid19_italy.data_by_province,,Italian National Institute of Health,
bigquery-public-data.covid19_italy.data_by_region,,Italian National Institute of Health,bigquery-public-data.covid19_italy.data_by_province
bigquery-public-data.covid19_italy.national_trends,,Italian National Institute of Health,bigquery-public-data.covid19_italy.data_by_region
bigquery-public-data.covid19_italy_eu.data_by_province,x,,
bigquery-public-data.covid19_italy_eu.data_by_region,x,,
bigquery-public-data.covid19_italy_eu.national_trends,x,,
bigquery-public-data.covid19_jhu_csse.confirmed_cases,x,,
bigquery-public-data.covid19_jhu_csse.deaths,x,,
bigquery-public-data.covid19_jhu_csse.recovered_cases,x,,
bigquery-public-data.covid19_jhu_csse.summary,x,,
bigquery-public-data.covid19_jhu_csse_eu.confirmed_cases,x,,
bigquery-public-data.covid19_jhu_csse_eu.deaths,x,,
bigquery-public-data.covid19_jhu_csse_eu.recovered_cases,x,,
bigquery-public-data.covid19_jhu_csse_eu.summary,x,,
bigquery-public-data.covid19_nyt.excess_deaths,,New York Times,
bigquery-public-data.covid19_nyt.mask_use_by_county,,New York Times,
bigquery-public-data.covid19_nyt.us_counties,,New York Times,
bigquery-public-data.covid19_nyt.us_states,,New York Times,bigquery-public-data.covid19_nyt.us_counties
bigquery-public-data.covid19_open_data.covid19_open_data,,Google,
bigquery-public-data.covid19_open_data_eu.covid19_open_data,x,,
bigquery-public-data.covid19_public_forecasts.county_14d,,"Harvard Global Health Institute, Google",
bigquery-public-data.covid19_public_forecasts.county_14d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_14d
bigquery-public-data.covid19_public_forecasts.county_14d_historical_,x,"Harvard Global Health Institute, Google",
bigquery-public-data.covid19_public_forecasts.county_28d,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_14d
bigquery-public-data.covid19_public_forecasts.county_28d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_28d
bigquery-public-data.covid19_public_forecasts.county_28d_historical_,x,"Harvard Global Health Institute, Google",
bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d,,"Harvard Global Health Institute, Google",
bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d
bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d_historical_,x,"Harvard Global Health Institute, Google",
bigquery-public-data.covid19_public_forecasts.state_14d,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.county_14d
bigquery-public-data.covid19_public_forecasts.state_14d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.state_14d
bigquery-public-data.covid19_public_forecasts.state_14d_historical_,x,"Harvard Global Health Institute, Google",
bigquery-public-data.covid19_public_forecasts.state_28d,,"Harvard Global Health Institute, Google","bigquery-public-data.covid19_public_forecasts.state_14d, bigquery-public-data.covid19_public_forecasts.county_28d"
bigquery-public-data.covid19_public_forecasts.state_28d_historical,,"Harvard Global Health Institute, Google",bigquery-public-data.covid19_public_forecasts.state_28d
bigquery-public-data.covid19_public_forecasts.state_28d_historical_,x,"Harvard Global Health Institute, Google",
bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d_historical,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d_historical_,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d_historical,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d_historical_,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d_historical,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d_historical_,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d_historical,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d_historical_,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d_historical,x,,
bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d_historical_,x,,
bigquery-public-data.covid19_rxrx19.rxrx19a_embeddings,x,,
bigquery-public-data.covid19_rxrx19.rxrx19a_metadata,x,,
bigquery-public-data.covid19_rxrx19.rxrx19b_embeddings,x,,
bigquery-public-data.covid19_rxrx19.rxrx19b_metadata,x,,
bigquery-public-data.covid19_symptom_search.symptom_search_country_daily,,Google,"bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily, bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily"
bigquery-public-data.covid19_symptom_search.symptom_search_country_weekly,,Google,"bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_weekly, bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly"
bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily,,Google,
bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_weekly,,Google,bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily
bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily,,Google,
bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly,,Google,bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily
bigquery-public-data.covid19_usafacts.confirmed_cases,,USAFacts,
bigquery-public-data.covid19_usafacts.deaths,,USAFacts,
bigquery-public-data.covid19_usafacts.summary,,USAFacts,"bigquery-public-data.covid19_usafacts.confirmed_cases, bigquery-public-data.covid19_usafacts.deaths"
bigquery-public-data.covid19_weathersource_com.county_day_forecast,,Weather Source,bigquery-public-data.covid19_weathersource_com.county_day_history
bigquery-public-data.covid19_weathersource_com.county_day_history,,Weather Source,bigquery-public-data.covid19_weathersource_com.postal_code_day_history
bigquery-public-data.covid19_weathersource_com.postal_code_day_forecast,,Weather Source,bigquery-public-data.covid19_weathersource_com.postal_code_day_history
bigquery-public-data.covid19_weathersource_com.postal_code_day_history,,Weather Source,
1 table drop owners depends_on
2 bigquery-public-data.covid19_aha.hospital_beds American Heart Association
3 bigquery-public-data.covid19_aha.staffing American Heart Association
4 bigquery-public-data.covid19_ecdc.covid_19_geographic_distribution_worldwide European Centre for Disease Prevention and Control
5 bigquery-public-data.covid19_ecdc_eu.covid_19_geographic_distribution_worldwide x European Centre for Disease Prevention and Control
6 bigquery-public-data.covid19_geotab_mobility_impact.airport_traffic Geotab
7 bigquery-public-data.covid19_geotab_mobility_impact.city_congestion Geotab
8 bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic Geotab bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic_by_industry
9 bigquery-public-data.covid19_geotab_mobility_impact.commercial_traffic_by_industry Geotab
10 bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_daily_fillups Geotab
11 bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_weekly_fillups Geotab bigquery-public-data.covid19_geotab_mobility_impact.fuel_station_daily_fillups
12 bigquery-public-data.covid19_geotab_mobility_impact.lookup_region Geotab
13 bigquery-public-data.covid19_geotab_mobility_impact.port_traffic Geotab
14 bigquery-public-data.covid19_geotab_mobility_impact.us_border_volumes Geotab
15 bigquery-public-data.covid19_geotab_mobility_impact.us_border_wait_times Geotab
16 bigquery-public-data.covid19_geotab_mobility_impact_eu.airport_traffic x Geotab
17 bigquery-public-data.covid19_geotab_mobility_impact_eu.city_congestion x Geotab
18 bigquery-public-data.covid19_geotab_mobility_impact_eu.commercial_traffic x Geotab
19 bigquery-public-data.covid19_geotab_mobility_impact_eu.commercial_traffic_by_industry x Geotab
20 bigquery-public-data.covid19_geotab_mobility_impact_eu.fuel_station_daily_fillups x Geotab
21 bigquery-public-data.covid19_geotab_mobility_impact_eu.fuel_station_weekly_fillups x Geotab
22 bigquery-public-data.covid19_geotab_mobility_impact_eu.lookup_region x Geotab
23 bigquery-public-data.covid19_geotab_mobility_impact_eu.port_traffic x Geotab
24 bigquery-public-data.covid19_geotab_mobility_impact_eu.us_border_volumes x Geotab
25 bigquery-public-data.covid19_geotab_mobility_impact_eu.us_border_wait_times x Geotab
26 bigquery-public-data.covid19_google_mobility.mobility_report Google
27 bigquery-public-data.covid19_google_mobility_eu.mobility_report x Google
28 bigquery-public-data.covid19_govt_response.oxford_policy_tracker Oxford
29 bigquery-public-data.covid19_italy.data_by_province Italian National Institute of Health
30 bigquery-public-data.covid19_italy.data_by_region Italian National Institute of Health bigquery-public-data.covid19_italy.data_by_province
31 bigquery-public-data.covid19_italy.national_trends Italian National Institute of Health bigquery-public-data.covid19_italy.data_by_region
32 bigquery-public-data.covid19_italy_eu.data_by_province x
33 bigquery-public-data.covid19_italy_eu.data_by_region x
34 bigquery-public-data.covid19_italy_eu.national_trends x
35 bigquery-public-data.covid19_jhu_csse.confirmed_cases x
36 bigquery-public-data.covid19_jhu_csse.deaths x
37 bigquery-public-data.covid19_jhu_csse.recovered_cases x
38 bigquery-public-data.covid19_jhu_csse.summary x
39 bigquery-public-data.covid19_jhu_csse_eu.confirmed_cases x
40 bigquery-public-data.covid19_jhu_csse_eu.deaths x
41 bigquery-public-data.covid19_jhu_csse_eu.recovered_cases x
42 bigquery-public-data.covid19_jhu_csse_eu.summary x
43 bigquery-public-data.covid19_nyt.excess_deaths New York Times
44 bigquery-public-data.covid19_nyt.mask_use_by_county New York Times
45 bigquery-public-data.covid19_nyt.us_counties New York Times
46 bigquery-public-data.covid19_nyt.us_states New York Times bigquery-public-data.covid19_nyt.us_counties
47 bigquery-public-data.covid19_open_data.covid19_open_data Google
48 bigquery-public-data.covid19_open_data_eu.covid19_open_data x
49 bigquery-public-data.covid19_public_forecasts.county_14d Harvard Global Health Institute, Google
50 bigquery-public-data.covid19_public_forecasts.county_14d_historical Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.county_14d
51 bigquery-public-data.covid19_public_forecasts.county_14d_historical_ x Harvard Global Health Institute, Google
52 bigquery-public-data.covid19_public_forecasts.county_28d Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.county_14d
53 bigquery-public-data.covid19_public_forecasts.county_28d_historical Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.county_28d
54 bigquery-public-data.covid19_public_forecasts.county_28d_historical_ x Harvard Global Health Institute, Google
55 bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d Harvard Global Health Institute, Google
56 bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d_historical Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d
57 bigquery-public-data.covid19_public_forecasts.japan_prefecture_28d_historical_ x Harvard Global Health Institute, Google
58 bigquery-public-data.covid19_public_forecasts.state_14d Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.county_14d
59 bigquery-public-data.covid19_public_forecasts.state_14d_historical Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.state_14d
60 bigquery-public-data.covid19_public_forecasts.state_14d_historical_ x Harvard Global Health Institute, Google
61 bigquery-public-data.covid19_public_forecasts.state_28d Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.state_14d, bigquery-public-data.covid19_public_forecasts.county_28d
62 bigquery-public-data.covid19_public_forecasts.state_28d_historical Harvard Global Health Institute, Google bigquery-public-data.covid19_public_forecasts.state_28d
63 bigquery-public-data.covid19_public_forecasts.state_28d_historical_ x Harvard Global Health Institute, Google
64 bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d x
65 bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d_historical x
66 bigquery-public-data.covid19_public_forecasts_asia_ne1.county_14d_historical_ x
67 bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d x
68 bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d_historical x
69 bigquery-public-data.covid19_public_forecasts_asia_ne1.county_28d_historical_ x
70 bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d x
71 bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d_historical x
72 bigquery-public-data.covid19_public_forecasts_asia_ne1.japan_prefecture_28d_historical_ x
73 bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d x
74 bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d_historical x
75 bigquery-public-data.covid19_public_forecasts_asia_ne1.state_14d_historical_ x
76 bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d x
77 bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d_historical x
78 bigquery-public-data.covid19_public_forecasts_asia_ne1.state_28d_historical_ x
79 bigquery-public-data.covid19_rxrx19.rxrx19a_embeddings x
80 bigquery-public-data.covid19_rxrx19.rxrx19a_metadata x
81 bigquery-public-data.covid19_rxrx19.rxrx19b_embeddings x
82 bigquery-public-data.covid19_rxrx19.rxrx19b_metadata x
83 bigquery-public-data.covid19_symptom_search.symptom_search_country_daily Google bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily, bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily
84 bigquery-public-data.covid19_symptom_search.symptom_search_country_weekly Google bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_weekly, bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly
85 bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily Google
86 bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_weekly Google bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily
87 bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily Google
88 bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_weekly Google bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_2_daily
89 bigquery-public-data.covid19_usafacts.confirmed_cases USAFacts
90 bigquery-public-data.covid19_usafacts.deaths USAFacts
91 bigquery-public-data.covid19_usafacts.summary USAFacts bigquery-public-data.covid19_usafacts.confirmed_cases, bigquery-public-data.covid19_usafacts.deaths
92 bigquery-public-data.covid19_weathersource_com.county_day_forecast Weather Source bigquery-public-data.covid19_weathersource_com.county_day_history
93 bigquery-public-data.covid19_weathersource_com.county_day_history Weather Source bigquery-public-data.covid19_weathersource_com.postal_code_day_history
94 bigquery-public-data.covid19_weathersource_com.postal_code_day_forecast Weather Source bigquery-public-data.covid19_weathersource_com.postal_code_day_history
95 bigquery-public-data.covid19_weathersource_com.postal_code_day_history Weather Source

View File

@ -0,0 +1,187 @@
"""
After running the recipe in this directory, we get a large JSON file called all_covid19_datasets.json.
This script reads that JSON file, adds to it using the directives pull from a Google sheet, and
produces a new JSON file called demo_data.json.
"""
from typing import List
import pathlib
import json
import csv
import dataclasses
import time
from datahub.metadata.schema_classes import (
MetadataChangeEventClass,
DatasetSnapshotClass,
OwnershipClass,
OwnerClass,
OwnershipTypeClass,
CorpUserSnapshotClass,
CorpUserInfoClass,
AuditStampClass,
UpstreamLineageClass,
UpstreamClass,
DatasetLineageTypeClass,
)
DEMO_DATA_DIR = pathlib.Path("./examples/demo_data")
INPUT_ALL_DATASETS = DEMO_DATA_DIR / "all_covid19_datasets.json"
OUTPUT_ENRICHED = DEMO_DATA_DIR / "demo_data.json"
DIRECTIVES_CSV = DEMO_DATA_DIR / "directives.csv"
@dataclasses.dataclass
class Directive:
table: str
drop: bool
owners: List[str]
depends_on: List[str]
def read_mces(path) -> List[MetadataChangeEventClass]:
with open(path) as f:
objs = json.load(f)
mces = [MetadataChangeEventClass.from_obj(obj) for obj in objs]
return mces
def write_mces(path, mces: List[MetadataChangeEventClass]) -> None:
objs = [mce.to_obj() for mce in mces]
with open(path, "w") as f:
json.dump(objs, f, indent=4)
def parse_directive(row) -> Directive:
return Directive(
table=row["table"],
drop=bool(row["drop"]),
owners=[x.strip() for x in row["owners"].split(",") if x],
depends_on=[x.strip() for x in row["depends_on"].split(",") if x],
)
def fetch_directives() -> List[Directive]:
with open(DIRECTIVES_CSV, "r") as f:
reader = csv.DictReader(f)
rows = [parse_directive(row) for row in reader]
return rows
def dataset_name_to_urn(name: str) -> str:
return f"urn:li:dataset:(urn:li:dataPlatform:bigquery,{name},PROD)"
def clean_owner_name(name: str) -> str:
clean = "".join(c for c in name if c.isalpha())
return clean
def owner_name_to_urn(name: str) -> str:
return f"urn:li:corpuser:{name}"
def create_owner_entity_mce(owner: str) -> MetadataChangeEventClass:
clean_name = clean_owner_name(owner)
return MetadataChangeEventClass(
proposedSnapshot=CorpUserSnapshotClass(
urn=owner_name_to_urn(clean_name),
aspects=[
CorpUserInfoClass(
active=True,
displayName=owner,
fullName=owner,
email=f"{clean_name}-demo@example.com",
)
],
)
)
def create_ownership_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
return MetadataChangeEventClass(
proposedSnapshot=DatasetSnapshotClass(
urn=dataset_name_to_urn(directive.table),
aspects=[
OwnershipClass(
owners=[
OwnerClass(
owner=owner_name_to_urn(clean_owner_name(owner)),
type=OwnershipTypeClass.DATAOWNER, # type: ignore
)
for owner in directive.owners
],
lastModified=AuditStampClass(
time=int(time.time() * 1000),
actor="urn:li:corpuser:datahub",
),
)
],
)
)
def create_lineage_aspect_mce(directive: Directive) -> MetadataChangeEventClass:
return MetadataChangeEventClass(
proposedSnapshot=DatasetSnapshotClass(
urn=dataset_name_to_urn(directive.table),
aspects=[
UpstreamLineageClass(
upstreams=[
UpstreamClass(
dataset=dataset_name_to_urn(upstream),
type=DatasetLineageTypeClass.TRANSFORMED, # type: ignore
auditStamp=AuditStampClass(
time=int(time.time() * 1000),
actor="urn:li:corpuser:datahub",
),
)
for upstream in directive.depends_on
]
)
],
)
)
if __name__ == "__main__":
datasets = read_mces(INPUT_ALL_DATASETS)
all_directives = fetch_directives()
directives = [directive for directive in all_directives if not directive.drop]
all_dataset_urns = {
dataset_name_to_urn(directive.table) for directive in all_directives
}
allowed_urns = {
dataset_name_to_urn(directive.table)
for directive in all_directives
if not directive.drop
}
assert all(dataset.proposedSnapshot.urn in all_dataset_urns for dataset in datasets)
filtered_dataset_mces = [
dataset for dataset in datasets if dataset.proposedSnapshot.urn in allowed_urns
]
owner_names = {owner for directive in directives for owner in directive.owners}
owner_entity_mces = [create_owner_entity_mce(owner) for owner in owner_names]
ownership_aspect_mces = [
create_ownership_aspect_mce(directive)
for directive in directives
if directive.owners
]
lineage_aspect_mces = [
create_lineage_aspect_mce(directive)
for directive in directives
if directive.depends_on
]
enriched_mces = (
filtered_dataset_mces
+ owner_entity_mces
+ ownership_aspect_mces
+ lineage_aspect_mces
)
write_mces(OUTPUT_ENRICHED, enriched_mces)

View File

@ -0,0 +1,18 @@
#!/bin/bash
# This script will use the YML files in examples/demo_data to generate
# all_covid19_datasets.json, directives.csv, and finally demo_data.json.
set -euxo pipefail
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# Fetch public COVID-19 datasets from BigQuery.
datahub ingest -c $DIR/bigquery_covid19_to_file.yml
# Pull the directives CSV from Google sheets.
# See https://docs.google.com/spreadsheets/d/17c5SBiXEw5PuV7oEkC2uQnX55C6TPZTnr6XRQ6X-Qy0/edit#gid=0.
DIRECTIVES_URL="https://docs.google.com/spreadsheets/d/e/2PACX-1vSUtBW2wEb3AO0fk8XsZRauVzdFpXb3Jj_G3L3ngmNPUsnB-12KW_JRXIqpXpZYeYMuaiQQrM8Huu3f/pub?gid=0&single=true&output=csv"
curl -L "${DIRECTIVES_URL}" --output $DIR/directives.csv
# Enrich the COVID-19 datasets using the directives.
python $DIR/enrich.py

View File

@ -9,7 +9,7 @@ class Report:
return self.__dict__
def as_string(self) -> str:
return pprint.pformat(self.as_obj(), width=120)
return pprint.pformat(self.as_obj(), width=150)
def as_json(self) -> str:
return json.dumps(self.as_obj())

View File

@ -97,6 +97,7 @@ _field_type_mapping = {
types.Boolean: BooleanTypeClass,
types.Enum: EnumTypeClass,
types._Binary: BytesTypeClass,
types.LargeBinary: BytesTypeClass,
types.PickleType: BytesTypeClass,
types.ARRAY: ArrayTypeClass,
types.String: StringTypeClass,
@ -104,6 +105,15 @@ _field_type_mapping = {
# assigns the NullType by default. We want to carry this warning through.
types.NullType: NullTypeClass,
}
_known_unknown_field_types = {
types.Date,
types.Time,
types.DateTime,
types.Interval,
types.DATE,
types.DATETIME,
types.TIMESTAMP,
}
def get_column_type(
@ -118,6 +128,11 @@ def get_column_type(
if isinstance(column_type, sql_type):
TypeClass = _field_type_mapping[sql_type]
break
if TypeClass is None:
for sql_type in _known_unknown_field_types:
if isinstance(column_type, sql_type):
TypeClass = NullTypeClass
break
if TypeClass is None:
sql_report.report_warning(