fix(ingestion/looker): deduplicate the view field (#10482)

Co-authored-by: Pedro Silva <pedro@acryl.io>
This commit is contained in:
sid-acryl 2024-05-15 23:55:07 +05:30 committed by GitHub
parent 8e5f17b131
commit c55c12c918
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 598 additions and 0 deletions

View File

@ -112,6 +112,34 @@ _VIEW_FILE_EXTENSION = ".view.lkml"
_MODEL_FILE_EXTENSION = ".model.lkml" _MODEL_FILE_EXTENSION = ".model.lkml"
def deduplicate_fields(fields: List[ViewField]) -> List[ViewField]:
    """Drop fields that are shadowed by a dimension group of the same name.

    When several fields share a ``ViewField.name`` and at least one of them has
    ``field_type == ViewFieldType.DIMENSION_GROUP``, only the dimension-group
    entries for that name are kept. Fields whose name does not collide with any
    dimension group are returned untouched, in their original order.

    Looker constraints this relies on (as noted by the original author):
      - Any field declared as a dimension or measure can be redefined as a
        dimension_group.
      - A field declared as a dimension can't be redefined as a measure, and
        vice versa.

    Args:
        fields: All fields collected for a view.

    Returns:
        A new list with the shadowed fields removed; the input list is not
        modified.
    """
    # A set gives O(1) membership checks in the filter below; the names are
    # only ever used for lookups, so ordering and duplicates are irrelevant.
    dimension_group_field_names = {
        field.name
        for field in fields
        if field.field_type == ViewFieldType.DIMENSION_GROUP
    }

    # Keep a field if it is itself a dimension group, or if no dimension group
    # claims its name.
    return [
        field
        for field in fields
        if field.field_type == ViewFieldType.DIMENSION_GROUP
        or field.name not in dimension_group_field_names
    ]
def _get_bigquery_definition( def _get_bigquery_definition(
looker_connection: DBConnection, looker_connection: DBConnection,
) -> Tuple[str, Optional[str], Optional[str]]: ) -> Tuple[str, Optional[str], Optional[str]]:
@ -1155,6 +1183,8 @@ class LookerView:
) )
fields: List[ViewField] = dimensions + dimension_groups + measures fields: List[ViewField] = dimensions + dimension_groups + measures
fields = deduplicate_fields(fields)
# Prep "default" values for the view, which will be overridden by the logic below. # Prep "default" values for the view, which will be overridden by the logic below.
view_logic = looker_viewfile.raw_file_content[:max_file_snippet_length] view_logic = looker_viewfile.raw_file_content[:max_file_snippet_length]
sql_table_names: List[str] = [] sql_table_names: List[str] = []

View File

@ -0,0 +1,488 @@
[
{
"entityType": "container",
"entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "looker",
"env": "PROD",
"project_name": "lkml_samples"
},
"name": "lkml_samples"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:looker"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"LookML Project"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "Folders"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"View"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)",
"changeType": "UPSERT",
"aspectName": "viewProperties",
"aspect": {
"json": {
"materialized": false,
"viewLogic": "# File was added to check duplicate field issue\n\nview: dataset_lineages {\n sql_table_name: \"PUBLIC\".\"DATASET_LINEAGES\"\n ;;\n\n dimension: createdon {\n type: date\n sql: ${TABLE}.\"CREATEDON\" ;;\n }\n\n dimension_group: createdon {\n type: time\n timeframes: [\n raw,\n time,\n date,\n week,\n month,\n quarter,\n year\n ]\n sql: ${TABLE}.\"CREATEDON\" ;;\n }\n\n dimension: entity {\n type: string\n sql: ${TABLE}.\"ENTITY\" ;;\n }\n\n dimension: metadata {\n type: string\n sql: ${TABLE}.\"METADATA\" ;;\n }\n\n dimension: urn {\n type: string\n sql: ${TABLE}.\"URN\" ;;\n }\n\n dimension: version {\n type: number\n sql: ${TABLE}.\"VERSION\" ;;\n }\n\n measure: count {\n type: count\n drill_fields: []\n }\n}\n",
"viewLanguage": "lookml"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:78f22c19304954b15e8adb1d9809975e"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
"paths": [
"/Develop/lkml_samples/"
]
}
},
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dataset.UpstreamLineage": {
"upstreams": [
{
"auditStamp": {
"time": 1586847600000,
"actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD)",
"type": "VIEW"
}
],
"fineGrainedLineages": [
{
"upstreamType": "FIELD_SET",
"upstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),entity)"
],
"downstreamType": "FIELD",
"downstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),entity)"
],
"confidenceScore": 1.0
},
{
"upstreamType": "FIELD_SET",
"upstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),metadata)"
],
"downstreamType": "FIELD",
"downstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),metadata)"
],
"confidenceScore": 1.0
},
{
"upstreamType": "FIELD_SET",
"upstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),urn)"
],
"downstreamType": "FIELD",
"downstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),urn)"
],
"confidenceScore": 1.0
},
{
"upstreamType": "FIELD_SET",
"upstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),version)"
],
"downstreamType": "FIELD",
"downstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),version)"
],
"confidenceScore": 1.0
},
{
"upstreamType": "FIELD_SET",
"upstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),createdon)"
],
"downstreamType": "FIELD",
"downstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),createdon)"
],
"confidenceScore": 1.0
},
{
"upstreamType": "FIELD_SET",
"upstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),count)"
],
"downstreamType": "FIELD",
"downstreams": [
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD),count)"
],
"confidenceScore": 1.0
}
]
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "dataset_lineages",
"platform": "urn:li:dataPlatform:looker",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.OtherSchema": {
"rawSchema": ""
}
},
"fields": [
{
"fieldPath": "entity",
"nullable": false,
"description": "",
"label": "",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "string",
"recursive": false,
"globalTags": {
"tags": [
{
"tag": "urn:li:tag:Dimension"
}
]
},
"isPartOfKey": false
},
{
"fieldPath": "metadata",
"nullable": false,
"description": "",
"label": "",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "string",
"recursive": false,
"globalTags": {
"tags": [
{
"tag": "urn:li:tag:Dimension"
}
]
},
"isPartOfKey": false
},
{
"fieldPath": "urn",
"nullable": false,
"description": "",
"label": "",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "string",
"recursive": false,
"globalTags": {
"tags": [
{
"tag": "urn:li:tag:Dimension"
}
]
},
"isPartOfKey": false
},
{
"fieldPath": "version",
"nullable": false,
"description": "",
"label": "",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "number",
"recursive": false,
"globalTags": {
"tags": [
{
"tag": "urn:li:tag:Dimension"
}
]
},
"isPartOfKey": false
},
{
"fieldPath": "createdon",
"nullable": false,
"description": "",
"label": "",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.TimeType": {}
}
},
"nativeDataType": "time",
"recursive": false,
"globalTags": {
"tags": [
{
"tag": "urn:li:tag:Dimension"
},
{
"tag": "urn:li:tag:Temporal"
}
]
},
"isPartOfKey": false
},
{
"fieldPath": "count",
"nullable": false,
"description": "",
"label": "",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.NumberType": {}
}
},
"nativeDataType": "count",
"recursive": false,
"globalTags": {
"tags": [
{
"tag": "urn:li:tag:Measure"
}
]
},
"isPartOfKey": false
}
],
"primaryKeys": []
}
},
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"looker.file.path": "dataset_lineages.view.lkml",
"looker.model": "data"
},
"name": "dataset_lineages",
"tags": []
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.dataset_lineages,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "Develop"
},
{
"id": "urn:li:container:78f22c19304954b15e8adb1d9809975e",
"urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",
"changeType": "UPSERT",
"aspectName": "tagKey",
"aspect": {
"json": {
"name": "Dimension"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Measure",
"changeType": "UPSERT",
"aspectName": "tagKey",
"aspect": {
"json": {
"name": "Measure"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Temporal",
"changeType": "UPSERT",
"aspectName": "tagKey",
"aspect": {
"json": {
"name": "Temporal"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "lookml-test",
"lastRunId": "no-run-id-provided"
}
}
]

View File

@ -0,0 +1,7 @@
connection: "my_connection"
include: "dataset_lineages.view.lkml"
explore: explore_dataset_lineage {
from: dataset_lineages
}

View File

@ -0,0 +1,50 @@
# File was added to check duplicate field issue
view: dataset_lineages {
sql_table_name: "PUBLIC"."DATASET_LINEAGES"
;;
dimension: createdon {
type: date
sql: ${TABLE}."CREATEDON" ;;
}
dimension_group: createdon {
type: time
timeframes: [
raw,
time,
date,
week,
month,
quarter,
year
]
sql: ${TABLE}."CREATEDON" ;;
}
dimension: entity {
type: string
sql: ${TABLE}."ENTITY" ;;
}
dimension: metadata {
type: string
sql: ${TABLE}."METADATA" ;;
}
dimension: urn {
type: string
sql: ${TABLE}."URN" ;;
}
dimension: version {
type: number
sql: ${TABLE}."VERSION" ;;
}
measure: count {
type: count
drill_fields: []
}
}

View File

@ -864,3 +864,26 @@ def test_manifest_parser(pytestconfig: pytest.Config) -> None:
manifest = load_lkml(manifest_file) manifest = load_lkml(manifest_file)
assert manifest assert manifest
@freeze_time(FROZEN_TIME)
def test_duplicate_field_ingest(pytestconfig, tmp_path, mock_time):
    """Run LookML ingestion on the duplicate-field sample and diff against golden.

    The pipeline reads the ``lkml_samples_duplicate_field`` project, writes its
    output into ``tmp_path`` and the result is compared with
    ``duplicate_field_ingestion_golden.json``.
    """
    resources_root = pytestconfig.rootpath / "tests/integration/lookml"
    output_name = "duplicate_ingest_mces_output.json"
    expected_output = resources_root / "duplicate_field_ingestion_golden.json"

    recipe = get_default_recipe(
        f"{tmp_path}/{output_name}",
        f"{resources_root}/lkml_samples_duplicate_field",
    )

    ingestion = Pipeline.create(recipe)
    ingestion.run()
    ingestion.pretty_print_summary()
    # Warnings are escalated too, so a warning during ingestion fails the test.
    ingestion.raise_from_status(raise_warnings=True)

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / output_name,
        golden_path=expected_output,
    )