feat: usage stats (part 2) (#2762)
Co-authored-by: Gabe Lyons <itsgabelyons@gmail.com>
This commit is contained in: parent 937f02c6bc, commit 19b2a42a00
@@ -0,0 +1,24 @@
package com.linkedin.datahub.graphql.types.usage;

import com.linkedin.datahub.graphql.generated.FieldUsageCounts;
import com.linkedin.datahub.graphql.types.mappers.ModelMapper;
import javax.annotation.Nonnull;


public class FieldUsageCountsMapper implements ModelMapper<com.linkedin.usage.FieldUsageCounts, FieldUsageCounts> {

  public static final FieldUsageCountsMapper INSTANCE = new FieldUsageCountsMapper();

  public static FieldUsageCounts map(@Nonnull final com.linkedin.usage.FieldUsageCounts usageCounts) {
    return INSTANCE.apply(usageCounts);
  }

  @Override
  public FieldUsageCounts apply(@Nonnull final com.linkedin.usage.FieldUsageCounts usageCounts) {
    FieldUsageCounts result = new FieldUsageCounts();
    result.setCount(usageCounts.getCount());
    result.setFieldName(usageCounts.getFieldName());

    return result;
  }
}
@@ -21,6 +21,10 @@ public class UsageAggregationMetricsMapper implements
     result.setTotalSqlQueries(usageAggregationMetrics.getTotalSqlQueries());
     result.setUniqueUserCount(usageAggregationMetrics.getUniqueUserCount());
     result.setTopSqlQueries(usageAggregationMetrics.getTopSqlQueries());
+    if (usageAggregationMetrics.hasFields()) {
+      result.setFields(
+          usageAggregationMetrics.getFields().stream().map(FieldUsageCountsMapper::map).collect(Collectors.toList()));
+    }
     if (usageAggregationMetrics.hasUsers()) {
       result.setUsers(usageAggregationMetrics.getUsers()
           .stream()
@@ -20,6 +20,10 @@ public class UsageQueryResultAggregationMapper implements
     UsageQueryResultAggregations result = new UsageQueryResultAggregations();
     result.setTotalSqlQueries(pdlUsageResultAggregations.getTotalSqlQueries());
     result.setUniqueUserCount(pdlUsageResultAggregations.getUniqueUserCount());
+    if (pdlUsageResultAggregations.hasFields()) {
+      result.setFields(
+          pdlUsageResultAggregations.getFields().stream().map(FieldUsageCountsMapper::map).collect(Collectors.toList()));
+    }
    if (pdlUsageResultAggregations.hasUsers()) {
      result.setUsers(pdlUsageResultAggregations.getUsers()
          .stream()
@@ -2618,6 +2618,7 @@ type UsageQueryResult {
 type UsageQueryResultAggregations {
     uniqueUserCount: Int
     users: [UserUsageCounts]
+    fields: [FieldUsageCounts]
     totalSqlQueries: Int
 }
@@ -2626,6 +2627,7 @@ type UsageAggregationMetrics {
     users: [UserUsageCounts]
     totalSqlQueries: Int
     topSqlQueries: [String]
+    fields: [FieldUsageCounts]
 }

 type UsageAggregation {
@@ -2635,6 +2637,11 @@ type UsageAggregation {
     metrics: UsageAggregationMetrics
 }

+type FieldUsageCounts {
+    fieldName: String
+    count: Int
+}
+
 enum WindowDuration {
     DAY
     WEEK
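Taken together, these schema additions let clients request per-column usage wherever dataset usage stats are exposed. A minimal sketch of fetching them over HTTP follows; the query shape mirrors the `getDataset` query later in this diff, but the endpoint path, host, and URN are assumptions, not part of this commit:

```python
# Hypothetical client call; assumes a DataHub frontend at localhost:9002
# serving GraphQL at /api/v2/graphql, and a placeholder dataset URN.
import requests

QUERY = """
query getDatasetUsage($urn: String!) {
    dataset(urn: $urn) {
        usageStats {
            aggregations {
                uniqueUserCount
                totalSqlQueries
                fields {
                    fieldName
                    count
                }
            }
        }
    }
}
"""

resp = requests.post(
    "http://localhost:9002/api/v2/graphql",
    json={"query": QUERY, "variables": {"urn": "urn:li:dataset:(...)"}},
)
resp.raise_for_status()
aggregations = resp.json()["data"]["dataset"]["usageStats"]["aggregations"]
for field in aggregations["fields"] or []:  # fields is optional in the schema
    print(field["fieldName"], field["count"])
```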
@@ -82,6 +82,7 @@ export const DatasetProfile = ({ urn }: { urn: string }): JSX.Element => {
                 <SchemaView
                     urn={urn}
                     schema={schema}
+                    usageStats={dataset.usageStats}
                     editableSchemaMetadata={editableSchemaMetadata}
                     updateEditableSchema={(update) => {
                         analytics.event({
@@ -37,7 +37,6 @@ function getTopNQueries(responseSize: number, buckets?: Maybe<UsageAggregation>[
 }

 export default function QueriesTab({ dataset }: Props) {
-    console.log(dataset.usageStats);
     const topQueries = getTopNQueries(5, dataset.usageStats?.buckets);

     if (topQueries.length === 0) {
@@ -1,5 +1,5 @@
-import React from 'react';
-import { Avatar } from 'antd';
+import React, { useMemo } from 'react';
+import { Avatar, Tooltip } from 'antd';
 import { UserUsageCounts } from '../../../../types.generated';

 export type Props = {
@@ -7,10 +7,14 @@ export type Props = {
 };

 export default function UsageFacepile({ users }: Props) {
+    const sortedUsers = useMemo(() => users?.slice().sort((a, b) => (b?.count || 0) - (a?.count || 0)), [users]);
+
     return (
-        <Avatar.Group maxCount={3}>
-            {users?.map((user) => (
-                <Avatar>{user?.userEmail?.charAt(0).toUpperCase()}</Avatar>
+        <Avatar.Group maxCount={2}>
+            {sortedUsers?.map((user) => (
+                <Tooltip title={user?.userEmail}>
+                    <Avatar>{user?.userEmail?.charAt(0).toUpperCase()}</Avatar>
+                </Tooltip>
             ))}
         </Avatar.Group>
     );
@@ -1,9 +1,12 @@
-import React, { useState, useEffect } from 'react';
+import React, { useState, useEffect, useMemo } from 'react';
+import { geekblue } from '@ant-design/colors';

-import { Button, Table, Typography } from 'antd';
+import { Button, Table, Tooltip, Typography } from 'antd';
 import { AlignType } from 'rc-table/lib/interface';
 import styled from 'styled-components';
 import { FetchResult } from '@apollo/client';
+import { ColumnsType } from 'antd/lib/table';

 import TypeIcon from './TypeIcon';
 import {
     Schema,
@@ -17,6 +20,7 @@ import {
     EditableSchemaFieldInfoUpdate,
     EntityType,
     GlossaryTerms,
+    UsageQueryResult,
 } from '../../../../../types.generated';
 import TagTermGroup from '../../../../shared/tags/TagTermGroup';
 import { UpdateDatasetMutation } from '../../../../../graphql/dataset.generated';
@@ -38,8 +42,21 @@ const LighterText = styled(Typography.Text)`
     color: rgba(0, 0, 0, 0.45);
 `;

+const UsageBar = styled.div<{ width: number }>`
+    width: ${(props) => props.width}px;
+    height: 10px;
+    background-color: ${geekblue[3]};
+    border-radius: 2px;
+`;
+
+const UsageBarContainer = styled.div`
+    width: 100%;
+    height: 100%;
+`;
+
 export type Props = {
     urn: string;
+    usageStats?: UsageQueryResult | null;
     schema?: Schema | null;
     editableSchemaMetadata?: EditableSchemaMetadata | null;
     updateEditableSchema: (
@@ -109,10 +126,17 @@ function convertEditableSchemaMetadataForUpdate(
     };
 }

-export default function SchemaView({ urn, schema, editableSchemaMetadata, updateEditableSchema }: Props) {
+const USAGE_BAR_MAX_WIDTH = 50;
+
+export default function SchemaView({ urn, schema, editableSchemaMetadata, updateEditableSchema, usageStats }: Props) {
     const [tagHoveredIndex, setTagHoveredIndex] = useState<string | undefined>(undefined);
     const [showRaw, setShowRaw] = useState(false);
     const [rows, setRows] = useState<Array<ExtendedSchemaFields>>([]);
+    const hasUsageStats = useMemo(() => (usageStats?.aggregations?.fields?.length || 0) > 0, [usageStats]);
+    const maxFieldUsageCount = useMemo(
+        () => Math.max(...(usageStats?.aggregations?.fields?.map((field) => field?.count || 0) || [])),
+        [usageStats],
+    );

     useEffect(() => {
         const fields = [...(schema?.fields || [])] as Array<ExtendedSchemaFields>;
@@ -229,6 +253,24 @@ export default function SchemaView({ urn, schema, editableSchemaMetadata, update
         );
     };

+    const usageStatsRenderer = (fieldPath: string) => {
+        const relevantUsageStats = usageStats?.aggregations?.fields?.find(
+            (fieldStats) => fieldStats?.fieldName === fieldPath,
+        );
+
+        if (!relevantUsageStats) {
+            return null;
+        }
+
+        return (
+            <Tooltip placement="topLeft" title={`${relevantUsageStats.count} queries / month`}>
+                <UsageBarContainer>
+                    <UsageBar width={((relevantUsageStats.count || 0) / maxFieldUsageCount) * USAGE_BAR_MAX_WIDTH} />
+                </UsageBarContainer>
+            </Tooltip>
+        );
+    };
+
     const descriptionColumn = {
         title: 'Description',
         dataIndex: 'description',
@@ -253,6 +295,20 @@ export default function SchemaView({ urn, schema, editableSchemaMetadata, update
         }),
     };

+    const usageColumn = {
+        width: 50,
+        title: 'Usage',
+        dataIndex: 'fieldPath',
+        key: 'usage',
+        render: usageStatsRenderer,
+    };
+
+    let allColumns: ColumnsType<SchemaField> = [...defaultColumns, descriptionColumn, tagAndTermColumn];
+
+    if (hasUsageStats) {
+        allColumns = [...allColumns, usageColumn];
+    }
+
     const getRawSchema = (schemaValue) => {
         try {
             return JSON.stringify(JSON.parse(schemaValue), null, 2);
@@ -280,7 +336,7 @@ export default function SchemaView({ urn, schema, editableSchemaMetadata, update
             ) : (
                 rows.length > 0 && (
                     <Table
-                        columns={[...defaultColumns, descriptionColumn, tagAndTermColumn]}
+                        columns={allColumns}
                         dataSource={rows}
                         rowKey="fieldPath"
                         expandable={{ defaultExpandAllRows: true, expandRowByClick: true }}
@@ -131,6 +131,10 @@ query getDataset($urn: String!) {
                     count
                     userEmail
                 }
+                fields {
+                    fieldName
+                    count
+                }
             }
         }
     }
@@ -13,6 +13,18 @@
     "namespace" : "com.linkedin.common",
     "doc" : "Enum to define the length of a bucket when doing aggregations",
     "symbols" : [ "YEAR", "MONTH", "WEEK", "DAY", "HOUR" ]
   }, {
+    "type" : "record",
+    "name" : "FieldUsageCounts",
+    "namespace" : "com.linkedin.usage",
+    "doc" : " Records field-level usage counts for a given resource ",
+    "fields" : [ {
+      "name" : "fieldName",
+      "type" : "string"
+    }, {
+      "name" : "count",
+      "type" : "int"
+    } ]
+  }, {
     "type" : "record",
     "name" : "UsageAggregation",
@@ -79,6 +91,14 @@
       },
       "doc" : " Frequent SQL queries; mostly makes sense for datasets in SQL databases ",
       "optional" : true
+    }, {
+      "name" : "fields",
+      "type" : {
+        "type" : "array",
+        "items" : "FieldUsageCounts"
+      },
+      "doc" : " Field-level usage stats ",
+      "optional" : true
     } ]
   },
   "doc" : " Metrics associated with this bucket "
@@ -116,6 +136,13 @@
         "items" : "UserUsageCounts"
       },
       "optional" : true
+    }, {
+      "name" : "fields",
+      "type" : {
+        "type" : "array",
+        "items" : "FieldUsageCounts"
+      },
+      "optional" : true
     } ]
   },
   "doc" : "Aggregated metrics. All fields are optional here, since they will be populated\nonly if the underlying buckets contain the data required to generate that aggregation."
@@ -10,6 +10,8 @@ import com.linkedin.restli.server.annotations.Action;
 import com.linkedin.restli.server.annotations.ActionParam;
 import com.linkedin.restli.server.annotations.RestLiSimpleResource;
 import com.linkedin.restli.server.resources.SimpleResourceTemplate;
+import com.linkedin.usage.FieldUsageCounts;
+import com.linkedin.usage.FieldUsageCountsArray;
 import com.linkedin.usage.UsageAggregation;
 import com.linkedin.usage.UsageAggregationArray;
 import com.linkedin.usage.UsageQueryResult;
@@ -124,6 +126,29 @@ public class UsageStats extends SimpleResourceTemplate<UsageAggregation> {
       }
     }

+    // Compute aggregations for field usage counts.
+    {
+      Map<String, Integer> fieldAgg = new HashMap<>();
+      buckets.forEach((bucket) -> {
+        Optional.ofNullable(bucket.getMetrics().getFields()).ifPresent(fieldUsageCounts -> {
+          fieldUsageCounts.forEach((fieldCount -> {
+            String key = fieldCount.getFieldName();
+            int count = fieldAgg.getOrDefault(key, 0);
+            count += fieldCount.getCount();
+            fieldAgg.put(key, count);
+          }));
+        });
+      });
+
+      if (!fieldAgg.isEmpty()) {
+        FieldUsageCountsArray fields = new FieldUsageCountsArray();
+        fields.addAll(fieldAgg.entrySet().stream().map((mapping) -> new FieldUsageCounts()
+            .setFieldName(mapping.getKey())
+            .setCount(mapping.getValue())).collect(Collectors.toList()));
+        aggregations.setFields(fields);
+      }
+    }
+
     return new UsageQueryResult()
         .setBuckets(buckets)
         .setAggregations(aggregations);
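The block above is a plain counter merge: for every bucket in the queried range, each field's count is added into a running total keyed by field name. The same logic in a few lines of Python, purely for illustration (the `buckets` value here is made up):

```python
import collections

# Stand-in for UsageAggregation buckets; each dict mirrors UsageAggregationMetrics.fields.
buckets = [
    {"fields": [{"fieldName": "user_id", "count": 2}]},
    {"fields": [{"fieldName": "user_id", "count": 3}, {"fieldName": "ts", "count": 1}]},
]

field_agg: collections.Counter = collections.Counter()
for bucket in buckets:
    for field_count in bucket.get("fields") or []:  # fields is optional per bucket
        field_agg[field_count["fieldName"]] += field_count["count"]

assert field_agg == {"user_id": 5, "ts": 1}
```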
@@ -31,32 +31,34 @@ If you run into an error, try checking the [_common setup issues_](./developing.

 We use a plugin architecture so that you can install only the dependencies you actually need.

-| Plugin Name | Install Command | Provides |
-| ------------- | ---------------------------------------------------------- | ----------------------------------- |
-| file | _included by default_ | File source and sink |
-| console | _included by default_ | Console sink |
-| athena | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
-| bigquery | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
-| feast | `pip install 'acryl-datahub[feast]'` | Feast source |
-| glue | `pip install 'acryl-datahub[glue]'` | AWS Glue source |
-| hive | `pip install 'acryl-datahub[hive]'` | Hive source |
-| mssql | `pip install 'acryl-datahub[mssql]'` | SQL Server source |
-| mysql | `pip install 'acryl-datahub[mysql]'` | MySQL source |
-| oracle | `pip install 'acryl-datahub[oracle]'` | Oracle source |
-| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source |
-| redshift | `pip install 'acryl-datahub[redshift]'` | Redshift source |
-| sqlalchemy | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
-| snowflake | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
-| superset | `pip install 'acryl-datahub[superset]'` | Superset source |
-| mongodb | `pip install 'acryl-datahub[mongodb]'` | MongoDB source |
-| ldap | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source |
-| looker | `pip install 'acryl-datahub[looker]'` | Looker source |
-| lookml | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ |
-| kafka | `pip install 'acryl-datahub[kafka]'` | Kafka source |
-| druid | `pip install 'acryl-datahub[druid]'` | Druid Source |
-| dbt | _no additional dependencies_ | dbt source |
-| datahub-rest | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
-| datahub-kafka | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |
+| Plugin Name | Install Command | Provides |
+| --------------- | ---------------------------------------------------------- | ----------------------------------- |
+| file | _included by default_ | File source and sink |
+| console | _included by default_ | Console sink |
+| athena | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
+| bigquery | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
+| bigquery-usage | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source |
+| feast | `pip install 'acryl-datahub[feast]'` | Feast source |
+| glue | `pip install 'acryl-datahub[glue]'` | AWS Glue source |
+| hive | `pip install 'acryl-datahub[hive]'` | Hive source |
+| mssql | `pip install 'acryl-datahub[mssql]'` | SQL Server source |
+| mysql | `pip install 'acryl-datahub[mysql]'` | MySQL source |
+| oracle | `pip install 'acryl-datahub[oracle]'` | Oracle source |
+| postgres | `pip install 'acryl-datahub[postgres]'` | Postgres source |
+| redshift | `pip install 'acryl-datahub[redshift]'` | Redshift source |
+| sqlalchemy | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
+| snowflake | `pip install 'acryl-datahub[snowflake]'` | Snowflake source |
+| snowflake-usage | `pip install 'acryl-datahub[snowflake-usage]'` | Snowflake usage statistics source |
+| superset | `pip install 'acryl-datahub[superset]'` | Superset source |
+| mongodb | `pip install 'acryl-datahub[mongodb]'` | MongoDB source |
+| ldap | `pip install 'acryl-datahub[ldap]'` ([extra requirements]) | LDAP source |
+| looker | `pip install 'acryl-datahub[looker]'` | Looker source |
+| lookml | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ |
+| kafka | `pip install 'acryl-datahub[kafka]'` | Kafka source |
+| druid | `pip install 'acryl-datahub[druid]'` | Druid Source |
+| dbt | _no additional dependencies_ | dbt source |
+| datahub-rest | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
+| datahub-kafka | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |

 These plugins can be mixed and matched as desired. For example:
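For instance, extras can be combined using standard pip syntax: `pip install 'acryl-datahub[bigquery-usage,datahub-rest]'` installs the new BigQuery usage source together with the REST sink. (This combination is illustrative; the README's own example sits outside this hunk.)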
@@ -451,6 +453,12 @@ source:
     # table_pattern/schema_pattern is same as above
 ```

+:::tip
+
+You can also get fine-grained usage statistics for BigQuery using the `bigquery-usage` source.
+
+:::
+
 ### AWS Athena `athena`

 Extracts:
@@ -766,6 +774,75 @@ sink:
     schema_registry_config: {} # passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient
 ```

+### Google BigQuery Usage Stats `bigquery-usage`
+
+- Fetch a list of queries issued
+- Fetch a list of tables and columns accessed
+- Aggregate these statistics into buckets, by day or hour granularity
+
+Note: the client must have one of the following OAuth scopes:
+
+- https://www.googleapis.com/auth/logging.read
+- https://www.googleapis.com/auth/logging.admin
+- https://www.googleapis.com/auth/cloud-platform.read-only
+- https://www.googleapis.com/auth/cloud-platform
+
+```yml
+source:
+  type: bigquery-usage
+  config:
+    project_id: project # optional - can autodetect from environment
+    options:
+      # See https://googleapis.dev/python/logging/latest/client.html for details.
+      credentials: ~ # optional - see docs
+    env: PROD
+
+    bucket_duration: "DAY"
+    start_time: ~ # defaults to the last full day in UTC (or hour)
+    end_time: ~ # defaults to the last full day in UTC (or hour)
+
+    top_n_queries: 10 # number of queries to save for each table
+```
+
+:::tip
+
+This source only does usage statistics. To get the tables, views, and schemas in your BigQuery project, use the `bigquery` source.
+
+:::
+
+### Snowflake Usage Stats `snowflake-usage`
+
+- Fetch a list of queries issued
+- Fetch a list of tables and columns accessed (excludes views)
+- Aggregate these statistics into buckets, by day or hour granularity
+
+Note: the user/role must have access to the account usage table. The "accountadmin" role has this by default, and other roles can be granted this permission: https://docs.snowflake.com/en/sql-reference/account-usage.html#enabling-account-usage-for-other-roles.
+
+Note: the underlying access history views that we use are only available in Snowflake's enterprise edition or higher.
+
+```yml
+source:
+  type: snowflake-usage
+  config:
+    username: user
+    password: pass
+    host_port: account_name
+    role: ACCOUNTADMIN
+    env: PROD
+
+    bucket_duration: "DAY"
+    start_time: ~ # defaults to the last full day in UTC (or hour)
+    end_time: ~ # defaults to the last full day in UTC (or hour)
+
+    top_n_queries: 10 # number of queries to save for each table
+```
+
+:::tip
+
+This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source.
+
+:::
+
 ### Console `console`

 Simply prints each metadata event to stdout. Useful for experimentation and debugging purposes.
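Either usage recipe above can also be run programmatically through the same `Pipeline` API this commit's smoke tests use (assuming it lives at `datahub.ingestion.run.pipeline`). A minimal sketch, with placeholder credentials and the console sink standing in for a real DataHub sink:

```python
# Sketch only: mirrors the Pipeline.create(...) usage in the smoke tests below;
# all config values are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake-usage",
            "config": {
                "username": "user",          # placeholder
                "password": "pass",          # placeholder
                "host_port": "account_name", # placeholder
                "role": "ACCOUNTADMIN",
                "env": "PROD",
                "bucket_duration": "DAY",
                "top_n_queries": 10,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
pipeline.raise_from_status()
pipeline.pretty_print_summary()
```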
@@ -91,6 +91,7 @@ plugins: Dict[str, Set[str]] = {
     "postgres": sql_common | {"psycopg2-binary", "GeoAlchemy2"},
     "redshift": sql_common | {"sqlalchemy-redshift", "psycopg2-binary", "GeoAlchemy2"},
     "snowflake": sql_common | {"snowflake-sqlalchemy"},
+    "snowflake-usage": sql_common | {"snowflake-sqlalchemy"},
     "superset": {"requests"},
 }
@@ -200,6 +201,7 @@ entry_points = {
         "postgres = datahub.ingestion.source.postgres:PostgresSource",
         "redshift = datahub.ingestion.source.redshift:RedshiftSource",
         "snowflake = datahub.ingestion.source.snowflake:SnowflakeSource",
+        "snowflake-usage = datahub.ingestion.source.snowflake_usage:SnowflakeUsageSource",
         "superset = datahub.ingestion.source.superset:SupersetSource",
     ],
     "datahub.ingestion.sink.plugins": [
@@ -146,9 +146,9 @@ class Pipeline:

     def pretty_print_summary(self) -> int:
         click.echo()
-        click.secho("Source report:", bold=True)
+        click.secho(f"Source ({self.config.source.type}) report:", bold=True)
         click.echo(self.source.get_report().as_string())
-        click.secho("Sink report:", bold=True)
+        click.secho(f"Sink ({self.config.sink.type}) report:", bold=True)
         click.echo(self.sink.get_report().as_string())
         click.echo()
         if self.source.get_report().failures or self.sink.get_report().failures:
@@ -14,11 +14,10 @@ import datahub.emitter.mce_builder as builder
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import UsageStatsWorkUnit
-from datahub.ingestion.source.usage_common import BaseUsageConfig, get_time_bucket
-from datahub.metadata.schema_classes import (
-    UsageAggregationClass,
-    UsageAggregationMetricsClass,
-    UserUsageCountsClass,
+from datahub.ingestion.source.usage_common import (
+    BaseUsageConfig,
+    GenericAggregatedDataset,
+    get_time_bucket,
 )
 from datahub.utilities.delayed_iter import delayed_iter
@@ -101,6 +100,9 @@ class BigQueryTableRef:
         return f"projects/{self.project}/datasets/{self.dataset}/tables/{self.table}"


+AggregatedDataset = GenericAggregatedDataset[BigQueryTableRef]
+
+
 def _table_ref_to_urn(ref: BigQueryTableRef, env: str) -> str:
     return builder.make_dataset_urn(
         "bigquery", f"{ref.project}.{ref.dataset}.{ref.table}", env
@@ -213,18 +215,6 @@ class QueryEvent:
         return queryEvent


-@dataclass
-class AggregatedDataset:
-    bucket_start_time: datetime
-    resource: BigQueryTableRef
-
-    readCount: int = 0
-    queryCount: int = 0
-    queryFreq: Counter[str] = dataclasses.field(default_factory=collections.Counter)
-    userFreq: Counter[str] = dataclasses.field(default_factory=collections.Counter)
-    columnFreq: Counter[str] = dataclasses.field(default_factory=collections.Counter)
-
-
 class BigQueryUsageConfig(BaseUsageConfig):
     project_id: Optional[str] = None
     extra_client_options: dict = {}
@@ -373,43 +363,15 @@ class BigQueryUsageSource(Source):
                 resource,
                 AggregatedDataset(bucket_start_time=floored_ts, resource=resource),
             )

-            agg_bucket.readCount += 1
-            agg_bucket.userFreq[event.actor_email] += 1
-            if event.query:
-                agg_bucket.queryCount += 1
-                agg_bucket.queryFreq[event.query] += 1
-            for column in event.fieldsRead:
-                agg_bucket.columnFreq[column] += 1
+            agg_bucket.add_read_entry(event.actor_email, event.query, event.fieldsRead)

         return datasets

     def _make_usage_stat(self, agg: AggregatedDataset) -> UsageStatsWorkUnit:
-        return UsageStatsWorkUnit(
-            id=f"{agg.bucket_start_time.isoformat()}-{agg.resource}",
-            usageStats=UsageAggregationClass(
-                bucket=int(agg.bucket_start_time.timestamp() * 1000),
-                duration=self.config.bucket_duration,
-                resource=_table_ref_to_urn(agg.resource, self.config.env),
-                metrics=UsageAggregationMetricsClass(
-                    uniqueUserCount=len(agg.userFreq),
-                    users=[
-                        UserUsageCountsClass(
-                            user=builder.UNKNOWN_USER,
-                            count=count,
-                            userEmail=user_email,
-                        )
-                        for user_email, count in agg.userFreq.most_common()
-                    ],
-                    totalSqlQueries=agg.queryCount,
-                    topSqlQueries=[
-                        query
-                        for query, _ in agg.queryFreq.most_common(
-                            self.config.top_n_queries
-                        )
-                    ],
-                ),
-            ),
+        return agg.make_usage_workunit(
+            self.config.bucket_duration,
+            lambda resource: _table_ref_to_urn(resource, self.config.env),
+            self.config.top_n_queries,
         )

     def get_report(self) -> SourceReport:
@@ -0,0 +1,199 @@
import collections
import dataclasses
import json
import logging
from datetime import datetime, timezone
from typing import Dict, Iterable, List, Optional

import pydantic
import pydantic.dataclasses
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

import datahub.emitter.mce_builder as builder
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import UsageStatsWorkUnit
from datahub.ingestion.source.snowflake import SnowflakeConfig
from datahub.ingestion.source.usage_common import (
    BaseUsageConfig,
    GenericAggregatedDataset,
    get_time_bucket,
)

logger = logging.getLogger(__name__)

SnowflakeTableRef = str
AggregatedDataset = GenericAggregatedDataset[SnowflakeTableRef]

SNOWFLAKE_USAGE_SQL_TEMPLATE = """
SELECT
    -- access_history.query_id, -- only for debugging purposes
    access_history.query_start_time,
    query_history.query_text,
    query_history.query_type,
    access_history.base_objects_accessed,
    -- access_history.direct_objects_accessed, -- might be useful in the future
    -- query_history.execution_status, -- not really necessary, but should equal "SUCCESS"
    -- query_history.warehouse_name,
    access_history.user_name,
    users.first_name,
    users.last_name,
    users.display_name,
    users.email,
    query_history.role_name
FROM
    snowflake.account_usage.access_history access_history
LEFT JOIN
    snowflake.account_usage.query_history query_history
    ON access_history.query_id = query_history.query_id
LEFT JOIN
    snowflake.account_usage.users users
    ON access_history.user_name = users.name
WHERE ARRAY_SIZE(base_objects_accessed) > 0
    AND query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
    AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
ORDER BY query_start_time DESC
;
""".strip()


@pydantic.dataclasses.dataclass
class SnowflakeColumnReference:
    columnId: int
    columnName: str


@pydantic.dataclasses.dataclass
class SnowflakeObjectAccessEntry:
    columns: List[SnowflakeColumnReference]
    objectDomain: str
    objectId: int
    objectName: str


@pydantic.dataclasses.dataclass
class SnowflakeJoinedAccessEvent:
    query_start_time: datetime
    query_text: str
    query_type: str
    base_objects_accessed: List[SnowflakeObjectAccessEntry]

    user_name: str
    first_name: Optional[str]
    last_name: Optional[str]
    display_name: Optional[str]
    email: str
    role_name: str


class SnowflakeUsageConfig(SnowflakeConfig, BaseUsageConfig):
    database: str = "snowflake"

    @pydantic.validator("role", always=True)
    def role_accountadmin(cls, v):
        if not v or v.lower() != "accountadmin":
            # This isn't an error, since the privileges can be delegated to other
            # roles as well: https://docs.snowflake.com/en/sql-reference/account-usage.html#enabling-account-usage-for-other-roles
            logger.info(
                'snowflake usage tables are only accessible by role "accountadmin" by default; you set %s',
                v,
            )
        return v


@dataclasses.dataclass
class SnowflakeUsageSource(Source):
    config: SnowflakeUsageConfig
    report: SourceReport = dataclasses.field(default_factory=SourceReport)

    @classmethod
    def create(cls, config_dict, ctx):
        config = SnowflakeUsageConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def get_workunits(self) -> Iterable[UsageStatsWorkUnit]:
        access_events = self._get_snowflake_history()
        aggregated_info = self._aggregate_access_events(access_events)

        for time_bucket in aggregated_info.values():
            for aggregate in time_bucket.values():
                wu = self._make_usage_stat(aggregate)
                self.report.report_workunit(wu)
                yield wu

    def _make_usage_query(self) -> str:
        return SNOWFLAKE_USAGE_SQL_TEMPLATE.format(
            start_time_millis=int(self.config.start_time.timestamp() * 1000),
            end_time_millis=int(self.config.end_time.timestamp() * 1000),
        )

    def _make_sql_engine(self) -> Engine:
        url = self.config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **self.config.options)
        return engine

    def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]:
        query = self._make_usage_query()
        engine = self._make_sql_engine()

        results = engine.execute(query)
        for row in results:
            # Make some minor type conversions.
            if hasattr(row, "_asdict"):
                # Compat with SQLAlchemy 1.3 and 1.4
                # See https://docs.sqlalchemy.org/en/14/changelog/migration_14.html#rowproxy-is-no-longer-a-proxy-is-now-called-row-and-behaves-like-an-enhanced-named-tuple.
                event_dict = row._asdict()
            else:
                event_dict = dict(row)
            event_dict["base_objects_accessed"] = json.loads(
                event_dict["base_objects_accessed"]
            )
            event_dict["query_start_time"] = (
                event_dict["query_start_time"]
            ).astimezone(tz=timezone.utc)

            event = SnowflakeJoinedAccessEvent(**event_dict)
            yield event

    def _aggregate_access_events(
        self, events: Iterable[SnowflakeJoinedAccessEvent]
    ) -> Dict[datetime, Dict[SnowflakeTableRef, AggregatedDataset]]:
        datasets: Dict[
            datetime, Dict[SnowflakeTableRef, AggregatedDataset]
        ] = collections.defaultdict(dict)

        for event in events:
            floored_ts = get_time_bucket(
                event.query_start_time, self.config.bucket_duration
            )

            for object in event.base_objects_accessed:
                resource = object.objectName

                agg_bucket = datasets[floored_ts].setdefault(
                    resource,
                    AggregatedDataset(bucket_start_time=floored_ts, resource=resource),
                )
                agg_bucket.add_read_entry(
                    event.email,
                    event.query_text,
                    [colRef.columnName.lower() for colRef in object.columns],
                )

        return datasets

    def _make_usage_stat(self, agg: AggregatedDataset) -> UsageStatsWorkUnit:
        return agg.make_usage_workunit(
            self.config.bucket_duration,
            lambda resource: builder.make_dataset_urn(
                "snowflake", resource.lower(), self.config.env
            ),
            self.config.top_n_queries,
        )

    def get_report(self):
        return self.report

    def close(self):
        pass
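For reference, `base_objects_accessed` arrives from Snowflake as a JSON array that the pydantic dataclasses above validate. A made-up entry showing the expected shape (the values are invented; only the keys are dictated by the dataclasses):

```python
# Assumes the dataclasses defined in snowflake_usage.py above are in scope.
entry = SnowflakeObjectAccessEntry(
    columns=[
        SnowflakeColumnReference(columnId=1, columnName="UNIQUE_KEY"),
        SnowflakeColumnReference(columnId=2, columnName="STATUS"),
    ],
    objectDomain="Table",
    objectId=1234,
    objectName="DEMO_DB.PUBLIC.AUSTIN311",
)

# This mirrors the per-object lowering done in _aggregate_access_events.
assert [c.columnName.lower() for c in entry.columns] == ["unique_key", "status"]
```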
@@ -1,11 +1,21 @@
+import collections
+import dataclasses
 import enum
 from datetime import datetime, timedelta, timezone
-from typing import Optional
+from typing import Callable, Counter, Generic, List, Optional, TypeVar

 import pydantic

+import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import ConfigModel
-from datahub.metadata.schema_classes import WindowDurationClass
+from datahub.ingestion.api.workunit import UsageStatsWorkUnit
+from datahub.metadata.schema_classes import (
+    FieldUsageCountsClass,
+    UsageAggregationClass,
+    UsageAggregationMetricsClass,
+    UserUsageCountsClass,
+    WindowDurationClass,
+)


 @enum.unique
@@ -30,6 +40,69 @@ def get_bucket_duration_delta(bucketing: BucketDuration) -> timedelta:
         return timedelta(days=1)


+ResourceType = TypeVar("ResourceType")
+
+
+@dataclasses.dataclass
+class GenericAggregatedDataset(Generic[ResourceType]):
+    bucket_start_time: datetime
+    resource: ResourceType
+
+    readCount: int = 0
+    queryCount: int = 0
+    queryFreq: Counter[str] = dataclasses.field(default_factory=collections.Counter)
+    userFreq: Counter[str] = dataclasses.field(default_factory=collections.Counter)
+    columnFreq: Counter[str] = dataclasses.field(default_factory=collections.Counter)
+
+    def add_read_entry(
+        self, user: str, query: Optional[str], fields: List[str]
+    ) -> None:
+        self.readCount += 1
+        self.userFreq[user] += 1
+        if query:
+            self.queryCount += 1
+            self.queryFreq[query] += 1
+        for column in fields:
+            self.columnFreq[column] += 1
+
+    def make_usage_workunit(
+        self,
+        bucket_duration: BucketDuration,
+        urn_builder: Callable[[ResourceType], str],
+        top_n_queries: Optional[int],
+    ) -> UsageStatsWorkUnit:
+        return UsageStatsWorkUnit(
+            id=f"{self.bucket_start_time.isoformat()}-{self.resource}",
+            usageStats=UsageAggregationClass(
+                bucket=int(self.bucket_start_time.timestamp() * 1000),
+                duration=bucket_duration,
+                resource=urn_builder(self.resource),
+                metrics=UsageAggregationMetricsClass(
+                    uniqueUserCount=len(self.userFreq),
+                    users=[
+                        UserUsageCountsClass(
+                            user=builder.UNKNOWN_USER,
+                            count=count,
+                            userEmail=user_email,
+                        )
+                        for user_email, count in self.userFreq.most_common()
+                    ],
+                    totalSqlQueries=self.queryCount,
+                    topSqlQueries=[
+                        query for query, _ in self.queryFreq.most_common(top_n_queries)
+                    ],
+                    fields=[
+                        FieldUsageCountsClass(
+                            fieldName=column,
+                            count=count,
+                        )
+                        for column, count in self.columnFreq.most_common()
+                    ],
+                ),
+            ),
+        )
+
+
 class BaseUsageConfig(ConfigModel):
     # start_time and end_time will be populated by the validators.
     bucket_duration: BucketDuration = BucketDuration.DAY
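A quick illustration of the lifecycle both sources now share: read events are folded into a `GenericAggregatedDataset` keyed by time bucket and resource, and each aggregate is then emitted as one workunit. The resource string and URN builder below are illustrative, not from the commit:

```python
from datetime import datetime, timezone

from datahub.ingestion.source.usage_common import (
    BucketDuration,
    GenericAggregatedDataset,
)

bucket_start = datetime(2021, 6, 1, tzinfo=timezone.utc)
agg = GenericAggregatedDataset[str](
    bucket_start_time=bucket_start, resource="db.schema.table"
)

agg.add_read_entry("alice@example.com", "SELECT a, b FROM db.schema.table", ["a", "b"])
agg.add_read_entry("alice@example.com", None, ["a"])  # a read with no attached query

assert agg.readCount == 2 and agg.queryCount == 1
assert agg.columnFreq["a"] == 2

wu = agg.make_usage_workunit(
    BucketDuration.DAY,
    lambda resource: f"urn:li:dataset:(urn:li:dataPlatform:demo,{resource},PROD)",  # illustrative URN builder
    10,  # top_n_queries
)
print(wu.usageStats.metrics.fields)  # field-level counts, e.g. 'a' -> 2
```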
@@ -4,11 +4,13 @@
 # Do not modify manually!

 # fmt: off
+from .....schema_classes import FieldUsageCountsClass
 from .....schema_classes import UsageAggregationClass
 from .....schema_classes import UsageAggregationMetricsClass
 from .....schema_classes import UserUsageCountsClass


+FieldUsageCounts = FieldUsageCountsClass
 UsageAggregation = UsageAggregationClass
 UsageAggregationMetrics = UsageAggregationMetricsClass
 UserUsageCounts = UserUsageCountsClass
@@ -4731,6 +4731,33 @@
       "name": "topSqlQueries",
       "default": null,
       "doc": " Frequent SQL queries; mostly makes sense for datasets in SQL databases "
     },
+    {
+      "type": [
+        "null",
+        {
+          "type": "array",
+          "items": {
+            "type": "record",
+            "name": "FieldUsageCounts",
+            "namespace": "com.linkedin.pegasus2avro.usage",
+            "fields": [
+              {
+                "type": "string",
+                "name": "fieldName"
+              },
+              {
+                "type": "int",
+                "name": "count"
+              }
+            ],
+            "doc": " Records field-level usage counts for a given resource "
+          }
+        }
+      ],
+      "name": "fields",
+      "default": null,
+      "doc": " Field-level usage stats "
+    }
   ],
   "doc": "Metrics for usage data for a given resource and bucket. Not all fields\nmake sense for all buckets, so every field is optional."
@@ -7606,6 +7606,55 @@ class TagPropertiesClass(DictWrapper):
         self._inner_dict['description'] = value


+class FieldUsageCountsClass(DictWrapper):
+    """ Records field-level usage counts for a given resource """
+
+    RECORD_SCHEMA = get_schema_type("com.linkedin.pegasus2avro.usage.FieldUsageCounts")
+    def __init__(self,
+        fieldName: str,
+        count: int,
+    ):
+        super().__init__()
+
+        self.fieldName = fieldName
+        self.count = count
+
+    @classmethod
+    def construct_with_defaults(cls) -> "FieldUsageCountsClass":
+        self = cls.construct({})
+        self._restore_defaults()
+
+        return self
+
+    def _restore_defaults(self) -> None:
+        self.fieldName = str()
+        self.count = int()
+
+
+    @property
+    def fieldName(self) -> str:
+        # No docs available.
+        return self._inner_dict.get('fieldName')  # type: ignore
+
+
+    @fieldName.setter
+    def fieldName(self, value: str) -> None:
+        # No docs available.
+        self._inner_dict['fieldName'] = value
+
+
+    @property
+    def count(self) -> int:
+        # No docs available.
+        return self._inner_dict.get('count')  # type: ignore
+
+
+    @count.setter
+    def count(self, value: int) -> None:
+        # No docs available.
+        self._inner_dict['count'] = value
+
+
 class UsageAggregationClass(DictWrapper):
     """Usage data for a given resource, rolled up into a bucket."""
@@ -7695,6 +7744,7 @@ class UsageAggregationMetricsClass(DictWrapper):
         users: Union[None, List["UserUsageCountsClass"]]=None,
         totalSqlQueries: Union[None, int]=None,
         topSqlQueries: Union[None, List[str]]=None,
+        fields: Union[None, List["FieldUsageCountsClass"]]=None,
     ):
         super().__init__()
@@ -7702,6 +7752,7 @@ class UsageAggregationMetricsClass(DictWrapper):
         self.users = users
         self.totalSqlQueries = totalSqlQueries
         self.topSqlQueries = topSqlQueries
+        self.fields = fields

     @classmethod
     def construct_with_defaults(cls) -> "UsageAggregationMetricsClass":
@@ -7715,6 +7766,7 @@ class UsageAggregationMetricsClass(DictWrapper):
         self.users = self.RECORD_SCHEMA.field_map["users"].default
         self.totalSqlQueries = self.RECORD_SCHEMA.field_map["totalSqlQueries"].default
         self.topSqlQueries = self.RECORD_SCHEMA.field_map["topSqlQueries"].default
+        self.fields = self.RECORD_SCHEMA.field_map["fields"].default


     @property
@@ -7765,6 +7817,18 @@ class UsageAggregationMetricsClass(DictWrapper):
         self._inner_dict['topSqlQueries'] = value


+    @property
+    def fields(self) -> Union[None, List["FieldUsageCountsClass"]]:
+        """Getter: Field-level usage stats """
+        return self._inner_dict.get('fields')  # type: ignore
+
+
+    @fields.setter
+    def fields(self, value: Union[None, List["FieldUsageCountsClass"]]) -> None:
+        """Setter: Field-level usage stats """
+        self._inner_dict['fields'] = value
+
+
 class UserUsageCountsClass(DictWrapper):
     """ Records a single user's usage counts for a given resource """
@@ -7972,6 +8036,7 @@ __SCHEMA_TYPES = {
     'com.linkedin.pegasus2avro.schema.UnionType': UnionTypeClass,
     'com.linkedin.pegasus2avro.schema.UrnForeignKey': UrnForeignKeyClass,
     'com.linkedin.pegasus2avro.tag.TagProperties': TagPropertiesClass,
+    'com.linkedin.pegasus2avro.usage.FieldUsageCounts': FieldUsageCountsClass,
     'com.linkedin.pegasus2avro.usage.UsageAggregation': UsageAggregationClass,
     'com.linkedin.pegasus2avro.usage.UsageAggregationMetrics': UsageAggregationMetricsClass,
     'com.linkedin.pegasus2avro.usage.UserUsageCounts': UserUsageCountsClass,
@@ -8117,6 +8182,7 @@ __SCHEMA_TYPES = {
     'UnionType': UnionTypeClass,
     'UrnForeignKey': UrnForeignKeyClass,
     'TagProperties': TagPropertiesClass,
+    'FieldUsageCounts': FieldUsageCountsClass,
     'UsageAggregation': UsageAggregationClass,
     'UsageAggregationMetrics': UsageAggregationMetricsClass,
     'UserUsageCounts': UserUsageCountsClass,
@@ -112,6 +112,32 @@
       ],
       "doc": " Frequent SQL queries; mostly makes sense for datasets in SQL databases ",
       "default": null
     },
+    {
+      "name": "fields",
+      "type": [
+        "null",
+        {
+          "type": "array",
+          "items": {
+            "type": "record",
+            "name": "FieldUsageCounts",
+            "doc": " Records field-level usage counts for a given resource ",
+            "fields": [
+              {
+                "name": "fieldName",
+                "type": "string"
+              },
+              {
+                "name": "count",
+                "type": "int"
+              }
+            ]
+          }
+        }
+      ],
+      "doc": " Field-level usage stats ",
+      "default": null
+    }
   ]
 },
(File diff suppressed because it is too large.)
@@ -23,4 +23,4 @@ def assert_mces_equal(output: object, golden: object) -> None:
         r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['createStamp'\]\['time'\]",
     }
     diff = deepdiff.DeepDiff(golden, output, exclude_regex_paths=ignore_paths)
-    assert not diff
+    assert not diff, str(diff)
@@ -13,7 +13,105 @@
         }
       ],
       "totalSqlQueries": 0,
-      "topSqlQueries": []
+      "topSqlQueries": [],
+      "fields": [
+        {
+          "fieldName": "unique_key",
+          "count": 1
+        },
+        {
+          "fieldName": "complaint_type",
+          "count": 1
+        },
+        {
+          "fieldName": "complaint_description",
+          "count": 1
+        },
+        {
+          "fieldName": "owning_department",
+          "count": 1
+        },
+        {
+          "fieldName": "source",
+          "count": 1
+        },
+        {
+          "fieldName": "status",
+          "count": 1
+        },
+        {
+          "fieldName": "status_change_date",
+          "count": 1
+        },
+        {
+          "fieldName": "created_date",
+          "count": 1
+        },
+        {
+          "fieldName": "last_update_date",
+          "count": 1
+        },
+        {
+          "fieldName": "close_date",
+          "count": 1
+        },
+        {
+          "fieldName": "incident_address",
+          "count": 1
+        },
+        {
+          "fieldName": "street_number",
+          "count": 1
+        },
+        {
+          "fieldName": "street_name",
+          "count": 1
+        },
+        {
+          "fieldName": "city",
+          "count": 1
+        },
+        {
+          "fieldName": "incident_zip",
+          "count": 1
+        },
+        {
+          "fieldName": "county",
+          "count": 1
+        },
+        {
+          "fieldName": "state_plane_x_coordinate",
+          "count": 1
+        },
+        {
+          "fieldName": "state_plane_y_coordinate",
+          "count": 1
+        },
+        {
+          "fieldName": "latitude",
+          "count": 1
+        },
+        {
+          "fieldName": "longitude",
+          "count": 1
+        },
+        {
+          "fieldName": "location",
+          "count": 1
+        },
+        {
+          "fieldName": "council_district_code",
+          "count": 1
+        },
+        {
+          "fieldName": "map_page",
+          "count": 1
+        },
+        {
+          "fieldName": "map_tile",
+          "count": 1
+        }
+      ]
     }
   },
   {
@@ -33,84 +131,56 @@
       "topSqlQueries": [
        "\nSELECT * FROM `harshal-playground-306419.test_schema.excess_deaths_derived`;\n\n",
        "SELECT * FROM `harshal-playground-306419.test_schema.excess_deaths_derived`"
      ]
    }
  },
  {
    "bucket": 1622160000000,
    "duration": "DAY",
    "resource": "urn:li:dataset:(urn:li:dataPlatform:bigquery,harshal-playground-306419.test_schema.excess_deaths_derived,PROD)",
    "metrics": {
      "uniqueUserCount": 1,
      "users": [
        {
          "user": "urn:li:corpuser:unknown",
          "count": 2,
          "userEmail": "harshal@acryl.io"
        }
      ],
      "totalSqlQueries": 2,
      "topSqlQueries": [
        "SELECT * FROM `harshal-playground-306419.test_schema.excess_deaths_derived`",
        "# CREATE OR REPLACE TABLE test_schema.excess_deaths_derived AS (SELECT * FROM `bigquery-public-data.covid19_nyt.excess_deaths` LIMIT 10);\n\nSELECT * FROM `harshal-playground-306419.test_schema.excess_deaths_derived`;\n"
      ]
    }
  },
  {
    "bucket": 1622505600000,
    "duration": "DAY",
    "resource": "urn:li:dataset:(urn:li:dataPlatform:bigquery,harshal-playground-306419.test_schema.excess_deaths_derived,PROD)",
    "metrics": {
      "uniqueUserCount": 1,
      "users": [
      "fields": [
        {
          "user": "urn:li:corpuser:unknown",
          "count": 1,
          "userEmail": "harshal@acryl.io"
        }
      ],
      "totalSqlQueries": 1,
      "topSqlQueries": [
        "# CREATE OR REPLACE TABLE test_schema.excess_deaths_derived AS (SELECT * FROM `bigquery-public-data.covid19_nyt.excess_deaths` LIMIT 10);\n\nSELECT * FROM `harshal-playground-306419.test_schema.excess_deaths_derived`;\n"
      ]
    }
  },
  {
    "bucket": 1622505600000,
    "duration": "DAY",
    "resource": "urn:li:dataset:(urn:li:dataPlatform:bigquery,harshal-playground-306419.test_schema.austin311_derived,PROD)",
    "metrics": {
      "uniqueUserCount": 1,
      "users": [
        "fieldName": "placename",
        "count": 4
      },
      {
        "user": "urn:li:corpuser:unknown",
        "count": 1,
        "userEmail": "harshal@acryl.io"
      }
    ],
    "totalSqlQueries": 1,
    "topSqlQueries": [
      "# CREATE OR REPLACE TABLE test_schema.excess_deaths_derived AS (SELECT * FROM `bigquery-public-data.covid19_nyt.excess_deaths` LIMIT 10);\n\nSELECT * FROM `harshal-playground-306419.test_schema.austin311_derived`;\n"
    ]
  }
  },
  {
    "bucket": 1623888000000,
    "duration": "DAY",
    "resource": "urn:li:dataset:(urn:li:dataPlatform:bigquery,harshal-playground-306419.test_schema.austin311_derived,PROD)",
    "metrics": {
      "uniqueUserCount": 1,
      "users": [
        "fieldName": "excess_deaths",
        "count": 4
      },
      {
        "user": "urn:li:corpuser:unknown",
        "count": 2,
        "userEmail": "harshal@acryl.io"
        "fieldName": "deaths",
        "count": 4
      },
      {
        "fieldName": "end_date",
        "count": 4
      },
      {
        "fieldName": "frequency",
        "count": 4
      },
      {
        "fieldName": "expected_deaths",
        "count": 4
      },
      {
        "fieldName": "start_date",
        "count": 4
      },
      {
        "fieldName": "baseline",
        "count": 4
      },
      {
        "fieldName": "year",
        "count": 4
      },
      {
        "fieldName": "month",
        "count": 4
      },
      {
        "fieldName": "week",
        "count": 4
      },
      {
        "fieldName": "country",
        "count": 4
      }
    ],
    "totalSqlQueries": 2,
    "topSqlQueries": [
      "select * from `harshal-playground-306419.test_schema.austin311_derived`",
      "select complaint_description, complaint_type, unique_key, last_update_date from `harshal-playground-306419.test_schema.austin311_derived`"
    ]
  }
}
@@ -14,6 +14,8 @@ import com.linkedin.metadata.search.elasticsearch.update.BulkListener;
 import com.linkedin.metadata.usage.UsageService;
 import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
 import com.linkedin.common.WindowDuration;
+import com.linkedin.usage.FieldUsageCounts;
+import com.linkedin.usage.FieldUsageCountsArray;
 import com.linkedin.usage.UsageAggregation;
 import com.linkedin.usage.UsageAggregationMetrics;
 import com.linkedin.usage.UserUsageCounts;
@@ -139,6 +141,16 @@ public class ElasticUsageService implements UsageService {
       document.set("metrics.top_sql_queries", sqlQueriesDocument);
     });

+    Optional.ofNullable(bucket.getMetrics().getFields()).ifPresent(fields -> {
+      ArrayNode fieldsDocument = JsonNodeFactory.instance.arrayNode();
+      fields.forEach(fieldUsage -> {
+        ObjectNode fieldDocument = JsonNodeFactory.instance.objectNode();
+        fieldDocument.set("field_name", JsonNodeFactory.instance.textNode(fieldUsage.getFieldName()));
+        fieldDocument.set("count", JsonNodeFactory.instance.numberNode(fieldUsage.getCount()));
+        fieldsDocument.add(fieldDocument);
+      });
+      document.set("metrics.fields", fieldsDocument);
+    });
     return document.toString();
   }
@@ -167,7 +179,6 @@ public class ElasticUsageService implements UsageService {
     if (endTime != null) {
       finalQuery.must(QueryBuilders.rangeQuery(ES_KEY_BUCKET_END).lte(endTime));
     }
-    // TODO handle "latest N buckets" style queries

     final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
     searchSourceBuilder.query(finalQuery);
@@ -231,6 +242,18 @@ public class ElasticUsageService implements UsageService {
         metrics.setTopSqlQueries(queries);
       }

+      if (docFields.containsKey("metrics.fields")) {
+        FieldUsageCountsArray fields = new FieldUsageCountsArray();
+        List<Map<String, Object>> docUsers = (List<Map<String, Object>>) docFields.get("metrics.fields");
+        for (Map<String, Object> map : docUsers) {
+          FieldUsageCounts fieldUsage = new FieldUsageCounts();
+          fieldUsage.setFieldName((String) map.get("field_name"));
+          fieldUsage.setCount((Integer) map.get("count"));
+          fields.add(fieldUsage);
+        }
+        metrics.setFields(fields);
+      }
+
       return agg;
     } catch (URISyntaxException e) {
       throw new IllegalArgumentException(e);
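For reference, the serializer above writes dotted keys directly into each bucket document, so a bucket carrying field stats is indexed with a shape like the following (values invented; the keys mirror what `ElasticUsageService` sets):

```python
# Hypothetical indexed document body, matching the "metrics.top_sql_queries"
# and "metrics.fields" keys written above (field_name/count per entry).
indexed_doc = {
    "metrics.top_sql_queries": ["SELECT * FROM demo.events"],
    "metrics.fields": [
        {"field_name": "user_id", "count": 3},
        {"field_name": "event_ts", "count": 1},
    ],
}
```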
@@ -0,0 +1,7 @@
namespace com.linkedin.usage

/** Records field-level usage counts for a given resource */
record FieldUsageCounts {
  fieldName: string
  count: int
}
@@ -16,4 +16,7 @@ record UsageAggregationMetrics {

   /** Frequent SQL queries; mostly makes sense for datasets in SQL databases */
   topSqlQueries: optional array[string]
+
+  /** Field-level usage stats */
+  fields: optional array[FieldUsageCounts]
 }
@@ -17,5 +17,7 @@ record UsageQueryResult {
     totalSqlQueries: optional int

     users: optional array[UserUsageCounts]
+
+    fields: optional array[FieldUsageCounts]
   }
 }
@@ -10,6 +10,9 @@ FRONTEND_ENDPOINT = "http://localhost:9002"
 KAFKA_BROKER = "localhost:9092"

 bootstrap_sample_data = "../metadata-ingestion/examples/mce_files/bootstrap_mce.json"
+usage_sample_data = (
+    "../metadata-ingestion/tests/integration/bigquery-usage/bigquery_usages_golden.json"
+)
 bq_sample_data = "./sample_bq_data.json"
 restli_default_headers = {
     "X-RestLi-Protocol-Version": "2.0.0",
@@ -30,13 +33,12 @@ def test_healthchecks(wait_for_healthchecks):
     pass


-@pytest.mark.dependency(depends=["test_healthchecks"])
-def test_ingestion_via_rest(wait_for_healthchecks):
+def ingest_file(filename: str):
     pipeline = Pipeline.create(
         {
             "source": {
                 "type": "file",
-                "config": {"filename": bootstrap_sample_data},
+                "config": {"filename": filename},
             },
             "sink": {
                 "type": "datahub-rest",
@@ -48,6 +50,16 @@
     pipeline.raise_from_status()


+@pytest.mark.dependency(depends=["test_healthchecks"])
+def test_ingestion_via_rest(wait_for_healthchecks):
+    ingest_file(bootstrap_sample_data)
+
+
+@pytest.mark.dependency(depends=["test_healthchecks"])
+def test_ingestion_usage_via_rest(wait_for_healthchecks):
+    ingest_file(usage_sample_data)
+
+
 @pytest.mark.dependency(depends=["test_healthchecks"])
 def test_ingestion_via_kafka(wait_for_healthchecks):
     pipeline = Pipeline.create(
@@ -74,7 +86,13 @@ def test_ingestion_via_kafka(wait_for_healthchecks):
     time.sleep(kafka_post_ingestion_wait_sec)


-@pytest.mark.dependency(depends=["test_ingestion_via_rest", "test_ingestion_via_kafka"])
+@pytest.mark.dependency(
+    depends=[
+        "test_ingestion_via_rest",
+        "test_ingestion_via_kafka",
+        "test_ingestion_usage_via_rest",
+    ]
+)
 def test_run_ingestion(wait_for_healthchecks):
     # Dummy test so that future ones can just depend on this one.
     pass
@@ -193,6 +211,39 @@ def test_gms_search_dataset(query, min_expected_results):
     assert data["elements"][0]["urn"]


+@pytest.mark.dependency(depends=["test_healthchecks", "test_run_ingestion"])
+def test_gms_usage_fetch():
+    response = requests.post(
+        f"{GMS_ENDPOINT}/usageStats?action=queryRange",
+        headers=restli_default_headers,
+        json={
+            "resource": "urn:li:dataset:(urn:li:dataPlatform:bigquery,harshal-playground-306419.test_schema.excess_deaths_derived,PROD)",
+            "duration": "DAY",
+            "rangeFromEnd": "ALL",
+        },
+    )
+    response.raise_for_status()
+
+    data = response.json()["value"]
+
+    assert len(data["buckets"]) == 3
+    assert data["buckets"][0]["metrics"]["topSqlQueries"]
+
+    fields = data["aggregations"].pop("fields")
+    assert len(fields) == 12
+    assert fields[0]["count"] == 7
+
+    users = data["aggregations"].pop("users")
+    assert len(users) == 1
+    assert users[0]["count"] == 7
+
+    assert data["aggregations"] == {
+        # "fields" and "users" already popped out
+        "totalSqlQueries": 7,
+        "uniqueUserCount": 1,
+    }
+
+
 @pytest.fixture(scope="session")
 def frontend_session(wait_for_healthchecks):
     session = requests.Session()