Fix #605: Ingestion: metadata list apis should paginate (#606)

* Fix #605: Ingestion: metadata list apis should paginate

* Fix #605: Ingestion: metadata list apis should paginate, Addressing review comments
This commit is contained in:
Sriharsha Chintalapani 2021-09-28 11:36:08 -07:00 committed by GitHub
parent ed4508ab2c
commit f7da8045b4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 96 additions and 33 deletions

View File

@ -4,7 +4,8 @@
"config": { "config": {
"include_tables": "true", "include_tables": "true",
"include_topics": "true", "include_topics": "true",
"include_dashboards": "true" "include_dashboards": "true",
"limit_records": 10
} }
}, },
"sink": { "sink": {

View File

@ -16,6 +16,8 @@
import logging import logging
from typing import List from typing import List
from pydantic import BaseModel
from metadata.config.common import ConfigModel from metadata.config.common import ConfigModel
from metadata.generated.schema.api.data.createChart import CreateChartEntityRequest from metadata.generated.schema.api.data.createChart import CreateChartEntityRequest
from metadata.generated.schema.api.data.createDashboard import CreateDashboardEntityRequest from metadata.generated.schema.api.data.createDashboard import CreateDashboardEntityRequest
@ -28,6 +30,7 @@ from metadata.generated.schema.api.services.createMessagingService import Create
from metadata.generated.schema.entity.data.chart import Chart from metadata.generated.schema.entity.data.chart import Chart
from metadata.generated.schema.entity.data.dashboard import Dashboard from metadata.generated.schema.entity.data.dashboard import Dashboard
from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.pipeline import Pipeline
from metadata.generated.schema.entity.data.table import Table, TableData, TableJoins, TableProfile from metadata.generated.schema.entity.data.table import Table, TableData, TableJoins, TableProfile
from metadata.generated.schema.entity.data.topic import Topic from metadata.generated.schema.entity.data.topic import Topic
from metadata.generated.schema.entity.services.dashboardService import DashboardService from metadata.generated.schema.entity.services.dashboardService import DashboardService
@ -52,12 +55,33 @@ from okta.jwt import JWT
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
DatabaseServiceEntities = List[DatabaseService] DatabaseServiceEntities = List[DatabaseService]
DatabaseEntities = List[Database] DatabaseEntities = List[Database]
TableEntities = List[Table]
Tags = List[Tag] Tags = List[Tag]
Topics = List[Topic]
Dashboards = List[Dashboard]
TableProfiles = List[TableProfile] TableProfiles = List[TableProfile]
class TableEntities(BaseModel):
    """One page of tables from the list API together with its paging details."""

    # Tables contained in this page.
    tables: List[Table]
    # Value of the 'total' entry of the response's paging section.
    total: int
    # Cursor to request the next page with; None when the response had no 'after' entry.
    after: str = None
class TopicEntities(BaseModel):
    """One page of topics from the list API together with its paging details."""

    # Topics contained in this page.
    topics: List[Topic]
    # Value of the 'total' entry of the response's paging section.
    total: int
    # Cursor to request the next page with; None when the response had no 'after' entry.
    after: str = None
class DashboardEntities(BaseModel):
    """One page of dashboards from the list API together with its paging details."""

    # Dashboards contained in this page.
    dashboards: List[Dashboard]
    # Value of the 'total' entry of the response's paging section.
    total: int
    # Cursor to request the next page with; None when the response had no 'after' entry.
    after: str = None
class PipelineEntities(BaseModel):
    """A page of pipelines with a total count and an optional cursor.

    NOTE(review): no pipeline list call is visible here; presumably this is
    filled the same way as the table/topic/dashboard page models -- confirm.
    """

    # Pipelines contained in this page.
    pipelines: List[Pipeline]
    # Total count reported alongside the page.
    total: int
    # Cursor for the next page; defaults to None.
    after: str = None
class MetadataServerConfig(ConfigModel): class MetadataServerConfig(ConfigModel):
api_endpoint: str api_endpoint: str
api_version: str = 'v1' api_version: str = 'v1'
@ -146,6 +170,7 @@ class Auth0AuthenticationProvider(AuthenticationProvider):
token = json.loads(data.decode("utf-8")) token = json.loads(data.decode("utf-8"))
return token['access_token'] return token['access_token']
class OpenMetadataAPIClient(object): class OpenMetadataAPIClient(object):
client: REST client: REST
_auth_provider: AuthenticationProvider _auth_provider: AuthenticationProvider
@ -229,24 +254,30 @@ class OpenMetadataAPIClient(object):
""" Delete Database using ID """ """ Delete Database using ID """
self.client.delete('/databases/{}'.format(database_id)) self.client.delete('/databases/{}'.format(database_id))
def list_tables(self, fields: str = None, after: str = None, limit: int = 1000000) -> TableEntities:
    """List one page of tables.

    Args:
        fields: Comma-separated field names sent as the ``fields`` query
            parameter; left out of the request when ``None``.
        after: Paging cursor taken from a previous page; left out of the
            request when ``None`` (first page).
        limit: Page size; sent whenever a query string is sent.

    Returns:
        The response exactly as the client returned it when
        ``self._use_raw_data`` is truthy; otherwise a ``TableEntities``
        holding the parsed tables, the reported total and the next cursor
        (``None`` when the response carries no ``after`` entry).
    """
    if fields is None and after is None:
        # Nothing to pass along: keep the bare request used for argument-less calls.
        resp = self.client.get('/tables')
    else:
        # Build the query from whichever arguments were given, so a cursor is
        # honoured even when no field list is requested.
        params = []
        if fields is not None:
            params.append('fields={}'.format(fields))
        if after is not None:
            # NOTE(review): the cursor is inserted verbatim -- confirm it is
            # URL-safe or that the client encodes the path.
            params.append('after={}'.format(after))
        params.append('limit={}'.format(limit))
        resp = self.client.get('/tables?{}'.format('&'.join(params)))

    if self._use_raw_data:
        return resp

    paging = resp['paging']
    return TableEntities(
        tables=[Table(**t) for t in resp['data']],
        total=paging['total'],
        after=paging.get('after'),
    )
def ingest_sample_data(self, table_id, sample_data):
    """PUT sample data for a table and return the stored sample data.

    The table id is read from ``table_id.__root__`` and the request body is
    ``sample_data.json()``; the ``sampleData`` entry of the response is
    returned as a ``TableData``.
    """
    endpoint = '/tables/{}/sampleData'.format(table_id.__root__)
    payload = sample_data.json()
    response = self.client.put(endpoint, data=payload)
    stored = response['sampleData']
    return TableData(**stored)
def ingest_table_profile_data(self, table_id, table_profile):
    """PUT profile data for a table and return the profiles in the response.

    Args:
        table_id: Identifier whose ``__root__`` value is placed in the URL.
        table_profile: Object whose ``json()`` output is sent as the body.

    Returns:
        A list with one ``TableProfile`` per item of the response's
        ``tableProfile`` entry.
    """
    # The payload is no longer echoed to stdout: that was a leftover debug
    # print that dumped every profile on each call.
    resp = self.client.put(
        '/tables/{}/tableProfile'.format(table_id.__root__),
        data=table_profile.json(),
    )
    return [TableProfile(**t) for t in resp['tableProfile']]
@ -310,17 +341,23 @@ class OpenMetadataAPIClient(object):
resp = self.client.put('/topics', data=create_topic_request.json()) resp = self.client.put('/topics', data=create_topic_request.json())
return Topic(**resp) return Topic(**resp)
def list_topics(self, fields: str = None, after: str = None, limit: int = 1000000) -> TopicEntities:
    """List one page of topics.

    Args:
        fields: Comma-separated field names sent as the ``fields`` query
            parameter; left out of the request when ``None``.
        after: Paging cursor taken from a previous page; left out of the
            request when ``None`` (first page).
        limit: Page size; sent whenever a query string is sent.

    Returns:
        The response exactly as the client returned it when
        ``self._use_raw_data`` is truthy; otherwise a ``TopicEntities``
        holding the parsed topics, the reported total and the next cursor
        (``None`` when the response carries no ``after`` entry).
    """
    if fields is None and after is None:
        # Must hit the topics endpoint; requesting '/tables' here would hand
        # table payloads to the Topic parser below.
        resp = self.client.get('/topics')
    else:
        # Build the query from whichever arguments were given, so a cursor is
        # honoured even when no field list is requested.
        params = []
        if fields is not None:
            params.append('fields={}'.format(fields))
        if after is not None:
            # NOTE(review): the cursor is inserted verbatim -- confirm it is
            # URL-safe or that the client encodes the path.
            params.append('after={}'.format(after))
        params.append('limit={}'.format(limit))
        resp = self.client.get('/topics?{}'.format('&'.join(params)))

    if self._use_raw_data:
        return resp

    paging = resp['paging']
    return TopicEntities(
        topics=[Topic(**t) for t in resp['data']],
        total=paging['total'],
        after=paging.get('after'),
    )
def get_dashboard_service(self, service_name: str) -> DashboardService: def get_dashboard_service(self, service_name: str) -> DashboardService:
"""Get the Dashboard service""" """Get the Dashboard service"""
@ -354,16 +391,24 @@ class OpenMetadataAPIClient(object):
resp = self.client.put('/dashboards', data=create_dashboard_request.json()) resp = self.client.put('/dashboards', data=create_dashboard_request.json())
return Dashboard(**resp) return Dashboard(**resp)
def list_dashboards(self, fields: str = None, after: str = None, limit: int = 1000000) -> DashboardEntities:
    """List one page of dashboards.

    Args:
        fields: Comma-separated field names sent as the ``fields`` query
            parameter; left out of the request when ``None``.
        after: Paging cursor taken from a previous page; left out of the
            request when ``None`` (first page).
        limit: Page size; sent whenever a query string is sent.

    Returns:
        The response exactly as the client returned it when
        ``self._use_raw_data`` is truthy; otherwise a ``DashboardEntities``
        holding the parsed dashboards, the reported total and the next
        cursor (``None`` when the response carries no ``after`` entry).
    """
    if fields is None and after is None:
        # Nothing to pass along: keep the bare request used for argument-less calls.
        resp = self.client.get('/dashboards')
    else:
        # Build the query from whichever arguments were given, so a cursor is
        # honoured even when no field list is requested.
        params = []
        if fields is not None:
            params.append('fields={}'.format(fields))
        if after is not None:
            # NOTE(review): the cursor is inserted verbatim -- confirm it is
            # URL-safe or that the client encodes the path.
            params.append('after={}'.format(after))
        params.append('limit={}'.format(limit))
        resp = self.client.get('/dashboards?{}'.format('&'.join(params)))

    if self._use_raw_data:
        return resp

    paging = resp['paging']
    return DashboardEntities(
        dashboards=[Dashboard(**t) for t in resp['data']],
        total=paging['total'],
        after=paging.get('after'),
    )
def close(self):
    """Close the client this object sends its requests through."""
    rest_client = self.client
    rest_client.close()

View File

@ -35,7 +35,7 @@ class MetadataTablesRestSourceConfig(ConfigModel):
include_tables: Optional[bool] = True include_tables: Optional[bool] = True
include_topics: Optional[bool] = True include_topics: Optional[bool] = True
include_dashboards: Optional[bool] = True include_dashboards: Optional[bool] = True
limit_records: int = 50000 limit_records: int = 1000
@dataclass @dataclass
@ -92,28 +92,45 @@ class MetadataSource(Source):
def fetch_table(self) -> Table:
    """Yield every table from the metadata server, one page at a time.

    Yields nothing when ``self.config.include_tables`` is falsy. Each page is
    requested with ``self.config.limit_records`` as its size; every table is
    reported to ``self.status`` before being yielded, and paging stops once a
    page comes back without an ``after`` cursor.
    """
    if not self.config.include_tables:
        return

    cursor = None
    more_pages = True
    while more_pages:
        page = self.client.list_tables(
            fields="columns,tableConstraints,usageSummary,owner,database,tags,followers",
            after=cursor,
            limit=self.config.limit_records)
        for entity in page.tables:
            self.status.scanned_table(entity.name.__root__)
            yield entity
        # A missing cursor marks the last page.
        cursor = page.after
        more_pages = cursor is not None
def fetch_topic(self) -> Topic:
    """Yield every topic from the metadata server, one page at a time.

    Yields nothing when ``self.config.include_topics`` is falsy. Each page is
    requested with ``self.config.limit_records`` as its size; every topic is
    reported to ``self.status`` before being yielded, and paging stops once a
    page comes back without an ``after`` cursor.
    """
    if not self.config.include_topics:
        return

    cursor = None
    more_pages = True
    while more_pages:
        page = self.client.list_topics(
            fields="owner,service,tags,followers",
            after=cursor,
            limit=self.config.limit_records)
        for entity in page.topics:
            self.status.scanned_topic(entity.name.__root__)
            yield entity
        # A missing cursor marks the last page.
        cursor = page.after
        more_pages = cursor is not None
def fetch_dashboard(self) -> Dashboard:
    """Yield every dashboard from the metadata server, one page at a time.

    Yields nothing when ``self.config.include_dashboards`` is falsy. Each page
    is requested with ``self.config.limit_records`` as its size; every
    dashboard is reported to ``self.status`` (by its ``name`` attribute as-is)
    before being yielded, and paging stops once a page comes back without an
    ``after`` cursor.
    """
    if not self.config.include_dashboards:
        return

    cursor = None
    more_pages = True
    while more_pages:
        page = self.client.list_dashboards(
            fields="owner,service,tags,followers,charts,usageSummary",
            after=cursor,
            limit=self.config.limit_records)
        for entity in page.dashboards:
            self.status.scanned_dashboard(entity.name)
            yield entity
        # A missing cursor marks the last page.
        cursor = page.after
        more_pages = cursor is not None
def get_status(self) -> SourceStatus:
    """Return the status object stored on this source."""
    current_status = self.status
    return current_status