Mirror of https://github.com/datahub-project/datahub.git
feat(ingest/graph): Add get_results_by_filter to DataHubGraph (#10987)
Commit fde71d4500 (parent 66f8930164)
@@ -171,6 +171,22 @@ class DataHubGraph(DatahubRestEmitter):
             self.server_id = _MISSING_SERVER_ID
             logger.debug(f"Failed to get server id due to {e}")
 
+    @property
+    def frontend_base_url(self) -> str:
+        """Get the public-facing base url of the frontend
+
+        This url can be used to construct links to the frontend. The url will not include a trailing slash.
+        Note: Only supported with Acryl Cloud.
+        """
+
+        if not self.server_config:
+            self.test_connection()
+
+        base_url = self.server_config.get("baseUrl")
+        if not base_url:
+            raise ValueError("baseUrl not found in server config")
+        return base_url
+
     @classmethod
     def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph":
         return cls(
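The frontend_base_url property above gives automation code a clean way to build links into the DataHub UI. A minimal sketch of how it might be used; the server address, token, and the /dataset/<urn> link path are illustrative assumptions, not part of this commit:

from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

# Hypothetical connection details; replace with a real GMS endpoint and token.
graph = DataHubGraph(
    DatahubClientConfig(server="https://example.acryl.io/gms", token="<token>")
)

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"

# frontend_base_url never ends with a slash, so join paths explicitly.
# The /dataset/<urn> path mirrors how the DataHub UI addresses datasets; treat it as illustrative.
ui_link = f"{graph.frontend_base_url}/dataset/{dataset_urn}"
print(ui_link)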
@@ -812,6 +828,7 @@ class DataHubGraph(DatahubRestEmitter):
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
         extraFilters: Optional[List[SearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[SearchFilterRule]]]] = None,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.
 
@@ -841,7 +858,13 @@ class DataHubGraph(DatahubRestEmitter):
 
         # Env filter.
         orFilters = generate_filter(
-            platform, platform_instance, env, container, status, extraFilters
+            platform,
+            platform_instance,
+            env,
+            container,
+            status,
+            extraFilters,
+            extra_or_filters=extra_or_filters,
         )
 
         graphql_query = textwrap.dedent(
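The new extra_or_filters argument is simply forwarded to generate_filter, so get_urns_by_filter callers can OR additional rules onto the usual platform/env/status conjunction. A hedged sketch, reusing the graph client from the previous snippet and assuming each rule is the usual facet-filter dict with field/values/condition keys (generate_filter consumes the entries as individual rules, even though the method's declared type is broader); the tag and term urns are made up:

# Rules in this list are OR'd with each other and AND'ed with the other filters.
pii_or_sensitive = [
    {"field": "tags", "values": ["urn:li:tag:pii"], "condition": "EQUAL"},
    {"field": "glossaryTerms", "values": ["urn:li:glossaryTerm:Sensitive"], "condition": "EQUAL"},
]

for urn in graph.get_urns_by_filter(
    entity_types=["dataset"],
    platform="snowflake",
    extra_or_filters=pii_or_sensitive,
):
    print(urn)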
@@ -885,6 +908,131 @@ class DataHubGraph(DatahubRestEmitter):
         for entity in self._scroll_across_entities(graphql_query, variables):
             yield entity["urn"]
 
+    def get_results_by_filter(
+        self,
+        *,
+        entity_types: Optional[List[str]] = None,
+        platform: Optional[str] = None,
+        platform_instance: Optional[str] = None,
+        env: Optional[str] = None,
+        query: Optional[str] = None,
+        container: Optional[str] = None,
+        status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
+        batch_size: int = 10000,
+        extra_and_filters: Optional[List[SearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[SearchFilterRule]]]] = None,
+        extra_source_fields: Optional[List[str]] = None,
+        skip_cache: bool = False,
+    ) -> Iterable[dict]:
+        """Fetch all results that match all of the given filters.
+
+        Filters are combined conjunctively. If multiple filters are specified, the results will match all of them.
+        Note that specifying a platform filter will automatically exclude all entity types that do not have a platform.
+        The same goes for the env filter.
+
+        :param entity_types: List of entity types to include. If None, all entity types will be returned.
+        :param platform: Platform to filter on. If None, all platforms will be returned.
+        :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned.
+        :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned.
+        :param query: Query string to filter on. If None, all entities will be returned.
+        :param container: A container urn that entities must be within.
+            This works recursively, so it will include entities within sub-containers as well.
+            If None, all entities will be returned.
+            Note that this requires browsePathV2 aspects (added in 0.10.4+).
+        :param status: Filter on the deletion status of the entity. The default is to only return non-soft-deleted entities.
+        :param extra_and_filters: Additional filters to apply. If specified, the
+            results will match all of the filters.
+        :param extra_or_filters: Additional filters to apply. If specified, the
+            results will match any of the filters.
+
+        :return: An iterable of search results that match the filters.
+        """
+
+        types = self._get_types(entity_types)
+
+        # Add the query default of * if no query is specified.
+        query = query or "*"
+
+        or_filters_final = generate_filter(
+            platform,
+            platform_instance,
+            env,
+            container,
+            status,
+            extra_and_filters,
+            extra_or_filters,
+        )
+        graphql_query = textwrap.dedent(
+            """
+            query scrollUrnsWithFilters(
+                $types: [EntityType!],
+                $query: String!,
+                $orFilters: [AndFilterInput!],
+                $batchSize: Int!,
+                $scrollId: String,
+                $skipCache: Boolean!,
+                $fetchExtraFields: [String!]) {
+
+                scrollAcrossEntities(input: {
+                    query: $query,
+                    count: $batchSize,
+                    scrollId: $scrollId,
+                    types: $types,
+                    orFilters: $orFilters,
+                    searchFlags: {
+                        skipHighlighting: true
+                        skipAggregates: true
+                        skipCache: $skipCache
+                        fetchExtraFields: $fetchExtraFields
+                    }
+                }) {
+                    nextScrollId
+                    searchResults {
+                        entity {
+                            urn
+                        }
+                    }
+                }
+            }
+            """
+        )
+
+        variables = {
+            "types": types,
+            "query": query,
+            "orFilters": or_filters_final,
+            "batchSize": batch_size,
+            "skipCache": "true" if skip_cache else "false",
+            "fetchExtraFields": extra_source_fields,
+        }
+
+        for result in self._scroll_across_entities_results(graphql_query, variables):
+            yield result
+
+    def _scroll_across_entities_results(
+        self, graphql_query: str, variables_orig: dict
+    ) -> Iterable[dict]:
+        variables = variables_orig.copy()
+        first_iter = True
+        scroll_id: Optional[str] = None
+        while first_iter or scroll_id:
+            first_iter = False
+            variables["scrollId"] = scroll_id
+
+            response = self.execute_graphql(
+                graphql_query,
+                variables=variables,
+            )
+            data = response["scrollAcrossEntities"]
+            scroll_id = data["nextScrollId"]
+            for entry in data["searchResults"]:
+                yield entry
+
+            if scroll_id:
+                logger.debug(
+                    f"Scrolling to next scrollAcrossEntities page: {scroll_id}"
+                )
+
     def _scroll_across_entities(
         self, graphql_query: str, variables_orig: dict
     ) -> Iterable[dict]:
@@ -30,7 +30,19 @@ def generate_filter(
     container: Optional[str],
     status: RemovedStatusFilter,
     extra_filters: Optional[List[SearchFilterRule]],
+    extra_or_filters: Optional[List[SearchFilterRule]] = None,
 ) -> List[Dict[str, List[SearchFilterRule]]]:
+    """
+    Generate a search filter based on the provided parameters.
+    :param platform: The platform to filter by.
+    :param platform_instance: The platform instance to filter by.
+    :param env: The environment to filter by.
+    :param container: The container to filter by.
+    :param status: The status to filter by.
+    :param extra_filters: Extra AND filters to apply.
+    :param extra_or_filters: Extra OR filters to apply. These are combined with
+        the AND filters using an OR at the top level.
+    """
     and_filters: List[SearchFilterRule] = []
 
     # Platform filter.
@@ -66,6 +78,14 @@ def generate_filter(
         for and_filter in or_filters
     ]
 
+    # Extra OR filters are distributed across the top-level "and" lists.
+    if extra_or_filters:
+        or_filters = [
+            {"and": and_filter["and"] + [extra_or_filter]}
+            for extra_or_filter in extra_or_filters
+            for and_filter in or_filters
+        ]
+
     return or_filters
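The list comprehension added to generate_filter distributes each extra OR rule across every existing top-level and-block, so a base filter (A AND B) combined with OR rules X and Y becomes (A AND B AND X) OR (A AND B AND Y). A small self-contained illustration of that step, using made-up rules:

# Base OR-of-AND structure, as built by the earlier part of generate_filter.
or_filters = [
    {"and": [{"field": "platform", "values": ["urn:li:dataPlatform:snowflake"], "condition": "EQUAL"}]},
]

# Caller-supplied OR rules (illustrative).
extra_or_filters = [
    {"field": "origin", "values": ["PROD"], "condition": "EQUAL"},
    {"field": "origin", "values": ["DEV"], "condition": "EQUAL"},
]

# Same comprehension as in the diff: each and-block is duplicated once per extra rule.
or_filters = [
    {"and": and_filter["and"] + [extra_or_filter]}
    for extra_or_filter in extra_or_filters
    for and_filter in or_filters
]

# or_filters now holds two and-blocks: one requiring platform=snowflake AND origin=PROD,
# the other requiring platform=snowflake AND origin=DEV.
print(len(or_filters))  # 2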