feat(ingest/elastic): reduce number of calls made (#8477)

Author: Aseem Bansal
Date:   2023-07-23 17:12:31 +05:30
Parent: 8fb5912978
Commit: c0dbea8363


@@ -343,7 +343,7 @@ class ElasticsearchSource(Source):
         self.report = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
-        self.profiling_info: Dict[str, DatasetProfileClass] = {}
+        self.cat_response: Optional[List[Dict[str, Any]]] = None

     @classmethod
     def create(
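The swap above replaces the run-long profile buffer with a fetch-once cache of the `_cat/indices` response: the first profiling lookup pays for a single network call covering every index, and all later lookups read from memory. A minimal sketch of the pattern, assuming an elasticsearch-py client; the `CatIndicesCache` class and `stats_for` helper are illustrative names, not part of this file:

from typing import Any, Dict, List, Optional

from elasticsearch import Elasticsearch


class CatIndicesCache:
    """Fetch /_cat/indices once, then answer per-index lookups from memory."""

    def __init__(self, client: Elasticsearch) -> None:
        self.client = client
        self.cat_response: Optional[List[Dict[str, Any]]] = None

    def stats_for(self, index: str) -> List[Dict[str, Any]]:
        if self.cat_response is None:
            # One network call for the whole run; "h" trims the response to
            # just the columns the profiler needs.
            self.cat_response = self.client.cat.indices(
                params={
                    "format": "json",
                    "bytes": "b",
                    "h": "index,docs.count,store.size",
                }
            )
        return [row for row in self.cat_response if row["index"] == index]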
@@ -357,7 +357,6 @@ class ElasticsearchSource(Source):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
-
         for index in indices:
             self.report.report_index_scanned(index)
@@ -366,12 +365,6 @@ class ElasticsearchSource(Source):
                     yield mcp.as_workunit()
             else:
                 self.report.report_dropped(index)

-        for urn, profiling_info in self.profiling_info.items():
-            yield MetadataChangeProposalWrapper(
-                entityUrn=urn,
-                aspect=profiling_info,
-            ).as_workunit()
-        self.profiling_info = {}
         for mcp in self._get_data_stream_index_count_mcps():
             yield mcp.as_workunit()
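With stats now aggregated at lookup time, the end-of-run flush of `self.profiling_info` removed above is no longer needed; each profile can be emitted inline as soon as its index is processed. A sketch of that inline emission, reusing `MetadataChangeProposalWrapper` and `DatasetProfileClass` as the surrounding code does (the `profile_workunit` helper is an illustrative name):

import time

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import DatasetProfileClass


def profile_workunit(
    dataset_urn: str, row_count: int, column_count: int, size_in_bytes: int
):
    # Emit the profile for one dataset immediately instead of buffering it
    # in a dict and flushing every profile at the end of the run.
    return MetadataChangeProposalWrapper(
        entityUrn=dataset_urn,
        aspect=DatasetProfileClass(
            timestampMillis=int(time.time() * 1000),
            rowCount=row_count,
            columnCount=column_count,
            sizeInBytes=size_in_bytes,
        ),
    ).as_workunit()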
@@ -523,36 +516,44 @@ class ElasticsearchSource(Source):
             )
         if self.source_config.profiling.enabled:
-            cat_response = self.client.cat.indices(
-                index=index, params={"format": "json", "bytes": "b"}
-            )
-            if len(cat_response) == 1:
-                index_res = cat_response[0]
-                docs_count = int(index_res["docs.count"])
-                size = int(index_res["store.size"])
-                if len(self.source_config.collapse_urns.urns_suffix_regex) > 0:
-                    if dataset_urn not in self.profiling_info:
-                        self.profiling_info[dataset_urn] = DatasetProfileClass(
-                            timestampMillis=int(time.time() * 1000),
-                            rowCount=docs_count,
-                            columnCount=len(schema_fields),
-                            sizeInBytes=size,
-                        )
-                    else:
-                        existing_profile = self.profiling_info[dataset_urn]
-                        if existing_profile.rowCount is not None:
-                            docs_count = docs_count + existing_profile.rowCount
-                        if existing_profile.sizeInBytes is not None:
-                            size = size + existing_profile.sizeInBytes
-                        self.profiling_info[dataset_urn] = DatasetProfileClass(
-                            timestampMillis=int(time.time() * 1000),
-                            rowCount=docs_count,
-                            columnCount=len(schema_fields),
-                            sizeInBytes=size,
-                        )
-            else:
-                logger.warning(
-                    "Unexpected response from cat response with multiple rows"
-                )
+            if self.cat_response is None:
+                self.cat_response = self.client.cat.indices(
+                    params={
+                        "format": "json",
+                        "bytes": "b",
+                        "h": "index,docs.count,store.size",
+                    }
+                )
+                if self.cat_response is None:
+                    return
+                for item in self.cat_response:
+                    item["index"] = collapse_name(
+                        name=item["index"],
+                        collapse_urns=self.source_config.collapse_urns,
+                    )
+            profile_info_current = list(
+                filter(lambda x: x["index"] == collapsed_index_name, self.cat_response)
+            )
+            if len(profile_info_current) > 0:
+                self.cat_response = list(
+                    filter(
+                        lambda x: x["index"] != collapsed_index_name, self.cat_response
+                    )
+                )
+                row_count = 0
+                size_in_bytes = 0
+                for profile_info in profile_info_current:
+                    row_count += int(profile_info["docs.count"])
+                    size_in_bytes += int(profile_info["store.size"])
+                yield MetadataChangeProposalWrapper(
+                    entityUrn=dataset_urn,
+                    aspect=DatasetProfileClass(
+                        timestampMillis=int(time.time() * 1000),
+                        rowCount=row_count,
+                        columnCount=len(schema_fields),
+                        sizeInBytes=size_in_bytes,
+                    ),
+                )

     def get_report(self):
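The last hunk's aggregate-and-consume step sums `docs.count` and `store.size` across every cached row whose collapsed name matches, then drops those rows from the cache so indices collapsed to the same URN are not profiled twice. A self-contained sketch of that step under the same assumptions (the `consume_rows` helper is an illustrative name):

from typing import Any, Dict, List, Tuple


def consume_rows(
    cat_response: List[Dict[str, Any]], collapsed_index_name: str
) -> Tuple[List[Dict[str, Any]], int, int]:
    """Sum docs.count/store.size over matching rows, dropping them from the cache."""
    matching = [r for r in cat_response if r["index"] == collapsed_index_name]
    remaining = [r for r in cat_response if r["index"] != collapsed_index_name]
    row_count = sum(int(r["docs.count"]) for r in matching)
    size_in_bytes = sum(int(r["store.size"]) for r in matching)
    return remaining, row_count, size_in_bytes


# Two physical indices collapse to the logical name "logs"; their stats merge.
rows = [
    {"index": "logs", "docs.count": "10", "store.size": "2048"},
    {"index": "logs", "docs.count": "5", "store.size": "1024"},
    {"index": "metrics", "docs.count": "7", "store.size": "512"},
]
rows, n_rows, n_bytes = consume_rows(rows, "logs")
assert (n_rows, n_bytes, len(rows)) == (15, 3072, 1)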