feat(ingestion): Adds support for memory profiling (#8856)

Co-authored-by: Harshal Sheth <hsheth2@gmail.com>

parent 8813ae2fb1
commit f6e1312063
@ -140,6 +140,7 @@ module.exports = {
        "metadata-ingestion/docs/dev_guides/classification",
        "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source",
        "metadata-ingestion/docs/dev_guides/sql_profiles",
        "metadata-ingestion/docs/dev_guides/profiling_ingestions",
      ],
    },
  ],
55  metadata-ingestion/docs/dev_guides/profiling_ingestions.md  Normal file
@ -0,0 +1,55 @@
import FeatureAvailability from '@site/src/components/FeatureAvailability';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Profiling ingestions

<FeatureAvailability/>

**🤝 Version compatibility**

> Open Source DataHub: **0.11.1** | Acryl: **0.2.12**

This page documents how to profile the memory usage of ingestion runs. This is useful when sizing the resources needed to ingest a source, or when developing new features or sources.

## How to use

Install the `debug` plugin for DataHub's CLI wherever the ingestion runs:

```bash
pip install 'acryl-datahub[debug]'
```

This will install [memray](https://github.com/bloomberg/memray) in your Python environment.
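If you want to confirm the install, a quick import check works (an illustrative command, not from the original guide):

```bash
# Sanity check: memray should now be importable in the same environment
python -c "import memray; print('memray is available')"
```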
Add a flag to your ingestion recipe to generate a memray memory dump of your ingestion:

```yaml
source:
  ...

sink:
  ...

flags:
  generate_memory_profiles: "<path to folder where dumps will be written to>"
```
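For reference, a filled-in recipe might look like the following. The `file` source, `datahub-rest` sink, and dump path are illustrative choices, not part of the original guide:

```yaml
source:
  type: file
  config:
    # Hypothetical input for this example
    path: ./sample_mces.json

sink:
  type: datahub-rest
  config:
    server: http://localhost:8080

flags:
  # memray dump files will be written into this folder (illustrative path)
  generate_memory_profiles: "/tmp/datahub-profiles"
```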
Once the ingestion run starts, a binary file will be created and appended to during the execution of the ingestion.

These files follow the pattern `file-<ingestion-run-urn>.bin` for unique identification.
Once the ingestion has finished, you can use `memray` to analyze the memory dump in a flamegraph view:

```bash
memray flamegraph file-None-file-2023_09_18-21_38_43.bin
```

This will generate an interactive HTML file for analysis:

<p align="center">
  <img width="70%" src="https://github.com/datahub-project/static-assets/blob/main/imgs/metadata-ingestion/memray-example.png?raw=true"/>
</p>
`memray` has an extensive set of features for memory investigation. Take a look at their [documentation](https://bloomberg.github.io/memray/overview.html) to see the full feature set.
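For instance, a few other reporters can be run against the same dump file (subcommand names as documented by memray; verify them against your installed version):

```bash
memray stats file-None-file-2023_09_18-21_38_43.bin    # high-level allocation statistics
memray tree file-None-file-2023_09_18-21_38_43.bin     # terminal tree view of allocations
memray table file-None-file-2023_09_18-21_38_43.bin    # HTML table of allocations
```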
## Questions

If you've got any questions on configuring profiling, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
@ -431,6 +431,10 @@ pytest_dep = "pytest>=6.2.2"
deepdiff_dep = "deepdiff"
test_api_requirements = {pytest_dep, deepdiff_dep, "PyYAML"}

debug_requirements = {
    "memray"
}

base_dev_requirements = {
    *base_requirements,
    *framework_common,

@ -723,5 +727,6 @@ See the [DataHub docs](https://datahubproject.io/docs/metadata-ingestion).
    "dev": list(dev_requirements),
    "testing-utils": list(test_api_requirements),  # To import `datahub.testing`
    "integration-tests": list(full_test_dev_requirements),
    "debug": list(debug_requirements),
},
)
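With this in place, the new extra is consumed with standard pip syntax; for a local checkout that would be (paths assumed, standard pip usage):

```bash
# Illustrative: install the local package together with the new `debug` extra
cd metadata-ingestion
pip install -e '.[debug]'
```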
@ -353,77 +353,89 @@ class Pipeline:
        return False

    def run(self) -> None:
        with contextlib.ExitStack() as stack:
            if self.config.flags.generate_memory_profiles:
                import memray

                stack.enter_context(
                    memray.Tracker(
                        f"{self.config.flags.generate_memory_profiles}/{self.config.run_id}.bin"
                    )
                )

            self.final_status = "unknown"
            self._notify_reporters_on_ingestion_start()
            callback = None
            try:
                callback = (
                    LoggingCallback()
                    if not self.config.failure_log.enabled
                    else DeadLetterQueueCallback(
                        self.ctx, self.config.failure_log.log_config
                    )
                )
                for wu in itertools.islice(
                    self.source.get_workunits(),
                    self.preview_workunits if self.preview_mode else None,
                ):
                    try:
                        if self._time_to_print():
                            self.pretty_print_summary(currently_running=True)
                    except Exception as e:
                        logger.warning(f"Failed to print summary {e}")

                    if not self.dry_run:
                        self.sink.handle_work_unit_start(wu)
                    try:
                        record_envelopes = self.extractor.get_records(wu)
                        for record_envelope in self.transform(record_envelopes):
                            if not self.dry_run:
                                self.sink.write_record_async(record_envelope, callback)

                    except RuntimeError:
                        raise
                    except SystemExit:
                        raise
                    except Exception as e:
                        logger.error(
                            "Failed to process some records. Continuing.",
                            exc_info=e,
                        )
                        # TODO: Transformer errors should cause the pipeline to fail.

                    self.extractor.close()
                    if not self.dry_run:
                        self.sink.handle_work_unit_end(wu)
                self.source.close()
                # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state
                for record_envelope in self.transform(
                    [
                        RecordEnvelope(
                            record=EndOfStream(),
                            metadata={"workunit_id": "end-of-stream"},
                        )
                    ]
                ):
                    if not self.dry_run and not isinstance(
                        record_envelope.record, EndOfStream
                    ):
                        # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc.
                        self.sink.write_record_async(record_envelope, callback)

                self.sink.close()
                self.process_commits()
                self.final_status = "completed"
            except (SystemExit, RuntimeError, KeyboardInterrupt) as e:
                self.final_status = "cancelled"
                logger.error("Caught error", exc_info=e)
                raise
            finally:
                clear_global_warnings()

                if callback and hasattr(callback, "close"):
                    callback.close()  # type: ignore

                self._notify_reporters_on_ingestion_completion()

    def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]:
        """
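To make the control flow above easier to follow, here is a minimal standalone sketch of the same `contextlib.ExitStack` + `memray.Tracker` pattern. It is illustrative only, not the DataHub pipeline; `profile_dir` and the workload are invented for the example:

```python
import contextlib
from typing import Optional

import memray  # provided by the `debug` extra


def run(profile_dir: Optional[str] = None) -> None:
    # ExitStack lets us enter the tracker context only when profiling is
    # requested; when it is off, the stack stays empty and run() is unchanged.
    with contextlib.ExitStack() as stack:
        if profile_dir:
            stack.enter_context(memray.Tracker(f"{profile_dir}/example-run.bin"))

        # Stand-in workload: every allocation below is recorded in the dump.
        chunks = [bytearray(1024) for _ in range(10_000)]
        print(f"allocated {len(chunks)} KiB")


run("/tmp")  # writes /tmp/example-run.bin, ready for `memray flamegraph`
```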
@ -57,6 +57,13 @@ class FlagsConfig(ConfigModel):
        ),
    )

    generate_memory_profiles: Optional[str] = Field(
        default=None,
        description=(
            "Generate memray memory dumps for ingestion process by providing a path to write the dump file in."
        ),
    )


class PipelineConfig(ConfigModel):
    # Once support for discriminated unions gets merged into Pydantic, we can
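For intuition, the new flag behaves like any optional Pydantic field: unset means profiling stays disabled. A minimal standalone sketch (not the actual DataHub config module; Pydantic v1 API assumed):

```python
from typing import Optional

from pydantic import BaseModel, Field


class FlagsConfig(BaseModel):
    # Mirrors the field added above: None disables memory profiling.
    generate_memory_profiles: Optional[str] = Field(default=None)


assert FlagsConfig().generate_memory_profiles is None
cfg = FlagsConfig.parse_obj({"generate_memory_profiles": "/tmp/dumps"})
assert cfg.generate_memory_profiles == "/tmp/dumps"
```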