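"""Generates the ingestion source documentation for the DataHub docs site.

For each source plugin registered in the source registry, this script collects the plugin's
platform, docstring, support status, capabilities, extra pip dependencies, and config schema,
merges in any hand-written docs supplied via --extra-docs, and writes one markdown page per
platform under <out-dir>/sources, plus a lineage feature guide.
"""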
import dataclasses
import glob
import json
import logging
import os
import pathlib
import re
import sys
import textwrap
from importlib.metadata import metadata, requires
from typing import Dict, List, Optional

import click
from docgen_types import Platform, Plugin
from docs_config_table import gen_md_table_from_json_schema

from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.decorators import SourceCapability, SupportStatus
from datahub.ingestion.source.source_registry import source_registry

logger = logging.getLogger(__name__)


def get_snippet(long_string: str, max_length: int = 100) -> str:
    snippet = ""
    if len(long_string) > max_length:
        snippet = long_string[:max_length].strip() + "... "
    else:
        snippet = long_string.strip()

    snippet = snippet.replace("\n", " ")
    snippet = snippet.strip() + " "
    return snippet


def get_support_status_badge(support_status: SupportStatus) -> str:
    # Markdown image badges (shields.io) indicating the source's support status.
    if support_status == SupportStatus.CERTIFIED:
        return "![Certified](https://img.shields.io/badge/support%20status-certified-brightgreen)"
    if support_status == SupportStatus.INCUBATING:
        return "![Incubating](https://img.shields.io/badge/support%20status-incubating-blue)"
    if support_status == SupportStatus.TESTING:
        return "![Testing](https://img.shields.io/badge/support%20status-testing-lightgrey)"

    return ""


def get_capability_supported_badge(supported: bool) -> str:
    return "✅" if supported else "❌"


def get_capability_text(src_capability: SourceCapability) -> str:
    """
    Returns markdown format cell text for a capability, hyperlinked to capability feature page if known.
    """
    capability_docs_mapping: Dict[SourceCapability, str] = {
        SourceCapability.DELETION_DETECTION: "../../../../metadata-ingestion/docs/dev_guides/stateful.md#stale-entity-removal",
        SourceCapability.DOMAINS: "../../../domains.md",
        SourceCapability.PLATFORM_INSTANCE: "../../../platform-instances.md",
        SourceCapability.DATA_PROFILING: "../../../../metadata-ingestion/docs/dev_guides/sql_profiles.md",
        SourceCapability.CLASSIFICATION: "../../../../metadata-ingestion/docs/dev_guides/classification.md",
    }

    capability_doc = capability_docs_mapping.get(src_capability)
    return (
        src_capability.value
        if not capability_doc
        else f"[{src_capability.value}]({capability_doc})"
    )


def does_extra_exist(extra_name: str) -> bool:
    for key, value in metadata("acryl-datahub").items():
        if key == "Provides-Extra" and value == extra_name:
            return True
    return False
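

# requires("acryl-datahub") yields requirement strings, roughly of the form
# "psycopg2-binary>=2.7.4 ; extra == 'postgres'" (illustrative example only). Entries without
# an "extra ==" marker are base dependencies; the helper below returns only the dependencies
# that the given extra adds on top of those.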
def get_additional_deps_for_extra(extra_name: str) -> List[str]:
    all_requirements = requires("acryl-datahub") or []
    # filter for base dependencies
    base_deps = set([x.split(";")[0] for x in all_requirements if "extra ==" not in x])
    # filter for dependencies for this extra
    extra_deps = set(
        [x.split(";")[0] for x in all_requirements if f"extra == '{extra_name}'" in x]
    )
    # calculate additional deps that this extra adds
    delta_deps = extra_deps - base_deps
    return list(delta_deps)
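

# Given a doc file at orig_path that links to relative_path, compute the equivalent relative
# path from the directory of relocated_path (the destination the doc is being copied to).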
def relocate_path(orig_path: str, relative_path: str, relocated_path: str) -> str:
    newPath = os.path.join(os.path.dirname(orig_path), relative_path)
    assert os.path.exists(newPath)
    newRelativePath = os.path.relpath(newPath, os.path.dirname(relocated_path))
    return newRelativePath
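

# Rewrites relative links in a markdown file so that they still resolve after the file is
# copied to relocated_path: absolute URLs and anchors are left alone, links to other .md files
# are kept as-is, and image/asset links are re-pointed via relocate_path().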
def rewrite_markdown(file_contents: str, path: str, relocated_path: str) -> str:
    def new_url(original_url: str, file_path: str) -> str:
        if original_url.startswith(("http://", "https://", "#")):
            return original_url

        file_ext = pathlib.Path(original_url).suffix
        if file_ext.startswith(".md"):
            return original_url
        elif file_ext in [".png", ".svg", ".gif", ".pdf"]:
            return relocate_path(path, original_url, relocated_path)
        return original_url

    # Look for the [text](url) syntax. Note that this will also capture images.
    #
    # We do a little bit of parenthesis matching here to account for parens in URLs.
    # See https://stackoverflow.com/a/17759264 for explanation of the second capture group.
    new_content = re.sub(
        r"\[(.*?)\]\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)",
        lambda x: f"[{x.group(1)}]({new_url(x.group(2).strip(), path)})",  # type: ignore
        file_contents,
    )

    new_content = re.sub(
        # Also look for the [text]: url syntax.
        r"^\[(.+?)\]\s*:\s*(.+?)\s*$",
        lambda x: f"[{x.group(1)}]: {new_url(x.group(2), path)}",
        new_content,
    )
    return new_content
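

# Loads a single source plugin from the registry and collects everything the docs need:
# platform name/id, class name, source docstring, support status, capabilities, extra pip
# dependencies, and the config schema (also written to <out_dir>/config_schemas/).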
def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
    logger.debug(f"Loading {plugin_name}")
    class_or_exception = source_registry._ensure_not_lazy(plugin_name)
    if isinstance(class_or_exception, Exception):
        raise class_or_exception
    source_type = source_registry.get(plugin_name)
    logger.debug(f"Source class is {source_type}")

    if hasattr(source_type, "get_platform_name"):
        platform_name = source_type.get_platform_name()
    else:
        platform_name = plugin_name.title()  # we like platform names to be human readable

    platform_id = None
    if hasattr(source_type, "get_platform_id"):
        platform_id = source_type.get_platform_id()
    if platform_id is None:
        raise ValueError(f"Platform ID not found for {plugin_name}")

    plugin = Plugin(
        name=plugin_name,
        platform_id=platform_id,
        platform_name=platform_name,
        classname=".".join([source_type.__module__, source_type.__name__]),
    )

    if hasattr(source_type, "get_platform_doc_order"):
        platform_doc_order = source_type.get_platform_doc_order()
        plugin.doc_order = platform_doc_order

    plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
    if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
        plugin_file_name = plugin_file_name + "/__init__.py"
    else:
        plugin_file_name = plugin_file_name + ".py"
    if os.path.exists(plugin_file_name):
        plugin.filename = plugin_file_name
    else:
        logger.info(
            f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist"
        )

    if hasattr(source_type, "__doc__"):
        plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "")

    if hasattr(source_type, "get_support_status"):
        plugin.support_status = source_type.get_support_status()

    if hasattr(source_type, "get_capabilities"):
        capabilities = list(source_type.get_capabilities())
        capabilities.sort(key=lambda x: x.capability.value)
        plugin.capabilities = capabilities

    try:
        extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
        plugin.extra_deps = (
            get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
        )
    except Exception as e:
        logger.info(
            f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e
        )

    if hasattr(source_type, "get_config_class"):
        source_config_class: ConfigModel = source_type.get_config_class()

        plugin.config_json_schema = source_config_class.schema_json(indent=2)
        plugin.config_md = gen_md_table_from_json_schema(
            source_config_class.schema(), current_source=plugin_name
        )

        # Write the config json schema to the out_dir.
        config_dir = pathlib.Path(out_dir) / "config_schemas"
        config_dir.mkdir(parents=True, exist_ok=True)
        (config_dir / f"{plugin_name}_config.json").write_text(
            plugin.config_json_schema
        )

    return plugin


@dataclasses.dataclass
class PluginMetrics:
    discovered: int = 0
    loaded: int = 0
    generated: int = 0
    failed: int = 0


@dataclasses.dataclass
class PlatformMetrics:
    discovered: int = 0
    generated: int = 0
    warnings: List[str] = dataclasses.field(default_factory=list)
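

# Entry point for source docs generation: discover plugins from the source registry, load each
# one, group them by platform, fold in hand-written docs from --extra-docs, and write one
# markdown page per platform under <out-dir>/sources. Exits non-zero if any plugin fails to
# load.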
@click.command()
@click.option("--out-dir", type=str, required=True)
@click.option("--extra-docs", type=str, required=False)
@click.option("--source", type=str, required=False)
def generate(
    out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None
) -> None:  # noqa: C901
    plugin_metrics = PluginMetrics()
    platform_metrics = PlatformMetrics()

    platforms: Dict[str, Platform] = {}
    for plugin_name in sorted(source_registry.mapping.keys()):
        if source and source != plugin_name:
            continue

        if plugin_name in {
            "snowflake-summary",
            "snowflake-queries",
            "bigquery-queries",
        }:
            logger.info(f"Skipping {plugin_name} as it is on the deny list")
            continue

        plugin_metrics.discovered += 1
        try:
            plugin = load_plugin(plugin_name, out_dir=out_dir)
        except Exception as e:
            logger.error(
                f"Failed to load {plugin_name} due to exception {e}", exc_info=e
            )
            plugin_metrics.failed += 1
            continue
        else:
            plugin_metrics.loaded += 1

        # Add to the platform list if not already present.
        platforms.setdefault(
            plugin.platform_id,
            Platform(
                id=plugin.platform_id,
                name=plugin.platform_name,
            ),
        ).add_plugin(plugin_name=plugin.name, plugin=plugin)
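
    # Hand-written docs under --extra-docs follow a naming convention: README.md becomes the
    # platform-level intro, <plugin>_pre.md / <plugin>_post.md wrap a plugin's generated
    # section, any other <plugin>.md becomes that plugin's post docs, and <plugin>_recipe.yml
    # supplies the starter recipe.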
    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True):
            if m := re.search("/docs/sources/(.*)/(.*).md", path):
                platform_name = m.group(1).lower()  # TODO: rename this to platform_id
                file_name = m.group(2)
                destination_md: str = (
                    f"../docs/generated/ingestion/sources/{platform_name}.md"
                )

                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()

                final_markdown = rewrite_markdown(file_contents, path, destination_md)

                if file_name == "README":
                    # README goes as platform level docs
                    # all other docs are assumed to be plugin level
                    platforms[platform_name].custom_docs_pre = final_markdown
                elif "_" in file_name:
                    plugin_doc_parts = file_name.split("_")
                    if len(plugin_doc_parts) != 2:
                        raise ValueError(
                            f"{file_name} needs to be of the form <plugin>_pre.md or <plugin>_post.md"
                        )

                    plugin_name, suffix = plugin_doc_parts
                    if suffix == "pre":
                        platforms[platform_name].plugins[
                            plugin_name
                        ].custom_docs_pre = final_markdown
                    elif suffix == "post":
                        platforms[platform_name].plugins[
                            plugin_name
                        ].custom_docs_post = final_markdown
                    else:
                        raise ValueError(
                            f"{file_name} needs to be of the form <plugin>_pre.md or <plugin>_post.md"
                        )
                else:  # assume this is the platform post.
                    # TODO: Probably need better error checking here.
                    platforms[platform_name].plugins[
                        file_name
                    ].custom_docs_post = final_markdown
            elif yml_match := re.search("/docs/sources/(.*)/(.*)_recipe.yml", path):
                platform_name = yml_match.group(1).lower()
                plugin_name = yml_match.group(2)
                platforms[platform_name].plugins[
                    plugin_name
                ].starter_recipe = pathlib.Path(path).read_text()
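
    # Emit one markdown page per platform (sorted by display name for the docs sidebar),
    # covering every plugin module that belongs to that platform.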
    sources_dir = f"{out_dir}/sources"
    os.makedirs(sources_dir, exist_ok=True)

    # Sort platforms by platform name.
    platforms = dict(sorted(platforms.items(), key=lambda x: x[1].name.casefold()))

    i = 0
    for platform_id, platform in platforms.items():
        if source and platform_id != source:
            continue
        platform_metrics.discovered += 1

        platform_doc_file = f"{sources_dir}/{platform_id}.md"

        # if "name" not in platform_docs:
        #     # We seem to have discovered written docs that corresponds to a platform, but haven't found linkage to it from the source classes
        #     warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?"
        #     logger.error(warning_msg)
        #     metrics["source_platforms"]["warnings"].append(warning_msg)  # type: ignore
        #     continue

        with open(platform_doc_file, "w") as f:
            i += 1
            f.write(f"---\nsidebar_position: {i}\n---\n\n")
            f.write(
                "import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n"
            )
            f.write(f"# {platform.name}\n")

            if len(platform.plugins) > 1:
                # More than one plugin used to provide integration with this platform
                f.write(
                    f"There are {len(platform.plugins)} sources that provide integration with {platform.name}\n"
                )
                f.write("\n")
                f.write("<table>\n")
                f.write("<tr>")
                for col_header in ["Source Module", "Documentation"]:
                    f.write(f"<td>{col_header}</td>")
                f.write("</tr>")

                # Sort plugins in the platform.
                # It's a dict, so we need to recreate it.
                platform.plugins = dict(
                    sorted(
                        platform.plugins.items(),
                        key=lambda x: str(x[1].doc_order) if x[1].doc_order else x[0],
                    )
                )

                # f.write("| Source Module | Documentation |\n")
                # f.write("| ------ | ---- |\n")
                for plugin_name, plugin in platform.plugins.items():
                    f.write("<tr>\n")
                    f.write(f"<td>\n\n`{plugin_name}`\n\n</td>\n")
                    f.write(
                        f"<td>\n\n\n{plugin.source_docstring or ''} [Read more...](#module-{plugin_name})\n\n\n</td>\n"
                    )
                    f.write("</tr>\n")
                    # f.write(
                    #     f"| `{plugin}` | {get_snippet(platform_docs['plugins'][plugin]['source_doc'])}[Read more...](#module-{plugin}) |\n"
                    # )
                f.write("</table>\n\n")

            # insert platform level custom docs before plugin section
            f.write(platform.custom_docs_pre or "")

            # all_plugins = platform_docs["plugins"].keys()
            for plugin_name, plugin in platform.plugins.items():
                if len(platform.plugins) > 1:
                    # We only need to show this if there are multiple modules.
                    f.write(f"\n\n## Module `{plugin_name}`\n")

                if plugin.support_status != SupportStatus.UNKNOWN:
                    f.write(get_support_status_badge(plugin.support_status) + "\n\n")
                if plugin.capabilities and len(plugin.capabilities):
                    f.write("\n### Important Capabilities\n")
                    f.write("| Capability | Status | Notes |\n")
                    f.write("| ---------- | ------ | ----- |\n")
                    for cap_setting in plugin.capabilities:
                        f.write(
                            f"| {get_capability_text(cap_setting.capability)} | {get_capability_supported_badge(cap_setting.supported)} | {cap_setting.description} |\n"
                        )
                    f.write("\n")

                f.write(f"{plugin.source_docstring or ''}\n")

                # Insert custom pre section
                f.write(plugin.custom_docs_pre or "")

                f.write("\n### CLI based Ingestion\n")
                if plugin.extra_deps and len(plugin.extra_deps):
                    f.write("\n#### Install the Plugin\n")
                    if plugin.extra_deps != []:
                        f.write("```shell\n")
                        f.write(f"pip install 'acryl-datahub[{plugin_name}]'\n")
                        f.write("```\n")
                    else:
                        f.write(
                            f"The `{plugin_name}` source works out of the box with `acryl-datahub`.\n"
                        )

                if plugin.starter_recipe:
                    f.write("\n### Starter Recipe\n")
                    f.write(
                        "Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.\n\n\n"
                    )
                    f.write(
                        "For general pointers on writing and running a recipe, see our [main recipe guide](../../../../metadata-ingestion/README.md#recipes).\n"
                    )
                    f.write("```yaml\n")
                    f.write(plugin.starter_recipe)
                    f.write("\n```\n")

                if plugin.config_json_schema:
                    assert plugin.config_md is not None
                    f.write("\n### Config Details\n")
                    f.write(
                        """<Tabs>
<TabItem value="options" label="Options" default>\n\n"""
                    )
                    f.write(
                        "Note that a `.` is used to denote nested fields in the YAML recipe.\n\n"
                    )
                    # f.write(
                    #     "\n<details open>\n<summary>View All Configuration Options</summary>\n\n"
                    # )
                    f.write(plugin.config_md)
                    f.write("\n\n")
                    # f.write("\n</details>\n\n")
                    f.write(
                        f"""</TabItem>
<TabItem value="schema" label="Schema">
The [JSONSchema](https://json-schema.org/) for this configuration is inlined below.\n\n
```javascript
{plugin.config_json_schema}
```\n\n
</TabItem>
</Tabs>\n\n"""
                    )

                # insert custom plugin docs after config details
                f.write(plugin.custom_docs_post or "")

                if plugin.classname:
                    f.write("\n### Code Coordinates\n")
                    f.write(f"- Class Name: `{plugin.classname}`\n")
                    if plugin.filename:
                        f.write(
                            f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin.filename})\n\n"
                        )

                plugin_metrics.generated += 1

            # Using an h2 tag to prevent this from showing up in page's TOC sidebar.
            f.write("\n<h2>Questions</h2>\n\n")
            f.write(
                f"If you've got any questions on configuring ingestion for {platform.name}, feel free to ping us on [our Slack](https://datahub.com/slack).\n"
            )

        platform_metrics.generated += 1

    print("Ingestion Documentation Generation Complete")
    print("############################################")
    print(
        json.dumps(
            {
                "plugin_metrics": dataclasses.asdict(plugin_metrics),
                "platform_metrics": dataclasses.asdict(platform_metrics),
            },
            indent=2,
        )
    )
    print("############################################")
    if plugin_metrics.failed > 0:
        sys.exit(1)

    # Create Lineage doc
    generate_lineage_doc(platforms)
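

# Writes ../docs/generated/lineage/lineage-feature-guide.md: a static overview of DataHub
# lineage followed by a generated table summarizing table- and column-level lineage support
# for each source.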
def generate_lineage_doc(platforms: Dict[str, Platform]) -> None:
    source_dir = "../docs/generated/lineage"
    os.makedirs(source_dir, exist_ok=True)
    doc_file = f"{source_dir}/lineage-feature-guide.md"
    with open(doc_file, "w+") as f:
        f.write(
            "import FeatureAvailability from '@site/src/components/FeatureAvailability';\n\n"
        )
        f.write("# About DataHub Lineage\n\n")
        f.write("<FeatureAvailability/>\n")

        f.write(
            """
Data lineage is a **map that shows how data flows through your organization.** It details where your data originates, how it travels, and where it ultimately ends up.
This can happen within a single system (like data moving between Snowflake tables) or across various platforms.

With data lineage, you can
- Maintain Data Integrity
- Simplify and Refine Complex Relationships
- Perform [Lineage Impact Analysis](../../act-on-metadata/impact-analysis.md)
- [Propagate Metadata](https://blog.datahubproject.io/acryl-data-introduces-lineage-support-and-automated-propagation-of-governance-information-for-339c99536561) Across Lineage

## Viewing Lineage

You can view lineage under the **Lineage** tab or the **Lineage Visualization** screen.

<p align="center">
  <img width="80%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/lineage/lineage-tab.png"/>
</p>

By default, the UI shows the latest version of the lineage. The time picker can be used to filter out edges within the latest version to exclude those that were last updated outside of the time window. Selecting time windows in the patch will not show you historical lineages. It will only filter the view of the latest version of the lineage.

<p align="center">
  <img width="80%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/lineage/lineage-view.png"/>
</p>

In this example, data flows from Airflow/BigQuery to Snowflake tables, then to the Hive dataset, and ultimately to the features of Machine Learning Models.

:::tip The Lineage Tab is greyed out - why can't I click on it?
This means you have not yet ingested lineage metadata for that entity. Please ingest lineage to proceed.
:::

## Column Level Lineage Support

Column-level lineage **tracks changes and movements for each specific data column.** This approach is often contrasted with table-level lineage, which specifies lineage at the table level.
Below is how column-level lineage can be set with dbt and Postgres tables.

<p align="center">
  <img width="80%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/lineage/column-level-lineage.png"/>
</p>

## Adding Lineage

### Ingestion Source

If you're using an ingestion source that supports extraction of Lineage (e.g. **Table Lineage Capability**), then lineage information can be extracted automatically.
For detailed instructions, refer to the [source documentation](https://docs.datahub.com/integrations) for the source you are using.

### UI

As of `v0.9.5`, DataHub supports the manual editing of lineage between entities. Data experts are free to add or remove upstream and downstream lineage edges in both the Lineage Visualization screen as well as the Lineage tab on entity pages. Use this feature to supplement automatic lineage extraction or establish important entity relationships in sources that do not support automatic extraction. Editing lineage by hand is supported for Datasets, Charts, Dashboards, and Data Jobs.
Please refer to our [UI Guides on Lineage](../../features/feature-guides/ui-lineage.md) for more information.

:::caution Recommendation on UI-based lineage
Lineage added by hand and programmatically may conflict with one another to cause unwanted overwrites.
It is strongly recommended that lineage is edited manually in cases where lineage information is not also extracted in automated fashion, e.g. by running an ingestion source.
:::

### API

If you are not using a lineage-supporting ingestion source, you can programmatically emit lineage edges between entities via API.
Please refer to [API Guides on Lineage](../../api/tutorials/lineage.md) for more information.

## Lineage Support

DataHub supports **[automatic table- and column-level lineage detection](#automatic-lineage-extraction-support)** from BigQuery, Snowflake, dbt, Looker, PowerBI, and 20+ modern data tools.
For data tools with limited native lineage tracking, [**DataHub's SQL Parser**](../../lineage/sql_parsing.md) detects lineage with 97-99% accuracy, ensuring teams will have high quality lineage graphs across all corners of their data stack.

### Types of Lineage Connections

Types of lineage connections supported in DataHub, with example code for each, are as follows.

* [Dataset to Dataset](../../../metadata-ingestion/examples/library/add_lineage_dataset_to_dataset.py)
* [DataJob to DataFlow](../../../metadata-ingestion/examples/library/lineage_job_dataflow.py)
* [DataJob to Dataset](../../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py)
* [Chart to Dashboard](../../../metadata-ingestion/examples/library/lineage_chart_dashboard.py)
* [Chart to Dataset](../../../metadata-ingestion/examples/library/lineage_dataset_chart.py)

### Automatic Lineage Extraction Support

This is a summary of automatic lineage extraction support in our data sources. Please refer to the **Important Capabilities** table in the source documentation. Note that even if the source does not support automatic extraction, you can still add lineage manually using our API & SDKs.\n"""
        )

        f.write(
            "\n| Source | Table-Level Lineage | Column-Level Lineage | Related Configs |\n"
        )
        f.write("| ---------- | ------ | ----- |----- |\n")
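
        # One table row per plugin: ✅/❌ for the Table-Level and Column-level Lineage
        # capabilities, plus any config option whose name contains "lineage".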
        for platform_id, platform in platforms.items():
            for plugin in sorted(
                platform.plugins.values(),
                key=lambda x: str(x.doc_order) if x.doc_order else x.name,
            ):
                if len(platform.plugins) > 1:
                    # We only need to show this if there are multiple modules.
                    platform_plugin_name = f"{platform.name} `{plugin.name}`"
                else:
                    platform_plugin_name = platform.name

                # Initialize variables
                table_level_supported = "❌"
                column_level_supported = "❌"
                config_names = ""

                if plugin.capabilities and len(plugin.capabilities):
                    plugin_capabilities = plugin.capabilities
                    for cap_setting in plugin_capabilities:
                        capability_text = get_capability_text(cap_setting.capability)
                        capability_supported = get_capability_supported_badge(
                            cap_setting.supported
                        )

                        if (
                            capability_text == "Table-Level Lineage"
                            and capability_supported == "✅"
                        ):
                            table_level_supported = "✅"
                        if (
                            capability_text == "Column-level Lineage"
                            and capability_supported == "✅"
                        ):
                            column_level_supported = "✅"

                if not (table_level_supported == "❌" and column_level_supported == "❌"):
                    if plugin.config_json_schema:
                        config_properties = json.loads(plugin.config_json_schema).get(
                            "properties", {}
                        )
                        config_names = "<br />".join(
                            [
                                f"- {property_name}"
                                for property_name in config_properties
                                if "lineage" in property_name
                            ]
                        )

                lineage_not_applicable_sources = [
                    "azure-ad",
                    "csv",
                    "demo-data",
                    "dynamodb",
                    "iceberg",
                    "json-schema",
                    "ldap",
                    "openapi",
                    "pulsar",
                    "sqlalchemy",
                ]
                if platform_id not in lineage_not_applicable_sources:
                    f.write(
                        f"| [{platform_plugin_name}](../../generated/ingestion/sources/{platform_id}.md) | {table_level_supported} | {column_level_supported} | {config_names} |\n"
                    )

        f.write(
            """
### SQL Parser Lineage Extraction

If you're using a different database system for which we don't support column-level lineage out of the box, but you do have a database query log available,
we have a SQL queries connector that generates column-level lineage and detailed table usage statistics from the query log.

If that does not suit your needs, you can use the new `DataHubGraph.parse_sql_lineage()` method in our SDK. (See the source code [here](https://docs.datahub.com/docs/python-sdk/clients/graph-client))

For more information, refer to the [Extracting Column-Level Lineage from SQL](https://blog.datahubproject.io/extracting-column-level-lineage-from-sql-779b8ce17567) blog post.

:::tip Our Roadmap
We're actively working on expanding lineage support for new data sources.
Visit our [Official Roadmap](https://feature-requests.datahubproject.io/roadmap) for upcoming updates!
:::

## References

- [DataHub Basics: Lineage 101](https://www.youtube.com/watch?v=rONGpsndzRw&t=1s)
- [DataHub November 2022 Town Hall](https://www.youtube.com/watch?v=BlCLhG8lGoY&t=1s) - Including Manual Lineage Demo
- [Data in Context: Lineage Explorer in DataHub](https://blog.datahubproject.io/data-in-context-lineage-explorer-in-datahub-a53a9a476dc4)
- [Harnessing the Power of Data Lineage with DataHub](https://blog.datahubproject.io/harnessing-the-power-of-data-lineage-with-datahub-ad086358dec4)
- [Data Lineage: What It Is And Why It Matters](https://blog.datahubproject.io/data-lineage-what-it-is-and-why-it-matters-1a8d9846f0bd)
"""
        )

    print("Lineage Documentation Generation Complete")


if __name__ == "__main__":
    logger.setLevel("INFO")
    generate()