import glob
import json
import logging
import os
import re
import sys
import textwrap
from importlib.metadata import metadata, requires
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import click
from pydantic import BaseModel, Field
from pydantic.dataclasses import dataclass

from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.decorators import (
    CapabilitySetting,
    SourceCapability,
    SupportStatus,
)
from datahub.ingestion.source.source_registry import source_registry
from datahub.metadata.schema_classes import SchemaFieldClass

logger = logging.getLogger(__name__)


class FieldRow(BaseModel):
    path: str
    parent: Optional[str]
    type_name: str
    required: bool
    default: str
    description: str
    inner_fields: List["FieldRow"] = Field(default_factory=list)
    discriminated_type: Optional[str] = None

    class Component(BaseModel):
        type: str
        field_name: Optional[str]

    # matches any [...] style section inside a field path
    _V2_FIELD_PATH_TOKEN_MATCHER = r"\[[\w.]*[=]*[\w\(\-\ \_\).]*\][\.]*"
    # matches a .?[...] style section inside a field path anchored to the beginning
    _V2_FIELD_PATH_TOKEN_MATCHER_PREFIX = rf"^[\.]*{_V2_FIELD_PATH_TOKEN_MATCHER}"
    _V2_FIELD_PATH_FIELD_NAME_MATCHER = r"^\w+"

    @staticmethod
    def map_field_path_to_components(field_path: str) -> List[Component]:
        m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path)
        v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path)

        components: List[FieldRow.Component] = []
        while m or v:
            token = m.group() if m else v.group()  # type: ignore
            if v:
                if components:
                    if components[-1].field_name is None:
                        components[-1].field_name = token
                    else:
                        components.append(
                            FieldRow.Component(type="non_map_type", field_name=token)
                        )
                else:
                    components.append(
                        FieldRow.Component(type="non_map_type", field_name=token)
                    )

            if m:
                if token.startswith("[version="):
                    pass
                elif "[type=" in token:
                    type_match = re.match(r"[\.]*\[type=(.*)\]", token)
                    if type_match:
                        type_string = type_match.group(1)
                        if components and components[-1].type == "map":
                            if components[-1].field_name is None:
                                pass
                            else:
                                new_component = FieldRow.Component(
                                    type="map_key", field_name="`key`"
                                )
                                components.append(new_component)
                                new_component = FieldRow.Component(
                                    type=type_string, field_name=None
                                )
                                components.append(new_component)
                        if type_string == "map":
                            new_component = FieldRow.Component(
                                type=type_string, field_name=None
                            )
                            components.append(new_component)

            field_path = field_path[m.span()[1] :] if m else field_path[v.span()[1] :]  # type: ignore
            m = re.match(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER_PREFIX, field_path)
            v = re.match(FieldRow._V2_FIELD_PATH_FIELD_NAME_MATCHER, field_path)

        return components

    @staticmethod
    def field_path_to_components(field_path: str) -> List[str]:
        """
        Inverts the field_path v2 format to get the canonical field path
        [version=2.0].[type=x].foo.[type=string(format=uri)].bar => ["foo", "bar"]
        """
        if "type=map" not in field_path:
            return re.sub(FieldRow._V2_FIELD_PATH_TOKEN_MATCHER, "", field_path).split(
                "."
            )
        else:
            # fields with maps in them need special handling to insert the `key` fragment
            return [
                c.field_name
                for c in FieldRow.map_field_path_to_components(field_path)
                if c.field_name
            ]
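
    # Illustrative examples (not part of the original script), assuming v2 field
    # paths like the ones emitted by the schema translator:
    #   field_path_to_components("[version=2.0].[type=x].foo.[type=string].bar")
    #   # -> ["foo", "bar"]
    #   field_path_to_components("[version=2.0].[type=R].[type=map].my_map.[type=string]")
    #   # -> ["my_map", "`key`"]  (a "`key`" fragment is inserted for map fields)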

    @classmethod
    def from_schema_field(cls, schema_field: SchemaFieldClass) -> "FieldRow":
        path_components = FieldRow.field_path_to_components(schema_field.fieldPath)

        parent = path_components[-2] if len(path_components) >= 2 else None
        if parent == "`key`":
            # the real parent node is one index above
            parent = path_components[-3]
        json_props = (
            json.loads(schema_field.jsonProps) if schema_field.jsonProps else {}
        )
        default_value = str(json_props.get("default")) or ""

        return FieldRow(
            path=".".join(path_components),
            parent=parent,
            type_name=str(schema_field.nativeDataType),
            required=not schema_field.nullable,
            default=default_value,
            description=schema_field.description,
            inner_fields=[],
            discriminated_type=schema_field.nativeDataType,
        )

    def get_checkbox(self) -> str:
        if self.required:
            if not self.parent:  # None and empty string both count
                return "[✅]"
            else:
                return f"[❓ (required if {self.parent} is set)]"
        else:
            return ""

    def to_md_line(self) -> str:
        if self.inner_fields:
            if len(self.inner_fields) == 1:
                type_name = self.inner_fields[0].type_name or self.type_name
            else:
                type_name = "UnionType (See notes for variants)"
        else:
            type_name = self.type_name

        description = self.description.replace(
            "\n", " "
        )  # descriptions with newlines in them break markdown rendering

        if self.inner_fields and len(self.inner_fields) > 1:
            # to deal with unions that have essentially the same simple field path,
            # we include the supported types in the Notes section
            # Once we move to a better layout, we can expand this section out
            notes = "One of " + ", ".join(
                [x.type_name for x in self.inner_fields if x.discriminated_type]
            )
        else:
            notes = ""

        md_line = f"| {self.path} {self.get_checkbox()} | {type_name} | {description} | {self.default} | {notes} |\n"
        return md_line


class FieldHeader(FieldRow):
    def to_md_line(self) -> str:
        return "\n".join(
            [
                "| Field [Required] | Type | Description | Default | Notes |",
                "| --- | --- | --- | -- | -- |",
                "",
            ]
        )

    def __init__(self):
        pass


def get_prefixed_name(field_prefix: Optional[str], field_name: Optional[str]) -> str:
    assert (
        field_prefix or field_name
    ), "One of field_prefix or field_name should be present"
    return (
        f"{field_prefix}.{field_name}"  # type: ignore
        if field_prefix and field_name
        else field_name
        if not field_prefix
        else field_prefix
    )
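

# Illustrative examples (not part of the original script):
#   get_prefixed_name("parent", "child")  -> "parent.child"
#   get_prefixed_name(None, "child")      -> "child"
#   get_prefixed_name("parent", None)     -> "parent"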


def custom_comparator(path: str) -> str:
    """
    Projects a string onto a separate space.
    Low-prio strings will start with Z, everything else will start with A.
    Number of field paths would add the second set of letters: 00 - 99
    """
    opt1 = path
    prio_value = priority_value(opt1)
    projection = f"{prio_value}"
    projection = f"{projection}{opt1}"
    return projection
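

# Illustrative examples (not part of the original script), based on the
# priority_value() mapping defined below:
#   custom_comparator("env")                -> "Xenv"
#   custom_comparator("profiling.enabled")  -> "Yprofiling.enabled"
#   custom_comparator("host_port")          -> "Ahost_port"
# When used as a sort key, ordinary fields therefore sort ahead of the
# env / profiling / stateful_ingestion fields.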


class FieldTree:
    """
    A helper class that re-constructs the tree hierarchy of schema fields
    to help sort fields by importance while keeping nesting intact
    """

    def __init__(self, field: Optional[FieldRow] = None):
        self.field = field
        self.fields: Dict[str, "FieldTree"] = {}

    def add_field(self, row: FieldRow, path: Optional[str] = None) -> "FieldTree":
        # logger.warn(f"Add field: path:{path}, row:{row}")
        # breakpoint()
        if self.field and self.field.path == row.path:
            # breakpoint()
            # we have an incoming field with the same path as us, this is probably a union variant
            # attach to existing field
            self.field.inner_fields.append(row)
        else:
            path = path if path is not None else row.path
            top_level_field = path.split(".")[0]
            if top_level_field == "":
                breakpoint()
            if top_level_field in self.fields:
                self.fields[top_level_field].add_field(
                    row, ".".join(path.split(".")[1:])
                )
            else:
                self.fields[top_level_field] = FieldTree(field=row)
        # logger.warn(f"{self}")
        return self

    def sort(self):
        # Required fields before optionals
        required_fields = {
            k: v for k, v in self.fields.items() if v.field and v.field.required
        }
        optional_fields = {
            k: v for k, v in self.fields.items() if v.field and not v.field.required
        }

        self.sorted_fields = []
        for field_map in [required_fields, optional_fields]:
            # Top-level fields before fields with nesting
            self.sorted_fields.extend(
                sorted(
                    [f for f, val in field_map.items() if val.fields == {}],
                    key=custom_comparator,
                )
            )
            self.sorted_fields.extend(
                sorted(
                    [f for f, val in field_map.items() if val.fields != {}],
                    key=custom_comparator,
                )
            )

        for field_tree in self.fields.values():
            field_tree.sort()

    def get_fields(self) -> Iterable[FieldRow]:
        if self.field:
            yield self.field
        for key in self.sorted_fields:
            yield from self.fields[key].get_fields()

    def __repr__(self) -> str:
        result = {}
        if self.field:
            result["_self"] = json.loads(json.dumps(self.field.dict()))
        for f in self.fields:
            result[f] = json.loads(str(self.fields[f]))
        return json.dumps(result, indent=2)
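

# Minimal usage sketch (illustrative, not part of the original script); the
# FieldRow values are hypothetical:
#   tree = FieldTree(field=None)
#   tree.add_field(FieldRow(path="host_port", parent=None, type_name="string",
#                           required=True, default="", description="..."))
#   tree.add_field(FieldRow(path="profiling.enabled", parent="profiling",
#                           type_name="boolean", required=False, default="False",
#                           description="..."))
#   tree.sort()
#   rows = list(tree.get_fields())  # required / top-level fields come first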


def priority_value(path: str) -> str:
    # A map of low value tokens to their relative importance
    low_value_token_map = {"env": "X", "profiling": "Y", "stateful_ingestion": "Z"}
    tokens = path.split(".")
    for low_value_token in low_value_token_map:
        if low_value_token in tokens:
            return low_value_token_map[low_value_token]

    # everything else high-prio
    return "A"


def gen_md_table_from_struct(schema_dict: Dict[str, Any]) -> List[str]:
    from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator

    # we don't want default field values to be injected into the description of the field
    JsonSchemaTranslator._INJECT_DEFAULTS_INTO_DESCRIPTION = False
    schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema_dict))
    result: List[str] = [FieldHeader().to_md_line()]

    field_tree = FieldTree(field=None)
    for field in schema_fields:
        row: FieldRow = FieldRow.from_schema_field(field)
        field_tree.add_field(row)

    field_tree.sort()

    for row in field_tree.get_fields():
        result.append(row.to_md_line())

    return result
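

# Minimal usage sketch (illustrative, not part of the original script);
# `MySourceConfig` stands in for any pydantic ConfigModel:
#   table_lines = gen_md_table_from_struct(MySourceConfig.schema())
#   markdown_table = "".join(table_lines)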


def get_snippet(long_string: str, max_length: int = 100) -> str:
    snippet = ""
    if len(long_string) > max_length:
        snippet = long_string[:max_length].strip() + "... "
    else:
        snippet = long_string.strip()

    snippet = snippet.replace("\n", " ")
    snippet = snippet.strip() + " "
    return snippet


def get_support_status_badge(support_status: SupportStatus) -> str:
    if support_status == SupportStatus.CERTIFIED:
        return ""
    if support_status == SupportStatus.INCUBATING:
        return ""
    if support_status == SupportStatus.TESTING:
        return ""

    return ""


def get_capability_supported_badge(supported: bool) -> str:
    return "✅" if supported else "❌"


def get_capability_text(src_capability: SourceCapability) -> str:
    """
    Returns markdown format cell text for a capability, hyperlinked to capability feature page if known
    """
    capability_docs_mapping: Dict[SourceCapability, str] = {
        SourceCapability.DELETION_DETECTION: "../../../../metadata-ingestion/docs/dev_guides/stateful.md#stale-entity-removal",
        SourceCapability.DOMAINS: "../../../domains.md",
        SourceCapability.PLATFORM_INSTANCE: "../../../platform-instances.md",
        SourceCapability.DATA_PROFILING: "../../../../metadata-ingestion/docs/dev_guides/sql_profiles.md",
    }

    capability_doc = capability_docs_mapping.get(src_capability)
    return (
        src_capability.value
        if not capability_doc
        else f"[{src_capability.value}]({capability_doc})"
    )


def create_or_update(
    something: Dict[Any, Any], path: List[str], value: Any
) -> Dict[Any, Any]:
    dict_under_operation = something
    for p in path[:-1]:
        if p not in dict_under_operation:
            dict_under_operation[p] = {}
        dict_under_operation = dict_under_operation[p]

    dict_under_operation[path[-1]] = value
    return something
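

# Illustrative example (not part of the original script):
#   create_or_update({}, ["mysql", "plugins", "mysql", "recipe"], "source: ...")
#   # -> {"mysql": {"plugins": {"mysql": {"recipe": "source: ..."}}}}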


def does_extra_exist(extra_name: str) -> bool:
    for key, value in metadata("acryl-datahub").items():
        if key == "Provides-Extra" and value == extra_name:
            return True
    return False


def get_additional_deps_for_extra(extra_name: str) -> List[str]:
    all_requirements = requires("acryl-datahub") or []
    # filter for base dependencies
    base_deps = set([x.split(";")[0] for x in all_requirements if "extra ==" not in x])
    # filter for dependencies for this extra
    extra_deps = set(
        [x.split(";")[0] for x in all_requirements if f"extra == '{extra_name}'" in x]
    )
    # calculate additional deps that this extra adds
    delta_deps = extra_deps - base_deps
    return list(delta_deps)
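

# Illustrative sketch (not part of the original script); the exact result depends
# on the installed acryl-datahub distribution:
#   get_additional_deps_for_extra("bigquery")
#   # -> the requirements the "bigquery" extra adds on top of the base install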


def relocate_path(orig_path: str, relative_path: str, relocated_path: str) -> str:
    newPath = os.path.join(os.path.dirname(orig_path), relative_path)
    assert os.path.exists(newPath)

    newRelativePath = os.path.relpath(newPath, os.path.dirname(relocated_path))
    return newRelativePath
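

# Illustrative sketch (not part of the original script); the paths are hypothetical
# and the referenced file must exist on disk for the assert to pass:
#   relocate_path(
#       "metadata-ingestion/docs/sources/mysql/mysql.md",  # doc containing the link
#       "./mysql_architecture.png",                        # link target, relative to the doc
#       "docs/generated/ingestion/sources/mysql.md",       # where the doc is relocated to
#   )
#   # -> the image path re-expressed relative to the relocated doc's directory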


def rewrite_markdown(file_contents: str, path: str, relocated_path: str) -> str:
    def new_url(original_url: str, file_path: str) -> str:
        if original_url.startswith(("http://", "https://", "#")):
            return original_url

        import pathlib

        file_ext = pathlib.Path(original_url).suffix
        if file_ext.startswith(".md"):
            return original_url
        elif file_ext in [".png", ".svg", ".gif", ".pdf"]:
            new_url = relocate_path(path, original_url, relocated_path)
            return new_url
        return original_url

    # Look for the [text](url) syntax. Note that this will also capture images.
    #
    # We do a little bit of parenthesis matching here to account for parens in URLs.
    # See https://stackoverflow.com/a/17759264 for explanation of the second capture group.
    new_content = re.sub(
        r"\[(.*?)\]\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)",
        lambda x: f"[{x.group(1)}]({new_url(x.group(2).strip(), path)})",  # type: ignore
        file_contents,
    )

    new_content = re.sub(
        # Also look for the [text]: url syntax.
        r"^\[(.+?)\]\s*:\s*(.+?)\s*$",
        lambda x: f"[{x.group(1)}]: {new_url(x.group(2), path)}",
        new_content,
    )

    return new_content
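

# Illustrative note (not part of the original script): relative image links in
# "[text](url)" or "[text]: url" form are re-pointed at the relocated document,
# while http(s), anchor, and .md links are left untouched. A line such as
# "![arch](./mysql_architecture.png)" in a source doc would therefore be rewritten
# to reference the image relative to the generated docs directory.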


@click.command()
@click.option("--out-dir", type=str, required=True)
@click.option("--extra-docs", type=str, required=False)
@click.option("--source", type=str, required=False)
def generate(
    out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None
) -> None:  # noqa: C901
    source_documentation: Dict[str, Any] = {}
    metrics = {}
    metrics["source_platforms"] = {"discovered": 0, "generated": 0, "warnings": []}
    metrics["plugins"] = {"discovered": 0, "generated": 0, "failed": 0}

    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*[.md|.yaml|.yml]", recursive=True):
            m = re.search("/docs/sources/(.*)/(.*).md", path)
            if m:
                platform_name = m.group(1).lower()
                file_name = m.group(2)
                destination_md: str = (
                    f"../docs/generated/ingestion/sources/{platform_name}.md"
                )

                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                    final_markdown = rewrite_markdown(
                        file_contents, path, destination_md
                    )

                    if file_name == "README":
                        # README goes as platform level docs
                        # all other docs are assumed to be plugin level
                        create_or_update(
                            source_documentation,
                            [platform_name, "custom_docs"],
                            final_markdown,
                        )
                    else:
                        if "_" in file_name:
                            plugin_doc_parts = file_name.split("_")
                            if len(plugin_doc_parts) != 2 or plugin_doc_parts[
                                1
                            ] not in ["pre", "post"]:
                                raise Exception(
                                    f"{file_name} needs to be of the form <plugin>_pre.md or <plugin>_post.md"
                                )

                            docs_key_name = f"custom_docs_{plugin_doc_parts[1]}"
                            create_or_update(
                                source_documentation,
                                [
                                    platform_name,
                                    "plugins",
                                    plugin_doc_parts[0],
                                    docs_key_name,
                                ],
                                final_markdown,
                            )
                        else:
                            create_or_update(
                                source_documentation,
                                [
                                    platform_name,
                                    "plugins",
                                    file_name,
                                    "custom_docs_post",
                                ],
                                final_markdown,
                            )
            else:
                yml_match = re.search("/docs/sources/(.*)/(.*)_recipe.yml", path)
                if yml_match:
                    platform_name = yml_match.group(1).lower()
                    plugin_name = yml_match.group(2)
                    with open(path, "r") as doc_file:
                        file_contents = doc_file.read()
                        create_or_update(
                            source_documentation,
                            [platform_name, "plugins", plugin_name, "recipe"],
                            file_contents,
                        )

    for plugin_name in sorted(source_registry.mapping.keys()):
        if source and source != plugin_name:
            continue

        metrics["plugins"]["discovered"] = metrics["plugins"]["discovered"] + 1  # type: ignore
        # We want to attempt to load all plugins before printing a summary.
        source_type = None
        try:
            # output = subprocess.check_output(
            #     ["/bin/bash", "-c", f"pip install -e '.[{key}]'"]
            # )
            class_or_exception = source_registry._ensure_not_lazy(plugin_name)
            if isinstance(class_or_exception, Exception):
                raise class_or_exception
            logger.debug(f"Processing {plugin_name}")
            source_type = source_registry.get(plugin_name)
            logger.debug(f"Source class is {source_type}")
            extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
            extra_deps = (
                get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
            )
        except Exception as e:
            logger.warning(
                f"Failed to process {plugin_name} due to exception {e}", exc_info=e
            )
            metrics["plugins"]["failed"] = metrics["plugins"].get("failed", 0) + 1  # type: ignore

        if source_type and hasattr(source_type, "get_config_class"):
            try:
                source_config_class: ConfigModel = source_type.get_config_class()
                support_status = SupportStatus.UNKNOWN
                capabilities = []
                if hasattr(source_type, "__doc__"):
                    source_doc = textwrap.dedent(source_type.__doc__ or "")
                if hasattr(source_type, "get_platform_name"):
                    platform_name = source_type.get_platform_name()
                else:
                    platform_name = (
                        plugin_name.title()
                    )  # we like platform names to be human readable
                if hasattr(source_type, "get_platform_id"):
                    platform_id = source_type.get_platform_id()
                if hasattr(source_type, "get_platform_doc_order"):
                    platform_doc_order = source_type.get_platform_doc_order()
                    create_or_update(
                        source_documentation,
                        [platform_id, "plugins", plugin_name, "doc_order"],
                        platform_doc_order,
                    )

                source_documentation[platform_id] = (
                    source_documentation.get(platform_id) or {}
                )

                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "classname"],
                    ".".join([source_type.__module__, source_type.__name__]),
                )
                plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
                if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
                    plugin_file_name = plugin_file_name + "/__init__.py"
                else:
                    plugin_file_name = plugin_file_name + ".py"
                if os.path.exists(plugin_file_name):
                    create_or_update(
                        source_documentation,
                        [platform_id, "plugins", plugin_name, "filename"],
                        plugin_file_name,
                    )
                else:
                    logger.info(
                        f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}"
                    )

                if hasattr(source_type, "get_support_status"):
                    support_status = source_type.get_support_status()

                if hasattr(source_type, "get_capabilities"):
                    capabilities = list(source_type.get_capabilities())
                    capabilities.sort(key=lambda x: x.capability.value)
                    create_or_update(
                        source_documentation,
                        [platform_id, "plugins", plugin_name, "capabilities"],
                        capabilities,
                    )

                create_or_update(
                    source_documentation, [platform_id, "name"], platform_name
                )
                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "extra_deps"],
                    extra_deps,
                )

                config_dir = f"{out_dir}/config_schemas"
                os.makedirs(config_dir, exist_ok=True)
                with open(f"{config_dir}/{plugin_name}_config.json", "w") as f:
                    f.write(source_config_class.schema_json(indent=2))

                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "config_schema"],
                    source_config_class.schema_json(indent=2) or "",
                )

                table_md = gen_md_table_from_struct(source_config_class.schema())
                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "source_doc"],
                    source_doc or "",
                )
                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "config"],
                    table_md,
                )
                create_or_update(
                    source_documentation,
                    [platform_id, "plugins", plugin_name, "support_status"],
                    support_status,
                )

            except Exception as e:
                raise e

    sources_dir = f"{out_dir}/sources"
    os.makedirs(sources_dir, exist_ok=True)

    i = 0
    for platform_id, platform_docs in sorted(
        source_documentation.items(),
        key=lambda x: (x[1]["name"].casefold(), x[1]["name"])
        if "name" in x[1]
        else (x[0].casefold(), x[0]),
    ):
        if source and platform_id != source:
            continue
        metrics["source_platforms"]["discovered"] = (
            metrics["source_platforms"]["discovered"] + 1  # type: ignore
        )
        platform_doc_file = f"{sources_dir}/{platform_id}.md"
        if "name" not in platform_docs:
            # We seem to have discovered written docs that correspond to a platform, but haven't found linkage to it from the source classes
            warning_msg = f"Failed to find source classes for platform {platform_id}. Did you remember to annotate your source class with @platform_name({platform_id})?"
            logger.error(warning_msg)
            metrics["source_platforms"]["warnings"].append(warning_msg)  # type: ignore
            continue

        with open(platform_doc_file, "w") as f:
            i += 1
            f.write(f"---\nsidebar_position: {i}\n---\n\n")
            f.write(
                "import Tabs from '@theme/Tabs';\nimport TabItem from '@theme/TabItem';\n\n"
            )
            f.write(f"# {platform_docs['name']}\n")

            if len(platform_docs["plugins"].keys()) > 1:
                # More than one plugin used to provide integration with this platform
                f.write(
                    f"There are {len(platform_docs['plugins'].keys())} sources that provide integration with {platform_docs['name']}\n"
                )
                f.write("\n")
                f.write("<table>\n")
                f.write("<tr>")
                for col_header in ["Source Module", "Documentation"]:
                    f.write(f"<td>{col_header}</td>")
                f.write("</tr>")
                # f.write("| Source Module | Documentation |\n")
                # f.write("| ------ | ---- |\n")
                for plugin, plugin_docs in sorted(
                    platform_docs["plugins"].items(),
                    key=lambda x: str(x[1].get("doc_order"))
                    if x[1].get("doc_order")
                    else x[0],
                ):
                    f.write("<tr>\n")
                    f.write(f"<td>\n\n`{plugin}`\n\n</td>\n")
                    f.write(
                        f"<td>\n\n\n{platform_docs['plugins'][plugin].get('source_doc') or ''} [Read more...](#module-{plugin})\n\n\n</td>\n"
                    )
                    f.write("</tr>\n")
                    # f.write(
                    #     f"| `{plugin}` | {get_snippet(platform_docs['plugins'][plugin]['source_doc'])}[Read more...](#module-{plugin}) |\n"
                    # )
                f.write("</table>\n\n")

            # insert platform level custom docs before plugin section
            f.write(platform_docs.get("custom_docs") or "")
            # all_plugins = platform_docs["plugins"].keys()
            for plugin, plugin_docs in sorted(
                platform_docs["plugins"].items(),
                key=lambda x: str(x[1].get("doc_order"))
                if x[1].get("doc_order")
                else x[0],
            ):
                f.write(f"\n\n## Module `{plugin}`\n")
                if "support_status" in plugin_docs:
                    f.write(
                        get_support_status_badge(plugin_docs["support_status"]) + "\n\n"
                    )
                if "capabilities" in plugin_docs and len(plugin_docs["capabilities"]):
                    f.write("\n### Important Capabilities\n")
                    f.write("| Capability | Status | Notes |\n")
                    f.write("| ---------- | ------ | ----- |\n")
                    plugin_capabilities: List[CapabilitySetting] = plugin_docs[
                        "capabilities"
                    ]
                    for cap_setting in plugin_capabilities:
                        f.write(
                            f"| {get_capability_text(cap_setting.capability)} | {get_capability_supported_badge(cap_setting.supported)} | {cap_setting.description} |\n"
                        )
                    f.write("\n")

                f.write(f"{plugin_docs.get('source_doc') or ''}\n")
                # Insert custom pre section
                f.write(plugin_docs.get("custom_docs_pre", ""))
                f.write("\n### CLI based Ingestion\n")
                if "extra_deps" in plugin_docs:
                    f.write("\n#### Install the Plugin\n")
                    if plugin_docs["extra_deps"] != []:
                        f.write("```shell\n")
                        f.write(f"pip install 'acryl-datahub[{plugin}]'\n")
                        f.write("```\n")
                    else:
                        f.write(
                            f"The `{plugin}` source works out of the box with `acryl-datahub`.\n"
                        )
                if "recipe" in plugin_docs:
                    f.write("\n### Starter Recipe\n")
                    f.write(
                        "Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration options.\n\n\n"
                    )
                    f.write(
                        "For general pointers on writing and running a recipe, see our [main recipe guide](../../../../metadata-ingestion/README.md#recipes).\n"
                    )
                    f.write("```yaml\n")
                    f.write(plugin_docs["recipe"])
                    f.write("\n```\n")
                if "config" in plugin_docs:
                    f.write("\n### Config Details\n")
                    f.write(
                        """<Tabs>
<TabItem value="options" label="Options" default>\n\n"""
                    )
                    f.write(
                        "Note that a `.` is used to denote nested fields in the YAML recipe.\n\n"
                    )
                    f.write(
                        "\n<details open>\n<summary>View All Configuration Options</summary>\n\n"
                    )
                    for doc in plugin_docs["config"]:
                        f.write(doc)
                    f.write("\n</details>\n\n")
                    f.write(
                        f"""</TabItem>
<TabItem value="schema" label="Schema">
The [JSONSchema](https://json-schema.org/) for this configuration is inlined below.\n\n
```javascript
{plugin_docs['config_schema']}
```\n\n
</TabItem>
</Tabs>\n\n"""
                    )
                # insert custom plugin docs after config details
                f.write(plugin_docs.get("custom_docs_post", ""))
                if "classname" in plugin_docs:
                    f.write("\n### Code Coordinates\n")
                    f.write(f"- Class Name: `{plugin_docs['classname']}`\n")
                    if "filename" in plugin_docs:
                        f.write(
                            f"- Browse on [GitHub](../../../../metadata-ingestion/{plugin_docs['filename']})\n\n"
                        )
                metrics["plugins"]["generated"] = metrics["plugins"]["generated"] + 1  # type: ignore

            f.write("\n## Questions\n")
            f.write(
                f"If you've got any questions on configuring ingestion for {platform_docs.get('name', platform_id)}, feel free to ping us on [our Slack](https://slack.datahubproject.io)\n"
            )
            metrics["source_platforms"]["generated"] = (
                metrics["source_platforms"]["generated"] + 1  # type: ignore
            )

    print("Ingestion Documentation Generation Complete")
    print("############################################")
    print(json.dumps(metrics, indent=2))
    print("############################################")
    if metrics["plugins"].get("failed", 0) > 0:  # type: ignore
        sys.exit(1)


if __name__ == "__main__":
    logger.setLevel("INFO")
    generate()