feat(ingest): generate capability summary (#13881)

This commit is contained in:
Aseem Bansal 2025-06-30 15:16:08 +05:30 committed by GitHub
parent 03309b7ffa
commit 7345af898d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 3364 additions and 93 deletions

View File

@@ -7,3 +7,9 @@ repos:
         language: system
         files: ^smoke-test/tests/cypress/.*\.tsx$
         pass_filenames: false
+      - id: update-capability-summary
+        name: update-capability-summary
+        entry: ./gradlew :metadata-ingestion:capabilitySummary
+        language: system
+        files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
+        pass_filenames: false

View File

@@ -69,9 +69,21 @@ jobs:
         run: ./metadata-ingestion/scripts/install_deps.sh
       - name: Install package
         run: ./gradlew :metadata-ingestion:installPackageOnly
-      - name: Run lint alongwith testQuick
+      - name: Check lint and capability_summary.json being up-to-date
        if: ${{ matrix.command == 'testQuick' }}
-        run: ./gradlew :metadata-ingestion:lint
+        run: |
+          ./gradlew :metadata-ingestion:lint
+          echo "Lint passed. Checking if capability_summary.json is up-to-date."
+          ./gradlew :metadata-ingestion:capabilitySummary
+          # Check if capability summary file has changed
+          if git diff --quiet metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json; then
+            echo "✅ Capability summary file is unchanged"
+          else
+            echo "❌ Capability summary file has changed. Please commit the updated file."
+            echo "Changed lines:"
+            git diff metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json
+            exit 1
+          fi
       - name: Run metadata-ingestion tests
         run: ./gradlew :metadata-ingestion:${{ matrix.command }}
       - name: Debug info

View File

@@ -1,4 +1,4 @@
-# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-04-21 19:41:02 UTC
+# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-06-27 12:14:33 UTC
 # Do not edit this file directly. Run the script to regenerate.
 # Add additional hooks in .github/scripts/pre-commit-override.yaml
 repos:
@@ -493,3 +493,10 @@ repos:
         language: system
         files: ^smoke-test/tests/cypress/.*\.tsx$
         pass_filenames: false
+
+      - id: update-capability-summary
+        name: update-capability-summary
+        entry: ./gradlew :metadata-ingestion:capabilitySummary
+        language: system
+        files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
+        pass_filenames: false

View File

@@ -31,3 +31,4 @@ yarn-error.log*
 .yarn-test-sentinel
 .yarn-lint-sentinel
+public/assets/ingestion/**

View File

@@ -146,6 +146,24 @@ task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
   outputs.dir('dist')
 }
 
+task copyCapabilitySummary(type: Copy) {
+  def sourceFile = file('../metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json')
+
+  if (!sourceFile.exists()) {
+    // We don't want frontend devs to have to run this task
+    // But still keeping it here to make sure the dependency is there properly in gradle
+    dependsOn ':metadata-ingestion:capabilitySummary'
+  }
+
+  from sourceFile
+  into 'public/assets/ingestion'
+
+  inputs.file(sourceFile)
+  outputs.file('public/assets/ingestion/capability_summary.json')
+}
+
+yarnBuild.dependsOn copyCapabilitySummary
+
 // Define a list of configurations for prettier tasks
 def externalPrettierConfigs = [
   [
@@ -201,6 +219,7 @@ clean {
   delete 'tmp'
   delete 'just'
   delete fileTree(dir: 'src', include: '*.generated.ts')
+  delete 'public/assets/capability_summary.json'
 }
 
 configurations {

View File

@@ -18,6 +18,27 @@ import {
 import { useAppConfig } from '@app/useAppConfig';
 import { useShowNavBarRedesign } from '@app/useShowNavBarRedesign';
 
+interface Capability {
+    capability: string;
+    description: string;
+    supported: boolean;
+}
+
+interface PluginDetails {
+    capabilities: Capability[];
+    classname: string;
+    platform_id: string;
+    platform_name: string;
+    support_status: string;
+}
+
+// this type is based off of the type in metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json
+interface CapabilitySummary {
+    generated_at: string;
+    generated_by: string;
+    plugin_details: Record<string, PluginDetails>;
+}
+
 const PageContainer = styled.div<{ $isShowNavBarRedesign?: boolean }>`
     padding-top: 20px;
     background-color: white;
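For orientation, the interfaces above mirror the generated capability_summary.json. A minimal sketch of that payload, written as the Python dict the generator script would serialize (the bigquery entry and its values are illustrative, not copied from the real file):

# Hypothetical capability_summary.json contents, matching the CapabilitySummary,
# PluginDetails, and Capability interfaces above (values are illustrative only).
capability_summary = {
    "generated_at": "2025-06-27T12:14:33+00:00",
    "generated_by": "metadata-ingestion/scripts/capability_summary.py",
    "plugin_details": {
        "bigquery": {
            "platform_id": "bigquery",
            "platform_name": "BigQuery",
            "classname": "datahub.ingestion.source.bigquery_v2.bigquery.BigqueryV2Source",
            "support_status": "CERTIFIED",
            "capabilities": [
                {
                    "capability": "DATA_PROFILING",
                    "supported": True,
                    "description": "Optionally enabled via configuration",
                },
            ],
        },
    },
}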
@@ -78,9 +99,63 @@ export const ManageIngestionPage = () => {
     const [showCreateSecretModal, setShowCreateSecretModal] = useState<boolean>(false);
     const [hideSystemSources, setHideSystemSources] = useState(true);
+    const [capabilitySummary, setCapabilitySummary] = useState<CapabilitySummary | null>(null);
+    const [isCapabilitySummaryLoading, setIsCapabilitySummaryLoading] = useState<boolean>(true);
+    const [isCapabilitySummaryError, setIsCapabilitySummaryError] = useState<string | null>(null);
     const history = useHistory();
     const shouldPreserveParams = useRef(false);
 
+    useEffect(() => {
+        const fetchCapabilitySummary = async () => {
+            setIsCapabilitySummaryLoading(true);
+            try {
+                const response = await fetch('/assets/ingestion/capability_summary.json');
+                if (!response.ok) {
+                    throw new Error(`Failed to fetch capability summary: ${response.status} ${response.statusText}`);
+                }
+                const data = await response.json();
+                setCapabilitySummary(data);
+            } catch (error) {
+                console.error('Error fetching capability summary:', error);
+                setIsCapabilitySummaryError(
+                    error instanceof Error ? error.message : 'Failed to fetch capability summary',
+                );
+            } finally {
+                setIsCapabilitySummaryLoading(false);
+            }
+        };
+
+        fetchCapabilitySummary();
+    }, []);
+
+    const getPluginCapabilities = (platformId: string): PluginDetails | null => {
+        if (!capabilitySummary?.plugin_details?.[platformId]) {
+            return null;
+        }
+        return capabilitySummary.plugin_details[platformId];
+    };
+
+    const isCapabilitySupported = (platformId: string, capabilityName: string): boolean => {
+        const capabilities = getPluginCapabilities(platformId)?.capabilities;
+        if (!capabilities) {
+            return false;
+        }
+        return capabilities?.some((capability) => capability.capability === capabilityName && capability.supported);
+    };
+
+    const isProfilingSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'DATA_PROFILING');
+    // const isLineageSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'LINEAGE_COARSE');
+    // const isFineGrainedLineageSupported = (platformId: string): boolean =>
+    //     isCapabilitySupported(platformId, 'LINEAGE_FINE');
+    // const isUsageStatsSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'USAGE_STATS');
+
+    if (!isCapabilitySummaryLoading && !isCapabilitySummaryError) {
+        console.log(
+            'Example to be removed when is actually used for something is profiling support for bigquery',
+            isProfilingSupported('bigquery'),
+        );
+    }
+
     // defaultTab might not be calculated correctly on mount, if `config` or `me` haven't been loaded yet
     useEffect(() => {
         if (loaded && me.loaded && !showIngestionTab && selectedTab === TabType.Sources) {

View File

@@ -112,16 +112,16 @@ task modelDocUpload(type: Exec, dependsOn: [modelDocGen]) {
 task lint(type: Exec, dependsOn: installDev) {
   commandLine 'bash', '-c',
     venv_activate_command +
-      "ruff check src/ tests/ examples/ && " +
-      "ruff format --check src/ tests/ examples/ && " +
+      "ruff check scripts/capability_summary.py src/ tests/ examples/ && " +
+      "ruff format --check scripts/capability_summary.py src/ tests/ examples/ && " +
       "mypy --show-traceback --show-error-codes src/ tests/ examples/"
 }
 
 task lintFix(type: Exec, dependsOn: installDev) {
   commandLine 'bash', '-c',
     venv_activate_command +
-      "ruff check --fix src/ tests/ examples/ && " +
-      "ruff format src/ tests/ examples/ "
+      "ruff check --fix scripts/capability_summary.py src/ tests/ examples/ && " +
+      "ruff format scripts/capability_summary.py src/ tests/ examples/ "
 }
 
 def pytest_default_env = "PYTHONDEVMODE=1"
@@ -191,7 +191,23 @@ task testFull(type: Exec, dependsOn: [installDevTest]) {
 task specGen(type: Exec, dependsOn: [codegen, installDevTest]) {
   commandLine 'bash', '-c', "${venv_activate_command} ./scripts/specgen.sh"
 }
+
+task capabilitySummary(type: Exec, dependsOn: [codegen, installDevTest]) {
+  inputs.files(
+    file('scripts/capability_summary.py'),
+    file('scripts/docgen_types.py'),
+    project.fileTree(dir: "src/datahub/ingestion/source", include: "**/*.py")
+  )
+  commandLine 'bash', '-c', "${venv_activate_command} python scripts/capability_summary.py --output-dir ./src/datahub/ingestion/autogenerated"
+}
+
 task docGen(type: Exec, dependsOn: [codegen, installDevTest, specGen]) {
+  def sourceFile = file('./src/datahub/ingestion/autogenerated/capability_summary.json')
+  if (!sourceFile.exists()) {
+    // Doing it like this cuts docGen time from 15 seconds to 9 seconds locally
+    // This can further reduce if we generate more things in the future
+    dependsOn capabilitySummary
+  }
+
   commandLine 'bash', '-c', "${venv_activate_command} ./scripts/docgen.sh"
 }

View File

@@ -0,0 +1,234 @@
+import dataclasses
+import json
+import logging
+import pathlib
+from datetime import datetime, timezone
+from typing import Dict, Optional
+
+import click
+from docgen_types import Plugin
+
+from datahub.ingestion.api.decorators import SupportStatus
+from datahub.ingestion.source.source_registry import source_registry
+
+logger = logging.getLogger(__name__)
+
+DENY_LIST = {
+    "snowflake-summary",
+    "snowflake-queries",
+    "bigquery-queries",
+}
+
+
+def load_plugin_capabilities(plugin_name: str) -> Optional[Plugin]:
+    """Load plugin capabilities without generating full documentation."""
+    logger.debug(f"Loading capabilities for {plugin_name}")
+
+    try:
+        class_or_exception = source_registry._ensure_not_lazy(plugin_name)
+        if isinstance(class_or_exception, Exception):
+            # Log the specific error but don't re-raise it
+            logger.warning(f"Plugin {plugin_name} failed to load: {class_or_exception}")
+            return None
+
+        source_type = source_registry.get(plugin_name)
+        logger.debug(f"Source class is {source_type}")
+
+        if hasattr(source_type, "get_platform_name"):
+            platform_name = source_type.get_platform_name()
+        else:
+            platform_name = plugin_name.title()
+
+        platform_id = None
+        if hasattr(source_type, "get_platform_id"):
+            platform_id = source_type.get_platform_id()
+        if platform_id is None:
+            logger.warning(f"Platform ID not found for {plugin_name}")
+            return None
+
+        plugin = Plugin(
+            name=plugin_name,
+            platform_id=platform_id,
+            platform_name=platform_name,
+            classname=".".join([source_type.__module__, source_type.__name__]),
+        )
+
+        if hasattr(source_type, "get_support_status"):
+            plugin.support_status = source_type.get_support_status()
+
+        if hasattr(source_type, "get_capabilities"):
+            capabilities = list(source_type.get_capabilities())
+            if capabilities:
+                capabilities.sort(key=lambda x: x.capability.value)
+                plugin.capabilities = capabilities
+            else:
+                logger.debug(f"No capabilities defined for {plugin_name}")
+                plugin.capabilities = []
+        else:
+            logger.debug(f"No get_capabilities method for {plugin_name}")
+            plugin.capabilities = []
+
+        return plugin
+
+    except Exception as e:
+        logger.warning(f"Failed to load capabilities for {plugin_name}: {e}")
+        return None
+
+
+@dataclasses.dataclass
+class CapabilitySummary:
+    """Summary of capabilities across all plugins."""
+
+    plugin_details: Dict[str, Dict]  # plugin_name -> detailed info
+
+
+def generate_capability_summary() -> CapabilitySummary:
+    """Generate a comprehensive summary of capabilities across all plugins."""
+    plugin_details: Dict[str, Dict] = {}
+
+    for plugin_name in sorted(source_registry.mapping.keys()):
+        if plugin_name in DENY_LIST:
+            logger.info(f"Skipping {plugin_name} as it is on the deny list")
+            continue
+
+        plugin = load_plugin_capabilities(plugin_name)
+        if plugin is None:
+            continue
+
+        plugin_details[plugin_name] = {
+            "platform_id": plugin.platform_id,
+            "platform_name": plugin.platform_name,
+            "classname": plugin.classname,
+            "support_status": plugin.support_status.name
+            if plugin.support_status != SupportStatus.UNKNOWN
+            else None,
+            "capabilities": [],
+        }
+
+        if plugin.capabilities:
+            for cap_setting in plugin.capabilities:
+                capability_name = cap_setting.capability.name
+                plugin_details[plugin_name]["capabilities"].append(
+                    {
+                        "capability": capability_name,
+                        "supported": cap_setting.supported,
+                        "description": cap_setting.description,
+                    }
+                )
+
+    return CapabilitySummary(
+        plugin_details=plugin_details,
+    )
+
+
+def save_capability_report(summary: CapabilitySummary, output_dir: str) -> None:
+    """Save the capability summary as JSON files, but only write if contents have changed."""
+    output_path = pathlib.Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    existing_capabilities = {}
+    existing_summary_file = pathlib.Path(
+        "./src/datahub/ingestion/autogenerated/capability_summary.json"
+    )
+    if existing_summary_file.exists():
+        try:
+            with open(existing_summary_file, "r") as f:
+                existing_data = json.load(f)
+                existing_capabilities = existing_data.get("plugin_details", {})
+            logger.info(
+                f"Loaded existing capability data for {len(existing_capabilities)} plugins"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to load existing capability data: {e}")
+
+    missing_plugins = set(existing_capabilities.keys()) - set(
+        summary.plugin_details.keys()
+    )
+    for plugin_name in missing_plugins:
+        logger.warning(
+            f"Plugin {plugin_name} failed to load, using existing capability data as fallback. Manually remove from capability_summary.json if you want to remove it from the report."
+        )
+        summary.plugin_details[plugin_name] = existing_capabilities[plugin_name]
+
+    summary_dict = dataclasses.asdict(summary)
+    summary_dict["generated_by"] = "metadata-ingestion/scripts/capability_summary.py"
+    summary_dict["generated_at"] = datetime.now(timezone.utc).isoformat()
+
+    summary_json = json.dumps(summary_dict, indent=2, sort_keys=True)
+    summary_file = output_path / "capability_summary.json"
+
+    write_file = True
+    if summary_file.exists():
+        try:
+            with open(summary_file, "r") as f:
+                existing_data = json.load(f)
+            # Create copies without generated_at for comparison
+            existing_for_comparison = existing_data.copy()
+            new_for_comparison = summary_dict.copy()
+            existing_for_comparison.pop("generated_at", None)
+            new_for_comparison.pop("generated_at", None)
+            if json.dumps(
+                existing_for_comparison, indent=2, sort_keys=True
+            ) == json.dumps(new_for_comparison, indent=2, sort_keys=True):
+                logger.info(f"No changes detected in {summary_file}, skipping write.")
+                write_file = False
+        except Exception as e:
+            logger.warning(f"Could not read existing summary file: {e}")
+
+    if write_file:
+        with open(summary_file, "w") as f:
+            f.write(summary_json)
+        logger.info(f"Capability summary saved to {summary_file}")
+
+
+@click.command()
+@click.option(
+    "--output-dir",
+    type=str,
+    default="./autogenerated",
+    help="Output directory for capability reports",
+)
+@click.option(
+    "--source",
+    type=str,
+    required=False,
+    help="Generate report for specific source only",
+)
+def generate_capability_report(output_dir: str, source: Optional[str] = None) -> None:
+    """Generate a comprehensive capability report for all ingestion sources."""
+    logger.info("Starting capability report generation...")
+
+    if source:
+        if source not in source_registry.mapping:
+            logger.error(f"Source '{source}' not found in registry")
+            return
+        original_mapping = source_registry.mapping.copy()
+        source_registry.mapping = {source: original_mapping[source]}
+
+    try:
+        summary = generate_capability_summary()
+        save_capability_report(summary, output_dir)
+
+        print("Capability Report Generation Complete")
+        print("=====================================")
+        print(f"Total plugins processed: {len(summary.plugin_details)}")
+        print(f"Plugins with capabilities: {len(summary.plugin_details)}")
+        print(f"Output directory: {output_dir}")
+    finally:
+        if source:
+            source_registry.mapping = original_mapping
+
+
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format="[%(asctime)s %(levelname)-8s {%(name)s:%(lineno)d}] - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S %Z",
+    )
+    generate_capability_report()
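Because the entry point is a click command, it can also be driven programmatically, for example from a unit test. A minimal sketch, assuming the metadata-ingestion/scripts directory is the working directory (or on sys.path) so that capability_summary is importable:

# Hypothetical test harness for the click command defined above; the import
# path is an assumption about how the scripts directory is laid out.
from click.testing import CliRunner

from capability_summary import generate_capability_report

runner = CliRunner()
result = runner.invoke(
    generate_capability_report,
    ["--output-dir", "./src/datahub/ingestion/autogenerated", "--source", "bigquery"],
)
assert result.exit_code == 0, result.output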

View File

@@ -15,7 +15,7 @@ from docgen_types import Platform, Plugin
 from docs_config_table import gen_md_table_from_json_schema
 
 from datahub.configuration.common import ConfigModel
-from datahub.ingestion.api.decorators import SourceCapability, SupportStatus
+from datahub.ingestion.api.decorators import SourceCapability, SupportStatus, CapabilitySetting
 from datahub.ingestion.source.source_registry import source_registry
 
 logger = logging.getLogger(__name__)
@@ -68,6 +68,20 @@ def get_capability_text(src_capability: SourceCapability) -> str:
     )
 
 
+def map_capability_name_to_enum(capability_name: str) -> SourceCapability:
+    """
+    Maps capability names from the JSON file to SourceCapability enum values.
+    The JSON file uses enum names (e.g., "DATA_PROFILING") but the enum expects values (e.g., "Data Profiling").
+    """
+    try:
+        return SourceCapability[capability_name]
+    except KeyError:
+        try:
+            return SourceCapability(capability_name)
+        except ValueError:
+            raise ValueError(f"Unknown capability name: {capability_name}")
+
+
 def does_extra_exist(extra_name: str) -> bool:
     for key, value in metadata("acryl-datahub").items():
         if key == "Provides-Extra" and value == extra_name:
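The two-step lookup in map_capability_name_to_enum exists because Python enums resolve member names with subscript syntax and member values with a call. A self-contained illustration (DemoCapability is a stand-in; the "Data Profiling" value mirrors how SourceCapability is assumed to be defined):

# Name vs. value lookup on an Enum, mirroring the helper's two attempts.
from enum import Enum


class DemoCapability(Enum):
    DATA_PROFILING = "Data Profiling"  # hypothetical member, for illustration


assert DemoCapability["DATA_PROFILING"] is DemoCapability.DATA_PROFILING  # lookup by name
assert DemoCapability("Data Profiling") is DemoCapability.DATA_PROFILING  # lookup by value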
@@ -129,38 +143,58 @@ def rewrite_markdown(file_contents: str, path: str, relocated_path: str) -> str:
     return new_content
 
 
-def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
-    logger.debug(f"Loading {plugin_name}")
-    class_or_exception = source_registry._ensure_not_lazy(plugin_name)
-    if isinstance(class_or_exception, Exception):
-        raise class_or_exception
-    source_type = source_registry.get(plugin_name)
-    logger.debug(f"Source class is {source_type}")
+def load_capability_data(capability_summary_path: str) -> Dict:
+    """Load capability data from the capability summary JSON file."""
+    try:
+        with open(capability_summary_path, 'r') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        logger.error(f"Capability summary file not found: {capability_summary_path}")
+        raise
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse capability summary JSON: {e}")
+        raise
 
-    if hasattr(source_type, "get_platform_name"):
-        platform_name = source_type.get_platform_name()
-    else:
-        platform_name = (
-            plugin_name.title()
-        )  # we like platform names to be human readable
 
-    platform_id = None
-    if hasattr(source_type, "get_platform_id"):
-        platform_id = source_type.get_platform_id()
-    if platform_id is None:
-        raise ValueError(f"Platform ID not found for {plugin_name}")
+def create_plugin_from_capability_data(plugin_name: str, plugin_data: Dict, out_dir: str) -> Plugin:
+    """Create a Plugin object from capability data."""
+    plugin = Plugin(
+        name=plugin_name,
+        platform_id=plugin_data["platform_id"],
+        platform_name=plugin_data["platform_name"],
+        classname=plugin_data["classname"],
+    )
 
-    plugin = Plugin(
-        name=plugin_name,
-        platform_id=platform_id,
-        platform_name=platform_name,
-        classname=".".join([source_type.__module__, source_type.__name__]),
-    )
+    # Set support status
+    if plugin_data.get("support_status"):
+        plugin.support_status = SupportStatus[plugin_data["support_status"]]
+
+    # Set capabilities
+    if plugin_data.get("capabilities"):
+        capabilities = []
+        for cap_data in plugin_data["capabilities"]:
+            capability = map_capability_name_to_enum(cap_data["capability"])
+            capabilities.append(CapabilitySetting(
+                capability=capability,
+                supported=cap_data["supported"],
+                description=cap_data["description"]
+            ))
+        plugin.capabilities = capabilities
+
+    # Load additional plugin information that's not in capability summary
+    try:
+        # Load source class to get additional metadata
+        class_or_exception = source_registry._ensure_not_lazy(plugin_name)
+        if isinstance(class_or_exception, Exception):
+            raise class_or_exception
+        source_type = source_registry.get(plugin_name)
 
+        # Get doc order
         if hasattr(source_type, "get_platform_doc_order"):
             platform_doc_order = source_type.get_platform_doc_order()
             plugin.doc_order = platform_doc_order
 
+        # Get filename
         plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
         if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
             plugin_file_name = plugin_file_name + "/__init__.py"
@@ -173,17 +207,11 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
                 f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist"
             )
 
+        # Get docstring
         if hasattr(source_type, "__doc__"):
             plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "")
 
-    if hasattr(source_type, "get_support_status"):
-        plugin.support_status = source_type.get_support_status()
-
-    if hasattr(source_type, "get_capabilities"):
-        capabilities = list(source_type.get_capabilities())
-        capabilities.sort(key=lambda x: x.capability.value)
-        plugin.capabilities = capabilities
-
+        # Get extra dependencies
         try:
             extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
             plugin.extra_deps = (
@@ -194,6 +222,7 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
                 f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e
             )
 
+        # Get config class
         if hasattr(source_type, "get_config_class"):
             source_config_class: ConfigModel = source_type.get_config_class()
@@ -207,6 +236,9 @@ def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
                 plugin.config_json_schema
             )
 
+    except Exception as e:
+        logger.warning(f"Failed to load additional metadata for {plugin_name}: {e}")
+
     return plugin
@@ -227,15 +259,25 @@ class PlatformMetrics:
 @click.command()
 @click.option("--out-dir", type=str, required=True)
+@click.option("--capability-summary", type=str, required=True, help="Path to capability summary JSON file")
 @click.option("--extra-docs", type=str, required=False)
 @click.option("--source", type=str, required=False)
 def generate(
-    out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None
+    out_dir: str, capability_summary: str, extra_docs: Optional[str] = None, source: Optional[str] = None
 ) -> None:  # noqa: C901
     plugin_metrics = PluginMetrics()
     platform_metrics = PlatformMetrics()
     platforms: Dict[str, Platform] = {}
 
+    # Load capability data
+    try:
+        capability_data = load_capability_data(capability_summary)
+        logger.info(f"Loaded capability data from {capability_summary}")
+    except Exception as e:
+        logger.error(f"Failed to load capability data: {e}")
+        sys.exit(1)
+
     for plugin_name in sorted(source_registry.mapping.keys()):
         if source and source != plugin_name:
             continue
@@ -250,7 +292,14 @@ def generate(
         plugin_metrics.discovered += 1
         try:
-            plugin = load_plugin(plugin_name, out_dir=out_dir)
+            if plugin_name in capability_data.get("plugin_details", {}):
+                # Use capability data
+                plugin_data = capability_data["plugin_details"][plugin_name]
+                plugin = create_plugin_from_capability_data(plugin_name, plugin_data, out_dir=out_dir)
+            else:
+                logger.error(f"Plugin {plugin_name} not found in capability data")
+                plugin_metrics.failed += 1
+                continue
         except Exception as e:
             logger.error(
                 f"Failed to load {plugin_name} due to exception {e}", exc_info=e
@@ -531,7 +580,7 @@ By default, the UI shows the latest version of the lineage. The time picker can
 In this example, data flows from Airflow/BigQuery to Snowflake tables, then to the Hive dataset, and ultimately to the features of Machine Learning Models.
 
-:::tip The Lineage Tab is greyed out - why cant I click on it?
+:::tip The Lineage Tab is greyed out - why can't I click on it?
 This means you have not yet ingested lineage metadata for that entity. Please ingest lineage to proceed.
 :::
@@ -666,7 +715,7 @@ This is a summary of automatic lineage extraction support in our data source. Pl
 ### SQL Parser Lineage Extraction
 
-If youre using a different database system for which we dont support column-level lineage out of the box, but you do have a database query log available,
+If you're using a different database system for which we don't support column-level lineage out of the box, but you do have a database query log available,
 we have a SQL queries connector that generates column-level lineage and detailed table usage statistics from the query log.
 
 If these does not suit your needs, you can use the new `DataHubGraph.parse_sql_lineage()` method in our SDK. (See the source code [here](https://docs.datahub.com/docs/python-sdk/clients/graph-client))

View File

@@ -5,6 +5,7 @@ set -euo pipefail
 DATAHUB_ROOT=..
 DOCS_OUT_DIR=$DATAHUB_ROOT/docs/generated/ingestion
 EXTRA_DOCS_DIR=$DATAHUB_ROOT/metadata-ingestion/docs/sources
+CAPABILITY_SUMMARY_FILE=./src/datahub/ingestion/autogenerated/capability_summary.json
 
 rm -r $DOCS_OUT_DIR || true
-python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@
+python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --capability-summary ${CAPABILITY_SUMMARY_FILE} --extra-docs ${EXTRA_DOCS_DIR} $@

View File

@@ -104,6 +104,7 @@ def capability(
         for base in cls.__bases__
     ):
         cls.__capabilities = {}
+        cls.get_capabilities = lambda: cls.__capabilities.values()
 
         # If the superclasses have capability annotations, copy those over.
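For context, the hunk above makes classes decorated with @capability expose their collected settings through get_capabilities, which is what capability_summary.py calls. A stripped-down sketch of that decorator-registry pattern, not the actual DataHub implementation:

# Minimal decorator registry: each @capability call records a setting on the
# class; get_capabilities exposes the collected values (names simplified).
from dataclasses import dataclass
from typing import Callable


@dataclass
class DemoCapabilitySetting:
    capability: str
    description: str
    supported: bool = True


def capability(name: str, description: str, supported: bool = True) -> Callable[[type], type]:
    def wrapper(cls: type) -> type:
        if "_capabilities" not in cls.__dict__:
            cls._capabilities = {}
            cls.get_capabilities = classmethod(lambda c: c._capabilities.values())
        cls._capabilities[name] = DemoCapabilitySetting(name, description, supported)
        return cls

    return wrapper


@capability("DATA_PROFILING", "Optionally enabled via configuration")
@capability("TEST_CONNECTION", "Enabled by default")
class DemoSource:
    pass


assert {c.capability for c in DemoSource.get_capabilities()} == {"DATA_PROFILING", "TEST_CONNECTION"}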

File diff suppressed because it is too large

View File

@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):

View File

@@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
 @config_class(AzureADConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class AzureADSource(StatefulIngestionSourceBase):
     """

View File

@@ -202,7 +202,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class OktaSource(StatefulIngestionSourceBase):
     """

View File

@@ -71,7 +71,7 @@ class PresetConfig(SupersetConfig):
 @config_class(PresetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 class PresetSource(SupersetSource):
     """

View File

@@ -118,7 +118,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 @capability(

View File

@@ -116,7 +116,7 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):

View File

@@ -179,7 +179,7 @@ class StatefulIngestionReport(SourceReport):
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class StatefulIngestionSourceBase(Source):

View File

@@ -272,7 +272,7 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")

View File

@@ -159,7 +159,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")