feat(ingest): generate capability summary (#13881)

This commit is contained in:
Aseem Bansal 2025-06-30 15:16:08 +05:30 committed by GitHub
parent 03309b7ffa
commit 7345af898d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 3364 additions and 93 deletions

View File

@ -7,3 +7,9 @@ repos:
language: system
files: ^smoke-test/tests/cypress/.*\.tsx$
pass_filenames: false
- id: update-capability-summary
name: update-capability-summary
entry: ./gradlew :metadata-ingestion:capabilitySummary
language: system
files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
pass_filenames: false

View File

@ -69,9 +69,21 @@ jobs:
run: ./metadata-ingestion/scripts/install_deps.sh
- name: Install package
run: ./gradlew :metadata-ingestion:installPackageOnly
- name: Run lint alongwith testQuick
- name: Check lint and capability_summary.json being up-to-date
if: ${{ matrix.command == 'testQuick' }}
run: ./gradlew :metadata-ingestion:lint
run: |
./gradlew :metadata-ingestion:lint
echo "Lint passed. Checking if capability_summary.json is up-to-date."
./gradlew :metadata-ingestion:capabilitySummary
# Check if capability summary file has changed
if git diff --quiet metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json; then
echo "✅ Capability summary file is unchanged"
else
echo "❌ Capability summary file has changed. Please commit the updated file."
echo "Changed lines:"
git diff metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json
exit 1
fi
- name: Run metadata-ingestion tests
run: ./gradlew :metadata-ingestion:${{ matrix.command }}
- name: Debug info

View File

@ -1,4 +1,4 @@
# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-04-21 19:41:02 UTC
# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-06-27 12:14:33 UTC
# Do not edit this file directly. Run the script to regenerate.
# Add additional hooks in .github/scripts/pre-commit-override.yaml
repos:
@ -493,3 +493,10 @@ repos:
language: system
files: ^smoke-test/tests/cypress/.*\.tsx$
pass_filenames: false
- id: update-capability-summary
name: update-capability-summary
entry: ./gradlew :metadata-ingestion:capabilitySummary
language: system
files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
pass_filenames: false

View File

@ -30,4 +30,5 @@ yarn-error.log*
/.vscode
.yarn-test-sentinel
.yarn-lint-sentinel
.yarn-lint-sentinel
public/assets/ingestion/**

View File

@ -146,6 +146,24 @@ task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
outputs.dir('dist')
}
task copyCapabilitySummary(type: Copy) {
def sourceFile = file('../metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json')
if (!sourceFile.exists()) {
// We don't want frontend devs to have to run this task
// But still keeping it here to make sure the dependency is there properly in gradle
dependsOn ':metadata-ingestion:capabilitySummary'
}
from sourceFile
into 'public/assets/ingestion'
inputs.file(sourceFile)
outputs.file('public/assets/ingestion/capability_summary.json')
}
yarnBuild.dependsOn copyCapabilitySummary
// Define a list of configurations for prettier tasks
def externalPrettierConfigs = [
[
@ -201,6 +219,7 @@ clean {
delete 'tmp'
delete 'just'
delete fileTree(dir: 'src', include: '*.generated.ts')
delete 'public/assets/capability_summary.json'
}
configurations {

View File

@ -18,6 +18,27 @@ import {
import { useAppConfig } from '@app/useAppConfig';
import { useShowNavBarRedesign } from '@app/useShowNavBarRedesign';
/** One capability entry for a plugin, mirrored from capability_summary.json. */
interface Capability {
    capability: string;
    description: string;
    supported: boolean;
}

/** Per-plugin metadata from the autogenerated capability summary. */
interface PluginDetails {
    capabilities: Capability[];
    classname: string;
    platform_id: string;
    platform_name: string;
    // NOTE(review): the generator writes null when support status is UNKNOWN —
    // confirm and widen to `string | null` if so.
    support_status: string;
}

// this type is based off of the type in metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json
/** Top-level shape of the JSON fetched from /assets/ingestion/capability_summary.json. */
interface CapabilitySummary {
    generated_at: string;
    generated_by: string;
    plugin_details: Record<string, PluginDetails>;
}
const PageContainer = styled.div<{ $isShowNavBarRedesign?: boolean }>`
padding-top: 20px;
background-color: white;
@ -78,9 +99,63 @@ export const ManageIngestionPage = () => {
const [showCreateSecretModal, setShowCreateSecretModal] = useState<boolean>(false);
const [hideSystemSources, setHideSystemSources] = useState(true);
const [capabilitySummary, setCapabilitySummary] = useState<CapabilitySummary | null>(null);
const [isCapabilitySummaryLoading, setIsCapabilitySummaryLoading] = useState<boolean>(true);
const [isCapabilitySummaryError, setIsCapabilitySummaryError] = useState<string | null>(null);
const history = useHistory();
const shouldPreserveParams = useRef(false);
useEffect(() => {
const fetchCapabilitySummary = async () => {
setIsCapabilitySummaryLoading(true);
try {
const response = await fetch('/assets/ingestion/capability_summary.json');
if (!response.ok) {
throw new Error(`Failed to fetch capability summary: ${response.status} ${response.statusText}`);
}
const data = await response.json();
setCapabilitySummary(data);
} catch (error) {
console.error('Error fetching capability summary:', error);
setIsCapabilitySummaryError(
error instanceof Error ? error.message : 'Failed to fetch capability summary',
);
} finally {
setIsCapabilitySummaryLoading(false);
}
};
fetchCapabilitySummary();
}, []);
const getPluginCapabilities = (platformId: string): PluginDetails | null => {
if (!capabilitySummary?.plugin_details?.[platformId]) {
return null;
}
return capabilitySummary.plugin_details[platformId];
};
const isCapabilitySupported = (platformId: string, capabilityName: string): boolean => {
const capabilities = getPluginCapabilities(platformId)?.capabilities;
if (!capabilities) {
return false;
}
return capabilities?.some((capability) => capability.capability === capabilityName && capability.supported);
};
const isProfilingSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'DATA_PROFILING');
// const isLineageSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'LINEAGE_COARSE');
// const isFineGrainedLineageSupported = (platformId: string): boolean =>
// isCapabilitySupported(platformId, 'LINEAGE_FINE');
// const isUsageStatsSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'USAGE_STATS');
if (!isCapabilitySummaryLoading && !isCapabilitySummaryError) {
console.log(
'Example to be removed when is actually used for something is profiling support for bigquery',
isProfilingSupported('bigquery'),
);
}
// defaultTab might not be calculated correctly on mount, if `config` or `me` haven't been loaded yet
useEffect(() => {
if (loaded && me.loaded && !showIngestionTab && selectedTab === TabType.Sources) {

View File

@ -112,16 +112,16 @@ task modelDocUpload(type: Exec, dependsOn: [modelDocGen]) {
task lint(type: Exec, dependsOn: installDev) {
commandLine 'bash', '-c',
venv_activate_command +
"ruff check src/ tests/ examples/ && " +
"ruff format --check src/ tests/ examples/ && " +
"ruff check scripts/capability_summary.py src/ tests/ examples/ && " +
"ruff format --check scripts/capability_summary.py src/ tests/ examples/ && " +
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
}
task lintFix(type: Exec, dependsOn: installDev) {
commandLine 'bash', '-c',
venv_activate_command +
"ruff check --fix src/ tests/ examples/ && " +
"ruff format src/ tests/ examples/ "
"ruff check --fix scripts/capability_summary.py src/ tests/ examples/ && " +
"ruff format scripts/capability_summary.py src/ tests/ examples/ "
}
def pytest_default_env = "PYTHONDEVMODE=1"
@ -191,7 +191,23 @@ task testFull(type: Exec, dependsOn: [installDevTest]) {
task specGen(type: Exec, dependsOn: [codegen, installDevTest]) {
commandLine 'bash', '-c', "${venv_activate_command} ./scripts/specgen.sh"
}
task capabilitySummary(type: Exec, dependsOn: [codegen, installDevTest]) {
inputs.files(
file('scripts/capability_summary.py'),
file('scripts/docgen_types.py'),
project.fileTree(dir: "src/datahub/ingestion/source", include: "**/*.py")
)
commandLine 'bash', '-c', "${venv_activate_command} python scripts/capability_summary.py --output-dir ./src/datahub/ingestion/autogenerated"
}
task docGen(type: Exec, dependsOn: [codegen, installDevTest, specGen]) {
def sourceFile = file('./src/datahub/ingestion/autogenerated/capability_summary.json')
if (!sourceFile.exists()) {
// Doing it like this cuts docGen time from 15 seconds to 9 seconds locally
// This can further reduce if we generate more things in the future
dependsOn capabilitySummary
}
commandLine 'bash', '-c', "${venv_activate_command} ./scripts/docgen.sh"
}

View File

@ -0,0 +1,234 @@
import dataclasses
import json
import logging
import pathlib
from datetime import datetime, timezone
from typing import Dict, Optional
import click
from docgen_types import Plugin
from datahub.ingestion.api.decorators import SupportStatus
from datahub.ingestion.source.source_registry import source_registry
logger = logging.getLogger(__name__)

# Plugin names to skip entirely when generating the capability summary
# (checked in generate_capability_summary before loading each plugin).
DENY_LIST = {
    "snowflake-summary",
    "snowflake-queries",
    "bigquery-queries",
}
def load_plugin_capabilities(plugin_name: str) -> Optional[Plugin]:
    """Build a Plugin carrying only capability-related metadata.

    Loads just enough of the source class to read its platform identity,
    support status, and declared capabilities — no full documentation
    generation. Returns None if the plugin class or its platform id
    cannot be resolved; failures are logged, never raised.
    """
    logger.debug(f"Loading capabilities for {plugin_name}")
    try:
        maybe_cls = source_registry._ensure_not_lazy(plugin_name)
        if isinstance(maybe_cls, Exception):
            # Log the specific error but don't re-raise it.
            logger.warning(f"Plugin {plugin_name} failed to load: {maybe_cls}")
            return None

        src_cls = source_registry.get(plugin_name)
        logger.debug(f"Source class is {src_cls}")

        # Prefer the class-declared platform name; fall back to a
        # title-cased plugin name for readability.
        platform_name = (
            src_cls.get_platform_name()
            if hasattr(src_cls, "get_platform_name")
            else plugin_name.title()
        )

        platform_id = (
            src_cls.get_platform_id() if hasattr(src_cls, "get_platform_id") else None
        )
        if platform_id is None:
            logger.warning(f"Platform ID not found for {plugin_name}")
            return None

        plugin = Plugin(
            name=plugin_name,
            platform_id=platform_id,
            platform_name=platform_name,
            classname=f"{src_cls.__module__}.{src_cls.__name__}",
        )

        if hasattr(src_cls, "get_support_status"):
            plugin.support_status = src_cls.get_support_status()

        if not hasattr(src_cls, "get_capabilities"):
            logger.debug(f"No get_capabilities method for {plugin_name}")
            plugin.capabilities = []
            return plugin

        declared = list(src_cls.get_capabilities())
        if declared:
            # Stable ordering keeps the generated JSON diff-friendly.
            plugin.capabilities = sorted(declared, key=lambda c: c.capability.value)
        else:
            logger.debug(f"No capabilities defined for {plugin_name}")
            plugin.capabilities = []
        return plugin
    except Exception as e:
        logger.warning(f"Failed to load capabilities for {plugin_name}: {e}")
        return None
@dataclasses.dataclass
class CapabilitySummary:
    """Summary of capabilities across all plugins.

    Serialized via dataclasses.asdict to capability_summary.json;
    "generated_by"/"generated_at" fields are added at save time.
    """

    # Maps plugin name -> detail dict (platform_id, platform_name,
    # classname, support_status, and a list of capability entries).
    plugin_details: Dict[str, Dict]  # plugin_name -> detailed info
def generate_capability_summary() -> CapabilitySummary:
    """Collect capability metadata for every registered source.

    Walks the source registry in sorted order, skipping DENY_LIST entries
    and plugins that fail to load, and flattens each Plugin into the plain
    dict structure that is serialized to capability_summary.json.
    """
    details: Dict[str, Dict] = {}

    for name in sorted(source_registry.mapping.keys()):
        if name in DENY_LIST:
            logger.info(f"Skipping {name} as it is on the deny list")
            continue

        plugin = load_plugin_capabilities(name)
        if plugin is None:
            # Load failure was already logged by load_plugin_capabilities.
            continue

        # UNKNOWN support status is normalized to null in the JSON output.
        status = (
            plugin.support_status.name
            if plugin.support_status != SupportStatus.UNKNOWN
            else None
        )

        details[name] = {
            "platform_id": plugin.platform_id,
            "platform_name": plugin.platform_name,
            "classname": plugin.classname,
            "support_status": status,
            "capabilities": [
                {
                    "capability": cap.capability.name,
                    "supported": cap.supported,
                    "description": cap.description,
                }
                for cap in (plugin.capabilities or [])
            ],
        }

    return CapabilitySummary(
        plugin_details=details,
    )
def save_capability_report(summary: CapabilitySummary, output_dir: str) -> None:
    """Save the capability summary as JSON files, but only write if contents have changed.

    Two-phase behavior:
      1. Merge: plugins present in the previously-committed summary but
         missing from this run (i.e. they failed to load) are carried over
         as a fallback. NOTE: this mutates summary.plugin_details in place,
         so the caller's post-save counts include the carried-over plugins.
      2. Write-skip: the new JSON is compared against the existing output
         file with "generated_at" stripped, so a no-op regeneration does
         not churn the timestamp-only diff.
    """
    output_path = pathlib.Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    existing_capabilities = {}
    # NOTE(review): fallback path is hardcoded relative to metadata-ingestion/
    # (the gradle working dir) rather than derived from output_dir — confirm
    # this is intentional when running the script from elsewhere.
    existing_summary_file = pathlib.Path(
        "./src/datahub/ingestion/autogenerated/capability_summary.json"
    )
    if existing_summary_file.exists():
        try:
            with open(existing_summary_file, "r") as f:
                existing_data = json.load(f)
                existing_capabilities = existing_data.get("plugin_details", {})
            logger.info(
                f"Loaded existing capability data for {len(existing_capabilities)} plugins"
            )
        except Exception as e:
            logger.warning(f"Failed to load existing capability data: {e}")
    # Plugins that were in the committed summary but failed to load this run.
    missing_plugins = set(existing_capabilities.keys()) - set(
        summary.plugin_details.keys()
    )
    for plugin_name in missing_plugins:
        logger.warning(
            f"Plugin {plugin_name} failed to load, using existing capability data as fallback. Manually remove from capability_summary.json if you want to remove it from the report."
        )
        summary.plugin_details[plugin_name] = existing_capabilities[plugin_name]
    summary_dict = dataclasses.asdict(summary)
    summary_dict["generated_by"] = "metadata-ingestion/scripts/capability_summary.py"
    summary_dict["generated_at"] = datetime.now(timezone.utc).isoformat()
    # sort_keys keeps the artifact deterministic and diff-friendly.
    summary_json = json.dumps(summary_dict, indent=2, sort_keys=True)
    summary_file = output_path / "capability_summary.json"
    write_file = True
    if summary_file.exists():
        try:
            with open(summary_file, "r") as f:
                existing_data = json.load(f)
            # Create copies without generated_at for comparison
            existing_for_comparison = existing_data.copy()
            new_for_comparison = summary_dict.copy()
            existing_for_comparison.pop("generated_at", None)
            new_for_comparison.pop("generated_at", None)
            if json.dumps(
                existing_for_comparison, indent=2, sort_keys=True
            ) == json.dumps(new_for_comparison, indent=2, sort_keys=True):
                logger.info(f"No changes detected in {summary_file}, skipping write.")
                write_file = False
        except Exception as e:
            # On any read/parse failure, fall through and rewrite the file.
            logger.warning(f"Could not read existing summary file: {e}")
    if write_file:
        with open(summary_file, "w") as f:
            f.write(summary_json)
        logger.info(f"Capability summary saved to {summary_file}")
@click.command()
@click.option(
    "--output-dir",
    type=str,
    default="./autogenerated",
    help="Output directory for capability reports",
)
@click.option(
    "--source",
    type=str,
    required=False,
    help="Generate report for specific source only",
)
def generate_capability_report(output_dir: str, source: Optional[str] = None) -> None:
    """Generate a comprehensive capability report for all ingestion sources.

    When --source is given, the registry mapping is temporarily narrowed to
    that single source and restored afterwards (the registry is process-global
    state, so restoration happens in a finally block).
    """
    logger.info("Starting capability report generation...")
    if source:
        if source not in source_registry.mapping:
            logger.error(f"Source '{source}' not found in registry")
            return
        original_mapping = source_registry.mapping.copy()
        source_registry.mapping = {source: original_mapping[source]}
    try:
        summary = generate_capability_summary()
        save_capability_report(summary, output_dir)
        # Counts include any fallback plugins merged in by save_capability_report.
        # Fix: previously this printed len(plugin_details) twice; the second
        # line is meant to count plugins that declare at least one capability.
        plugins_with_capabilities = sum(
            1
            for details in summary.plugin_details.values()
            if details.get("capabilities")
        )
        print("Capability Report Generation Complete")
        print("=====================================")
        print(f"Total plugins processed: {len(summary.plugin_details)}")
        print(f"Plugins with capabilities: {plugins_with_capabilities}")
        print(f"Output directory: {output_dir}")
    finally:
        if source:
            source_registry.mapping = original_mapping
if __name__ == "__main__":
    # Configure verbose, timestamped logging before handing off to click.
    logging.basicConfig(
        level=logging.INFO,
        format="[%(asctime)s %(levelname)-8s {%(name)s:%(lineno)d}] - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S %Z",
    )
    generate_capability_report()

View File

@ -15,7 +15,7 @@ from docgen_types import Platform, Plugin
from docs_config_table import gen_md_table_from_json_schema
from datahub.configuration.common import ConfigModel
from datahub.ingestion.api.decorators import SourceCapability, SupportStatus
from datahub.ingestion.api.decorators import SourceCapability, SupportStatus, CapabilitySetting
from datahub.ingestion.source.source_registry import source_registry
logger = logging.getLogger(__name__)
@ -68,6 +68,20 @@ def get_capability_text(src_capability: SourceCapability) -> str:
)
def map_capability_name_to_enum(capability_name: str) -> SourceCapability:
    """
    Resolve a capability string from the summary JSON to a SourceCapability.

    Accepts either the enum *name* (e.g. "DATA_PROFILING", as written by
    capability_summary.py) or the enum *value* (e.g. "Data Profiling").
    Raises ValueError when the string matches neither.
    """
    # Try name-based lookup first — that is what the JSON file contains.
    if capability_name in SourceCapability.__members__:
        return SourceCapability[capability_name]
    # Fall back to value-based lookup for human-readable strings.
    try:
        return SourceCapability(capability_name)
    except ValueError:
        raise ValueError(f"Unknown capability name: {capability_name}")
def does_extra_exist(extra_name: str) -> bool:
for key, value in metadata("acryl-datahub").items():
if key == "Provides-Extra" and value == extra_name:
@ -129,84 +143,102 @@ def rewrite_markdown(file_contents: str, path: str, relocated_path: str) -> str:
return new_content
def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
logger.debug(f"Loading {plugin_name}")
class_or_exception = source_registry._ensure_not_lazy(plugin_name)
if isinstance(class_or_exception, Exception):
raise class_or_exception
source_type = source_registry.get(plugin_name)
logger.debug(f"Source class is {source_type}")
def load_capability_data(capability_summary_path: str) -> Dict:
    """Load capability data from the capability summary JSON file.

    Returns the parsed JSON object (expected to contain a "plugin_details"
    mapping). Logs and re-raises FileNotFoundError / JSONDecodeError so the
    caller can abort docgen with a clear error.
    """
    try:
        # The summary artifact is generated as UTF-8; be explicit so the read
        # does not depend on the platform's default locale encoding.
        with open(capability_summary_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        logger.error(f"Capability summary file not found: {capability_summary_path}")
        raise
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse capability summary JSON: {e}")
        raise
if hasattr(source_type, "get_platform_name"):
platform_name = source_type.get_platform_name()
else:
platform_name = (
plugin_name.title()
) # we like platform names to be human readable
platform_id = None
if hasattr(source_type, "get_platform_id"):
platform_id = source_type.get_platform_id()
if platform_id is None:
raise ValueError(f"Platform ID not found for {plugin_name}")
def create_plugin_from_capability_data(plugin_name: str, plugin_data: Dict, out_dir: str) -> Plugin:
"""Create a Plugin object from capability data."""
plugin = Plugin(
name=plugin_name,
platform_id=platform_id,
platform_name=platform_name,
classname=".".join([source_type.__module__, source_type.__name__]),
platform_id=plugin_data["platform_id"],
platform_name=plugin_data["platform_name"],
classname=plugin_data["classname"],
)
if hasattr(source_type, "get_platform_doc_order"):
platform_doc_order = source_type.get_platform_doc_order()
plugin.doc_order = platform_doc_order
plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
plugin_file_name = plugin_file_name + "/__init__.py"
else:
plugin_file_name = plugin_file_name + ".py"
if os.path.exists(plugin_file_name):
plugin.filename = plugin_file_name
else:
logger.info(
f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist"
)
if hasattr(source_type, "__doc__"):
plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "")
if hasattr(source_type, "get_support_status"):
plugin.support_status = source_type.get_support_status()
if hasattr(source_type, "get_capabilities"):
capabilities = list(source_type.get_capabilities())
capabilities.sort(key=lambda x: x.capability.value)
# Set support status
if plugin_data.get("support_status"):
plugin.support_status = SupportStatus[plugin_data["support_status"]]
# Set capabilities
if plugin_data.get("capabilities"):
capabilities = []
for cap_data in plugin_data["capabilities"]:
capability = map_capability_name_to_enum(cap_data["capability"])
capabilities.append(CapabilitySetting(
capability=capability,
supported=cap_data["supported"],
description=cap_data["description"]
))
plugin.capabilities = capabilities
# Load additional plugin information that's not in capability summary
try:
extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
plugin.extra_deps = (
get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
)
# Load source class to get additional metadata
class_or_exception = source_registry._ensure_not_lazy(plugin_name)
if isinstance(class_or_exception, Exception):
raise class_or_exception
source_type = source_registry.get(plugin_name)
# Get doc order
if hasattr(source_type, "get_platform_doc_order"):
platform_doc_order = source_type.get_platform_doc_order()
plugin.doc_order = platform_doc_order
# Get filename
plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
plugin_file_name = plugin_file_name + "/__init__.py"
else:
plugin_file_name = plugin_file_name + ".py"
if os.path.exists(plugin_file_name):
plugin.filename = plugin_file_name
else:
logger.info(
f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist"
)
# Get docstring
if hasattr(source_type, "__doc__"):
plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "")
# Get extra dependencies
try:
extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
plugin.extra_deps = (
get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
)
except Exception as e:
logger.info(
f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e
)
# Get config class
if hasattr(source_type, "get_config_class"):
source_config_class: ConfigModel = source_type.get_config_class()
plugin.config_json_schema = source_config_class.schema_json(indent=2)
plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)
# Write the config json schema to the out_dir.
config_dir = pathlib.Path(out_dir) / "config_schemas"
config_dir.mkdir(parents=True, exist_ok=True)
(config_dir / f"{plugin_name}_config.json").write_text(
plugin.config_json_schema
)
except Exception as e:
logger.info(
f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e
)
if hasattr(source_type, "get_config_class"):
source_config_class: ConfigModel = source_type.get_config_class()
plugin.config_json_schema = source_config_class.schema_json(indent=2)
plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)
# Write the config json schema to the out_dir.
config_dir = pathlib.Path(out_dir) / "config_schemas"
config_dir.mkdir(parents=True, exist_ok=True)
(config_dir / f"{plugin_name}_config.json").write_text(
plugin.config_json_schema
)
logger.warning(f"Failed to load additional metadata for {plugin_name}: {e}")
return plugin
@ -227,15 +259,25 @@ class PlatformMetrics:
@click.command()
@click.option("--out-dir", type=str, required=True)
@click.option("--capability-summary", type=str, required=True, help="Path to capability summary JSON file")
@click.option("--extra-docs", type=str, required=False)
@click.option("--source", type=str, required=False)
def generate(
out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None
out_dir: str, capability_summary: str, extra_docs: Optional[str] = None, source: Optional[str] = None
) -> None: # noqa: C901
plugin_metrics = PluginMetrics()
platform_metrics = PlatformMetrics()
platforms: Dict[str, Platform] = {}
# Load capability data
try:
capability_data = load_capability_data(capability_summary)
logger.info(f"Loaded capability data from {capability_summary}")
except Exception as e:
logger.error(f"Failed to load capability data: {e}")
sys.exit(1)
for plugin_name in sorted(source_registry.mapping.keys()):
if source and source != plugin_name:
continue
@ -250,7 +292,14 @@ def generate(
plugin_metrics.discovered += 1
try:
plugin = load_plugin(plugin_name, out_dir=out_dir)
if plugin_name in capability_data.get("plugin_details", {}):
# Use capability data
plugin_data = capability_data["plugin_details"][plugin_name]
plugin = create_plugin_from_capability_data(plugin_name, plugin_data, out_dir=out_dir)
else:
logger.error(f"Plugin {plugin_name} not found in capability data")
plugin_metrics.failed += 1
continue
except Exception as e:
logger.error(
f"Failed to load {plugin_name} due to exception {e}", exc_info=e
@ -531,7 +580,7 @@ By default, the UI shows the latest version of the lineage. The time picker can
In this example, data flows from Airflow/BigQuery to Snowflake tables, then to the Hive dataset, and ultimately to the features of Machine Learning Models.
:::tip The Lineage Tab is greyed out - why cant I click on it?
:::tip The Lineage Tab is greyed out - why can't I click on it?
This means you have not yet ingested lineage metadata for that entity. Please ingest lineage to proceed.
:::
@ -666,7 +715,7 @@ This is a summary of automatic lineage extraction support in our data source. Pl
### SQL Parser Lineage Extraction
If youre using a different database system for which we dont support column-level lineage out of the box, but you do have a database query log available,
If you're using a different database system for which we don't support column-level lineage out of the box, but you do have a database query log available,
we have a SQL queries connector that generates column-level lineage and detailed table usage statistics from the query log.
If these does not suit your needs, you can use the new `DataHubGraph.parse_sql_lineage()` method in our SDK. (See the source code [here](https://docs.datahub.com/docs/python-sdk/clients/graph-client))

View File

@ -5,6 +5,7 @@ set -euo pipefail
DATAHUB_ROOT=..
DOCS_OUT_DIR=$DATAHUB_ROOT/docs/generated/ingestion
EXTRA_DOCS_DIR=$DATAHUB_ROOT/metadata-ingestion/docs/sources
CAPABILITY_SUMMARY_FILE=./src/datahub/ingestion/autogenerated/capability_summary.json
rm -r $DOCS_OUT_DIR || true
python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@
python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --capability-summary ${CAPABILITY_SUMMARY_FILE} --extra-docs ${EXTRA_DOCS_DIR} $@

View File

@ -104,6 +104,7 @@ def capability(
for base in cls.__bases__
):
cls.__capabilities = {}
cls.get_capabilities = lambda: cls.__capabilities.values()
# If the superclasses have capability annotations, copy those over.

File diff suppressed because it is too large Load Diff

View File

@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@capability(
SourceCapability.DELETION_DETECTION,
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
"Enabled by default via stateful ingestion",
supported=True,
)
class CassandraSource(StatefulIngestionSourceBase):

View File

@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
@config_class(AzureADConfig)
@support_status(SupportStatus.CERTIFIED)
@capability(
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
)
class AzureADSource(StatefulIngestionSourceBase):
"""

View File

@ -202,7 +202,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
@support_status(SupportStatus.CERTIFIED)
@capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
@capability(
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
)
class OktaSource(StatefulIngestionSourceBase):
"""

View File

@ -71,7 +71,7 @@ class PresetConfig(SupersetConfig):
@config_class(PresetConfig)
@support_status(SupportStatus.CERTIFIED)
@capability(
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
)
class PresetSource(SupersetSource):
"""

View File

@ -118,7 +118,7 @@ logger: logging.Logger = logging.getLogger(__name__)
)
@capability(
SourceCapability.DELETION_DETECTION,
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
"Enabled by default via stateful ingestion",
supported=True,
)
@capability(

View File

@ -116,7 +116,7 @@ class VerticaConfig(BasicSQLAlchemyConfig):
)
@capability(
SourceCapability.DELETION_DETECTION,
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
"Enabled by default via stateful ingestion",
supported=True,
)
class VerticaSource(SQLAlchemySource):

View File

@ -179,7 +179,7 @@ class StatefulIngestionReport(SourceReport):
@capability(
SourceCapability.DELETION_DETECTION,
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
"Enabled by default via stateful ingestion",
supported=True,
)
class StatefulIngestionSourceBase(Source):

View File

@ -272,7 +272,7 @@ def get_filter_name(filter_obj):
@config_class(SupersetConfig)
@support_status(SupportStatus.CERTIFIED)
@capability(
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
)
@capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
@capability(SourceCapability.LINEAGE_COARSE, "Supported by default")

View File

@ -159,7 +159,7 @@ logger: logging.Logger = logging.getLogger(__name__)
)
@capability(
SourceCapability.DELETION_DETECTION,
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
"Enabled by default via stateful ingestion",
supported=True,
)
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")