mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-02 19:58:59 +00:00
feat(ingest): generate capability summary (#13881)
This commit is contained in:
parent
03309b7ffa
commit
7345af898d
6
.github/scripts/pre-commit-override.yaml
vendored
6
.github/scripts/pre-commit-override.yaml
vendored
@ -7,3 +7,9 @@ repos:
|
||||
language: system
|
||||
files: ^smoke-test/tests/cypress/.*\.tsx$
|
||||
pass_filenames: false
|
||||
- id: update-capability-summary
|
||||
name: update-capability-summary
|
||||
entry: ./gradlew :metadata-ingestion:capabilitySummary
|
||||
language: system
|
||||
files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
|
||||
pass_filenames: false
|
||||
|
||||
16
.github/workflows/metadata-ingestion.yml
vendored
16
.github/workflows/metadata-ingestion.yml
vendored
@ -69,9 +69,21 @@ jobs:
|
||||
run: ./metadata-ingestion/scripts/install_deps.sh
|
||||
- name: Install package
|
||||
run: ./gradlew :metadata-ingestion:installPackageOnly
|
||||
- name: Run lint alongwith testQuick
|
||||
- name: Check lint and capability_summary.json being up-to-date
|
||||
if: ${{ matrix.command == 'testQuick' }}
|
||||
run: ./gradlew :metadata-ingestion:lint
|
||||
run: |
|
||||
./gradlew :metadata-ingestion:lint
|
||||
echo "Lint passed. Checking if capability_summary.json is up-to-date."
|
||||
./gradlew :metadata-ingestion:capabilitySummary
|
||||
# Check if capability summary file has changed
|
||||
if git diff --quiet metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json; then
|
||||
echo "✅ Capability summary file is unchanged"
|
||||
else
|
||||
echo "❌ Capability summary file has changed. Please commit the updated file."
|
||||
echo "Changed lines:"
|
||||
git diff metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json
|
||||
exit 1
|
||||
fi
|
||||
- name: Run metadata-ingestion tests
|
||||
run: ./gradlew :metadata-ingestion:${{ matrix.command }}
|
||||
- name: Debug info
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-04-21 19:41:02 UTC
|
||||
# Auto-generated by .github/scripts/generate_pre_commit.py at 2025-06-27 12:14:33 UTC
|
||||
# Do not edit this file directly. Run the script to regenerate.
|
||||
# Add additional hooks in .github/scripts/pre-commit-override.yaml
|
||||
repos:
|
||||
@ -493,3 +493,10 @@ repos:
|
||||
language: system
|
||||
files: ^smoke-test/tests/cypress/.*\.tsx$
|
||||
pass_filenames: false
|
||||
|
||||
- id: update-capability-summary
|
||||
name: update-capability-summary
|
||||
entry: ./gradlew :metadata-ingestion:capabilitySummary
|
||||
language: system
|
||||
files: ^metadata-ingestion/src/datahub/ingestion/source/.*\.py$
|
||||
pass_filenames: false
|
||||
|
||||
3
datahub-web-react/.gitignore
vendored
3
datahub-web-react/.gitignore
vendored
@ -30,4 +30,5 @@ yarn-error.log*
|
||||
/.vscode
|
||||
|
||||
.yarn-test-sentinel
|
||||
.yarn-lint-sentinel
|
||||
.yarn-lint-sentinel
|
||||
public/assets/ingestion/**
|
||||
@ -146,6 +146,24 @@ task yarnBuild(type: YarnTask, dependsOn: [yarnInstall, yarnGenerate]) {
|
||||
outputs.dir('dist')
|
||||
}
|
||||
|
||||
task copyCapabilitySummary(type: Copy) {
|
||||
def sourceFile = file('../metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json')
|
||||
|
||||
if (!sourceFile.exists()) {
|
||||
// We don't want frontend devs to have to run this task
|
||||
// But still keeping it here to make sure the dependency is there properly in gradle
|
||||
dependsOn ':metadata-ingestion:capabilitySummary'
|
||||
}
|
||||
|
||||
from sourceFile
|
||||
into 'public/assets/ingestion'
|
||||
|
||||
inputs.file(sourceFile)
|
||||
outputs.file('public/assets/ingestion/capability_summary.json')
|
||||
}
|
||||
|
||||
yarnBuild.dependsOn copyCapabilitySummary
|
||||
|
||||
// Define a list of configurations for prettier tasks
|
||||
def externalPrettierConfigs = [
|
||||
[
|
||||
@ -201,6 +219,7 @@ clean {
|
||||
delete 'tmp'
|
||||
delete 'just'
|
||||
delete fileTree(dir: 'src', include: '*.generated.ts')
|
||||
delete 'public/assets/capability_summary.json'
|
||||
}
|
||||
|
||||
configurations {
|
||||
|
||||
@ -18,6 +18,27 @@ import {
|
||||
import { useAppConfig } from '@app/useAppConfig';
|
||||
import { useShowNavBarRedesign } from '@app/useShowNavBarRedesign';
|
||||
|
||||
interface Capability {
|
||||
capability: string;
|
||||
description: string;
|
||||
supported: boolean;
|
||||
}
|
||||
|
||||
interface PluginDetails {
|
||||
capabilities: Capability[];
|
||||
classname: string;
|
||||
platform_id: string;
|
||||
platform_name: string;
|
||||
support_status: string;
|
||||
}
|
||||
|
||||
// this type is based off of the type in metadata-ingestion/src/datahub/ingestion/autogenerated/capability_summary.json
|
||||
interface CapabilitySummary {
|
||||
generated_at: string;
|
||||
generated_by: string;
|
||||
plugin_details: Record<string, PluginDetails>;
|
||||
}
|
||||
|
||||
const PageContainer = styled.div<{ $isShowNavBarRedesign?: boolean }>`
|
||||
padding-top: 20px;
|
||||
background-color: white;
|
||||
@ -78,9 +99,63 @@ export const ManageIngestionPage = () => {
|
||||
const [showCreateSecretModal, setShowCreateSecretModal] = useState<boolean>(false);
|
||||
const [hideSystemSources, setHideSystemSources] = useState(true);
|
||||
|
||||
const [capabilitySummary, setCapabilitySummary] = useState<CapabilitySummary | null>(null);
|
||||
const [isCapabilitySummaryLoading, setIsCapabilitySummaryLoading] = useState<boolean>(true);
|
||||
const [isCapabilitySummaryError, setIsCapabilitySummaryError] = useState<string | null>(null);
|
||||
|
||||
const history = useHistory();
|
||||
const shouldPreserveParams = useRef(false);
|
||||
|
||||
useEffect(() => {
|
||||
const fetchCapabilitySummary = async () => {
|
||||
setIsCapabilitySummaryLoading(true);
|
||||
try {
|
||||
const response = await fetch('/assets/ingestion/capability_summary.json');
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to fetch capability summary: ${response.status} ${response.statusText}`);
|
||||
}
|
||||
const data = await response.json();
|
||||
setCapabilitySummary(data);
|
||||
} catch (error) {
|
||||
console.error('Error fetching capability summary:', error);
|
||||
setIsCapabilitySummaryError(
|
||||
error instanceof Error ? error.message : 'Failed to fetch capability summary',
|
||||
);
|
||||
} finally {
|
||||
setIsCapabilitySummaryLoading(false);
|
||||
}
|
||||
};
|
||||
|
||||
fetchCapabilitySummary();
|
||||
}, []);
|
||||
|
||||
const getPluginCapabilities = (platformId: string): PluginDetails | null => {
|
||||
if (!capabilitySummary?.plugin_details?.[platformId]) {
|
||||
return null;
|
||||
}
|
||||
return capabilitySummary.plugin_details[platformId];
|
||||
};
|
||||
const isCapabilitySupported = (platformId: string, capabilityName: string): boolean => {
|
||||
const capabilities = getPluginCapabilities(platformId)?.capabilities;
|
||||
if (!capabilities) {
|
||||
return false;
|
||||
}
|
||||
return capabilities?.some((capability) => capability.capability === capabilityName && capability.supported);
|
||||
};
|
||||
|
||||
const isProfilingSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'DATA_PROFILING');
|
||||
// const isLineageSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'LINEAGE_COARSE');
|
||||
// const isFineGrainedLineageSupported = (platformId: string): boolean =>
|
||||
// isCapabilitySupported(platformId, 'LINEAGE_FINE');
|
||||
// const isUsageStatsSupported = (platformId: string): boolean => isCapabilitySupported(platformId, 'USAGE_STATS');
|
||||
|
||||
if (!isCapabilitySummaryLoading && !isCapabilitySummaryError) {
|
||||
console.log(
|
||||
'Example to be removed when is actually used for something is profiling support for bigquery',
|
||||
isProfilingSupported('bigquery'),
|
||||
);
|
||||
}
|
||||
|
||||
// defaultTab might not be calculated correctly on mount, if `config` or `me` haven't been loaded yet
|
||||
useEffect(() => {
|
||||
if (loaded && me.loaded && !showIngestionTab && selectedTab === TabType.Sources) {
|
||||
|
||||
@ -112,16 +112,16 @@ task modelDocUpload(type: Exec, dependsOn: [modelDocGen]) {
|
||||
task lint(type: Exec, dependsOn: installDev) {
|
||||
commandLine 'bash', '-c',
|
||||
venv_activate_command +
|
||||
"ruff check src/ tests/ examples/ && " +
|
||||
"ruff format --check src/ tests/ examples/ && " +
|
||||
"ruff check scripts/capability_summary.py src/ tests/ examples/ && " +
|
||||
"ruff format --check scripts/capability_summary.py src/ tests/ examples/ && " +
|
||||
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
|
||||
}
|
||||
|
||||
task lintFix(type: Exec, dependsOn: installDev) {
|
||||
commandLine 'bash', '-c',
|
||||
venv_activate_command +
|
||||
"ruff check --fix src/ tests/ examples/ && " +
|
||||
"ruff format src/ tests/ examples/ "
|
||||
"ruff check --fix scripts/capability_summary.py src/ tests/ examples/ && " +
|
||||
"ruff format scripts/capability_summary.py src/ tests/ examples/ "
|
||||
}
|
||||
|
||||
def pytest_default_env = "PYTHONDEVMODE=1"
|
||||
@ -191,7 +191,23 @@ task testFull(type: Exec, dependsOn: [installDevTest]) {
|
||||
task specGen(type: Exec, dependsOn: [codegen, installDevTest]) {
|
||||
commandLine 'bash', '-c', "${venv_activate_command} ./scripts/specgen.sh"
|
||||
}
|
||||
|
||||
task capabilitySummary(type: Exec, dependsOn: [codegen, installDevTest]) {
|
||||
inputs.files(
|
||||
file('scripts/capability_summary.py'),
|
||||
file('scripts/docgen_types.py'),
|
||||
project.fileTree(dir: "src/datahub/ingestion/source", include: "**/*.py")
|
||||
)
|
||||
commandLine 'bash', '-c', "${venv_activate_command} python scripts/capability_summary.py --output-dir ./src/datahub/ingestion/autogenerated"
|
||||
}
|
||||
|
||||
task docGen(type: Exec, dependsOn: [codegen, installDevTest, specGen]) {
|
||||
def sourceFile = file('./src/datahub/ingestion/autogenerated/capability_summary.json')
|
||||
if (!sourceFile.exists()) {
|
||||
// Doing it like this cuts docGen time from 15 seconds to 9 seconds locally
|
||||
// This can further reduce if we generate more things in the future
|
||||
dependsOn capabilitySummary
|
||||
}
|
||||
commandLine 'bash', '-c', "${venv_activate_command} ./scripts/docgen.sh"
|
||||
}
|
||||
|
||||
|
||||
234
metadata-ingestion/scripts/capability_summary.py
Normal file
234
metadata-ingestion/scripts/capability_summary.py
Normal file
@ -0,0 +1,234 @@
|
||||
import dataclasses
|
||||
import json
|
||||
import logging
|
||||
import pathlib
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, Optional
|
||||
|
||||
import click
|
||||
from docgen_types import Plugin
|
||||
|
||||
from datahub.ingestion.api.decorators import SupportStatus
|
||||
from datahub.ingestion.source.source_registry import source_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
DENY_LIST = {
|
||||
"snowflake-summary",
|
||||
"snowflake-queries",
|
||||
"bigquery-queries",
|
||||
}
|
||||
|
||||
|
||||
def load_plugin_capabilities(plugin_name: str) -> Optional[Plugin]:
|
||||
"""Load plugin capabilities without generating full documentation."""
|
||||
logger.debug(f"Loading capabilities for {plugin_name}")
|
||||
|
||||
try:
|
||||
class_or_exception = source_registry._ensure_not_lazy(plugin_name)
|
||||
if isinstance(class_or_exception, Exception):
|
||||
# Log the specific error but don't re-raise it
|
||||
logger.warning(f"Plugin {plugin_name} failed to load: {class_or_exception}")
|
||||
return None
|
||||
source_type = source_registry.get(plugin_name)
|
||||
logger.debug(f"Source class is {source_type}")
|
||||
|
||||
if hasattr(source_type, "get_platform_name"):
|
||||
platform_name = source_type.get_platform_name()
|
||||
else:
|
||||
platform_name = plugin_name.title()
|
||||
|
||||
platform_id = None
|
||||
if hasattr(source_type, "get_platform_id"):
|
||||
platform_id = source_type.get_platform_id()
|
||||
if platform_id is None:
|
||||
logger.warning(f"Platform ID not found for {plugin_name}")
|
||||
return None
|
||||
|
||||
plugin = Plugin(
|
||||
name=plugin_name,
|
||||
platform_id=platform_id,
|
||||
platform_name=platform_name,
|
||||
classname=".".join([source_type.__module__, source_type.__name__]),
|
||||
)
|
||||
|
||||
if hasattr(source_type, "get_support_status"):
|
||||
plugin.support_status = source_type.get_support_status()
|
||||
|
||||
if hasattr(source_type, "get_capabilities"):
|
||||
capabilities = list(source_type.get_capabilities())
|
||||
if capabilities:
|
||||
capabilities.sort(key=lambda x: x.capability.value)
|
||||
plugin.capabilities = capabilities
|
||||
else:
|
||||
logger.debug(f"No capabilities defined for {plugin_name}")
|
||||
plugin.capabilities = []
|
||||
else:
|
||||
logger.debug(f"No get_capabilities method for {plugin_name}")
|
||||
plugin.capabilities = []
|
||||
|
||||
return plugin
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load capabilities for {plugin_name}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class CapabilitySummary:
|
||||
"""Summary of capabilities across all plugins."""
|
||||
|
||||
plugin_details: Dict[str, Dict] # plugin_name -> detailed info
|
||||
|
||||
|
||||
def generate_capability_summary() -> CapabilitySummary:
|
||||
"""Generate a comprehensive summary of capabilities across all plugins."""
|
||||
|
||||
plugin_details: Dict[str, Dict] = {}
|
||||
|
||||
for plugin_name in sorted(source_registry.mapping.keys()):
|
||||
if plugin_name in DENY_LIST:
|
||||
logger.info(f"Skipping {plugin_name} as it is on the deny list")
|
||||
continue
|
||||
|
||||
plugin = load_plugin_capabilities(plugin_name)
|
||||
|
||||
if plugin is None:
|
||||
continue
|
||||
|
||||
plugin_details[plugin_name] = {
|
||||
"platform_id": plugin.platform_id,
|
||||
"platform_name": plugin.platform_name,
|
||||
"classname": plugin.classname,
|
||||
"support_status": plugin.support_status.name
|
||||
if plugin.support_status != SupportStatus.UNKNOWN
|
||||
else None,
|
||||
"capabilities": [],
|
||||
}
|
||||
|
||||
if plugin.capabilities:
|
||||
for cap_setting in plugin.capabilities:
|
||||
capability_name = cap_setting.capability.name
|
||||
|
||||
plugin_details[plugin_name]["capabilities"].append(
|
||||
{
|
||||
"capability": capability_name,
|
||||
"supported": cap_setting.supported,
|
||||
"description": cap_setting.description,
|
||||
}
|
||||
)
|
||||
|
||||
return CapabilitySummary(
|
||||
plugin_details=plugin_details,
|
||||
)
|
||||
|
||||
|
||||
def save_capability_report(summary: CapabilitySummary, output_dir: str) -> None:
|
||||
"""Save the capability summary as JSON files, but only write if contents have changed."""
|
||||
|
||||
output_path = pathlib.Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
existing_capabilities = {}
|
||||
existing_summary_file = pathlib.Path(
|
||||
"./src/datahub/ingestion/autogenerated/capability_summary.json"
|
||||
)
|
||||
if existing_summary_file.exists():
|
||||
try:
|
||||
with open(existing_summary_file, "r") as f:
|
||||
existing_data = json.load(f)
|
||||
existing_capabilities = existing_data.get("plugin_details", {})
|
||||
logger.info(
|
||||
f"Loaded existing capability data for {len(existing_capabilities)} plugins"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load existing capability data: {e}")
|
||||
|
||||
missing_plugins = set(existing_capabilities.keys()) - set(
|
||||
summary.plugin_details.keys()
|
||||
)
|
||||
for plugin_name in missing_plugins:
|
||||
logger.warning(
|
||||
f"Plugin {plugin_name} failed to load, using existing capability data as fallback. Manually remove from capability_summary.json if you want to remove it from the report."
|
||||
)
|
||||
summary.plugin_details[plugin_name] = existing_capabilities[plugin_name]
|
||||
|
||||
summary_dict = dataclasses.asdict(summary)
|
||||
summary_dict["generated_by"] = "metadata-ingestion/scripts/capability_summary.py"
|
||||
summary_dict["generated_at"] = datetime.now(timezone.utc).isoformat()
|
||||
summary_json = json.dumps(summary_dict, indent=2, sort_keys=True)
|
||||
|
||||
summary_file = output_path / "capability_summary.json"
|
||||
write_file = True
|
||||
if summary_file.exists():
|
||||
try:
|
||||
with open(summary_file, "r") as f:
|
||||
existing_data = json.load(f)
|
||||
|
||||
# Create copies without generated_at for comparison
|
||||
existing_for_comparison = existing_data.copy()
|
||||
new_for_comparison = summary_dict.copy()
|
||||
existing_for_comparison.pop("generated_at", None)
|
||||
new_for_comparison.pop("generated_at", None)
|
||||
|
||||
if json.dumps(
|
||||
existing_for_comparison, indent=2, sort_keys=True
|
||||
) == json.dumps(new_for_comparison, indent=2, sort_keys=True):
|
||||
logger.info(f"No changes detected in {summary_file}, skipping write.")
|
||||
write_file = False
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not read existing summary file: {e}")
|
||||
if write_file:
|
||||
with open(summary_file, "w") as f:
|
||||
f.write(summary_json)
|
||||
logger.info(f"Capability summary saved to {summary_file}")
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option(
|
||||
"--output-dir",
|
||||
type=str,
|
||||
default="./autogenerated",
|
||||
help="Output directory for capability reports",
|
||||
)
|
||||
@click.option(
|
||||
"--source",
|
||||
type=str,
|
||||
required=False,
|
||||
help="Generate report for specific source only",
|
||||
)
|
||||
def generate_capability_report(output_dir: str, source: Optional[str] = None) -> None:
|
||||
"""Generate a comprehensive capability report for all ingestion sources."""
|
||||
|
||||
logger.info("Starting capability report generation...")
|
||||
|
||||
if source:
|
||||
if source not in source_registry.mapping:
|
||||
logger.error(f"Source '{source}' not found in registry")
|
||||
return
|
||||
original_mapping = source_registry.mapping.copy()
|
||||
source_registry.mapping = {source: original_mapping[source]}
|
||||
|
||||
try:
|
||||
summary = generate_capability_summary()
|
||||
save_capability_report(summary, output_dir)
|
||||
|
||||
print("Capability Report Generation Complete")
|
||||
print("=====================================")
|
||||
print(f"Total plugins processed: {len(summary.plugin_details)}")
|
||||
print(f"Plugins with capabilities: {len(summary.plugin_details)}")
|
||||
print(f"Output directory: {output_dir}")
|
||||
|
||||
finally:
|
||||
if source:
|
||||
source_registry.mapping = original_mapping
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[%(asctime)s %(levelname)-8s {%(name)s:%(lineno)d}] - %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S %Z",
|
||||
)
|
||||
generate_capability_report()
|
||||
@ -15,7 +15,7 @@ from docgen_types import Platform, Plugin
|
||||
from docs_config_table import gen_md_table_from_json_schema
|
||||
|
||||
from datahub.configuration.common import ConfigModel
|
||||
from datahub.ingestion.api.decorators import SourceCapability, SupportStatus
|
||||
from datahub.ingestion.api.decorators import SourceCapability, SupportStatus, CapabilitySetting
|
||||
from datahub.ingestion.source.source_registry import source_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -68,6 +68,20 @@ def get_capability_text(src_capability: SourceCapability) -> str:
|
||||
)
|
||||
|
||||
|
||||
def map_capability_name_to_enum(capability_name: str) -> SourceCapability:
|
||||
"""
|
||||
Maps capability names from the JSON file to SourceCapability enum values.
|
||||
The JSON file uses enum names (e.g., "DATA_PROFILING") but the enum expects values (e.g., "Data Profiling").
|
||||
"""
|
||||
try:
|
||||
return SourceCapability[capability_name]
|
||||
except KeyError:
|
||||
try:
|
||||
return SourceCapability(capability_name)
|
||||
except ValueError:
|
||||
raise ValueError(f"Unknown capability name: {capability_name}")
|
||||
|
||||
|
||||
def does_extra_exist(extra_name: str) -> bool:
|
||||
for key, value in metadata("acryl-datahub").items():
|
||||
if key == "Provides-Extra" and value == extra_name:
|
||||
@ -129,84 +143,102 @@ def rewrite_markdown(file_contents: str, path: str, relocated_path: str) -> str:
|
||||
return new_content
|
||||
|
||||
|
||||
def load_plugin(plugin_name: str, out_dir: str) -> Plugin:
|
||||
logger.debug(f"Loading {plugin_name}")
|
||||
class_or_exception = source_registry._ensure_not_lazy(plugin_name)
|
||||
if isinstance(class_or_exception, Exception):
|
||||
raise class_or_exception
|
||||
source_type = source_registry.get(plugin_name)
|
||||
logger.debug(f"Source class is {source_type}")
|
||||
def load_capability_data(capability_summary_path: str) -> Dict:
|
||||
"""Load capability data from the capability summary JSON file."""
|
||||
try:
|
||||
with open(capability_summary_path, 'r') as f:
|
||||
return json.load(f)
|
||||
except FileNotFoundError:
|
||||
logger.error(f"Capability summary file not found: {capability_summary_path}")
|
||||
raise
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse capability summary JSON: {e}")
|
||||
raise
|
||||
|
||||
if hasattr(source_type, "get_platform_name"):
|
||||
platform_name = source_type.get_platform_name()
|
||||
else:
|
||||
platform_name = (
|
||||
plugin_name.title()
|
||||
) # we like platform names to be human readable
|
||||
|
||||
platform_id = None
|
||||
if hasattr(source_type, "get_platform_id"):
|
||||
platform_id = source_type.get_platform_id()
|
||||
if platform_id is None:
|
||||
raise ValueError(f"Platform ID not found for {plugin_name}")
|
||||
|
||||
def create_plugin_from_capability_data(plugin_name: str, plugin_data: Dict, out_dir: str) -> Plugin:
|
||||
"""Create a Plugin object from capability data."""
|
||||
plugin = Plugin(
|
||||
name=plugin_name,
|
||||
platform_id=platform_id,
|
||||
platform_name=platform_name,
|
||||
classname=".".join([source_type.__module__, source_type.__name__]),
|
||||
platform_id=plugin_data["platform_id"],
|
||||
platform_name=plugin_data["platform_name"],
|
||||
classname=plugin_data["classname"],
|
||||
)
|
||||
|
||||
if hasattr(source_type, "get_platform_doc_order"):
|
||||
platform_doc_order = source_type.get_platform_doc_order()
|
||||
plugin.doc_order = platform_doc_order
|
||||
|
||||
plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
|
||||
if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
|
||||
plugin_file_name = plugin_file_name + "/__init__.py"
|
||||
else:
|
||||
plugin_file_name = plugin_file_name + ".py"
|
||||
if os.path.exists(plugin_file_name):
|
||||
plugin.filename = plugin_file_name
|
||||
else:
|
||||
logger.info(
|
||||
f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist"
|
||||
)
|
||||
|
||||
if hasattr(source_type, "__doc__"):
|
||||
plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "")
|
||||
|
||||
if hasattr(source_type, "get_support_status"):
|
||||
plugin.support_status = source_type.get_support_status()
|
||||
|
||||
if hasattr(source_type, "get_capabilities"):
|
||||
capabilities = list(source_type.get_capabilities())
|
||||
capabilities.sort(key=lambda x: x.capability.value)
|
||||
|
||||
# Set support status
|
||||
if plugin_data.get("support_status"):
|
||||
plugin.support_status = SupportStatus[plugin_data["support_status"]]
|
||||
|
||||
# Set capabilities
|
||||
if plugin_data.get("capabilities"):
|
||||
capabilities = []
|
||||
for cap_data in plugin_data["capabilities"]:
|
||||
capability = map_capability_name_to_enum(cap_data["capability"])
|
||||
capabilities.append(CapabilitySetting(
|
||||
capability=capability,
|
||||
supported=cap_data["supported"],
|
||||
description=cap_data["description"]
|
||||
))
|
||||
plugin.capabilities = capabilities
|
||||
|
||||
|
||||
# Load additional plugin information that's not in capability summary
|
||||
try:
|
||||
extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
|
||||
plugin.extra_deps = (
|
||||
get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
|
||||
)
|
||||
# Load source class to get additional metadata
|
||||
class_or_exception = source_registry._ensure_not_lazy(plugin_name)
|
||||
if isinstance(class_or_exception, Exception):
|
||||
raise class_or_exception
|
||||
source_type = source_registry.get(plugin_name)
|
||||
|
||||
# Get doc order
|
||||
if hasattr(source_type, "get_platform_doc_order"):
|
||||
platform_doc_order = source_type.get_platform_doc_order()
|
||||
plugin.doc_order = platform_doc_order
|
||||
|
||||
# Get filename
|
||||
plugin_file_name = "src/" + "/".join(source_type.__module__.split("."))
|
||||
if os.path.exists(plugin_file_name) and os.path.isdir(plugin_file_name):
|
||||
plugin_file_name = plugin_file_name + "/__init__.py"
|
||||
else:
|
||||
plugin_file_name = plugin_file_name + ".py"
|
||||
if os.path.exists(plugin_file_name):
|
||||
plugin.filename = plugin_file_name
|
||||
else:
|
||||
logger.info(
|
||||
f"Failed to locate filename for {plugin_name}. Guessed {plugin_file_name}, but that doesn't exist"
|
||||
)
|
||||
|
||||
# Get docstring
|
||||
if hasattr(source_type, "__doc__"):
|
||||
plugin.source_docstring = textwrap.dedent(source_type.__doc__ or "")
|
||||
|
||||
# Get extra dependencies
|
||||
try:
|
||||
extra_plugin = plugin_name if does_extra_exist(plugin_name) else None
|
||||
plugin.extra_deps = (
|
||||
get_additional_deps_for_extra(extra_plugin) if extra_plugin else []
|
||||
)
|
||||
except Exception as e:
|
||||
logger.info(
|
||||
f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e
|
||||
)
|
||||
|
||||
# Get config class
|
||||
if hasattr(source_type, "get_config_class"):
|
||||
source_config_class: ConfigModel = source_type.get_config_class()
|
||||
|
||||
plugin.config_json_schema = source_config_class.schema_json(indent=2)
|
||||
plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)
|
||||
|
||||
# Write the config json schema to the out_dir.
|
||||
config_dir = pathlib.Path(out_dir) / "config_schemas"
|
||||
config_dir.mkdir(parents=True, exist_ok=True)
|
||||
(config_dir / f"{plugin_name}_config.json").write_text(
|
||||
plugin.config_json_schema
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.info(
|
||||
f"Failed to load extras for {plugin_name} due to exception {e}", exc_info=e
|
||||
)
|
||||
|
||||
if hasattr(source_type, "get_config_class"):
|
||||
source_config_class: ConfigModel = source_type.get_config_class()
|
||||
|
||||
plugin.config_json_schema = source_config_class.schema_json(indent=2)
|
||||
plugin.config_md = gen_md_table_from_json_schema(source_config_class.schema(), current_source=plugin_name)
|
||||
|
||||
# Write the config json schema to the out_dir.
|
||||
config_dir = pathlib.Path(out_dir) / "config_schemas"
|
||||
config_dir.mkdir(parents=True, exist_ok=True)
|
||||
(config_dir / f"{plugin_name}_config.json").write_text(
|
||||
plugin.config_json_schema
|
||||
)
|
||||
|
||||
logger.warning(f"Failed to load additional metadata for {plugin_name}: {e}")
|
||||
|
||||
return plugin
|
||||
|
||||
|
||||
@ -227,15 +259,25 @@ class PlatformMetrics:
|
||||
|
||||
@click.command()
|
||||
@click.option("--out-dir", type=str, required=True)
|
||||
@click.option("--capability-summary", type=str, required=True, help="Path to capability summary JSON file")
|
||||
@click.option("--extra-docs", type=str, required=False)
|
||||
@click.option("--source", type=str, required=False)
|
||||
def generate(
|
||||
out_dir: str, extra_docs: Optional[str] = None, source: Optional[str] = None
|
||||
out_dir: str, capability_summary: str, extra_docs: Optional[str] = None, source: Optional[str] = None
|
||||
) -> None: # noqa: C901
|
||||
plugin_metrics = PluginMetrics()
|
||||
platform_metrics = PlatformMetrics()
|
||||
|
||||
platforms: Dict[str, Platform] = {}
|
||||
|
||||
# Load capability data
|
||||
try:
|
||||
capability_data = load_capability_data(capability_summary)
|
||||
logger.info(f"Loaded capability data from {capability_summary}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load capability data: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
for plugin_name in sorted(source_registry.mapping.keys()):
|
||||
if source and source != plugin_name:
|
||||
continue
|
||||
@ -250,7 +292,14 @@ def generate(
|
||||
|
||||
plugin_metrics.discovered += 1
|
||||
try:
|
||||
plugin = load_plugin(plugin_name, out_dir=out_dir)
|
||||
if plugin_name in capability_data.get("plugin_details", {}):
|
||||
# Use capability data
|
||||
plugin_data = capability_data["plugin_details"][plugin_name]
|
||||
plugin = create_plugin_from_capability_data(plugin_name, plugin_data, out_dir=out_dir)
|
||||
else:
|
||||
logger.error(f"Plugin {plugin_name} not found in capability data")
|
||||
plugin_metrics.failed += 1
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to load {plugin_name} due to exception {e}", exc_info=e
|
||||
@ -531,7 +580,7 @@ By default, the UI shows the latest version of the lineage. The time picker can
|
||||
In this example, data flows from Airflow/BigQuery to Snowflake tables, then to the Hive dataset, and ultimately to the features of Machine Learning Models.
|
||||
|
||||
|
||||
:::tip The Lineage Tab is greyed out - why can’t I click on it?
|
||||
:::tip The Lineage Tab is greyed out - why can't I click on it?
|
||||
This means you have not yet ingested lineage metadata for that entity. Please ingest lineage to proceed.
|
||||
|
||||
:::
|
||||
@ -666,7 +715,7 @@ This is a summary of automatic lineage extraction support in our data source. Pl
|
||||
|
||||
### SQL Parser Lineage Extraction
|
||||
|
||||
If you’re using a different database system for which we don’t support column-level lineage out of the box, but you do have a database query log available,
|
||||
If you're using a different database system for which we don't support column-level lineage out of the box, but you do have a database query log available,
|
||||
we have a SQL queries connector that generates column-level lineage and detailed table usage statistics from the query log.
|
||||
|
||||
If these does not suit your needs, you can use the new `DataHubGraph.parse_sql_lineage()` method in our SDK. (See the source code [here](https://docs.datahub.com/docs/python-sdk/clients/graph-client))
|
||||
|
||||
@ -5,6 +5,7 @@ set -euo pipefail
|
||||
DATAHUB_ROOT=..
|
||||
DOCS_OUT_DIR=$DATAHUB_ROOT/docs/generated/ingestion
|
||||
EXTRA_DOCS_DIR=$DATAHUB_ROOT/metadata-ingestion/docs/sources
|
||||
CAPABILITY_SUMMARY_FILE=./src/datahub/ingestion/autogenerated/capability_summary.json
|
||||
|
||||
rm -r $DOCS_OUT_DIR || true
|
||||
python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --extra-docs ${EXTRA_DOCS_DIR} $@
|
||||
python scripts/docgen.py --out-dir ${DOCS_OUT_DIR} --capability-summary ${CAPABILITY_SUMMARY_FILE} --extra-docs ${EXTRA_DOCS_DIR} $@
|
||||
|
||||
@ -104,6 +104,7 @@ def capability(
|
||||
for base in cls.__bases__
|
||||
):
|
||||
cls.__capabilities = {}
|
||||
|
||||
cls.get_capabilities = lambda: cls.__capabilities.values()
|
||||
|
||||
# If the superclasses have capability annotations, copy those over.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
|
||||
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION,
|
||||
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
|
||||
"Enabled by default via stateful ingestion",
|
||||
supported=True,
|
||||
)
|
||||
class CassandraSource(StatefulIngestionSourceBase):
|
||||
|
||||
@ -167,7 +167,7 @@ class AzureADSourceReport(StaleEntityRemovalSourceReport):
|
||||
@config_class(AzureADConfig)
|
||||
@support_status(SupportStatus.CERTIFIED)
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
|
||||
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
||||
)
|
||||
class AzureADSource(StatefulIngestionSourceBase):
|
||||
"""
|
||||
|
||||
@ -202,7 +202,7 @@ class OktaSourceReport(StaleEntityRemovalSourceReport):
|
||||
@support_status(SupportStatus.CERTIFIED)
|
||||
@capability(SourceCapability.DESCRIPTIONS, "Optionally enabled via configuration")
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
|
||||
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
||||
)
|
||||
class OktaSource(StatefulIngestionSourceBase):
|
||||
"""
|
||||
|
||||
@ -71,7 +71,7 @@ class PresetConfig(SupersetConfig):
|
||||
@config_class(PresetConfig)
|
||||
@support_status(SupportStatus.CERTIFIED)
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
|
||||
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
||||
)
|
||||
class PresetSource(SupersetSource):
|
||||
"""
|
||||
|
||||
@ -118,7 +118,7 @@ logger: logging.Logger = logging.getLogger(__name__)
|
||||
)
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION,
|
||||
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
|
||||
"Enabled by default via stateful ingestion",
|
||||
supported=True,
|
||||
)
|
||||
@capability(
|
||||
|
||||
@ -116,7 +116,7 @@ class VerticaConfig(BasicSQLAlchemyConfig):
|
||||
)
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION,
|
||||
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
|
||||
"Enabled by default via stateful ingestion",
|
||||
supported=True,
|
||||
)
|
||||
class VerticaSource(SQLAlchemySource):
|
||||
|
||||
@ -179,7 +179,7 @@ class StatefulIngestionReport(SourceReport):
|
||||
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION,
|
||||
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
|
||||
"Enabled by default via stateful ingestion",
|
||||
supported=True,
|
||||
)
|
||||
class StatefulIngestionSourceBase(Source):
|
||||
|
||||
@ -272,7 +272,7 @@ def get_filter_name(filter_obj):
|
||||
@config_class(SupersetConfig)
|
||||
@support_status(SupportStatus.CERTIFIED)
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
|
||||
SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
|
||||
)
|
||||
@capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
|
||||
@capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
|
||||
|
||||
@ -159,7 +159,7 @@ logger: logging.Logger = logging.getLogger(__name__)
|
||||
)
|
||||
@capability(
|
||||
SourceCapability.DELETION_DETECTION,
|
||||
"Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
|
||||
"Enabled by default via stateful ingestion",
|
||||
supported=True,
|
||||
)
|
||||
@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user