OpenMetadata/scripts/deploy-pipelines.py

"""
Deploy Ingestion Pipelines Script
This script uses the OpenMetadata client to:
1. Paginate over ingestion pipelines in groups of 20
2. Fetch the IDs of those IngestionPipelines
3. Send bulk deploy requests to api/v1/services/ingestionPipelines/bulk/deploy
4. Track responses to monitor deployment success/failure
"""
import argparse
import json
import logging
import sys
from typing import Dict, List

from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
    OpenMetadataConnection,
)
from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
    IngestionPipeline,
)
from metadata.generated.schema.entity.services.ingestionPipelines.pipelineServiceClientResponse import (
    PipelineServiceClientResponse,
)
from metadata.generated.schema.security.client.openMetadataJWTClientConfig import (
    OpenMetadataJWTClientConfig,
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata
from metadata.ingestion.ometa.utils import model_str

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)


class PipelineDeployer:
    """Class to handle bulk deployment of ingestion pipelines"""

    def __init__(self, server_url: str, jwt_token: str):
        """
        Initialize the PipelineDeployer with an OpenMetadata connection

        Args:
            server_url: OpenMetadata server URL
            jwt_token: JWT token for authentication
        """
        # Remove trailing slash if present
        self.server_url = server_url.rstrip("/")
        # Configure the OpenMetadata connection
        server_config = OpenMetadataConnection(
            hostPort=self.server_url,
            authProvider="openmetadata",
            securityConfig=OpenMetadataJWTClientConfig(jwtToken=jwt_token),
        )
        self.metadata = OpenMetadata(server_config)
        logger.info(f"Connected to OpenMetadata server: {server_url}")
    def bulk_deploy_pipelines(
        self, pipeline_ids: List[str]
    ) -> List[PipelineServiceClientResponse]:
        """
        Send a bulk deploy request to the OpenMetadata API

        Args:
            pipeline_ids: List of pipeline UUID strings to deploy

        Returns:
            List of PipelineServiceClientResponse objects
        """
        if not pipeline_ids:
            logger.warning("No pipeline IDs provided for deployment")
            return []

        logger.info(f"Deploying {len(pipeline_ids)} pipelines...")
        try:
            # POST the list of pipeline IDs to the bulk deploy endpoint
            response = self.metadata.client.post(
                "/services/ingestionPipelines/bulk/deploy",
                data=json.dumps(pipeline_ids),
            )
            if response:
                # Parse the response payload into Pydantic models
                parsed_responses = [
                    PipelineServiceClientResponse.model_validate(item)
                    for item in response
                ]
                logger.info(
                    f"Bulk deploy request completed with {len(parsed_responses)} responses"
                )
                return parsed_responses
            logger.error("No response from bulk deploy API")
            return []
        except Exception as e:
            logger.error(f"Error during bulk deployment: {e}")
            raise
    def analyze_deployment_results(
        self,
        responses: List[PipelineServiceClientResponse],
        pipeline_ids: List[str],
    ) -> Dict[str, int]:
        """
        Analyze deployment responses to track success/failure

        Args:
            responses: List of PipelineServiceClientResponse objects
            pipeline_ids: List of pipeline IDs that were deployed (for correlation)

        Returns:
            Dictionary with deployment statistics
        """
        stats = {
            "total": len(responses),
            "success": 0,
            "failed": 0,
            "unknown": 0,
        }
        success_codes = {200, 201}  # HTTP success codes
        # Correlate responses with pipeline IDs (assuming the same order)
        for i, response in enumerate(responses):
            pipeline_id = pipeline_ids[i] if i < len(pipeline_ids) else "unknown"
            code = response.code
            reason = response.reason or ""
            platform = response.platform
            version = response.version or "unknown"
            if code in success_codes:
                stats["success"] += 1
                logger.info(
                    f"✓ Pipeline {pipeline_id} deployed successfully on "
                    f"{platform} v{version} (code: {code})"
                )
            elif code:
                stats["failed"] += 1
                logger.warning(
                    f"✗ Pipeline {pipeline_id} deployment failed on "
                    f"{platform} v{version} (code: {code}): {reason}"
                )
            else:
                stats["unknown"] += 1
                logger.warning(
                    f"? Pipeline {pipeline_id} unknown deployment status on "
                    f"{platform}: {response.model_dump()}"
                )
        return stats
    def deploy_all_pipelines(self, batch_size: int = 20) -> Dict[str, int]:
        """
        Deploy all ingestion pipelines in batches

        Args:
            batch_size: Number of pipelines per bulk deploy request

        Returns:
            Dictionary with overall deployment statistics
        """
        try:
            # Fetch all pipelines (list_all_entities paginates internally)
            pipelines = list(
                self.metadata.list_all_entities(
                    entity=IngestionPipeline, skip_on_failure=True
                )
            )
            if not pipelines:
                logger.warning("No pipelines found to deploy")
                return {"total": 0, "success": 0, "failed": 0, "unknown": 0}

            logger.info("Found %d pipelines to deploy", len(pipelines))
            # Extract pipeline IDs as strings
            pipeline_ids = [model_str(pipeline.id) for pipeline in pipelines]
            if not pipeline_ids:
                logger.error("No valid pipeline IDs found")
                return {"total": 0, "success": 0, "failed": 0, "unknown": 0}

            # Deploy pipelines in batches
            all_responses = []
            total_batches = (len(pipeline_ids) + batch_size - 1) // batch_size
            for i in range(0, len(pipeline_ids), batch_size):
                batch = pipeline_ids[i : i + batch_size]
                batch_num = i // batch_size + 1
                remaining_batches = total_batches - batch_num
                logger.info(
                    f"Deploying batch {batch_num}/{total_batches} with "
                    f"{len(batch)} pipelines ({remaining_batches} batches remaining)..."
                )
                batch_responses = self.bulk_deploy_pipelines(batch)
                all_responses.extend(batch_responses)

                # Log the parsed responses for this batch
                logger.info(f"Batch {batch_num} responses:")
                for j, response in enumerate(batch_responses):
                    pipeline_id = batch[j] if j < len(batch) else "unknown"
                    logger.info(
                        f"  Pipeline {pipeline_id}: code={response.code}, "
                        f"platform={response.platform}, reason={response.reason or 'N/A'}"
                    )
                logger.info(
                    f"Batch {batch_num} completed ({batch_num}/{total_batches} batches processed)"
                )

            # Correlate all responses with the pipeline IDs that were sent
            stats = self.analyze_deployment_results(all_responses, pipeline_ids)
            logger.info("=== Deployment Summary ===")
            logger.info(f"Total pipelines: {stats['total']}")
            logger.info(f"Successfully deployed: {stats['success']}")
            logger.info(f"Failed deployments: {stats['failed']}")
            logger.info(f"Unknown status: {stats['unknown']}")
            return stats
        except Exception as e:
            logger.error(f"Deployment process failed: {e}")
            raise
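

# Programmatic usage sketch (a hypothetical example, assuming a reachable
# OpenMetadata server and a valid JWT token; the values are placeholders and
# the stats shown are illustrative, not real output):
#
#   deployer = PipelineDeployer("http://localhost:8585/api", "<jwt-token>")
#   stats = deployer.deploy_all_pipelines(batch_size=20)
#   print(stats)  # e.g. {"total": 40, "success": 38, "failed": 2, "unknown": 0}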


def main():
    """Main entry point for the pipeline deployment script"""
    parser = argparse.ArgumentParser(
        description="Deploy all ingestion pipelines using the OpenMetadata bulk deploy API"
    )
    parser.add_argument(
        "--server-url",
        required=True,
        help="OpenMetadata server URL (e.g., http://localhost:8585/api/)",
    )
    parser.add_argument(
        "--jwt-token",
        required=True,
        help="JWT token for authentication",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=20,
        help="Batch size for pagination and deployment (default: 20)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    # basicConfig already set INFO; only raise verbosity when requested
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        deployer = PipelineDeployer(args.server_url, args.jwt_token)
        stats = deployer.deploy_all_pipelines(batch_size=args.batch_size)
        # Exit with a non-zero code if any deployments failed
        sys.exit(1 if stats["failed"] > 0 else 0)
    except Exception as e:
        logger.error(f"Script failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()