From e1c1cca8a04ba4bf753b12c8680e3b8ba630c722 Mon Sep 17 00:00:00 2001
From: Paul Farault
Date: Tue, 16 Dec 2025 14:38:56 +0100
Subject: [PATCH] feat: recover from inconsistent state

Fixes: #694
---
 tdp/cli/__main__.py                           |  2 +
 tdp/cli/commands/deploy.py                    | 13 +++--
 tdp/cli/commands/deploy_danger_fix_running.py | 50 +++++++++++++++++++
 tdp/cli/utils.py                              |  5 +-
 tdp/core/models/deployment_model.py           | 39 +++++++++++++++
 5 files changed, 104 insertions(+), 5 deletions(-)
 create mode 100644 tdp/cli/commands/deploy_danger_fix_running.py

diff --git a/tdp/cli/__main__.py b/tdp/cli/__main__.py
index f5f0bc5c..7caabc9b 100644
--- a/tdp/cli/__main__.py
+++ b/tdp/cli/__main__.py
@@ -11,6 +11,7 @@
 from tdp.cli.commands.dag import dag
 from tdp.cli.commands.default_diff import default_diff
 from tdp.cli.commands.deploy import deploy
+from tdp.cli.commands.deploy_danger_fix_running import danger_fix_running
 from tdp.cli.commands.init import init
 from tdp.cli.commands.ops import ops
 from tdp.cli.commands.plan import plan
@@ -110,6 +111,7 @@ def cli():
 cli.add_command(dag)
 cli.add_command(default_diff)
 cli.add_command(deploy)
+deploy.add_command(danger_fix_running)
 cli.add_command(init)
 cli.add_command(ops)
 cli.add_command(plan)
diff --git a/tdp/cli/commands/deploy.py b/tdp/cli/commands/deploy.py
index 87912d59..baaddbbc 100644
--- a/tdp/cli/commands/deploy.py
+++ b/tdp/cli/commands/deploy.py
@@ -21,7 +21,8 @@
     from tdp.core.collections import Collections
 
 
-@click.command()
+@click.group(invoke_without_command=True)
+@click.pass_context
 @click.option(
     "--force-stale-update",
     "--fsu",
@@ -40,7 +41,13 @@
 )
 @validate_option
 @vars_option
-def deploy(
+def deploy(ctx: click.Context, *args, **kwargs):
+    """Execute a planned deployment."""
+    if ctx.invoked_subcommand is None:
+        _handle_deploy(*args, **kwargs)
+
+
+def _handle_deploy(
     dry: bool,
     collections: Collections,
     db_engine: Engine,
@@ -49,8 +56,6 @@ def deploy(
     validate: bool,
     vars: Path,
 ):
-    """Execute a planned deployment."""
-
     from tdp.cli.utils import check_services_cleanliness
     from tdp.core.deployment import DeploymentRunner, Executor
     from tdp.core.models.enums import DeploymentStateEnum
diff --git a/tdp/cli/commands/deploy_danger_fix_running.py b/tdp/cli/commands/deploy_danger_fix_running.py
new file mode 100644
index 00000000..90ddc014
--- /dev/null
+++ b/tdp/cli/commands/deploy_danger_fix_running.py
@@ -0,0 +1,50 @@
+# Copyright 2022 TOSIT.IO
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import click
+
+from tdp.cli.params import database_dsn_option
+
+if TYPE_CHECKING:
+    from sqlalchemy import Engine
+
+
+@click.command("danger-fix-running")
+@database_dsn_option
+def danger_fix_running(
+    db_engine: Engine,
+):
+    """Fix the last deployment state if left as RUNNING.
+
+    DANGER: Only use this command if the database has been left in an incorrect state,
+    where the last deployment state is 'RUNNING' while it is not the case. Ensure
+    that no Ansible deployment is indeed running.
+
+    This command will only override the last deployment state to set it as 'FAILURE'.
+    No Ansible command will be executed.
+    """
+
+    from tdp.core.models.deployment_model import NothingToFixError
+    from tdp.dao import Dao
+
+    with Dao(db_engine) as dao:
+        last_deployment = dao.get_last_deployment()
+        if last_deployment is None:
+            raise click.ClickException("No deployment found.")
+
+        try:
+            last_deployment.fix_running()
+        except NothingToFixError as e:
+            raise click.ClickException(
+                "Nothing to fix: last deployment is not in a RUNNING state."
+            ) from e
+        else:
+            dao.session.commit()
+            click.echo(
+                "Last deployment has been successfully set to FAILURE. Use "
+                "'tdp plan resume' to generate a new deployment plan based on it."
+            )
diff --git a/tdp/cli/utils.py b/tdp/cli/utils.py
index ebd6a35c..17f31515 100644
--- a/tdp/cli/utils.py
+++ b/tdp/cli/utils.py
@@ -219,7 +219,10 @@ def validate_plan_creation(
         raise click.ClickException(
             "Last deployment is in a RUNNING state. Wait for it to finish "
             "before planning a new deployment.\n\n"
-            "Use '--force' to create a plan anyway (not recommended)."
+            "Use '--force' to create a plan anyway (not recommended).\n"
+            "If no Ansible deployment is running it means that the database has been "
+            "left in an incorrect state. Use 'tdp deploy danger-fix-running' to fix "
+            "the last deployment state from RUNNING to FAILURE."
         )
 
     raise click.ClickException("Unknown deployment state.")
diff --git a/tdp/core/models/deployment_model.py b/tdp/core/models/deployment_model.py
index b10b2d60..e8cdcdcf 100644
--- a/tdp/core/models/deployment_model.py
+++ b/tdp/core/models/deployment_model.py
@@ -54,6 +54,10 @@ class NothingToResumeError(Exception):
 class NothingToDeployError(Exception):
     pass
 
+class NothingToFixError(Exception):
+    pass
+
+
 class MissingOperationError(Exception):
     def __init__(self, operation_name: str):
         self.operation_name = operation_name
@@ -545,6 +549,41 @@ def start_running(self) -> None:
             operation.state = OperationStateEnum.PENDING
         self.start_time = datetime.utcnow()
 
+    def fix_running(self) -> None:
+        """Set a deployment wrongly left as RUNNING to FAILURE.
+
+        Operations are scanned in order: the first one found RUNNING is set to
+        FAILURE (or to HELD if it is PENDING) and every operation after it is HELD.
+
+        Raises:
+            NothingToFixError: If the deployment is not in a RUNNING state.
+        """
+        # Only RUNNING deployment can be fixed
+        if self.state != DeploymentStateEnum.RUNNING:
+            raise NothingToFixError()
+
+        held = False
+        for operation in self.operations:
+            # Every operation after the first unfinished one is set to HELD
+            if held:
+                operation.state = OperationStateEnum.HELD
+                continue
+
+            # If an operation is RUNNING, set it to FAILURE
+            if operation.state == OperationStateEnum.RUNNING:
+                operation.state = OperationStateEnum.FAILURE
+                held = True
+            # If an operation is PENDING, set it to HELD
+            elif operation.state == OperationStateEnum.PENDING:
+                operation.state = OperationStateEnum.HELD
+                held = True
+            # If an operation is HELD, leave it as is
+            elif operation.state == OperationStateEnum.HELD:
+                held = True
+
+        # Update deployment to FAILURE
+        self.state = DeploymentStateEnum.FAILURE
+
 
 def _filter_falsy_options(options: dict) -> dict:
     """Get options without falsy values.