From 5fb720be6da8b421a4889ffd2a454a20f7156bb1 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 19 Jan 2026 06:42:55 +0000 Subject: [PATCH 001/159] disbale default user and add charmed-operator user and password generation on leader elected --- src/core/base_workload.py | 10 ++++++++++ src/core/cluster_state.py | 19 +++++++++++++++++++ src/core/models.py | 13 +++++++++++++ src/events/base_events.py | 24 +++++++++++++++++++++++- src/literals.py | 4 ++++ src/managers/config.py | 31 +++++++++++++++++++++++++++++++ src/workload_k8s.py | 11 +++++++++++ 7 files changed, 111 insertions(+), 1 deletion(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 452d52c..bed9210 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -29,3 +29,13 @@ def write_config_file(self, config: dict[str, str]) -> None: config (dict): The config properties to be written. """ pass + + @abstractmethod + def write_file(self, content: str, path: str) -> None: + """Write content to a file on disk. + + Args: + content (str): The content to be written. + path (str): The file path where the content should be written. + """ + pass diff --git a/src/core/cluster_state.py b/src/core/cluster_state.py index fd53cee..6f62510 100644 --- a/src/core/cluster_state.py +++ b/src/core/cluster_state.py @@ -99,3 +99,22 @@ def servers(self) -> set[ValkeyServer]: servers.add(self.unit_server) return servers + + def get_secret_from_id(self, secret_id: str) -> dict[str, str]: + """Resolve the given id of a Juju secret and return the content as a dict. + + Args: + model (Model): Model object. + secret_id (str): The id of the secret. + + Returns: + dict: The content of the secret. 
+ """ + try: + secret_content = self.charm.model.get_secret(id=secret_id).get_content(refresh=True) + except ops.SecretNotFoundError: + raise ops.SecretNotFoundError(f"The secret '{secret_id}' does not exist.") + except ops.ModelError: + raise + + return secret_content diff --git a/src/core/models.py b/src/core/models.py index 9bd1b76..95f777f 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -9,6 +9,7 @@ import ops from charms.data_platform_libs.v1.data_interfaces import ( + ExtraSecretStr, OpsOtherPeerUnitRepositoryInterface, OpsPeerRepositoryInterface, OpsPeerUnitRepositoryInterface, @@ -16,12 +17,16 @@ ) from pydantic import Field +from literals import INTERNAL_USER + logger = logging.getLogger(__name__) class PeerAppModel(PeerModel): """Model for the peer application data.""" + charmed_operator_password: ExtraSecretStr = Field(default="") + class PeerUnitModel(PeerModel): """Model for the peer unit data.""" @@ -119,3 +124,11 @@ def __init__( def model(self) -> PeerAppModel | None: """The peer relation model for this application.""" return self.data_interface.build_model(self.relation.id) if self.relation else None + + @property + def internal_user_credentials(self) -> dict[str, str]: + """Retrieve the credentials for the internal admin user.""" + if self.model and (password := self.model.charmed_operator_password): + return {INTERNAL_USER: password} + + return {} diff --git a/src/events/base_events.py b/src/events/base_events.py index 924c9d6..3ef79c0 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -9,7 +9,7 @@ import ops -from literals import PEER_RELATION +from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION if TYPE_CHECKING: from charm import ValkeyCharm @@ -28,6 +28,7 @@ def __init__(self, charm: "ValkeyCharm"): self.charm.on[PEER_RELATION].relation_joined, self._on_peer_relation_joined ) self.framework.observe(self.charm.on.update_status, self._on_update_status) + 
self.framework.observe(self.charm.on.leader_elected, self._on_leader_elected) def _on_peer_relation_joined(self, event: ops.RelationJoinedEvent) -> None: """Handle event received by all units when a new unit joins the cluster relation.""" @@ -38,3 +39,24 @@ def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: """Handle the update-status event.""" if not self.charm.state.unit_server.is_started: logger.warning("Service not started") + + def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: + """Handle the leader-elected event.""" + if not self.charm.state.peer_relation: + event.defer() + return + + if self.charm.unit.is_leader() and not self.charm.state.cluster.internal_user_credentials: + if admin_secret_id := self.charm.config.get(INTERNAL_USER_PASSWORD_CONFIG): + try: + password = self.charm.state.get_secret_from_id(str(admin_secret_id)).get( + INTERNAL_USER + ) + except (ops.ModelError, ops.SecretNotFoundError) as e: + logger.error(f"Could not access secret {admin_secret_id}: {e}") + raise + else: + password = self.charm.config_manager.generate_password() + + self.charm.state.cluster.update({"charmed_operator_password": password}) + self.charm.config_manager.set_acl_file() diff --git a/src/literals.py b/src/literals.py index 7921033..4294b64 100644 --- a/src/literals.py +++ b/src/literals.py @@ -9,6 +9,10 @@ CONTAINER = "valkey" CONFIG_FILE = "/var/lib/valkey/valkey.conf" +ACL_FILE = "/var/lib/valkey/users.acl" PEER_RELATION = "valkey-peers" STATUS_PEERS_RELATION = "status-peers" + +INTERNAL_USER = "charmed-operator" +INTERNAL_USER_PASSWORD_CONFIG = "system-users" diff --git a/src/managers/config.py b/src/managers/config.py index f1c9718..75df8b7 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -4,7 +4,10 @@ """Manager for all config related tasks.""" +import hashlib import logging +import secrets +import string from pathlib import Path from data_platform_helpers.advanced_statuses.models import StatusObject @@ -13,6 
+16,7 @@ from core.base_workload import WorkloadBase from core.cluster_state import ClusterState +from literals import ACL_FILE, INTERNAL_USER from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -55,6 +59,10 @@ def config_properties(self) -> dict[str, str]: value = "" config_properties[key.strip()] = value.strip() + # Adjust default values + # Use the ACL file + config_properties["aclfile"] = str(ACL_FILE) + return config_properties def set_config_properties(self) -> None: @@ -62,6 +70,29 @@ def set_config_properties(self) -> None: logger.debug("Writing configuration") self.workload.write_config_file(config=self.config_properties) + def set_acl_file(self) -> None: + """Write the ACL file with appropriate user permissions.""" + logger.debug("Writing ACL configuration") + charmed_operator_password = self.state.cluster.internal_user_credentials.get( + INTERNAL_USER, "" + ) + # sha256 hash the password + charmed_operator_password_hash = hashlib.sha256( + charmed_operator_password.encode("utf-8") + ).hexdigest() + # write the ACL file + acl_content = "user default off\n" + acl_content += f"user {INTERNAL_USER} on #{charmed_operator_password_hash} ~* +@all\n" + self.workload.write_file(acl_content, ACL_FILE) + + def generate_password(self) -> str: + """Create randomized string for use as app passwords. 
+ + Returns: + str: String of 32 randomized letter+digit characters + """ + return "".join([secrets.choice(string.ascii_letters + string.digits) for _ in range(32)]) + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the config manager's statuses.""" status_list: list[StatusObject] = [] diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 75e1113..5e6b5a6 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -71,3 +71,14 @@ def write_config_file(self, config: dict[str, str]) -> None: path = self.config_file path.write_text(config_string) + + @override + def write_file(self, content: str, path: str) -> None: + """Write content to a file on disk. + + Args: + content (str): The content to be written. + path (str): The file path where the content should be written. + """ + file_path = pathops.ContainerPath(path, container=self.container) + file_path.write_text(content) From 93f8b418930dbf002de15002bb9b73f156daef73 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 19 Jan 2026 11:39:34 +0000 Subject: [PATCH 002/159] add secret handlign and config for admin password --- config.yaml | 11 +++++ poetry.lock | 20 ++++++++- pyproject.toml | 1 + src/common/client.py | 64 ++++++++++++++++++++++++++++ src/common/exceptions.py | 8 ++++ src/core/models.py | 1 + src/events/base_events.py | 89 +++++++++++++++++++++++++++++++++++++++ src/literals.py | 2 + src/managers/cluster.py | 23 ++++++++++ src/managers/config.py | 5 ++- src/statuses.py | 13 ++++++ 11 files changed, 234 insertions(+), 3 deletions(-) create mode 100644 config.yaml create mode 100644 src/common/client.py create mode 100644 src/common/exceptions.py diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..bf71dcd --- /dev/null +++ b/config.yaml @@ -0,0 +1,11 @@ +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +options: + system-users: + type: secret + description: | + Configure the internal system user and it's password. The password will + be auto-generated if this option is not set. It is for internal use only + and SHOULD NOT be used by applications. This needs to be a Juju Secret URI pointing + to a secret that contains the following content: `root: `. \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index ab7bb95..4eb3149 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "allure-pytest" @@ -905,6 +905,22 @@ files = [ [package.dependencies] typing-extensions = ">=4.12.0" +[[package]] +name = "valkey" +version = "6.1.1" +description = "Python client for Valkey forked from redis-py" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "valkey-6.1.1-py3-none-any.whl", hash = "sha256:e2691541c6e1503b53c714ad9a35551ac9b7c0bbac93865f063dbc859a46de92"}, + {file = "valkey-6.1.1.tar.gz", hash = "sha256:5880792990c6c2b5eb604a5ed5f98f300880b6dd92d123819b66ed54bb259731"}, +] + +[package.extras] +libvalkey = ["libvalkey (>=4.0.1)"] +ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==23.2.1)", "requests (>=2.31.0)"] + [[package]] name = "valkey-glide" version = "2.2.5" @@ -997,4 +1013,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "c1ca33a93e20384cbd2bfdf6bcdcbb39a54d4d60854bcbf1c33f4e580b82122e" +content-hash = "e4b51126ae1629392f53bbebc2d837e2a5cd51804315984a028859b8c799af3e" diff --git a/pyproject.toml b/pyproject.toml index 76efd34..b786d85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ ops = "^3.5.0" charmlibs-pathops = "^1.2.0" data-platform-helpers = ">=0.1.7" valkey-glide = "^2.2.5" +valkey = "^6.1.1" [tool.poetry.requires-plugins] 
poetry-plugin-export = ">=1.8" diff --git a/src/common/client.py b/src/common/client.py new file mode 100644 index 0000000..44f38a0 --- /dev/null +++ b/src/common/client.py @@ -0,0 +1,64 @@ +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. + +"""ValkeyClient utility class to connect to valkey servers.""" + +from valkey import Valkey + +from common.exceptions import ValkeyUserManagementError +from literals import CLIENT_PORT + + +class ValkeyClient: + """Handle valkey client connections.""" + + def __init__( + self, + username: str, + password: str, + host: str, + ): + self.host = host + self.user = username + self.password = password + self.client = Valkey(port=CLIENT_PORT, username=username, password=password) + + # async def create_client(self) -> GlideClient: + # """Initialize the Valkey client.""" + # addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in self.host] + # credentials = ServerCredentials(self.user, self.password) + # client_config = GlideClusterClientConfiguration( + # addresses, + # credentials=credentials, + # ) + # return await GlideClient.create(client_config) + + def update_password(self, username: str, new_password: str) -> None: + """Update a user's password. + + Args: + username (str): The username to update. + new_password (str): The new password. 
+ """ + # try: + # client = await self.create_client() + # await client.custom_command( + # [ + # "ACL", + # "SETUSER", + # username, + # "resetpass", + # f">{new_password}", + # ] + # ) + # except Exception as e: + # raise ValkeyUserManagementError(f"Could not update password for user {username}: {e}") + # finally: + # await client.close() + try: + self.client.acl_setuser( + username, enabled=True, reset_passwords=True, passwords=[f"+{new_password}"] + ) + self.client.acl_save() + except Exception as e: + raise ValkeyUserManagementError(f"Could not update password for user {username}: {e}") diff --git a/src/common/exceptions.py b/src/common/exceptions.py new file mode 100644 index 0000000..acd66c1 --- /dev/null +++ b/src/common/exceptions.py @@ -0,0 +1,8 @@ +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Charm-specific exceptions.""" + + +class ValkeyUserManagementError(Exception): + """Custom Exception if user could not be added or updated in valkey cluster.""" diff --git a/src/core/models.py b/src/core/models.py index 95f777f..de27f03 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -32,6 +32,7 @@ class PeerUnitModel(PeerModel): """Model for the peer unit data.""" started: bool = Field(default=False) + hostname: str = Field(default="") class RelationState: diff --git a/src/events/base_events.py b/src/events/base_events.py index 3ef79c0..9a8def6 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -5,11 +5,14 @@ """Valkey base event handlers.""" import logging +import socket from typing import TYPE_CHECKING import ops +from common.exceptions import ValkeyUserManagementError from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION +from statuses import CharmStatuses, ClusterStatuses if TYPE_CHECKING: from charm import ValkeyCharm @@ -29,6 +32,8 @@ def __init__(self, charm: "ValkeyCharm"): ) self.framework.observe(self.charm.on.update_status, self._on_update_status) 
self.framework.observe(self.charm.on.leader_elected, self._on_leader_elected) + self.framework.observe(self.charm.on.config_changed, self._on_config_changed) + self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed) def _on_peer_relation_joined(self, event: ops.RelationJoinedEvent) -> None: """Handle event received by all units when a new unit joins the cluster relation.""" @@ -60,3 +65,87 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: self.charm.state.cluster.update({"charmed_operator_password": password}) self.charm.config_manager.set_acl_file() + + def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: + """Handle the config_changed event.""" + self.charm.state.unit_server.update({"hostname": socket.gethostname()}) + + if not self.charm.unit.is_leader(): + return + + if admin_secret_id := self.charm.config.get(INTERNAL_USER_PASSWORD_CONFIG): + try: + self.update_admin_password(str(admin_secret_id)) + except (ops.ModelError, ops.SecretNotFoundError): + event.defer() + return + + def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: + """Handle the secret_changed event.""" + if not self.charm.unit.is_leader(): + return + + if admin_secret_id := self.charm.config.get(INTERNAL_USER_PASSWORD_CONFIG): + if admin_secret_id == event.secret.id: + try: + self.update_admin_password(str(admin_secret_id)) + except (ops.ModelError, ops.SecretNotFoundError): + event.defer() + return + + def update_admin_password(self, admin_secret_id: str) -> None: + """Compare current admin password and update in valkey if required.""" + try: + if new_password := self.charm.state.get_secret_from_id(admin_secret_id).get( + INTERNAL_USER + ): + # only update admin credentials if the password has changed + if new_password != self.charm.state.cluster.internal_user_credentials.get( + INTERNAL_USER + ): + logger.debug(f"{INTERNAL_USER_PASSWORD_CONFIG} have changed.") + try: + self.charm.cluster_manager.update_credentials( + 
username=INTERNAL_USER, password=new_password + ) + self.charm.state.cluster.update( + {"charmed_operator_password": new_password} + ) + except ValkeyUserManagementError as e: + logger.error(e) + self.charm.status.set_running_status( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + return + else: + logger.error(f"Invalid username in secret {admin_secret_id}.") + self.charm.status.set_running_status( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + return + except (ops.ModelError, ops.SecretNotFoundError) as e: + logger.error(e) + self.charm.status.set_running_status( + CharmStatuses.SECRET_ACCESS_ERROR.value, + scope="app", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + raise + + self.charm.state.statuses.delete( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component=self.charm.cluster_manager.name, + ) + self.charm.state.statuses.delete( + CharmStatuses.SECRET_ACCESS_ERROR.value, + scope="app", + component=self.charm.cluster_manager.name, + ) diff --git a/src/literals.py b/src/literals.py index 4294b64..61e0f04 100644 --- a/src/literals.py +++ b/src/literals.py @@ -16,3 +16,5 @@ INTERNAL_USER = "charmed-operator" INTERNAL_USER_PASSWORD_CONFIG = "system-users" + +CLIENT_PORT = 6379 diff --git a/src/managers/cluster.py b/src/managers/cluster.py index f099c2c..2c717ed 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -10,8 +10,11 @@ from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope +from common.client import ValkeyClient +from common.exceptions import ValkeyUserManagementError from core.base_workload import WorkloadBase from core.cluster_state import 
ClusterState +from literals import INTERNAL_USER from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -26,6 +29,26 @@ class ClusterManager(ManagerStatusProtocol): def __init__(self, state: ClusterState, workload: WorkloadBase): self.state = state self.workload = workload + self.admin_user = INTERNAL_USER + self.admin_password = self.state.cluster.internal_user_credentials.get(INTERNAL_USER, "") + self.cluster_hostnames = [server.model.hostname for server in self.state.servers] + + def update_credentials(self, username: str, password: str) -> None: + """Update a user's password. + + Args: + username (str): The username to update. + password (str): The new password. + """ + try: + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + host=self.state.unit_server.model.hostname, + ) + client.update_password(username=username, new_password=password) + except ValkeyUserManagementError: + raise def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" diff --git a/src/managers/config.py b/src/managers/config.py index 75df8b7..d0febfc 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -16,7 +16,7 @@ from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import ACL_FILE, INTERNAL_USER +from literals import ACL_FILE, CLIENT_PORT, INTERNAL_USER from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -60,6 +60,9 @@ def config_properties(self) -> dict[str, str]: config_properties[key.strip()] = value.strip() # Adjust default values + # port + config_properties["port"] = str(CLIENT_PORT) + # Use the ACL file config_properties["aclfile"] = str(ACL_FILE) diff --git a/src/statuses.py b/src/statuses.py index 4d0036f..ba9234b 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -20,3 +20,16 @@ class CharmStatuses(Enum): message="Scaling Valkey is not implemented yet", ) 
SERVICE_NOT_STARTED = StatusObject(status="blocked", message="Service not started") + SECRET_ACCESS_ERROR = StatusObject( + status="blocked", + message="Cannot access configured secret, check permissions", + running="async", + ) + + +class ClusterStatuses(Enum): + """Collection of possible cluster related statuses.""" + + PASSWORD_UPDATE_FAILED = StatusObject( + status="blocked", message="Failed to update the internal user's password", running="async" + ) From a0e62d49ed17ca043853a2b5027b667fbd35cb92 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 20 Jan 2026 07:09:24 +0000 Subject: [PATCH 003/159] bind to 0.0.0.0 --- src/managers/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/managers/config.py b/src/managers/config.py index d0febfc..2340aa9 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -63,6 +63,9 @@ def config_properties(self) -> dict[str, str]: # port config_properties["port"] = str(CLIENT_PORT) + # bind to all interfaces + config_properties["bind"] = "0.0.0.0 -::1" + # Use the ACL file config_properties["aclfile"] = str(ACL_FILE) From c7caead67ec7a46a7be9c0b520c4be7194c7aeb6 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 20 Jan 2026 07:13:57 +0000 Subject: [PATCH 004/159] switch to glide --- poetry.lock | 129 ++++++++++++---------------------------- pyproject.toml | 12 ++-- src/common/client.py | 75 +++++++++++++---------- src/managers/cluster.py | 2 +- 4 files changed, 90 insertions(+), 128 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4eb3149..5c9f8e8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -66,7 +66,7 @@ version = "4.12.1" description = "High-level concurrency and networking framework on top of asyncio or Trio" optional = false python-versions = ">=3.9" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c"}, {file = "anyio-4.12.1.tar.gz", hash = 
"sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703"}, @@ -269,7 +269,7 @@ version = "3.11" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, @@ -392,7 +392,6 @@ files = [ {file = "ops-3.5.0-py3-none-any.whl", hash = "sha256:07b1d1dbc0f3ca59534d5fe5020a66ee95c528f2430e004922350274509420c6"}, {file = "ops-3.5.0.tar.gz", hash = "sha256:e3427889054285bd2711a3a297a77218384eacaf0d1001590ee4437cca115577"}, ] -develop = false [package.dependencies] opentelemetry-api = ">=1.0,<2.0" @@ -400,14 +399,8 @@ PyYAML = "==6.*" websocket-client = "==1.*" [package.extras] -testing = ["ops-scenario (==8.6.0.dev0)"] -tracing = ["ops-tracing (==3.6.0.dev0)"] - -[package.source] -type = "git" -url = "https://github.com/reneradoi/operator" -reference = "HEAD" -resolved_reference = "d3d3b1816a4f9c15861908375703c7f54e0735ad" +testing = ["ops-scenario (==8.5.0)"] +tracing = ["ops-tracing (==3.5.0)"] [[package]] name = "ops-scenario" @@ -475,7 +468,7 @@ version = "6.33.4" description = "" optional = false python-versions = ">=3.9" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "protobuf-6.33.4-cp310-abi3-win32.whl", hash = "sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d"}, {file = "protobuf-6.33.4-cp310-abi3-win_amd64.whl", hash = "sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc"}, @@ -824,31 +817,31 @@ jupyter = ["ipywidgets (>=7.5.1,<9)"] [[package]] name = "ruff" -version = "0.14.10" +version = "0.14.13" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" groups = ["format", "lint"] files = [ - {file = "ruff-0.14.10-py3-none-linux_armv6l.whl", hash = "sha256:7a3ce585f2ade3e1f29ec1b92df13e3da262178df8c8bdf876f48fa0e8316c49"}, - {file = "ruff-0.14.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:674f9be9372907f7257c51f1d4fc902cb7cf014b9980152b802794317941f08f"}, - {file = "ruff-0.14.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d85713d522348837ef9df8efca33ccb8bd6fcfc86a2cde3ccb4bc9d28a18003d"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6987ebe0501ae4f4308d7d24e2d0fe3d7a98430f5adfd0f1fead050a740a3a77"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:16a01dfb7b9e4eee556fbfd5392806b1b8550c9b4a9f6acd3dbe6812b193c70a"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7165d31a925b7a294465fa81be8c12a0e9b60fb02bf177e79067c867e71f8b1f"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c561695675b972effb0c0a45db233f2c816ff3da8dcfbe7dfc7eed625f218935"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bb98fcbbc61725968893682fd4df8966a34611239c9fd07a1f6a07e7103d08e"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f24b47993a9d8cb858429e97bdf8544c78029f09b520af615c1d261bf827001d"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59aabd2e2c4fd614d2862e7939c34a532c04f1084476d6833dddef4afab87e9f"}, - {file = "ruff-0.14.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:213db2b2e44be8625002dbea33bb9c60c66ea2c07c084a00d55732689d697a7f"}, - {file = "ruff-0.14.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b914c40ab64865a17a9a5b67911d14df72346a634527240039eb3bd650e5979d"}, - {file = "ruff-0.14.10-py3-none-musllinux_1_2_armv7l.whl", 
hash = "sha256:1484983559f026788e3a5c07c81ef7d1e97c1c78ed03041a18f75df104c45405"}, - {file = "ruff-0.14.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:c70427132db492d25f982fffc8d6c7535cc2fd2c83fc8888f05caaa248521e60"}, - {file = "ruff-0.14.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5bcf45b681e9f1ee6445d317ce1fa9d6cba9a6049542d1c3d5b5958986be8830"}, - {file = "ruff-0.14.10-py3-none-win32.whl", hash = "sha256:104c49fc7ab73f3f3a758039adea978869a918f31b73280db175b43a2d9b51d6"}, - {file = "ruff-0.14.10-py3-none-win_amd64.whl", hash = "sha256:466297bd73638c6bdf06485683e812db1c00c7ac96d4ddd0294a338c62fdc154"}, - {file = "ruff-0.14.10-py3-none-win_arm64.whl", hash = "sha256:e51d046cf6dda98a4633b8a8a771451107413b0f07183b2bef03f075599e44e6"}, - {file = "ruff-0.14.10.tar.gz", hash = "sha256:9a2e830f075d1a42cd28420d7809ace390832a490ed0966fe373ba288e77aaf4"}, + {file = "ruff-0.14.13-py3-none-linux_armv6l.whl", hash = "sha256:76f62c62cd37c276cb03a275b198c7c15bd1d60c989f944db08a8c1c2dbec18b"}, + {file = "ruff-0.14.13-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:914a8023ece0528d5cc33f5a684f5f38199bbb566a04815c2c211d8f40b5d0ed"}, + {file = "ruff-0.14.13-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d24899478c35ebfa730597a4a775d430ad0d5631b8647a3ab368c29b7e7bd063"}, + {file = "ruff-0.14.13-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9aaf3870f14d925bbaf18b8a2347ee0ae7d95a2e490e4d4aea6813ed15ebc80e"}, + {file = "ruff-0.14.13-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac5b7f63dd3b27cc811850f5ffd8fff845b00ad70e60b043aabf8d6ecc304e09"}, + {file = "ruff-0.14.13-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d2b1097750d90ba82ce4ba676e85230a0ed694178ca5e61aa9b459970b3eb9"}, + {file = "ruff-0.14.13-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:7d0bf87705acbbcb8d4c24b2d77fbb73d40210a95c3903b443cd9e30824a5032"}, + {file = 
"ruff-0.14.13-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3eb5da8e2c9e9f13431032fdcbe7681de9ceda5835efee3269417c13f1fed5c"}, + {file = "ruff-0.14.13-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:642442b42957093811cd8d2140dfadd19c7417030a7a68cf8d51fcdd5f217427"}, + {file = "ruff-0.14.13-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4acdf009f32b46f6e8864af19cbf6841eaaed8638e65c8dac845aea0d703c841"}, + {file = "ruff-0.14.13-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:591a7f68860ea4e003917d19b5c4f5ac39ff558f162dc753a2c5de897fd5502c"}, + {file = "ruff-0.14.13-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:774c77e841cc6e046fc3e91623ce0903d1cd07e3a36b1a9fe79b81dab3de506b"}, + {file = "ruff-0.14.13-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:61f4e40077a1248436772bb6512db5fc4457fe4c49e7a94ea7c5088655dd21ae"}, + {file = "ruff-0.14.13-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6d02f1428357fae9e98ac7aa94b7e966fd24151088510d32cf6f902d6c09235e"}, + {file = "ruff-0.14.13-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e399341472ce15237be0c0ae5fbceca4b04cd9bebab1a2b2c979e015455d8f0c"}, + {file = "ruff-0.14.13-py3-none-win32.whl", hash = "sha256:ef720f529aec113968b45dfdb838ac8934e519711da53a0456038a0efecbd680"}, + {file = "ruff-0.14.13-py3-none-win_amd64.whl", hash = "sha256:6070bd026e409734b9257e03e3ef18c6e1a216f0435c6751d7a8ec69cb59abef"}, + {file = "ruff-0.14.13-py3-none-win_arm64.whl", hash = "sha256:7ab819e14f1ad9fe39f246cfcc435880ef7a9390d81a2b6ac7e01039083dd247"}, + {file = "ruff-0.14.13.tar.gz", hash = "sha256:83cd6c0763190784b99650a20fec7633c59f6ebe41c5cc9d45ee42749563ad47"}, ] [[package]] @@ -872,7 +865,7 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = 
"sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -905,74 +898,28 @@ files = [ [package.dependencies] typing-extensions = ">=4.12.0" -[[package]] -name = "valkey" -version = "6.1.1" -description = "Python client for Valkey forked from redis-py" -optional = false -python-versions = ">=3.9" -groups = ["main"] -files = [ - {file = "valkey-6.1.1-py3-none-any.whl", hash = "sha256:e2691541c6e1503b53c714ad9a35551ac9b7c0bbac93865f063dbc859a46de92"}, - {file = "valkey-6.1.1.tar.gz", hash = "sha256:5880792990c6c2b5eb604a5ed5f98f300880b6dd92d123819b66ed54bb259731"}, -] - -[package.extras] -libvalkey = ["libvalkey (>=4.0.1)"] -ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==23.2.1)", "requests (>=2.31.0)"] - [[package]] name = "valkey-glide" -version = "2.2.5" +version = "0.0.0" description = "Valkey GLIDE Async client. Supports Valkey and Redis OSS." optional = false python-versions = ">=3.9" -groups = ["main", "integration"] -files = [ - {file = "valkey_glide-2.2.5-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:7e59ad6c2dca0e6f8dd85cfaebf7206a4dded9ec5a377eeccfbeee60df5770aa"}, - {file = "valkey_glide-2.2.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:821d279e1c255a22a9c65f3010ac5b56daa3150a9f6808d9e1e41335a34c08dd"}, - {file = "valkey_glide-2.2.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2e509a561de873a088ccf6c1f407b4d4e96ee66889e958307ff28d4544b62bf1"}, - {file = "valkey_glide-2.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb24ce3da6057b7bafba028897ad9020ac5a697b03e054a520d7a1d97ba48b7d"}, - {file = "valkey_glide-2.2.5-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:8637c3b0ce071fdbce4dffb6bc8602d2c6515b29f7762159d2a4322e5511ca34"}, - {file = "valkey_glide-2.2.5-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:37c33fbcc417a88285dc4179df3426b7dc3c81c6de1ae1f95a3eb9303ef8614d"}, - {file = "valkey_glide-2.2.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3984babebd5ecbee30c068a450e80541711d67a9f1dc22dae7958716eefb8bda"}, - {file = "valkey_glide-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec58b1414b330070ddb3976c159c40c1ee990af86113f5d6b6728dbfcd33aabb"}, - {file = "valkey_glide-2.2.5-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:5e87e4e763a147a23bb94c88ccf9d498f9b32cefcf681d7a2722466e30ed8951"}, - {file = "valkey_glide-2.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:14e03ad36050dca2a76f422ac2afeedcc20aeade6d9266378f6d869e580d91df"}, - {file = "valkey_glide-2.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ece81fe4ae94e4dc2e5fb6f0d9ad11398308bbd7d7f8a392b3c4a11f6810778"}, - {file = "valkey_glide-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a732f517a434f0c0b0d143ffc283ee1b8452e0cc6144e45dcec101ea94a3b3"}, - {file = "valkey_glide-2.2.5-cp313-cp313-macosx_10_7_x86_64.whl", hash = "sha256:e36e312791ce204fa2580c7f6677d659b5080c4af96f4b1a9e7fc8ecbb358c72"}, - {file = "valkey_glide-2.2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4f074927d15b00d481e0c7d206b64b991e92e27c335a4a309dc67fe6080d660d"}, - {file = "valkey_glide-2.2.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:098f6f3c0a941e7ae39ed937ecfadb02db75f2c514b94e9f8b6a85f9be1acb2a"}, - {file = "valkey_glide-2.2.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2357da132e58b96165c5c7226b571eb68f87d176b4f1b61c15b720db6d61d02"}, - {file = "valkey_glide-2.2.5-cp314-cp314-macosx_10_7_x86_64.whl", hash = "sha256:4b550fe6e6f0de9bf3a097a425463e47e14c94528b6d7e17250b23f0a47eaa74"}, - {file = "valkey_glide-2.2.5-cp314-cp314-macosx_11_0_arm64.whl", hash = 
"sha256:c27dc3d3b88bc1b5c1db0bfcebfeea9ea592e1db019d5cb70f6188df39ee63e7"}, - {file = "valkey_glide-2.2.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e8c7ce15229e81ccf433d6b7f70823007f05a708a605f1fb4421f576c807b60"}, - {file = "valkey_glide-2.2.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ef4ba6d5ac14f1a2d8d6c6d0c447b1ac890e02b79978aee96c96c998c499f53"}, - {file = "valkey_glide-2.2.5-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:948c250ab3ccbc68a244a308a04d088348077fb4d5b2af299f0a1571caf55c9f"}, - {file = "valkey_glide-2.2.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236b758a50d6d2360cfd61ecd0a6943feaf07bdb8bdc9abd06429f7e16b0ae1"}, - {file = "valkey_glide-2.2.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1abcd2db1b29159f398c3b6968891b2c61ac9e8bc81ffae86437ec19b3e3d96"}, - {file = "valkey_glide-2.2.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6bba254f1b5dadb86cf99b865752e94371e59dd0ffe374d7b78cf09a47749d4"}, - {file = "valkey_glide-2.2.5-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:113d26b8e2c78fec6fc4bf76b1afffb8287fac296eff730ed5461cf5bd6220f9"}, - {file = "valkey_glide-2.2.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:42aae978607ac2f3c2428364883f9da072889547eafafbf67161017332a2a267"}, - {file = "valkey_glide-2.2.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:944e545c09d1eb3d5624e214237daa3293936366a2fb39e7a0c0b4ace970636a"}, - {file = "valkey_glide-2.2.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ae6a3ffeb7657098488a38b4273493a2100c3e3675ba1a7fca5db2e1ab74815"}, - {file = "valkey_glide-2.2.5-pp311-pypy311_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cb6a007ccf4309dd03b7f20bd0643e61402954f2cdf4d45a7fea929bb7502305"}, - {file = "valkey_glide-2.2.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:97a8e80ff57f9b360ea539e4a6425ae0481cb0c73115d42c543c5505516b2240"}, - {file = "valkey_glide-2.2.5-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ac5e4ed0f9042da401ba99465938c4bf2d671f8326e8e2989477766709f78a"}, - {file = "valkey_glide-2.2.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb71f85db2395bf867dcfb5f9f5cd8b1b5866a4465266d4ce2f54d532184dbf"}, - {file = "valkey_glide-2.2.5-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:84a784353e1a379d134946b0bca5aca6ebb35babfb90e2e986e18feda9790208"}, - {file = "valkey_glide-2.2.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:30e1865e4da6c5056a27377f7aed8078504c15e882a08a0105b2b4fe0d2990ab"}, - {file = "valkey_glide-2.2.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91482d1be8a119e222f9e44534fbbd28eedf4ed5e22b1bd73dfd0688bf43f80c"}, - {file = "valkey_glide-2.2.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8c3ec3eee02cd2247cd1609475cfccd4278dce8908a45091bcf4064b13f7545"}, - {file = "valkey_glide-2.2.5.tar.gz", hash = "sha256:7abd6ce28d655caed4a5f41e056b5a13ce7b3271435ae9bc2c8c72ba725c4adf"}, -] +groups = ["main"] +files = [] +develop = false [package.dependencies] anyio = ">=4.9.0" protobuf = ">=6.20" sniffio = "*" +[package.source] +type = "git" +url = "https://github.com/skourta/valkey-glide" +reference = "add-build-rs" +resolved_reference = "5e2dfce07bed84dc8637e1c43aa55b135a76137f" +subdirectory = "python/glide-async" + [[package]] name = "websocket-client" version = "1.9.0" @@ -1013,4 +960,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "e4b51126ae1629392f53bbebc2d837e2a5cd51804315984a028859b8c799af3e" +content-hash = "68cd6ebdf1633cde09c2e9079faed8d557645b266ace86836f9da88c97215dcc" diff --git a/pyproject.toml b/pyproject.toml index b786d85..22e5bc7 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -6,12 +6,11 @@ package-mode = false requires-poetry = ">=2.0.0" [tool.poetry.dependencies] -python = "^3.12" # switch to 3.14 once charm base is 26.04 +python = "^3.12" # switch to 3.14 once charm base is 26.04 ops = "^3.5.0" charmlibs-pathops = "^1.2.0" data-platform-helpers = ">=0.1.7" -valkey-glide = "^2.2.5" -valkey = "^6.1.1" +valkey-glide = { git = "https://github.com/skourta/valkey-glide", subdirectory = "python/glide-async", branch = "add-build-rs" } [tool.poetry.requires-plugins] poetry-plugin-export = ">=1.8" @@ -49,7 +48,6 @@ allure-pytest = "*" allure-pytest-default-results = "^0.1.2" data-platform-helpers = ">=0.1.7" jubilant = "^1.6.0" -valkey-glide = "^2.2.5" [tool.coverage.run] branch = true @@ -82,7 +80,11 @@ lint.extend-ignore = [ "D413", ] lint.ignore = ["E501", "D107"] -extend-exclude = ["__pycache__", "*.egg_info", "../../common/common/lib/charms/**"] +extend-exclude = [ + "__pycache__", + "*.egg_info", + "../../common/common/lib/charms/**", +] lint.per-file-ignores = { "tests/*" = ["D100", "D101", "D102", "D103", "D104"] } [tool.ruff.lint.mccabe] diff --git a/src/common/client.py b/src/common/client.py index 44f38a0..8cd94e3 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -3,11 +3,21 @@ """ValkeyClient utility class to connect to valkey servers.""" -from valkey import Valkey +import asyncio +import logging + +from glide import ( + GlideClient, + GlideClientConfiguration, + NodeAddress, + ServerCredentials, +) from common.exceptions import ValkeyUserManagementError from literals import CLIENT_PORT +logger = logging.getLogger(__name__) + class ValkeyClient: """Handle valkey client connections.""" @@ -16,22 +26,26 @@ def __init__( self, username: str, password: str, - host: str, + hosts: list[str], ): - self.host = host + self.hosts = hosts self.user = username self.password = password - self.client = Valkey(port=CLIENT_PORT, username=username, password=password) - # async def create_client(self) -> 
GlideClient: - # """Initialize the Valkey client.""" - # addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in self.host] - # credentials = ServerCredentials(self.user, self.password) - # client_config = GlideClusterClientConfiguration( - # addresses, - # credentials=credentials, - # ) - # return await GlideClient.create(client_config) + async def create_client(self) -> GlideClient: + """Initialize the Valkey client.""" + addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in self.hosts] + credentials = ServerCredentials(username=self.user, password=self.password) + # TODO add back when we enable cluster mode + # client_config = GlideClusterClientConfiguration( + # addresses, + # credentials=credentials, + # ) + client_config = GlideClientConfiguration( + addresses, + credentials=credentials, + ) + return await GlideClient.create(client_config) def update_password(self, username: str, new_password: str) -> None: """Update a user's password. @@ -40,25 +54,24 @@ def update_password(self, username: str, new_password: str) -> None: username (str): The username to update. new_password (str): The new password. 
""" - # try: - # client = await self.create_client() - # await client.custom_command( - # [ - # "ACL", - # "SETUSER", - # username, - # "resetpass", - # f">{new_password}", - # ] - # ) - # except Exception as e: - # raise ValkeyUserManagementError(f"Could not update password for user {username}: {e}") - # finally: - # await client.close() + client = None try: - self.client.acl_setuser( - username, enabled=True, reset_passwords=True, passwords=[f"+{new_password}"] + client = asyncio.run(self.create_client()) + result = asyncio.run( + client.custom_command( + [ + "ACL", + "SETUSER", + username, + "resetpass", + f">{new_password}", + ] + ) ) - self.client.acl_save() + logger.debug(f"Password update result: {result}") except Exception as e: + logger.error(f"Error updating password for user {username}: {e}") raise ValkeyUserManagementError(f"Could not update password for user {username}: {e}") + finally: + if client: + asyncio.run(client.close()) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 2c717ed..aa0f626 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -44,7 +44,7 @@ def update_credentials(self, username: str, password: str) -> None: client = ValkeyClient( username=self.admin_user, password=self.admin_password, - host=self.state.unit_server.model.hostname, + hosts=self.cluster_hostnames, ) client.update_password(username=username, new_password=password) except ValkeyUserManagementError: From af42d57690bb51f5633ea42525c8189f237092ac Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 20 Jan 2026 09:42:24 +0000 Subject: [PATCH 005/159] add unit tests --- .gitignore | 3 + tests/unit/test_charm.py | 241 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 243 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b4be834..b0e57b2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ .env valkey-operator/kubernetes/*coverage* common/poetry.lock +__pycache__ +coverage.xml +.coverage diff --git 
a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 685dc15..16bd6d5 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -2,10 +2,20 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. +from pathlib import Path +from unittest.mock import patch + +import pytest +import yaml from ops import ActiveStatus, pebble, testing from src.charm import ValkeyCharm -from src.literals import PEER_RELATION, STATUS_PEERS_RELATION +from src.literals import ( + INTERNAL_USER, + INTERNAL_USER_PASSWORD_CONFIG, + PEER_RELATION, + STATUS_PEERS_RELATION, +) from src.statuses import CharmStatuses from .helpers import status_is @@ -15,6 +25,9 @@ SERVICE_VALKEY = "valkey" SERVICE_METRIC_EXPORTER = "metric_exporter" +METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) +APP_NAME = METADATA["name"] + def test_pebble_ready_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) @@ -148,3 +161,229 @@ def test_update_status_non_leader_unit(cloud_spec): ) state_out = ctx.run(ctx.on.update_status(), state_in) assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value) + + +def test_internal_user_creation(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + + container = testing.Container(name=CONTAINER, can_connect=True) + state_in = testing.State(relations={relation}, leader=True, containers={container}) + with patch("workload_k8s.ValkeyK8sWorkload.write_file"): + state_out = ctx.run(ctx.on.leader_elected(), state_in) + secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") + assert secret_out.latest_content.get(f"{INTERNAL_USER}-password") + + +def test_leader_elected_no_peer_relation(): + ctx = testing.Context(ValkeyCharm) + + container = testing.Container(name=CONTAINER, can_connect=True) + state_in = testing.State(leader=True, containers={container}) + with patch("workload_k8s.ValkeyK8sWorkload.write_file"): + state_out = 
ctx.run(ctx.on.leader_elected(), state_in) + assert "leader_elected" in [e.name for e in state_out.deferred] + + +def test_leader_elected_leader_password_specified(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + ) + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("managers.config.ConfigManager.generate_password") as mock_generate, + ): + state_out = ctx.run(ctx.on.leader_elected(), state_in) + secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") + assert secret_out.latest_content.get(f"{INTERNAL_USER}-password") == "secure-password" + mock_generate.assert_not_called() + + +def test_leader_elected_leader_password_specified_wrong_secret(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + config={INTERNAL_USER_PASSWORD_CONFIG: "secret:1tf1wk0tmfrodp8ofwxn"}, + ) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + pytest.raises(testing.errors.UncaughtCharmError) as exc_info, + ): + ctx.run(ctx.on.leader_elected(), state_in) + assert "SecretNotFoundError" in str(exc_info.value) + + +def test_config_changed_non_leader_unit(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + state_in = testing.State( + leader=False, + relations={relation}, + containers={container}, + 
config={INTERNAL_USER_PASSWORD_CONFIG: "secret:1tf1wk0tmfrodp8ofwxn"}, + ) + with ( + patch("events.base_events.BaseEvents.update_admin_password") as mock_update, + ): + ctx.run(ctx.on.config_changed(), state_in) + mock_update.assert_not_called() + + +def test_config_changed_leader_unit_valkey_update_fails(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + ) + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("common.client.ValkeyClient.create_client", side_effect=Exception("fail")), + patch("core.models.RelationState.update") as mock_update, + ): + ctx.run(ctx.on.config_changed(), state_in) + mock_update.assert_called_once() + + +def test_config_changed_leader_unit(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + ) + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("common.client.ValkeyClient.update_password") as mock_update_password, + ): + state_out = ctx.run(ctx.on.config_changed(), state_in) + mock_update_password.assert_called_once_with( + username=INTERNAL_USER, new_password="secure-password" + ) + secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") + assert 
secret_out.latest_content.get(f"{INTERNAL_USER}-password") == "secure-password" + + +def test_config_changed_leader_unit_wrong_username(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={"wrong-username": "secure-password"}, remote_grants=APP_NAME + ) + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("common.client.ValkeyClient.update_password") as mock_update_password, + ): + ctx.run(ctx.on.config_changed(), state_in) + mock_update_password.assert_not_called() + + +def test_config_changed_leader_unit_wrong_secret(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={"wrong-username": "secure-password"}, remote_grants=APP_NAME + ) + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("common.client.ValkeyClient.update_password") as mock_update_password, + ): + ctx.run(ctx.on.config_changed(), state_in) + mock_update_password.assert_not_called() + + +def test_change_password_secret_changed_non_leader_unit(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + ) + + state_in = testing.State( + leader=False, + 
relations={relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("events.base_events.BaseEvents.update_admin_password") as mock_update_password, + ): + ctx.run(ctx.on.secret_changed(password_secret), state_in) + mock_update_password.assert_not_called() + + +def test_change_password_secret_changed_leader_unit(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + ) + + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("events.base_events.BaseEvents.update_admin_password") as mock_update_password, + ): + ctx.run(ctx.on.secret_changed(password_secret), state_in) + mock_update_password.assert_called_once_with(password_secret.id) From 8889bd526353490cb5068d56d60d679bfecf3f53 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 20 Jan 2026 14:09:15 +0000 Subject: [PATCH 006/159] add integeration tests --- poetry.lock | 12 +-- pyproject.toml | 1 + src/common/client.py | 31 +++++-- src/managers/cluster.py | 2 +- tests/integration/k8s/helpers.py | 135 +++++++++++++++++++++++++++- tests/integration/k8s/test_charm.py | 126 +++++++++++++++++++++++++- tox.ini | 2 +- 7 files changed, 292 insertions(+), 17 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5c9f8e8..86b3887 100644 --- a/poetry.lock +++ b/poetry.lock @@ -66,7 +66,7 @@ version = "4.12.1" description = "High-level concurrency and networking framework on top of asyncio or Trio" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "integration"] files = [ {file = "anyio-4.12.1-py3-none-any.whl", hash = 
"sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c"}, {file = "anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703"}, @@ -269,7 +269,7 @@ version = "3.11" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "integration"] files = [ {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, @@ -468,7 +468,7 @@ version = "6.33.4" description = "" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "integration"] files = [ {file = "protobuf-6.33.4-cp310-abi3-win32.whl", hash = "sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d"}, {file = "protobuf-6.33.4-cp310-abi3-win_amd64.whl", hash = "sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc"}, @@ -865,7 +865,7 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "integration"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -904,7 +904,7 @@ version = "0.0.0" description = "Valkey GLIDE Async client. Supports Valkey and Redis OSS." 
optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "integration"] files = [] develop = false @@ -960,4 +960,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "68cd6ebdf1633cde09c2e9079faed8d557645b266ace86836f9da88c97215dcc" +content-hash = "9721ba0790a1a564baa26313d5d1385a916ff9e9a510dd00c8b559b14247d55a" diff --git a/pyproject.toml b/pyproject.toml index 22e5bc7..e6cb1b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ allure-pytest = "*" allure-pytest-default-results = "^0.1.2" data-platform-helpers = ">=0.1.7" jubilant = "^1.6.0" +valkey-glide = { git = "https://github.com/skourta/valkey-glide", subdirectory = "python/glide-async", branch = "add-build-rs" } [tool.coverage.run] branch = true diff --git a/src/common/client.py b/src/common/client.py index 8cd94e3..ac9c941 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -5,6 +5,7 @@ import asyncio import logging +from typing import Any from glide import ( GlideClient, @@ -44,9 +45,31 @@ async def create_client(self) -> GlideClient: client_config = GlideClientConfiguration( addresses, credentials=credentials, + request_timeout=1000, # in milliseconds ) return await GlideClient.create(client_config) + async def _run_custom_command(self, command: list[str]) -> Any: + """Run a custom command on the Valkey client. + + Args: + command (list[str]): The command to run as a list of strings. + + Returns: + Any result from the command. + """ + client = None + try: + client = await self.create_client() + result = await asyncio.wait_for(client.custom_command(command), timeout=5) + return result + except Exception as e: + logger.error(f"Error running command {' '.join(command)}: {e}") + raise ValkeyUserManagementError(f"Could not run command {' '.join(command)}: {e}") + finally: + if client: + await client.close() + def update_password(self, username: str, new_password: str) -> None: """Update a user's password. 
@@ -54,11 +77,9 @@ def update_password(self, username: str, new_password: str) -> None: username (str): The username to update. new_password (str): The new password. """ - client = None try: - client = asyncio.run(self.create_client()) result = asyncio.run( - client.custom_command( + self._run_custom_command( [ "ACL", "SETUSER", @@ -68,10 +89,8 @@ def update_password(self, username: str, new_password: str) -> None: ] ) ) + logger.debug(f"Password update result: {result}") except Exception as e: logger.error(f"Error updating password for user {username}: {e}") raise ValkeyUserManagementError(f"Could not update password for user {username}: {e}") - finally: - if client: - asyncio.run(client.close()) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index aa0f626..cdfe142 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -53,7 +53,7 @@ def update_credentials(self, username: str, password: str) -> None: def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( - scope=scope, component=self.name + scope=scope, component=self.name, running_status_only=True, running_status_type="async" ).root if not self.workload.can_connect: diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index c3f3c1a..37c467b 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -2,6 +2,7 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. 
+import contextlib import logging from enum import Enum from pathlib import Path @@ -10,7 +11,10 @@ import jubilant import yaml from data_platform_helpers.advanced_statuses.models import StatusObject -from ops import StatusBase +from glide import GlideClient, GlideClientConfiguration, NodeAddress, ServerCredentials +from ops import SecretNotFoundError, StatusBase + +from literals import CLIENT_PORT, INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG logger = logging.getLogger(__name__) @@ -27,6 +31,11 @@ class CharmStatuses(Enum): status="blocked", message="Scaling Valkey is not implemented yet", ) + SECRET_ACCESS_ERROR = StatusObject( + status="blocked", + message="Cannot access configured secret, check permissions", + running="async", + ) def does_status_match( @@ -141,3 +150,127 @@ def verify_unit_count( unit_count[app] = 1 return all(count == len(status.get_units(app)) for app, count in unit_count.items()) + + +def get_cluster_hostnames(juju: jubilant.Juju, app_name: str) -> list[str]: + """Get the hostnames of all units in the Valkey application. + + Args: + juju: The Juju client instance. + app_name: The name of the Valkey application. + + Returns: + A list of hostnames for all units in the Valkey application. + """ + status = juju.status() + return [unit.address for unit in status.get_units(app_name).values()] + + +def get_secret_by_label(juju: jubilant.Juju, label: str) -> dict[str, str]: + for secret in juju.secrets(): + if label == secret.label: + revealed_secret = juju.show_secret(secret.uri, reveal=True) + return revealed_secret.content + + raise SecretNotFoundError(f"Secret with label {label} not found") + + +async def create_valkey_client( + hostnames: list[str], username: str | None = INTERNAL_USER, password: str | None = None +): + """Create and return a Valkey client connected to the cluster. + + Args: + hostnames: List of hostnames of the Valkey cluster nodes. + username: The username for authentication. + password: The password for the internal user. 
+ + Returns: + A Valkey client instance connected to the cluster. + """ + addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in hostnames] + + credentials = None + if username or password: + credentials = ServerCredentials(username=username, password=password) + # TODO add back when we enable cluster mode + # client_config = GlideClusterClientConfiguration( + # addresses, + # credentials=credentials, + # ) + client_config = GlideClientConfiguration( + addresses, + credentials=credentials, + ) + return await GlideClient.create(client_config) + + +def set_password( + juju: jubilant.Juju, + password: str, + username: str = INTERNAL_USER, + application: str = APP_NAME, +) -> None: + """Set a user password (or update it if existing) via secret. + + Args: + juju: An instance of Jubilant's Juju class on which to run Juju commands + password: password to use + username: the user to set the password + application: the application the created secret will be granted to + """ + secret_name = "system_users_secret" + + # if secret exists, update it, else add secret + existing = next((s for s in juju.secrets() if s.name == secret_name), None) + if existing: + juju.update_secret(identifier=existing.uri, content={username: password}) + secret_id = existing.uri + else: + secret_id = juju.add_secret(name=secret_name, content={username: password}) + + # grant the application access to this secret + juju.grant_secret(identifier=secret_id, app=application) + + # update the application config to include the secret + juju.config(app=application, values={INTERNAL_USER_PASSWORD_CONFIG: secret_id}) + + +async def set_key( + hostnames: list[str], username: str, password: str, key: str, value: str +) -> bytes | None: + """Write a key-value pair to the Valkey cluster. + + Args: + hostnames: List of hostnames of the Valkey cluster nodes. + key: The key to write. + value: The value to write. + username: The username for authentication. + password: The password for authentication. 
+ """ + client = await create_valkey_client(hostnames=hostnames, username=username, password=password) + return await client.set(key, value) + + +async def get_key(hostnames: list[str], username: str, password: str, key: str) -> bytes | None: + """Read a value from the Valkey cluster by key. + + Args: + hostnames: List of hostnames of the Valkey cluster nodes. + key: The key to read. + username: The username for authentication. + password: The password for authentication. + """ + client = await create_valkey_client(hostnames=hostnames, username=username, password=password) + return await client.get(key) + + +@contextlib.contextmanager +def fast_forward(juju: jubilant.Juju): + """Context manager that temporarily speeds up update-status hooks to fire every 10s.""" + old = juju.model_config()["update-status-hook-interval"] + juju.model_config({"update-status-hook-interval": "10s"}) + try: + yield + finally: + juju.model_config({"update-status-hook-interval": old}) diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 2c1013b..5104f76 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -2,15 +2,33 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. 
import logging +from time import sleep import jubilant import pytest -from .helpers import APP_NAME, IMAGE_RESOURCE, CharmStatuses, does_status_match +from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION + +from .helpers import ( + APP_NAME, + IMAGE_RESOURCE, + CharmStatuses, + create_valkey_client, + does_status_match, + fast_forward, + get_cluster_hostnames, + get_key, + get_secret_by_label, + set_key, + set_password, +) logger = logging.getLogger(__name__) -NUM_UNITS = 3 +# TODO scale up when scaling is implemented +NUM_UNITS = 1 +TEST_KEY = "test_key" +TEST_VALUE = "test_value" @pytest.mark.abort_on_fail @@ -24,3 +42,107 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: ), timeout=600, ) + + +@pytest.mark.abort_on_fail +async def test_authentication(juju: jubilant.Juju) -> None: + """Assert that we can authenticate to valkey.""" + hostnames = get_cluster_hostnames(juju, APP_NAME) + + # try without authentication + with pytest.raises(Exception) as exc_info: + unauth_client = await create_valkey_client( + hostnames=hostnames, username=None, password=None + ) + await unauth_client.ping() + assert "NOAUTH" in str(exc_info.value), "Unauthenticated access did not fail as expected" + + # Authenticate with internal user + secret = get_secret_by_label(juju, label=f"{PEER_RELATION}.{APP_NAME}.app") + password = secret.get(f"{INTERNAL_USER}-password") + assert password is not None, "Admin password secret not found" + + client = await create_valkey_client(hostnames=hostnames, password=password) + auth_result = await client.ping() + assert auth_result == b"PONG", "Authentication to Valkey cluster failed" + + +@pytest.mark.abort_on_fail +async def test_update_admin_password(juju: jubilant.Juju) -> None: + """Assert the admin password is updated when adding a user secret to the config.""" + hostnames = get_cluster_hostnames(juju, APP_NAME) + + # create a user secret and grant it to the application + new_password = 
"some-password" + set_password(juju, new_password) + + # wait for config-changed hook to finish executing + juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + + # perform read operation with the updated password + result = await set_key( + hostnames=hostnames, + username=INTERNAL_USER, + password=new_password, + key=TEST_KEY, + value=TEST_VALUE, + ) + assert result == "OK", "Failed to write data after admin password update" + + # update the config again and remove the option `admin-password` + juju.config(app=APP_NAME, reset=[INTERNAL_USER_PASSWORD_CONFIG]) + + # wait for config-changed hook to finish executing + juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + + # make sure we can still read data with the previously set password + assert await get_key( + hostnames=hostnames, username=INTERNAL_USER, password=new_password, key=TEST_KEY + ) == bytes(TEST_VALUE, "utf-8") + + +@pytest.mark.abort_on_fail +async def test_user_secret_permissions(juju: jubilant.Juju) -> None: + """If a user secret is not granted, ensure we can process updated permissions.""" + hostnames = get_cluster_hostnames(juju, APP_NAME) + + logger.info("Creating new user secret") + secret_name = "my_secret" + new_password = "even-newer-password" + secret_id = juju.add_secret(name=secret_name, content={INTERNAL_USER: new_password}) + + logger.info("Updating configuration with the new secret - but without access") + juju.config(app=APP_NAME, values={INTERNAL_USER_PASSWORD_CONFIG: secret_id}) + + juju.wait( + lambda status: does_status_match( + status, + expected_app_statuses={APP_NAME: [CharmStatuses.SECRET_ACCESS_ERROR.value]}, + ), + timeout=1200, + ) + + logger.info("Secret access will be granted now - wait for updated password") + # deferred `config_changed` event will be retried before `update_status` + with fast_forward(juju): + juju.grant_secret(identifier=secret_name, app=APP_NAME) + sleep(10) # allow some time for the 
permission to propagate + + # juju.wait( + # lambda status: jubilant.all_active(status, APP_NAME), + # timeout=1200, + # ) + juju.wait( + lambda status: does_status_match( + status, + expected_app_statuses={APP_NAME: [CharmStatuses.SCALING_NOT_IMPLEMENTED.value]}, + ), + timeout=600, + ) + + # perform read operation with the updated password + assert await get_key( + hostnames=hostnames, username=INTERNAL_USER, password=new_password, key=TEST_KEY + ) == bytes(TEST_VALUE, "utf-8"), "Failed to read data after secret permissions were updated" + + logger.info("Password update successful after secret was granted") diff --git a/tox.ini b/tox.ini index c2c8f7a..c4e23b0 100644 --- a/tox.ini +++ b/tox.ini @@ -68,5 +68,5 @@ commands_pre = poetry install --only integration commands = # on CI, concierge will setup the model `testing` - locally we need to do it ourselves - sh -c "if [ -z "$CI" ]; then juju add-model testing; fi;" + sh -c "if [ -z "$CI" ]; then juju add-model testing && juju model-config logging-config='=INFO;unit=DEBUG'; fi;" poetry run pytest -v --tb native --log-cli-level=INFO -s --ignore={[vars]tests_path}/unit/ {posargs} \ No newline at end of file From 7157121be8f71db97cfd3c058e3907353d53d79b Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 20 Jan 2026 14:16:46 +0000 Subject: [PATCH 007/159] add install deps to ci unit tests --- .github/workflows/ci.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2d997d6..80d5cb4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -35,6 +35,10 @@ jobs: run: | pipx install tox pipx install poetry + # to build Valkey-glide during tests + - name: Install dependencies + run: | + apt install libprotobuf-dev protobuf-compiler - name: Run tests run: tox run -e unit - name: Upload Coverage to Codecov From 90750a151ff8e25f6d6e9aeac3930809c2fc43e7 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 20 Jan 2026 14:19:08 +0000 
Subject: [PATCH 008/159] add sudo to apt --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 80d5cb4..db69c95 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -38,7 +38,7 @@ jobs: # to build Valkey-glide during tests - name: Install dependencies run: | - apt install libprotobuf-dev protobuf-compiler + sudo apt install libprotobuf-dev protobuf-compiler - name: Run tests run: tox run -e unit - name: Upload Coverage to Codecov From 6a80e4603bf34cf03fb9a85d6639d7863baca4d9 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 20 Jan 2026 14:37:03 +0000 Subject: [PATCH 009/159] install protobuf for glide on integration tests --- .github/workflows/integration_test.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 129dbb2..60634c0 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -108,6 +108,8 @@ jobs: sudo snap install charmcraft --classic sudo snap install go --classic go install github.com/snapcore/spread/cmd/spread@latest + # to build Valkey-glide during tests + sudo apt install libprotobuf-dev protobuf-compiler - name: Download packed charm(s) timeout-minutes: 5 uses: actions/download-artifact@v6 From 301e62736ac77c7bb1cf51d96c1be2714e92bb16 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 21 Jan 2026 04:33:15 +0000 Subject: [PATCH 010/159] auto approve installing deps --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 60634c0..2621d80 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -109,7 +109,7 @@ jobs: go install github.com/snapcore/spread/cmd/spread@latest # 
to build Valkey-glide during tests - sudo apt install libprotobuf-dev protobuf-compiler + sudo apt install libprotobuf-dev protobuf-compiler -y - name: Download packed charm(s) timeout-minutes: 5 uses: actions/download-artifact@v6 From 2be061ca67b120ff51f61953c07245ce6a0937d7 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 21 Jan 2026 05:12:16 +0000 Subject: [PATCH 011/159] update rust --- .github/workflows/integration_test.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 2621d80..52ef9ef 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -110,6 +110,9 @@ jobs: go install github.com/snapcore/spread/cmd/spread@latest # to build Valkey-glide during tests sudo apt install libprotobuf-dev protobuf-compiler -y + apt-get install rustup -y + rustup set profile minimal + rustup default 1.90.0 - name: Download packed charm(s) timeout-minutes: 5 uses: actions/download-artifact@v6 From e2ea39fe7cdf965ce47a1a4c0da0aad882bd4f6b Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 21 Jan 2026 05:22:59 +0000 Subject: [PATCH 012/159] sudo apt --- .github/workflows/integration_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index 52ef9ef..d7112c5 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -110,7 +110,7 @@ jobs: go install github.com/snapcore/spread/cmd/spread@latest # to build Valkey-glide during tests sudo apt install libprotobuf-dev protobuf-compiler -y - apt-get install rustup -y + sudo apt install rustup -y rustup set profile minimal rustup default 1.90.0 - name: Download packed charm(s) From 07353c32c491613412790ea2c823d2b32c6f75cd Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 21 Jan 2026 05:41:14 +0000 Subject: [PATCH 013/159] set default rust on 
spread --- spread.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/spread.yaml b/spread.yaml index 1bf511e..a6ee5f5 100644 --- a/spread.yaml +++ b/spread.yaml @@ -117,6 +117,7 @@ prepare: | concierge prepare --trace pipx install tox poetry + rustup default 1.90.0 prepare-each: | cd "$SPREAD_PATH" # `concierge prepare` needs to be run for each spread job in case Juju version changed From a8a2f18a132ad1f45160cc11ffb16f64e80dcc3e Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 21 Jan 2026 07:27:28 +0000 Subject: [PATCH 014/159] save acl after updating password so the change persists across restarts --- src/common/client.py | 13 +++++++++++++ src/managers/cluster.py | 2 ++ tests/unit/test_charm.py | 2 ++ 3 files changed, 17 insertions(+) diff --git a/src/common/client.py b/src/common/client.py index ac9c941..d1b6f1d 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -94,3 +94,16 @@ def update_password(self, username: str, new_password: str) -> None: except Exception as e: logger.error(f"Error updating password for user {username}: {e}") raise ValkeyUserManagementError(f"Could not update password for user {username}: {e}") + + def save_acl(self) -> None: + """Save ACL content to the Valkey server. + + Args: + acl_content (str): The ACL content to save. 
+ """ + try: + result = asyncio.run(self._run_custom_command(["ACL", "SAVE"])) + logger.debug(f"ACL save result: {result}") + except Exception as e: + logger.error(f"Error saving ACL: {e}") + raise ValkeyUserManagementError(f"Could not save ACL: {e}") diff --git a/src/managers/cluster.py b/src/managers/cluster.py index cdfe142..3ceaa85 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -47,6 +47,8 @@ def update_credentials(self, username: str, password: str) -> None: hosts=self.cluster_hostnames, ) client.update_password(username=username, new_password=password) + client.password = password + client.save_acl() except ValkeyUserManagementError: raise diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 16bd6d5..1d42fed 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -289,11 +289,13 @@ def test_config_changed_leader_unit(): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("common.client.ValkeyClient.update_password") as mock_update_password, + patch("common.client.ValkeyClient.save_acl") as mock_save_acl, ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_update_password.assert_called_once_with( username=INTERNAL_USER, new_password="secure-password" ) + mock_save_acl.assert_called_once() secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") assert secret_out.latest_content.get(f"{INTERNAL_USER}-password") == "secure-password" From 87c443e761d2c59de4d879c37277b1e0e3a0d0c1 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 22 Jan 2026 04:59:27 +0000 Subject: [PATCH 015/159] feedback from rene --- pyproject.toml | 4 +++- src/common/client.py | 4 ++-- tests/unit/test_charm.py | 28 +++++----------------------- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e6cb1b7..6b0ae59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,10 +6,12 @@ package-mode = false requires-poetry = ">=2.0.0" 
[tool.poetry.dependencies] -python = "^3.12" # switch to 3.14 once charm base is 26.04 +python = "^3.12" # switch to 3.14 once charm base is 26.04 ops = "^3.5.0" charmlibs-pathops = "^1.2.0" data-platform-helpers = ">=0.1.7" +# TODO replace with official release once build from source is possible +# https://github.com/valkey-io/valkey-glide/pull/5202 valkey-glide = { git = "https://github.com/skourta/valkey-glide", subdirectory = "python/glide-async", branch = "add-build-rs" } [tool.poetry.requires-plugins] diff --git a/src/common/client.py b/src/common/client.py index d1b6f1d..1b10371 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -64,8 +64,8 @@ async def _run_custom_command(self, command: list[str]) -> Any: result = await asyncio.wait_for(client.custom_command(command), timeout=5) return result except Exception as e: - logger.error(f"Error running command {' '.join(command)}: {e}") - raise ValkeyUserManagementError(f"Could not run command {' '.join(command)}: {e}") + logger.error("Error running custom command: %s", e) + raise ValkeyUserManagementError(f"Could not run custom command: {e}") finally: if client: await client.close() diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 1d42fed..7837c15 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -233,12 +233,16 @@ def test_config_changed_non_leader_unit(): ctx = testing.Context(ValkeyCharm) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) container = testing.Container(name=CONTAINER, can_connect=True) + password_secret = testing.Secret( + tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + ) state_in = testing.State( leader=False, relations={relation}, containers={container}, - config={INTERNAL_USER_PASSWORD_CONFIG: "secret:1tf1wk0tmfrodp8ofwxn"}, + secrets={password_secret}, + config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, ) with ( patch("events.base_events.BaseEvents.update_admin_password") as 
mock_update, @@ -323,28 +327,6 @@ def test_config_changed_leader_unit_wrong_username(): mock_update_password.assert_not_called() -def test_config_changed_leader_unit_wrong_secret(): - ctx = testing.Context(ValkeyCharm) - relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) - container = testing.Container(name=CONTAINER, can_connect=True) - - password_secret = testing.Secret( - tracked_content={"wrong-username": "secure-password"}, remote_grants=APP_NAME - ) - state_in = testing.State( - leader=True, - relations={relation}, - containers={container}, - config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, - ) - with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("common.client.ValkeyClient.update_password") as mock_update_password, - ): - ctx.run(ctx.on.config_changed(), state_in) - mock_update_password.assert_not_called() - - def test_change_password_secret_changed_non_leader_unit(): ctx = testing.Context(ValkeyCharm) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) From 2cd5c8b5950cf0742f1296f138f129d2eb7c4a4c Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 22 Jan 2026 09:10:27 +0000 Subject: [PATCH 016/159] switch updating password to write acl file and then load it --- src/common/client.py | 46 +++++--------------------------- src/events/base_events.py | 7 ++--- src/managers/cluster.py | 13 +++------ src/managers/config.py | 16 +++++++---- tests/integration/k8s/helpers.py | 5 ---- tests/unit/test_charm.py | 14 +++++----- 6 files changed, 30 insertions(+), 71 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 1b10371..e88a1a6 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -37,11 +37,6 @@ async def create_client(self) -> GlideClient: """Initialize the Valkey client.""" addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in self.hosts] credentials = ServerCredentials(username=self.user, password=self.password) - # TODO add back when we enable cluster mode - # 
client_config = GlideClusterClientConfiguration( - # addresses, - # credentials=credentials, - # ) client_config = GlideClientConfiguration( addresses, credentials=credentials, @@ -70,40 +65,11 @@ async def _run_custom_command(self, command: list[str]) -> Any: if client: await client.close() - def update_password(self, username: str, new_password: str) -> None: - """Update a user's password. - - Args: - username (str): The username to update. - new_password (str): The new password. - """ - try: - result = asyncio.run( - self._run_custom_command( - [ - "ACL", - "SETUSER", - username, - "resetpass", - f">{new_password}", - ] - ) - ) - - logger.debug(f"Password update result: {result}") - except Exception as e: - logger.error(f"Error updating password for user {username}: {e}") - raise ValkeyUserManagementError(f"Could not update password for user {username}: {e}") - - def save_acl(self) -> None: - """Save ACL content to the Valkey server. - - Args: - acl_content (str): The ACL content to save. - """ + def load_acl(self) -> None: + """Load ACL content to the Valkey server.""" try: - result = asyncio.run(self._run_custom_command(["ACL", "SAVE"])) - logger.debug(f"ACL save result: {result}") + result = asyncio.run(self._run_custom_command(["ACL", "LOAD"])) + logger.debug(f"ACL load result: {result}") except Exception as e: - logger.error(f"Error saving ACL: {e}") - raise ValkeyUserManagementError(f"Could not save ACL: {e}") + logger.error(f"Error loading ACL: {e}") + raise ValkeyUserManagementError(f"Could not load ACL: {e}") diff --git a/src/events/base_events.py b/src/events/base_events.py index 9a8def6..0125411 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -82,6 +82,8 @@ def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: """Handle the secret_changed event.""" + # TODO For a multi-node cluster the units should independently update their passwords. 
+ # If they fail the event should be deferred and retried. if not self.charm.unit.is_leader(): return @@ -105,9 +107,8 @@ def update_admin_password(self, admin_secret_id: str) -> None: ): logger.debug(f"{INTERNAL_USER_PASSWORD_CONFIG} have changed.") try: - self.charm.cluster_manager.update_credentials( - username=INTERNAL_USER, password=new_password - ) + self.charm.config_manager.set_acl_file(new_password) + self.charm.cluster_manager.load_acl_file() self.charm.state.cluster.update( {"charmed_operator_password": new_password} ) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 3ceaa85..bbd3073 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -33,22 +33,15 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): self.admin_password = self.state.cluster.internal_user_credentials.get(INTERNAL_USER, "") self.cluster_hostnames = [server.model.hostname for server in self.state.servers] - def update_credentials(self, username: str, password: str) -> None: - """Update a user's password. - - Args: - username (str): The username to update. - password (str): The new password. 
- """ + def load_acl_file(self) -> None: + """Load the ACL file into the cluster.""" try: client = ValkeyClient( username=self.admin_user, password=self.admin_password, hosts=self.cluster_hostnames, ) - client.update_password(username=username, new_password=password) - client.password = password - client.save_acl() + client.load_acl() except ValkeyUserManagementError: raise diff --git a/src/managers/config.py b/src/managers/config.py index 2340aa9..32ae023 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -76,12 +76,18 @@ def set_config_properties(self) -> None: logger.debug("Writing configuration") self.workload.write_config_file(config=self.config_properties) - def set_acl_file(self) -> None: - """Write the ACL file with appropriate user permissions.""" + def set_acl_file(self, charmed_operator_password: str = "") -> None: + """Write the ACL file with appropriate user permissions. + + Args: + charmed_operator_password (str): Password for the charmed-operator user. If not provided, + the password from the cluster state will be used. 
+ """ logger.debug("Writing ACL configuration") - charmed_operator_password = self.state.cluster.internal_user_credentials.get( - INTERNAL_USER, "" - ) + if not charmed_operator_password: + charmed_operator_password = self.state.cluster.internal_user_credentials.get( + INTERNAL_USER, "" + ) # sha256 hash the password charmed_operator_password_hash = hashlib.sha256( charmed_operator_password.encode("utf-8") diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index 37c467b..98c2ba3 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -193,11 +193,6 @@ async def create_valkey_client( credentials = None if username or password: credentials = ServerCredentials(username=username, password=password) - # TODO add back when we enable cluster mode - # client_config = GlideClusterClientConfiguration( - # addresses, - # credentials=credentials, - # ) client_config = GlideClientConfiguration( addresses, credentials=credentials, diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 7837c15..34b4d00 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -292,14 +292,12 @@ def test_config_changed_leader_unit(): ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("common.client.ValkeyClient.update_password") as mock_update_password, - patch("common.client.ValkeyClient.save_acl") as mock_save_acl, + patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, + patch("common.client.ValkeyClient.load_acl") as mock_load_acl, ): state_out = ctx.run(ctx.on.config_changed(), state_in) - mock_update_password.assert_called_once_with( - username=INTERNAL_USER, new_password="secure-password" - ) - mock_save_acl.assert_called_once() + mock_set_acl_file.assert_called_once() + mock_load_acl.assert_called_once() secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") assert secret_out.latest_content.get(f"{INTERNAL_USER}-password") == 
"secure-password" @@ -321,10 +319,10 @@ def test_config_changed_leader_unit_wrong_username(): ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("common.client.ValkeyClient.update_password") as mock_update_password, + patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, ): ctx.run(ctx.on.config_changed(), state_in) - mock_update_password.assert_not_called() + mock_set_acl_file.assert_not_called() def test_change_password_secret_changed_non_leader_unit(): From 1f73be7bcf2b319b642104d4220bc113e196b9f4 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 26 Jan 2026 05:23:18 +0000 Subject: [PATCH 017/159] implement feedback --- config.yaml | 2 +- src/common/client.py | 11 +++++++---- src/common/exceptions.py | 10 +++++++++- src/events/base_events.py | 5 +++-- src/managers/cluster.py | 4 ++-- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/config.yaml b/config.yaml index bf71dcd..3fe2d8e 100644 --- a/config.yaml +++ b/config.yaml @@ -8,4 +8,4 @@ options: Configure the internal system user and it's password. The password will be auto-generated if this option is not set. It is for internal use only and SHOULD NOT be used by applications. This needs to be a Juju Secret URI pointing - to a secret that contains the following content: `root: `. \ No newline at end of file + to a secret that contains the following content: `root: `. 
diff --git a/src/common/client.py b/src/common/client.py index e88a1a6..d851361 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -14,7 +14,10 @@ ServerCredentials, ) -from common.exceptions import ValkeyUserManagementError +from common.exceptions import ( + ValkeyACLLoadError, + ValkeyCustomCommandError, +) from literals import CLIENT_PORT logger = logging.getLogger(__name__) @@ -60,7 +63,7 @@ async def _run_custom_command(self, command: list[str]) -> Any: return result except Exception as e: logger.error("Error running custom command: %s", e) - raise ValkeyUserManagementError(f"Could not run custom command: {e}") + raise ValkeyCustomCommandError(f"Could not run custom command: {e}") finally: if client: await client.close() @@ -70,6 +73,6 @@ def load_acl(self) -> None: try: result = asyncio.run(self._run_custom_command(["ACL", "LOAD"])) logger.debug(f"ACL load result: {result}") - except Exception as e: + except ValkeyCustomCommandError as e: logger.error(f"Error loading ACL: {e}") - raise ValkeyUserManagementError(f"Could not load ACL: {e}") + raise ValkeyACLLoadError(f"Could not load ACL: {e}") diff --git a/src/common/exceptions.py b/src/common/exceptions.py index acd66c1..71e16bc 100644 --- a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -4,5 +4,13 @@ """Charm-specific exceptions.""" -class ValkeyUserManagementError(Exception): +class ValkeyClientError(Exception): """Custom Exception if user could not be added or updated in valkey cluster.""" + + +class ValkeyCustomCommandError(ValkeyClientError): + """Custom Exception if a custom command fails on valkey cluster.""" + + +class ValkeyACLLoadError(ValkeyClientError): + """Custom Exception if ACL file could not be loaded in valkey cluster.""" diff --git a/src/events/base_events.py b/src/events/base_events.py index 0125411..3c375cd 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -10,7 +10,7 @@ import ops -from common.exceptions import ValkeyUserManagementError 
+from common.exceptions import ValkeyClientError from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION from statuses import CharmStatuses, ClusterStatuses @@ -57,6 +57,7 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: password = self.charm.state.get_secret_from_id(str(admin_secret_id)).get( INTERNAL_USER ) + # TODO consider deferring and blocking the charm except (ops.ModelError, ops.SecretNotFoundError) as e: logger.error(f"Could not access secret {admin_secret_id}: {e}") raise @@ -112,7 +113,7 @@ def update_admin_password(self, admin_secret_id: str) -> None: self.charm.state.cluster.update( {"charmed_operator_password": new_password} ) - except ValkeyUserManagementError as e: + except ValkeyClientError as e: logger.error(e) self.charm.status.set_running_status( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, diff --git a/src/managers/cluster.py b/src/managers/cluster.py index bbd3073..ccb2681 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -11,7 +11,7 @@ from data_platform_helpers.advanced_statuses.types import Scope from common.client import ValkeyClient -from common.exceptions import ValkeyUserManagementError +from common.exceptions import ValkeyACLLoadError from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import INTERNAL_USER @@ -42,7 +42,7 @@ def load_acl_file(self) -> None: hosts=self.cluster_hostnames, ) client.load_acl() - except ValkeyUserManagementError: + except ValkeyACLLoadError: raise def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: From b51956dcc18c44f021fac5aa9d5f1343a7f1f6cc Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 27 Jan 2026 05:34:12 +0000 Subject: [PATCH 018/159] add different charm users --- src/core/models.py | 17 +++-- src/events/base_events.py | 140 +++++++++++++++++++++++++------------- src/literals.py | 26 ++++++- src/managers/cluster.py | 8 ++- src/managers/config.py 
| 42 ++++++++---- 5 files changed, 160 insertions(+), 73 deletions(-) diff --git a/src/core/models.py b/src/core/models.py index de27f03..5796359 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -17,7 +17,7 @@ ) from pydantic import Field -from literals import INTERNAL_USER +from literals import CharmUsers logger = logging.getLogger(__name__) @@ -26,6 +26,9 @@ class PeerAppModel(PeerModel): """Model for the peer application data.""" charmed_operator_password: ExtraSecretStr = Field(default="") + charmed_sentinel_valkey_password: ExtraSecretStr = Field(default="") + charmed_replication_password: ExtraSecretStr = Field(default="") + charmed_sentinel_operator_password: ExtraSecretStr = Field(default="") class PeerUnitModel(PeerModel): @@ -129,7 +132,11 @@ def model(self) -> PeerAppModel | None: @property def internal_user_credentials(self) -> dict[str, str]: """Retrieve the credentials for the internal admin user.""" - if self.model and (password := self.model.charmed_operator_password): - return {INTERNAL_USER: password} - - return {} + passwords = {} + if not self.model: + return passwords + + for user in CharmUsers: + if password := getattr(self.model, f"{user.value.replace('-', '_')}_password", ""): + passwords[user.value] = password + return passwords diff --git a/src/events/base_events.py b/src/events/base_events.py index 3c375cd..6500647 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -11,7 +11,7 @@ import ops from common.exceptions import ValkeyClientError -from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION +from literals import INTERNAL_USERS_PASSWORD_CONFIG, PEER_RELATION, CharmUsers from statuses import CharmStatuses, ClusterStatuses if TYPE_CHECKING: @@ -52,19 +52,42 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: return if self.charm.unit.is_leader() and not self.charm.state.cluster.internal_user_credentials: - if admin_secret_id := 
self.charm.config.get(INTERNAL_USER_PASSWORD_CONFIG): + passwords = {} + user_specified_passwords = {} + if admin_secret_id := self.charm.config.get(INTERNAL_USERS_PASSWORD_CONFIG): try: - password = self.charm.state.get_secret_from_id(str(admin_secret_id)).get( - INTERNAL_USER + user_specified_passwords = self.charm.state.get_secret_from_id( + str(admin_secret_id) ) - # TODO consider deferring and blocking the charm except (ops.ModelError, ops.SecretNotFoundError) as e: logger.error(f"Could not access secret {admin_secret_id}: {e}") - raise - else: - password = self.charm.config_manager.generate_password() + self.charm.status.set_running_status( + CharmStatuses.SECRET_ACCESS_ERROR.value, + scope="app", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + event.defer() + return + + self.charm.state.statuses.delete( + CharmStatuses.SECRET_ACCESS_ERROR.value, + scope="app", + component=self.charm.cluster_manager.name, + ) + + # generate passwords for all internal users if not specified in the user secret + for user in CharmUsers: + passwords[user.value] = user_specified_passwords.get( + user.value, self.charm.config_manager.generate_password() + ) - self.charm.state.cluster.update({"charmed_operator_password": password}) + self.charm.state.cluster.update( + { + f"{user.value.replace('-', '_')}_password": passwords[user.value] + for user in CharmUsers + } + ) self.charm.config_manager.set_acl_file() def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: @@ -74,9 +97,9 @@ def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: if not self.charm.unit.is_leader(): return - if admin_secret_id := self.charm.config.get(INTERNAL_USER_PASSWORD_CONFIG): + if admin_secret_id := self.charm.config.get(INTERNAL_USERS_PASSWORD_CONFIG): try: - self.update_admin_password(str(admin_secret_id)) + self._update_internal_users_password(str(admin_secret_id)) except (ops.ModelError, ops.SecretNotFoundError): 
event.defer() return @@ -88,49 +111,22 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: if not self.charm.unit.is_leader(): return - if admin_secret_id := self.charm.config.get(INTERNAL_USER_PASSWORD_CONFIG): + if admin_secret_id := self.charm.config.get(INTERNAL_USERS_PASSWORD_CONFIG): if admin_secret_id == event.secret.id: try: - self.update_admin_password(str(admin_secret_id)) + self._update_internal_users_password(str(admin_secret_id)) except (ops.ModelError, ops.SecretNotFoundError): event.defer() return - def update_admin_password(self, admin_secret_id: str) -> None: - """Compare current admin password and update in valkey if required.""" + def _update_internal_users_password(self, secret_id: str) -> None: + """Update internal users' passwords in charm/valkey if they have changed. + + Args: + secret_id (str): The id of the secret containing the internal users' passwords. + """ try: - if new_password := self.charm.state.get_secret_from_id(admin_secret_id).get( - INTERNAL_USER - ): - # only update admin credentials if the password has changed - if new_password != self.charm.state.cluster.internal_user_credentials.get( - INTERNAL_USER - ): - logger.debug(f"{INTERNAL_USER_PASSWORD_CONFIG} have changed.") - try: - self.charm.config_manager.set_acl_file(new_password) - self.charm.cluster_manager.load_acl_file() - self.charm.state.cluster.update( - {"charmed_operator_password": new_password} - ) - except ValkeyClientError as e: - logger.error(e) - self.charm.status.set_running_status( - ClusterStatuses.PASSWORD_UPDATE_FAILED.value, - scope="app", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - return - else: - logger.error(f"Invalid username in secret {admin_secret_id}.") - self.charm.status.set_running_status( - ClusterStatuses.PASSWORD_UPDATE_FAILED.value, - scope="app", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - return + 
secret_content = self.charm.state.get_secret_from_id(secret_id) except (ops.ModelError, ops.SecretNotFoundError) as e: logger.error(e) self.charm.status.set_running_status( @@ -142,12 +138,58 @@ def update_admin_password(self, admin_secret_id: str) -> None: raise self.charm.state.statuses.delete( - ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + CharmStatuses.SECRET_ACCESS_ERROR.value, scope="app", component=self.charm.cluster_manager.name, ) + + # Check which passwords have changed + old_passwords = self.charm.state.cluster.internal_user_credentials + passwords = {user.value: old_passwords.get(user.value, "") for user in CharmUsers} + for user in CharmUsers: + new_password = secret_content.get(user.value) + if not new_password: + continue + # only update user credentials if the password has changed + if new_password != passwords.get(user.value): + logger.debug(f"Password for user {user.value} has changed.") + passwords[user.value] = new_password + + # check if there are any users that are in the secret but not in the CharmUsers + for key in secret_content.keys(): + if key not in passwords: + logger.error(f"Invalid username in secret {secret_id}.") + self.charm.status.set_running_status( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + return + + # Update passwords if any have changed + if passwords != old_passwords: + try: + self.charm.config_manager.set_acl_file(passwords=passwords) + self.charm.cluster_manager.load_acl_file() + self.charm.state.cluster.update( + { + f"{user.value.replace('-', '_')}_password": passwords[user.value] + for user in CharmUsers + } + ) + except ValkeyClientError as e: + logger.error(e) + self.charm.status.set_running_status( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + return + 
self.charm.state.statuses.delete( - CharmStatuses.SECRET_ACCESS_ERROR.value, + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, scope="app", component=self.charm.cluster_manager.name, ) diff --git a/src/literals.py b/src/literals.py index 61e0f04..7918351 100644 --- a/src/literals.py +++ b/src/literals.py @@ -4,6 +4,8 @@ """Collection of global literals for the Valkey charm.""" +from enum import Enum + CHARM = "valkey" CHARM_USER = "valkey" CONTAINER = "valkey" @@ -14,7 +16,27 @@ PEER_RELATION = "valkey-peers" STATUS_PEERS_RELATION = "status-peers" -INTERNAL_USER = "charmed-operator" -INTERNAL_USER_PASSWORD_CONFIG = "system-users" +INTERNAL_USERS_PASSWORD_CONFIG = "system-users" CLIENT_PORT = 6379 + + +# As per the valkey users spec +# https://docs.google.com/document/d/1EImKKHK3wLY73-D1M2ItpHe88NHeB-Iq2M3lz7AQB7E +class CharmUsers(str, Enum): + """Enumeration of Valkey charm users.""" + + VALKEY_ADMIN = "charmed-operator" + VALKEY_SENTINEL = "charmed-sentinel-valkey" + VALKEY_REPLICA = "charmed-replication" + + # Sentinel users + SENTINEL_ADMIN = "charmed-sentinel-operator" + + +CHARM_USERS_ROLE_MAP = { + CharmUsers.VALKEY_ADMIN: "~* +@all", + CharmUsers.VALKEY_SENTINEL: "+client +config +info +publish +subscribe +monitor +ping +replicaof +failover +script|kill +multi +exec &__sentinel__:hello", + CharmUsers.VALKEY_REPLICA: "+psync +replconf +ping", + CharmUsers.SENTINEL_ADMIN: "~* +@all", +} diff --git a/src/managers/cluster.py b/src/managers/cluster.py index ccb2681..fd0057b 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -14,7 +14,7 @@ from common.exceptions import ValkeyACLLoadError from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import INTERNAL_USER +from literals import CharmUsers from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -29,8 +29,10 @@ class ClusterManager(ManagerStatusProtocol): def __init__(self, state: ClusterState, workload: WorkloadBase): 
self.state = state self.workload = workload - self.admin_user = INTERNAL_USER - self.admin_password = self.state.cluster.internal_user_credentials.get(INTERNAL_USER, "") + self.admin_user = CharmUsers.VALKEY_ADMIN.value + self.admin_password = self.state.cluster.internal_user_credentials.get( + CharmUsers.VALKEY_ADMIN.value, "" + ) self.cluster_hostnames = [server.model.hostname for server in self.state.servers] def load_acl_file(self) -> None: diff --git a/src/managers/config.py b/src/managers/config.py index 32ae023..81710bd 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -16,7 +16,7 @@ from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import ACL_FILE, CLIENT_PORT, INTERNAL_USER +from literals import ACL_FILE, CHARM_USERS_ROLE_MAP, CLIENT_PORT, CharmUsers from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -76,27 +76,41 @@ def set_config_properties(self) -> None: logger.debug("Writing configuration") self.workload.write_config_file(config=self.config_properties) - def set_acl_file(self, charmed_operator_password: str = "") -> None: + def set_acl_file(self, passwords: dict[str, str] | None = None) -> None: """Write the ACL file with appropriate user permissions. Args: - charmed_operator_password (str): Password for the charmed-operator user. If not provided, - the password from the cluster state will be used. + passwords (dict[str, str] | None): Optional dictionary of passwords to use. If not provided, + the passwords from the cluster state will be used. 
""" logger.debug("Writing ACL configuration") - if not charmed_operator_password: - charmed_operator_password = self.state.cluster.internal_user_credentials.get( - INTERNAL_USER, "" - ) - # sha256 hash the password - charmed_operator_password_hash = hashlib.sha256( - charmed_operator_password.encode("utf-8") - ).hexdigest() - # write the ACL file acl_content = "user default off\n" - acl_content += f"user {INTERNAL_USER} on #{charmed_operator_password_hash} ~* +@all\n" + for user in CharmUsers: + # only process VALKEY users + # Sentinel users should be in the sentinel acl file + if "VALKEY_" not in str(user): + continue + acl_content += self._get_user_acl_line(user, passwords=passwords) self.workload.write_file(acl_content, ACL_FILE) + def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None = None) -> str: + """Generate an ACL line for a given user. + + Args: + user (CharmUsers): User for which to generate the ACL line. + passwords (dict[str, str] | None): Optional dictionary of passwords to use. If not provided, + the passwords from the cluster state will be used. + + Returns: + str: ACL line for the user. + """ + passwords = passwords or self.state.cluster.internal_user_credentials + if not (password := passwords.get(user.value, "")): + raise ValueError(f"No password found for user {user}") + password_hash = hashlib.sha256(password.encode("utf-8")).hexdigest() + acl_line = f"user {user.value} on #{password_hash} {CHARM_USERS_ROLE_MAP[user]}\n" + return acl_line + def generate_password(self) -> str: """Create randomized string for use as app passwords. 
From 16153067a9bf238048c24d2d996ba0d095f1acc1 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 27 Jan 2026 07:12:44 +0000 Subject: [PATCH 019/159] update passwords on non leader units --- src/core/models.py | 17 +++++++++++------ src/events/base_events.py | 37 ++++++++++++++++++++++++++++++------- src/literals.py | 2 +- src/managers/cluster.py | 2 +- src/managers/config.py | 2 +- 5 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/core/models.py b/src/core/models.py index 5796359..fdf00a3 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -9,26 +9,31 @@ import ops from charms.data_platform_libs.v1.data_interfaces import ( - ExtraSecretStr, OpsOtherPeerUnitRepositoryInterface, OpsPeerRepositoryInterface, OpsPeerUnitRepositoryInterface, + OptionalSecretStr, PeerModel, ) from pydantic import Field +from typing_extensions import Annotated from literals import CharmUsers logger = logging.getLogger(__name__) +InternalUsersSecret = Annotated[ + OptionalSecretStr, Field(exclude=True, default=None), "internal_users_secret" +] + class PeerAppModel(PeerModel): """Model for the peer application data.""" - charmed_operator_password: ExtraSecretStr = Field(default="") - charmed_sentinel_valkey_password: ExtraSecretStr = Field(default="") - charmed_replication_password: ExtraSecretStr = Field(default="") - charmed_sentinel_operator_password: ExtraSecretStr = Field(default="") + charmed_operator_password: InternalUsersSecret = Field(default="") + charmed_sentinel_valkey_password: InternalUsersSecret = Field(default="") + charmed_replication_password: InternalUsersSecret = Field(default="") + charmed_sentinel_operator_password: InternalUsersSecret = Field(default="") class PeerUnitModel(PeerModel): @@ -130,7 +135,7 @@ def model(self) -> PeerAppModel | None: return self.data_interface.build_model(self.relation.id) if self.relation else None @property - def internal_user_credentials(self) -> dict[str, str]: + def internal_users_credentials(self) -> 
dict[str, str]: """Retrieve the credentials for the internal admin user.""" passwords = {} if not self.model: diff --git a/src/events/base_events.py b/src/events/base_events.py index 6500647..96ab857 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -11,7 +11,12 @@ import ops from common.exceptions import ValkeyClientError -from literals import INTERNAL_USERS_PASSWORD_CONFIG, PEER_RELATION, CharmUsers +from literals import ( + INTERNAL_USERS_PASSWORD_CONFIG, + INTERNAL_USERS_SECRET_LABEL, + PEER_RELATION, + CharmUsers, +) from statuses import CharmStatuses, ClusterStatuses if TYPE_CHECKING: @@ -51,7 +56,7 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: event.defer() return - if self.charm.unit.is_leader() and not self.charm.state.cluster.internal_user_credentials: + if self.charm.unit.is_leader() and not self.charm.state.cluster.internal_users_credentials: passwords = {} user_specified_passwords = {} if admin_secret_id := self.charm.config.get(INTERNAL_USERS_PASSWORD_CONFIG): @@ -106,16 +111,34 @@ def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: """Handle the secret_changed event.""" - # TODO For a multi-node cluster the units should independently update their passwords. - # If they fail the event should be deferred and retried. 
if not self.charm.unit.is_leader(): + if event.secret.label and event.secret.label.endswith(INTERNAL_USERS_SECRET_LABEL): + # leader unit processed the secret change from user, non-leader units can replicate + try: + self.charm.config_manager.set_acl_file() + self.charm.cluster_manager.load_acl_file() + except ValkeyClientError as e: + logger.error(e) + self.charm.status.set_running_status( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + event.defer() + return + self.charm.state.statuses.delete( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component=self.charm.cluster_manager.name, + ) return if admin_secret_id := self.charm.config.get(INTERNAL_USERS_PASSWORD_CONFIG): if admin_secret_id == event.secret.id: try: self._update_internal_users_password(str(admin_secret_id)) - except (ops.ModelError, ops.SecretNotFoundError): + except (ops.ModelError, ops.SecretNotFoundError, ValkeyClientError): event.defer() return @@ -144,7 +167,7 @@ def _update_internal_users_password(self, secret_id: str) -> None: ) # Check which passwords have changed - old_passwords = self.charm.state.cluster.internal_user_credentials + old_passwords = self.charm.state.cluster.internal_users_credentials passwords = {user.value: old_passwords.get(user.value, "") for user in CharmUsers} for user in CharmUsers: new_password = secret_content.get(user.value) @@ -186,7 +209,7 @@ def _update_internal_users_password(self, secret_id: str) -> None: component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) - return + raise self.charm.state.statuses.delete( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, diff --git a/src/literals.py b/src/literals.py index 7918351..a2734d6 100644 --- a/src/literals.py +++ b/src/literals.py @@ -17,7 +17,7 @@ STATUS_PEERS_RELATION = "status-peers" INTERNAL_USERS_PASSWORD_CONFIG = "system-users" - 
+INTERNAL_USERS_SECRET_LABEL = "internal_users_secret" CLIENT_PORT = 6379 diff --git a/src/managers/cluster.py b/src/managers/cluster.py index fd0057b..e9671c7 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -30,7 +30,7 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): self.state = state self.workload = workload self.admin_user = CharmUsers.VALKEY_ADMIN.value - self.admin_password = self.state.cluster.internal_user_credentials.get( + self.admin_password = self.state.cluster.internal_users_credentials.get( CharmUsers.VALKEY_ADMIN.value, "" ) self.cluster_hostnames = [server.model.hostname for server in self.state.servers] diff --git a/src/managers/config.py b/src/managers/config.py index 81710bd..eec35a8 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -104,7 +104,7 @@ def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None Returns: str: ACL line for the user. """ - passwords = passwords or self.state.cluster.internal_user_credentials + passwords = passwords or self.state.cluster.internal_users_credentials if not (password := passwords.get(user.value, "")): raise ValueError(f"No password found for user {user}") password_hash = hashlib.sha256(password.encode("utf-8")).hexdigest() From 7aa45059da44e56b0e7903e4a637c362772dcd34 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 27 Jan 2026 07:33:12 +0000 Subject: [PATCH 020/159] chagne scope of status for units and fix exception catching --- src/events/base_events.py | 16 ++++++++-------- src/statuses.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 96ab857..68ca575 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -10,7 +10,7 @@ import ops -from common.exceptions import ValkeyClientError +from common.exceptions import ValkeyACLLoadError from literals import ( INTERNAL_USERS_PASSWORD_CONFIG, INTERNAL_USERS_SECRET_LABEL, @@ 
-117,11 +117,11 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: try: self.charm.config_manager.set_acl_file() self.charm.cluster_manager.load_acl_file() - except ValkeyClientError as e: + except ValkeyACLLoadError as e: logger.error(e) self.charm.status.set_running_status( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, - scope="app", + scope="unit", component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) @@ -129,7 +129,7 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: return self.charm.state.statuses.delete( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, - scope="app", + scope="unit", component=self.charm.cluster_manager.name, ) return @@ -138,7 +138,7 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: if admin_secret_id == event.secret.id: try: self._update_internal_users_password(str(admin_secret_id)) - except (ops.ModelError, ops.SecretNotFoundError, ValkeyClientError): + except (ops.ModelError, ops.SecretNotFoundError, ValkeyACLLoadError): event.defer() return @@ -201,11 +201,11 @@ def _update_internal_users_password(self, secret_id: str) -> None: for user in CharmUsers } ) - except ValkeyClientError as e: + except ValkeyACLLoadError as e: logger.error(e) self.charm.status.set_running_status( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, - scope="app", + scope="unit", component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) @@ -213,6 +213,6 @@ def _update_internal_users_password(self, secret_id: str) -> None: self.charm.state.statuses.delete( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, - scope="app", + scope="unit", component=self.charm.cluster_manager.name, ) diff --git a/src/statuses.py b/src/statuses.py index ba9234b..0f557a2 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -31,5 +31,5 @@ class ClusterStatuses(Enum): """Collection of possible cluster related statuses.""" PASSWORD_UPDATE_FAILED = StatusObject( - 
status="blocked", message="Failed to update the internal user's password", running="async" + status="blocked", message="Failed to update an internal user's password", running="async" ) From c63f21e9abf72a66c5a56ef4d07cbc4a522cd156 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 27 Jan 2026 10:21:09 +0000 Subject: [PATCH 021/159] fixing unit tests WIP --- tests/integration/k8s/helpers.py | 10 ++-- tests/integration/k8s/test_charm.py | 20 ++++---- tests/unit/test_charm.py | 74 ++++++++++++++++++----------- 3 files changed, 65 insertions(+), 39 deletions(-) diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index 98c2ba3..2b50ad2 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -14,7 +14,7 @@ from glide import GlideClient, GlideClientConfiguration, NodeAddress, ServerCredentials from ops import SecretNotFoundError, StatusBase -from literals import CLIENT_PORT, INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG +from literals import CLIENT_PORT, INTERNAL_USERS_PASSWORD_CONFIG, CharmUsers logger = logging.getLogger(__name__) @@ -176,7 +176,9 @@ def get_secret_by_label(juju: jubilant.Juju, label: str) -> dict[str, str]: async def create_valkey_client( - hostnames: list[str], username: str | None = INTERNAL_USER, password: str | None = None + hostnames: list[str], + username: str | None = CharmUsers.VALKEY_ADMIN, + password: str | None = None, ): """Create and return a Valkey client connected to the cluster. @@ -203,7 +205,7 @@ async def create_valkey_client( def set_password( juju: jubilant.Juju, password: str, - username: str = INTERNAL_USER, + username: str = CharmUsers.VALKEY_ADMIN, application: str = APP_NAME, ) -> None: """Set a user password (or update it if existing) via secret. 
@@ -228,7 +230,7 @@ def set_password( juju.grant_secret(identifier=secret_id, app=application) # update the application config to include the secret - juju.config(app=application, values={INTERNAL_USER_PASSWORD_CONFIG: secret_id}) + juju.config(app=application, values={INTERNAL_USERS_PASSWORD_CONFIG: secret_id}) async def set_key( diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 5104f76..66b9855 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -7,7 +7,11 @@ import jubilant import pytest -from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION +from literals import ( + INTERNAL_USERS_PASSWORD_CONFIG, + PEER_RELATION, + CharmUsers, +) from .helpers import ( APP_NAME, @@ -59,7 +63,7 @@ async def test_authentication(juju: jubilant.Juju) -> None: # Authenticate with internal user secret = get_secret_by_label(juju, label=f"{PEER_RELATION}.{APP_NAME}.app") - password = secret.get(f"{INTERNAL_USER}-password") + password = secret.get(f"{CharmUsers.VALKEY_ADMIN}-password") assert password is not None, "Admin password secret not found" client = await create_valkey_client(hostnames=hostnames, password=password) @@ -82,7 +86,7 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: # perform read operation with the updated password result = await set_key( hostnames=hostnames, - username=INTERNAL_USER, + username=CharmUsers.VALKEY_ADMIN, password=new_password, key=TEST_KEY, value=TEST_VALUE, @@ -90,14 +94,14 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: assert result == "OK", "Failed to write data after admin password update" # update the config again and remove the option `admin-password` - juju.config(app=APP_NAME, reset=[INTERNAL_USER_PASSWORD_CONFIG]) + juju.config(app=APP_NAME, reset=[INTERNAL_USERS_PASSWORD_CONFIG]) # wait for config-changed hook to finish executing juju.wait(lambda status: jubilant.all_agents_idle(status, 
APP_NAME), timeout=1200) # make sure we can still read data with the previously set password assert await get_key( - hostnames=hostnames, username=INTERNAL_USER, password=new_password, key=TEST_KEY + hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN, password=new_password, key=TEST_KEY ) == bytes(TEST_VALUE, "utf-8") @@ -109,10 +113,10 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: logger.info("Creating new user secret") secret_name = "my_secret" new_password = "even-newer-password" - secret_id = juju.add_secret(name=secret_name, content={INTERNAL_USER: new_password}) + secret_id = juju.add_secret(name=secret_name, content={CharmUsers.VALKEY_ADMIN: new_password}) logger.info("Updating configuration with the new secret - but without access") - juju.config(app=APP_NAME, values={INTERNAL_USER_PASSWORD_CONFIG: secret_id}) + juju.config(app=APP_NAME, values={INTERNAL_USERS_PASSWORD_CONFIG: secret_id}) juju.wait( lambda status: does_status_match( @@ -142,7 +146,7 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform read operation with the updated password assert await get_key( - hostnames=hostnames, username=INTERNAL_USER, password=new_password, key=TEST_KEY + hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN, password=new_password, key=TEST_KEY ) == bytes(TEST_VALUE, "utf-8"), "Failed to read data after secret permissions were updated" logger.info("Password update successful after secret was granted") diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 34b4d00..a1c61ae 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -5,16 +5,16 @@ from pathlib import Path from unittest.mock import patch -import pytest import yaml from ops import ActiveStatus, pebble, testing from src.charm import ValkeyCharm from src.literals import ( - INTERNAL_USER, - INTERNAL_USER_PASSWORD_CONFIG, + INTERNAL_USERS_PASSWORD_CONFIG, + INTERNAL_USERS_SECRET_LABEL, PEER_RELATION, 
STATUS_PEERS_RELATION, + CharmUsers, ) from src.statuses import CharmStatuses @@ -171,8 +171,10 @@ def test_internal_user_creation(): state_in = testing.State(relations={relation}, leader=True, containers={container}) with patch("workload_k8s.ValkeyK8sWorkload.write_file"): state_out = ctx.run(ctx.on.leader_elected(), state_in) - secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") - assert secret_out.latest_content.get(f"{INTERNAL_USER}-password") + secret_out = state_out.get_secret( + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}" + ) + assert secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") def test_leader_elected_no_peer_relation(): @@ -191,42 +193,57 @@ def test_leader_elected_leader_password_specified(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME ) state_in = testing.State( leader=True, relations={relation}, containers={container}, secrets={password_secret}, - config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("managers.config.ConfigManager.generate_password") as mock_generate, + patch( + "managers.config.ConfigManager.generate_password", return_value="generated-password" + ), ): state_out = ctx.run(ctx.on.leader_elected(), state_in) secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") - assert secret_out.latest_content.get(f"{INTERNAL_USER}-password") == "secure-password" - mock_generate.assert_not_called() + assert ( + secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + == "secure-password" + ) + secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") + for user 
in CharmUsers: + if user == CharmUsers.VALKEY_ADMIN: + assert secret_out.latest_content.get(f"{user.value}-password") == "secure-password" + continue + assert secret_out.latest_content.get(f"{user.value}-password") == "generated-password" def test_leader_elected_leader_password_specified_wrong_secret(): ctx = testing.Context(ValkeyCharm) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + status_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) container = testing.Container(name=CONTAINER, can_connect=True) state_in = testing.State( leader=True, - relations={relation}, + relations={relation, status_relation}, containers={container}, - config={INTERNAL_USER_PASSWORD_CONFIG: "secret:1tf1wk0tmfrodp8ofwxn"}, + config={INTERNAL_USERS_PASSWORD_CONFIG: "secret:1tf1wk0tmfrodp8ofwxn"}, ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - pytest.raises(testing.errors.UncaughtCharmError) as exc_info, + ctx(ctx.on.leader_elected(), state_in) as manager, ): - ctx.run(ctx.on.leader_elected(), state_in) - assert "SecretNotFoundError" in str(exc_info.value) + charm: ValkeyCharm = manager.charm + manager.run() + assert ( + charm.state.statuses.get(scope="app", component="cluster")[0] + == CharmStatuses.SECRET_ACCESS_ERROR.value + ) def test_config_changed_non_leader_unit(): @@ -234,7 +251,7 @@ def test_config_changed_non_leader_unit(): relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME ) state_in = testing.State( @@ -242,7 +259,7 @@ def test_config_changed_non_leader_unit(): relations={relation}, containers={container}, secrets={password_secret}, - config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + config={INTERNAL_USERS_PASSWORD_CONFIG: 
password_secret.id}, ) with ( patch("events.base_events.BaseEvents.update_admin_password") as mock_update, @@ -257,14 +274,14 @@ def test_config_changed_leader_unit_valkey_update_fails(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME ) state_in = testing.State( leader=True, relations={relation}, containers={container}, secrets={password_secret}, - config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), @@ -281,14 +298,14 @@ def test_config_changed_leader_unit(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME ) state_in = testing.State( leader=True, relations={relation}, containers={container}, secrets={password_secret}, - config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), @@ -299,7 +316,10 @@ def test_config_changed_leader_unit(): mock_set_acl_file.assert_called_once() mock_load_acl.assert_called_once() secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") - assert secret_out.latest_content.get(f"{INTERNAL_USER}-password") == "secure-password" + assert ( + secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + == "secure-password" + ) def test_config_changed_leader_unit_wrong_username(): @@ -315,7 +335,7 @@ def test_config_changed_leader_unit_wrong_username(): relations={relation}, containers={container}, secrets={password_secret}, 
- config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), @@ -331,7 +351,7 @@ def test_change_password_secret_changed_non_leader_unit(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME ) state_in = testing.State( @@ -339,7 +359,7 @@ def test_change_password_secret_changed_non_leader_unit(): relations={relation}, containers={container}, secrets={password_secret}, - config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( patch("events.base_events.BaseEvents.update_admin_password") as mock_update_password, @@ -354,7 +374,7 @@ def test_change_password_secret_changed_leader_unit(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={INTERNAL_USER: "secure-password"}, remote_grants=APP_NAME + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME ) state_in = testing.State( @@ -362,7 +382,7 @@ def test_change_password_secret_changed_leader_unit(): relations={relation}, containers={container}, secrets={password_secret}, - config={INTERNAL_USER_PASSWORD_CONFIG: password_secret.id}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( patch("events.base_events.BaseEvents.update_admin_password") as mock_update_password, From d8e2754cc7e03b96a971ebc0cce1a65ad0e6fd4a Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 28 Jan 2026 06:00:41 +0000 Subject: [PATCH 022/159] small charm restructure and enahnce unit tests --- src/events/base_events.py | 25 +++++------ tests/unit/test_charm.py | 90 +++++++++++++++++++++++++++++++++------ 2 files 
changed, 89 insertions(+), 26 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 68ca575..a3bbd74 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -105,7 +105,7 @@ def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: if admin_secret_id := self.charm.config.get(INTERNAL_USERS_PASSWORD_CONFIG): try: self._update_internal_users_password(str(admin_secret_id)) - except (ops.ModelError, ops.SecretNotFoundError): + except (ops.ModelError, ops.SecretNotFoundError, ValkeyACLLoadError): event.defer() return @@ -116,7 +116,7 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: # leader unit processed the secret change from user, non-leader units can replicate try: self.charm.config_manager.set_acl_file() - self.charm.cluster_manager.load_acl_file() + self.charm.cluster_manager.reload_acl_file() except ValkeyACLLoadError as e: logger.error(e) self.charm.status.set_running_status( @@ -169,14 +169,6 @@ def _update_internal_users_password(self, secret_id: str) -> None: # Check which passwords have changed old_passwords = self.charm.state.cluster.internal_users_credentials passwords = {user.value: old_passwords.get(user.value, "") for user in CharmUsers} - for user in CharmUsers: - new_password = secret_content.get(user.value) - if not new_password: - continue - # only update user credentials if the password has changed - if new_password != passwords.get(user.value): - logger.debug(f"Password for user {user.value} has changed.") - passwords[user.value] = new_password # check if there are any users that are in the secret but not in the CharmUsers for key in secret_content.keys(): @@ -190,11 +182,20 @@ def _update_internal_users_password(self, secret_id: str) -> None: ) return + for user in CharmUsers: + new_password = secret_content.get(user.value) + if not new_password: + continue + # only update user credentials if the password has changed + if new_password != passwords.get(user.value): 
+ logger.debug(f"Password for user {user.value} has changed.") + passwords[user.value] = new_password + # Update passwords if any have changed if passwords != old_passwords: try: self.charm.config_manager.set_acl_file(passwords=passwords) - self.charm.cluster_manager.load_acl_file() + self.charm.cluster_manager.reload_acl_file() self.charm.state.cluster.update( { f"{user.value.replace('-', '_')}_password": passwords[user.value] @@ -209,7 +210,7 @@ def _update_internal_users_password(self, secret_id: str) -> None: component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) - raise + raise e self.charm.state.statuses.delete( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index fca20e2..cb557fc 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -8,6 +8,7 @@ import yaml from ops import ActiveStatus, pebble, testing +from common.exceptions import ValkeyACLLoadError from src.charm import ValkeyCharm from src.literals import ( INTERNAL_USERS_PASSWORD_CONFIG, @@ -16,7 +17,7 @@ STATUS_PEERS_RELATION, CharmUsers, ) -from src.statuses import CharmStatuses +from src.statuses import CharmStatuses, ClusterStatuses from .helpers import status_is @@ -209,12 +210,9 @@ def test_leader_elected_leader_password_specified(): ), ): state_out = ctx.run(ctx.on.leader_elected(), state_in) - secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") - assert ( - secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") - == "secure-password" + secret_out = state_out.get_secret( + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}" ) - secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") for user in CharmUsers: if user == CharmUsers.VALKEY_ADMIN: assert secret_out.latest_content.get(f"{user.value}-password") == "secure-password" @@ -262,7 +260,7 @@ def test_config_changed_non_leader_unit(): 
config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( - patch("events.base_events.BaseEvents.update_admin_password") as mock_update, + patch("events.base_events.BaseEvents._update_internal_users_password") as mock_update, ): ctx.run(ctx.on.config_changed(), state_in) mock_update.assert_not_called() @@ -274,7 +272,8 @@ def test_config_changed_leader_unit_valkey_update_fails(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME + tracked_content={user.value: "secure-password" for user in CharmUsers}, + remote_grants=APP_NAME, ) state_in = testing.State( leader=True, @@ -315,7 +314,9 @@ def test_config_changed_leader_unit(): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() mock_load_acl.assert_called_once() - secret_out = state_out.get_secret(label=f"{PEER_RELATION}.{APP_NAME}.app") + secret_out = state_out.get_secret( + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}" + ) assert ( secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") == "secure-password" @@ -325,6 +326,7 @@ def test_config_changed_leader_unit(): def test_config_changed_leader_unit_wrong_username(): ctx = testing.Context(ValkeyCharm) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( @@ -332,7 +334,7 @@ def test_config_changed_leader_unit_wrong_username(): ) state_in = testing.State( leader=True, - relations={relation}, + relations={relation, status_peer_relation}, containers={container}, secrets={password_secret}, config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, @@ -340,8 +342,15 @@ def test_config_changed_leader_unit_wrong_username(): with ( 
patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, + ctx(ctx.on.config_changed(), state_in) as manager, ): - ctx.run(ctx.on.config_changed(), state_in) + charm: ValkeyCharm = manager.charm + manager.run() + cluster_statuses = charm.state.statuses.get( + scope="app", + component=charm.cluster_manager.name, + ) + assert ClusterStatuses.PASSWORD_UPDATE_FAILED.value in cluster_statuses mock_set_acl_file.assert_not_called() @@ -351,7 +360,9 @@ def test_change_password_secret_changed_non_leader_unit(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}", + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, + remote_grants=APP_NAME, ) state_in = testing.State( @@ -362,10 +373,59 @@ def test_change_password_secret_changed_non_leader_unit(): config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( - patch("events.base_events.BaseEvents.update_admin_password") as mock_update_password, + patch( + "events.base_events.BaseEvents._update_internal_users_password" + ) as mock_update_password, + patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, + patch("common.client.ValkeyClient.reload_acl") as mock_reload_acl, ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_not_called() + mock_set_acl_file.assert_called_once() + mock_reload_acl.assert_called_once() + + +def test_change_password_secret_changed_non_leader_unit_not_successful(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + statuses_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = 
testing.Secret( + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}", + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, + remote_grants=APP_NAME, + ) + + state_in = testing.State( + leader=False, + relations={relation, statuses_peer_relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch( + "events.base_events.BaseEvents._update_internal_users_password" + ) as mock_update_password, + patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, + patch( + "common.client.ValkeyClient.reload_acl", + side_effect=ValkeyACLLoadError("Reload failed"), + ) as mock_reload_acl, + ctx(ctx.on.secret_changed(password_secret), state_in) as manager, + ): + charm: ValkeyCharm = manager.charm + state_out = manager.run() + mock_update_password.assert_not_called() + mock_set_acl_file.assert_called_once() + mock_reload_acl.assert_called_once() + cluster_statuses = charm.state.statuses.get( + scope="unit", + component=charm.cluster_manager.name, + ) + assert "secret_changed" in [e.name for e in state_out.deferred] + assert ClusterStatuses.PASSWORD_UPDATE_FAILED.value in cluster_statuses def test_change_password_secret_changed_leader_unit(): @@ -385,7 +445,9 @@ def test_change_password_secret_changed_leader_unit(): config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, ) with ( - patch("events.base_events.BaseEvents.update_admin_password") as mock_update_password, + patch( + "events.base_events.BaseEvents._update_internal_users_password" + ) as mock_update_password, ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_called_once_with(password_secret.id) From fc9c9d30c6fd0a6c8f49b1e8463816ad0db4a525 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 28 Jan 2026 06:16:03 +0000 Subject: [PATCH 023/159] fix integration tests --- src/events/base_events.py | 6 ++++-- src/literals.py | 2 +- 
tests/integration/k8s/helpers.py | 13 +++++++++++-- tests/integration/k8s/test_charm.py | 29 ++++++++++++++++++++--------- 4 files changed, 36 insertions(+), 14 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index a3bbd74..10081fe 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -13,7 +13,7 @@ from common.exceptions import ValkeyACLLoadError from literals import ( INTERNAL_USERS_PASSWORD_CONFIG, - INTERNAL_USERS_SECRET_LABEL, + INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, CharmUsers, ) @@ -112,7 +112,9 @@ def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: """Handle the secret_changed event.""" if not self.charm.unit.is_leader(): - if event.secret.label and event.secret.label.endswith(INTERNAL_USERS_SECRET_LABEL): + if event.secret.label and event.secret.label.endswith( + INTERNAL_USERS_SECRET_LABEL_SUFFIX + ): # leader unit processed the secret change from user, non-leader units can replicate try: self.charm.config_manager.set_acl_file() diff --git a/src/literals.py b/src/literals.py index 77b658c..bc8d86b 100644 --- a/src/literals.py +++ b/src/literals.py @@ -18,7 +18,7 @@ STATUS_PEERS_RELATION = "status-peers" INTERNAL_USERS_PASSWORD_CONFIG = "system-users" -INTERNAL_USERS_SECRET_LABEL = "internal_users_secret" +INTERNAL_USERS_SECRET_LABEL_SUFFIX = "internal_users_secret" CLIENT_PORT = 6379 diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index 2b50ad2..43e8d50 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -14,7 +14,13 @@ from glide import GlideClient, GlideClientConfiguration, NodeAddress, ServerCredentials from ops import SecretNotFoundError, StatusBase -from literals import CLIENT_PORT, INTERNAL_USERS_PASSWORD_CONFIG, CharmUsers +from literals import ( + CLIENT_PORT, + INTERNAL_USERS_PASSWORD_CONFIG, + INTERNAL_USERS_SECRET_LABEL_SUFFIX, + 
PEER_RELATION, + CharmUsers, +) logger = logging.getLogger(__name__) @@ -22,6 +28,9 @@ METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) APP_NAME: str = METADATA["name"] IMAGE_RESOURCE = {"valkey-image": METADATA["resources"]["valkey-image"]["upstream-source"]} +INTERNAL_USERS_SECRET_LABEL = ( + f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" +) class CharmStatuses(Enum): @@ -205,7 +214,7 @@ async def create_valkey_client( def set_password( juju: jubilant.Juju, password: str, - username: str = CharmUsers.VALKEY_ADMIN, + username: str = CharmUsers.VALKEY_ADMIN.value, application: str = APP_NAME, ) -> None: """Set a user password (or update it if existing) via secret. diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 66b9855..10eebbc 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -9,13 +9,13 @@ from literals import ( INTERNAL_USERS_PASSWORD_CONFIG, - PEER_RELATION, CharmUsers, ) from .helpers import ( APP_NAME, IMAGE_RESOURCE, + INTERNAL_USERS_SECRET_LABEL, CharmStatuses, create_valkey_client, does_status_match, @@ -62,8 +62,8 @@ async def test_authentication(juju: jubilant.Juju) -> None: assert "NOAUTH" in str(exc_info.value), "Unauthenticated access did not fail as expected" # Authenticate with internal user - secret = get_secret_by_label(juju, label=f"{PEER_RELATION}.{APP_NAME}.app") - password = secret.get(f"{CharmUsers.VALKEY_ADMIN}-password") + secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) + password = secret.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") assert password is not None, "Admin password secret not found" client = await create_valkey_client(hostnames=hostnames, password=password) @@ -86,7 +86,7 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: # perform read operation with the updated password result = await set_key( hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN, + 
username=CharmUsers.VALKEY_ADMIN.value, password=new_password, key=TEST_KEY, value=TEST_VALUE, @@ -101,7 +101,10 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: # make sure we can still read data with the previously set password assert await get_key( - hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN, password=new_password, key=TEST_KEY + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + key=TEST_KEY, ) == bytes(TEST_VALUE, "utf-8") @@ -111,9 +114,11 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: hostnames = get_cluster_hostnames(juju, APP_NAME) logger.info("Creating new user secret") - secret_name = "my_secret" + secret_name = "my_secret_2" new_password = "even-newer-password" - secret_id = juju.add_secret(name=secret_name, content={CharmUsers.VALKEY_ADMIN: new_password}) + secret_id = juju.add_secret( + name=secret_name, content={CharmUsers.VALKEY_ADMIN.value: new_password} + ) logger.info("Updating configuration with the new secret - but without access") juju.config(app=APP_NAME, values={INTERNAL_USERS_PASSWORD_CONFIG: secret_id}) @@ -130,7 +135,7 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # deferred `config_changed` event will be retried before `update_status` with fast_forward(juju): juju.grant_secret(identifier=secret_name, app=APP_NAME) - sleep(10) # allow some time for the permission to propagate + sleep(20) # allow some time for the permission to propagate # juju.wait( # lambda status: jubilant.all_active(status, APP_NAME), @@ -146,7 +151,13 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform read operation with the updated password assert await get_key( - hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN, password=new_password, key=TEST_KEY + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + key=TEST_KEY, ) == bytes(TEST_VALUE, "utf-8"), "Failed to read data after 
secret permissions were updated" logger.info("Password update successful after secret was granted") + + +# TODO Once scaling is implemented, add tests to check on password update in non-leader units From 3bc87743377521cd74ac275d80f932d2f26beb2d Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 28 Jan 2026 06:22:39 +0000 Subject: [PATCH 024/159] add wrong username update test --- src/events/base_events.py | 5 +++++ tests/integration/k8s/test_charm.py | 34 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/events/base_events.py b/src/events/base_events.py index 10081fe..d316d12 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -219,3 +219,8 @@ def _update_internal_users_password(self, secret_id: str) -> None: scope="unit", component=self.charm.cluster_manager.name, ) + self.charm.state.statuses.delete( + ClusterStatuses.PASSWORD_UPDATE_FAILED.value, + scope="app", + component=self.charm.cluster_manager.name, + ) diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 10eebbc..ee9ce21 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -11,6 +11,7 @@ INTERNAL_USERS_PASSWORD_CONFIG, CharmUsers, ) +from statuses import ClusterStatuses from .helpers import ( APP_NAME, @@ -108,6 +109,39 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: ) == bytes(TEST_VALUE, "utf-8") +@pytest.mark.abort_on_fail +async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None: + """Assert the admin password is updated when adding a user secret to the config.""" + hostnames = get_cluster_hostnames(juju, APP_NAME) + + # create a user secret and grant it to the application + new_password = "some-password" + set_password(juju, username="wrong-username", password=new_password) + + # wait for config-changed hook to finish executing + juju.wait( + lambda status: does_status_match( + status, + expected_app_statuses={APP_NAME: 
[ClusterStatuses.PASSWORD_UPDATE_FAILED.value]}, + ), + timeout=1200, + ) + + set_password(juju, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) + # wait for config-changed hook to finish executing + juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + + # perform read operation with the updated password + result = await set_key( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + key=TEST_KEY, + value=TEST_VALUE, + ) + assert result == "OK", "Failed to write data after admin password update" + + @pytest.mark.abort_on_fail async def test_user_secret_permissions(juju: jubilant.Juju) -> None: """If a user secret is not granted, ensure we can process updated permissions.""" From b8128477310b28b7a2536f85d5748cb2504c80b3 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 28 Jan 2026 06:28:58 +0000 Subject: [PATCH 025/159] fix copilot feedback --- src/literals.py | 1 - tests/integration/k8s/helpers.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/literals.py b/src/literals.py index bc8d86b..1276be8 100644 --- a/src/literals.py +++ b/src/literals.py @@ -12,7 +12,6 @@ CONFIG_FILE = "/var/lib/valkey/valkey.conf" ACL_FILE = "/var/lib/valkey/users.acl" -ACL_FILE = "/var/lib/valkey/users.acl" PEER_RELATION = "valkey-peers" STATUS_PEERS_RELATION = "status-peers" diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index 43e8d50..56a24b0 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -186,7 +186,7 @@ def get_secret_by_label(juju: jubilant.Juju, label: str) -> dict[str, str]: async def create_valkey_client( hostnames: list[str], - username: str | None = CharmUsers.VALKEY_ADMIN, + username: str | None = CharmUsers.VALKEY_ADMIN.value, password: str | None = None, ): """Create and return a Valkey client connected to the cluster. 
From 913f85f46fd7a1cf938f453691c6012b61bdc2ad Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 28 Jan 2026 06:40:16 +0000 Subject: [PATCH 026/159] fix unit tests --- tests/unit/test_charm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index cb557fc..2063efe 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -12,7 +12,7 @@ from src.charm import ValkeyCharm from src.literals import ( INTERNAL_USERS_PASSWORD_CONFIG, - INTERNAL_USERS_SECRET_LABEL, + INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, STATUS_PEERS_RELATION, CharmUsers, @@ -173,7 +173,7 @@ def test_internal_user_creation(): with patch("workload_k8s.ValkeyK8sWorkload.write_file"): state_out = ctx.run(ctx.on.leader_elected(), state_in) secret_out = state_out.get_secret( - label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}" + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) assert secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") @@ -211,7 +211,7 @@ def test_leader_elected_leader_password_specified(): ): state_out = ctx.run(ctx.on.leader_elected(), state_in) secret_out = state_out.get_secret( - label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}" + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) for user in CharmUsers: if user == CharmUsers.VALKEY_ADMIN: @@ -315,7 +315,7 @@ def test_config_changed_leader_unit(): mock_set_acl_file.assert_called_once() mock_load_acl.assert_called_once() secret_out = state_out.get_secret( - label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}" + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) assert ( secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") @@ -360,7 +360,7 @@ def test_change_password_secret_changed_non_leader_unit(): container = testing.Container(name=CONTAINER, 
can_connect=True) password_secret = testing.Secret( - label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}", + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}", tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME, ) @@ -392,7 +392,7 @@ def test_change_password_secret_changed_non_leader_unit_not_successful(): container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( - label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL}", + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}", tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME, ) From 073f087b39ebe6508fec067e9722d8a29bdf0615 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 28 Jan 2026 13:49:54 +0000 Subject: [PATCH 027/159] add charm sentinel user --- src/core/models.py | 1 + src/literals.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/core/models.py b/src/core/models.py index fdf00a3..66bebf5 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -33,6 +33,7 @@ class PeerAppModel(PeerModel): charmed_operator_password: InternalUsersSecret = Field(default="") charmed_sentinel_valkey_password: InternalUsersSecret = Field(default="") charmed_replication_password: InternalUsersSecret = Field(default="") + charmed_sentinel_peers_password: InternalUsersSecret = Field(default="") charmed_sentinel_operator_password: InternalUsersSecret = Field(default="") diff --git a/src/literals.py b/src/literals.py index 1276be8..c541698 100644 --- a/src/literals.py +++ b/src/literals.py @@ -31,7 +31,8 @@ class CharmUsers(str, Enum): VALKEY_REPLICA = "charmed-replication" # Sentinel users - SENTINEL_ADMIN = "charmed-sentinel-operator" + SENTINEL_ADMIN = "charmed-sentinel-peers" + SENTINEL_CHARM_ADMIN = "charmed-sentinel-operator" CHARM_USERS_ROLE_MAP = { @@ -39,4 +40,5 @@ class CharmUsers(str, 
Enum): CharmUsers.VALKEY_SENTINEL: "+client +config +info +publish +subscribe +monitor +ping +replicaof +failover +script|kill +multi +exec &__sentinel__:hello", CharmUsers.VALKEY_REPLICA: "+psync +replconf +ping", CharmUsers.SENTINEL_ADMIN: "~* +@all", + CharmUsers.SENTINEL_CHARM_ADMIN: "~* +@all", } From f6a84891396522bc2fa0d133d3e2a48ba1c6804f Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 05:33:35 +0000 Subject: [PATCH 028/159] initial scale up implementation --- src/charm.py | 19 +- src/core/base_workload.py | 39 ++- src/core/models.py | 18 +- src/events/base_events.py | 45 ++- src/literals.py | 9 + src/managers/config-template/sentinel.conf | 361 +++++++++++++++++++++ src/managers/config.py | 76 ++++- src/workload_k8s.py | 40 ++- 8 files changed, 581 insertions(+), 26 deletions(-) create mode 100644 src/managers/config-template/sentinel.conf diff --git a/src/charm.py b/src/charm.py index 466527f..a1740df 100755 --- a/src/charm.py +++ b/src/charm.py @@ -11,7 +11,7 @@ from core.cluster_state import ClusterState from events.base_events import BaseEvents -from literals import CONTAINER +from literals import CHARM_USER, CONTAINER, DATA_DIR from managers.cluster import ClusterManager from managers.config import ConfigManager from workload_k8s import ValkeyK8sWorkload @@ -41,21 +41,28 @@ def __init__(self, *args) -> None: # --- EVENT HANDLERS --- self.base_events = BaseEvents(self) - # --- Observers - self.framework.observe(self.on.valkey_pebble_ready, self._on_pebble_ready) + # --- Observers --- + self.framework.observe(self.on.start, self._on_ready) - def _on_pebble_ready(self, event: ops.PebbleReadyEvent) -> None: + def _on_ready(self, event: ops.StartEvent) -> None: """Handle the `pebble-ready` event.""" if not self.workload.can_connect: logger.warning("Container not ready yet") event.defer() return - if not self.unit.is_leader(): - logger.warning("Scaling not implemented yet, services not started") + if not self.unit.is_leader() and ( + not 
self.state.cluster.internal_user_credentials + or not self.state.cluster.model.primary_ip + ): + logger.info("Deferring leader write primary and internal user credentials") + event.defer() return self.config_manager.set_config_properties() + self.config_manager.set_acl_file() + self.config_manager.set_sentinel_config() + self.workload.mkdir(DATA_DIR, user=CHARM_USER, group=CHARM_USER) self.workload.start() logger.info("Services started") self.state.unit_server.update({"started": True}) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index bed9210..2206c0a 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -4,8 +4,13 @@ """Base objects for workload operations across different substrates.""" +import logging +import socket +import subprocess from abc import ABC, abstractmethod +logger = logging.getLogger(__name__) + class WorkloadBase(ABC): """Base interface for common workload operations.""" @@ -31,11 +36,43 @@ def write_config_file(self, config: dict[str, str]) -> None: pass @abstractmethod - def write_file(self, content: str, path: str) -> None: + def write_file( + self, + content: str, + path: str, + mode: int | None = None, + user: str | None = None, + group: str | None = None, + ) -> None: """Write content to a file on disk. + Note: + mode, user, and group are optional parameters used only on k8s workloads. + Args: content (str): The content to be written. path (str): The file path where the content should be written. + mode (int, optional): The file mode (permissions). Defaults to None. + user (str, optional): The user name. Defaults to None. + group (str, optional): The group name. Defaults to None. 
""" pass + + def get_private_ip(self) -> str: + """Get the Private IP address of the current unit.""" + cmd = "unit-get private-address" + try: + output = subprocess.run( + cmd, + check=True, + text=True, + shell=True, + capture_output=True, + timeout=10, + ) + if output.returncode == 0: + return output.stdout.strip() + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + logger.error(f"Error executing command '{cmd}': {e}") + + return socket.gethostbyname(socket.gethostname()) diff --git a/src/core/models.py b/src/core/models.py index de27f03..d8555fc 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -17,7 +17,7 @@ ) from pydantic import Field -from literals import INTERNAL_USER +from literals import INTERNAL_USER, SENTINEL_USER logger = logging.getLogger(__name__) @@ -26,6 +26,8 @@ class PeerAppModel(PeerModel): """Model for the peer application data.""" charmed_operator_password: ExtraSecretStr = Field(default="") + charmed_replication_password: ExtraSecretStr = Field(default="") + primary_ip: str = Field(default="") class PeerUnitModel(PeerModel): @@ -33,6 +35,7 @@ class PeerUnitModel(PeerModel): started: bool = Field(default=False) hostname: str = Field(default="") + private_ip: str = Field(default="") class RelationState: @@ -129,7 +132,14 @@ def model(self) -> PeerAppModel | None: @property def internal_user_credentials(self) -> dict[str, str]: """Retrieve the credentials for the internal admin user.""" - if self.model and (password := self.model.charmed_operator_password): - return {INTERNAL_USER: password} + creds = {} - return {} + if not self.model: + return creds + + if self.model.charmed_operator_password: + creds[INTERNAL_USER] = self.model.charmed_operator_password + if self.model.charmed_replication_password: + creds[SENTINEL_USER] = self.model.charmed_replication_password + + return creds diff --git a/src/events/base_events.py b/src/events/base_events.py index 0125411..49b96ee 100644 --- a/src/events/base_events.py 
+++ b/src/events/base_events.py @@ -11,7 +11,7 @@ import ops from common.exceptions import ValkeyUserManagementError -from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION +from literals import INTERNAL_USER, INTERNAL_USER_PASSWORD_CONFIG, PEER_RELATION, SENTINEL_USER from statuses import CharmStatuses, ClusterStatuses if TYPE_CHECKING: @@ -50,25 +50,52 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: if not self.charm.state.peer_relation: event.defer() return + self.charm.state.unit_server.update( + { + "hostname": socket.gethostname(), + "private_ip": self.charm.workload.get_private_ip(), + } + ) + if not self.charm.state.cluster.model.primary_ip and self.charm.unit.is_leader(): + # set the primary to this unit if not already set + self.charm.state.cluster.update( + { + "primary_ip": self.charm.state.unit_server.model.private_ip, + } + ) if self.charm.unit.is_leader() and not self.charm.state.cluster.internal_user_credentials: + charmed_operator_password = "" + charmed_replication_password = "" if admin_secret_id := self.charm.config.get(INTERNAL_USER_PASSWORD_CONFIG): try: - password = self.charm.state.get_secret_from_id(str(admin_secret_id)).get( - INTERNAL_USER - ) + admin_secret = self.charm.state.get_secret_from_id(str(admin_secret_id)) + charmed_operator_password = admin_secret.get(INTERNAL_USER) + charmed_replication_password = admin_secret.get(SENTINEL_USER) except (ops.ModelError, ops.SecretNotFoundError) as e: logger.error(f"Could not access secret {admin_secret_id}: {e}") raise - else: - password = self.charm.config_manager.generate_password() - self.charm.state.cluster.update({"charmed_operator_password": password}) - self.charm.config_manager.set_acl_file() + if not charmed_operator_password: + charmed_operator_password = self.charm.config_manager.generate_password() + if not charmed_replication_password: + charmed_replication_password = self.charm.config_manager.generate_password() + + 
self.charm.state.cluster.update( + { + "charmed_operator_password": charmed_operator_password, + "charmed_replication_password": charmed_replication_password, + } + ) def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: """Handle the config_changed event.""" - self.charm.state.unit_server.update({"hostname": socket.gethostname()}) + self.charm.state.unit_server.update( + { + "hostname": socket.gethostname(), + "private_ip": self.charm.workload.get_private_ip(), + } + ) if not self.charm.unit.is_leader(): return diff --git a/src/literals.py b/src/literals.py index 61e0f04..3af238c 100644 --- a/src/literals.py +++ b/src/literals.py @@ -9,12 +9,21 @@ CONTAINER = "valkey" CONFIG_FILE = "/var/lib/valkey/valkey.conf" +VALKEY_LOG_FILE = "/var/lib/valkey/valkey.log" +SENTINEL_LOG_FILE = "/var/lib/valkey/sentinel.log" ACL_FILE = "/var/lib/valkey/users.acl" +SENTINEL_CONFIG_FILE = "/var/lib/valkey/sentinel.conf" +DATA_DIR = "/var/lib/valkey/data" PEER_RELATION = "valkey-peers" STATUS_PEERS_RELATION = "status-peers" INTERNAL_USER = "charmed-operator" +SENTINEL_USER = "charmed-replication" INTERNAL_USER_PASSWORD_CONFIG = "system-users" CLIENT_PORT = 6379 +SENTINEL_PORT = 26379 + +PRIMARY_NAME = "primary" +QUORUM_NUMBER = 2 diff --git a/src/managers/config-template/sentinel.conf b/src/managers/config-template/sentinel.conf new file mode 100644 index 0000000..abd5c60 --- /dev/null +++ b/src/managers/config-template/sentinel.conf @@ -0,0 +1,361 @@ +# Example sentinel.conf + +# By default protected mode is disabled in sentinel mode. Sentinel is reachable +# from interfaces different than localhost. Make sure the sentinel instance is +# protected from the outside world via firewalling or other means. +protected-mode no + +# port +# The port that this sentinel instance will run on +port 26379 + +# By default Valkey Sentinel does not run as a daemon. Use 'yes' if you need it. +# Note that Valkey will write a pid file in /var/run/valkey-sentinel.pid when +# daemonized. 
+daemonize no + +# When running daemonized, Valkey Sentinel writes a pid file in +# /var/run/valkey-sentinel.pid by default. You can specify a custom pid file +# location here. +pidfile /var/run/valkey-sentinel.pid + +# Specify the server verbosity level. +# This can be one of: +# debug (a lot of information, useful for development/testing) +# verbose (many rarely useful info, but not a mess like the debug level) +# notice (moderately verbose, what you want in production probably) +# warning (only very important / critical messages are logged) +# nothing (nothing is logged) +loglevel notice + +# Specify the log file name. Also the empty string can be used to force +# Sentinel to log on the standard output. Note that if you use standard +# output for logging but daemonize, logs will be sent to /dev/null +logfile "" + +# To enable logging to the system logger, just set 'syslog-enabled' to yes, +# and optionally update the other syslog parameters to suit your needs. +# syslog-enabled no + +# Specify the syslog identity. +# syslog-ident sentinel + +# Specify the syslog facility. Must be USER or between LOCAL0-LOCAL7. +# syslog-facility local0 + +# sentinel announce-ip +# sentinel announce-port +# +# The above two configuration directives are useful in environments where, +# because of NAT, Sentinel is reachable from outside via a non-local address. +# +# When announce-ip is provided, the Sentinel will claim the specified IP address +# in HELLO messages used to gossip its presence, instead of auto-detecting the +# local address as it usually does. +# +# Similarly when announce-port is provided and is valid and non-zero, Sentinel +# will announce the specified TCP port. +# +# The two options don't need to be used together, if only announce-ip is +# provided, the Sentinel will announce the specified IP and the server port +# as specified by the "port" option. 
If only announce-port is provided, the +# Sentinel will announce the auto-detected local IP and the specified port. +# +# Example: +# +# sentinel announce-ip 1.2.3.4 + +# dir +# Every long running process should have a well-defined working directory. +# For Valkey Sentinel to chdir to /tmp at startup is the simplest thing +# for the process to don't interfere with administrative tasks such as +# unmounting filesystems. +dir /tmp + +# sentinel monitor +# +# Tells Sentinel to monitor this master, and to consider it in O_DOWN +# (Objectively Down) state only if at least sentinels agree. +# +# Note that whatever is the ODOWN quorum, a Sentinel will require to +# be elected by the majority of the known Sentinels in order to +# start a failover, so no failover can be performed in minority. +# +# Replicas are auto-discovered, so you don't need to specify replicas in +# any way. Sentinel itself will rewrite this configuration file adding +# the replicas using additional configuration options. +# Also note that the configuration file is rewritten when a +# replica is promoted to master. +# +# Note: master name should not include special characters or spaces. +# The valid charset is A-z 0-9 and the three characters ".-_". +sentinel monitor mymaster 127.0.0.1 6379 2 + +# sentinel auth-pass +# +# Set the password to use to authenticate with the master and replicas. +# Useful if there is a password set in the Valkey instances to monitor. +# +# Note that the master password is also used for replicas, so it is not +# possible to set a different password in masters and replicas instances +# if you want to be able to monitor these instances with Sentinel. +# +# However you can have Valkey instances without the authentication enabled +# mixed with Valkey instances requiring the authentication (as long as the +# password set is the same for all the instances requiring the password) as +# the AUTH command will have no effect in Valkey instances with authentication +# switched off. 
+# +# Example: +# +# sentinel auth-pass mymaster MySUPER--secret-0123passw0rd + +# sentinel auth-user +# +# This is useful in order to authenticate to instances having ACL capabilities, +# that is, running Valkey. When just auth-pass is provided the +# Sentinel instance will authenticate to Valkey using the old "AUTH " +# method. When also an username is provided, it will use "AUTH ". +# In the Valkey servers side, the ACL to provide just minimal access to +# Sentinel instances, should be configured along the following lines: +# +# user sentinel-user >somepassword +subscribe +publish +failover +script|kill \ +# +ping +info +multi +slaveof +config +client +exec &__sentinel__:hello on +# +# Since Valkey Sentinel 9.0, the sentinel user requires the +failover permission +# on all monitored Valkey instances for proper operation. + +# sentinel down-after-milliseconds +# +# Number of milliseconds the master (or any attached replica or sentinel) should +# be unreachable (as in, not acceptable reply to PING, continuously, for the +# specified period) in order to consider it in S_DOWN state (Subjectively +# Down). +# +# Default is 30 seconds. +sentinel down-after-milliseconds mymaster 30000 + + +# Sentinel's ACL users are defined in the following format: +# +# user ... acl rules ... +# +# For example: +# +# user worker +@admin +@connection ~* on >ffa9203c493aa99 +# +# For more information about ACL configuration please refer to the Valkey +# website at https://valkey.io/topics/acl and valkey server configuration +# template valkey.conf. + +# ACL LOG +# +# The ACL Log tracks failed commands and authentication events associated +# with ACLs. The ACL Log is useful to troubleshoot failed commands blocked +# by ACLs. The ACL Log is stored in memory. You can reclaim memory with +# ACL LOG RESET. Define the maximum entry length of the ACL Log below. 
+acllog-max-len 128 + +# Using an external ACL file +# +# Instead of configuring users here in this file, it is possible to use +# a stand-alone file just listing users. The two methods cannot be mixed: +# if you configure users here and at the same time you activate the external +# ACL file, the server will refuse to start. +# +# The format of the external ACL user file is exactly the same as the +# format that is used inside valkey.conf to describe users. +# +# aclfile /etc/valkey/sentinel-users.acl + +# requirepass +# +# You can configure Sentinel itself to require a password, however when doing +# so Sentinel will try to authenticate with the same password to all the +# other Sentinels. So you need to configure all your Sentinels in a given +# group with the same "requirepass" password. Check the following documentation +# for more info: https://valkey.io/topics/sentinel +# +# IMPORTANT NOTE: "requirepass" is a compatibility +# layer on top of the ACL system. The option effect will be just setting +# the password for the default user. Clients will still authenticate using +# AUTH as usually, or more explicitly with AUTH default +# if they follow the new protocol: both will work. +# +# New config files are advised to use separate authentication control for +# incoming connections (via ACL), and for outgoing connections (via +# sentinel-user and sentinel-pass) +# +# The requirepass is not compatible with aclfile option and the ACL LOAD +# command, these will cause requirepass to be ignored. + +# sentinel sentinel-user +# +# You can configure Sentinel to authenticate with other Sentinels with specific +# user name. + +# sentinel sentinel-pass +# +# The password for Sentinel to authenticate with other Sentinels. If sentinel-user +# is not configured, Sentinel will use 'default' user with sentinel-pass to authenticate. + +# sentinel parallel-syncs +# +# How many replicas we can reconfigure to point to the new replica simultaneously +# during the failover. 
Use a low number if you use the replicas to serve query +# to avoid that all the replicas will be unreachable at about the same +# time while performing the synchronization with the master. +sentinel parallel-syncs mymaster 1 + +# sentinel failover-timeout +# +# Specifies the failover timeout in milliseconds. It is used in many ways: +# +# - The time needed to re-start a failover after a previous failover was +# already tried against the same master by a given Sentinel, is two +# times the failover timeout. +# +# - The time needed for a replica replicating to a wrong master according +# to a Sentinel current configuration, to be forced to replicate +# with the right master, is exactly the failover timeout (counting since +# the moment a Sentinel detected the misconfiguration). +# +# - The time needed to cancel a failover that is already in progress but +# did not produced any configuration change (SLAVEOF NO ONE yet not +# acknowledged by the promoted replica). +# +# - The maximum time a failover in progress waits for all the replicas to be +# reconfigured as replicas of the new master. However even after this time +# the replicas will be reconfigured by the Sentinels anyway, but not with +# the exact parallel-syncs progression as specified. +# +# Default is 3 minutes. +sentinel failover-timeout mymaster 180000 + +# SCRIPTS EXECUTION +# +# sentinel notification-script and sentinel reconfig-script are used in order +# to configure scripts that are called to notify the system administrator +# or to reconfigure clients after a failover. The scripts are executed +# with the following rules for error handling: +# +# If script exits with "1" the execution is retried later (up to a maximum +# number of times currently set to 10). +# +# If script exits with "2" (or an higher value) the script execution is +# not retried. +# +# If script terminates because it receives a signal the behavior is the same +# as exit code 1. 
+# +# A script has a maximum running time of 60 seconds. After this limit is +# reached the script is terminated with a SIGKILL and the execution retried. + +# NOTIFICATION SCRIPT +# +# sentinel notification-script +# +# Call the specified notification script for any sentinel event that is +# generated in the WARNING level (for instance -sdown, -odown, and so forth). +# This script should notify the system administrator via email, SMS, or any +# other messaging system, that there is something wrong with the monitored +# Valkey systems. +# +# The script is called with just two arguments: the first is the event type +# and the second the event description. +# +# The script must exist and be executable in order for sentinel to start if +# this option is provided. +# +# Example: +# +# sentinel notification-script mymaster /var/valkey/notify.sh + +# CLIENTS RECONFIGURATION SCRIPT +# +# sentinel client-reconfig-script +# +# When the master changed because of a failover a script can be called in +# order to perform application-specific tasks to notify the clients that the +# configuration has changed and the master is at a different address. +# +# The following arguments are passed to the script: +# +# +# +# is currently always "start" +# is either "leader" or "observer" +# +# The arguments from-ip, from-port, to-ip, to-port are used to communicate +# the old address of the master and the new address of the elected replica +# (now a master). +# +# This script should be resistant to multiple invocations. +# +# Example: +# +# sentinel client-reconfig-script mymaster /var/valkey/reconfig.sh + +# SECURITY +# +# By default SENTINEL SET will not be able to change the notification-script +# and client-reconfig-script at runtime. This avoids a trivial security issue +# where clients can set the script to anything and trigger a failover in order +# to get the program executed. 
+ +sentinel deny-scripts-reconfig yes + +# VALKEY COMMANDS RENAMING (DEPRECATED) +# +# WARNING: avoid using this option if possible, instead use ACLs. +# +# Sometimes the Valkey server has certain commands, that are needed for Sentinel +# to work correctly, renamed to unguessable strings. This is often the case +# of CONFIG and SLAVEOF in the context of providers that provide Valkey as +# a service, and don't want the customers to reconfigure the instances outside +# of the administration console. +# +# In such case it is possible to tell Sentinel to use different command names +# instead of the normal ones. For example if the master "mymaster", and the +# associated replicas, have "CONFIG" all renamed to "GUESSME", I could use: +# +# SENTINEL rename-command mymaster CONFIG GUESSME +# +# After such configuration is set, every time Sentinel would use CONFIG it will +# use GUESSME instead. Note that there is no actual need to respect the command +# case, so writing "config guessme" is the same in the example above. +# +# SENTINEL SET can also be used in order to perform this configuration at runtime. +# +# In order to set a command back to its original name (undo the renaming), it +# is possible to just rename a command to itself: +# +# SENTINEL rename-command mymaster CONFIG CONFIG + +# HOSTNAMES SUPPORT +# +# Normally Sentinel uses only IP addresses and requires SENTINEL MONITOR +# to specify an IP address. Also, it requires the Valkey replica-announce-ip +# keyword to specify only IP addresses. +# +# You may enable hostnames support by enabling resolve-hostnames. Note +# that you must make sure your DNS is configured properly and that DNS +# resolution does not introduce very long delays. +# +SENTINEL resolve-hostnames no + +# When resolve-hostnames is enabled, Sentinel still uses IP addresses +# when exposing instances to users, configuration files, etc. If you want +# to retain the hostnames when announced, enable announce-hostnames below. 
+# +SENTINEL announce-hostnames no + +# When primary-reboot-down-after-period is set to 0, Sentinel does not fail over +# when receiving a -LOADING response from a primary. This was the only supported +# behavior before Redis OSS 7.0. +# +# Otherwise, Sentinel will use this value as the time (in ms) it is willing to +# accept a -LOADING response after a primary has been rebooted, before failing +# over. + +SENTINEL primary-reboot-down-after-period mymaster 0 \ No newline at end of file diff --git a/src/managers/config.py b/src/managers/config.py index 32ae023..f1e8566 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -16,7 +16,18 @@ from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import ACL_FILE, CLIENT_PORT, INTERNAL_USER +from literals import ( + ACL_FILE, + CHARM_USER, + CLIENT_PORT, + DATA_DIR, + INTERNAL_USER, + PRIMARY_NAME, + QUORUM_NUMBER, + SENTINEL_CONFIG_FILE, + SENTINEL_PORT, + SENTINEL_USER, +) from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -42,6 +53,8 @@ def config_properties(self) -> dict[str, str]: Dictionary of properties to be written to the config file. 
""" config_properties = {} + if not self.state.unit_server.model or not self.state.cluster.model: + return config_properties # load the config properties provided from the template in this repo # it does NOT load the file from disk in the charm unit in order to avoid config drift @@ -60,6 +73,8 @@ def config_properties(self) -> dict[str, str]: config_properties[key.strip()] = value.strip() # Adjust default values + # dir + config_properties["dir"] = DATA_DIR # port config_properties["port"] = str(CLIENT_PORT) @@ -67,7 +82,26 @@ def config_properties(self) -> dict[str, str]: config_properties["bind"] = "0.0.0.0 -::1" # Use the ACL file - config_properties["aclfile"] = str(ACL_FILE) + config_properties["aclfile"] = ACL_FILE + + # # logfile location + # config_properties["logfile"] = VALKEY_LOG_FILE + + logger.debug( + "primary: %s, hostname: %s", + self.state.cluster.model.primary_ip, + self.state.unit_server.model.hostname, + ) + # replicaof + if ( + self.state.cluster.model.primary_ip + and self.state.cluster.model.primary_ip != self.state.unit_server.model.private_ip + ): + # set replicaof + logger.debug("Setting replicaof to primary %s", self.state.cluster.model.primary_ip) + config_properties["replicaof"] = f"{self.state.cluster.model.primary_ip} {CLIENT_PORT}" + config_properties["primaryuser"] = "replication-user" + config_properties["primaryauth"] = "testpassword" # TODO make this configurable return config_properties @@ -92,11 +126,49 @@ def set_acl_file(self, charmed_operator_password: str = "") -> None: charmed_operator_password_hash = hashlib.sha256( charmed_operator_password.encode("utf-8") ).hexdigest() + # sentinel user + charmed_replication_password = self.state.cluster.internal_user_credentials.get( + SENTINEL_USER, "" + ) + charmed_replication_password_hash = hashlib.sha256( + charmed_replication_password.encode("utf-8") + ).hexdigest() # write the ACL file acl_content = "user default off\n" acl_content += f"user {INTERNAL_USER} on 
#{charmed_operator_password_hash} ~* +@all\n" + acl_content += f"user {SENTINEL_USER} on #{charmed_replication_password_hash} +client +config +info +publish +subscribe +monitor +ping +replicaof +failover +script|kill +multi +exec &__sentinel__:hello\n" + # TODO make the replication user password configurable + acl_content += "user replication-user on >testpassword +psync +replconf +ping\n" self.workload.write_file(acl_content, ACL_FILE) + def set_sentinel_config(self) -> None: + """Write sentinel configuration file.""" + if not self.state.cluster.model or not self.state.cluster.model.primary_ip: + logger.warning("Cannot write sentinel config without primary details set") + return + if not ( + charmed_replication_password := self.state.cluster.internal_user_credentials.get( + SENTINEL_USER + ) + ): + logger.warning("Cannot write sentinel config without sentinel user credentials set") + return + logger.debug("Writing Sentinel configuration") + + sentinel_config = f"port {SENTINEL_PORT}\n" + # TODO consider adding quorum calculation based on number of units + sentinel_config += f"sentinel monitor {PRIMARY_NAME} {self.state.cluster.model.primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}\n" + sentinel_config += f"sentinel auth-user {PRIMARY_NAME} {SENTINEL_USER}\n" + sentinel_config += f"sentinel auth-pass {PRIMARY_NAME} {charmed_replication_password}\n" + # TODO consider making these configs adjustable via charm config + sentinel_config += f"sentinel down-after-milliseconds {PRIMARY_NAME} 30000\n" + sentinel_config += f"sentinel failover-timeout {PRIMARY_NAME} 180000\n" + sentinel_config += f"sentinel parallel-syncs {PRIMARY_NAME} 1\n" + + self.workload.write_file( + sentinel_config, SENTINEL_CONFIG_FILE, mode=0o600, user=CHARM_USER, group=CHARM_USER + ) + def generate_password(self) -> str: """Create randomized string for use as app passwords. 
diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 5e6b5a6..aa91898 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -11,7 +11,7 @@ from charmlibs import pathops from core.base_workload import WorkloadBase -from literals import CHARM, CHARM_USER, CONFIG_FILE +from literals import CHARM, CHARM_USER, CONFIG_FILE, SENTINEL_CONFIG_FILE logger = logging.getLogger(__name__) @@ -25,7 +25,9 @@ def __init__(self, container: ops.Container | None) -> None: self.container = container self.config_file = pathops.ContainerPath(CONFIG_FILE, container=container) + self.sentinel_config = pathops.ContainerPath(SENTINEL_CONFIG_FILE, container=container) self.valkey_service = "valkey" + self.sentinel_service = "valkey-sentinel" self.metric_service = "metric_exporter" @property @@ -48,6 +50,14 @@ def pebble_layer(self) -> ops.pebble.Layer: "group": CHARM_USER, "startup": "enabled", }, + self.sentinel_service: { + "override": "replace", + "summary": "Valkey sentinel service", + "command": f"valkey-sentinel {self.sentinel_config}", + "user": CHARM_USER, + "group": CHARM_USER, + "startup": "enabled", + }, self.metric_service: { "override": "replace", "summary": "Valkey metric exporter", @@ -63,7 +73,7 @@ def pebble_layer(self) -> ops.pebble.Layer: @override def start(self) -> None: self.container.add_layer(CHARM, self.pebble_layer, combine=True) - self.container.restart(self.valkey_service, self.metric_service) + self.container.restart(self.valkey_service, self.sentinel_service, self.metric_service) @override def write_config_file(self, config: dict[str, str]) -> None: @@ -73,12 +83,34 @@ def write_config_file(self, config: dict[str, str]) -> None: path.write_text(config_string) @override - def write_file(self, content: str, path: str) -> None: + def write_file( + self, + content: str, + path: str, + mode: int | None = None, + user: str | None = None, + group: str | None = None, + ) -> None: """Write content to a file on disk. 
Args: content (str): The content to be written. path (str): The file path where the content should be written. + mode (int, optional): The file mode (permissions). Defaults to None. + user (str, optional): The user name. Defaults to None. + group (str, optional): The group name. Defaults to None. """ file_path = pathops.ContainerPath(path, container=self.container) - file_path.write_text(content) + file_path.write_text(content, mode=mode, user=user, group=group) + + def mkdir( + self, path: str, mode: int = 0o755, user: str | None = None, group: str | None = None + ) -> None: + """Create a directory on disk. + + Args: + path (str): The directory path to be created. + mode (int, optional): The directory mode (permissions). Defaults to None. + """ + dir_path = pathops.ContainerPath(path, container=self.container) + dir_path.mkdir(mode=mode, user=user, group=group) From 12e63fb49a560f00a78697aca6a022750e7c81e0 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 05:55:40 +0000 Subject: [PATCH 029/159] set sentinel acl file --- src/charm.py | 11 ++++++----- src/literals.py | 1 + src/managers/config.py | 20 +++++++++++++++++++- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/charm.py b/src/charm.py index a1740df..55ac8e1 100755 --- a/src/charm.py +++ b/src/charm.py @@ -42,17 +42,17 @@ def __init__(self, *args) -> None: self.base_events = BaseEvents(self) # --- Observers --- - self.framework.observe(self.on.start, self._on_ready) + self.framework.observe(self.on.start, self._on_start) - def _on_ready(self, event: ops.StartEvent) -> None: - """Handle the `pebble-ready` event.""" + def _on_start(self, event: ops.StartEvent) -> None: + """Handle the `start` event.""" if not self.workload.can_connect: logger.warning("Container not ready yet") event.defer() return if not self.unit.is_leader() and ( - not self.state.cluster.internal_user_credentials + not self.state.cluster.internal_users_credentials or not self.state.cluster.model.primary_ip 
): logger.info("Deferring leader write primary and internal user credentials") @@ -61,7 +61,8 @@ def _on_ready(self, event: ops.StartEvent) -> None: self.config_manager.set_config_properties() self.config_manager.set_acl_file() - self.config_manager.set_sentinel_config() + self.config_manager.set_sentinel_config_properties() + self.config_manager.set_sentinel_acl_file() self.workload.mkdir(DATA_DIR, user=CHARM_USER, group=CHARM_USER) self.workload.start() logger.info("Services started") diff --git a/src/literals.py b/src/literals.py index 84e2274..e07f2c8 100644 --- a/src/literals.py +++ b/src/literals.py @@ -13,6 +13,7 @@ CONFIG_FILE = "/var/lib/valkey/valkey.conf" SENTINEL_CONFIG_FILE = "/var/lib/valkey/sentinel.conf" ACL_FILE = "/var/lib/valkey/users.acl" +SENTINEL_ACL_FILE = "/var/lib/valkey/sentinel-users.acl" DATA_DIR = "/var/lib/valkey/data" PEER_RELATION = "valkey-peers" diff --git a/src/managers/config.py b/src/managers/config.py index ca45c5b..5f6035c 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -24,6 +24,7 @@ DATA_DIR, PRIMARY_NAME, QUORUM_NUMBER, + SENTINEL_ACL_FILE, SENTINEL_CONFIG_FILE, SENTINEL_PORT, CharmUsers, @@ -145,7 +146,7 @@ def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None acl_line = f"user {user.value} on #{password_hash} {CHARM_USERS_ROLE_MAP[user]}\n" return acl_line - def set_sentinel_config(self) -> None: + def set_sentinel_config_properties(self) -> None: """Write sentinel configuration file.""" if not self.state.cluster.model or not self.state.cluster.model.primary_ip: logger.warning("Cannot write sentinel config without primary details set") @@ -177,6 +178,23 @@ def set_sentinel_config(self) -> None: sentinel_config, SENTINEL_CONFIG_FILE, mode=0o600, user=CHARM_USER, group=CHARM_USER ) + def set_sentinel_acl_file(self, passwords: dict[str, str] | None = None) -> None: + """Write the Sentinel ACL file with appropriate user permissions. 
+ + Args: + passwords (dict[str, str] | None): Optional dictionary of passwords to use. If not provided, + the passwords from the cluster state will be used. + """ + logger.debug("Writing Sentinel ACL configuration") + acl_content = "user default off\n" + for user in CharmUsers: + # only process VALKEY users + # Sentinel users should be in the sentinel acl file + if "VALKEY_" in str(user): + continue + acl_content += self._get_user_acl_line(user, passwords=passwords) + self.workload.write_file(acl_content, SENTINEL_ACL_FILE) + def generate_password(self) -> str: """Create randomized string for use as app passwords. From 935d794c389bd047fa57be286be945d1da38b275 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 06:07:32 +0000 Subject: [PATCH 030/159] add monitoring user --- src/core/models.py | 1 + src/literals.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/core/models.py b/src/core/models.py index 66bebf5..d0b91b0 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -33,6 +33,7 @@ class PeerAppModel(PeerModel): charmed_operator_password: InternalUsersSecret = Field(default="") charmed_sentinel_valkey_password: InternalUsersSecret = Field(default="") charmed_replication_password: InternalUsersSecret = Field(default="") + charmed_stats_password: InternalUsersSecret = Field(default="") charmed_sentinel_peers_password: InternalUsersSecret = Field(default="") charmed_sentinel_operator_password: InternalUsersSecret = Field(default="") diff --git a/src/literals.py b/src/literals.py index c541698..2b76437 100644 --- a/src/literals.py +++ b/src/literals.py @@ -29,6 +29,7 @@ class CharmUsers(str, Enum): VALKEY_ADMIN = "charmed-operator" VALKEY_SENTINEL = "charmed-sentinel-valkey" VALKEY_REPLICA = "charmed-replication" + VALKEY_MONITORING = "charmed-stats" # Sentinel users SENTINEL_ADMIN = "charmed-sentinel-peers" @@ -39,6 +40,7 @@ class CharmUsers(str, Enum): CharmUsers.VALKEY_ADMIN: "~* +@all", CharmUsers.VALKEY_SENTINEL: "+client +config 
+info +publish +subscribe +monitor +ping +replicaof +failover +script|kill +multi +exec &__sentinel__:hello", CharmUsers.VALKEY_REPLICA: "+psync +replconf +ping", + CharmUsers.VALKEY_MONITORING: "-@all +@connection +memory -readonly +strlen +config|get +xinfo +pfcount -quit +zcard +type +xlen -readwrite -command +client -wait +scard +llen +hlen +get +eval +slowlog +cluster|info +cluster|slots +cluster|nodes -hello -echo +info +latency +scan -reset -auth -asking", CharmUsers.SENTINEL_ADMIN: "~* +@all", CharmUsers.SENTINEL_CHARM_ADMIN: "~* +@all", } From 9b27441a713bbacf3e793a3ca16c08198793e197 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 06:11:19 +0000 Subject: [PATCH 031/159] revert back secret name --- tests/integration/k8s/test_charm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index ee9ce21..0ca8fa9 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -148,7 +148,7 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: hostnames = get_cluster_hostnames(juju, APP_NAME) logger.info("Creating new user secret") - secret_name = "my_secret_2" + secret_name = "my_secret" new_password = "even-newer-password" secret_id = juju.add_secret( name=secret_name, content={CharmUsers.VALKEY_ADMIN.value: new_password} From 3f2177ee2e9575eb173946e66474cc195f1ef956 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 06:28:17 +0000 Subject: [PATCH 032/159] update users for acls and configs --- src/managers/config.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/managers/config.py b/src/managers/config.py index 5f6035c..851fd3e 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -101,8 +101,10 @@ def config_properties(self) -> dict[str, str]: # set replicaof logger.debug("Setting replicaof to primary %s", self.state.cluster.model.primary_ip) 
config_properties["replicaof"] = f"{self.state.cluster.model.primary_ip} {CLIENT_PORT}" - config_properties["primaryuser"] = "replication-user" - config_properties["primaryauth"] = "testpassword" # TODO make this configurable + config_properties["primaryuser"] = CharmUsers.VALKEY_REPLICA.value + config_properties["primaryauth"] = self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_REPLICA.value, "" + ) return config_properties @@ -161,14 +163,21 @@ def set_sentinel_config_properties(self) -> None: logger.debug("Writing Sentinel configuration") sentinel_config = f"port {SENTINEL_PORT}\n" + + sentinel_config += f"aclfile {SENTINEL_ACL_FILE}\n" # TODO consider adding quorum calculation based on number of units sentinel_config += f"sentinel monitor {PRIMARY_NAME} {self.state.cluster.model.primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}\n" + # auth settings + # auth-user is used by sentinel to authenticate to the valkey primary sentinel_config += ( f"sentinel auth-user {PRIMARY_NAME} {CharmUsers.VALKEY_SENTINEL.value}\n" ) sentinel_config += ( f"sentinel auth-pass {PRIMARY_NAME} {charmed_sentinel_valkey_password}\n" ) + # sentinel admin user settings used by sentinel for its own authentication + sentinel_config += f"sentinel sentinel-user {CharmUsers.SENTINEL_ADMIN.value}\n" + sentinel_config += f"sentinel sentinel-pass {self.state.cluster.internal_users_credentials.get(CharmUsers.SENTINEL_ADMIN.value, '')}\n" # TODO consider making these configs adjustable via charm config sentinel_config += f"sentinel down-after-milliseconds {PRIMARY_NAME} 30000\n" sentinel_config += f"sentinel failover-timeout {PRIMARY_NAME} 180000\n" From 31c217a766da06964b53f541e40b746fba8cce85 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 09:12:37 +0000 Subject: [PATCH 033/159] add update primaryauth on password change --- src/common/client.py | 18 ++++++++++++++++++ src/common/exceptions.py | 4 ++++ src/events/base_events.py | 15 +++++++++++---- 
src/managers/cluster.py | 24 +++++++++++++++++++++++- 4 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index e092eec..fef79e8 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -16,6 +16,7 @@ from common.exceptions import ( ValkeyACLLoadError, + ValkeyConfigSetError, ValkeyCustomCommandError, ) from literals import CLIENT_PORT @@ -77,3 +78,20 @@ def reload_acl(self) -> None: except ValkeyCustomCommandError as e: logger.error(f"Error loading ACL: {e}") raise ValkeyACLLoadError(f"Could not load ACL: {e}") + + def set_runtime_config(self, config_properties: dict[str, str]) -> None: + """Set configuration properties on the Valkey server. + + Args: + config_properties (dict[str, str]): Configuration properties to set. + """ + try: + command = ["CONFIG", "SET"] + for key, value in config_properties.items(): + command.append(key) + command.append(value) + result = asyncio.run(self._run_custom_command(command)) + logger.debug("Config set result: %s", result) + except ValkeyCustomCommandError as e: + logger.error("Error setting config: %s", e) + raise ValkeyConfigSetError(f"Could not set config: {e}") diff --git a/src/common/exceptions.py b/src/common/exceptions.py index 71e16bc..ef81e29 100644 --- a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -14,3 +14,7 @@ class ValkeyCustomCommandError(ValkeyClientError): class ValkeyACLLoadError(ValkeyClientError): """Custom Exception if ACL file could not be loaded in valkey cluster.""" + + +class ValkeyConfigSetError(ValkeyClientError): + """Custom Exception if setting configuration on valkey cluster fails.""" diff --git a/src/events/base_events.py b/src/events/base_events.py index fa2f0b8..e90ed49 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -10,7 +10,7 @@ import ops -from common.exceptions import ValkeyACLLoadError +from common.exceptions import ValkeyACLLoadError, ValkeyConfigSetError from literals import ( 
INTERNAL_USERS_PASSWORD_CONFIG, INTERNAL_USERS_SECRET_LABEL_SUFFIX, @@ -137,7 +137,8 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: try: self.charm.config_manager.set_acl_file() self.charm.cluster_manager.reload_acl_file() - except ValkeyACLLoadError as e: + self.charm.cluster_manager.update_primary_auth() + except (ValkeyACLLoadError, ValkeyConfigSetError) as e: logger.error(e) self.charm.status.set_running_status( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, @@ -158,7 +159,12 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: if admin_secret_id == event.secret.id: try: self._update_internal_users_password(str(admin_secret_id)) - except (ops.ModelError, ops.SecretNotFoundError, ValkeyACLLoadError): + except ( + ops.ModelError, + ops.SecretNotFoundError, + ValkeyACLLoadError, + ValkeyConfigSetError, + ): event.defer() return @@ -216,13 +222,14 @@ def _update_internal_users_password(self, secret_id: str) -> None: try: self.charm.config_manager.set_acl_file(passwords=passwords) self.charm.cluster_manager.reload_acl_file() + self.charm.cluster_manager.update_primary_auth() self.charm.state.cluster.update( { f"{user.value.replace('-', '_')}_password": passwords[user.value] for user in CharmUsers } ) - except ValkeyACLLoadError as e: + except (ValkeyACLLoadError, ValkeyConfigSetError) as e: logger.error(e) self.charm.status.set_running_status( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 9e9492c..e8ed606 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -11,7 +11,7 @@ from data_platform_helpers.advanced_statuses.types import Scope from common.client import ValkeyClient -from common.exceptions import ValkeyACLLoadError +from common.exceptions import ValkeyACLLoadError, ValkeyConfigSetError from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import CharmUsers @@ -47,6 +47,28 @@ def 
reload_acl_file(self) -> None: except ValkeyACLLoadError: raise + def update_primary_auth(self) -> None: + """Update the primaryauth runtime configuration on the Valkey server.""" + if self.state.unit_server.model.private_ip == self.state.cluster.model.primary_ip: + logger.info("Current unit is primary; no need to update primaryauth") + return + try: + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + hosts=self.cluster_hostnames, + ) + client.set_runtime_config( + { + "primaryauth": self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_REPLICA.value, "" + ) + } + ) + logger.info("Updated primaryauth runtime configuration on Valkey server") + except ValkeyConfigSetError: + raise + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( From b5a9bea4d2508910692b79071030fed25a26277b Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 10:16:38 +0000 Subject: [PATCH 034/159] switch to ips instead of hostnames --- src/managers/cluster.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index e8ed606..2f82a05 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -33,7 +33,7 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): self.admin_password = self.state.cluster.internal_users_credentials.get( CharmUsers.VALKEY_ADMIN.value, "" ) - self.cluster_hostnames = [server.model.hostname for server in self.state.servers] + self.cluster_ips = [server.model.private_ip for server in self.state.servers] def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" @@ -41,7 +41,7 @@ def reload_acl_file(self) -> None: client = ValkeyClient( username=self.admin_user, password=self.admin_password, - hosts=self.cluster_hostnames, + hosts=self.cluster_ips, ) 
client.reload_acl() except ValkeyACLLoadError: @@ -56,7 +56,7 @@ def update_primary_auth(self) -> None: client = ValkeyClient( username=self.admin_user, password=self.admin_password, - hosts=self.cluster_hostnames, + hosts=self.cluster_ips, ) client.set_runtime_config( { From ef5415ad308744fa1a696eb98702f6a41449cd6b Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 10:37:10 +0000 Subject: [PATCH 035/159] fix unit tests and remove checks from manager --- src/managers/config.py | 14 +--- src/workload_k8s.py | 2 + tests/unit/test_charm.py | 144 ++++++++++++++++++++++++++++----------- 3 files changed, 106 insertions(+), 54 deletions(-) diff --git a/src/managers/config.py b/src/managers/config.py index 851fd3e..f6a3d5f 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -150,16 +150,6 @@ def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None def set_sentinel_config_properties(self) -> None: """Write sentinel configuration file.""" - if not self.state.cluster.model or not self.state.cluster.model.primary_ip: - logger.warning("Cannot write sentinel config without primary details set") - return - if not ( - charmed_sentinel_valkey_password := self.state.cluster.internal_users_credentials.get( - CharmUsers.VALKEY_SENTINEL.value - ) - ): - logger.warning("Cannot write sentinel config without sentinel user credentials set") - return logger.debug("Writing Sentinel configuration") sentinel_config = f"port {SENTINEL_PORT}\n" @@ -172,9 +162,7 @@ def set_sentinel_config_properties(self) -> None: sentinel_config += ( f"sentinel auth-user {PRIMARY_NAME} {CharmUsers.VALKEY_SENTINEL.value}\n" ) - sentinel_config += ( - f"sentinel auth-pass {PRIMARY_NAME} {charmed_sentinel_valkey_password}\n" - ) + sentinel_config += f"sentinel auth-pass {PRIMARY_NAME} {self.state.cluster.internal_users_credentials.get(CharmUsers.VALKEY_SENTINEL.value, '')}\n" # sentinel admin user settings used by sentinel for its own authentication 
sentinel_config += f"sentinel sentinel-user {CharmUsers.SENTINEL_ADMIN.value}\n" sentinel_config += f"sentinel sentinel-pass {self.state.cluster.internal_users_credentials.get(CharmUsers.SENTINEL_ADMIN.value, '')}\n" diff --git a/src/workload_k8s.py b/src/workload_k8s.py index aa91898..342d01e 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -111,6 +111,8 @@ def mkdir( Args: path (str): The directory path to be created. mode (int, optional): The directory mode (permissions). Defaults to None. + user (str, optional): The user name. Defaults to None. + group (str, optional): The group name. Defaults to None. """ dir_path = pathops.ContainerPath(path, container=self.container) dir_path.mkdir(mode=mode, user=user, group=group) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 2063efe..93a531c 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -25,12 +25,13 @@ CONTAINER = "valkey" SERVICE_VALKEY = "valkey" SERVICE_METRIC_EXPORTER = "metric_exporter" +SERVICE_SENTINEL = "valkey-sentinel" METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) APP_NAME = METADATA["name"] -def test_pebble_ready_leader_unit(cloud_spec): +def test_start_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) @@ -54,6 +55,14 @@ def test_pebble_ready_leader_unit(cloud_spec): "group": CHARM_USER, "startup": "enabled", }, + SERVICE_SENTINEL: { + "override": "replace", + "summary": "Valkey sentinel service", + "command": "valkey-sentinel /var/lib/valkey/sentinel.conf", + "user": CHARM_USER, + "group": CHARM_USER, + "startup": "enabled", + }, SERVICE_METRIC_EXPORTER: { "override": "replace", "summary": "Valkey metric exporter", @@ -65,34 +74,42 @@ def test_pebble_ready_leader_unit(cloud_spec): } } - state_out = ctx.run(ctx.on.pebble_ready(container), state_in) - assert 
state_out.get_container(container.name).plan == expected_plan - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] - == pebble.ServiceStatus.ACTIVE - ) - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] - == pebble.ServiceStatus.ACTIVE - ) - assert state_out.unit_status == ActiveStatus() - assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value, is_app=True) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("workload_k8s.ValkeyK8sWorkload.mkdir"), + ): + # generate passwords + state_out = ctx.run(ctx.on.leader_elected(), state_in) - # container not ready - container = testing.Container(name=CONTAINER, can_connect=False) - state_in = testing.State( - model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), - leader=True, - relations={relation, status_peer_relation}, - containers={container}, - ) + # start event + state_out = ctx.run(ctx.on.start(), state_out) + assert state_out.get_container(container.name).plan == expected_plan + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] + == pebble.ServiceStatus.ACTIVE + ) + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] + == pebble.ServiceStatus.ACTIVE + ) + assert state_out.unit_status == ActiveStatus() + assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value, is_app=True) + + # container not ready + container = testing.Container(name=CONTAINER, can_connect=False) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=True, + relations={relation, status_peer_relation}, + containers={container}, + ) - state_out = ctx.run(ctx.on.pebble_ready(container), state_in) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value, is_app=True) + state_out = 
ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) + assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value, is_app=True) -def test_pebble_ready_non_leader_unit(cloud_spec): +def test_start_non_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) @@ -106,24 +123,31 @@ def test_pebble_ready_non_leader_unit(cloud_spec): containers={container}, ) - state_out = ctx.run(ctx.on.pebble_ready(container), state_in) - assert not state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) - assert not state_out.get_container(container.name).service_statuses.get( - SERVICE_METRIC_EXPORTER - ) - assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("workload_k8s.ValkeyK8sWorkload.mkdir"), + ): + # generate passwords + state_out = ctx.run(ctx.on.leader_elected(), state_in) - # container not ready - container = testing.Container(name=CONTAINER, can_connect=False) - state_in = testing.State( - model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), - leader=True, - relations={relation, status_peer_relation}, - containers={container}, - ) + state_out = ctx.run(ctx.on.start(), state_out) + assert not state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) + assert not state_out.get_container(container.name).service_statuses.get( + SERVICE_METRIC_EXPORTER + ) + assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value) + + # container not ready + container = testing.Container(name=CONTAINER, can_connect=False) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=True, + relations={relation, status_peer_relation}, + 
containers={container}, + ) - state_out = ctx.run(ctx.on.pebble_ready(container), state_in) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) def test_update_status_leader_unit(cloud_spec): @@ -310,10 +334,48 @@ def test_config_changed_leader_unit(): patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch("common.client.ValkeyClient.reload_acl") as mock_load_acl, + patch("common.client.ValkeyClient.set_runtime_config") as mock_set_runtime_config, + ): + state_out = ctx.run(ctx.on.config_changed(), state_in) + mock_set_acl_file.assert_called_once() + mock_load_acl.assert_called_once() + mock_set_runtime_config.assert_called_once() + secret_out = state_out.get_secret( + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" + ) + assert ( + secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + == "secure-password" + ) + + +def test_config_changed_leader_unit_primary(): + ctx = testing.Context(ValkeyCharm) + relation = testing.PeerRelation( + id=1, endpoint=PEER_RELATION, local_app_data={"primary_ip": "127.0.1.1"} + ) + container = testing.Container(name=CONTAINER, can_connect=True) + + password_secret = testing.Secret( + tracked_content={CharmUsers.VALKEY_ADMIN.value: "secure-password"}, remote_grants=APP_NAME + ) + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + secrets={password_secret}, + config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, + ) + with ( + patch("workload_k8s.ValkeyK8sWorkload.write_file"), + patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, + patch("common.client.ValkeyClient.reload_acl") as mock_load_acl, + patch("common.client.ValkeyClient.set_runtime_config") as mock_set_runtime_config, ): state_out = 
ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() mock_load_acl.assert_called_once() + mock_set_runtime_config.assert_not_called() secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) From 888ffbd0a46c4c5ce62c30081e257befb2f86908 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 11:25:27 +0000 Subject: [PATCH 036/159] add statuses for starting --- src/charm.py | 25 ++++++++++++++++++++++++- src/core/base_workload.py | 5 +++++ src/managers/cluster.py | 6 +----- src/statuses.py | 15 +++++++++++---- src/workload_k8s.py | 12 ++++++++++++ 5 files changed, 53 insertions(+), 10 deletions(-) diff --git a/src/charm.py b/src/charm.py index 55ac8e1..a4fbd01 100755 --- a/src/charm.py +++ b/src/charm.py @@ -11,9 +11,10 @@ from core.cluster_state import ClusterState from events.base_events import BaseEvents -from literals import CHARM_USER, CONTAINER, DATA_DIR +from literals import CHARM_USER, CLIENT_PORT, CONTAINER, DATA_DIR from managers.cluster import ClusterManager from managers.config import ConfigManager +from statuses import ValkeyServiceStatuses from workload_k8s import ValkeyK8sWorkload logger = logging.getLogger(__name__) @@ -64,7 +65,29 @@ def _on_start(self, event: ops.StartEvent) -> None: self.config_manager.set_sentinel_config_properties() self.config_manager.set_sentinel_acl_file() self.workload.mkdir(DATA_DIR, user=CHARM_USER, group=CHARM_USER) + self.status.set_running_status( + ValkeyServiceStatuses.SERVICE_STARTING.value, + scope="unit", + component_name=self.cluster_manager.name, + statuses_state=self.state.statuses, + ) self.workload.start() + if self.workload.alive(): + logger.info("Workload started successfully. 
Opening client port") + self.unit.open_port("tcp", CLIENT_PORT) + self.state.statuses.delete( + ValkeyServiceStatuses.SERVICE_STARTING.value, + scope="unit", + component=self.cluster_manager.name, + ) + else: + logger.error("Workload failed to start.") + self.status.set_running_status( + ValkeyServiceStatuses.SERVICE_NOT_RUNNING.value, + scope="unit", + component_name=self.cluster_manager.name, + statuses_state=self.state.statuses, + ) logger.info("Services started") self.state.unit_server.update({"started": True}) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 2206c0a..1d48628 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -76,3 +76,8 @@ def get_private_ip(self) -> str: logger.error(f"Error executing command '{cmd}': {e}") return socket.gethostbyname(socket.gethostname()) + + @abstractmethod + def alive(self) -> bool: + """Check if the Valkey service is running.""" + pass diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 2f82a05..3df6dea 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -79,10 +79,6 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje status_list.append(CharmStatuses.SERVICE_NOT_STARTED.value) if not self.state.unit_server.is_started: - status_list.append(CharmStatuses.SCALING_NOT_IMPLEMENTED.value) - - if scope == "app": - # todo: remove when scaling is implemented - status_list.append(CharmStatuses.SCALING_NOT_IMPLEMENTED.value) + status_list.append(CharmStatuses.SERVICE_NOT_STARTED.value) return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/statuses.py b/src/statuses.py index 0f557a2..84f91c6 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -15,10 +15,6 @@ class CharmStatuses(Enum): """Collection of possible statuses for the charm.""" ACTIVE_IDLE = StatusObject(status="active", message="") - SCALING_NOT_IMPLEMENTED = StatusObject( - status="blocked", - message="Scaling Valkey is not 
implemented yet", - ) SERVICE_NOT_STARTED = StatusObject(status="blocked", message="Service not started") SECRET_ACCESS_ERROR = StatusObject( status="blocked", @@ -33,3 +29,14 @@ class ClusterStatuses(Enum): PASSWORD_UPDATE_FAILED = StatusObject( status="blocked", message="Failed to update an internal user's password", running="async" ) + + +class ValkeyServiceStatuses(Enum): + """Collection of possible Valkey service related statuses.""" + + SERVICE_STARTING = StatusObject( + status="maintenance", message="waiting for valkey to start...", running="async" + ) + SERVICE_NOT_RUNNING = StatusObject( + status="blocked", message="valkey service not running", running="async" + ) diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 342d01e..8e4d90c 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -116,3 +116,15 @@ def mkdir( """ dir_path = pathops.ContainerPath(path, container=self.container) dir_path.mkdir(mode=mode, user=user, group=group) + + def alive(self) -> bool: + """Check if the Valkey service is running.""" + for service_name in [ + self.valkey_service, + self.sentinel_service, + self.metric_service, + ]: + service = self.container.get_service(service_name) + if not service.is_running(): + return False + return True From 5f2bc81c88a04565bcb2af591b6715909d1f6771 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 29 Jan 2026 11:25:35 +0000 Subject: [PATCH 037/159] fix unit tests --- tests/integration/k8s/helpers.py | 4 ---- tests/unit/test_charm.py | 39 +++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index 56a24b0..52bf28f 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -36,10 +36,6 @@ class CharmStatuses(Enum): """List all StatusObjects here that are checked against in the integration tests.""" - SCALING_NOT_IMPLEMENTED = StatusObject( - status="blocked", - message="Scaling Valkey is 
not implemented yet", - ) SECRET_ACCESS_ERROR = StatusObject( status="blocked", message="Cannot access configured secret, check permissions", diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 93a531c..ac00ba0 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -30,6 +30,12 @@ METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) APP_NAME = METADATA["name"] +internal_passwords_secret = testing.Secret( + tracked_content={f"{user.value}-password": "secure-password" for user in CharmUsers}, + owner="app", + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}", +) + def test_start_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) @@ -93,7 +99,7 @@ def test_start_leader_unit(cloud_spec): == pebble.ServiceStatus.ACTIVE ) assert state_out.unit_status == ActiveStatus() - assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value, is_app=True) + assert state_out.app_status == ActiveStatus() # container not ready container = testing.Container(name=CONTAINER, can_connect=False) @@ -127,15 +133,30 @@ def test_start_non_leader_unit(cloud_spec): patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("workload_k8s.ValkeyK8sWorkload.mkdir"), ): - # generate passwords - state_out = ctx.run(ctx.on.leader_elected(), state_in) - - state_out = ctx.run(ctx.on.start(), state_out) + state_out = ctx.run(ctx.on.start(), state_in) assert not state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) assert not state_out.get_container(container.name).service_statuses.get( SERVICE_METRIC_EXPORTER ) - assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value) + assert "start" in [e.name for e in state_out.deferred] + + relation = testing.PeerRelation( + id=1, endpoint=PEER_RELATION, local_app_data={"primary_ip": "127.1.0.1"} + ) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=False, 
+ relations={relation, status_peer_relation}, + secrets={internal_passwords_secret}, + containers={container}, + ) + state_out = ctx.run(ctx.on.start(), state_in) + assert state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) + assert state_out.get_container(container.name).service_statuses.get( + SERVICE_METRIC_EXPORTER + ) + assert state_out.get_container(container.name).service_statuses[SERVICE_SENTINEL] + assert state_out.get_relation(1).local_unit_data["started"] == "true" # container not ready container = testing.Container(name=CONTAINER, can_connect=False) @@ -174,7 +195,9 @@ def test_update_status_leader_unit(cloud_spec): def test_update_status_non_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) - relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + relation = testing.PeerRelation( + id=1, endpoint=PEER_RELATION, local_unit_data={"started": "true"} + ) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) container = testing.Container(name=CONTAINER, can_connect=True) @@ -185,7 +208,7 @@ def test_update_status_non_leader_unit(cloud_spec): containers={container}, ) state_out = ctx.run(ctx.on.update_status(), state_in) - assert status_is(state_out, CharmStatuses.SCALING_NOT_IMPLEMENTED.value) + assert state_out.unit_status == ActiveStatus() def test_internal_user_creation(): From 79b45f0cfbc17d91b372aa24ec6c1e3460568ef0 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 30 Jan 2026 06:28:05 +0000 Subject: [PATCH 038/159] switch from valkey glide to valkey cli with subprocess --- src/common/exceptions.py | 4 ++++ src/core/base_workload.py | 16 +++++++++++++++ src/core/cluster_state.py | 5 +++-- src/managers/cluster.py | 43 +++++++++++++++++---------------------- src/workload_k8s.py | 25 +++++++++++++++++++++++ 5 files changed, 67 insertions(+), 26 deletions(-) diff --git a/src/common/exceptions.py b/src/common/exceptions.py index ef81e29..3a78681 100644 --- 
a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -18,3 +18,7 @@ class ValkeyACLLoadError(ValkeyClientError): class ValkeyConfigSetError(ValkeyClientError): """Custom Exception if setting configuration on valkey cluster fails.""" + + +class ValkeyExecCommandError(Exception): + """Custom Exception if exec command on valkey container fails.""" diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 1d48628..70e4331 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -81,3 +81,19 @@ def get_private_ip(self) -> str: def alive(self) -> bool: """Check if the Valkey service is running.""" pass + + @abstractmethod + def exec_command( + self, command: list[str], username: str, password: str + ) -> tuple[str, str | None] | None: + """Execute a Valkey command inside the workload. + + Args: + command (list[str]): The command to execute as a list of strings. + username (str): The username for authentication. + password (str): The password for authentication. + + Returns: + bool: True if the command executed successfully, False otherwise. + """ + pass diff --git a/src/core/cluster_state.py b/src/core/cluster_state.py index 6f62510..1eda942 100644 --- a/src/core/cluster_state.py +++ b/src/core/cluster_state.py @@ -100,18 +100,19 @@ def servers(self) -> set[ValkeyServer]: return servers - def get_secret_from_id(self, secret_id: str) -> dict[str, str]: + def get_secret_from_id(self, secret_id: str, refresh: bool = False) -> dict[str, str]: """Resolve the given id of a Juju secret and return the content as a dict. Args: model (Model): Model object. secret_id (str): The id of the secret. + refresh (bool): Whether to refresh the secret content from the controller. Defaults to False. Returns: dict: The content of the secret. 
""" try: - secret_content = self.charm.model.get_secret(id=secret_id).get_content(refresh=True) + secret_content = self.charm.model.get_secret(id=secret_id).get_content(refresh=refresh) except ops.SecretNotFoundError: raise ops.SecretNotFoundError(f"The secret '{secret_id}' does not exist.") except ops.ModelError: diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 3df6dea..d37f836 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -10,8 +10,7 @@ from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope -from common.client import ValkeyClient -from common.exceptions import ValkeyACLLoadError, ValkeyConfigSetError +from common.exceptions import ValkeyACLLoadError, ValkeyConfigSetError, ValkeyExecCommandError from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import CharmUsers @@ -30,22 +29,18 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): self.state = state self.workload = workload self.admin_user = CharmUsers.VALKEY_ADMIN.value - self.admin_password = self.state.cluster.internal_users_credentials.get( - CharmUsers.VALKEY_ADMIN.value, "" - ) - self.cluster_ips = [server.model.private_ip for server in self.state.servers] + self.admin_password = self.state.unit_server.valkey_admin_password + # target only the unit's valkey server IP + self.cluster_ips = [self.workload.get_private_ip()] def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" try: - client = ValkeyClient( - username=self.admin_user, - password=self.admin_password, - hosts=self.cluster_ips, + self.workload.exec_command( + ["acl", "load"], username=self.admin_user, password=self.admin_password ) - client.reload_acl() - except ValkeyACLLoadError: - raise + except ValkeyExecCommandError: + raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") def update_primary_auth(self) 
-> None: """Update the primaryauth runtime configuration on the Valkey server.""" @@ -53,21 +48,21 @@ def update_primary_auth(self) -> None: logger.info("Current unit is primary; no need to update primaryauth") return try: - client = ValkeyClient( + self.workload.exec_command( + [ + "config", + "set", + "primaryauth", + self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_REPLICA.value, "" + ), + ], username=self.admin_user, password=self.admin_password, - hosts=self.cluster_ips, - ) - client.set_runtime_config( - { - "primaryauth": self.state.cluster.internal_users_credentials.get( - CharmUsers.VALKEY_REPLICA.value, "" - ) - } ) logger.info("Updated primaryauth runtime configuration on Valkey server") - except ValkeyConfigSetError: - raise + except ValkeyExecCommandError: + raise ValkeyConfigSetError("Could not set primaryauth on Valkey server.") def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 8e4d90c..cba2077 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -10,6 +10,7 @@ import ops from charmlibs import pathops +from common.exceptions import ValkeyExecCommandError from core.base_workload import WorkloadBase from literals import CHARM, CHARM_USER, CONFIG_FILE, SENTINEL_CONFIG_FILE @@ -128,3 +129,27 @@ def alive(self) -> bool: if not service.is_running(): return False return True + + def exec_command( + self, command: list[str], username: str, password: str + ) -> tuple[str, str | None] | None: + """Execute a Valkey command inside the container. + + Args: + command (list[str]): The command to execute as a list of strings. + username (str): The username for authentication. + password (str): The password for authentication. + + Returns: + bool: True if the command executed successfully, False otherwise. 
+ """ + full_command = ["valkey-cli"] + ["--user", username, "--pass", password] + command + try: + process = self.container.exec(full_command) + out, err = process.wait_output() + if err: + logger.warning("Command returned error: %s", err) + return out.strip(), err.strip() if err else None + except (ops.pebble.ExecError, ops.pebble.ChangeError) as e: + logger.error("Error executing command: %s", e) + raise ValkeyExecCommandError(f"Could not execute command{e}") From b1b4f0666d4aa4dde99a8e41b1aa534d2604776f Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 30 Jan 2026 06:28:32 +0000 Subject: [PATCH 039/159] add unit local admin password and fix integration tests --- poetry.lock | 29 +++++++++++- pyproject.toml | 1 + src/charm.py | 1 + src/core/models.py | 8 ++++ src/events/base_events.py | 8 +++- src/managers/config.py | 13 +++++ tests/integration/k8s/helpers.py | 73 ++++++++++++++++++++++++----- tests/integration/k8s/test_charm.py | 45 ++++++++---------- 8 files changed, 139 insertions(+), 39 deletions(-) diff --git a/poetry.lock b/poetry.lock index 86b3887..b2873ab 100644 --- a/poetry.lock +++ b/poetry.lock @@ -713,6 +713,21 @@ pytest = ">=6.2.5" [package.extras] dev = ["pre-commit", "pytest-asyncio", "tox"] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["integration"] +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "pyyaml" version = "6.0.3" @@ -859,6 +874,18 @@ files = [ {file = "shellcheck_py-0.11.0.1.tar.gz", hash = "sha256:5c620c88901e8f1d3be5934b31ea99e3310065e1245253741eafd0a275c8c9cc"}, ] 
+[[package]] +name = "six" +version = "1.17.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["integration"] +files = [ + {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, + {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -960,4 +987,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "9721ba0790a1a564baa26313d5d1385a916ff9e9a510dd00c8b559b14247d55a" +content-hash = "f2a6e74276e2fa70da78db10de6e45c0c8047c900b1faf7ec3564f7d5da28c21" diff --git a/pyproject.toml b/pyproject.toml index 6b0ae59..73fc9d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,7 @@ allure-pytest-default-results = "^0.1.2" data-platform-helpers = ">=0.1.7" jubilant = "^1.6.0" valkey-glide = { git = "https://github.com/skourta/valkey-glide", subdirectory = "python/glide-async", branch = "add-build-rs" } +python-dateutil = "*" [tool.coverage.run] branch = true diff --git a/src/charm.py b/src/charm.py index a4fbd01..075735a 100755 --- a/src/charm.py +++ b/src/charm.py @@ -60,6 +60,7 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return + self.config_manager.update_local_valkey_admin() self.config_manager.set_config_properties() self.config_manager.set_acl_file() self.config_manager.set_sentinel_config_properties() diff --git a/src/core/models.py b/src/core/models.py index fbfdcfe..8b6e942 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -42,6 +42,7 @@ class PeerAppModel(PeerModel): class PeerUnitModel(PeerModel): """Model for the peer unit data.""" + charmed_operator_password: InternalUsersSecret = Field(default="") started: bool = Field(default=False) hostname: str = Field(default="") private_ip: str = Field(default="") @@ 
-118,6 +119,13 @@ def is_started(self) -> bool: """Check if the unit has started.""" return self.model.started if self.model else False + @property + def valkey_admin_password(self) -> str: + """Retrieve the password for the valkey admin user.""" + if not self.model: + return "" + return self.model.charmed_operator_password or "" + @final class ValkeyCluster(RelationState): diff --git a/src/events/base_events.py b/src/events/base_events.py index e90ed49..7ddf253 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -106,6 +106,8 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: for user in CharmUsers } ) + # update local unit admin password + self.charm.config_manager.update_local_valkey_admin() self.charm.config_manager.set_acl_file() def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: @@ -138,6 +140,8 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: self.charm.config_manager.set_acl_file() self.charm.cluster_manager.reload_acl_file() self.charm.cluster_manager.update_primary_auth() + # update the local unit admin password to match the leader + self.charm.config_manager.update_local_valkey_admin() except (ValkeyACLLoadError, ValkeyConfigSetError) as e: logger.error(e) self.charm.status.set_running_status( @@ -175,7 +179,7 @@ def _update_internal_users_password(self, secret_id: str) -> None: secret_id (str): The id of the secret containing the internal users' passwords. 
""" try: - secret_content = self.charm.state.get_secret_from_id(secret_id) + secret_content = self.charm.state.get_secret_from_id(secret_id, refresh=True) except (ops.ModelError, ops.SecretNotFoundError) as e: logger.error(e) self.charm.status.set_running_status( @@ -229,6 +233,8 @@ def _update_internal_users_password(self, secret_id: str) -> None: for user in CharmUsers } ) + # update the local unit admin password + self.charm.config_manager.update_local_valkey_admin() except (ValkeyACLLoadError, ValkeyConfigSetError) as e: logger.error(e) self.charm.status.set_running_status( diff --git a/src/managers/config.py b/src/managers/config.py index f6a3d5f..3288376 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -200,6 +200,19 @@ def generate_password(self) -> str: """ return "".join([secrets.choice(string.ascii_letters + string.digits) for _ in range(32)]) + def update_local_valkey_admin(self) -> None: + """Update the local unit's valkey admin password in the state.""" + if not ( + app_password := self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_ADMIN.value + ) + ): + logger.warning("No valkey admin password found to update local unit state") + return + self.state.unit_server.update( + {f"{CharmUsers.VALKEY_ADMIN.value.replace('-', '_')}_password": app_password} + ) + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the config manager's statuses.""" status_list: list[StatusObject] = [] diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index 52bf28f..9416045 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -4,13 +4,14 @@ import contextlib import logging -from enum import Enum +from datetime import datetime, timedelta from pathlib import Path from typing import List import jubilant import yaml from data_platform_helpers.advanced_statuses.models import StatusObject +from dateutil.parser import parse from glide 
import GlideClient, GlideClientConfiguration, NodeAddress, ServerCredentials from ops import SecretNotFoundError, StatusBase @@ -33,16 +34,6 @@ ) -class CharmStatuses(Enum): - """List all StatusObjects here that are checked against in the integration tests.""" - - SECRET_ACCESS_ERROR = StatusObject( - status="blocked", - message="Cannot access configured secret, check permissions", - running="async", - ) - - def does_status_match( model_status: jubilant.Status, expected_unit_statuses: dict[str, List[StatusObject]] | None = None, @@ -127,8 +118,66 @@ def does_message_match(expected_status_message: str, status: StatusObject) -> bo return False +def are_apps_active_and_agents_idle( + status: jubilant.Status, + *apps: str, + idle_period: int = 0, + unit_count: int | dict[str, int] | None = None, +) -> bool: + """Check that all given apps are active, their agents idle (optional idle interval too) and optionally verify unit count as well. + + Args: + status: represents the jubilant model's current status + apps: A list of applications whose statuses to test against + idle_period: Seconds to wait for the agents of each application unit to be idle. + unit_count: The desired number of units to wait for, can be > to 0. + If set as int, this value is expected for all apps but if more granularity is needed, + pass a dictionary such as: {"app1": 2, "app2": 1, ...}, if set to -1, the check + only happens at the application level. + """ + return ( + jubilant.all_active(status, *apps) + and jubilant.all_agents_idle(status, *apps) + and _check_apps_idle_period(status, *apps, idle_period=idle_period) + and verify_unit_count(status, *apps, unit_count=unit_count) + ) + + +def are_agents_idle( + status: jubilant.Status, + *apps: str, + idle_period: int = 0, + unit_count: int | dict[str, int] | None = None, +) -> bool: + """Check that agents of all given apps are idle (optional idle interval too). Optionally verify unit count as well. 
+ + Args: + status: represents the jubilant model's current status + apps: A list of applications whose statuses to test against + idle_period: Seconds to wait for the agents of each application unit to be idle. + unit_count: The desired number of units to wait for, should be > 0. + If set as int, this value is expected for all apps but if more granularity is needed, + pass a dictionary such as: {"app1": 2, "app2": 1, ...}, if set to -1, the check + only happens at the application level. + """ + return ( + jubilant.all_agents_idle(status, *apps) + and _check_apps_idle_period(status, *apps, idle_period=idle_period) + and verify_unit_count(status, *apps, unit_count=unit_count) + ) + + +def _check_apps_idle_period(status: jubilant.Status, *apps: str, idle_period: int) -> bool: + return all( + parse(unit.juju_status.since, ignoretz=True) + timedelta(seconds=idle_period) + < datetime.now() + for app in apps + for unit in status.get_units(app).values() + ) + + def verify_unit_count( - status: jubilant.Status, *apps: str, unit_count: int | dict[str, int] = None + status: jubilant.Status, *apps: str, unit_count: int | dict[str, int] | None = None ): """Verify the unit count for an application. diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 0ca8fa9..bab7cea 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -2,7 +2,6 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. 
import logging -from time import sleep import jubilant import pytest @@ -11,13 +10,13 @@ INTERNAL_USERS_PASSWORD_CONFIG, CharmUsers, ) -from statuses import ClusterStatuses +from statuses import CharmStatuses, ClusterStatuses from .helpers import ( APP_NAME, IMAGE_RESOURCE, INTERNAL_USERS_SECRET_LABEL, - CharmStatuses, + are_apps_active_and_agents_idle, create_valkey_client, does_status_match, fast_forward, @@ -30,8 +29,7 @@ logger = logging.getLogger(__name__) -# TODO scale up when scaling is implemented -NUM_UNITS = 1 +NUM_UNITS = 3 TEST_KEY = "test_key" TEST_VALUE = "test_value" @@ -41,10 +39,7 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: """Build the charm-under-test and deploy it with three units.""" juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=NUM_UNITS) juju.wait( - lambda status: does_status_match( - status, - expected_app_statuses={APP_NAME: [CharmStatuses.SCALING_NOT_IMPLEMENTED.value]}, - ), + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=600, ) @@ -82,7 +77,10 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: set_password(juju, new_password) # wait for config-changed hook to finish executing - juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) # perform read operation with the updated password result = await set_key( @@ -98,7 +96,10 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: juju.config(app=APP_NAME, reset=[INTERNAL_USERS_PASSWORD_CONFIG]) # wait for config-changed hook to finish executing - juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) # make sure we can still read data with the previously set password assert await get_key( @@ 
-129,7 +130,10 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None set_password(juju, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) # wait for config-changed hook to finish executing - juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) # perform read operation with the updated password result = await set_key( @@ -169,19 +173,10 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # deferred `config_changed` event will be retried before `update_status` with fast_forward(juju): juju.grant_secret(identifier=secret_name, app=APP_NAME) - sleep(20) # allow some time for the permission to propagate - - # juju.wait( - # lambda status: jubilant.all_active(status, APP_NAME), - # timeout=1200, - # ) - juju.wait( - lambda status: does_status_match( - status, - expected_app_statuses={APP_NAME: [CharmStatuses.SCALING_NOT_IMPLEMENTED.value]}, - ), - timeout=600, - ) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) # perform read operation with the updated password assert await get_key( From 577171ec6e14b823eecbce34223cefbaf4922274 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 30 Jan 2026 06:38:33 +0000 Subject: [PATCH 040/159] fix unit tests --- tests/unit/test_charm.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index ac00ba0..596183e 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -8,7 +8,7 @@ import yaml from ops import ActiveStatus, pebble, testing -from common.exceptions import ValkeyACLLoadError +from common.exceptions import ValkeyExecCommandError from src.charm import ValkeyCharm from src.literals import ( INTERNAL_USERS_PASSWORD_CONFIG, @@ -356,13 
+356,11 @@ def test_config_changed_leader_unit(): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("common.client.ValkeyClient.reload_acl") as mock_load_acl, - patch("common.client.ValkeyClient.set_runtime_config") as mock_set_runtime_config, + patch("workload_k8s.ValkeyK8sWorkload.exec_command") as mock_exec_command, ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() - mock_load_acl.assert_called_once() - mock_set_runtime_config.assert_called_once() + assert mock_exec_command.call_count == 2 # one for acl load, one for primaryauth set secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) @@ -392,13 +390,13 @@ def test_config_changed_leader_unit_primary(): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("common.client.ValkeyClient.reload_acl") as mock_load_acl, - patch("common.client.ValkeyClient.set_runtime_config") as mock_set_runtime_config, + patch("workload_k8s.ValkeyK8sWorkload.exec_command") as mock_exec_command, ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() - mock_load_acl.assert_called_once() - mock_set_runtime_config.assert_not_called() + mock_exec_command.assert_called_once_with( + ["acl", "load"], username=CharmUsers.VALKEY_ADMIN.value, password="" + ) secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) @@ -462,12 +460,14 @@ def test_change_password_secret_changed_non_leader_unit(): with ( patch( "events.base_events.BaseEvents._update_internal_users_password" ) as mock_update_password, patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("common.client.ValkeyClient.reload_acl") as mock_reload_acl, +
patch("workload_k8s.ValkeyK8sWorkload.exec_command") as mock_exec_command, ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_reload_acl.assert_called_once() + mock_exec_command.assert_called_once_with( + ["acl", "load"], username=CharmUsers.VALKEY_ADMIN.value, password="" + ) def test_change_password_secret_changed_non_leader_unit_not_successful(): @@ -495,16 +495,18 @@ def test_change_password_secret_changed_non_leader_unit_not_successful(): ) as mock_update_password, patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch( - "common.client.ValkeyClient.reload_acl", - side_effect=ValkeyACLLoadError("Reload failed"), - ) as mock_reload_acl, + "workload_k8s.ValkeyK8sWorkload.exec_command", + side_effect=ValkeyExecCommandError("Failed to execute command"), + ) as mock_exec_command, ctx(ctx.on.secret_changed(password_secret), state_in) as manager, ): charm: ValkeyCharm = manager.charm state_out = manager.run() mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_reload_acl.assert_called_once() + mock_exec_command.assert_called_once_with( + ["acl", "load"], username=CharmUsers.VALKEY_ADMIN.value, password="" + ) cluster_statuses = charm.state.statuses.get( scope="unit", component=charm.cluster_manager.name, From c74a27ac5fe98ff6aa5db90e866aa93799ad4fd7 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 30 Jan 2026 10:42:44 +0000 Subject: [PATCH 041/159] switch away from glide for integration tests --- poetry.lock | 28 ++++++++-- pyproject.toml | 2 +- tests/integration/k8s/helpers.py | 77 +++++++++++--------------- tests/integration/k8s/test_charm.py | 86 +++++++++++++---------------- 4 files changed, 95 insertions(+), 98 deletions(-) diff --git a/poetry.lock b/poetry.lock index b2873ab..6cfc71c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -66,7 +66,7 @@ version = "4.12.1" description = "High-level concurrency 
and networking framework on top of asyncio or Trio" optional = false python-versions = ">=3.9" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c"}, {file = "anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703"}, @@ -269,7 +269,7 @@ version = "3.11" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, @@ -468,7 +468,7 @@ version = "6.33.4" description = "" optional = false python-versions = ">=3.9" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "protobuf-6.33.4-cp310-abi3-win32.whl", hash = "sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d"}, {file = "protobuf-6.33.4-cp310-abi3-win_amd64.whl", hash = "sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc"}, @@ -892,7 +892,7 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" -groups = ["main", "integration"] +groups = ["main"] files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -925,13 +925,29 @@ files = [ [package.dependencies] typing-extensions = ">=4.12.0" +[[package]] +name = "valkey" +version = "6.1.1" +description = "Python client for Valkey forked from redis-py" +optional = false +python-versions = ">=3.9" +groups = ["integration"] 
+files = [ + {file = "valkey-6.1.1-py3-none-any.whl", hash = "sha256:e2691541c6e1503b53c714ad9a35551ac9b7c0bbac93865f063dbc859a46de92"}, + {file = "valkey-6.1.1.tar.gz", hash = "sha256:5880792990c6c2b5eb604a5ed5f98f300880b6dd92d123819b66ed54bb259731"}, +] + +[package.extras] +libvalkey = ["libvalkey (>=4.0.1)"] +ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==23.2.1)", "requests (>=2.31.0)"] + [[package]] name = "valkey-glide" version = "0.0.0" description = "Valkey GLIDE Async client. Supports Valkey and Redis OSS." optional = false python-versions = ">=3.9" -groups = ["main", "integration"] +groups = ["main"] files = [] develop = false @@ -987,4 +1003,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "f2a6e74276e2fa70da78db10de6e45c0c8047c900b1faf7ec3564f7d5da28c21" +content-hash = "abc38cad6a46313a8cc9e71a9c82b52e2e0b14e76247ccd11bec2cffdef18876" diff --git a/pyproject.toml b/pyproject.toml index 73fc9d9..ab475bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,8 +50,8 @@ allure-pytest = "*" allure-pytest-default-results = "^0.1.2" data-platform-helpers = ">=0.1.7" jubilant = "^1.6.0" -valkey-glide = { git = "https://github.com/skourta/valkey-glide", subdirectory = "python/glide-async", branch = "add-build-rs" } python-dateutil = "*" +valkey = "^6.1.1" [tool.coverage.run] branch = true diff --git a/tests/integration/k8s/helpers.py b/tests/integration/k8s/helpers.py index 9416045..64aeb4b 100644 --- a/tests/integration/k8s/helpers.py +++ b/tests/integration/k8s/helpers.py @@ -9,10 +9,10 @@ from typing import List import jubilant +import valkey import yaml from data_platform_helpers.advanced_statuses.models import StatusObject from dateutil.parser import parse -from glide import GlideClient, GlideClientConfiguration, NodeAddress, ServerCredentials from ops import SecretNotFoundError, StatusBase from literals import ( @@ -229,31 +229,23 @@ def get_secret_by_label(juju: jubilant.Juju, label: str) -> 
dict[str, str]: raise SecretNotFoundError(f"Secret with label {label} not found") -async def create_valkey_client( - hostnames: list[str], +def create_valkey_client( + hostname: str, username: str | None = CharmUsers.VALKEY_ADMIN.value, password: str | None = None, -): +) -> valkey.Valkey: """Create and return a Valkey client connected to the cluster. Args: - hostnames: List of hostnames of the Valkey cluster nodes. + hostname: The hostname of the Valkey cluster node. username: The username for authentication. password: The password for the internal user. Returns: A Valkey client instance connected to the cluster. """ - addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in hostnames] - - credentials = None - if username or password: - credentials = ServerCredentials(username=username, password=password) - client_config = GlideClientConfiguration( - addresses, - credentials=credentials, - ) - return await GlideClient.create(client_config) + client = valkey.Valkey(host=hostname, port=CLIENT_PORT, username=username, password=password) + return client def set_password( @@ -287,35 +279,6 @@ def set_password( juju.config(app=application, values={INTERNAL_USERS_PASSWORD_CONFIG: secret_id}) -async def set_key( - hostnames: list[str], username: str, password: str, key: str, value: str -) -> bytes | None: - """Write a key-value pair to the Valkey cluster. - - Args: - hostnames: List of hostnames of the Valkey cluster nodes. - key: The key to write. - value: The value to write. - username: The username for authentication. - password: The password for authentication. - """ - client = await create_valkey_client(hostnames=hostnames, username=username, password=password) - return await client.set(key, value) - - -async def get_key(hostnames: list[str], username: str, password: str, key: str) -> bytes | None: - """Read a value from the Valkey cluster by key. - - Args: - hostnames: List of hostnames of the Valkey cluster nodes. - key: The key to read. 
- username: The username for authentication. - password: The password for authentication. - """ - client = await create_valkey_client(hostnames=hostnames, username=username, password=password) - return await client.get(key) - - @contextlib.contextmanager def fast_forward(juju: jubilant.Juju): """Context manager that temporarily speeds up update-status hooks to fire every 10s.""" @@ -325,3 +288,29 @@ def fast_forward(juju: jubilant.Juju): yield finally: juju.model_config({"update-status-hook-interval": old}) + + +def get_primary_ip(juju: jubilant.Juju, app: str) -> str: + """Get the primary node of the Valkey cluster. + + Returns: + The IP address of the primary node. + """ + hostnames = get_cluster_hostnames(juju, app) + client = create_valkey_client(hostname=hostnames[0], password=get_password(juju)) + info = client.info("replication") + return hostnames[0] if info["role"] == "master" else info.get("master_host", "") + + +def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN) -> str: + """Retrieve the password for a given internal user from Juju secrets. + + Args: + juju: The Juju client instance. + user: The internal user whose password to retrieve. + + Returns: + The password for the specified internal user. 
+ """ + secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) + return secret.get(f"{user.value}-password", "") diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index bab7cea..9ca570b 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -5,6 +5,7 @@ import jubilant import pytest +from valkey import AuthenticationError from literals import ( INTERNAL_USERS_PASSWORD_CONFIG, @@ -15,15 +16,13 @@ from .helpers import ( APP_NAME, IMAGE_RESOURCE, - INTERNAL_USERS_SECRET_LABEL, are_apps_active_and_agents_idle, create_valkey_client, does_status_match, fast_forward, get_cluster_hostnames, - get_key, - get_secret_by_label, - set_key, + get_password, + get_primary_ip, set_password, ) @@ -47,31 +46,28 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: @pytest.mark.abort_on_fail async def test_authentication(juju: jubilant.Juju) -> None: """Assert that we can authenticate to valkey.""" + primary = get_primary_ip(juju, APP_NAME) hostnames = get_cluster_hostnames(juju, APP_NAME) # try without authentication - with pytest.raises(Exception) as exc_info: - unauth_client = await create_valkey_client( - hostnames=hostnames, username=None, password=None - ) + with pytest.raises(AuthenticationError): + unauth_client = create_valkey_client(hostname=primary, username=None, password=None) await unauth_client.ping() - assert "NOAUTH" in str(exc_info.value), "Unauthenticated access did not fail as expected" # Authenticate with internal user - secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) - password = secret.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + password = get_password(juju, user=CharmUsers.VALKEY_ADMIN) assert password is not None, "Admin password secret not found" - client = await create_valkey_client(hostnames=hostnames, password=password) - auth_result = await client.ping() - assert auth_result == b"PONG", "Authentication to Valkey cluster 
failed" + for hostname in hostnames: + client = create_valkey_client(hostname=hostname, password=password) + assert client.ping() is True, ( + f"Authentication to Valkey cluster failed for host {hostname}" + ) @pytest.mark.abort_on_fail async def test_update_admin_password(juju: jubilant.Juju) -> None: """Assert the admin password is updated when adding a user secret to the config.""" - hostnames = get_cluster_hostnames(juju, APP_NAME) - # create a user secret and grant it to the application new_password = "some-password" set_password(juju, new_password) @@ -81,16 +77,15 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), timeout=1200, ) + primary = get_primary_ip(juju, APP_NAME) + client = create_valkey_client( + hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, "Failed to authenticate with new admin password" - # perform read operation with the updated password - result = await set_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - value=TEST_VALUE, + assert client.set(TEST_KEY, TEST_VALUE) is True, ( + "Failed to write data after admin password update" ) - assert result == "OK", "Failed to write data after admin password update" # update the config again and remove the option `admin-password` juju.config(app=APP_NAME, reset=[INTERNAL_USERS_PASSWORD_CONFIG]) @@ -102,19 +97,14 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: ) # make sure we can still read data with the previously set password - assert await get_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - ) == bytes(TEST_VALUE, "utf-8") + assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( + "Failed to read data after admin password update" + ) @pytest.mark.abort_on_fail async def 
test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None: """Assert the admin password is updated when adding a user secret to the config.""" - hostnames = get_cluster_hostnames(juju, APP_NAME) - # create a user secret and grant it to the application new_password = "some-password" set_password(juju, username="wrong-username", password=new_password) @@ -136,21 +126,19 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None ) # perform read operation with the updated password - result = await set_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - value=TEST_VALUE, + primary = get_primary_ip(juju, APP_NAME) + client = create_valkey_client( + hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, "Failed to authenticate with new admin password" + assert client.set(TEST_KEY, TEST_VALUE) is True, ( + "Failed to write data after admin password update" ) - assert result == "OK", "Failed to write data after admin password update" @pytest.mark.abort_on_fail async def test_user_secret_permissions(juju: jubilant.Juju) -> None: """If a user secret is not granted, ensure we can process updated permissions.""" - hostnames = get_cluster_hostnames(juju, APP_NAME) - logger.info("Creating new user secret") secret_name = "my_secret" new_password = "even-newer-password" @@ -179,12 +167,16 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: ) # perform read operation with the updated password - assert await get_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - ) == bytes(TEST_VALUE, "utf-8"), "Failed to read data after secret permissions were updated" + primary = get_primary_ip(juju, APP_NAME) + client = create_valkey_client( + hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, ( 
+ "Failed to authenticate with new admin password after secret access" + ) + assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( + "Failed to read data after secret permissions were updated" + ) logger.info("Password update successful after secret was granted") From 2f00f1f4775e60c7225fbe588cb137f82e7330f0 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 30 Jan 2026 11:03:26 +0000 Subject: [PATCH 042/159] add replica password change and check on all units --- tests/integration/k8s/test_charm.py | 60 ++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 9ca570b..5a3d552 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -96,10 +96,16 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: timeout=1200, ) - # make sure we can still read data with the previously set password - assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( - "Failed to read data after admin password update" - ) + for hostname in get_cluster_hostnames(juju, APP_NAME): + client = create_valkey_client( + hostname=hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, ( + f"Failed to authenticate with admin password after removing user secret on host {hostname}" + ) + assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( + f"Failed to read data after admin password update on host {hostname}" + ) @pytest.mark.abort_on_fail @@ -167,18 +173,52 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: ) # perform read operation with the updated password + hostnames = get_cluster_hostnames(juju, APP_NAME) primary = get_primary_ip(juju, APP_NAME) client = create_valkey_client( hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password ) - assert client.ping() is True, ( - "Failed to authenticate with new admin 
password after secret access" - ) - assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( - "Failed to read data after secret permissions were updated" + assert client.ping() is True, "Failed to authenticate with new admin password" + assert client.set(TEST_KEY, TEST_VALUE) is True, ( + "Failed to write data after admin password update" ) + for hostname in hostnames: + client = create_valkey_client( + hostname=hostname, + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + ) + assert client.ping() is True, ( + f"Failed to authenticate with new admin password on host {hostname}" + ) + assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( + f"Failed to read data after admin password update on host {hostname}" + ) logger.info("Password update successful after secret was granted") + # change replication password + replica_password = "replica-password" + juju.update_secret( + identifier=secret_id, + content={ + CharmUsers.VALKEY_ADMIN.value: new_password, + CharmUsers.VALKEY_REPLICA.value: replica_password, + }, + ) + + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) -# TODO Once scaling is implemented, add tests to check on password update in non-leader units + # perform pings with the updated replica password + for hostname in hostnames: + client = create_valkey_client( + hostname=hostname, + username=CharmUsers.VALKEY_REPLICA.value, + password=replica_password, + ) + assert client.ping() is True, ( + f"Failed to authenticate with new replica password on host {hostname}" + ) From 827e58d728af0ebc88214f466fb9afe88c9ab5da Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 30 Jan 2026 12:26:02 +0000 Subject: [PATCH 043/159] add continuouswrites file --- poetry.lock | 18 +++- pyproject.toml | 1 + tests/integration/k8s/ha/continuous_writes.py | 87 +++++++++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 
tests/integration/k8s/ha/continuous_writes.py diff --git a/poetry.lock b/poetry.lock index 6cfc71c..b460602 100644 --- a/poetry.lock +++ b/poetry.lock @@ -898,6 +898,22 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "tenacity" +version = "9.1.2" +description = "Retry code until it succeeds" +optional = false +python-versions = ">=3.9" +groups = ["integration"] +files = [ + {file = "tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138"}, + {file = "tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb"}, +] + +[package.extras] +doc = ["reno", "sphinx"] +test = ["pytest", "tornado (>=4.5)", "typeguard"] + [[package]] name = "typing-extensions" version = "4.15.0" @@ -1003,4 +1019,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "abc38cad6a46313a8cc9e71a9c82b52e2e0b14e76247ccd11bec2cffdef18876" +content-hash = "2d6ad1ccf6e7505c4b9136e91d9d970046f9ba6814866fff86c52b256a837b25" diff --git a/pyproject.toml b/pyproject.toml index ab475bf..4ae7e26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ data-platform-helpers = ">=0.1.7" jubilant = "^1.6.0" python-dateutil = "*" valkey = "^6.1.1" +tenacity = "^9.1.2" [tool.coverage.run] branch = true diff --git a/tests/integration/k8s/ha/continuous_writes.py b/tests/integration/k8s/ha/continuous_writes.py new file mode 100644 index 0000000..7bd723a --- /dev/null +++ b/tests/integration/k8s/ha/continuous_writes.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import logging +import pathlib +import signal +import sys +import time + +import valkey +from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed + +SENTINEL_PORT = 26379 + +logger = logging.getLogger(__name__) + +WRITES_LAST_WRITTEN_VAL_PATH = "last_written_value" +LOG_FILE_PATH = "log_file" +continue_running = True + + +def continuous_writes( + endpoints: str, + valkey_user: str, + valkey_password: str, + sentinel_user: str, + sentinel_password: str, +) -> None: + key = "cw_key" + count = 0 + + client = valkey.Sentinel( + [(host, SENTINEL_PORT) for host in endpoints.split(",")], + username=valkey_user, + password=valkey_password, + sentinel_kwargs={"password": sentinel_password, "username": sentinel_user}, + ) + master = client.master_for("primary") + + # clean up from previous runs + pathlib.Path(WRITES_LAST_WRITTEN_VAL_PATH).unlink(missing_ok=True) + try: + master.delete(key) + except Exception: + pass + + while continue_running: + count += 1 + + try: + for attempt in Retrying(stop=stop_after_attempt(2), wait=wait_fixed(1)): + with attempt: + result = master.set(key, str(count)) + if not result: + raise ValueError + with open(LOG_FILE_PATH, "a") as log_file: + log_file.write(f"{count}\n") + except RetryError: + pass + + time.sleep(1) + else: + # write last expected written value on disk when terminating + pathlib.Path(WRITES_LAST_WRITTEN_VAL_PATH).write_text(str(count)) + + +def handle_stop_signal(signum, frame) -> None: + global continue_running + continue_running = False + + +def main(): + endpoints = sys.argv[1] + valkey_user = sys.argv[2] + valkey_password = sys.argv[3] + sentinel_user = sys.argv[4] + sentinel_password = sys.argv[5] + + # handle the stop signal for a graceful stop of the writes process + signal.signal(signal.SIGTERM, handle_stop_signal) + + continuous_writes(endpoints, valkey_user, valkey_password, sentinel_user, sentinel_password) + + +if __name__ == "__main__": + main() From 72e4b4fde5a42932af60ca5eb608d904a4126066 Mon 
Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Feb 2026 08:38:27 +0000 Subject: [PATCH 044/159] add first integration test for scale up --- tests/integration/k8s/ha/__init__.py | 0 tests/integration/k8s/ha/helpers.py | 93 ++++++++++++++++++++ tests/integration/k8s/ha/test_scaling.py | 105 +++++++++++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 tests/integration/k8s/ha/__init__.py create mode 100644 tests/integration/k8s/ha/helpers.py create mode 100644 tests/integration/k8s/ha/test_scaling.py diff --git a/tests/integration/k8s/ha/__init__.py b/tests/integration/k8s/ha/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/k8s/ha/helpers.py b/tests/integration/k8s/ha/helpers.py new file mode 100644 index 0000000..4542e1d --- /dev/null +++ b/tests/integration/k8s/ha/helpers.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +import logging +import subprocess +import time + +import valkey +from tenacity import Retrying, stop_after_attempt, wait_fixed + +from literals import CLIENT_PORT, SENTINEL_PORT + +logger = logging.getLogger(__name__) + +WRITES_LAST_WRITTEN_VAL_PATH = "last_written_value" + +KEY = "cw_key" + + +def start_continuous_writes( + endpoints: str, + valkey_user: str, + valkey_password: str, + sentinel_user: str, + sentinel_password: str, +) -> None: + """Create a subprocess instance of `continuous writes` and start writing data to etcd.""" + subprocess.Popen( + [ + "python3", + "tests/integration/k8s/ha/continuous_writes.py", + endpoints, + valkey_user, + valkey_password, + sentinel_user, + sentinel_password, + ] + ) + + +def stop_continuous_writes() -> None: + """Shut down the subprocess instance of the `continuous writes`.""" + proc = subprocess.Popen(["pkill", "-15", "-f", "continuous_writes.py"]) + proc.communicate() + + +def assert_continuous_writes_increasing( + endpoints: str, + valkey_user: str, + valkey_password: 
str, + sentinel_user: str, + sentinel_password: str, +) -> None: + """Assert that the continuous writes are increasing.""" + client = valkey.Sentinel( + [(host, SENTINEL_PORT) for host in endpoints.split(",")], + username=valkey_user, + password=valkey_password, + sentinel_kwargs={"password": sentinel_password, "username": sentinel_user}, + ) + master = client.master_for("primary") + writes_count = int(master.get(KEY)) + time.sleep(10) + more_writes = int(master.get(KEY)) + assert more_writes > writes_count, "Writes not continuing to DB" + logger.info("Continuous writes are increasing.") + + +def assert_continuous_writes_consistent( + endpoints: str, + valkey_user: str, + valkey_password: str, +) -> None: + """Assert that the continuous writes are consistent.""" + last_written_value = None + for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(5)): + with attempt: + with open(WRITES_LAST_WRITTEN_VAL_PATH, "r") as f: + last_written_value = int(f.read().rstrip()) + + for endpoint in endpoints.split(","): + client = valkey.Valkey( + host=endpoint, + port=CLIENT_PORT, + username=valkey_user, + password=valkey_password, + ) + last_etcd_value = int(client.get(KEY).decode("utf-8")) + assert last_written_value == last_etcd_value, ( + f"endpoint: {endpoint}, expected value: {last_written_value}, current value: {last_etcd_value}" + ) + logger.info(f"Continuous writes are consistent on {endpoint}.") diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py new file mode 100644 index 0000000..23850d0 --- /dev/null +++ b/tests/integration/k8s/ha/test_scaling.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. 
+import logging + +import jubilant +import pytest +import valkey + +from literals import CharmUsers + +from ..helpers import ( + APP_NAME, + IMAGE_RESOURCE, + are_apps_active_and_agents_idle, + get_cluster_hostnames, + get_password, +) +from .helpers import ( + assert_continuous_writes_consistent, + assert_continuous_writes_increasing, + start_continuous_writes, + stop_continuous_writes, +) + +logger = logging.getLogger(__name__) + +NUM_UNITS = 3 +TEST_KEY = "test_key" +TEST_VALUE = "test_value" + + +@pytest.mark.abort_on_fail +def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: + """Build the charm-under-test and deploy it with three units.""" + juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=1) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), + timeout=600, + ) + + assert len(juju.status().apps[APP_NAME].units) == 1, ( + "Unexpected number of units after initial deploy" + ) + + +@pytest.mark.abort_on_fail +async def test_scale_up(juju: jubilant.Juju) -> None: + """Make sure new units are added to the etcd cluster without downtime.""" + init_units_count = len(juju.status().apps[APP_NAME].units) + init_endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) + # start writing data to the cluster + start_continuous_writes( + endpoints=init_endpoints, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_user=CharmUsers.SENTINEL_ADMIN.value, + sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + ) + + # scale up + juju.add_unit(APP_NAME, num_units=2) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, idle_period=10, unit_count=init_units_count + 2 + ), + timeout=1200, + ) + num_units = len(juju.status().apps[APP_NAME].units) + assert num_units == init_units_count + 2, ( + f"Expected {init_units_count + 2} units, got {num_units}." 
+ ) + + # check if all units have been added to the cluster + endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) + + sentinel_client = valkey.Sentinel( + [(host, 26379) for host in endpoints.split(",")], + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_kwargs={ + "password": get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + "username": CharmUsers.SENTINEL_ADMIN.value, + }, + ) + master = sentinel_client.master_for("primary") + info = master.info("replication") + connected_slaves = info.get("connected_slaves", 0) + assert connected_slaves == num_units - 1, ( + f"Expected {num_units - 1} connected slaves, got {connected_slaves}." + ) + + assert_continuous_writes_increasing( + endpoints=endpoints, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_user=CharmUsers.SENTINEL_ADMIN.value, + sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + ) + stop_continuous_writes() + assert_continuous_writes_consistent( + endpoints=endpoints, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) From 38923b1b9d6160e29f33542da5c461d60f3525ce Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Feb 2026 08:43:19 +0000 Subject: [PATCH 045/159] add scaling spread file --- tests/spread/test_charm.py/task.yaml | 2 +- tests/spread/test_scaling.py/task.yaml | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 tests/spread/test_scaling.py/task.yaml diff --git a/tests/spread/test_charm.py/task.yaml b/tests/spread/test_charm.py/task.yaml index e4b01a9..81aee01 100644 --- a/tests/spread/test_charm.py/task.yaml +++ b/tests/spread/test_charm.py/task.yaml @@ -6,4 +6,4 @@ systems: execute: | tox run -e integration -- "tests/integration/k8s/$TEST_MODULE" --alluredir="$SPREAD_TASK/allure-results" artifacts: - - allure-results \ No newline 
at end of file + - allure-results diff --git a/tests/spread/test_scaling.py/task.yaml b/tests/spread/test_scaling.py/task.yaml new file mode 100644 index 0000000..a3c57af --- /dev/null +++ b/tests/spread/test_scaling.py/task.yaml @@ -0,0 +1,9 @@ +summary: test_scaling.py +environment: + TEST_MODULE: ha/test_scaling.py +systems: + - self-hosted-linux-amd64-noble-medium +execute: | + tox run -e integration -- "tests/integration/k8s/$TEST_MODULE" --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From c436a4a3ea757aace20d1467e2769ef0df783313 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Feb 2026 09:23:50 +0000 Subject: [PATCH 046/159] mock get_private_ip --- tests/unit/test_charm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 518e158..c9e57b8 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -390,6 +390,7 @@ def test_config_changed_leader_unit_primary(): patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch("workload_k8s.ValkeyK8sWorkload.exec_command") as mock_exec_command, + patch("core.base_workload.WorkloadBase.get_private_ip", return_value="127.0.1.1"), ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() From 6ea5fa29ed8e9d20b513e9d95f17ab460fbac253 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Feb 2026 09:58:01 +0000 Subject: [PATCH 047/159] remove markers and etcd references --- src/literals.py | 1 - tests/integration/k8s/ha/helpers.py | 8 ++++---- tests/integration/k8s/ha/test_scaling.py | 5 +---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/src/literals.py b/src/literals.py index 62808f8..9a30a40 100644 --- a/src/literals.py +++ b/src/literals.py @@ -27,7 +27,6 @@ QUORUM_NUMBER = 2 INTERNAL_USERS_PASSWORD_CONFIG = "system-users" INTERNAL_USERS_SECRET_LABEL_SUFFIX = "internal_users_secret" 
-CLIENT_PORT = 6379 # As per the valkey users spec diff --git a/tests/integration/k8s/ha/helpers.py b/tests/integration/k8s/ha/helpers.py index 4542e1d..6cc84d9 100644 --- a/tests/integration/k8s/ha/helpers.py +++ b/tests/integration/k8s/ha/helpers.py @@ -25,7 +25,7 @@ def start_continuous_writes( sentinel_user: str, sentinel_password: str, ) -> None: - """Create a subprocess instance of `continuous writes` and start writing data to etcd.""" + """Create a subprocess instance of `continuous writes` and start writing data to valkey.""" subprocess.Popen( [ "python3", @@ -86,8 +86,8 @@ def assert_continuous_writes_consistent( username=valkey_user, password=valkey_password, ) - last_etcd_value = int(client.get(KEY).decode("utf-8")) - assert last_written_value == last_etcd_value, ( - f"endpoint: {endpoint}, expected value: {last_written_value}, current value: {last_etcd_value}" + last_value = int(client.get(KEY).decode("utf-8")) + assert last_written_value == last_value, ( + f"endpoint: {endpoint}, expected value: {last_written_value}, current value: {last_value}" ) logger.info(f"Continuous writes are consistent on {endpoint}.") diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 23850d0..1f7d8b2 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -4,7 +4,6 @@ import logging import jubilant -import pytest import valkey from literals import CharmUsers @@ -30,7 +29,6 @@ TEST_VALUE = "test_value" -@pytest.mark.abort_on_fail def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: """Build the charm-under-test and deploy it with three units.""" juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=1) @@ -44,9 +42,8 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: ) -@pytest.mark.abort_on_fail async def test_scale_up(juju: jubilant.Juju) -> None: - """Make sure new units are added to the etcd cluster without downtime.""" + """Make sure new 
units are added to the valkey downtime.""" init_units_count = len(juju.status().apps[APP_NAME].units) init_endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) # start writing data to the cluster From 21e1837ee7d102ff1332bf19255040bf06b5030f Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 5 Feb 2026 06:42:29 +0000 Subject: [PATCH 048/159] fix unit tests and add some workload functions --- src/core/base_workload.py | 32 ++++++++++++++++++++++++++++++-- src/managers/cluster.py | 31 ++++++++++++++++++++++--------- src/workload_vm.py | 15 +++++++++++++-- tests/unit/test_charm.py | 30 +++++++++++++----------------- 4 files changed, 78 insertions(+), 30 deletions(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 0348b80..92bde4a 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -19,6 +19,15 @@ class WorkloadBase(ABC): """Base interface for common workload operations.""" + def __init__(self) -> None: + """Initialize the WorkloadBase.""" + self.root: pathops.PathProtocol + self.config_file: pathops.PathProtocol + self.sentinel_config: pathops.PathProtocol + self.acl_file: pathops.PathProtocol + self.sentinel_acl_file: pathops.PathProtocol + self.working_dir: pathops.PathProtocol + @property @abstractmethod def can_connect(self) -> bool: @@ -62,7 +71,7 @@ def get_private_ip(self) -> str: def write_file( self, content: str, - path: pathops.ContainerPath, + path: pathops.PathProtocol, mode: int | None = None, user: str | None = None, group: str | None = None, @@ -74,7 +83,7 @@ def write_file( Args: content (str): The content to be written. - path (str): The file path where the content should be written. + path (pathops.PathProtocol): The file path where the content should be written. mode (int, optional): The file mode (permissions). Defaults to None. user (str, optional): The user name. Defaults to None. group (str, optional): The group name. Defaults to None. 
@@ -111,3 +120,22 @@ def write_config_file(self, config: dict[str, str]) -> None: ValueError, ) as e: raise ValkeyWorkloadCommandError(e) + + def mkdir( + self, + path: pathops.PathProtocol, + mode: int = 0o755, + user: str | None = None, + group: str | None = None, + exist_ok: bool = True, + ) -> None: + """Create a directory on disk. + + Args: + path (pathops.PathProtocol): The directory path to be created. + mode (int, optional): The directory mode (permissions). Defaults to None. + user (str, optional): The user name. Defaults to None. + group (str, optional): The group name. Defaults to None. + exist_ok (bool, optional): Whether to ignore if the directory already exists. Defaults to True. + """ + path.mkdir(mode=mode, user=user, group=group, exist_ok=exist_ok) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index d37f836..837c263 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -10,7 +10,11 @@ from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope -from common.exceptions import ValkeyACLLoadError, ValkeyConfigSetError, ValkeyExecCommandError +from common.exceptions import ( + ValkeyACLLoadError, + ValkeyConfigSetError, + ValkeyWorkloadCommandError, +) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import CharmUsers @@ -36,10 +40,8 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" try: - self.workload.exec_command( - ["acl", "load"], username=self.admin_user, password=self.admin_password - ) - except ValkeyExecCommandError: + self._exec_cli_command(["acl", "load"]) + except ValkeyWorkloadCommandError: raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") def update_primary_auth(self) -> None: @@ -48,7 +50,7 @@ def update_primary_auth(self) -> None: 
logger.info("Current unit is primary; no need to update primaryauth") return try: - self.workload.exec_command( + self._exec_cli_command( [ "config", "set", @@ -57,13 +59,24 @@ def update_primary_auth(self) -> None: CharmUsers.VALKEY_REPLICA.value, "" ), ], - username=self.admin_user, - password=self.admin_password, ) logger.info("Updated primaryauth runtime configuration on Valkey server") - except ValkeyExecCommandError: + except ValkeyWorkloadCommandError: raise ValkeyConfigSetError("Could not set primaryauth on Valkey server.") + def _exec_cli_command(self, command: list[str]) -> str: + """Execute a Valkey CLI command on the server.""" + cli_command = [ + "valkey-cli", + "--user", + self.admin_user, + "--password", + self.admin_password, + ] + command + output = self.workload.exec(cli_command) + logger.debug("Executed command: %s, got output: %s", " ".join(command), output) + return output + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( diff --git a/src/workload_vm.py b/src/workload_vm.py index 6990851..9312b29 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -65,7 +65,7 @@ def install(self, revision: str | None = None, retry_and_raise: bool = True) -> True if successfully installed, False if errors occur and `retry_and_raise` is False. 
""" if not revision: - revision = SNAP_REVISION + revision = str(SNAP_REVISION) try: # as long as 26.04 is not stable, we need to install the core26 snap from edge @@ -99,6 +99,17 @@ def exec(self, command: List[str]) -> str: ).stdout.strip() logger.debug(output) return output - except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: + except subprocess.CalledProcessError as e: logger.error("Command failed with %s, %s", e.returncode, e.stderr) raise ValkeyWorkloadCommandError(e) + except subprocess.TimeoutExpired as e: + logger.error("Command '%s' timed out: %s", command, str(e.stderr)) + raise ValkeyWorkloadCommandError(e) + + @override + def alive(self) -> bool: + """Check if the Valkey service is running.""" + try: + return bool(self.valkey.services[SNAP_SERVICE]["active"]) + except KeyError: + return False diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 286ef6f..46b6e9a 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -9,6 +9,7 @@ import yaml from ops import ActiveStatus, pebble, testing +from common.exceptions import ValkeyWorkloadCommandError from src.charm import ValkeyCharm from src.literals import ( INTERNAL_USERS_PASSWORD_CONFIG, @@ -367,11 +368,11 @@ def test_config_changed_leader_unit(cloud_spec): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("workload_k8s.ValkeyK8sWorkload.exec_command") as mocl_exec_command, + patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() - assert mocl_exec_command.call_count == 2 # one for acl load, one for primaryauth set + assert mock_exec_command.call_count == 2 # one for acl load, one for primaryauth set secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) @@ -381,8 +382,8 @@ def 
test_config_changed_leader_unit(cloud_spec): ) -def test_config_changed_leader_unit_primary(): - ctx = testing.Context(ValkeyCharm) +def test_config_changed_leader_unit_primary(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, local_app_data={"primary_ip": "127.0.1.1"} ) @@ -398,18 +399,17 @@ def test_config_changed_leader_unit_primary(): containers={container}, secrets={password_secret}, config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("workload_k8s.ValkeyK8sWorkload.exec_command") as mock_exec_command, + patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, patch("core.base_workload.WorkloadBase.get_private_ip", return_value="127.0.1.1"), ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() - mock_exec_command.assert_called_once_with( - ["acl", "load"], username=CharmUsers.VALKEY_ADMIN.value, password="" - ) + mock_exec_command.assert_called_once_with(["acl", "load"]) secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) @@ -475,14 +475,12 @@ def test_change_password_secret_changed_non_leader_unit(cloud_spec): "events.base_events.BaseEvents._update_internal_users_password" ) as mock_update_password, patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("workload_k8s.ValkeyK8sWorkload.exec_command") as mock_exec_command, + patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_exec_command.assert_called_once_with( - ["acl", 
"load"], username=CharmUsers.VALKEY_ADMIN.value, password="" - ) + mock_exec_command.assert_called_once_with(["acl", "load"]) def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spec): @@ -511,8 +509,8 @@ def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spe ) as mock_update_password, patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch( - "workload_k8s.ValkeyK8sWorkload.exec_command", - side_effect=ValkeyExecCommandError("Failed to execute command"), + "managers.cluster.ClusterManager._exec_cli_command", + side_effect=ValkeyWorkloadCommandError("Failed to execute command"), ) as mock_exec_command, ctx(ctx.on.secret_changed(password_secret), state_in) as manager, ): @@ -520,9 +518,7 @@ def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spe state_out = manager.run() mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_exec_command.assert_called_once_with( - ["acl", "load"], username=CharmUsers.VALKEY_ADMIN.value, password="" - ) + mock_exec_command.assert_called_once_with(["acl", "load"]) cluster_statuses = charm.state.statuses.get( scope="unit", component=charm.cluster_manager.name, From c48daeb6905da54cb810a348d9a22b4d99dee326 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 5 Feb 2026 06:44:01 +0000 Subject: [PATCH 049/159] add mode user and group to write file --- src/core/base_workload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 92bde4a..8de6448 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -89,7 +89,7 @@ def write_file( group (str, optional): The group name. Defaults to None. 
""" try: - path.write_text(content) + path.write_text(content, mode=mode, user=user, group=group) except ( FileNotFoundError, LookupError, From 25d25b8f64c0c3b67a6452079750c644839659f2 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 6 Feb 2026 04:33:17 +0000 Subject: [PATCH 050/159] fix integration tests --- src/managers/cluster.py | 2 +- src/managers/config.py | 8 +------- tests/integration/k8s/ha/test_scaling.py | 6 +++--- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 837c263..7faa038 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -70,7 +70,7 @@ def _exec_cli_command(self, command: list[str]) -> str: "valkey-cli", "--user", self.admin_user, - "--password", + "--pass", self.admin_password, ] + command output = self.workload.exec(cli_command) diff --git a/src/managers/config.py b/src/managers/config.py index c4b9453..6067712 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -17,7 +17,6 @@ from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import ( - ACL_FILE, CHARM_USER, CHARM_USERS_ROLE_MAP, CLIENT_PORT, @@ -83,11 +82,6 @@ def config_properties(self) -> dict[str, str]: config_properties["bind"] = self.state.bind_address else: config_properties["bind"] = "0.0.0.0 -::1" - # Use the ACL file - config_properties["aclfile"] = ACL_FILE - - # # logfile location - # config_properties["logfile"] = VALKEY_LOG_FILE logger.debug( "primary: %s, hostname: %s", @@ -191,7 +185,7 @@ def set_sentinel_acl_file(self, passwords: dict[str, str] | None = None) -> None for user in CharmUsers: # only process VALKEY users # Sentinel users should be in the sentinel acl file - if "VALKEY_" in str(user): + if "VALKEY_" in user.name: continue acl_content += self._get_user_acl_line(user, passwords=passwords) self.workload.write_file(acl_content, self.workload.sentinel_acl_file) diff --git 
a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 1f7d8b2..d951a60 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -7,14 +7,14 @@ import valkey from literals import CharmUsers - -from ..helpers import ( +from tests.integration.helpers import ( APP_NAME, IMAGE_RESOURCE, are_apps_active_and_agents_idle, get_cluster_hostnames, get_password, ) + from .helpers import ( assert_continuous_writes_consistent, assert_continuous_writes_increasing, @@ -31,7 +31,7 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: """Build the charm-under-test and deploy it with three units.""" - juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=1) + juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=1, trust=True) juju.wait( lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=600, From e8db36c022846b47628deb40498412bf6c1cbbf8 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 05:29:09 +0000 Subject: [PATCH 051/159] add one-by-one scaling up --- src/core/base_workload.py | 2 +- src/core/models.py | 1 + src/events/base_events.py | 131 +++++++++++++++++++++++++++++++++++--- src/managers/cluster.py | 102 ++++++++++++++++++++++++++--- src/statuses.py | 12 ++++ src/workload_k8s.py | 6 +- src/workload_vm.py | 8 +-- 7 files changed, 234 insertions(+), 28 deletions(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 8de6448..096cc85 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -40,7 +40,7 @@ def start(self) -> None: pass @abstractmethod - def exec(self, command: list[str]) -> str: + def exec(self, command: list[str]) -> tuple[str, str | None]: """Run a command on the workload substrate.""" pass diff --git a/src/core/models.py b/src/core/models.py index d911534..450b5ba 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -37,6 +37,7 @@ class PeerAppModel(PeerModel): 
charmed_sentinel_peers_password: InternalUsersSecret = Field(default="") charmed_sentinel_operator_password: InternalUsersSecret = Field(default="") primary_ip: str = Field(default="") + starting_member: str = Field(default="") class PeerUnitModel(PeerModel): diff --git a/src/events/base_events.py b/src/events/base_events.py index 5b19c53..3b7e69c 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -28,9 +28,20 @@ logger = logging.getLogger(__name__) +class UnitFullyStarted(ops.EventBase): + """Event that signals that the unit's has fully started. + + This event will be deferred until: + The Sentinel service is running and was discovered by other units. + The Valkey service is running and the replica has finished syncing data. + """ + + class BaseEvents(ops.Object): """Handle all base events.""" + unit_fully_started = ops.EventSource(UnitFullyStarted) + def __init__(self, charm: "ValkeyCharm"): super().__init__(charm, key="base_events") self.charm = charm @@ -40,10 +51,14 @@ def __init__(self, charm: "ValkeyCharm"): self.framework.observe( self.charm.on[PEER_RELATION].relation_joined, self._on_peer_relation_joined ) + self.framework.observe( + self.charm.on[PEER_RELATION].relation_changed, self._on_peer_relation_changed + ) self.framework.observe(self.charm.on.update_status, self._on_update_status) self.framework.observe(self.charm.on.leader_elected, self._on_leader_elected) self.framework.observe(self.charm.on.config_changed, self._on_config_changed) self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed) + self.framework.observe(self.unit_fully_started, self._on_unit_fully_started) def _on_install(self, event: ops.InstallEvent) -> None: """Handle install event.""" @@ -63,13 +78,20 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return - if not self.charm.unit.is_leader() and ( - not self.charm.state.cluster.internal_users_credentials - or not self.charm.state.cluster.model.primary_ip - ): - 
logger.info("Deferring leader write primary and internal user credentials") - event.defer() - return + if not self.charm.unit.is_leader(): + if ( + not self.charm.state.cluster.internal_users_credentials + or not self.charm.state.cluster.model.primary_ip + ): + logger.info( + "Non-leader unit waiting for leader to set primary and internal user credentials" + ) + event.defer() + return + if self.charm.state.cluster.model.starting_member != self.charm.unit.name: + logger.info("Non-leader unit waiting for leader to choose it as starting member") + event.defer() + return try: self.charm.config_manager.update_local_valkey_admin() @@ -108,13 +130,104 @@ def _on_start(self, event: ops.StartEvent) -> None: component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) + + self.charm.state.statuses.delete( + ValkeyServiceStatuses.SERVICE_NOT_RUNNING.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) + + self.unit_fully_started.emit() + + def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: + """Handle the unit-fully-started event.""" + self.charm.status.set_running_status( + ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, + scope="unit", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + self.charm.status.set_running_status( + ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value, + scope="unit", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + + if not self.charm.cluster_manager.is_sentinel_discovered(): + logger.info("Sentinel service not yet discovered by other units. Deferring event.") + event.defer() + return + + self.charm.state.statuses.delete( + ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) + + if not self.charm.cluster_manager.is_replica_synced(): + logger.info("Replica not yet synced. 
Deferring event.") + event.defer() + return + + self.charm.state.statuses.delete( + ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) + logger.info("Services started") self.charm.state.unit_server.update({"started": True}) def _on_peer_relation_joined(self, event: ops.RelationJoinedEvent) -> None: """Handle event received by all units when a new unit joins the cluster relation.""" - if self.charm.unit.is_leader(): - logger.info("Unit %s has joined the relation", event.unit.name) + if not self.charm.unit.is_leader() or not event.unit: + return + + logger.debug("Peer relation joined by %s", event.unit.name) + + if not self.charm.state.unit_server.is_started: + logger.info("Primary member has not started yet. Deferring event.") + event.defer() + return + + if self.charm.state.cluster.model.starting_member: + logger.debug( + "%s is already starting. Deferring relation joined event for %s", + self.charm.state.cluster.model.starting_member, + event.unit.name, + ) + event.defer() + return + self.charm.state.cluster.update({"starting_member": event.unit.name}) + + def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: + """Handle event received by all units when a unit's relation data changes.""" + logger.debug( + "Starting member is currently %s", self.charm.state.cluster.model.starting_member + ) + starting_unit = next( + ( + unit + for unit in self.charm.state.servers + if unit.unit_name == self.charm.state.cluster.model.starting_member + ), + None, + ) + logger.debug( + "Starting unit has started: %s", + starting_unit.is_started if starting_unit else "No starting unit", + ) + if ( + self.charm.state.cluster.model.starting_member + and starting_unit + and starting_unit.is_started + ): + logger.debug( + "Starting member %s has started. 
Clearing starting member field.",
+                self.charm.state.cluster.model.starting_member,
+            )
+            self.charm.state.cluster.update({"starting_member": ""})
 
     def _on_update_status(self, event: ops.UpdateStatusEvent) -> None:
         """Handle the update-status event.""" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 7faa038..682d891 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -5,6 +5,7 @@ """Manager for all cluster related tasks.""" import logging +from typing import Literal from data_platform_helpers.advanced_statuses.models import StatusObject from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol @@ -17,7 +18,7 @@ ) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import CharmUsers +from literals import CLIENT_PORT, PRIMARY_NAME, SENTINEL_PORT, CharmUsers from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -64,18 +65,102 @@ def update_primary_auth(self) -> None: except ValkeyWorkloadCommandError: raise ValkeyConfigSetError("Could not set primaryauth on Valkey server.") - def _exec_cli_command(self, command: list[str]) -> str: - """Execute a Valkey CLI command on the server.""" + def is_sentinel_discovered(self) -> bool: + """Check if the sentinel of the local unit was discovered by the other sentinels in the cluster.""" + # list of active sentinels: units with started flag true + active_sentinels = [ + unit.model.private_ip + for unit in self.state.servers + if unit.model + and unit.model.started + and unit.model.private_ip != self.state.unit_server.model.private_ip + ] + + for sentinel_ip in active_sentinels: + try: + output, _ = self._exec_cli_command( + command=["sentinel", "sentinels", PRIMARY_NAME], + hostname=sentinel_ip, + connect_to="sentinel", + ) + if self.state.unit_server.model.private_ip not in output: + logger.info(f"Sentinel at {sentinel_ip} has not yet discovered this sentinel") + return False + except 
ValkeyWorkloadCommandError: + logger.warning(f"Could not query sentinel at {sentinel_ip} for primary discovery.") + continue + return True + + def is_replica_synced(self) -> bool: + """Check if the replica is synced with the primary.""" + if self.state.unit_server.model.private_ip == self.state.cluster.model.primary_ip: + logger.info("Current unit is primary; no need to check replica sync") + return True + try: + output = ( + self._exec_cli_command( + command=["role"], + )[0] + .strip() + .split() + ) + if output and output[0] == "slave" and output[3] == "connected": + logger.info("Replica is synced with primary") + return True + + return False + except ValkeyWorkloadCommandError: + logger.warning("Could not determine replica sync status from Valkey server.") + return False + + def _exec_cli_command( + self, + command: list[str], + hostname: str = "localhost", + connect_to: Literal["valkey", "sentinel"] = "valkey", + ) -> tuple[str, str | None]: + """Execute a Valkey CLI command on the server. + + Args: + command (list[str]): The CLI command to execute, as a list of arguments. + hostname (str): The hostname to connect to. Defaults to "localhost". + connect_to (Literal["valkey", "sentinel"]): Whether to connect to the valkey server or sentinel for executing the command. Defaults to "valkey". + + Returns: + tuple[str, str | None]: The standard output and standard error from the command execution. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute. 
+ """ + port = CLIENT_PORT if connect_to == "valkey" else SENTINEL_PORT + user = ( + CharmUsers.VALKEY_ADMIN.value + if connect_to == "valkey" + else CharmUsers.SENTINEL_CHARM_ADMIN.value + ) + password = ( + self.state.unit_server.valkey_admin_password + if connect_to == "valkey" + else self.state.cluster.internal_users_credentials.get( + CharmUsers.SENTINEL_CHARM_ADMIN.value, "" + ) + ) cli_command = [ "valkey-cli", + "-h", + hostname, + "-p", + str(port), "--user", - self.admin_user, + user, "--pass", - self.admin_password, + password, ] + command - output = self.workload.exec(cli_command) + output, error = self.workload.exec(cli_command) logger.debug("Executed command: %s, got output: %s", " ".join(command), output) - return output + if error: + logger.error("Error output from command '%s': %s", " ".join(command), error) + return output, error def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" @@ -86,7 +171,4 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje if not self.workload.can_connect: status_list.append(CharmStatuses.SERVICE_NOT_STARTED.value) - if not self.state.unit_server.is_started: - status_list.append(CharmStatuses.SERVICE_NOT_STARTED.value) - return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/statuses.py b/src/statuses.py index 84f91c6..5f589ff 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -30,6 +30,18 @@ class ClusterStatuses(Enum): status="blocked", message="Failed to update an internal user's password", running="async" ) + WAITING_FOR_SENTINEL_DISCOVERY = StatusObject( + status="maintenance", + message="Waiting for sentinel to be discovered by other units...", + running="async", + ) + + WAITING_FOR_REPLICA_SYNC = StatusObject( + status="maintenance", + message="Waiting for replica to sync with primary...", + running="async", + ) + class ValkeyServiceStatuses(Enum): """Collection of 
possible Valkey service related statuses.""" diff --git a/src/workload_k8s.py b/src/workload_k8s.py index c5d0cf9..9bafed0 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -102,14 +102,12 @@ def alive(self) -> bool: return True @override - def exec(self, command: list[str]) -> str: + def exec(self, command: list[str]) -> tuple[str, str | None]: try: process = self.container.exec( command=command, - combine_stderr=True, ) - output, _ = process.wait_output() - return output + return process.wait_output() except ops.pebble.ExecError as e: logger.error("Command failed with %s, %s", e.exit_code, e.stdout) raise ValkeyWorkloadCommandError(e) diff --git a/src/workload_vm.py b/src/workload_vm.py index 9312b29..b36d93b 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -88,7 +88,7 @@ def start(self) -> None: logger.exception(str(e)) @override - def exec(self, command: List[str]) -> str: + def exec(self, command: List[str]) -> tuple[str, str | None]: try: output = subprocess.run( command, @@ -96,9 +96,9 @@ def exec(self, command: List[str]) -> str: text=True, capture_output=True, timeout=10, - ).stdout.strip() - logger.debug(output) - return output + ) + logger.debug("Executed command: %s, got output: %s", " ".join(command), output.stdout) + return output.stdout, output.stderr except subprocess.CalledProcessError as e: logger.error("Command failed with %s, %s", e.returncode, e.stderr) raise ValkeyWorkloadCommandError(e) From a6d02bdabc5b0a25f16ff6be6b4103e656ccdb7a Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 05:38:13 +0000 Subject: [PATCH 052/159] add retries to sentinel discovery and replica sync check --- src/managers/cluster.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 682d891..b312ee8 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -7,6 +7,7 @@ import logging from typing import Literal +import tenacity from 
data_platform_helpers.advanced_statuses.models import StatusObject from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope @@ -65,6 +66,12 @@ def update_primary_auth(self) -> None: except ValkeyWorkloadCommandError: raise ValkeyConfigSetError("Could not set primaryauth on Valkey server.") + @tenacity.retry( + wait=tenacity.wait_fixed(5), + stop=tenacity.stop_after_attempt(5), + retry=tenacity.retry_if_result(lambda result: result is False), + reraise=True, + ) def is_sentinel_discovered(self) -> bool: """Check if the sentinel of the local unit was discovered by the other sentinels in the cluster.""" # list of active sentinels: units with started flag true @@ -91,6 +98,12 @@ def is_sentinel_discovered(self) -> bool: continue return True + @tenacity.retry( + wait=tenacity.wait_fixed(5), + stop=tenacity.stop_after_attempt(5), + retry=tenacity.retry_if_result(lambda result: result is False), + reraise=True, + ) def is_replica_synced(self) -> bool: """Check if the replica is synced with the primary.""" if self.state.unit_server.model.private_ip == self.state.cluster.model.primary_ip: From cde911e4055509d523d5a3b2a0f19c5b31863026 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 05:46:14 +0000 Subject: [PATCH 053/159] better statuses --- src/events/base_events.py | 23 +++++++++++++++++++++++ src/statuses.py | 11 +++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/events/base_events.py b/src/events/base_events.py index 3b7e69c..007c609 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -86,12 +86,35 @@ def _on_start(self, event: ops.StartEvent) -> None: logger.info( "Non-leader unit waiting for leader to set primary and internal user credentials" ) + self.charm.status.set_running_status( + ClusterStatuses.WAITING_FOR_PRIMARY_START.value, + scope="unit", + component_name=self.charm.cluster_manager.name, + 
statuses_state=self.charm.state.statuses, + ) event.defer() return + + self.charm.state.statuses.delete( + ClusterStatuses.WAITING_FOR_PRIMARY_START.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) if self.charm.state.cluster.model.starting_member != self.charm.unit.name: logger.info("Non-leader unit waiting for leader to choose it as starting member") + self.charm.status.set_running_status( + CharmStatuses.WAITING_TO_START.value, + scope="unit", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) event.defer() return + self.charm.state.statuses.delete( + CharmStatuses.WAITING_TO_START.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) try: self.charm.config_manager.update_local_valkey_admin() diff --git a/src/statuses.py b/src/statuses.py index 5f589ff..7139223 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -21,6 +21,11 @@ class CharmStatuses(Enum): message="Cannot access configured secret, check permissions", running="async", ) + WAITING_TO_START = StatusObject( + status="maintenance", + message="Waiting for leader to authorize service start", + running="async", + ) class ClusterStatuses(Enum): @@ -42,6 +47,12 @@ class ClusterStatuses(Enum): running="async", ) + WAITING_FOR_PRIMARY_START = StatusObject( + status="maintenance", + message="Waiting for primary to start and become active...", + running="async", + ) + class ValkeyServiceStatuses(Enum): """Collection of possible Valkey service related statuses.""" From efac5a83a81488664f11132eaf1829bf25042aff Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 07:09:39 +0000 Subject: [PATCH 054/159] seed data and auto decode --- tests/integration/helpers.py | 109 ++++++++++++++++++++++- tests/integration/k8s/ha/helpers.py | 3 +- tests/integration/k8s/ha/test_scaling.py | 8 +- tests/integration/k8s/test_charm.py | 4 +- 4 files changed, 116 insertions(+), 8 deletions(-) diff --git a/tests/integration/helpers.py 
b/tests/integration/helpers.py index b8b4501..81e9b8e 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -4,6 +4,8 @@ import contextlib import logging +import os +import time from datetime import datetime, timedelta from pathlib import Path from typing import List @@ -32,6 +34,7 @@ INTERNAL_USERS_SECRET_LABEL = ( f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) +SEED_KEY_PREFIX = "seed:key:" def does_status_match( @@ -249,10 +252,48 @@ def create_valkey_client( Returns: A Valkey client instance connected to the cluster. """ - client = valkey.Valkey(host=hostname, port=CLIENT_PORT, username=username, password=password) + client = valkey.Valkey( + host=hostname, + port=CLIENT_PORT, + username=username, + password=password, + decode_responses=True, + ) return client +def create_sentinel_client( + hostnames: list[str], + valkey_user: str | None = CharmUsers.VALKEY_ADMIN.value, + valkey_password: str | None = None, + sentinel_user: str | None = CharmUsers.SENTINEL_ADMIN.value, + sentinel_password: str | None = None, +) -> valkey.Sentinel: + """Create and return a Valkey Sentinel client connected to the cluster. + + Args: + hostnames: A list of hostnames for the Sentinel nodes. + valkey_user: The username for authentication to Valkey. + valkey_password: The password for the internal user for Valkey authentication. + sentinel_user: The username for authentication to Sentinel. + sentinel_password: The password for the internal user for Sentinel authentication. + + Returns: + A Valkey Sentinel client instance connected to the cluster. 
+ """ + sentinel_client = valkey.Sentinel( + [(host, 26379) for host in hostnames], + username=valkey_user, + password=valkey_password, + sentinel_kwargs={ + "password": sentinel_password, + "username": sentinel_user, + }, + decode_responses=True, + ) + return sentinel_client + + def set_password( juju: jubilant.Juju, password: str, @@ -302,9 +343,14 @@ def get_primary_ip(juju: jubilant.Juju, app: str) -> str: The IP address of the primary node. """ hostnames = get_cluster_hostnames(juju, app) - client = create_valkey_client(hostname=hostnames[0], password=get_password(juju)) - info = client.info("replication") - return hostnames[0] if info["role"] == "master" else info.get("master_host", "") + client = create_sentinel_client( + hostnames=hostnames, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_user=CharmUsers.SENTINEL_CHARM_ADMIN.value, + sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), + ) + return client.discover_master("primary")[0] def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN) -> str: @@ -319,3 +365,58 @@ def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN """ secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) return secret.get(f"{user.value}-password", "") + + +def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: + # Connect to Valkey + primary_ip = get_primary_ip(juju, APP_NAME) + client = valkey.Valkey( + host=primary_ip, + port=CLIENT_PORT, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + + # Configuration + value_size_bytes = 1024 # 1KB per value + batch_size = 5000 # Commands per pipeline + total_bytes_target = target_gb * 1024 * 1024 * 1024 + total_keys = total_bytes_target // value_size_bytes + + logger.debug( + f"Targeting ~{target_gb}GB ({total_keys:,} keys of {value_size_bytes} bytes 
each)" + ) + + start_time = time.time() + keys_added = 0 + + # Generate a fixed random block to reuse (saves CPU cycles on generation) + random_data = os.urandom(value_size_bytes).hex()[:value_size_bytes] + + try: + while keys_added < total_keys: + pipe = client.pipeline(transaction=False) + + # Fill the batch + for i in range(batch_size): + key_idx = keys_added + i + pipe.set(f"{SEED_KEY_PREFIX}{key_idx}", random_data) + + if keys_added + i >= total_keys: + break + + pipe.execute() + keys_added += batch_size + + # Progress reporting + elapsed = time.time() - start_time + percent = (keys_added / total_keys) * 100 + logger.info( + f"Progress: {percent:.1f}% | Keys: {keys_added:,} | Elapsed: {elapsed:.1f}s", + ) + + except Exception as e: + logger.error(f"\nError: {e}") + finally: + total_time = time.time() - start_time + logger.info(f"\nSeeding complete! Added {keys_added:,} keys in {total_time:.2f} seconds.") diff --git a/tests/integration/k8s/ha/helpers.py b/tests/integration/k8s/ha/helpers.py index 6cc84d9..3ea3967 100644 --- a/tests/integration/k8s/ha/helpers.py +++ b/tests/integration/k8s/ha/helpers.py @@ -85,8 +85,9 @@ def assert_continuous_writes_consistent( port=CLIENT_PORT, username=valkey_user, password=valkey_password, + decode_responses=True, ) - last_value = int(client.get(KEY).decode("utf-8")) + last_value = int(client.get(KEY)) assert last_written_value == last_value, ( f"endpoint: {endpoint}, expected value: {last_written_value}, current value: {last_value}" ) diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index d951a60..341159c 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -13,6 +13,7 @@ are_apps_active_and_agents_idle, get_cluster_hostnames, get_password, + seed_valkey, ) from .helpers import ( @@ -42,7 +43,12 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: ) -async def test_scale_up(juju: jubilant.Juju) -> None: +def 
test_seed_data(juju: jubilant.Juju) -> None: + """Seed some data to the cluster.""" + seed_valkey(juju, target_gb=1) + + +def test_scale_up(juju: jubilant.Juju) -> None: """Make sure new units are added to the valkey downtime.""" init_units_count = len(juju.status().apps[APP_NAME].units) init_endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index efa90ed..021a195 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -115,7 +115,7 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: assert client.ping() is True, ( f"Failed to authenticate with admin password after removing user secret on host {hostname}" ) - assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( + assert client.get(TEST_KEY) == TEST_VALUE, ( f"Failed to read data after admin password update on host {hostname}" ) @@ -218,7 +218,7 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: assert client.ping() is True, ( f"Failed to authenticate with new admin password on host {hostname}" ) - assert client.get(TEST_KEY) == bytes(TEST_VALUE, "utf-8"), ( + assert client.get(TEST_KEY) == TEST_VALUE, ( f"Failed to read data after admin password update on host {hostname}" ) From c165c68ff360e63678164e8d03a3bf59d5b64465 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 07:25:06 +0000 Subject: [PATCH 055/159] add different scenarios for unit test non leader starting --- tests/unit/test_charm.py | 81 +++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 46b6e9a..39fe735 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -142,7 +142,7 @@ def test_start_non_leader_unit(cloud_spec): assert "start" in [e.name for e in state_out.deferred] relation = testing.PeerRelation( - id=1, endpoint=PEER_RELATION, 
local_app_data={"primary_ip": "127.1.0.1"} + id=1, endpoint=PEER_RELATION, local_app_data={"primary-ip": "127.1.0.1"} ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -152,24 +152,69 @@ def test_start_non_leader_unit(cloud_spec): containers={container}, ) state_out = ctx.run(ctx.on.start(), state_in) - assert state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) - assert state_out.get_container(container.name).service_statuses.get( - SERVICE_METRIC_EXPORTER - ) - assert state_out.get_container(container.name).service_statuses[SERVICE_SENTINEL] - assert state_out.get_relation(1).local_unit_data["started"] == "true" - # container not ready - container = testing.Container(name=CONTAINER, can_connect=False) - state_in = testing.State( - model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), - leader=True, - relations={relation, status_peer_relation}, - containers={container}, - ) - - state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) + assert status_is(state_out, CharmStatuses.WAITING_TO_START.value) + + # replica syncing + with patch("managers.cluster.ClusterManager.is_replica_synced", return_value=False): + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"primary-ip": "127.1.0.1", "starting-member": "valkey/0"}, + ) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=False, + relations={relation, status_peer_relation}, + secrets={internal_passwords_secret}, + containers={container}, + ) + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value) + + # sentinel not yet discovered + with patch("managers.cluster.ClusterManager.is_sentinel_discovered", return_value=False): + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + 
local_app_data={"primary-ip": "127.1.0.1", "starting-member": "valkey/0"}, + ) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=False, + relations={relation, status_peer_relation}, + secrets={internal_passwords_secret}, + containers={container}, + ) + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) + + # Happy path with sentinel discovered and replica synced + with ( + patch("managers.cluster.ClusterManager.is_sentinel_discovered", return_value=True), + patch("managers.cluster.ClusterManager.is_replica_synced", return_value=True), + ): + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"primary-ip": "127.1.0.1", "starting-member": "valkey/0"}, + ) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=False, + relations={relation, status_peer_relation}, + secrets={internal_passwords_secret}, + containers={container}, + ) + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, CharmStatuses.ACTIVE_IDLE.value) + + assert state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) + assert state_out.get_container(container.name).service_statuses.get( + SERVICE_METRIC_EXPORTER + ) + assert state_out.get_container(container.name).service_statuses[SERVICE_SENTINEL] + assert state_out.get_relation(1).local_unit_data["started"] == "true" def test_update_status_leader_unit(cloud_spec): From 230b4e54e0d15e15c083a7e6956df835073a9979 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 07:56:57 +0000 Subject: [PATCH 056/159] update vm tests --- tests/integration/vm/test_charm.py | 203 ++++++++++++++++------------- 1 file changed, 113 insertions(+), 90 deletions(-) diff --git a/tests/integration/vm/test_charm.py b/tests/integration/vm/test_charm.py index 079cf21..eb22aa3 100644 --- 
a/tests/integration/vm/test_charm.py +++ b/tests/integration/vm/test_charm.py @@ -2,34 +2,33 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. import logging -from time import sleep import jubilant import pytest +from valkey import AuthenticationError from literals import ( INTERNAL_USERS_PASSWORD_CONFIG, CharmUsers, ) -from statuses import ClusterStatuses +from statuses import CharmStatuses, ClusterStatuses from tests.integration.helpers import ( APP_NAME, INTERNAL_USERS_SECRET_LABEL, - CharmStatuses, + are_apps_active_and_agents_idle, create_valkey_client, does_status_match, fast_forward, get_cluster_hostnames, - get_key, + get_password, + get_primary_ip, get_secret_by_label, - set_key, set_password, ) logger = logging.getLogger(__name__) -# TODO scale up when scaling is implemented -NUM_UNITS = 1 +NUM_UNITS = 3 TEST_KEY = "test_key" TEST_VALUE = "test_value" @@ -38,93 +37,91 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: """Build the charm-under-test and deploy it with three units.""" juju.deploy(charm, num_units=NUM_UNITS, trust=True) juju.wait( - lambda status: does_status_match( - status, - expected_app_statuses={APP_NAME: [CharmStatuses.SCALING_NOT_IMPLEMENTED.value]}, - ), + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=600, ) async def test_authentication(juju: jubilant.Juju) -> None: """Assert that we can authenticate to valkey.""" + primary = get_primary_ip(juju, APP_NAME) hostnames = get_cluster_hostnames(juju, APP_NAME) # try without authentication - logger.info("Ensure unauthenticated access fails") - with pytest.raises(Exception) as exc_info: - unauth_client = await create_valkey_client( - hostnames=hostnames, username=None, password=None - ) + with pytest.raises(AuthenticationError): + unauth_client = create_valkey_client(hostname=primary, username=None, password=None) await unauth_client.ping() - assert "NOAUTH" in str(exc_info.value), "Unauthenticated 
access did not fail as expected" # Authenticate with internal user - secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) - password = secret.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + password = get_password(juju, user=CharmUsers.VALKEY_ADMIN) assert password is not None, "Admin password secret not found" - logger.info("Check access works correctly when authenticated") - client = await create_valkey_client(hostnames=hostnames, password=password) - auth_result = await client.ping() - assert auth_result == b"PONG", "Authentication to Valkey cluster failed" + for hostname in hostnames: + client = create_valkey_client(hostname=hostname, password=password) + assert client.ping() is True, ( + f"Authentication to Valkey cluster failed for host {hostname}" + ) async def test_update_admin_password(juju: jubilant.Juju) -> None: """Assert the admin password is updated when adding a user secret to the config.""" - hostnames = get_cluster_hostnames(juju, APP_NAME) - # create a user secret and grant it to the application logger.info("Updating operator password") - secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) - old_password = secret.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + old_password = get_password(juju, user=CharmUsers.VALKEY_ADMIN) new_password = "some-password" set_password(juju, new_password) # wait for config-changed hook to finish executing - juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) - logger.info("Ensure password was updated on charm-internal secret") - updated_secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) - assert old_password != updated_secret.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + new_password_secret = get_password(juju, user=CharmUsers.VALKEY_ADMIN) + assert new_password_secret == new_password, "Admin password not 
updated in secret" - logger.info("Ensure access with old password no longer possible") - with pytest.raises(Exception) as exc_info: - unauth_client = await create_valkey_client( - hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=old_password - ) - await unauth_client.ping() - assert "WRONGPASS" in str(exc_info.value), "Unauthenticated access did not fail as expected" - - logger.info("Check access with updated password") - result = await set_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - value=TEST_VALUE, + primary = get_primary_ip(juju, APP_NAME) + + # confirm old password no longer works + with pytest.raises(AuthenticationError): + create_valkey_client( + hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=old_password + ).ping() + # ping with new password + client = create_valkey_client( + hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, "Failed to authenticate with new admin password" + + assert client.set(TEST_KEY, TEST_VALUE) is True, ( + "Failed to write data after admin password update" ) - assert result == "OK", "Failed to write data after admin password update" # update the config again and remove the option `admin-password` logger.info("Ensure access is still possible after removing config option") juju.config(app=APP_NAME, reset=[INTERNAL_USERS_PASSWORD_CONFIG]) # wait for config-changed hook to finish executing - juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) - # make sure we can still read data with the previously set password - assert await get_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - ) == bytes(TEST_VALUE, "utf-8") + for hostname in get_cluster_hostnames(juju, 
APP_NAME): + client = create_valkey_client( + hostname=hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, ( + f"Failed to authenticate with admin password after removing user secret on host {hostname}" + ) + assert client.get(TEST_KEY) == TEST_VALUE, ( + f"Failed to read data after admin password update on host {hostname}" + ) async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None: """Assert the admin password is updated when adding a user secret to the config.""" - hostnames = get_cluster_hostnames(juju, APP_NAME) + # create a user secret and grant it to the application secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) old_passwords = {} @@ -133,9 +130,6 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None if user == CharmUsers.VALKEY_ADMIN: continue old_passwords[user.value] = secret.get(f"{user.value}-password") - - # create a user secret and grant it to the application - logger.info("Updating invalid username") new_password = "some-password" set_password(juju, username="wrong-username", password=new_password) @@ -148,20 +142,22 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None timeout=1200, ) - logger.info("Updating password correctly now") set_password(juju, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) # wait for config-changed hook to finish executing - juju.wait(lambda status: jubilant.all_agents_idle(status, APP_NAME), timeout=1200) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) # perform read operation with the updated password - result = await set_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - value=TEST_VALUE, + primary = get_primary_ip(juju, APP_NAME) + client = create_valkey_client( + hostname=primary, 
username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, "Failed to authenticate with new admin password" + assert client.set(TEST_KEY, TEST_VALUE) is True, ( + "Failed to write data after admin password update" ) - assert result == "OK", "Failed to write data after admin password update" logger.info("Comparing other users passwords to previously") updated_secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) @@ -175,8 +171,6 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None async def test_user_secret_permissions(juju: jubilant.Juju) -> None: """If a user secret is not granted, ensure we can process updated permissions.""" - hostnames = get_cluster_hostnames(juju, APP_NAME) - logger.info("Creating new user secret") secret_name = "my_secret" new_password = "even-newer-password" @@ -199,29 +193,58 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # deferred `config_changed` event will be retried before `update_status` with fast_forward(juju): juju.grant_secret(identifier=secret_name, app=APP_NAME) - sleep(20) # allow some time for the permission to propagate - - # juju.wait( - # lambda status: jubilant.all_active(status, APP_NAME), - # timeout=1200, - # ) - juju.wait( - lambda status: does_status_match( - status, - expected_app_statuses={APP_NAME: [CharmStatuses.SCALING_NOT_IMPLEMENTED.value]}, - ), - timeout=600, - ) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) # perform read operation with the updated password - assert await get_key( - hostnames=hostnames, - username=CharmUsers.VALKEY_ADMIN.value, - password=new_password, - key=TEST_KEY, - ) == bytes(TEST_VALUE, "utf-8"), "Failed to read data after secret permissions were updated" + hostnames = get_cluster_hostnames(juju, APP_NAME) + primary = get_primary_ip(juju, APP_NAME) + client = create_valkey_client( + hostname=primary, 
username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + assert client.ping() is True, "Failed to authenticate with new admin password" + assert client.set(TEST_KEY, TEST_VALUE) is True, ( + "Failed to write data after admin password update" + ) + for hostname in hostnames: + client = create_valkey_client( + hostname=hostname, + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + ) + assert client.ping() is True, ( + f"Failed to authenticate with new admin password on host {hostname}" + ) + assert client.get(TEST_KEY) == TEST_VALUE, ( + f"Failed to read data after admin password update on host {hostname}" + ) logger.info("Password update successful after secret was granted") + # change replication password + replica_password = "replica-password" + juju.update_secret( + identifier=secret_id, + content={ + CharmUsers.VALKEY_ADMIN.value: new_password, + CharmUsers.VALKEY_REPLICA.value: replica_password, + }, + ) -# TODO Once scaling is implemented, add tests to check on password update in non-leader units + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + timeout=1200, + ) + + # perform pings with the updated replica password + for hostname in hostnames: + client = create_valkey_client( + hostname=hostname, + username=CharmUsers.VALKEY_REPLICA.value, + password=replica_password, + ) + assert client.ping() is True, ( + f"Failed to authenticate with new replica password on host {hostname}" + ) From 3dbb47152c3a60f788b8e7cf8538a18f92d0bdb9 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 10:35:02 +0000 Subject: [PATCH 057/159] remove primary ip from databag --- src/core/models.py | 1 - src/events/base_events.py | 29 ++++++++++++++++++----------- src/managers/cluster.py | 35 +++++++++++++++++++++++++++++++---- src/managers/config.py | 24 +++++++++++------------- 4 files changed, 60 insertions(+), 29 deletions(-) diff --git a/src/core/models.py b/src/core/models.py index 
450b5ba..273c87f 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -36,7 +36,6 @@ class PeerAppModel(PeerModel): charmed_stats_password: InternalUsersSecret = Field(default="") charmed_sentinel_peers_password: InternalUsersSecret = Field(default="") charmed_sentinel_operator_password: InternalUsersSecret = Field(default="") - primary_ip: str = Field(default="") starting_member: str = Field(default="") diff --git a/src/events/base_events.py b/src/events/base_events.py index 007c609..a29d2c2 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -81,7 +81,7 @@ def _on_start(self, event: ops.StartEvent) -> None: if not self.charm.unit.is_leader(): if ( not self.charm.state.cluster.internal_users_credentials - or not self.charm.state.cluster.model.primary_ip + or not self.charm.cluster_manager.number_units_started ): logger.info( "Non-leader unit waiting for leader to set primary and internal user credentials" @@ -116,11 +116,22 @@ def _on_start(self, event: ops.StartEvent) -> None: component=self.charm.cluster_manager.name, ) + if not ( + primary_ip := ( + self.charm.state.unit_server.model.private_ip + if self.charm.unit.is_leader() + else self.charm.cluster_manager.get_primary_ip() + ) + ): + logger.error("Primary IP not found. 
Deferring start event.") + event.defer() + return + try: self.charm.config_manager.update_local_valkey_admin() - self.charm.config_manager.set_config_properties() + self.charm.config_manager.set_config_properties(primary_ip=primary_ip) self.charm.config_manager.set_acl_file() - self.charm.config_manager.set_sentinel_config_properties() + self.charm.config_manager.set_sentinel_config_properties(primary_ip=primary_ip) self.charm.config_manager.set_sentinel_acl_file() self.charm.workload.mkdir( self.charm.workload.working_dir, user=CHARM_USER, group=CHARM_USER @@ -159,6 +170,10 @@ def _on_start(self, event: ops.StartEvent) -> None: scope="unit", component=self.charm.cluster_manager.name, ) + if self.charm.unit.is_leader(): + logger.info("Services started") + self.charm.state.unit_server.update({"started": True}) + return self.unit_fully_started.emit() @@ -273,14 +288,6 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: if not self.charm.unit.is_leader(): return - if not self.charm.state.cluster.model.primary_ip: - # set the primary to this unit if not already set - self.charm.state.cluster.update( - { - "primary_ip": self.charm.state.unit_server.model.private_ip, - } - ) - if self.charm.state.cluster.internal_users_credentials: logger.debug("Internal user credentials already set") return diff --git a/src/managers/cluster.py b/src/managers/cluster.py index b312ee8..b4dce81 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -36,8 +36,11 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): self.workload = workload self.admin_user = CharmUsers.VALKEY_ADMIN.value self.admin_password = self.state.unit_server.valkey_admin_password - # target only the unit's valkey server IP - self.cluster_ips = [self.workload.get_private_ip()] + + @property + def number_units_started(self) -> int: + """Return the number of units in the cluster that have their Valkey server started.""" + return len([unit for unit in self.state.servers if 
unit.model and unit.model.started]) def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" @@ -48,7 +51,7 @@ def reload_acl_file(self) -> None: def update_primary_auth(self) -> None: """Update the primaryauth runtime configuration on the Valkey server.""" - if self.state.unit_server.model.private_ip == self.state.cluster.model.primary_ip: + if self.get_primary_ip() == self.state.unit_server.model.private_ip: logger.info("Current unit is primary; no need to update primaryauth") return try: @@ -106,7 +109,7 @@ def is_sentinel_discovered(self) -> bool: ) def is_replica_synced(self) -> bool: """Check if the replica is synced with the primary.""" - if self.state.unit_server.model.private_ip == self.state.cluster.model.primary_ip: + if self.get_primary_ip() == self.state.unit_server.model.private_ip: logger.info("Current unit is primary; no need to check replica sync") return True try: @@ -126,6 +129,30 @@ def is_replica_synced(self) -> bool: logger.warning("Could not determine replica sync status from Valkey server.") return False + def get_primary_ip(self) -> str | None: + """Get the IP address of the primary node in the cluster.""" + started_servers = [ + unit for unit in self.state.servers if unit.model and unit.model.started + ] + + for unit in started_servers: + try: + output = self._exec_cli_command( + ["sentinel", "get-master-addr-by-name", PRIMARY_NAME], + connect_to="sentinel", + hostname=unit.model.private_ip, + )[0] + primary_ip = output.strip().split()[0] + logger.info(f"Primary IP address is {primary_ip}") + return primary_ip + except (IndexError, ValkeyWorkloadCommandError): + logger.error("Could not get primary IP from sentinel output.") + + logger.error( + "Could not determine primary IP from sentinels. 
Number of started servers: %d.", + len(started_servers), + ) + def _exec_cli_command( self, command: list[str], diff --git a/src/managers/config.py b/src/managers/config.py index 6067712..7fbcca2 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -44,8 +44,7 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): self.state = state self.workload = workload - @property - def config_properties(self) -> dict[str, str]: + def get_config_properties(self, primary_ip: str) -> dict[str, str]: """Assemble the config properties. Returns: @@ -85,17 +84,14 @@ def config_properties(self) -> dict[str, str]: logger.debug( "primary: %s, hostname: %s", - self.state.cluster.model.primary_ip, + primary_ip, self.state.unit_server.model.hostname, ) # replicaof - if ( - self.state.cluster.model.primary_ip - and self.state.cluster.model.primary_ip != self.state.unit_server.model.private_ip - ): + if primary_ip != self.state.unit_server.model.private_ip: # set replicaof - logger.debug("Setting replicaof to primary %s", self.state.cluster.model.primary_ip) - config_properties["replicaof"] = f"{self.state.cluster.model.primary_ip} {CLIENT_PORT}" + logger.debug("Setting replicaof to primary %s", primary_ip) + config_properties["replicaof"] = f"{primary_ip} {CLIENT_PORT}" config_properties["primaryuser"] = CharmUsers.VALKEY_REPLICA.value config_properties["primaryauth"] = self.state.cluster.internal_users_credentials.get( CharmUsers.VALKEY_REPLICA.value, "" @@ -103,10 +99,10 @@ def config_properties(self) -> dict[str, str]: return config_properties - def set_config_properties(self) -> None: + def set_config_properties(self, primary_ip: str) -> None: """Write the config properties to the config file.""" logger.debug("Writing configuration") - self.workload.write_config_file(config=self.config_properties) + self.workload.write_config_file(config=self.get_config_properties(primary_ip=primary_ip)) def set_acl_file(self, passwords: dict[str, str] | None = None) -> 
None: """Write the ACL file with appropriate user permissions. @@ -142,7 +138,7 @@ def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None password_hash = hashlib.sha256(password.encode("utf-8")).hexdigest() return f"user {user.value} on #{password_hash} {CHARM_USERS_ROLE_MAP[user]}\n" - def set_sentinel_config_properties(self) -> None: + def set_sentinel_config_properties(self, primary_ip: str) -> None: """Write sentinel configuration file.""" logger.debug("Writing Sentinel configuration") @@ -150,7 +146,9 @@ def set_sentinel_config_properties(self) -> None: sentinel_config += f"aclfile {SENTINEL_ACL_FILE}\n" # TODO consider adding quorum calculation based on number of units - sentinel_config += f"sentinel monitor {PRIMARY_NAME} {self.state.cluster.model.primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}\n" + sentinel_config += ( + f"sentinel monitor {PRIMARY_NAME} {primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}\n" + ) # auth settings # auth-user is used by sentinel to authenticate to the valkey primary sentinel_config += ( From 8b50dfff4380157ddbc041fe86922a1cff8dd20f Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 9 Feb 2026 10:54:29 +0000 Subject: [PATCH 058/159] fix unit tests --- tests/unit/test_charm.py | 42 ++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 39fe735..a163977 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -121,7 +121,6 @@ def test_start_non_leader_unit(cloud_spec): relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) - # happy path container = testing.Container(name=CONTAINER, can_connect=True) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -133,6 +132,7 @@ def test_start_non_leader_unit(cloud_spec): with ( 
patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("workload_k8s.ValkeyK8sWorkload.mkdir"), + patch("managers.cluster.ClusterManager.get_primary_ip", return_value="127.1.0.1"), ): state_out = ctx.run(ctx.on.start(), state_in) assert not state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) @@ -141,8 +141,23 @@ def test_start_non_leader_unit(cloud_spec): ) assert "start" in [e.name for e in state_out.deferred] + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=False, + relations={relation, status_peer_relation}, + secrets={internal_passwords_secret}, + containers={container}, + ) + state_out = ctx.run(ctx.on.start(), state_in) + + assert status_is(state_out, ClusterStatuses.WAITING_FOR_PRIMARY_START.value) + relation = testing.PeerRelation( - id=1, endpoint=PEER_RELATION, local_app_data={"primary-ip": "127.1.0.1"} + id=1, + endpoint=PEER_RELATION, + local_app_data={"primary-ip": "127.1.0.1"}, + peers_data={1: {"started": "true"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -160,7 +175,8 @@ def test_start_non_leader_unit(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"primary-ip": "127.1.0.1", "starting-member": "valkey/0"}, + local_app_data={"starting-member": "valkey/0"}, + peers_data={1: {"started": "true"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -177,7 +193,8 @@ def test_start_non_leader_unit(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"primary-ip": "127.1.0.1", "starting-member": "valkey/0"}, + local_app_data={"starting-member": "valkey/0"}, + peers_data={1: {"started": "true"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), 
@@ -197,7 +214,8 @@ def test_start_non_leader_unit(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"primary-ip": "127.1.0.1", "starting-member": "valkey/0"}, + local_app_data={"starting-member": "valkey/0"}, + peers_data={1: {"started": "true"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -429,9 +447,7 @@ def test_config_changed_leader_unit(cloud_spec): def test_config_changed_leader_unit_primary(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) - relation = testing.PeerRelation( - id=1, endpoint=PEER_RELATION, local_app_data={"primary_ip": "127.0.1.1"} - ) + relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( @@ -451,6 +467,7 @@ def test_config_changed_leader_unit_primary(cloud_spec): patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, patch("core.base_workload.WorkloadBase.get_private_ip", return_value="127.0.1.1"), + patch("managers.cluster.ClusterManager.get_primary_ip", return_value="127.0.1.1"), ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() @@ -498,7 +515,11 @@ def test_config_changed_leader_unit_wrong_username(cloud_spec): def test_change_password_secret_changed_non_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) - relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_unit_data={"started": "true", "private-ip": "127.0.1.0"}, + ) container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( @@ -521,11 +542,12 @@ def test_change_password_secret_changed_non_leader_unit(cloud_spec): ) as mock_update_password, 
patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, + patch("managers.cluster.ClusterManager.get_primary_ip", return_value="127.0.1.1"), ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_exec_command.assert_called_once_with(["acl", "load"]) + assert mock_exec_command.call_count == 2 def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spec): From 7c553be1abd0d6171320ac1ed1aad311de6109be Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 2026 05:23:17 +0000 Subject: [PATCH 059/159] fix vm startup --- src/core/base_workload.py | 20 +------------------- src/events/base_events.py | 17 ++++++++++++----- src/literals.py | 2 ++ src/managers/cluster.py | 8 +++++--- src/managers/config.py | 21 ++++++++++++++------- src/statuses.py | 27 ++++++++++++++++++++++----- src/workload_k8s.py | 1 + src/workload_vm.py | 5 +++++ tests/integration/helpers.py | 28 ++++++++++++++++++++-------- tests/integration/vm/test_charm.py | 3 ++- tests/unit/test_charm.py | 2 -- 11 files changed, 84 insertions(+), 50 deletions(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 096cc85..d9f31fc 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -27,6 +27,7 @@ def __init__(self) -> None: self.acl_file: pathops.PathProtocol self.sentinel_acl_file: pathops.PathProtocol self.working_dir: pathops.PathProtocol + self.cli: str @property @abstractmethod @@ -120,22 +121,3 @@ def write_config_file(self, config: dict[str, str]) -> None: ValueError, ) as e: raise ValkeyWorkloadCommandError(e) - - def mkdir( - self, - path: pathops.PathProtocol, - mode: int = 0o755, - user: str | None = None, - group: str | None = None, - exist_ok: bool = True, - ) -> None: - """Create a directory on disk. 
- - Args: - path (pathops.PathProtocol): The directory path to be created. - mode (int, optional): The directory mode (permissions). Defaults to None. - user (str, optional): The user name. Defaults to None. - group (str, optional): The group name. Defaults to None. - exist_ok (bool, optional): Whether to ignore if the directory already exists. Defaults to True. - """ - path.mkdir(mode=mode, user=user, group=group, exist_ok=exist_ok) diff --git a/src/events/base_events.py b/src/events/base_events.py index a29d2c2..98fd05f 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -12,7 +12,6 @@ from common.exceptions import ValkeyACLLoadError, ValkeyConfigSetError, ValkeyWorkloadCommandError from literals import ( - CHARM_USER, CLIENT_PORT, INTERNAL_USERS_PASSWORD_CONFIG, INTERNAL_USERS_SECRET_LABEL_SUFFIX, @@ -118,7 +117,7 @@ def _on_start(self, event: ops.StartEvent) -> None: if not ( primary_ip := ( - self.charm.state.unit_server.model.private_ip + self.charm.workload.get_private_ip() if self.charm.unit.is_leader() else self.charm.cluster_manager.get_primary_ip() ) @@ -133,13 +132,21 @@ def _on_start(self, event: ops.StartEvent) -> None: self.charm.config_manager.set_acl_file() self.charm.config_manager.set_sentinel_config_properties(primary_ip=primary_ip) self.charm.config_manager.set_sentinel_acl_file() - self.charm.workload.mkdir( - self.charm.workload.working_dir, user=CHARM_USER, group=CHARM_USER - ) except (ValkeyWorkloadCommandError, ValueError): logger.error("Failed to set configuration") + self.charm.status.set_running_status( + CharmStatuses.CONFIGURATION_ERROR.value, + scope="unit", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) event.defer() return + self.charm.state.statuses.delete( + CharmStatuses.CONFIGURATION_ERROR.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) self.charm.status.set_running_status( ValkeyServiceStatuses.SERVICE_STARTING.value, scope="unit", 
diff --git a/src/literals.py b/src/literals.py index 8031c14..65b0518 100644 --- a/src/literals.py +++ b/src/literals.py @@ -16,7 +16,9 @@ SNAP_COMMON_PATH = "var/snap/charmed-valkey/common" SNAP_CURRENT_PATH = "var/snap/charmed-valkey/current" SNAP_CONFIG_FILE = "etc/charmed-valkey/valkey.conf" +SNAP_SENTINEL_CONFIG_FILE = "etc/charmed-valkey/sentinel.conf" SNAP_ACL_FILE = "etc/charmed-valkey/users.acl" +SNAP_SENTINEL_ACL_FILE = "etc/charmed-valkey/sentinel-users.acl" # todo: update these paths once directories in the rock are complying with the standard CONFIG_FILE = "var/lib/valkey/valkey.conf" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index b4dce81..cccd778 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -156,14 +156,14 @@ def get_primary_ip(self) -> str | None: def _exec_cli_command( self, command: list[str], - hostname: str = "localhost", + hostname: str | None = None, connect_to: Literal["valkey", "sentinel"] = "valkey", ) -> tuple[str, str | None]: """Execute a Valkey CLI command on the server. Args: command (list[str]): The CLI command to execute, as a list of arguments. - hostname (str): The hostname to connect to. Defaults to "localhost". + hostname (str | None): The hostname to connect to. Defaults to private ip of unit. connect_to (Literal["valkey", "sentinel"]): Whether to connect to the valkey server or sentinel for executing the command. Defaults to "valkey". Returns: @@ -172,6 +172,8 @@ def _exec_cli_command( Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute. 
""" + if not hostname: + hostname = self.workload.get_private_ip() port = CLIENT_PORT if connect_to == "valkey" else SENTINEL_PORT user = ( CharmUsers.VALKEY_ADMIN.value @@ -186,7 +188,7 @@ def _exec_cli_command( ) ) cli_command = [ - "valkey-cli", + self.workload.cli, "-h", hostname, "-p", diff --git a/src/managers/config.py b/src/managers/config.py index 7fbcca2..6568bf9 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -163,13 +163,20 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: sentinel_config += f"sentinel failover-timeout {PRIMARY_NAME} 180000\n" sentinel_config += f"sentinel parallel-syncs {PRIMARY_NAME} 1\n" - self.workload.write_file( - sentinel_config, - self.workload.sentinel_config, - mode=0o600, - user=CHARM_USER, - group=CHARM_USER, - ) + if self.state.substrate == Substrate.K8S: + # on k8s we need to set the ownership of the sentinel config file to the non-root user that the valkey process runs as in order for sentinel to be able to read/write it + self.workload.write_file( + sentinel_config, + self.workload.sentinel_config, + mode=0o600, + user=CHARM_USER, + group=CHARM_USER, + ) + else: + self.workload.write_file( + sentinel_config, + self.workload.sentinel_config, + ) def set_sentinel_acl_file(self, passwords: dict[str, str] | None = None) -> None: """Write the Sentinel ACL file with appropriate user permissions. 
diff --git a/src/statuses.py b/src/statuses.py index 7139223..f1dc39b 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -14,8 +14,14 @@ class CharmStatuses(Enum): """Collection of possible statuses for the charm.""" - ACTIVE_IDLE = StatusObject(status="active", message="") - SERVICE_NOT_STARTED = StatusObject(status="blocked", message="Service not started") + ACTIVE_IDLE = StatusObject( + status="active", + message="", + ) + SERVICE_NOT_STARTED = StatusObject( + status="blocked", + message="Service not started", + ) SECRET_ACCESS_ERROR = StatusObject( status="blocked", message="Cannot access configured secret, check permissions", @@ -26,13 +32,20 @@ class CharmStatuses(Enum): message="Waiting for leader to authorize service start", running="async", ) + CONFIGURATION_ERROR = StatusObject( + status="blocked", + message="Configuration error, check logs for details", + running="async", + ) class ClusterStatuses(Enum): """Collection of possible cluster related statuses.""" PASSWORD_UPDATE_FAILED = StatusObject( - status="blocked", message="Failed to update an internal user's password", running="async" + status="blocked", + message="Failed to update an internal user's password", + running="async", ) WAITING_FOR_SENTINEL_DISCOVERY = StatusObject( @@ -58,8 +71,12 @@ class ValkeyServiceStatuses(Enum): """Collection of possible Valkey service related statuses.""" SERVICE_STARTING = StatusObject( - status="maintenance", message="waiting for valkey to start...", running="async" + status="maintenance", + message="waiting for valkey to start...", + running="async", ) SERVICE_NOT_RUNNING = StatusObject( - status="blocked", message="valkey service not running", running="async" + status="blocked", + message="valkey service not running", + running="async", ) diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 9bafed0..11ea9c4 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -42,6 +42,7 @@ def __init__(self, container: ops.Container | None) -> None: 
self.valkey_service = "valkey" self.sentinel_service = "valkey-sentinel" self.metric_service = "metric_exporter" + self.cli = "valkey-cli" @property @override diff --git a/src/workload_vm.py b/src/workload_vm.py index b36d93b..26a3287 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -20,6 +20,8 @@ SNAP_CURRENT_PATH, SNAP_NAME, SNAP_REVISION, + SNAP_SENTINEL_ACL_FILE, + SNAP_SENTINEL_CONFIG_FILE, SNAP_SERVICE, ) @@ -36,8 +38,11 @@ def __init__(self) -> None: self.root = pathops.LocalPath("/") self.config_file = self.root / SNAP_CURRENT_PATH / SNAP_CONFIG_FILE + self.sentinel_config = self.root / SNAP_CURRENT_PATH / SNAP_SENTINEL_CONFIG_FILE self.acl_file = self.root / SNAP_CURRENT_PATH / SNAP_ACL_FILE + self.sentinel_acl_file = self.root / SNAP_CURRENT_PATH / SNAP_SENTINEL_ACL_FILE self.working_dir = self.root / SNAP_COMMON_PATH / "var/lib/charmed-valkey" + self.cli = "charmed-valkey.cli" @property @override diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 81e9b8e..4777f75 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -336,6 +336,23 @@ def fast_forward(juju: jubilant.Juju): juju.model_config({"update-status-hook-interval": old}) +# TODO switch to sentinel once VM is implemented +# def get_primary_ip(juju: jubilant.Juju, app: str) -> str: +# """Get the primary node of the Valkey cluster. + + +# Returns: +# The IP address of the primary node. +# """ +# hostnames = get_cluster_hostnames(juju, app) +# client = create_sentinel_client( +# hostnames=hostnames, +# valkey_user=CharmUsers.VALKEY_ADMIN.value, +# valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), +# sentinel_user=CharmUsers.SENTINEL_CHARM_ADMIN.value, +# sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), +# ) +# return client.discover_master("primary")[0] def get_primary_ip(juju: jubilant.Juju, app: str) -> str: """Get the primary node of the Valkey cluster. 
@@ -343,14 +360,9 @@ def get_primary_ip(juju: jubilant.Juju, app: str) -> str: The IP address of the primary node. """ hostnames = get_cluster_hostnames(juju, app) - client = create_sentinel_client( - hostnames=hostnames, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_user=CharmUsers.SENTINEL_CHARM_ADMIN.value, - sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), - ) - return client.discover_master("primary")[0] + client = create_valkey_client(hostname=hostnames[0], password=get_password(juju)) + info = client.info("replication") + return hostnames[0] if info["role"] == "master" else info.get("master_host", "") def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN) -> str: diff --git a/tests/integration/vm/test_charm.py b/tests/integration/vm/test_charm.py index eb22aa3..3f0fa35 100644 --- a/tests/integration/vm/test_charm.py +++ b/tests/integration/vm/test_charm.py @@ -28,7 +28,8 @@ logger = logging.getLogger(__name__) -NUM_UNITS = 3 +# Update once scale up is implemented in VM +NUM_UNITS = 1 TEST_KEY = "test_key" TEST_VALUE = "test_value" diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index a163977..9b79bbf 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -83,7 +83,6 @@ def test_start_leader_unit(cloud_spec): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("workload_k8s.ValkeyK8sWorkload.mkdir"), ): # generate passwords state_out = ctx.run(ctx.on.leader_elected(), state_in) @@ -131,7 +130,6 @@ def test_start_non_leader_unit(cloud_spec): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("workload_k8s.ValkeyK8sWorkload.mkdir"), patch("managers.cluster.ClusterManager.get_primary_ip", return_value="127.1.0.1"), ): state_out = ctx.run(ctx.on.start(), state_in) From 40fb3009e4ef4255b63b86d89f74676cec824df6 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 
2026 05:38:57 +0000 Subject: [PATCH 060/159] move spread file to correct position --- tests/spread/{ => k8s}/test_scaling.py/task.yaml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/spread/{ => k8s}/test_scaling.py/task.yaml (100%) diff --git a/tests/spread/test_scaling.py/task.yaml b/tests/spread/k8s/test_scaling.py/task.yaml similarity index 100% rename from tests/spread/test_scaling.py/task.yaml rename to tests/spread/k8s/test_scaling.py/task.yaml From 50db8520cfe618eb96fc013baafe0d39acb00c69 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 2026 07:43:40 +0000 Subject: [PATCH 061/159] enable sentinel on VM --- src/events/base_events.py | 1 + src/managers/config.py | 7 +++++-- src/workload_vm.py | 7 +++++++ tests/integration/vm/test_charm.py | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 98fd05f..7d61fe2 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -184,6 +184,7 @@ def _on_start(self, event: ops.StartEvent) -> None: self.unit_fully_started.emit() + # TODO check how to trigger if defered without update status event def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: """Handle the unit-fully-started event.""" self.charm.status.set_running_status( diff --git a/src/managers/config.py b/src/managers/config.py index 6568bf9..069ffab 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -22,7 +22,6 @@ CLIENT_PORT, PRIMARY_NAME, QUORUM_NUMBER, - SENTINEL_ACL_FILE, SENTINEL_PORT, CharmUsers, Substrate, @@ -144,7 +143,11 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: sentinel_config = f"port {SENTINEL_PORT}\n" - sentinel_config += f"aclfile {SENTINEL_ACL_FILE}\n" + # TODO remove once deamonized in snap + if self.state.substrate == Substrate.VM: + sentinel_config += "daemonize yes\n" + + sentinel_config += f"aclfile {self.workload.sentinel_acl_file.as_posix()}\n" # TODO consider 
adding quorum calculation based on number of units sentinel_config += ( f"sentinel monitor {PRIMARY_NAME} {primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}\n" diff --git a/src/workload_vm.py b/src/workload_vm.py index 26a3287..08bae34 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -89,6 +89,13 @@ def install(self, revision: str | None = None, retry_and_raise: bool = True) -> def start(self) -> None: try: self.valkey.start(services=[SNAP_SERVICE]) + # TODO replace with snap service when PR merged + self.exec( + [ + "charmed-valkey.sentinel", + self.sentinel_config.as_posix(), + ] + ) except snap.SnapError as e: logger.exception(str(e)) diff --git a/tests/integration/vm/test_charm.py b/tests/integration/vm/test_charm.py index 3f0fa35..f808eb9 100644 --- a/tests/integration/vm/test_charm.py +++ b/tests/integration/vm/test_charm.py @@ -29,7 +29,7 @@ logger = logging.getLogger(__name__) # Update once scale up is implemented in VM -NUM_UNITS = 1 +NUM_UNITS = 3 TEST_KEY = "test_key" TEST_VALUE = "test_value" From d60a25be63297c630a2c3c3e5976de30d3346733 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 2026 07:44:17 +0000 Subject: [PATCH 062/159] mv cw to the base of integration tests --- tests/integration/{k8s/ha => }/continuous_writes.py | 0 .../{k8s/ha/helpers.py => cw_helpers.py} | 2 +- tests/integration/k8s/ha/test_scaling.py | 13 ++++++------- 3 files changed, 7 insertions(+), 8 deletions(-) rename tests/integration/{k8s/ha => }/continuous_writes.py (100%) rename tests/integration/{k8s/ha/helpers.py => cw_helpers.py} (97%) diff --git a/tests/integration/k8s/ha/continuous_writes.py b/tests/integration/continuous_writes.py similarity index 100% rename from tests/integration/k8s/ha/continuous_writes.py rename to tests/integration/continuous_writes.py diff --git a/tests/integration/k8s/ha/helpers.py b/tests/integration/cw_helpers.py similarity index 97% rename from tests/integration/k8s/ha/helpers.py rename to tests/integration/cw_helpers.py index 
3ea3967..df6ccd7 100644 --- a/tests/integration/k8s/ha/helpers.py +++ b/tests/integration/cw_helpers.py @@ -29,7 +29,7 @@ def start_continuous_writes( subprocess.Popen( [ "python3", - "tests/integration/k8s/ha/continuous_writes.py", + "tests/integration/continuous_writes.py", endpoints, valkey_user, valkey_password, diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 341159c..11c6676 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -7,6 +7,12 @@ import valkey from literals import CharmUsers +from tests.integration.cw_helpers import ( + assert_continuous_writes_consistent, + assert_continuous_writes_increasing, + start_continuous_writes, + stop_continuous_writes, +) from tests.integration.helpers import ( APP_NAME, IMAGE_RESOURCE, @@ -16,13 +22,6 @@ seed_valkey, ) -from .helpers import ( - assert_continuous_writes_consistent, - assert_continuous_writes_increasing, - start_continuous_writes, - stop_continuous_writes, -) - logger = logging.getLogger(__name__) NUM_UNITS = 3 From 76a9b52aab28d4f4c57acef5ae253c643789d996 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 2026 07:48:32 +0000 Subject: [PATCH 063/159] add scaling tests on VM --- tests/integration/vm/ha/__init__.py | 0 tests/integration/vm/ha/test_scaling.py | 107 ++++++++++++++++++++++ tests/spread/vm/test_scaling.py/task.yaml | 9 ++ 3 files changed, 116 insertions(+) create mode 100644 tests/integration/vm/ha/__init__.py create mode 100644 tests/integration/vm/ha/test_scaling.py create mode 100644 tests/spread/vm/test_scaling.py/task.yaml diff --git a/tests/integration/vm/ha/__init__.py b/tests/integration/vm/ha/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/vm/ha/test_scaling.py b/tests/integration/vm/ha/test_scaling.py new file mode 100644 index 0000000..11c6676 --- /dev/null +++ b/tests/integration/vm/ha/test_scaling.py @@ -0,0 +1,107 @@ 
+#!/usr/bin/env python3 +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. +import logging + +import jubilant +import valkey + +from literals import CharmUsers +from tests.integration.cw_helpers import ( + assert_continuous_writes_consistent, + assert_continuous_writes_increasing, + start_continuous_writes, + stop_continuous_writes, +) +from tests.integration.helpers import ( + APP_NAME, + IMAGE_RESOURCE, + are_apps_active_and_agents_idle, + get_cluster_hostnames, + get_password, + seed_valkey, +) + +logger = logging.getLogger(__name__) + +NUM_UNITS = 3 +TEST_KEY = "test_key" +TEST_VALUE = "test_value" + + +def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: + """Build the charm-under-test and deploy it with three units.""" + juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=1, trust=True) + juju.wait( + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), + timeout=600, + ) + + assert len(juju.status().apps[APP_NAME].units) == 1, ( + "Unexpected number of units after initial deploy" + ) + + +def test_seed_data(juju: jubilant.Juju) -> None: + """Seed some data to the cluster.""" + seed_valkey(juju, target_gb=1) + + +def test_scale_up(juju: jubilant.Juju) -> None: + """Make sure new units are added to the valkey downtime.""" + init_units_count = len(juju.status().apps[APP_NAME].units) + init_endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) + # start writing data to the cluster + start_continuous_writes( + endpoints=init_endpoints, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_user=CharmUsers.SENTINEL_ADMIN.value, + sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + ) + + # scale up + juju.add_unit(APP_NAME, num_units=2) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, idle_period=10, unit_count=init_units_count + 2 + ), + timeout=1200, + ) + num_units 
= len(juju.status().apps[APP_NAME].units) + assert num_units == init_units_count + 2, ( + f"Expected {init_units_count + 2} units, got {num_units}." + ) + + # check if all units have been added to the cluster + endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) + + sentinel_client = valkey.Sentinel( + [(host, 26379) for host in endpoints.split(",")], + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_kwargs={ + "password": get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + "username": CharmUsers.SENTINEL_ADMIN.value, + }, + ) + master = sentinel_client.master_for("primary") + info = master.info("replication") + connected_slaves = info.get("connected_slaves", 0) + assert connected_slaves == num_units - 1, ( + f"Expected {num_units - 1} connected slaves, got {connected_slaves}." + ) + + assert_continuous_writes_increasing( + endpoints=endpoints, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_user=CharmUsers.SENTINEL_ADMIN.value, + sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + ) + stop_continuous_writes() + assert_continuous_writes_consistent( + endpoints=endpoints, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) diff --git a/tests/spread/vm/test_scaling.py/task.yaml b/tests/spread/vm/test_scaling.py/task.yaml new file mode 100644 index 0000000..e309e66 --- /dev/null +++ b/tests/spread/vm/test_scaling.py/task.yaml @@ -0,0 +1,9 @@ +summary: test_scaling.py +environment: + TEST_MODULE: ha/test_scaling.py +systems: + - self-hosted-linux-amd64-noble-medium +execute: | + tox run -e integration -- "tests/integration/vm/$TEST_MODULE" --alluredir="$SPREAD_TASK/allure-results" +artifacts: + - allure-results From bc3b51b7f540c1f98c7588ac00c8a8815495b796 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 2026 07:48:58 +0000 
Subject: [PATCH 064/159] fix typos --- src/events/base_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 7d61fe2..b3b4c35 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -184,7 +184,7 @@ def _on_start(self, event: ops.StartEvent) -> None: self.unit_fully_started.emit() - # TODO check how to trigger if defered without update status event + # TODO check how to trigger if deferred without update status event def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: """Handle the unit-fully-started event.""" self.charm.status.set_running_status( From 47f5c1265a4455cdec1f1b43849e8c62353cd75b Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 2026 07:49:28 +0000 Subject: [PATCH 065/159] fix typo --- src/managers/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/managers/config.py b/src/managers/config.py index 069ffab..1c2ddfe 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -143,7 +143,7 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: sentinel_config = f"port {SENTINEL_PORT}\n" - # TODO remove once deamonized in snap + # TODO remove once daemonized in snap if self.state.substrate == Substrate.VM: sentinel_config += "daemonize yes\n" From 9a6f877f47dad4ac954f47ea633bd28b7c65d457 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Feb 2026 08:04:59 +0000 Subject: [PATCH 066/159] remove resource from vm test scaling --- tests/integration/vm/ha/test_scaling.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/vm/ha/test_scaling.py b/tests/integration/vm/ha/test_scaling.py index 11c6676..dcd3ede 100644 --- a/tests/integration/vm/ha/test_scaling.py +++ b/tests/integration/vm/ha/test_scaling.py @@ -15,7 +15,6 @@ ) from tests.integration.helpers import ( APP_NAME, - IMAGE_RESOURCE, are_apps_active_and_agents_idle, get_cluster_hostnames, 
get_password, @@ -31,7 +30,7 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: """Build the charm-under-test and deploy it with three units.""" - juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=1, trust=True) + juju.deploy(charm, num_units=1, trust=True) juju.wait( lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=600, From 1fdc7e99bb5ba1184e7edc6b0080e64cf5478f5e Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 11 Feb 2026 06:41:46 +0000 Subject: [PATCH 067/159] remove scaling comment and update cw to be pythonic --- poetry.lock | 90 +------- pyproject.toml | 1 - tests/integration/conftest.py | 29 +++ tests/integration/continuous_writes.py | 274 +++++++++++++++++------ tests/integration/cw_helpers.py | 20 +- tests/integration/k8s/ha/test_scaling.py | 16 +- tests/integration/vm/ha/test_scaling.py | 17 +- tests/integration/vm/test_charm.py | 1 - 8 files changed, 255 insertions(+), 193 deletions(-) create mode 100644 tests/integration/conftest.py diff --git a/poetry.lock b/poetry.lock index 8e570c8..d1bf741 100644 --- a/poetry.lock +++ b/poetry.lock @@ -60,25 +60,6 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[[package]] -name = "anyio" -version = "4.12.1" -description = "High-level concurrency and networking framework on top of asyncio or Trio" -optional = false -python-versions = ">=3.9" -groups = ["main"] -files = [ - {file = "anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c"}, - {file = "anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703"}, -] - -[package.dependencies] -idna = ">=2.8" -typing_extensions = {version = ">=4.5", markers = "python_version < \"3.13\""} - -[package.extras] -trio = ["trio (>=0.31.0) ; python_version < \"3.10\"", "trio (>=0.32.0) ; python_version >= \"3.10\""] - 
[[package]] name = "attrs" version = "25.4.0" @@ -278,21 +259,6 @@ rich = "*" all = ["pytest_operator (==0.36.0)"] tests = ["pytest_operator (==0.36.0)"] -[[package]] -name = "idna" -version = "3.11" -description = "Internationalized Domain Names in Applications (IDNA)" -optional = false -python-versions = ">=3.8" -groups = ["main"] -files = [ - {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, - {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, -] - -[package.extras] -all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] - [[package]] name = "importlib-metadata" version = "8.7.1" @@ -477,26 +443,6 @@ files = [ dev = ["pre-commit", "tox"] testing = ["coverage", "pytest", "pytest-benchmark"] -[[package]] -name = "protobuf" -version = "6.33.5" -description = "" -optional = false -python-versions = ">=3.9" -groups = ["main"] -files = [ - {file = "protobuf-6.33.5-cp310-abi3-win32.whl", hash = "sha256:d71b040839446bac0f4d162e758bea99c8251161dae9d0983a3b88dee345153b"}, - {file = "protobuf-6.33.5-cp310-abi3-win_amd64.whl", hash = "sha256:3093804752167bcab3998bec9f1048baae6e29505adaf1afd14a37bddede533c"}, - {file = "protobuf-6.33.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a5cb85982d95d906df1e2210e58f8e4f1e3cdc088e52c921a041f9c9a0386de5"}, - {file = "protobuf-6.33.5-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:9b71e0281f36f179d00cbcb119cb19dec4d14a81393e5ea220f64b286173e190"}, - {file = "protobuf-6.33.5-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:8afa18e1d6d20af15b417e728e9f60f3aa108ee76f23c3b2c07a2c3b546d3afd"}, - {file = "protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:cbf16ba3350fb7b889fca858fb215967792dc125b35c7976ca4818bee3521cf0"}, - {file = "protobuf-6.33.5-cp39-cp39-win32.whl", hash = "sha256:a3157e62729aafb8df6da2c03aa5c0937c7266c626ce11a278b6eb7963c4e37c"}, 
- {file = "protobuf-6.33.5-cp39-cp39-win_amd64.whl", hash = "sha256:8f04fa32763dcdb4973d537d6b54e615cc61108c7cb38fe59310c3192d29510a"}, - {file = "protobuf-6.33.5-py3-none-any.whl", hash = "sha256:69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02"}, - {file = "protobuf-6.33.5.tar.gz", hash = "sha256:6ddcac2a081f8b7b9642c09406bc6a4290128fce5f471cddd165960bb9119e5c"}, -] - [[package]] name = "pydantic" version = "2.12.5" @@ -900,18 +846,6 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] -[[package]] -name = "sniffio" -version = "1.3.1" -description = "Sniff out which async library your code is running under" -optional = false -python-versions = ">=3.7" -groups = ["main"] -files = [ - {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, - {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, -] - [[package]] name = "tenacity" version = "9.1.2" @@ -971,28 +905,6 @@ files = [ libvalkey = ["libvalkey (>=4.0.1)"] ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==23.2.1)", "requests (>=2.31.0)"] -[[package]] -name = "valkey-glide" -version = "0.0.0" -description = "Valkey GLIDE Async client. Supports Valkey and Redis OSS." 
-optional = false -python-versions = ">=3.9" -groups = ["main"] -files = [] -develop = false - -[package.dependencies] -anyio = ">=4.9.0" -protobuf = ">=6.20" -sniffio = "*" - -[package.source] -type = "git" -url = "https://github.com/skourta/valkey-glide" -reference = "add-build-rs" -resolved_reference = "5e2dfce07bed84dc8637e1c43aa55b135a76137f" -subdirectory = "python/glide-async" - [[package]] name = "websocket-client" version = "1.9.0" @@ -1033,4 +945,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "05f3431c740a9805c0ae2b05cd496a779a619f1c9443218d33ed717177cc98b5" +content-hash = "6710246ac0750c8538cb34d54f3465ad67023241c3cc2af36836b9f0a4d11354" diff --git a/pyproject.toml b/pyproject.toml index 4a122fc..f5441fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,6 @@ tenacity = "*" data-platform-helpers = ">=0.1.7" # TODO replace with official release once build from source is possible # https://github.com/valkey-io/valkey-glide/pull/5202 -valkey-glide = { git = "https://github.com/skourta/valkey-glide", subdirectory = "python/glide-async", branch = "add-build-rs" } [tool.poetry.requires-plugins] poetry-plugin-export = ">=1.8" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 0000000..119ab26 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,29 @@ +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import logging + +import jubilant +import pytest + +from tests.integration.continuous_writes import ContinuousWrites +from tests.integration.helpers import APP_NAME + +logger = logging.getLogger(__name__) + + +@pytest.fixture(scope="function") +def c_writes(juju: jubilant.Juju): + """Create instance of the ContinuousWrites.""" + app = APP_NAME + logger.debug(f"Creating ContinuousWrites instance for app with name {app}") + return ContinuousWrites(juju, app, log_written_values=True) + + +@pytest.fixture(scope="function") +def c_writes_runner(juju: jubilant.Juju, c_writes: ContinuousWrites): + """Start continuous write operations and clears writes at the end of the test.""" + c_writes.start() + yield + logger.info("Clearing continuous writes after test completion") + logger.info(c_writes.clear()) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index 7bd723a..6519207 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -1,87 +1,221 @@ #!/usr/bin/env python3 -# Copyright 2025 Canonical Ltd. +# Copyright 2026 Canonical Ltd. # See LICENSE file for licensing details. 
import logging -import pathlib -import signal -import sys +import os import time +from contextlib import contextmanager +from multiprocessing import Event, Process, Queue, log_to_stderr +from types import SimpleNamespace +from typing import Generator +import jubilant import valkey -from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed - -SENTINEL_PORT = 26379 +from tenacity import ( + RetryError, + Retrying, + retry, + stop_after_attempt, + stop_after_delay, + wait_fixed, + wait_random, +) + +from literals import CharmUsers +from tests.integration.helpers import get_cluster_hostnames, get_password logger = logging.getLogger(__name__) -WRITES_LAST_WRITTEN_VAL_PATH = "last_written_value" -LOG_FILE_PATH = "log_file" -continue_running = True - - -def continuous_writes( - endpoints: str, - valkey_user: str, - valkey_password: str, - sentinel_user: str, - sentinel_password: str, -) -> None: - key = "cw_key" - count = 0 - - client = valkey.Sentinel( - [(host, SENTINEL_PORT) for host in endpoints.split(",")], - username=valkey_user, - password=valkey_password, - sentinel_kwargs={"password": sentinel_password, "username": sentinel_user}, - ) - master = client.master_for("primary") - - # clean up from previous runs - pathlib.Path(WRITES_LAST_WRITTEN_VAL_PATH).unlink(missing_ok=True) - try: - master.delete(key) - except Exception: - pass - - while continue_running: - count += 1 +class WriteFailedError(Exception): + """Raised when a single write operation has failed.""" + + +class ContinuousWrites: + """Utility class for managing continuous writes to Valkey.""" + + KEY = "cw_key" + LAST_WRITTEN_VAL_PATH = "last_written_value" + SENTINEL_PORT = 26379 + + def __init__( + self, + juju: jubilant.Juju, + app: str, + initial_count: int = 0, + log_written_values: bool = False, + ): + self._juju = juju + self._app = app + self._is_stopped = True + self._event = None + self._queue = None + self._process = None + self._initial_count = initial_count + 
self._log_written_values = log_written_values + + def _get_config(self) -> SimpleNamespace: + """Fetch current cluster configuration from Juju.""" + return SimpleNamespace( + endpoints=",".join(get_cluster_hostnames(self._juju, app_name=self._app)), + valkey_password=get_password(self._juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_password=get_password(self._juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), + ) + + @contextmanager + def _get_client(self) -> Generator[valkey.Valkey, None, None]: + """Context manager to provide a master client and ensure cleanup.""" + conf = self._get_config() + sentinel = valkey.Sentinel( + [(host, self.SENTINEL_PORT) for host in conf.endpoints.split(",")], + username=CharmUsers.VALKEY_ADMIN.value, + password=conf.valkey_password, + sentinel_kwargs={ + "password": conf.sentinel_password, + "username": CharmUsers.SENTINEL_CHARM_ADMIN.value, + }, + ) + master = sentinel.master_for("primary") + try: + yield master + finally: + # Valkey clients use connection pools, but we ensure logical separation + master.close() + + @retry(wait=wait_fixed(5) + wait_random(0, 5), stop=stop_after_attempt(5)) + def start(self) -> None: + """Run continuous writes in the background.""" + if not self._is_stopped: + self.stop() + + self._is_stopped = False + self._event = Event() + self._queue = Queue() + + self._process = Process( + target=self._run_wrapper, + name="continuous_writes", + args=(self._event, self._queue, self._initial_count, self._log_written_values), + ) + + self.update() # Load initial config into queue + self._process.start() + + def update(self) -> None: + """Update cluster related conf (scaling, password changes).""" + if self._queue: + self._queue.put(self._get_config()) + + @retry(wait=wait_fixed(5) + wait_random(0, 5), stop=stop_after_attempt(5)) + def clear(self) -> SimpleNamespace | None: + """Stop writes and delete the tracking key/file.""" + result = None + if not self._is_stopped: + result = self.stop() + + with self._get_client() 
as client: + client.delete(self.KEY) + + if os.path.exists(self.LAST_WRITTEN_VAL_PATH): + os.remove(self.LAST_WRITTEN_VAL_PATH) + + return result + + def count(self) -> int: + """Return number of items in the list.""" + with self._get_client() as client: + return client.llen(self.KEY) + + def max_stored_id(self) -> int: + """Return the most recently inserted ID (top of list).""" + with self._get_client() as client: + val = client.lindex(self.KEY, 0) + return int(val) if val else 0 + + @retry(wait=wait_fixed(5) + wait_random(0, 5), stop=stop_after_attempt(5)) + def stop(self) -> SimpleNamespace: + """Stop the background process and return summary statistics.""" + if not self._is_stopped and self._process: + self._event.set() + self._process.join(timeout=30) + self._process.terminate() + self._is_stopped = True + + result = SimpleNamespace() + result.max_stored_id = self.max_stored_id() + result.count = self.count() + + # Retrieve the last ID the worker attempted to write try: - for attempt in Retrying(stop=stop_after_attempt(2), wait=wait_fixed(1)): + for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(2)): with attempt: - result = master.set(key, str(count)) - if not result: - raise ValueError - with open(LOG_FILE_PATH, "a") as log_file: - log_file.write(f"{count}\n") - except RetryError: - pass - - time.sleep(1) - else: - # write last expected written value on disk when terminating - pathlib.Path(WRITES_LAST_WRITTEN_VAL_PATH).write_text(str(count)) - - -def handle_stop_signal(signum, frame) -> None: - global continue_running - continue_running = False + with open(self.LAST_WRITTEN_VAL_PATH, "r") as f: + result.last_expected_id = int(f.read().strip()) + except (RetryError, FileNotFoundError, ValueError): + result.last_expected_id = -1 + + return result + + @staticmethod + def _run_wrapper( + event: Event, data_queue: Queue, starting_number: int, log_written_values: bool = False + ) -> None: + """Entry point for the Process; simplified without 
unnecessary asyncio.""" + proc_logger = log_to_stderr() + proc_logger.setLevel(logging.INFO) + + def _make_client(conf): + s = valkey.Sentinel( + [(h, ContinuousWrites.SENTINEL_PORT) for h in conf.endpoints.split(",")], + username=CharmUsers.VALKEY_ADMIN.value, + password=conf.valkey_password, + sentinel_kwargs={ + "password": conf.sentinel_password, + "username": CharmUsers.SENTINEL_CHARM_ADMIN.value, + }, + ) + return s.master_for("primary") + + current_val = starting_number + config = data_queue.get(block=True) + client = _make_client(config) + + proc_logger.info(f"Starting continuous writes from {current_val}") - -def main(): - endpoints = sys.argv[1] - valkey_user = sys.argv[2] - valkey_password = sys.argv[3] - sentinel_user = sys.argv[4] - sentinel_password = sys.argv[5] - - # handle the stop signal for a graceful stop of the writes process - signal.signal(signal.SIGTERM, handle_stop_signal) - - continuous_writes(endpoints, valkey_user, valkey_password, sentinel_user, sentinel_password) + try: + while not event.is_set(): + # Check for config updates (e.g. 
cluster scaling) + if not data_queue.empty(): + config = data_queue.get(block=False) + client = _make_client(config) + + try: + # note LPUSH returns the length of the list after the push + if client.lpush(ContinuousWrites.KEY, current_val): + if log_written_values: + proc_logger.info(f"Wrote value: {current_val}") + current_val += 1 + # Throttle to avoid flooding small test runners + time.sleep(1) + else: + raise WriteFailedError("LPUSH returned 0/None") + except Exception as e: + proc_logger.warning(f"Write failed at {current_val}: {e}") + time.sleep(2) + continue + finally: + # Persistent where we stopped + with open(ContinuousWrites.LAST_WRITTEN_VAL_PATH, "w") as f: + f.write(str(current_val - 1)) + os.fsync(f) if __name__ == "__main__": - main() + # Example usage + juju_env = jubilant.Juju(model="testing") + cw = ContinuousWrites(juju=juju_env, app="valkey", initial_count=100, log_written_values=False) + cw.clear() + cw.start() + time.sleep(10) + print(f"Stats: {cw.clear()}") diff --git a/tests/integration/cw_helpers.py b/tests/integration/cw_helpers.py index df6ccd7..1b068d4 100644 --- a/tests/integration/cw_helpers.py +++ b/tests/integration/cw_helpers.py @@ -10,12 +10,15 @@ from tenacity import Retrying, stop_after_attempt, wait_fixed from literals import CLIENT_PORT, SENTINEL_PORT +from tests.integration.continuous_writes import ContinuousWrites logger = logging.getLogger(__name__) -WRITES_LAST_WRITTEN_VAL_PATH = "last_written_value" +# WRITES_LAST_WRITTEN_VAL_PATH = "last_written_value" +# KEY = "cw_key" -KEY = "cw_key" +KEY = ContinuousWrites.KEY +WRITES_LAST_WRITTEN_VAL_PATH = ContinuousWrites.LAST_WRITTEN_VAL_PATH def start_continuous_writes( @@ -60,9 +63,9 @@ def assert_continuous_writes_increasing( sentinel_kwargs={"password": sentinel_password, "username": sentinel_user}, ) master = client.master_for("primary") - writes_count = int(master.get(KEY)) + writes_count = int(master.llen(KEY)) time.sleep(10) - more_writes = int(master.get(KEY)) + 
more_writes = int(master.llen(KEY)) assert more_writes > writes_count, "Writes not continuing to DB" logger.info("Continuous writes are increasing.") @@ -79,6 +82,9 @@ def assert_continuous_writes_consistent( with open(WRITES_LAST_WRITTEN_VAL_PATH, "r") as f: last_written_value = int(f.read().rstrip()) + if not last_written_value: + raise ValueError("Could not read last written value from file.") + for endpoint in endpoints.split(","): client = valkey.Valkey( host=endpoint, @@ -87,8 +93,12 @@ def assert_continuous_writes_consistent( password=valkey_password, decode_responses=True, ) - last_value = int(client.get(KEY)) + last_value = int(client.lrange(KEY, 0, 0)[0]) + count = int(client.llen(KEY)) assert last_written_value == last_value, ( f"endpoint: {endpoint}, expected value: {last_written_value}, current value: {last_value}" ) + assert count == last_written_value + 1, ( + f"endpoint: {endpoint}, expected count: {last_written_value + 1}, current count: {count}" + ) logger.info(f"Continuous writes are consistent on {endpoint}.") diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 11c6676..9585c40 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -10,8 +10,6 @@ from tests.integration.cw_helpers import ( assert_continuous_writes_consistent, assert_continuous_writes_increasing, - start_continuous_writes, - stop_continuous_writes, ) from tests.integration.helpers import ( APP_NAME, @@ -47,18 +45,9 @@ def test_seed_data(juju: jubilant.Juju) -> None: seed_valkey(juju, target_gb=1) -def test_scale_up(juju: jubilant.Juju) -> None: +def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: """Make sure new units are added to the valkey downtime.""" init_units_count = len(juju.status().apps[APP_NAME].units) - init_endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) - # start writing data to the cluster - start_continuous_writes( - 
endpoints=init_endpoints, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_user=CharmUsers.SENTINEL_ADMIN.value, - sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), - ) # scale up juju.add_unit(APP_NAME, num_units=2) @@ -99,7 +88,8 @@ def test_scale_up(juju: jubilant.Juju) -> None: sentinel_user=CharmUsers.SENTINEL_ADMIN.value, sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), ) - stop_continuous_writes() + logger.info("Stopping continuous writes after scale up test.") + logger.info(c_writes.stop()) assert_continuous_writes_consistent( endpoints=endpoints, valkey_user=CharmUsers.VALKEY_ADMIN.value, diff --git a/tests/integration/vm/ha/test_scaling.py b/tests/integration/vm/ha/test_scaling.py index dcd3ede..fbd977e 100644 --- a/tests/integration/vm/ha/test_scaling.py +++ b/tests/integration/vm/ha/test_scaling.py @@ -10,8 +10,6 @@ from tests.integration.cw_helpers import ( assert_continuous_writes_consistent, assert_continuous_writes_increasing, - start_continuous_writes, - stop_continuous_writes, ) from tests.integration.helpers import ( APP_NAME, @@ -46,19 +44,9 @@ def test_seed_data(juju: jubilant.Juju) -> None: seed_valkey(juju, target_gb=1) -def test_scale_up(juju: jubilant.Juju) -> None: +def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: """Make sure new units are added to the valkey downtime.""" init_units_count = len(juju.status().apps[APP_NAME].units) - init_endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) - # start writing data to the cluster - start_continuous_writes( - endpoints=init_endpoints, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_user=CharmUsers.SENTINEL_ADMIN.value, - sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), - ) - # scale up juju.add_unit(APP_NAME, num_units=2) juju.wait( @@ -98,7 +86,8 @@ 
def test_scale_up(juju: jubilant.Juju) -> None: sentinel_user=CharmUsers.SENTINEL_ADMIN.value, sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), ) - stop_continuous_writes() + logger.info("Stopping continuous writes after scale up test.") + logger.info(c_writes.stop()) assert_continuous_writes_consistent( endpoints=endpoints, valkey_user=CharmUsers.VALKEY_ADMIN.value, diff --git a/tests/integration/vm/test_charm.py b/tests/integration/vm/test_charm.py index f808eb9..eb22aa3 100644 --- a/tests/integration/vm/test_charm.py +++ b/tests/integration/vm/test_charm.py @@ -28,7 +28,6 @@ logger = logging.getLogger(__name__) -# Update once scale up is implemented in VM NUM_UNITS = 3 TEST_KEY = "test_key" TEST_VALUE = "test_value" From 3218f3725c550c7284dfabebb114e49d2ace377e Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 11 Feb 2026 06:46:14 +0000 Subject: [PATCH 068/159] remove unused patch --- tests/unit/test_charm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 9b79bbf..85cb0f8 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -402,7 +402,6 @@ def test_config_changed_leader_unit_valkey_update_fails(cloud_spec): ) with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("common.client.ValkeyClient.create_client", side_effect=Exception("fail")), patch("core.models.RelationState.update") as mock_update, ): ctx.run(ctx.on.config_changed(), state_in) From e46b5f3b844416e0105892c3b2631bd9eac862b9 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 11 Feb 2026 06:58:48 +0000 Subject: [PATCH 069/159] turn off write logging for CW --- tests/integration/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 119ab26..96946b7 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -17,7 +17,7 @@ def c_writes(juju: jubilant.Juju): """Create 
instance of the ContinuousWrites.""" app = APP_NAME logger.debug(f"Creating ContinuousWrites instance for app with name {app}") - return ContinuousWrites(juju, app, log_written_values=True) + return ContinuousWrites(juju, app) @pytest.fixture(scope="function") From 7c5afc283efe63b367068f4ccce3784a665f6b00 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 11 Feb 2026 10:07:59 +0000 Subject: [PATCH 070/159] add sentinel as daemon for vm and fix permissions for files --- src/core/base_workload.py | 3 +- src/literals.py | 4 +-- src/managers/config.py | 41 +++++++++++++------------- src/workload_k8s.py | 14 ++++----- src/workload_vm.py | 15 ++++------ tests/integration/continuous_writes.py | 34 +++++++++++++++++---- tests/integration/helpers.py | 28 +++++------------- 7 files changed, 73 insertions(+), 66 deletions(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index d9f31fc..9649bfc 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -28,6 +28,7 @@ def __init__(self) -> None: self.sentinel_acl_file: pathops.PathProtocol self.working_dir: pathops.PathProtocol self.cli: str + self.user: str @property @abstractmethod @@ -111,7 +112,7 @@ def write_config_file(self, config: dict[str, str]) -> None: path = self.config_file try: - path.write_text(config_string) + path.write_text(config_string, user=self.user, group=self.user) except ( FileNotFoundError, LookupError, diff --git a/src/literals.py b/src/literals.py index 65b0518..f3ea6ba 100644 --- a/src/literals.py +++ b/src/literals.py @@ -7,12 +7,12 @@ from enum import StrEnum CHARM = "valkey" -CHARM_USER = "valkey" CONTAINER = "valkey" SNAP_NAME = "charmed-valkey" -SNAP_REVISION = 14 +SNAP_REVISION = 16 SNAP_SERVICE = "server" +SNAP_SENTINEL_SERVICE = "sentinel" SNAP_COMMON_PATH = "var/snap/charmed-valkey/common" SNAP_CURRENT_PATH = "var/snap/charmed-valkey/current" SNAP_CONFIG_FILE = "etc/charmed-valkey/valkey.conf" diff --git a/src/managers/config.py 
b/src/managers/config.py index 1c2ddfe..b88740f 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -17,7 +17,6 @@ from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import ( - CHARM_USER, CHARM_USERS_ROLE_MAP, CLIENT_PORT, PRIMARY_NAME, @@ -118,7 +117,12 @@ def set_acl_file(self, passwords: dict[str, str] | None = None) -> None: if "VALKEY_" not in user.name: continue acl_content += self._get_user_acl_line(user, passwords=passwords) - self.workload.write_file(acl_content, self.workload.acl_file) + self.workload.write_file( + acl_content, + self.workload.acl_file, + user=self.workload.user, + group=self.workload.user, + ) def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None = None) -> str: """Generate an ACL line for a given user. @@ -143,10 +147,6 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: sentinel_config = f"port {SENTINEL_PORT}\n" - # TODO remove once daemonized in snap - if self.state.substrate == Substrate.VM: - sentinel_config += "daemonize yes\n" - sentinel_config += f"aclfile {self.workload.sentinel_acl_file.as_posix()}\n" # TODO consider adding quorum calculation based on number of units sentinel_config += ( @@ -166,20 +166,14 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: sentinel_config += f"sentinel failover-timeout {PRIMARY_NAME} 180000\n" sentinel_config += f"sentinel parallel-syncs {PRIMARY_NAME} 1\n" - if self.state.substrate == Substrate.K8S: - # on k8s we need to set the ownership of the sentinel config file to the non-root user that the valkey process runs as in order for sentinel to be able to read/write it - self.workload.write_file( - sentinel_config, - self.workload.sentinel_config, - mode=0o600, - user=CHARM_USER, - group=CHARM_USER, - ) - else: - self.workload.write_file( - sentinel_config, - self.workload.sentinel_config, - ) + # on k8s we need to set the ownership of the sentinel config file to 
the non-root user that the valkey process runs as in order for sentinel to be able to read/write it + self.workload.write_file( + sentinel_config, + self.workload.sentinel_config, + mode=0o600, + user=self.workload.user, + group=self.workload.user, + ) def set_sentinel_acl_file(self, passwords: dict[str, str] | None = None) -> None: """Write the Sentinel ACL file with appropriate user permissions. @@ -196,7 +190,12 @@ def set_sentinel_acl_file(self, passwords: dict[str, str] | None = None) -> None if "VALKEY_" in user.name: continue acl_content += self._get_user_acl_line(user, passwords=passwords) - self.workload.write_file(acl_content, self.workload.sentinel_acl_file) + self.workload.write_file( + acl_content, + self.workload.sentinel_acl_file, + user=self.workload.user, + group=self.workload.user, + ) def generate_password(self) -> str: """Create randomized string for use as app passwords. diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 11ea9c4..5fe5482 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -15,7 +15,6 @@ from literals import ( ACL_FILE, CHARM, - CHARM_USER, CONFIG_FILE, SENTINEL_ACL_FILE, SENTINEL_CONFIG_FILE, @@ -43,6 +42,7 @@ def __init__(self, container: ops.Container | None) -> None: self.sentinel_service = "valkey-sentinel" self.metric_service = "metric_exporter" self.cli = "valkey-cli" + self.user = "valkey" @property @override @@ -60,24 +60,24 @@ def pebble_layer(self) -> ops.pebble.Layer: "override": "replace", "summary": "Valkey service", "command": f"valkey-server {self.config_file.as_posix()}", - "user": CHARM_USER, - "group": CHARM_USER, + "user": self.user, + "group": self.user, "startup": "enabled", }, self.sentinel_service: { "override": "replace", "summary": "Valkey sentinel service", "command": f"valkey-sentinel {self.sentinel_config.as_posix()}", - "user": CHARM_USER, - "group": CHARM_USER, + "user": self.user, + "group": self.user, "startup": "enabled", }, self.metric_service: { "override": "replace", 
"summary": "Valkey metric exporter", "command": "bin/redis_exporter", - "user": CHARM_USER, - "group": CHARM_USER, + "user": self.user, + "group": self.user, "startup": "enabled", }, }, diff --git a/src/workload_vm.py b/src/workload_vm.py index 08bae34..fdfd8ed 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -22,6 +22,7 @@ SNAP_REVISION, SNAP_SENTINEL_ACL_FILE, SNAP_SENTINEL_CONFIG_FILE, + SNAP_SENTINEL_SERVICE, SNAP_SERVICE, ) @@ -43,6 +44,7 @@ def __init__(self) -> None: self.sentinel_acl_file = self.root / SNAP_CURRENT_PATH / SNAP_SENTINEL_ACL_FILE self.working_dir = self.root / SNAP_COMMON_PATH / "var/lib/charmed-valkey" self.cli = "charmed-valkey.cli" + self.user = "snap_daemon" @property @override @@ -88,14 +90,7 @@ def install(self, revision: str | None = None, retry_and_raise: bool = True) -> @override def start(self) -> None: try: - self.valkey.start(services=[SNAP_SERVICE]) - # TODO replace with snap service when PR merged - self.exec( - [ - "charmed-valkey.sentinel", - self.sentinel_config.as_posix(), - ] - ) + self.valkey.start(services=[SNAP_SERVICE, SNAP_SENTINEL_SERVICE]) except snap.SnapError as e: logger.exception(str(e)) @@ -122,6 +117,8 @@ def exec(self, command: List[str]) -> tuple[str, str | None]: def alive(self) -> bool: """Check if the Valkey service is running.""" try: - return bool(self.valkey.services[SNAP_SERVICE]["active"]) + return bool(self.valkey.services[SNAP_SERVICE]["active"]) and bool( + self.valkey.services[SNAP_SENTINEL_SERVICE]["active"] + ) except KeyError: return False diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index 6519207..ea1ae44 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -45,6 +45,7 @@ def __init__( app: str, initial_count: int = 0, log_written_values: bool = False, + in_between_sleep: float = 1, ): self._juju = juju self._app = app @@ -54,6 +55,7 @@ def __init__( self._process = None 
self._initial_count = initial_count self._log_written_values = log_written_values + self._in_between_sleep = in_between_sleep def _get_config(self) -> SimpleNamespace: """Fetch current cluster configuration from Juju.""" @@ -96,7 +98,13 @@ def start(self) -> None: self._process = Process( target=self._run_wrapper, name="continuous_writes", - args=(self._event, self._queue, self._initial_count, self._log_written_values), + args=( + self._event, + self._queue, + self._initial_count, + self._log_written_values, + self._in_between_sleep, + ), ) self.update() # Load initial config into queue @@ -159,7 +167,11 @@ def stop(self) -> SimpleNamespace: @staticmethod def _run_wrapper( - event: Event, data_queue: Queue, starting_number: int, log_written_values: bool = False + event: Event, + data_queue: Queue, + starting_number: int, + log_written_values: bool = False, + in_between_sleep: float = 1, ) -> None: """Entry point for the Process; simplified without unnecessary asyncio.""" proc_logger = log_to_stderr() @@ -197,7 +209,7 @@ def _make_client(conf): proc_logger.info(f"Wrote value: {current_val}") current_val += 1 # Throttle to avoid flooding small test runners - time.sleep(1) + time.sleep(in_between_sleep) else: raise WriteFailedError("LPUSH returned 0/None") except Exception as e: @@ -214,8 +226,18 @@ def _make_client(conf): if __name__ == "__main__": # Example usage juju_env = jubilant.Juju(model="testing") - cw = ContinuousWrites(juju=juju_env, app="valkey", initial_count=100, log_written_values=False) + cw = ContinuousWrites( + juju=juju_env, + app="valkey", + initial_count=100, + log_written_values=True, + in_between_sleep=1, + ) cw.clear() cw.start() - time.sleep(10) - print(f"Stats: {cw.clear()}") + # continue until manually stopped by ctrl+c or by calling cw.stop() from another process + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + print(f"Stats: {cw.clear()}") diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 
4777f75..81e9b8e 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -336,23 +336,6 @@ def fast_forward(juju: jubilant.Juju): juju.model_config({"update-status-hook-interval": old}) -# TODO switch to sentinel once VM is implemented -# def get_primary_ip(juju: jubilant.Juju, app: str) -> str: -# """Get the primary node of the Valkey cluster. - - -# Returns: -# The IP address of the primary node. -# """ -# hostnames = get_cluster_hostnames(juju, app) -# client = create_sentinel_client( -# hostnames=hostnames, -# valkey_user=CharmUsers.VALKEY_ADMIN.value, -# valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), -# sentinel_user=CharmUsers.SENTINEL_CHARM_ADMIN.value, -# sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), -# ) -# return client.discover_master("primary")[0] def get_primary_ip(juju: jubilant.Juju, app: str) -> str: """Get the primary node of the Valkey cluster. @@ -360,9 +343,14 @@ def get_primary_ip(juju: jubilant.Juju, app: str) -> str: The IP address of the primary node. 
""" hostnames = get_cluster_hostnames(juju, app) - client = create_valkey_client(hostname=hostnames[0], password=get_password(juju)) - info = client.info("replication") - return hostnames[0] if info["role"] == "master" else info.get("master_host", "") + client = create_sentinel_client( + hostnames=hostnames, + valkey_user=CharmUsers.VALKEY_ADMIN.value, + valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + sentinel_user=CharmUsers.SENTINEL_CHARM_ADMIN.value, + sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), + ) + return client.discover_master("primary")[0] def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN) -> str: From 5244669c965295a416b767c25d8ea8a598803683 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 11 Feb 2026 12:01:53 +0000 Subject: [PATCH 071/159] fix role for valkey sentinel user --- src/literals.py | 2 +- src/managers/config.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/literals.py b/src/literals.py index f3ea6ba..60dccb3 100644 --- a/src/literals.py +++ b/src/literals.py @@ -56,7 +56,7 @@ class CharmUsers(StrEnum): CHARM_USERS_ROLE_MAP = { CharmUsers.VALKEY_ADMIN: "~* +@all", - CharmUsers.VALKEY_SENTINEL: "+client +config +info +publish +subscribe +monitor +ping +replicaof +failover +script|kill +multi +exec &__sentinel__:hello", + CharmUsers.VALKEY_SENTINEL: "+subscribe +publish +failover +script|kill +ping +info +multi +slaveof +config +client +exec &__sentinel__:hello", CharmUsers.VALKEY_REPLICA: "+psync +replconf +ping", CharmUsers.VALKEY_MONITORING: "-@all +@connection +memory -readonly +strlen +config|get +xinfo +pfcount -quit +zcard +type +xlen -readwrite -command +client -wait +scard +llen +hlen +get +eval +slowlog +cluster|info +cluster|slots +cluster|nodes -hello -echo +info +latency +scan -reset -auth -asking", CharmUsers.SENTINEL_ADMIN: "~* +@all", diff --git a/src/managers/config.py b/src/managers/config.py index 
b88740f..d72d83f 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -90,10 +90,10 @@ def get_config_properties(self, primary_ip: str) -> dict[str, str]: # set replicaof logger.debug("Setting replicaof to primary %s", primary_ip) config_properties["replicaof"] = f"{primary_ip} {CLIENT_PORT}" - config_properties["primaryuser"] = CharmUsers.VALKEY_REPLICA.value - config_properties["primaryauth"] = self.state.cluster.internal_users_credentials.get( - CharmUsers.VALKEY_REPLICA.value, "" - ) + config_properties["primaryuser"] = CharmUsers.VALKEY_REPLICA.value + config_properties["primaryauth"] = self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_REPLICA.value, "" + ) return config_properties From 9a0a081df8a4d4c1fe2acde101b9a5982a498a4d Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 12 Feb 2026 05:19:36 +0000 Subject: [PATCH 072/159] update to the new rock and its user --- metadata.yaml | 2 +- src/workload_k8s.py | 2 +- tests/unit/test_charm.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata.yaml b/metadata.yaml index ecb700c..69e11f4 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -27,7 +27,7 @@ resources: valkey-image: type: oci-image description: OCI Image for Valkey - upstream-source: ghcr.io/canonical/valkey@sha256:3f884d584eac51f3794d3538861f84e5f9e866b890ae0869deb7e4df6fc8eb21 + upstream-source: ghcr.io/canonical/valkey@sha256:bb8166ff96c5159ed0ab04e49a7b3e5b6a074cbd90ec66baf96e4d03e2fd7c90 peers: valkey-peers: diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 5fe5482..c991e32 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -42,7 +42,7 @@ def __init__(self, container: ops.Container | None) -> None: self.sentinel_service = "valkey-sentinel" self.metric_service = "metric_exporter" self.cli = "valkey-cli" - self.user = "valkey" + self.user = "_daemon_" @property @override diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 85cb0f8..5605cd1 100644 
--- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -22,7 +22,7 @@ from .helpers import status_is -CHARM_USER = "valkey" +CHARM_USER = "_daemon_" CONTAINER = "valkey" SERVICE_VALKEY = "valkey" SERVICE_METRIC_EXPORTER = "metric_exporter" From 60504e26e50fc34551c66abd17d0ab27c88102a3 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 12 Feb 2026 05:20:04 +0000 Subject: [PATCH 073/159] only log the command no arguments to avoid leaking secrets --- src/managers/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index cccd778..3760cc4 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -199,9 +199,9 @@ def _exec_cli_command( password, ] + command output, error = self.workload.exec(cli_command) - logger.debug("Executed command: %s, got output: %s", " ".join(command), output) + logger.debug("Executed command: %s, got output: %s", " ".join(command[0]), output) if error: - logger.error("Error output from command '%s': %s", " ".join(command), error) + logger.error("Error output from command '%s': %s", " ".join(command[0]), error) return output, error def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: From ea713537486540f1335589c6cf3c414a3a8be38c Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 13 Feb 2026 07:19:44 +0000 Subject: [PATCH 074/159] refactored start procedure and added rene feedback --- src/charm.py | 2 + src/common/client.py | 107 ++++++++--------------- src/core/cluster_state.py | 5 ++ src/core/models.py | 7 +- src/events/base_events.py | 165 ++++++++++++----------------------- src/literals.py | 9 ++ src/managers/cluster.py | 177 ++++++++++++-------------------------- src/managers/sentinel.py | 116 +++++++++++++++++++++++++ src/statuses.py | 4 - src/workload_vm.py | 1 - 10 files changed, 284 insertions(+), 309 deletions(-) create mode 100644 src/managers/sentinel.py diff --git a/src/charm.py b/src/charm.py 
index a55e539..c920986 100755 --- a/src/charm.py +++ b/src/charm.py @@ -14,6 +14,7 @@ from literals import CONTAINER, Substrate from managers.cluster import ClusterManager from managers.config import ConfigManager +from managers.sentinel import SentinelManager from workload_k8s import ValkeyK8sWorkload from workload_vm import ValkeyVmWorkload @@ -42,6 +43,7 @@ def __init__(self, *args) -> None: # --- MANAGERS --- self.cluster_manager = ClusterManager(state=self.state, workload=self.workload) self.config_manager = ConfigManager(state=self.state, workload=self.workload) + self.sentinel_manager = SentinelManager(state=self.state, workload=self.workload) # --- STATUS HANDLER --- self.status = StatusHandler( diff --git a/src/common/client.py b/src/common/client.py index fef79e8..78e57f5 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -3,23 +3,11 @@ """ValkeyClient utility class to connect to valkey servers.""" -import asyncio import logging -from typing import Any +from typing import Literal -from glide import ( - GlideClient, - GlideClientConfiguration, - NodeAddress, - ServerCredentials, -) - -from common.exceptions import ( - ValkeyACLLoadError, - ValkeyConfigSetError, - ValkeyCustomCommandError, -) -from literals import CLIENT_PORT +from core.base_workload import WorkloadBase +from literals import CLIENT_PORT, SENTINEL_PORT logger = logging.getLogger(__name__) @@ -31,67 +19,46 @@ def __init__( self, username: str, password: str, - hosts: list[str], + workload: WorkloadBase, + connect_to: Literal["valkey", "sentinel"] = "valkey", ): - self.hosts = hosts - self.user = username + self.username = username self.password = password + self.workload = workload + self.connect_to = connect_to - async def create_client(self) -> GlideClient: - """Initialize the Valkey client.""" - addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in self.hosts] - credentials = ServerCredentials(username=self.user, password=self.password) - client_config = 
GlideClientConfiguration( - addresses, - credentials=credentials, - request_timeout=1000, # in milliseconds - ) - return await GlideClient.create(client_config) - - async def _run_custom_command(self, command: list[str]) -> Any: - """Run a custom command on the Valkey client. + def exec_cli_command( + self, + command: list[str], + hostname: str | None = None, + ) -> tuple[str, str | None]: + """Execute a Valkey CLI command on the server. Args: - command (list[str]): The command to run as a list of strings. + command (list[str]): The CLI command to execute, as a list of arguments. + hostname (str | None): The hostname to connect to. If None, defaults to the private IP of the unit. Returns: - Any result from the command. - """ - client = None - try: - client = await self.create_client() - result = await asyncio.wait_for(client.custom_command(command), timeout=5) - return result - # TODO refine exception handling - except Exception as e: - logger.error("Error running custom command: %s", e) - raise ValkeyCustomCommandError(f"Could not run custom command: {e}") - finally: - if client: - await client.close() + tuple[str, str | None]: The standard output and standard error from the command execution. - def reload_acl(self) -> None: - """Load ACL content to the Valkey server.""" - try: - result = asyncio.run(self._run_custom_command(["ACL", "LOAD"])) - logger.debug(f"ACL load result: {result}") - except ValkeyCustomCommandError as e: - logger.error(f"Error loading ACL: {e}") - raise ValkeyACLLoadError(f"Could not load ACL: {e}") - - def set_runtime_config(self, config_properties: dict[str, str]) -> None: - """Set configuration properties on the Valkey server. - - Args: - config_properties (dict[str, str]): Configuration properties to set. + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute. 
""" - try: - command = ["CONFIG", "SET"] - for key, value in config_properties.items(): - command.append(key) - command.append(value) - result = asyncio.run(self._run_custom_command(command)) - logger.debug("Config set result: %s", result) - except ValkeyCustomCommandError as e: - logger.error("Error setting config: %s", e) - raise ValkeyConfigSetError(f"Could not set config: {e}") + if not hostname: + hostname = self.workload.get_private_ip() + port = CLIENT_PORT if self.connect_to == "valkey" else SENTINEL_PORT + user = self.username + password = self.password + cli_command: list[str] = [ + self.workload.cli, + "-h", + hostname, + "-p", + str(port), + "--user", + user, + "--pass", + password, + ] + command + output, error = self.workload.exec(cli_command) + return output, error diff --git a/src/core/cluster_state.py b/src/core/cluster_state.py index f1993b5..b11b635 100644 --- a/src/core/cluster_state.py +++ b/src/core/cluster_state.py @@ -131,3 +131,8 @@ def get_secret_from_id(self, secret_id: str, refresh: bool = False) -> dict[str, raise return secret_content + + @property + def number_units_started(self) -> int: + """Return the number of units in the cluster that have their Valkey server started.""" + return len([unit for unit in self.servers if unit.model and unit.is_started]) diff --git a/src/core/models.py b/src/core/models.py index 273c87f..642a628 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -18,7 +18,7 @@ from pydantic import Field from typing_extensions import Annotated -from literals import CharmUsers +from literals import CharmUsers, StartState logger = logging.getLogger(__name__) @@ -43,9 +43,10 @@ class PeerUnitModel(PeerModel): """Model for the peer unit data.""" charmed_operator_password: InternalUsersSecret = Field(default="") - started: bool = Field(default=False) + start_state: str = Field(default=StartState.NOT_STARTED.value) hostname: str = Field(default="") private_ip: str = Field(default="") + request_start_lock: bool = 
Field(default=False) class RelationState: @@ -117,7 +118,7 @@ def unit_name(self) -> str: @property def is_started(self) -> bool: """Check if the unit has started.""" - return self.model.started if self.model else False + return self.model.start_state == StartState.STARTED.value if self.model else False @property def valkey_admin_password(self) -> str: diff --git a/src/events/base_events.py b/src/events/base_events.py index b3b4c35..a74d12b 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -17,6 +17,7 @@ INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, CharmUsers, + StartState, Substrate, ) from statuses import CharmStatuses, ClusterStatuses, ValkeyServiceStatuses @@ -47,9 +48,6 @@ def __init__(self, charm: "ValkeyCharm"): self.framework.observe(self.charm.on.install, self._on_install) self.framework.observe(self.charm.on.start, self._on_start) - self.framework.observe( - self.charm.on[PEER_RELATION].relation_joined, self._on_peer_relation_joined - ) self.framework.observe( self.charm.on[PEER_RELATION].relation_changed, self._on_peer_relation_changed ) @@ -77,55 +75,39 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return - if not self.charm.unit.is_leader(): - if ( - not self.charm.state.cluster.internal_users_credentials - or not self.charm.cluster_manager.number_units_started - ): - logger.info( - "Non-leader unit waiting for leader to set primary and internal user credentials" - ) - self.charm.status.set_running_status( - ClusterStatuses.WAITING_FOR_PRIMARY_START.value, - scope="unit", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - event.defer() - return + if self.charm.unit.is_leader(): + self._start_services(event, primary_ip=self.charm.workload.get_private_ip()) + logger.info("Services started") + self.charm.state.unit_server.update({"start_state": StartState.STARTED.value}) + return - self.charm.state.statuses.delete( - 
ClusterStatuses.WAITING_FOR_PRIMARY_START.value, - scope="unit", - component=self.charm.cluster_manager.name, - ) - if self.charm.state.cluster.model.starting_member != self.charm.unit.name: - logger.info("Non-leader unit waiting for leader to choose it as starting member") - self.charm.status.set_running_status( - CharmStatuses.WAITING_TO_START.value, - scope="unit", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - event.defer() - return - self.charm.state.statuses.delete( - CharmStatuses.WAITING_TO_START.value, - scope="unit", - component=self.charm.cluster_manager.name, + if ( + not self.charm.state.cluster.internal_users_credentials + or not self.charm.state.number_units_started + ): + logger.info( + "Non-leader unit waiting for leader to set primary and internal user credentials" ) + event.defer() + return - if not ( - primary_ip := ( - self.charm.workload.get_private_ip() - if self.charm.unit.is_leader() - else self.charm.cluster_manager.get_primary_ip() - ) - ): + self.charm.state.unit_server.update({"request_start_lock": True}) + + if self.charm.state.cluster.model.starting_member != self.charm.unit.name: + logger.info("Non-leader unit waiting for leader to choose it as starting member") + event.defer() + return + + if not (primary_ip := (self.charm.sentinel_manager.get_primary_ip())): logger.error("Primary IP not found. 
Deferring start event.") event.defer() return + self._start_services(event, primary_ip=primary_ip) + self.unit_fully_started.emit() + + def _start_services(self, event: ops.StartEvent, primary_ip: str) -> None: + """Start Valkey and Sentinel services.""" try: self.charm.config_manager.update_local_valkey_admin() self.charm.config_manager.set_config_properties(primary_ip=primary_ip) @@ -177,81 +159,42 @@ def _on_start(self, event: ops.StartEvent) -> None: scope="unit", component=self.charm.cluster_manager.name, ) - if self.charm.unit.is_leader(): - logger.info("Services started") - self.charm.state.unit_server.update({"started": True}) - return - - self.unit_fully_started.emit() # TODO check how to trigger if deferred without update status event def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: """Handle the unit-fully-started event.""" - self.charm.status.set_running_status( - ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, - scope="unit", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - self.charm.status.set_running_status( - ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value, - scope="unit", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - - if not self.charm.cluster_manager.is_sentinel_discovered(): + # Only ran on non-leader units when starting replicas + if not self.charm.sentinel_manager.is_sentinel_discovered(): logger.info("Sentinel service not yet discovered by other units. Deferring event.") + self.charm.state.unit_server.update( + {"start_state": StartState.STARTING_WAITING_SENTINEL.value} + ) event.defer() return - self.charm.state.statuses.delete( - ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, - scope="unit", - component=self.charm.cluster_manager.name, - ) - if not self.charm.cluster_manager.is_replica_synced(): logger.info("Replica not yet synced. 
Deferring event.") + self.charm.state.unit_server.update( + {"start_state": StartState.STARTING_WAITING_REPLICA_SYNC.value} + ) event.defer() return - self.charm.state.statuses.delete( - ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value, - scope="unit", - component=self.charm.cluster_manager.name, - ) - logger.info("Services started") - self.charm.state.unit_server.update({"started": True}) - - def _on_peer_relation_joined(self, event: ops.RelationJoinedEvent) -> None: - """Handle event received by all units when a new unit joins the cluster relation.""" - if not self.charm.unit.is_leader() or not event.unit: - return - - logger.debug("Peer relation joined by %s", event.unit.name) - - if not self.charm.state.unit_server.is_started: - logger.info("Primary member has not started yet. Deferring event.") - event.defer() - return - - if self.charm.state.cluster.model.starting_member: - logger.debug( - "%s is already starting. Deferring relation joined event for %s", - self.charm.state.cluster.model.starting_member, - event.unit.name, - ) - event.defer() - return - self.charm.state.cluster.update({"starting_member": event.unit.name}) + self.charm.state.unit_server.update( + {"start_state": StartState.STARTED.value, "request_start_lock": False} + ) def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: """Handle event received by all units when a unit's relation data changes.""" - logger.debug( - "Starting member is currently %s", self.charm.state.cluster.model.starting_member - ) + if not self.charm.unit.is_leader(): + return + + units_requesting_start = [ + unit.unit_name + for unit in self.charm.state.servers + if unit.model and unit.model.request_start_lock + ] starting_unit = next( ( unit @@ -261,19 +204,25 @@ def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: None, ) logger.debug( - "Starting unit has started: %s", + "Starting unit %s has started: %s", + self.charm.state.cluster.model.starting_member, 
starting_unit.is_started if starting_unit else "No starting unit", ) - if ( + if not units_requesting_start or ( + # if the starting member has not started yet, we want to wait for it to start instead of choosing another unit that requested start self.charm.state.cluster.model.starting_member and starting_unit - and starting_unit.is_started + and not starting_unit.is_started ): logger.debug( - "Starting member %s has started. Clearing starting member field.", + "Starting member %s has not started yet. Units requesting start: %s. ", self.charm.state.cluster.model.starting_member, + units_requesting_start, ) - self.charm.state.cluster.update({"starting_member": ""}) + + self.charm.state.cluster.update( + {"starting_member": units_requesting_start[0] if units_requesting_start else ""} + ) def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: """Handle the update-status event.""" diff --git a/src/literals.py b/src/literals.py index 60dccb3..bab1fc8 100644 --- a/src/literals.py +++ b/src/literals.py @@ -69,3 +69,12 @@ class Substrate(StrEnum): VM = "vm" K8S = "k8s" + + +class StartState(StrEnum): + """Start states for the service.""" + + NOT_STARTED = "not_started" + STARTING_WAITING_SENTINEL = "starting_waiting_sentinel" + STARTING_WAITING_REPLICA_SYNC = "starting_waiting_replica_sync" + STARTED = "started" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 3760cc4..0c81cd8 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -5,13 +5,13 @@ """Manager for all cluster related tasks.""" import logging -from typing import Literal import tenacity from data_platform_helpers.advanced_statuses.models import StatusObject from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope +from common.client import ValkeyClient from common.exceptions import ( ValkeyACLLoadError, ValkeyConfigSetError, @@ -19,8 +19,8 @@ ) from core.base_workload import 
WorkloadBase from core.cluster_state import ClusterState -from literals import CLIENT_PORT, PRIMARY_NAME, SENTINEL_PORT, CharmUsers -from statuses import CharmStatuses +from literals import CharmUsers, StartState +from statuses import CharmStatuses, ClusterStatuses logger = logging.getLogger(__name__) @@ -35,27 +35,33 @@ def __init__(self, state: ClusterState, workload: WorkloadBase): self.state = state self.workload = workload self.admin_user = CharmUsers.VALKEY_ADMIN.value - self.admin_password = self.state.unit_server.valkey_admin_password @property - def number_units_started(self) -> int: - """Return the number of units in the cluster that have their Valkey server started.""" - return len([unit for unit in self.state.servers if unit.model and unit.model.started]) + def admin_password(self) -> str: + """Get the password of the admin user for the Valkey cluster.""" + return self.state.unit_server.valkey_admin_password def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" try: - self._exec_cli_command(["acl", "load"]) + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + ) + client.exec_cli_command(["acl", "load"]) except ValkeyWorkloadCommandError: raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") def update_primary_auth(self) -> None: """Update the primaryauth runtime configuration on the Valkey server.""" - if self.get_primary_ip() == self.state.unit_server.model.private_ip: - logger.info("Current unit is primary; no need to update primaryauth") - return + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + ) try: - self._exec_cli_command( + client.exec_cli_command( [ "config", "set", @@ -69,38 +75,6 @@ def update_primary_auth(self) -> None: except ValkeyWorkloadCommandError: raise ValkeyConfigSetError("Could not set primaryauth on Valkey server.") - @tenacity.retry( - wait=tenacity.wait_fixed(5), 
- stop=tenacity.stop_after_attempt(5), - retry=tenacity.retry_if_result(lambda result: result is False), - reraise=True, - ) - def is_sentinel_discovered(self) -> bool: - """Check if the sentinel of the local unit was discovered by the other sentinels in the cluster.""" - # list of active sentinels: units with started flag true - active_sentinels = [ - unit.model.private_ip - for unit in self.state.servers - if unit.model - and unit.model.started - and unit.model.private_ip != self.state.unit_server.model.private_ip - ] - - for sentinel_ip in active_sentinels: - try: - output, _ = self._exec_cli_command( - command=["sentinel", "sentinels", PRIMARY_NAME], - hostname=sentinel_ip, - connect_to="sentinel", - ) - if self.state.unit_server.model.private_ip not in output: - logger.info(f"Sentinel at {sentinel_ip} has discovered this sentinel") - return False - except ValkeyWorkloadCommandError: - logger.warning(f"Could not query sentinel at {sentinel_ip} for primary discovery.") - continue - return True - @tenacity.retry( wait=tenacity.wait_fixed(5), stop=tenacity.stop_after_attempt(5), @@ -109,12 +83,14 @@ def is_sentinel_discovered(self) -> bool: ) def is_replica_synced(self) -> bool: """Check if the replica is synced with the primary.""" - if self.get_primary_ip() == self.state.unit_server.model.private_ip: - logger.info("Current unit is primary; no need to check replica sync") - return True + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + ) try: output = ( - self._exec_cli_command( + client.exec_cli_command( command=["role"], )[0] .strip() @@ -129,81 +105,6 @@ def is_replica_synced(self) -> bool: logger.warning("Could not determine replica sync status from Valkey server.") return False - def get_primary_ip(self) -> str | None: - """Get the IP address of the primary node in the cluster.""" - started_servers = [ - unit for unit in self.state.servers if unit.model and unit.model.started - ] - - for unit in 
started_servers: - try: - output = self._exec_cli_command( - ["sentinel", "get-master-addr-by-name", PRIMARY_NAME], - connect_to="sentinel", - hostname=unit.model.private_ip, - )[0] - primary_ip = output.strip().split()[0] - logger.info(f"Primary IP address is {primary_ip}") - return primary_ip - except (IndexError, ValkeyWorkloadCommandError): - logger.error("Could not get primary IP from sentinel output.") - - logger.error( - "Could not determine primary IP from sentinels. Number of started servers: %d.", - len(started_servers), - ) - - def _exec_cli_command( - self, - command: list[str], - hostname: str | None = None, - connect_to: Literal["valkey", "sentinel"] = "valkey", - ) -> tuple[str, str | None]: - """Execute a Valkey CLI command on the server. - - Args: - command (list[str]): The CLI command to execute, as a list of arguments. - hostname (str | None): The hostname to connect to. Defaults to private ip of unit. - connect_to (Literal["valkey", "sentinel"]): Whether to connect to the valkey server or sentinel for executing the command. Defaults to "valkey". - - Returns: - tuple[str, str | None]: The standard output and standard error from the command execution. - - Raises: - ValkeyWorkloadCommandError: If the CLI command fails to execute. 
- """ - if not hostname: - hostname = self.workload.get_private_ip() - port = CLIENT_PORT if connect_to == "valkey" else SENTINEL_PORT - user = ( - CharmUsers.VALKEY_ADMIN.value - if connect_to == "valkey" - else CharmUsers.SENTINEL_CHARM_ADMIN.value - ) - password = ( - self.state.unit_server.valkey_admin_password - if connect_to == "valkey" - else self.state.cluster.internal_users_credentials.get( - CharmUsers.SENTINEL_CHARM_ADMIN.value, "" - ) - ) - cli_command = [ - self.workload.cli, - "-h", - hostname, - "-p", - str(port), - "--user", - user, - "--pass", - password, - ] + command - output, error = self.workload.exec(cli_command) - logger.debug("Executed command: %s, got output: %s", " ".join(command[0]), output) - if error: - logger.error("Error output from command '%s': %s", " ".join(command[0]), error) - return output, error - def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( @@ -213,4 +114,34 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje if not self.workload.can_connect: status_list.append(CharmStatuses.SERVICE_NOT_STARTED.value) + # Peer relation not established yet, or model not built yet for unit or app + if not self.state.cluster.model or not self.state.unit_server.model: + return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] + + if self.state.charm.unit.is_leader(): + return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] + + # non leader statuses + if ( + not self.state.cluster.internal_users_credentials + or not self.state.number_units_started + ): + status_list.append( + ClusterStatuses.WAITING_FOR_PRIMARY_START.value, + ) + + match self.state.unit_server.model.start_state: + case StartState.NOT_STARTED.value: + status_list.append( + CharmStatuses.WAITING_TO_START.value, + ) + case StartState.STARTING_WAITING_SENTINEL.value: + 
status_list.append( + ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, + ) + case StartState.STARTING_WAITING_REPLICA_SYNC.value: + status_list.append( + ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value, + ) + return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py new file mode 100644 index 0000000..60ae6d9 --- /dev/null +++ b/src/managers/sentinel.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Manager for all sentinel related tasks.""" + +import logging + +import tenacity +from data_platform_helpers.advanced_statuses.models import StatusObject +from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol +from data_platform_helpers.advanced_statuses.types import Scope + +from common.client import ValkeyClient +from common.exceptions import ( + ValkeyWorkloadCommandError, +) +from core.base_workload import WorkloadBase +from core.cluster_state import ClusterState +from literals import PRIMARY_NAME, CharmUsers +from statuses import CharmStatuses + +logger = logging.getLogger(__name__) + + +class SentinelManager(ManagerStatusProtocol): + """Manage sentinel members.""" + + name: str = "sentinel" + state: ClusterState + + def __init__(self, state: ClusterState, workload: WorkloadBase): + self.state = state + self.workload = workload + self.admin_user = CharmUsers.SENTINEL_CHARM_ADMIN.value + + @property + def admin_password(self) -> str: + """Get the password of the admin user for the sentinel service.""" + return self.state.cluster.internal_users_credentials.get( + CharmUsers.SENTINEL_CHARM_ADMIN.value, "" + ) + + @tenacity.retry( + wait=tenacity.wait_fixed(5), + stop=tenacity.stop_after_attempt(5), + retry=tenacity.retry_if_result(lambda result: result is False), + reraise=True, + ) + def is_sentinel_discovered(self) -> bool: + """Check if the sentinel of the local unit was 
discovered by the other sentinels in the cluster.""" + # list of active sentinels: units with started flag true + active_sentinels = [ + unit.model.private_ip + for unit in self.state.servers + if unit.model + and unit.is_started + and unit.model.private_ip != self.state.unit_server.model.private_ip + ] + + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + connect_to="sentinel", + ) + + for sentinel_ip in active_sentinels: + try: + output, _ = client.exec_cli_command( + command=["sentinel", "sentinels", PRIMARY_NAME], + hostname=sentinel_ip, + ) + if self.state.unit_server.model.private_ip not in output: + logger.info(f"Sentinel at {sentinel_ip} has not discovered this sentinel") + return False + except ValkeyWorkloadCommandError: + logger.warning(f"Could not query sentinel at {sentinel_ip} for primary discovery.") + continue + return True + + def get_primary_ip(self) -> str | None: + """Get the IP address of the primary node in the cluster.""" + started_servers = [unit for unit in self.state.servers if unit.model and unit.is_started] + + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + connect_to="sentinel", + ) + + for unit in started_servers: + try: + output = client.exec_cli_command( + command=["sentinel", "get-master-addr-by-name", PRIMARY_NAME], + hostname=unit.model.private_ip, + )[0] + primary_ip = output.strip().split()[0] + logger.info(f"Primary IP address is {primary_ip}") + return primary_ip + except (IndexError, ValkeyWorkloadCommandError): + logger.error("Could not get primary IP from sentinel output.") + + logger.error( + "Could not determine primary IP from sentinels. 
Number of started servers: %d.", + len(started_servers), + ) + + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: + """Compute the sentinel manager's statuses.""" + status_list: list[StatusObject] = self.state.statuses.get( + scope=scope, component=self.name, running_status_only=True, running_status_type="async" + ).root + + return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/statuses.py b/src/statuses.py index f1dc39b..23cdd81 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -30,7 +30,6 @@ class CharmStatuses(Enum): WAITING_TO_START = StatusObject( status="maintenance", message="Waiting for leader to authorize service start", - running="async", ) CONFIGURATION_ERROR = StatusObject( status="blocked", @@ -51,19 +50,16 @@ class ClusterStatuses(Enum): WAITING_FOR_SENTINEL_DISCOVERY = StatusObject( status="maintenance", message="Waiting for sentinel to be discovered by other units...", - running="async", ) WAITING_FOR_REPLICA_SYNC = StatusObject( status="maintenance", message="Waiting for replica to sync with primary...", - running="async", ) WAITING_FOR_PRIMARY_START = StatusObject( status="maintenance", message="Waiting for primary to start and become active...", - running="async", ) diff --git a/src/workload_vm.py b/src/workload_vm.py index fdfd8ed..949383f 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -104,7 +104,6 @@ def exec(self, command: List[str]) -> tuple[str, str | None]: capture_output=True, timeout=10, ) - logger.debug("Executed command: %s, got output: %s", " ".join(command), output.stdout) return output.stdout, output.stderr except subprocess.CalledProcessError as e: logger.error("Command failed with %s, %s", e.returncode, e.stderr) From ab3e4c59ed4b33277e058d30a959d9fde770d8b6 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 13 Feb 2026 07:46:57 +0000 Subject: [PATCH 075/159] fix unit tests and fine tune statuses --- src/events/base_events.py | 8 ++------ 
src/managers/cluster.py | 22 +++++++++++----------- tests/unit/test_charm.py | 36 ++++++++++++++++++------------------ 3 files changed, 31 insertions(+), 35 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index a74d12b..d9e5d64 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -74,6 +74,7 @@ def _on_start(self, event: ops.StartEvent) -> None: logger.warning("Workload not ready yet") event.defer() return + self.charm.state.unit_server.update({"start_state": StartState.NOT_STARTED.value}) if self.charm.unit.is_leader(): self._start_services(event, primary_ip=self.charm.workload.get_private_ip()) @@ -203,12 +204,7 @@ def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: ), None, ) - logger.debug( - "Starting unit %s has started: %s", - self.charm.state.cluster.model.starting_member, - starting_unit.is_started if starting_unit else "No starting unit", - ) - if not units_requesting_start or ( + if ( # if the starting member has not started yet, we want to wait for it to start instead of choosing another unit that requested start self.charm.state.cluster.model.starting_member and starting_unit diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 0c81cd8..b0b6ce0 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -122,19 +122,19 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] # non leader statuses - if ( - not self.state.cluster.internal_users_credentials - or not self.state.number_units_started - ): - status_list.append( - ClusterStatuses.WAITING_FOR_PRIMARY_START.value, - ) - match self.state.unit_server.model.start_state: case StartState.NOT_STARTED.value: - status_list.append( - CharmStatuses.WAITING_TO_START.value, - ) + if ( + not self.state.cluster.internal_users_credentials + or not self.state.number_units_started + ): + status_list.append( + 
ClusterStatuses.WAITING_FOR_PRIMARY_START.value, + ) + else: + status_list.append( + CharmStatuses.WAITING_TO_START.value, + ) case StartState.STARTING_WAITING_SENTINEL.value: status_list.append( ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 5605cd1..6dd02a1 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -130,7 +130,7 @@ def test_start_non_leader_unit(cloud_spec): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("managers.cluster.ClusterManager.get_primary_ip", return_value="127.1.0.1"), + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.1.0.1"), ): state_out = ctx.run(ctx.on.start(), state_in) assert not state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) @@ -155,7 +155,7 @@ def test_start_non_leader_unit(cloud_spec): id=1, endpoint=PEER_RELATION, local_app_data={"primary-ip": "127.1.0.1"}, - peers_data={1: {"started": "true"}}, + peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -174,7 +174,7 @@ def test_start_non_leader_unit(cloud_spec): id=1, endpoint=PEER_RELATION, local_app_data={"starting-member": "valkey/0"}, - peers_data={1: {"started": "true"}}, + peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -187,12 +187,12 @@ def test_start_non_leader_unit(cloud_spec): assert status_is(state_out, ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value) # sentinel not yet discovered - with patch("managers.cluster.ClusterManager.is_sentinel_discovered", return_value=False): + with patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=False): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, local_app_data={"starting-member": "valkey/0"}, - peers_data={1: {"started": 
"true"}}, + peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -206,14 +206,14 @@ def test_start_non_leader_unit(cloud_spec): # Happy path with sentinel discovered and replica synced with ( - patch("managers.cluster.ClusterManager.is_sentinel_discovered", return_value=True), + patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=True), patch("managers.cluster.ClusterManager.is_replica_synced", return_value=True), ): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, local_app_data={"starting-member": "valkey/0"}, - peers_data={1: {"started": "true"}}, + peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), @@ -230,7 +230,7 @@ def test_start_non_leader_unit(cloud_spec): SERVICE_METRIC_EXPORTER ) assert state_out.get_container(container.name).service_statuses[SERVICE_SENTINEL] - assert state_out.get_relation(1).local_unit_data["started"] == "true" + assert state_out.get_relation(1).local_unit_data["start-state"] == "started" def test_update_status_leader_unit(cloud_spec): @@ -238,7 +238,7 @@ def test_update_status_leader_unit(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_unit_data={"started": "True"}, + local_unit_data={"start-state": "started"}, ) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) @@ -258,7 +258,7 @@ def test_update_status_leader_unit(cloud_spec): def test_update_status_non_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation( - id=1, endpoint=PEER_RELATION, local_unit_data={"started": "true"} + id=1, endpoint=PEER_RELATION, local_unit_data={"start-state": "started"} ) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) @@ -428,7 +428,7 @@ def 
test_config_changed_leader_unit(cloud_spec): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, + patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() @@ -462,13 +462,13 @@ def test_config_changed_leader_unit_primary(cloud_spec): with ( patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, + patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, patch("core.base_workload.WorkloadBase.get_private_ip", return_value="127.0.1.1"), - patch("managers.cluster.ClusterManager.get_primary_ip", return_value="127.0.1.1"), + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() - mock_exec_command.assert_called_once_with(["acl", "load"]) + assert mock_exec_command.call_count == 2 # one for acl load, one for primaryauth set secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) @@ -515,7 +515,7 @@ def test_change_password_secret_changed_non_leader_unit(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_unit_data={"started": "true", "private-ip": "127.0.1.0"}, + local_unit_data={"start-state": "started", "private-ip": "127.0.1.0"}, ) container = testing.Container(name=CONTAINER, can_connect=True) @@ -538,8 +538,8 @@ def test_change_password_secret_changed_non_leader_unit(cloud_spec): "events.base_events.BaseEvents._update_internal_users_password" ) as mock_update_password, 
patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("managers.cluster.ClusterManager._exec_cli_command") as mock_exec_command, - patch("managers.cluster.ClusterManager.get_primary_ip", return_value="127.0.1.1"), + patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_not_called() @@ -573,7 +573,7 @@ def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spe ) as mock_update_password, patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch( - "managers.cluster.ClusterManager._exec_cli_command", + "common.client.ValkeyClient.exec_cli_command", side_effect=ValkeyWorkloadCommandError("Failed to execute command"), ) as mock_exec_command, ctx(ctx.on.secret_changed(password_secret), state_in) as manager, From 99562f00ac4343baca22a9a1c387924832fb8043 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 13 Feb 2026 08:06:25 +0000 Subject: [PATCH 076/159] fixes for rene feedback --- src/managers/config.py | 29 ++++---- tests/integration/k8s/ha/test_scaling.py | 12 ++-- tests/unit/conftest.py | 5 ++ tests/unit/test_charm.py | 92 ++++++++++-------------- 4 files changed, 63 insertions(+), 75 deletions(-) diff --git a/src/managers/config.py b/src/managers/config.py index d72d83f..9f8a722 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -80,22 +80,25 @@ def get_config_properties(self, primary_ip: str) -> dict[str, str]: else: config_properties["bind"] = "0.0.0.0 -::1" - logger.debug( - "primary: %s, hostname: %s", - primary_ip, - self.state.unit_server.model.hostname, - ) - # replicaof + # replica related config + replica_config = self.generate_replica_config(primary_ip=primary_ip) + config_properties.update(replica_config) + + return config_properties + + def generate_replica_config(self, 
primary_ip): + """Generate the config properties related to replica configuration based on the current cluster state.""" + replica_config = { + "primaryuser": CharmUsers.VALKEY_REPLICA.value, + "primaryauth": self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_REPLICA.value, "" + ), + } if primary_ip != self.state.unit_server.model.private_ip: # set replicaof logger.debug("Setting replicaof to primary %s", primary_ip) - config_properties["replicaof"] = f"{primary_ip} {CLIENT_PORT}" - config_properties["primaryuser"] = CharmUsers.VALKEY_REPLICA.value - config_properties["primaryauth"] = self.state.cluster.internal_users_credentials.get( - CharmUsers.VALKEY_REPLICA.value, "" - ) - - return config_properties + replica_config["replicaof"] = f"{primary_ip} {CLIENT_PORT}" + return replica_config def set_config_properties(self, primary_ip: str) -> None: """Write the config properties to the config file.""" diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 9585c40..e55530f 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -50,17 +50,15 @@ def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: init_units_count = len(juju.status().apps[APP_NAME].units) # scale up - juju.add_unit(APP_NAME, num_units=2) + juju.add_unit(APP_NAME, num_units=NUM_UNITS - init_units_count) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, idle_period=10, unit_count=init_units_count + 2 + status, APP_NAME, idle_period=10, unit_count=NUM_UNITS ), timeout=1200, ) num_units = len(juju.status().apps[APP_NAME].units) - assert num_units == init_units_count + 2, ( - f"Expected {init_units_count + 2} units, got {num_units}." - ) + assert num_units == NUM_UNITS, f"Expected {NUM_UNITS} units, got {num_units}." 
# check if all units have been added to the cluster endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) @@ -77,8 +75,8 @@ def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: master = sentinel_client.master_for("primary") info = master.info("replication") connected_slaves = info.get("connected_slaves", 0) - assert connected_slaves == num_units - 1, ( - f"Expected {num_units - 1} connected slaves, got {connected_slaves}." + assert connected_slaves == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." ) assert_continuous_writes_increasing( diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 92b049c..cedaf24 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -11,6 +11,11 @@ def mock_write_config_file(mocker): mocker.patch("workload_k8s.ValkeyK8sWorkload.write_config_file") +@pytest.fixture(autouse=True) +def mock_write_file(mocker): + mocker.patch("workload_k8s.ValkeyK8sWorkload.write_file") + + @pytest.fixture(autouse=True) def cloud_spec(): return testing.CloudSpec( diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 6dd02a1..8ffc350 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -81,38 +81,35 @@ def test_start_leader_unit(cloud_spec): } } - with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), - ): - # generate passwords - state_out = ctx.run(ctx.on.leader_elected(), state_in) - - # start event - state_out = ctx.run(ctx.on.start(), state_out) - assert state_out.get_container(container.name).plan == expected_plan - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] - == pebble.ServiceStatus.ACTIVE - ) - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] - == pebble.ServiceStatus.ACTIVE - ) - assert state_out.unit_status == ActiveStatus() - assert state_out.app_status == ActiveStatus() + # generate passwords + state_out = 
ctx.run(ctx.on.leader_elected(), state_in) + + # start event + state_out = ctx.run(ctx.on.start(), state_out) + assert state_out.get_container(container.name).plan == expected_plan + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] + == pebble.ServiceStatus.ACTIVE + ) + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] + == pebble.ServiceStatus.ACTIVE + ) + assert state_out.unit_status == ActiveStatus() + assert state_out.app_status == ActiveStatus() - # container not ready - container = testing.Container(name=CONTAINER, can_connect=False) - state_in = testing.State( - model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), - leader=True, - relations={relation, status_peer_relation}, - containers={container}, - ) + # container not ready + container = testing.Container(name=CONTAINER, can_connect=False) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=True, + relations={relation, status_peer_relation}, + containers={container}, + ) - state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value, is_app=True) + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) + assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value, is_app=True) def test_start_non_leader_unit(cloud_spec): @@ -128,10 +125,7 @@ def test_start_non_leader_unit(cloud_spec): containers={container}, ) - with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.1.0.1"), - ): + with patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.1.0.1"): state_out = ctx.run(ctx.on.start(), state_in) assert not 
state_out.get_container(container.name).service_statuses.get(SERVICE_VALKEY) assert not state_out.get_container(container.name).service_statuses.get( @@ -284,12 +278,11 @@ def test_internal_user_creation(cloud_spec): leader=True, containers={container}, ) - with patch("workload_k8s.ValkeyK8sWorkload.write_file"): - state_out = ctx.run(ctx.on.leader_elected(), state_in) - secret_out = state_out.get_secret( - label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" - ) - assert secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") + state_out = ctx.run(ctx.on.leader_elected(), state_in) + secret_out = state_out.get_secret( + label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" + ) + assert secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") def test_leader_elected_no_peer_relation(cloud_spec): @@ -301,9 +294,8 @@ def test_leader_elected_no_peer_relation(cloud_spec): containers={container}, model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) - with patch("workload_k8s.ValkeyK8sWorkload.write_file"): - state_out = ctx.run(ctx.on.leader_elected(), state_in) - assert "leader_elected" in [e.name for e in state_out.deferred] + state_out = ctx.run(ctx.on.leader_elected(), state_in) + assert "leader_elected" in [e.name for e in state_out.deferred] def test_leader_elected_leader_password_specified(cloud_spec): @@ -323,7 +315,6 @@ def test_leader_elected_leader_password_specified(cloud_spec): model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch( "managers.config.ConfigManager.generate_password", return_value="generated-password" ), @@ -352,10 +343,7 @@ def test_leader_elected_leader_password_specified_wrong_secret(cloud_spec): config={INTERNAL_USERS_PASSWORD_CONFIG: "secret:1tf1wk0tmfrodp8ofwxn"}, model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) 
- with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), - pytest.raises(testing.errors.UncaughtCharmError) as exc_info, - ): + with pytest.raises(testing.errors.UncaughtCharmError) as exc_info: ctx.run(ctx.on.leader_elected(), state_in) assert "SecretNotFoundError" in str(exc_info.value) @@ -400,10 +388,7 @@ def test_config_changed_leader_unit_valkey_update_fails(cloud_spec): config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) - with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), - patch("core.models.RelationState.update") as mock_update, - ): + with patch("core.models.RelationState.update") as mock_update: ctx.run(ctx.on.config_changed(), state_in) mock_update.assert_called_once() @@ -426,7 +411,6 @@ def test_config_changed_leader_unit(cloud_spec): model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, ): @@ -460,7 +444,6 @@ def test_config_changed_leader_unit_primary(cloud_spec): model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, patch("core.base_workload.WorkloadBase.get_private_ip", return_value="127.0.1.1"), @@ -496,7 +479,6 @@ def test_config_changed_leader_unit_wrong_username(cloud_spec): model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) with ( - patch("workload_k8s.ValkeyK8sWorkload.write_file"), patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, ctx(ctx.on.config_changed(), state_in) as manager, ): From 4258bd5e4a8ceb27ae403c3bd4d8f31419c1dc39 
Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 13 Feb 2026 10:20:04 +0000 Subject: [PATCH 077/159] remove get_private_ip and replace it with bind_address --- src/common/client.py | 6 ++---- src/core/base_workload.py | 21 --------------------- src/core/cluster_state.py | 1 - src/events/base_events.py | 6 +++--- src/managers/cluster.py | 4 +++- tests/unit/conftest.py | 11 +++++++++++ tests/unit/test_charm.py | 3 +-- 7 files changed, 20 insertions(+), 32 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 78e57f5..17f563e 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -30,13 +30,13 @@ def __init__( def exec_cli_command( self, command: list[str], - hostname: str | None = None, + hostname: str, ) -> tuple[str, str | None]: """Execute a Valkey CLI command on the server. Args: command (list[str]): The CLI command to execute, as a list of arguments. - hostname (str | None): The hostname to connect to. If None, defaults to the private IP of the unit. + hostname (str): The hostname to connect to. Returns: tuple[str, str | None]: The standard output and standard error from the command execution. @@ -44,8 +44,6 @@ def exec_cli_command( Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute. 
""" - if not hostname: - hostname = self.workload.get_private_ip() port = CLIENT_PORT if self.connect_to == "valkey" else SENTINEL_PORT user = self.username password = self.password diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 9649bfc..8ded732 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -5,8 +5,6 @@ """Base objects for workload operations across different substrates.""" import logging -import socket -import subprocess from abc import ABC, abstractmethod from charmlibs import pathops @@ -51,25 +49,6 @@ def alive(self) -> bool: """Check if the Valkey service is running.""" pass - def get_private_ip(self) -> str: - """Get the Private IP address of the current unit.""" - cmd = "unit-get private-address" - try: - output = subprocess.run( - cmd, - check=True, - text=True, - shell=True, - capture_output=True, - timeout=10, - ) - if output.returncode == 0: - return output.stdout.strip() - except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e: - logger.error(f"Error executing command '{cmd}': {e}") - - return socket.gethostbyname(socket.gethostname()) - def write_file( self, content: str, diff --git a/src/core/cluster_state.py b/src/core/cluster_state.py index b11b635..9739f85 100644 --- a/src/core/cluster_state.py +++ b/src/core/cluster_state.py @@ -116,7 +116,6 @@ def get_secret_from_id(self, secret_id: str, refresh: bool = False) -> dict[str, """Resolve the given id of a Juju secret and return the content as a dict. Args: - model (Model): Model object. secret_id (str): The id of the secret. refresh (bool): Whether to refresh the secret content from the controller. Defaults to False. 
diff --git a/src/events/base_events.py b/src/events/base_events.py index d9e5d64..1f0258f 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -77,7 +77,7 @@ def _on_start(self, event: ops.StartEvent) -> None: self.charm.state.unit_server.update({"start_state": StartState.NOT_STARTED.value}) if self.charm.unit.is_leader(): - self._start_services(event, primary_ip=self.charm.workload.get_private_ip()) + self._start_services(event, primary_ip=self.charm.state.bind_address) logger.info("Services started") self.charm.state.unit_server.update({"start_state": StartState.STARTED.value}) return @@ -234,7 +234,7 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: self.charm.state.unit_server.update( { "hostname": socket.gethostname(), - "private_ip": self.charm.workload.get_private_ip(), + "private_ip": self.charm.state.bind_address, } ) @@ -281,7 +281,7 @@ def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: self.charm.state.unit_server.update( { "hostname": socket.gethostname(), - "private_ip": self.charm.workload.get_private_ip(), + "private_ip": self.charm.state.bind_address, } ) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index b0b6ce0..f663452 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -49,7 +49,7 @@ def reload_acl_file(self) -> None: password=self.admin_password, workload=self.workload, ) - client.exec_cli_command(["acl", "load"]) + client.exec_cli_command(["acl", "load"], hostname=self.state.bind_address) except ValkeyWorkloadCommandError: raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") @@ -70,6 +70,7 @@ def update_primary_auth(self) -> None: CharmUsers.VALKEY_REPLICA.value, "" ), ], + hostname=self.state.bind_address, ) logger.info("Updated primaryauth runtime configuration on Valkey server") except ValkeyWorkloadCommandError: @@ -92,6 +93,7 @@ def is_replica_synced(self) -> bool: output = ( client.exec_cli_command( command=["role"], + 
hostname=self.state.bind_address, )[0] .strip() .split() diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index cedaf24..ea04b33 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -2,6 +2,8 @@ # Copyright 2026 Canonical Ltd. # See LICENSE file for licensing details. +from unittest.mock import PropertyMock + import pytest from ops import testing @@ -16,6 +18,15 @@ def mock_write_file(mocker): mocker.patch("workload_k8s.ValkeyK8sWorkload.write_file") +@pytest.fixture(autouse=True) +def mock_bind_address(mocker): + mocker.patch( + "core.cluster_state.ClusterState.bind_address", + new_callable=PropertyMock, + return_value="127.1.1.1", + ) + + @pytest.fixture(autouse=True) def cloud_spec(): return testing.CloudSpec( diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 8ffc350..dbf77c2 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -446,7 +446,6 @@ def test_config_changed_leader_unit_primary(cloud_spec): with ( patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, - patch("core.base_workload.WorkloadBase.get_private_ip", return_value="127.0.1.1"), patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), ): state_out = ctx.run(ctx.on.config_changed(), state_in) @@ -564,7 +563,7 @@ def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spe state_out = manager.run() mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_exec_command.assert_called_once_with(["acl", "load"]) + mock_exec_command.assert_called_once_with(["acl", "load"], hostname="127.1.1.1") cluster_statuses = charm.state.statuses.get( scope="unit", component=charm.cluster_manager.name, From d00c206cef4be5ed904f8a07f68d1df0e15de321 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 13 Feb 2026 10:27:24 +0000 Subject: [PATCH 078/159] add unit tests for peer 
relation changed --- tests/unit/test_charm.py | 68 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index dbf77c2..d8eed58 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -17,6 +17,7 @@ PEER_RELATION, STATUS_PEERS_RELATION, CharmUsers, + StartState, ) from src.statuses import CharmStatuses, ClusterStatuses @@ -596,3 +597,70 @@ def test_change_password_secret_changed_leader_unit(cloud_spec): ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_called_once_with(password_secret.id) + + +def test_relation_changed_event_leader_setting_starting_member(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_unit_data={"start-state": "started"}, + peers_data={1: {"request-start-lock": "true"}}, + ) + container = testing.Container(name=CONTAINER, can_connect=True) + + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + ) + state_out = ctx.run(ctx.on.relation_changed(relation), state_in) + assert state_out.get_relation(1).local_app_data.get("starting-member") == "valkey/1" + + +def test_relation_changed_event_leader_clears_starting_member(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"starting-member": "valkey/1"}, + local_unit_data={"start-state": "started"}, + peers_data={1: {"start-state": "started"}}, + ) + container = testing.Container(name=CONTAINER, can_connect=True) + + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + ) + state_out = ctx.run(ctx.on.relation_changed(relation), state_in) + 
assert state_out.get_relation(1).local_app_data.get("starting-member") is None + + +def test_relation_changed_event_leader_leaves_starting_member_as_is(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"starting-member": "valkey/1"}, + local_unit_data={"start-state": StartState.STARTED.value}, + peers_data={ + 1: { + "start-state": StartState.STARTING_WAITING_REPLICA_SYNC.value, + "request-start-lock": "true", + } + }, + ) + container = testing.Container(name=CONTAINER, can_connect=True) + + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + ) + state_out = ctx.run(ctx.on.relation_changed(relation), state_in) + assert state_out.get_relation(1).local_app_data.get("starting-member") == "valkey/1" From ee7f33105115f7cf70014573bc1fc8437a604aef Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 16 Feb 2026 07:24:45 +0000 Subject: [PATCH 079/159] fix some feedback from mehdi --- src/charm.py | 1 + src/core/base_workload.py | 4 +- src/events/base_events.py | 51 ++++++++++------------ src/managers/cluster.py | 16 +++---- src/managers/config-template/sentinel.conf | 2 +- src/managers/config.py | 10 ++--- src/managers/sentinel.py | 19 ++++---- src/statuses.py | 8 ++-- src/workload_k8s.py | 14 +++--- src/workload_vm.py | 12 ++--- 10 files changed, 68 insertions(+), 69 deletions(-) diff --git a/src/charm.py b/src/charm.py index c920986..6915819 100755 --- a/src/charm.py +++ b/src/charm.py @@ -50,6 +50,7 @@ def __init__(self, *args) -> None: self, self.cluster_manager, self.config_manager, + self.sentinel_manager, ) # --- EVENT HANDLERS --- diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 8ded732..c6ce4d8 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -19,9 +19,9 @@ class WorkloadBase(ABC): def 
__init__(self) -> None: """Initialize the WorkloadBase.""" - self.root: pathops.PathProtocol + self.root_dir: pathops.PathProtocol self.config_file: pathops.PathProtocol - self.sentinel_config: pathops.PathProtocol + self.sentinel_config_file: pathops.PathProtocol self.acl_file: pathops.PathProtocol self.sentinel_acl_file: pathops.PathProtocol self.working_dir: pathops.PathProtocol diff --git a/src/events/base_events.py b/src/events/base_events.py index 1f0258f..3c98c03 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -33,7 +33,7 @@ class UnitFullyStarted(ops.EventBase): This event will be deferred until: The Sentinel service is running and was discovered by other units. - The Valkey service is running and the replica has finished syncing data. + The Valkey service is running and the current node is in sync with the primary (if a replica). """ @@ -69,23 +69,21 @@ def _on_install(self, event: ops.InstallEvent) -> None: raise RuntimeError("Failed to install the Valkey snap") def _on_start(self, event: ops.StartEvent) -> None: - """Handle the `pebble-ready` event.""" + """Handle the on start event.""" if not self.charm.workload.can_connect: logger.warning("Workload not ready yet") event.defer() return self.charm.state.unit_server.update({"start_state": StartState.NOT_STARTED.value}) - if self.charm.unit.is_leader(): + primary_ip = self.charm.sentinel_manager.get_primary_ip() + if self.charm.unit.is_leader() and not primary_ip: self._start_services(event, primary_ip=self.charm.state.bind_address) logger.info("Services started") self.charm.state.unit_server.update({"start_state": StartState.STARTED.value}) return - if ( - not self.charm.state.cluster.internal_users_credentials - or not self.charm.state.number_units_started - ): + if not self.charm.state.cluster.internal_users_credentials or not primary_ip: logger.info( "Non-leader unit waiting for leader to set primary and internal user credentials" ) @@ -94,23 +92,20 @@ def _on_start(self, 
event: ops.StartEvent) -> None: self.charm.state.unit_server.update({"request_start_lock": True}) + # TODO unit.name would not work across models we need to switch to using `model.unit.name + model_uuid` if self.charm.state.cluster.model.starting_member != self.charm.unit.name: logger.info("Non-leader unit waiting for leader to choose it as starting member") event.defer() return - if not (primary_ip := (self.charm.sentinel_manager.get_primary_ip())): - logger.error("Primary IP not found. Deferring start event.") - event.defer() + if not self._start_services(event, primary_ip=primary_ip): return - - self._start_services(event, primary_ip=primary_ip) self.unit_fully_started.emit() - def _start_services(self, event: ops.StartEvent, primary_ip: str) -> None: + def _start_services(self, event: ops.StartEvent, primary_ip: str) -> bool: """Start Valkey and Sentinel services.""" try: - self.charm.config_manager.update_local_valkey_admin() + self.charm.config_manager.update_local_valkey_admin_password() self.charm.config_manager.set_config_properties(primary_ip=primary_ip) self.charm.config_manager.set_acl_file() self.charm.config_manager.set_sentinel_config_properties(primary_ip=primary_ip) @@ -124,7 +119,7 @@ def _start_services(self, event: ops.StartEvent, primary_ip: str) -> None: statuses_state=self.charm.state.statuses, ) event.defer() - return + return False self.charm.state.statuses.delete( CharmStatuses.CONFIGURATION_ERROR.value, scope="unit", @@ -138,15 +133,7 @@ def _start_services(self, event: ops.StartEvent, primary_ip: str) -> None: ) self.charm.workload.start() - if self.charm.workload.alive(): - logger.info("Workload started successfully. 
Opening client port") - self.charm.unit.open_port("tcp", CLIENT_PORT) - self.charm.state.statuses.delete( - ValkeyServiceStatuses.SERVICE_STARTING.value, - scope="unit", - component=self.charm.cluster_manager.name, - ) - else: + if not self.charm.workload.alive(): logger.error("Workload failed to start.") self.charm.status.set_running_status( ValkeyServiceStatuses.SERVICE_NOT_RUNNING.value, @@ -154,12 +141,22 @@ def _start_services(self, event: ops.StartEvent, primary_ip: str) -> None: component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) + return False + + logger.info("Workload started successfully. Opening client port") + self.charm.unit.open_port("tcp", CLIENT_PORT) + self.charm.state.statuses.delete( + ValkeyServiceStatuses.SERVICE_STARTING.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) self.charm.state.statuses.delete( ValkeyServiceStatuses.SERVICE_NOT_RUNNING.value, scope="unit", component=self.charm.cluster_manager.name, ) + return True # TODO check how to trigger if deferred without update status event def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: @@ -269,7 +266,7 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: } ) # update local unit admin password - self.charm.config_manager.update_local_valkey_admin() + self.charm.config_manager.update_local_valkey_admin_password() try: self.charm.config_manager.set_acl_file() except ValkeyWorkloadCommandError: @@ -327,7 +324,7 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: self.charm.cluster_manager.reload_acl_file() self.charm.cluster_manager.update_primary_auth() # update the local unit admin password to match the leader - self.charm.config_manager.update_local_valkey_admin() + self.charm.config_manager.update_local_valkey_admin_password() except (ValkeyACLLoadError, ValkeyConfigSetError, ValkeyWorkloadCommandError) as e: logger.error(e) self.charm.status.set_running_status( @@ -394,7 
+391,7 @@ def _update_internal_users_password(self, secret_id: str) -> None: } ) # update the local unit admin password - self.charm.config_manager.update_local_valkey_admin() + self.charm.config_manager.update_local_valkey_admin_password() except ( ValkeyACLLoadError, ValueError, diff --git a/src/managers/cluster.py b/src/managers/cluster.py index f663452..ed5277f 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -6,10 +6,10 @@ import logging -import tenacity from data_platform_helpers.advanced_statuses.models import StatusObject from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope +from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed from common.client import ValkeyClient from common.exceptions import ( @@ -76,10 +76,10 @@ def update_primary_auth(self) -> None: except ValkeyWorkloadCommandError: raise ValkeyConfigSetError("Could not set primaryauth on Valkey server.") - @tenacity.retry( - wait=tenacity.wait_fixed(5), - stop=tenacity.stop_after_attempt(5), - retry=tenacity.retry_if_result(lambda result: result is False), + @retry( + wait=wait_fixed(5), + stop=stop_after_attempt(5), + retry=retry_if_result(lambda result: result is False), reraise=True, ) def is_replica_synced(self) -> bool: @@ -118,10 +118,10 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje # Peer relation not established yet, or model not built yet for unit or app if not self.state.cluster.model or not self.state.unit_server.model: - return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] + return status_list or [CharmStatuses.ACTIVE_IDLE.value] if self.state.charm.unit.is_leader(): - return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] + return status_list or [CharmStatuses.ACTIVE_IDLE.value] # non leader statuses match self.state.unit_server.model.start_state: @@ -146,4 +146,4 @@ def 
get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value, ) - return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] + return status_list or [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/managers/config-template/sentinel.conf b/src/managers/config-template/sentinel.conf index abd5c60..3db10fe 100644 --- a/src/managers/config-template/sentinel.conf +++ b/src/managers/config-template/sentinel.conf @@ -358,4 +358,4 @@ SENTINEL announce-hostnames no # accept a -LOADING response after a primary has been rebooted, before failing # over. -SENTINEL primary-reboot-down-after-period mymaster 0 \ No newline at end of file +SENTINEL primary-reboot-down-after-period mymaster 0 diff --git a/src/managers/config.py b/src/managers/config.py index 9f8a722..820f8e4 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -81,12 +81,12 @@ def get_config_properties(self, primary_ip: str) -> dict[str, str]: config_properties["bind"] = "0.0.0.0 -::1" # replica related config - replica_config = self.generate_replica_config(primary_ip=primary_ip) + replica_config = self._generate_replica_config(primary_ip=primary_ip) config_properties.update(replica_config) return config_properties - def generate_replica_config(self, primary_ip): + def _generate_replica_config(self, primary_ip: str) -> dict[str, str]: """Generate the config properties related to replica configuration based on the current cluster state.""" replica_config = { "primaryuser": CharmUsers.VALKEY_REPLICA.value, @@ -151,7 +151,7 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: sentinel_config = f"port {SENTINEL_PORT}\n" sentinel_config += f"aclfile {self.workload.sentinel_acl_file.as_posix()}\n" - # TODO consider adding quorum calculation based on number of units + # TODO consider adding quorum calculation based on number of planned_units and the parity of the number of units sentinel_config += ( f"sentinel 
monitor {PRIMARY_NAME} {primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}\n" ) @@ -172,7 +172,7 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: # on k8s we need to set the ownership of the sentinel config file to the non-root user that the valkey process runs as in order for sentinel to be able to read/write it self.workload.write_file( sentinel_config, - self.workload.sentinel_config, + self.workload.sentinel_config_file, mode=0o600, user=self.workload.user, group=self.workload.user, @@ -208,7 +208,7 @@ def generate_password(self) -> str: """ return "".join([secrets.choice(string.ascii_letters + string.digits) for _ in range(32)]) - def update_local_valkey_admin(self) -> None: + def update_local_valkey_admin_password(self) -> None: """Update the local unit's valkey admin password in the state.""" if not ( app_password := self.state.cluster.internal_users_credentials.get( diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 60ae6d9..9326159 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -6,10 +6,10 @@ import logging -import tenacity from data_platform_helpers.advanced_statuses.models import StatusObject from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope +from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed from common.client import ValkeyClient from common.exceptions import ( @@ -41,10 +41,10 @@ def admin_password(self) -> str: CharmUsers.SENTINEL_CHARM_ADMIN.value, "" ) - @tenacity.retry( - wait=tenacity.wait_fixed(5), - stop=tenacity.stop_after_attempt(5), - retry=tenacity.retry_if_result(lambda result: result is False), + @retry( + wait=wait_fixed(5), + stop=stop_after_attempt(5), + retry=retry_if_result(lambda result: result is False), reraise=True, ) def is_sentinel_discovered(self) -> bool: @@ -76,7 +76,7 @@ def is_sentinel_discovered(self) -> bool: return False except 
ValkeyWorkloadCommandError: logger.warning(f"Could not query sentinel at {sentinel_ip} for primary discovery.") - continue + return False return True def get_primary_ip(self) -> str | None: @@ -99,13 +99,14 @@ def get_primary_ip(self) -> str | None: primary_ip = output.strip().split()[0] logger.info(f"Primary IP address is {primary_ip}") return primary_ip - except (IndexError, ValkeyWorkloadCommandError): - logger.error("Could not get primary IP from sentinel output.") + except (IndexError, ValkeyWorkloadCommandError) as e: + logger.error("Could not get primary IP from sentinel output: %s", e) logger.error( "Could not determine primary IP from sentinels. Number of started servers: %d.", len(started_servers), ) + return None def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the sentinel manager's statuses.""" @@ -113,4 +114,4 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje scope=scope, component=self.name, running_status_only=True, running_status_type="async" ).root - return status_list if status_list else [CharmStatuses.ACTIVE_IDLE.value] + return status_list or [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/statuses.py b/src/statuses.py index 23cdd81..b19d875 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -29,7 +29,7 @@ class CharmStatuses(Enum): ) WAITING_TO_START = StatusObject( status="maintenance", - message="Waiting for leader to authorize service start", + message="Waiting for leader to allow service start", ) CONFIGURATION_ERROR = StatusObject( status="blocked", @@ -59,7 +59,7 @@ class ClusterStatuses(Enum): WAITING_FOR_PRIMARY_START = StatusObject( status="maintenance", - message="Waiting for primary to start and become active...", + message="Waiting for the primary unit to start...", ) @@ -68,11 +68,11 @@ class ValkeyServiceStatuses(Enum): SERVICE_STARTING = StatusObject( status="maintenance", - message="waiting for valkey to start...", + message="Waiting for 
Valkey to start...", running="async", ) SERVICE_NOT_RUNNING = StatusObject( status="blocked", - message="valkey service not running", + message="Valkey service not running", running="async", ) diff --git a/src/workload_k8s.py b/src/workload_k8s.py index c991e32..37899c6 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -31,13 +31,13 @@ def __init__(self, container: ops.Container | None) -> None: raise AttributeError("Container is required.") self.container = container - self.root = pathops.ContainerPath("/", container=self.container) - self.config_file = self.root / CONFIG_FILE - self.sentinel_config = self.root / SENTINEL_CONFIG_FILE - self.acl_file = self.root / ACL_FILE - self.sentinel_acl_file = self.root / SENTINEL_ACL_FILE + self.root_dir = pathops.ContainerPath("/", container=self.container) + self.config_file = self.root_dir / CONFIG_FILE + self.sentinel_config_file = self.root_dir / SENTINEL_CONFIG_FILE + self.acl_file = self.root_dir / ACL_FILE + self.sentinel_acl_file = self.root_dir / SENTINEL_ACL_FILE # todo: update this path once directories in the rock are complying with the standard - self.working_dir = self.root / "var/lib/valkey" + self.working_dir = self.root_dir / "var/lib/valkey" self.valkey_service = "valkey" self.sentinel_service = "valkey-sentinel" self.metric_service = "metric_exporter" @@ -67,7 +67,7 @@ def pebble_layer(self) -> ops.pebble.Layer: self.sentinel_service: { "override": "replace", "summary": "Valkey sentinel service", - "command": f"valkey-sentinel {self.sentinel_config.as_posix()}", + "command": f"valkey-sentinel {self.sentinel_config_file.as_posix()}", "user": self.user, "group": self.user, "startup": "enabled", diff --git a/src/workload_vm.py b/src/workload_vm.py index 949383f..c646260 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -37,12 +37,12 @@ def __init__(self) -> None: with attempt: self.valkey = snap.SnapCache()[SNAP_NAME] - self.root = pathops.LocalPath("/") - self.config_file = self.root / 
SNAP_CURRENT_PATH / SNAP_CONFIG_FILE - self.sentinel_config = self.root / SNAP_CURRENT_PATH / SNAP_SENTINEL_CONFIG_FILE - self.acl_file = self.root / SNAP_CURRENT_PATH / SNAP_ACL_FILE - self.sentinel_acl_file = self.root / SNAP_CURRENT_PATH / SNAP_SENTINEL_ACL_FILE - self.working_dir = self.root / SNAP_COMMON_PATH / "var/lib/charmed-valkey" + self.root_dir = pathops.LocalPath("/") + self.config_file = self.root_dir / SNAP_CURRENT_PATH / SNAP_CONFIG_FILE + self.sentinel_config_file = self.root_dir / SNAP_CURRENT_PATH / SNAP_SENTINEL_CONFIG_FILE + self.acl_file = self.root_dir / SNAP_CURRENT_PATH / SNAP_ACL_FILE + self.sentinel_acl_file = self.root_dir / SNAP_CURRENT_PATH / SNAP_SENTINEL_ACL_FILE + self.working_dir = self.root_dir / SNAP_COMMON_PATH / "var/lib/charmed-valkey" self.cli = "charmed-valkey.cli" self.user = "snap_daemon" From 0c4eb4eabdba8900d0c961e42173b355080e500e Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 16 Feb 2026 10:37:30 +0000 Subject: [PATCH 080/159] refactor client and add health checks --- src/common/client.py | 223 +++++++++++++++++++++++++++++++++++++- src/events/base_events.py | 47 ++++++-- src/literals.py | 1 + src/managers/cluster.py | 92 +++++++++------- src/managers/sentinel.py | 37 +++++-- src/statuses.py | 2 +- 6 files changed, 335 insertions(+), 67 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 17f563e..75ae51d 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -6,8 +6,9 @@ import logging from typing import Literal +from common.exceptions import ValkeyWorkloadCommandError from core.base_workload import WorkloadBase -from literals import CLIENT_PORT, SENTINEL_PORT +from literals import CLIENT_PORT, PRIMARY_NAME, SENTINEL_PORT logger = logging.getLogger(__name__) @@ -45,8 +46,6 @@ def exec_cli_command( ValkeyWorkloadCommandError: If the CLI command fails to execute. 
""" port = CLIENT_PORT if self.connect_to == "valkey" else SENTINEL_PORT - user = self.username - password = self.password cli_command: list[str] = [ self.workload.cli, "-h", @@ -54,9 +53,223 @@ def exec_cli_command( "-p", str(port), "--user", - user, + self.username, "--pass", - password, + self.password, ] + command output, error = self.workload.exec(cli_command) return output, error + + def ping(self, hostname: str) -> bool: + """Ping the Valkey server to check if it's responsive. + + Args: + hostname (str): The hostname to connect to. + + Returns: + bool: True if the server responds to the ping command, False otherwise. + """ + try: + output, _ = self.exec_cli_command(["ping"], hostname=hostname) + return "PONG" in output + except ValkeyWorkloadCommandError: + return False + + def get_persistence_info(self, hostname: str) -> dict[str, str] | None: + """Get the persistence information of the Valkey server. + + Args: + hostname (str): The hostname to connect to. + + Returns: + dict[str, str] | None: The persistence information retrieved from the server. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute. + """ + output, _ = self.exec_cli_command(["info", "persistence"], hostname=hostname) + values = {} + if not output.strip(): + logger.warning(f"No persistence info found on Valkey server at {hostname}.") + return None + for line in output.strip().splitlines(): + if line.startswith("#"): + continue + values_parts = line.split(":", 1) + if len(values_parts) != 2: + logger.error( + "Unexpected output format when getting persistence info from Valkey server at %s: %s", + hostname, + output, + ) + return None + values[values_parts[0]] = values_parts[1] + return values + + def set_value(self, hostname: str, key: str, value: str) -> bool: + """Set a key-value pair on the Valkey server. + + Args: + hostname (str): The hostname to connect to. + key (str): The key to set. + value (str): The value to set for the key. 
+ + Returns: + bool: True if the command executed successfully, False otherwise. + """ + try: + output, err = self.exec_cli_command(["set", key, value], hostname=hostname) + if output.strip() == "OK": + return True + logger.error( + "Failed to set key %s on Valkey server at %s: stdout: %s, stderr: %s", + key, + hostname, + output, + err, + ) + return False + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to set key {key} on Valkey server at {hostname}: {e}") + return False + + def is_replica_synced(self, hostname: str) -> bool: + """Check if the replica is synced with the primary. + + Args: + hostname (str): The hostname to connect to. + + Returns: + bool: True if the replica is synced with the primary, False otherwise. + """ + try: + output, _ = self.exec_cli_command(["role"], hostname=hostname) + output_parts = output.strip().split() + return ( + bool(output_parts) + and output_parts[0] == "slave" + and output_parts[3] == "connected" + ) + except ValkeyWorkloadCommandError: + logger.warning( + "Could not determine replica sync status from Valkey server at %s.", hostname + ) + return False + + def config_set(self, hostname: str, parameter: str, value: str) -> bool: + """Set a runtime configuration parameter on the Valkey server. + + Args: + hostname (str): The hostname to connect to. + parameter (str): The configuration parameter to set. + value (str): The value to set for the configuration parameter. + + Returns: + bool: True if the command executed successfully, False otherwise. 
+ """ + try: + output, err = self.exec_cli_command( + ["config", "set", parameter, value], hostname=hostname + ) + if output.strip() == "OK": + return True + logger.error( + "Failed to set config %s on Valkey server at %s: stdout: %s, stderr: %s", + parameter, + hostname, + output, + err, + ) + return False + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to set config {parameter} on Valkey server at {hostname}: {e}") + return False + + def load_acl(self, hostname: str) -> bool: + """Load the ACL file into the Valkey server. + + Args: + hostname (str): The hostname to connect to. + + Returns: + bool: True if the ACL file was loaded successfully, False otherwise. + """ + try: + output, err = self.exec_cli_command(["acl", "load"], hostname=hostname) + if output.strip() == "OK": + return True + logger.error( + "Failed to load ACL file on Valkey server at %s: stdout: %s, stderr: %s", + hostname, + output, + err, + ) + return False + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to load ACL file on Valkey server at {hostname}: {e}") + return False + + def sentinel_get_primary_ip(self, hostname: str) -> str | None: + """Get the primary IP address from the sentinel. + + Args: + hostname (str): The hostname to connect to. + + Returns: + str | None: The primary IP address if retrieved successfully, None otherwise. + """ + if not self.connect_to == "sentinel": + logger.error( + "Attempted to get primary IP from sentinel while client is configured to connect to valkey." 
+ ) + raise ValueError("Client is not configured to connect to sentinel.") + try: + output, _ = self.exec_cli_command( + command=["sentinel", "get-master-addr-by-name", PRIMARY_NAME], hostname=hostname + ) + output_parts = output.strip().split() + if len(output_parts) != 2: + logger.error( + "Unexpected output format when getting primary IP from sentinel at %s: %s", + hostname, + output, + ) + return None + return output_parts[0] + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to get primary IP from sentinel at {hostname}: {e}") + return None + + def sentinel_get_master_info(self, hostname: str) -> dict[str, str] | None: + """Get the master info from the sentinel. + + Args: + hostname (str): The hostname to connect to. + + Returns: + dict[str, str] | None: The master info if retrieved successfully, None otherwise. + """ + if not self.connect_to == "sentinel": + logger.error( + "Attempted to get master info from sentinel while client is configured to connect to valkey." + ) + raise ValueError("Client is not configured to connect to sentinel.") + try: + output, _ = self.exec_cli_command( + command=["sentinel", "master", PRIMARY_NAME], hostname=hostname + ) + if not output.strip(): + logger.warning(f"No master info found in sentinel at {hostname}.") + return None + info_parts = output.strip().split() + if len(info_parts) % 2 != 0: + logger.error( + "Unexpected output format when getting master info from sentinel at %s: %s", + hostname, + output, + ) + return None + return {info_parts[i]: info_parts[i + 1] for i in range(0, len(info_parts), 2)} + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to get master info from sentinel at {hostname}: {e}") + return None diff --git a/src/events/base_events.py b/src/events/base_events.py index 3c98c03..0b7b4d0 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -36,6 +36,18 @@ class UnitFullyStarted(ops.EventBase): The Valkey service is running and the current node is in sync 
with the primary (if a replica). """ + def __init__(self, handle: ops.Handle, is_primary: bool = False): + super().__init__(handle) + self.is_primary = is_primary + + def snapshot(self) -> dict[str, str]: + """Save the state of the event.""" + return {"is_primary": str(self.is_primary)} + + def restore(self, snapshot: dict[str, str]) -> None: + """Restore the state of the event.""" + self.is_primary = snapshot.get("is_primary", "False") == "True" + class BaseEvents(ops.Object): """Handle all base events.""" @@ -78,9 +90,9 @@ def _on_start(self, event: ops.StartEvent) -> None: primary_ip = self.charm.sentinel_manager.get_primary_ip() if self.charm.unit.is_leader() and not primary_ip: - self._start_services(event, primary_ip=self.charm.state.bind_address) - logger.info("Services started") - self.charm.state.unit_server.update({"start_state": StartState.STARTED.value}) + if not self._start_services(event, primary_ip=self.charm.state.bind_address): + return + self.unit_fully_started.emit(is_primary=True) return if not self.charm.state.cluster.internal_users_credentials or not primary_ip: @@ -100,7 +112,7 @@ def _on_start(self, event: ops.StartEvent) -> None: if not self._start_services(event, primary_ip=primary_ip): return - self.unit_fully_started.emit() + self.unit_fully_started.emit(is_primary=False) def _start_services(self, event: ops.StartEvent, primary_ip: str) -> bool: """Start Valkey and Sentinel services.""" @@ -161,8 +173,25 @@ def _start_services(self, event: ops.StartEvent, primary_ip: str) -> bool: # TODO check how to trigger if deferred without update status event def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: """Handle the unit-fully-started event.""" - # Only ran on non-leader units when starting replicas - if not self.charm.sentinel_manager.is_sentinel_discovered(): + if not self.charm.cluster_manager.is_healthy( + is_primary=event.is_primary, check_replica_sync=False + ): + logger.warning("Unit is not healthy after start, 
deferring event.") + self.charm.state.unit_server.update( + {"start_state": StartState.STARTING_WAITING_VALKEY.value} + ) + event.defer() + return + + if not self.charm.sentinel_manager.is_healthy(): + logger.warning("Sentinel is not healthy after start, deferring event.") + self.charm.state.unit_server.update( + {"start_state": StartState.STARTING_WAITING_SENTINEL.value} + ) + event.defer() + return + + if not event.is_primary and not self.charm.sentinel_manager.is_sentinel_discovered(): logger.info("Sentinel service not yet discovered by other units. Deferring event.") self.charm.state.unit_server.update( {"start_state": StartState.STARTING_WAITING_SENTINEL.value} @@ -170,7 +199,7 @@ def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: event.defer() return - if not self.charm.cluster_manager.is_replica_synced(): + if not event.is_primary and not self.charm.cluster_manager.is_replica_synced(): logger.info("Replica not yet synced. Deferring event.") self.charm.state.unit_server.update( {"start_state": StartState.STARTING_WAITING_REPLICA_SYNC.value} @@ -322,9 +351,9 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: try: self.charm.config_manager.set_acl_file() self.charm.cluster_manager.reload_acl_file() - self.charm.cluster_manager.update_primary_auth() # update the local unit admin password to match the leader self.charm.config_manager.update_local_valkey_admin_password() + self.charm.cluster_manager.update_primary_auth() except (ValkeyACLLoadError, ValkeyConfigSetError, ValkeyWorkloadCommandError) as e: logger.error(e) self.charm.status.set_running_status( @@ -383,7 +412,6 @@ def _update_internal_users_password(self, secret_id: str) -> None: try: self.charm.config_manager.set_acl_file(passwords=new_passwords) self.charm.cluster_manager.reload_acl_file() - self.charm.cluster_manager.update_primary_auth() self.charm.state.cluster.update( { f"{user.value.replace('-', '_')}_password": new_passwords[user.value] @@ -392,6 +420,7 @@ 
def _update_internal_users_password(self, secret_id: str) -> None: ) # update the local unit admin password self.charm.config_manager.update_local_valkey_admin_password() + self.charm.cluster_manager.update_primary_auth() except ( ValkeyACLLoadError, ValueError, diff --git a/src/literals.py b/src/literals.py index bab1fc8..07e6c38 100644 --- a/src/literals.py +++ b/src/literals.py @@ -75,6 +75,7 @@ class StartState(StrEnum): """Start states for the service.""" NOT_STARTED = "not_started" + STARTING_WAITING_VALKEY = "starting_waiting_valkey" STARTING_WAITING_SENTINEL = "starting_waiting_sentinel" STARTING_WAITING_REPLICA_SYNC = "starting_waiting_replica_sync" STARTED = "started" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index ed5277f..396e8b0 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -15,12 +15,11 @@ from common.exceptions import ( ValkeyACLLoadError, ValkeyConfigSetError, - ValkeyWorkloadCommandError, ) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import CharmUsers, StartState -from statuses import CharmStatuses, ClusterStatuses +from statuses import CharmStatuses, ClusterStatuses, ValkeyServiceStatuses logger = logging.getLogger(__name__) @@ -43,14 +42,12 @@ def admin_password(self) -> str: def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" - try: - client = ValkeyClient( - username=self.admin_user, - password=self.admin_password, - workload=self.workload, - ) - client.exec_cli_command(["acl", "load"], hostname=self.state.bind_address) - except ValkeyWorkloadCommandError: + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + ) + if not client.load_acl(hostname=self.state.bind_address): raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") def update_primary_auth(self) -> None: @@ -60,27 +57,20 @@ def update_primary_auth(self) -> None: 
password=self.admin_password, workload=self.workload, ) - try: - client.exec_cli_command( - [ - "config", - "set", - "primaryauth", - self.state.cluster.internal_users_credentials.get( - CharmUsers.VALKEY_REPLICA.value, "" - ), - ], - hostname=self.state.bind_address, - ) - logger.info("Updated primaryauth runtime configuration on Valkey server") - except ValkeyWorkloadCommandError: + if not client.config_set( + hostname=self.state.bind_address, + parameter="primaryauth", + value=self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_REPLICA.value, "" + ), + ): raise ValkeyConfigSetError("Could not set primaryauth on Valkey server.") @retry( wait=wait_fixed(5), stop=stop_after_attempt(5), retry=retry_if_result(lambda result: result is False), - reraise=True, + retry_error_callback=lambda _: False, ) def is_replica_synced(self) -> bool: """Check if the replica is synced with the primary.""" @@ -89,24 +79,41 @@ def is_replica_synced(self) -> bool: password=self.admin_password, workload=self.workload, ) - try: - output = ( - client.exec_cli_command( - command=["role"], - hostname=self.state.bind_address, - )[0] - .strip() - .split() - ) - if output and output[0] == "slave" and output[3] == "connected": - logger.info("Replica is synced with primary") - return True + return client.is_replica_synced(hostname=self.state.bind_address) + @retry( + wait=wait_fixed(5), + stop=stop_after_attempt(5), + retry=retry_if_result(lambda result: result is False), + retry_error_callback=lambda _: False, + ) + def is_healthy(self, is_primary: bool = False, check_replica_sync: bool = True) -> bool: + """Check if a valkey instance is healthy.""" + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + ) + if not client.ping(hostname=self.state.bind_address): + logger.warning("Health check failed: Valkey server did not respond to ping.") return False - except ValkeyWorkloadCommandError: - logger.warning("Could not 
determine replica sync status from Valkey server.") + if ( + persistence_info := client.get_persistence_info(hostname=self.state.bind_address) + ) and persistence_info.get("loading", "") != "0": + logger.warning("Health check failed: Valkey server is still loading data.") + return False + if is_primary and not client.set_value( + hostname=self.state.bind_address, key="healthcheck", value="ok" + ): + logger.warning("Health check failed: Could not set test key on Valkey server.") + return False + + if not is_primary and check_replica_sync and not self.is_replica_synced(): + logger.warning("Health check failed: Replica is not synced with primary.") return False + return True + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( @@ -120,9 +127,6 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje if not self.state.cluster.model or not self.state.unit_server.model: return status_list or [CharmStatuses.ACTIVE_IDLE.value] - if self.state.charm.unit.is_leader(): - return status_list or [CharmStatuses.ACTIVE_IDLE.value] - # non leader statuses match self.state.unit_server.model.start_state: case StartState.NOT_STARTED.value: @@ -137,6 +141,10 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje status_list.append( CharmStatuses.WAITING_TO_START.value, ) + case StartState.STARTING_WAITING_VALKEY.value: + status_list.append( + ValkeyServiceStatuses.SERVICE_STARTING.value, + ) case StartState.STARTING_WAITING_SENTINEL.value: status_list.append( ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 9326159..093ceb3 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -45,7 +45,7 @@ def admin_password(self) -> str: wait=wait_fixed(5), stop=stop_after_attempt(5), retry=retry_if_result(lambda 
result: result is False), - reraise=True, + retry_error_callback=lambda _: False, ) def is_sentinel_discovered(self) -> bool: """Check if the sentinel of the local unit was discovered by the other sentinels in the cluster.""" @@ -91,23 +91,40 @@ def get_primary_ip(self) -> str | None: ) for unit in started_servers: - try: - output = client.exec_cli_command( - command=["sentinel", "get-master-addr-by-name", PRIMARY_NAME], - hostname=unit.model.private_ip, - )[0] - primary_ip = output.strip().split()[0] + if primary_ip := client.sentinel_get_primary_ip(hostname=unit.model.private_ip): logger.info(f"Primary IP address is {primary_ip}") return primary_ip - except (IndexError, ValkeyWorkloadCommandError) as e: - logger.error("Could not get primary IP from sentinel output: %s", e) - logger.error( "Could not determine primary IP from sentinels. Number of started servers: %d.", len(started_servers), ) return None + @retry( + wait=wait_fixed(5), + stop=stop_after_attempt(5), + retry=retry_if_result(lambda result: result is False), + retry_error_callback=lambda retry_state: False, + ) + def is_healthy(self) -> bool: + """Check if the sentinel service is healthy.""" + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + connect_to="sentinel", + ) + + if not client.ping(hostname=self.state.bind_address): + logger.warning("Health check failed: Sentinel did not respond to ping.") + return False + + if not client.sentinel_get_master_info(hostname=self.state.bind_address): + logger.warning("Health check failed: Could not query sentinel for master information.") + return False + + return True + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the sentinel manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( diff --git a/src/statuses.py b/src/statuses.py index b19d875..e42b1f4 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -49,7 +49,7 @@ class 
ClusterStatuses(Enum): WAITING_FOR_SENTINEL_DISCOVERY = StatusObject( status="maintenance", - message="Waiting for sentinel to be discovered by other units...", + message="Waiting for sentinel to start and be discovered by other units...", ) WAITING_FOR_REPLICA_SYNC = StatusObject( From 994e852036e45c813b53c6199afc52f91ca2e130 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 16 Feb 2026 10:49:54 +0000 Subject: [PATCH 081/159] mock tenacity nap times and fix unit tests --- tests/unit/conftest.py | 5 ++ tests/unit/test_charm.py | 153 ++++++++++++++++++++++++++------------- 2 files changed, 106 insertions(+), 52 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ea04b33..41a6a18 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -27,6 +27,11 @@ def mock_bind_address(mocker): ) +@pytest.fixture(autouse=True) +def tenacity_wait(mocker): + mocker.patch("tenacity.nap.time") + + @pytest.fixture(autouse=True) def cloud_spec(): return testing.CloudSpec( diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index d8eed58..4816d92 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -19,7 +19,7 @@ CharmUsers, StartState, ) -from src.statuses import CharmStatuses, ClusterStatuses +from src.statuses import CharmStatuses, ClusterStatuses, ValkeyServiceStatuses from .helpers import status_is @@ -83,21 +83,38 @@ def test_start_leader_unit(cloud_spec): } # generate passwords - state_out = ctx.run(ctx.on.leader_elected(), state_in) + state_in = ctx.run(ctx.on.leader_elected(), state_in) # start event - state_out = ctx.run(ctx.on.start(), state_out) - assert state_out.get_container(container.name).plan == expected_plan - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] - == pebble.ServiceStatus.ACTIVE - ) - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] - == pebble.ServiceStatus.ACTIVE - ) - assert state_out.unit_status == 
ActiveStatus() - assert state_out.app_status == ActiveStatus() + with patch("common.client.ValkeyClient.ping", return_value=False): + state_out = ctx.run(ctx.on.start(), state_in) + assert state_out.get_container(container.name).plan == expected_plan + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] + == pebble.ServiceStatus.ACTIVE + ) + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] + == pebble.ServiceStatus.ACTIVE + ) + assert status_is(state_out, ValkeyServiceStatuses.SERVICE_STARTING.value) + with ( + patch("common.client.ValkeyClient.ping", return_value=True), + patch("common.client.ValkeyClient.get_persistence_info", return_value={"loading": "0"}), + patch("common.client.ValkeyClient.set_value", return_value=True), + ): + state_out = ctx.run(ctx.on.start(), state_out) + assert status_is(state_out, ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) + + with ( + patch("common.client.ValkeyClient.ping", return_value=True), + patch("common.client.ValkeyClient.get_persistence_info", return_value={"loading": "0"}), + patch("common.client.ValkeyClient.set_value", return_value=True), + patch("common.client.ValkeyClient.sentinel_get_master_info", return_value={"ip": "test"}), + ): + state_out = ctx.run(ctx.on.start(), state_out) + assert state_out.unit_status == ActiveStatus() + assert state_out.app_status == ActiveStatus() # container not ready container = testing.Container(name=CONTAINER, can_connect=False) @@ -163,8 +180,30 @@ def test_start_non_leader_unit(cloud_spec): assert status_is(state_out, CharmStatuses.WAITING_TO_START.value) + # health check + with patch("common.client.ValkeyClient.is_replica_synced", return_value=False): + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"starting-member": "valkey/0"}, + peers_data={1: {"start-state": "started"}}, + ) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", 
cloud_spec=cloud_spec), + leader=False, + relations={relation, status_peer_relation}, + secrets={internal_passwords_secret}, + containers={container}, + ) + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, ValkeyServiceStatuses.SERVICE_STARTING.value) + # replica syncing - with patch("managers.cluster.ClusterManager.is_replica_synced", return_value=False): + with ( + patch("managers.cluster.ClusterManager.is_replica_synced", return_value=False), + patch("managers.cluster.ClusterManager.is_healthy", return_value=True), + patch("managers.sentinel.SentinelManager.is_healthy", return_value=True), + ): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, @@ -182,7 +221,11 @@ def test_start_non_leader_unit(cloud_spec): assert status_is(state_out, ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value) # sentinel not yet discovered - with patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=False): + with ( + patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=False), + patch("managers.cluster.ClusterManager.is_healthy", return_value=True), + patch("managers.sentinel.SentinelManager.is_healthy", return_value=True), + ): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, @@ -203,6 +246,8 @@ def test_start_non_leader_unit(cloud_spec): with ( patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=True), patch("managers.cluster.ClusterManager.is_replica_synced", return_value=True), + patch("managers.cluster.ClusterManager.is_healthy", return_value=True), + patch("managers.sentinel.SentinelManager.is_healthy", return_value=True), ): relation = testing.PeerRelation( id=1, @@ -413,11 +458,13 @@ def test_config_changed_leader_unit(cloud_spec): ) with ( patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, + patch("common.client.ValkeyClient.load_acl") as mock_load_acl, 
+ patch("common.client.ValkeyClient.config_set") as mock_config_set, ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() - assert mock_exec_command.call_count == 2 # one for acl load, one for primaryauth set + mock_load_acl.assert_called_once() + mock_config_set.assert_called_once() secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" ) @@ -427,38 +474,38 @@ def test_config_changed_leader_unit(cloud_spec): ) -def test_config_changed_leader_unit_primary(cloud_spec): - ctx = testing.Context(ValkeyCharm, app_trusted=True) - relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) - container = testing.Container(name=CONTAINER, can_connect=True) - - password_secret = testing.Secret( - tracked_content={user.value: "secure-password" for user in CharmUsers}, - remote_grants=APP_NAME, - ) - state_in = testing.State( - leader=True, - relations={relation}, - containers={container}, - secrets={password_secret}, - config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, - model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), - ) - with ( - patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, - patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), - ): - state_out = ctx.run(ctx.on.config_changed(), state_in) - mock_set_acl_file.assert_called_once() - assert mock_exec_command.call_count == 2 # one for acl load, one for primaryauth set - secret_out = state_out.get_secret( - label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" - ) - assert ( - secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") - == "secure-password" - ) +# def test_config_changed_leader_unit_primary(cloud_spec): +# ctx = testing.Context(ValkeyCharm, app_trusted=True) +# relation = testing.PeerRelation(id=1, 
endpoint=PEER_RELATION) +# container = testing.Container(name=CONTAINER, can_connect=True) + +# password_secret = testing.Secret( +# tracked_content={user.value: "secure-password" for user in CharmUsers}, +# remote_grants=APP_NAME, +# ) +# state_in = testing.State( +# leader=True, +# relations={relation}, +# containers={container}, +# secrets={password_secret}, +# config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, +# model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), +# ) +# with ( +# patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, +# patch("common.client.ValkeyClient.load_acl") as mock_load_acl, +# patch("common.client.ValkeyClient.config_set") as mock_config_set, +# patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), +# ): +# state_out = ctx.run(ctx.on.config_changed(), state_in) +# mock_set_acl_file.assert_called_once() +# secret_out = state_out.get_secret( +# label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" +# ) +# assert ( +# secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") +# == "secure-password" +# ) def test_config_changed_leader_unit_wrong_username(cloud_spec): @@ -520,13 +567,15 @@ def test_change_password_secret_changed_non_leader_unit(cloud_spec): "events.base_events.BaseEvents._update_internal_users_password" ) as mock_update_password, patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("common.client.ValkeyClient.exec_cli_command") as mock_exec_command, + patch("common.client.ValkeyClient.load_acl") as mock_load_acl, + patch("common.client.ValkeyClient.config_set") as mock_config_set, patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - assert mock_exec_command.call_count == 2 + 
mock_load_acl.assert_called_once() + mock_config_set.assert_called_once() def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spec): From 53a6285f93404e0c089433ede1b3134f92ee9b91 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 16 Feb 2026 10:54:35 +0000 Subject: [PATCH 082/159] update name of charmed_operator_password for units --- src/core/models.py | 4 ++-- src/managers/config.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/core/models.py b/src/core/models.py index 642a628..fcf79bc 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -42,7 +42,7 @@ class PeerAppModel(PeerModel): class PeerUnitModel(PeerModel): """Model for the peer unit data.""" - charmed_operator_password: InternalUsersSecret = Field(default="") + charmed_operator_password_local_unit_copy: InternalUsersSecret = Field(default="") start_state: str = Field(default=StartState.NOT_STARTED.value) hostname: str = Field(default="") private_ip: str = Field(default="") @@ -125,7 +125,7 @@ def valkey_admin_password(self) -> str: """Retrieve the password for the valkey admin user.""" if not self.model: return "" - return self.model.charmed_operator_password or "" + return self.model.charmed_operator_password_local_unit_copy or "" @final diff --git a/src/managers/config.py b/src/managers/config.py index 820f8e4..9a5d364 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -217,9 +217,7 @@ def update_local_valkey_admin_password(self) -> None: ): logger.warning("No valkey admin password found to update local unit state") return - self.state.unit_server.update( - {f"{CharmUsers.VALKEY_ADMIN.value.replace('-', '_')}_password": app_password} - ) + self.state.unit_server.update({"charmed_operator_password_local_unit_copy": app_password}) def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the config manager's statuses.""" From 7e616ed0c0fc35aec465ff8668cf81db3fc65dd5 Mon Sep 17 00:00:00 
2001 From: Smail Kourta Date: Mon, 16 Feb 2026 12:49:08 +0000 Subject: [PATCH 083/159] remove unnecessary check on admin app password --- src/managers/config.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/managers/config.py b/src/managers/config.py index 9a5d364..c14019a 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -210,14 +210,13 @@ def generate_password(self) -> str: def update_local_valkey_admin_password(self) -> None: """Update the local unit's valkey admin password in the state.""" - if not ( - app_password := self.state.cluster.internal_users_credentials.get( - CharmUsers.VALKEY_ADMIN.value - ) - ): - logger.warning("No valkey admin password found to update local unit state") - return - self.state.unit_server.update({"charmed_operator_password_local_unit_copy": app_password}) + self.state.unit_server.update( + { + "charmed_operator_password_local_unit_copy": self.state.cluster.internal_users_credentials.get( + CharmUsers.VALKEY_ADMIN.value + ) + } + ) def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the config manager's statuses.""" From a9f33da438cd4f7325caaf4960ea80fbb0e6e8ad Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Mon, 16 Feb 2026 12:56:13 +0000 Subject: [PATCH 084/159] add alive check in start --- src/core/base_workload.py | 2 +- src/workload_k8s.py | 9 ++++++++- src/workload_vm.py | 18 ++++++++++++++++-- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index c6ce4d8..6ec1472 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -35,7 +35,7 @@ def can_connect(self) -> bool: pass @abstractmethod - def start(self) -> None: + def start(self) -> bool: """Start the workload service.""" pass diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 37899c6..31a959a 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -9,6 +9,7 @@ import ops 
from charmlibs import pathops +from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed from common.exceptions import ValkeyWorkloadCommandError from core.base_workload import WorkloadBase @@ -85,11 +86,17 @@ def pebble_layer(self) -> ops.pebble.Layer: return ops.pebble.Layer(layer_config) @override - def start(self) -> None: + def start(self) -> bool: self.container.add_layer(CHARM, self.pebble_layer, combine=True) self.container.restart(self.valkey_service, self.sentinel_service, self.metric_service) + return self.alive() @override + @retry( + stop=stop_after_attempt(3), + wait=wait_fixed(1), + retry=retry_if_result(lambda healthy: not healthy), + ) def alive(self) -> bool: """Check if the Valkey service is running.""" for service_name in [ diff --git a/src/workload_vm.py b/src/workload_vm.py index c646260..1edf192 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -9,7 +9,14 @@ from typing import List, override from charmlibs import pathops, snap -from tenacity import Retrying, retry, retry_if_exception_type, stop_after_attempt, wait_fixed +from tenacity import ( + Retrying, + retry, + retry_if_exception_type, + retry_if_result, + stop_after_attempt, + wait_fixed, +) from common.exceptions import ValkeyWorkloadCommandError from core.base_workload import WorkloadBase @@ -88,11 +95,13 @@ def install(self, revision: str | None = None, retry_and_raise: bool = True) -> return False @override - def start(self) -> None: + def start(self) -> bool: try: self.valkey.start(services=[SNAP_SERVICE, SNAP_SENTINEL_SERVICE]) + return self.alive() except snap.SnapError as e: logger.exception(str(e)) + return False @override def exec(self, command: List[str]) -> tuple[str, str | None]: @@ -113,6 +122,11 @@ def exec(self, command: List[str]) -> tuple[str, str | None]: raise ValkeyWorkloadCommandError(e) @override + @retry( + stop=stop_after_attempt(3), + wait=wait_fixed(1), + retry=retry_if_result(lambda healthy: not healthy), + ) def alive(self) -> 
bool: """Check if the Valkey service is running.""" try: From d5c3a01b3104757a1d17b92fe6f08a66097886fb Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Feb 2026 07:58:01 +0000 Subject: [PATCH 085/159] remove refresh argument from reading secret --- src/core/cluster_state.py | 5 ++--- src/events/base_events.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/cluster_state.py b/src/core/cluster_state.py index 9739f85..24d763e 100644 --- a/src/core/cluster_state.py +++ b/src/core/cluster_state.py @@ -112,18 +112,17 @@ def bind_address(self) -> str: return str(address) - def get_secret_from_id(self, secret_id: str, refresh: bool = False) -> dict[str, str]: + def get_secret_from_id(self, secret_id: str) -> dict[str, str]: """Resolve the given id of a Juju secret and return the content as a dict. Args: secret_id (str): The id of the secret. - refresh (bool): Whether to refresh the secret content from the controller. Defaults to False. Returns: dict: The content of the secret. """ try: - secret_content = self.charm.model.get_secret(id=secret_id).get_content(refresh=refresh) + secret_content = self.charm.model.get_secret(id=secret_id).get_content(refresh=True) except ops.SecretNotFoundError: raise ops.SecretNotFoundError(f"The secret '{secret_id}' does not exist.") except ops.ModelError: diff --git a/src/events/base_events.py b/src/events/base_events.py index 0b7b4d0..5b30a53 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -377,7 +377,7 @@ def _update_internal_users_password(self, secret_id: str) -> None: secret_id (str): The id of the secret containing the internal users' passwords. 
""" try: - secret_content = self.charm.state.get_secret_from_id(secret_id, refresh=True) + secret_content = self.charm.state.get_secret_from_id(secret_id) except (ops.ModelError, ops.SecretNotFoundError) as e: logger.error(e) self.charm.status.set_running_status( From bf491e848fd11c0cd9917a2159f49e8716509978 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Feb 2026 09:12:31 +0000 Subject: [PATCH 086/159] read and manage sentinel config via a dict --- src/managers/config.py | 95 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/src/managers/config.py b/src/managers/config.py index c14019a..6542033 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -144,34 +144,95 @@ def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None password_hash = hashlib.sha256(password.encode("utf-8")).hexdigest() return f"user {user.value} on #{password_hash} {CHARM_USERS_ROLE_MAP[user]}\n" - def set_sentinel_config_properties(self, primary_ip: str) -> None: - """Write sentinel configuration file.""" - logger.debug("Writing Sentinel configuration") + def get_sentinel_config_properties(self, primary_ip: str) -> dict[str, str | dict[str, str]]: + """Assemble the sentinel config properties. - sentinel_config = f"port {SENTINEL_PORT}\n" + Returns: + Dictionary of sentinel properties to be written to the config file. 
+ """ + config_properties = {} + if not self.state.unit_server.model or not self.state.cluster.model: + return config_properties + sentinel_properties = {} - sentinel_config += f"aclfile {self.workload.sentinel_acl_file.as_posix()}\n" - # TODO consider adding quorum calculation based on number of planned_units and the parity of the number of units - sentinel_config += ( - f"sentinel monitor {PRIMARY_NAME} {primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}\n" + # load the config properties provided from the template in this repo + # it does NOT load the file from disk in the charm unit in order to avoid config drift + with open(f"{WORKING_DIR}/config-template/sentinel.conf") as config: + # The sentinel.conf file contains a number of directives that have a simple format: + # keyword argument1 argument2 ... argumentN + # sentinel keyword argument1 argument2 ... argumentN + for line in config: + line = line.strip().lower() + if not line or line.startswith("#"): + # ignore comments and empty lines + continue + elif line.startswith("sentinel "): + try: + key, value = line.split(" ", 2)[1:] + except ValueError: + key = line.strip().split(" ", 1)[1] + value = "" + sentinel_properties[key.strip()] = value.strip().replace( + "mymaster", PRIMARY_NAME + ) + else: + try: + key, value = line.split(" ", 1) + except ValueError: + key = line.strip() + value = "" + config_properties[key.strip()] = value.strip() + + config_properties["port"] = str(SENTINEL_PORT) + config_properties["aclfile"] = self.workload.sentinel_acl_file.as_posix() + + # sentinel configs + config_properties["sentinel"] = sentinel_properties | self._generate_sentinel_configs( + primary_ip=primary_ip ) + + return config_properties + + def _generate_sentinel_configs(self, primary_ip: str) -> dict[str, str]: + """Generate the sentinel config properties based on the current cluster state.""" + sentinel_configs = {} + # TODO consider adding quorum calculation based on number of planned_units and the parity of the number of 
units + sentinel_configs["monitor"] = f"{PRIMARY_NAME} {primary_ip} {CLIENT_PORT} {QUORUM_NUMBER}" # auth settings # auth-user is used by sentinel to authenticate to the valkey primary - sentinel_config += ( - f"sentinel auth-user {PRIMARY_NAME} {CharmUsers.VALKEY_SENTINEL.value}\n" + sentinel_configs["auth-user"] = f"{PRIMARY_NAME} {CharmUsers.VALKEY_SENTINEL.value}" + sentinel_configs["auth-pass"] = ( + f"{PRIMARY_NAME} {self.state.cluster.internal_users_credentials.get(CharmUsers.VALKEY_SENTINEL.value, '')}" ) - sentinel_config += f"sentinel auth-pass {PRIMARY_NAME} {self.state.cluster.internal_users_credentials.get(CharmUsers.VALKEY_SENTINEL.value, '')}\n" # sentinel admin user settings used by sentinel for its own authentication - sentinel_config += f"sentinel sentinel-user {CharmUsers.SENTINEL_ADMIN.value}\n" - sentinel_config += f"sentinel sentinel-pass {self.state.cluster.internal_users_credentials.get(CharmUsers.SENTINEL_ADMIN.value, '')}\n" + sentinel_configs["sentinel-user"] = f"{CharmUsers.SENTINEL_ADMIN.value}" + sentinel_configs["sentinel-pass"] = ( + f"{self.state.cluster.internal_users_credentials.get(CharmUsers.SENTINEL_ADMIN.value, '')}" + ) # TODO consider making these configs adjustable via charm config - sentinel_config += f"sentinel down-after-milliseconds {PRIMARY_NAME} 30000\n" - sentinel_config += f"sentinel failover-timeout {PRIMARY_NAME} 180000\n" - sentinel_config += f"sentinel parallel-syncs {PRIMARY_NAME} 1\n" + sentinel_configs["down-after-milliseconds"] = f"{PRIMARY_NAME} 30000" + sentinel_configs["failover-timeout"] = f"{PRIMARY_NAME} 180000" + sentinel_configs["parallel-syncs"] = f"{PRIMARY_NAME} 1" + return sentinel_configs + + def set_sentinel_config_properties(self, primary_ip: str) -> None: + """Write sentinel configuration file.""" + logger.debug("Writing Sentinel configuration") + + sentinel_config = self.get_sentinel_config_properties(primary_ip=primary_ip) + + sentinel_config_string = "\n".join( + f"sentinel {key} {value}" 
for key, value in sentinel_config["sentinel"].items() + ) + other_config_string = "\n".join( + f"{key} {value}" for key, value in sentinel_config.items() if key != "sentinel" + ) + full_config_string = f"{other_config_string}\n{sentinel_config_string}" + logger.debug("Full Sentinel config:\n%s", full_config_string) # on k8s we need to set the ownership of the sentinel config file to the non-root user that the valkey process runs as in order for sentinel to be able to read/write it self.workload.write_file( - sentinel_config, + full_config_string, self.workload.sentinel_config_file, mode=0o600, user=self.workload.user, From 27a6e23f0fd4f45bb3917eed3a87eb8b869ec770 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Feb 2026 11:19:32 +0000 Subject: [PATCH 087/159] move workload fields to be body annotations --- src/core/base_workload.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 6ec1472..33a865b 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -17,16 +17,14 @@ class WorkloadBase(ABC): """Base interface for common workload operations.""" - def __init__(self) -> None: - """Initialize the WorkloadBase.""" - self.root_dir: pathops.PathProtocol - self.config_file: pathops.PathProtocol - self.sentinel_config_file: pathops.PathProtocol - self.acl_file: pathops.PathProtocol - self.sentinel_acl_file: pathops.PathProtocol - self.working_dir: pathops.PathProtocol - self.cli: str - self.user: str + root_dir: pathops.PathProtocol + config_file: pathops.PathProtocol + sentinel_config_file: pathops.PathProtocol + acl_file: pathops.PathProtocol + sentinel_acl_file: pathops.PathProtocol + working_dir: pathops.PathProtocol + cli: str + user: str @property @abstractmethod From 178f560fb4d7a72f4cc3ded10ffe754b57d79336 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Feb 2026 11:41:21 +0000 Subject: [PATCH 088/159] some minor changes based on 
feedback --- src/managers/cluster.py | 17 +++++++---------- src/managers/sentinel.py | 6 ++---- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 396e8b0..3cb535f 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -94,19 +94,16 @@ def is_healthy(self, is_primary: bool = False, check_replica_sync: bool = True) password=self.admin_password, workload=self.workload, ) + if not client.ping(hostname=self.state.bind_address): logger.warning("Health check failed: Valkey server did not respond to ping.") return False + if ( persistence_info := client.get_persistence_info(hostname=self.state.bind_address) ) and persistence_info.get("loading", "") != "0": logger.warning("Health check failed: Valkey server is still loading data.") return False - if is_primary and not client.set_value( - hostname=self.state.bind_address, key="healthcheck", value="ok" - ): - logger.warning("Health check failed: Could not set test key on Valkey server.") - return False if not is_primary and check_replica_sync and not self.is_replica_synced(): logger.warning("Health check failed: Replica is not synced with primary.") @@ -120,17 +117,17 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje scope=scope, component=self.name, running_status_only=True, running_status_type="async" ).root - if not self.workload.can_connect: - status_list.append(CharmStatuses.SERVICE_NOT_STARTED.value) - # Peer relation not established yet, or model not built yet for unit or app if not self.state.cluster.model or not self.state.unit_server.model: return status_list or [CharmStatuses.ACTIVE_IDLE.value] - # non leader statuses match self.state.unit_server.model.start_state: case StartState.NOT_STARTED.value: - if ( + if self.state.charm.unit.is_leader(): + status_list.append( + CharmStatuses.SERVICE_NOT_STARTED.value, + ) + elif ( not self.state.cluster.internal_users_credentials or not 
self.state.number_units_started ): diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 093ceb3..18fff66 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -53,9 +53,7 @@ def is_sentinel_discovered(self) -> bool: active_sentinels = [ unit.model.private_ip for unit in self.state.servers - if unit.model - and unit.is_started - and unit.model.private_ip != self.state.unit_server.model.private_ip + if unit.is_started and unit.model.private_ip != self.state.unit_server.model.private_ip ] client = ValkeyClient( @@ -81,7 +79,7 @@ def is_sentinel_discovered(self) -> bool: def get_primary_ip(self) -> str | None: """Get the IP address of the primary node in the cluster.""" - started_servers = [unit for unit in self.state.servers if unit.model and unit.is_started] + started_servers = [unit for unit in self.state.servers if unit.is_started] client = ValkeyClient( username=self.admin_user, From ed477cfed777375d3b001e367201491f4750bc0b Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Feb 2026 07:11:37 +0000 Subject: [PATCH 089/159] simplify and generalise start up logic --- src/common/exceptions.py | 12 ++++ src/core/base_workload.py | 15 ++++- src/events/base_events.py | 125 +++++++++++++++++++------------------- src/literals.py | 4 ++ src/managers/cluster.py | 42 +++++++------ src/managers/config.py | 1 - src/managers/sentinel.py | 2 +- src/statuses.py | 48 +++++++-------- src/workload_k8s.py | 20 ++++-- src/workload_vm.py | 45 ++++++++++++-- 10 files changed, 190 insertions(+), 124 deletions(-) diff --git a/src/common/exceptions.py b/src/common/exceptions.py index b3f65a1..756f285 100644 --- a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -22,3 +22,15 @@ class ValkeyConfigSetError(ValkeyClientError): class ValkeyWorkloadCommandError(Exception): """Custom Exception if any workload-related command fails.""" + + +class ValkeyServicesFailedToStartError(Exception): + """Custom Exception if Valkey service fails to 
start.""" + + +class ValkeyServiceNotAliveError(Exception): + """Custom Exception if Valkey service is not alive after start.""" + + +class ValkeyConfigurationError(Exception): + """Custom Exception if Valkey configuration fails to be set.""" diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 33a865b..1f97310 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -33,8 +33,13 @@ def can_connect(self) -> bool: pass @abstractmethod - def start(self) -> bool: - """Start the workload service.""" + def start(self) -> None: + """Start the workload service. + + Raises: + ValkeyServicesFailedToStartError: If the service fails to start. + ValkeyServiceNotAliveError: If the service is not alive after start. + """ pass @abstractmethod @@ -44,7 +49,11 @@ def exec(self, command: list[str]) -> tuple[str, str | None]: @abstractmethod def alive(self) -> bool: - """Check if the Valkey service is running.""" + """Check if the Valkey services are running. + + Returns: + bool: True if the services are active, False otherwise. 
+ """ pass def write_file( diff --git a/src/events/base_events.py b/src/events/base_events.py index 5b30a53..7f06c7d 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -10,7 +10,14 @@ import ops -from common.exceptions import ValkeyACLLoadError, ValkeyConfigSetError, ValkeyWorkloadCommandError +from common.exceptions import ( + ValkeyACLLoadError, + ValkeyConfigSetError, + ValkeyConfigurationError, + ValkeyServiceNotAliveError, + ValkeyServicesFailedToStartError, + ValkeyWorkloadCommandError, +) from literals import ( CLIENT_PORT, INTERNAL_USERS_PASSWORD_CONFIG, @@ -20,7 +27,7 @@ StartState, Substrate, ) -from statuses import CharmStatuses, ClusterStatuses, ValkeyServiceStatuses +from statuses import CharmStatuses, ClusterStatuses if TYPE_CHECKING: from charm import ValkeyCharm @@ -82,39 +89,62 @@ def _on_install(self, event: ops.InstallEvent) -> None: def _on_start(self, event: ops.StartEvent) -> None: """Handle the on start event.""" + self.charm.state.unit_server.update({"start_state": StartState.NOT_STARTED.value}) + if not self.charm.workload.can_connect: logger.warning("Workload not ready yet") event.defer() return - self.charm.state.unit_server.update({"start_state": StartState.NOT_STARTED.value}) - - primary_ip = self.charm.sentinel_manager.get_primary_ip() - if self.charm.unit.is_leader() and not primary_ip: - if not self._start_services(event, primary_ip=self.charm.state.bind_address): - return - self.unit_fully_started.emit(is_primary=True) - return - if not self.charm.state.cluster.internal_users_credentials or not primary_ip: + if not self.charm.state.cluster.internal_users_credentials: logger.info( - "Non-leader unit waiting for leader to set primary and internal user credentials" + "Internal users' credentials not set yet. Deferring start event until credentials are set." 
) event.defer() return - self.charm.state.unit_server.update({"request_start_lock": True}) + self.charm.state.unit_server.update( + {"start_state": StartState.WAITING_TO_START.value, "request_start_lock": True} + ) + + if self.charm.unit.is_leader(): + logger.info( + "Leader unit requesting lock to start services. Triggering lock request processing." + ) + self._process_lock_requests() # TODO unit.name would not work across models we need to switch to using `model.unit.name + model_uuid` if self.charm.state.cluster.model.starting_member != self.charm.unit.name: - logger.info("Non-leader unit waiting for leader to choose it as starting member") + logger.info("Waiting for lock to start") event.defer() return - if not self._start_services(event, primary_ip=primary_ip): + primary_ip = self.charm.sentinel_manager.get_primary_ip() or self.charm.state.bind_address + + try: + self._configure_services(primary_ip) + self.charm.workload.start() + except ValkeyConfigurationError: + self.charm.state.unit_server.update( + {"start_state": StartState.CONFIGURATION_ERROR.value, "request_start_lock": False} + ) + event.defer() + return + except (ValkeyServicesFailedToStartError, ValkeyServiceNotAliveError) as e: + logger.error(e) + self.charm.state.unit_server.update( + {"start_state": StartState.ERROR_ON_START.value, "request_start_lock": False} + ) + event.defer() return - self.unit_fully_started.emit(is_primary=False) - def _start_services(self, event: ops.StartEvent, primary_ip: str) -> bool: + self.charm.state.unit_server.update( + {"start_state": StartState.STARTING_WAITING_VALKEY.value} + ) + + self.unit_fully_started.emit(is_primary=primary_ip == self.charm.state.bind_address) + + def _configure_services(self, primary_ip: str) -> None: """Start Valkey and Sentinel services.""" try: self.charm.config_manager.update_local_valkey_admin_password() @@ -122,53 +152,12 @@ def _start_services(self, event: ops.StartEvent, primary_ip: str) -> bool: 
self.charm.config_manager.set_acl_file() self.charm.config_manager.set_sentinel_config_properties(primary_ip=primary_ip) self.charm.config_manager.set_sentinel_acl_file() - except (ValkeyWorkloadCommandError, ValueError): - logger.error("Failed to set configuration") - self.charm.status.set_running_status( - CharmStatuses.CONFIGURATION_ERROR.value, - scope="unit", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - event.defer() - return False - self.charm.state.statuses.delete( - CharmStatuses.CONFIGURATION_ERROR.value, - scope="unit", - component=self.charm.cluster_manager.name, - ) - self.charm.status.set_running_status( - ValkeyServiceStatuses.SERVICE_STARTING.value, - scope="unit", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, - ) - - self.charm.workload.start() - if not self.charm.workload.alive(): - logger.error("Workload failed to start.") - self.charm.status.set_running_status( - ValkeyServiceStatuses.SERVICE_NOT_RUNNING.value, - scope="unit", - component_name=self.charm.cluster_manager.name, - statuses_state=self.charm.state.statuses, + except (ValkeyWorkloadCommandError, ValueError) as e: + logger.error("Failed to set configuration properties: %s", e) + self.charm.state.unit_server.update( + {"start_state": StartState.CONFIGURATION_ERROR.value, "request_start_lock": False} ) - return False - - logger.info("Workload started successfully. 
Opening client port") - self.charm.unit.open_port("tcp", CLIENT_PORT) - self.charm.state.statuses.delete( - ValkeyServiceStatuses.SERVICE_STARTING.value, - scope="unit", - component=self.charm.cluster_manager.name, - ) - - self.charm.state.statuses.delete( - ValkeyServiceStatuses.SERVICE_NOT_RUNNING.value, - scope="unit", - component=self.charm.cluster_manager.name, - ) - return True + raise ValkeyConfigurationError("Failed to set configuration") from e # TODO check how to trigger if deferred without update status event def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: @@ -212,11 +201,20 @@ def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: {"start_state": StartState.STARTED.value, "request_start_lock": False} ) + self.charm.unit.open_port("tcp", CLIENT_PORT) + def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: """Handle event received by all units when a unit's relation data changes.""" if not self.charm.unit.is_leader(): return + self._process_lock_requests() + + def _process_lock_requests(self) -> None: + """Process start lock requests. + + The leader unit will choose one of the units that requested the lock to start, and update the cluster model with that unit as the starting member. 
+ """ units_requesting_start = [ unit.unit_name for unit in self.charm.state.servers @@ -241,6 +239,7 @@ def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: self.charm.state.cluster.model.starting_member, units_requesting_start, ) + return self.charm.state.cluster.update( {"starting_member": units_requesting_start[0] if units_requesting_start else ""} diff --git a/src/literals.py b/src/literals.py index 07e6c38..665b182 100644 --- a/src/literals.py +++ b/src/literals.py @@ -75,7 +75,11 @@ class StartState(StrEnum): """Start states for the service.""" NOT_STARTED = "not_started" + WAITING_TO_START = "waiting_to_start" + WAITING_FOR_PRIMARY_START = "waiting_for_primary_start" + CONFIGURATION_ERROR = "configuration_error" STARTING_WAITING_VALKEY = "starting_waiting_valkey" STARTING_WAITING_SENTINEL = "starting_waiting_sentinel" STARTING_WAITING_REPLICA_SYNC = "starting_waiting_replica_sync" + ERROR_ON_START = "error_on_start" STARTED = "started" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 3cb535f..b8c7817 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -19,7 +19,7 @@ from core.base_workload import WorkloadBase from core.cluster_state import ClusterState from literals import CharmUsers, StartState -from statuses import CharmStatuses, ClusterStatuses, ValkeyServiceStatuses +from statuses import CharmStatuses, StartStatuses logger = logging.getLogger(__name__) @@ -123,32 +123,36 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje match self.state.unit_server.model.start_state: case StartState.NOT_STARTED.value: - if self.state.charm.unit.is_leader(): - status_list.append( - CharmStatuses.SERVICE_NOT_STARTED.value, - ) - elif ( - not self.state.cluster.internal_users_credentials - or not self.state.number_units_started - ): - status_list.append( - ClusterStatuses.WAITING_FOR_PRIMARY_START.value, - ) - else: - status_list.append( - CharmStatuses.WAITING_TO_START.value, 
- ) + status_list.append( + StartStatuses.SERVICE_NOT_STARTED.value, + ) + case StartState.WAITING_TO_START.value: + status_list.append( + StartStatuses.WAITING_TO_START.value, + ) + case StartState.WAITING_FOR_PRIMARY_START.value: + status_list.append( + StartStatuses.WAITING_FOR_PRIMARY_START.value, + ) + case StartState.CONFIGURATION_ERROR.value: + status_list.append( + StartStatuses.CONFIGURATION_ERROR.value, + ) case StartState.STARTING_WAITING_VALKEY.value: status_list.append( - ValkeyServiceStatuses.SERVICE_STARTING.value, + StartStatuses.SERVICE_STARTING.value, ) case StartState.STARTING_WAITING_SENTINEL.value: status_list.append( - ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, + StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, ) case StartState.STARTING_WAITING_REPLICA_SYNC.value: status_list.append( - ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value, + StartStatuses.WAITING_FOR_REPLICA_SYNC.value, + ) + case StartState.ERROR_ON_START.value: + status_list.append( + StartStatuses.ERROR_ON_START.value, ) return status_list or [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/managers/config.py b/src/managers/config.py index 6542033..5c74c80 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -229,7 +229,6 @@ def set_sentinel_config_properties(self, primary_ip: str) -> None: ) full_config_string = f"{other_config_string}\n{sentinel_config_string}" - logger.debug("Full Sentinel config:\n%s", full_config_string) # on k8s we need to set the ownership of the sentinel config file to the non-root user that the valkey process runs as in order for sentinel to be able to read/write it self.workload.write_file( full_config_string, diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 18fff66..04b3cc7 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -102,7 +102,7 @@ def get_primary_ip(self) -> str | None: wait=wait_fixed(5), stop=stop_after_attempt(5), retry=retry_if_result(lambda result: result is 
False), - retry_error_callback=lambda retry_state: False, + retry_error_callback=lambda _: False, ) def is_healthy(self) -> bool: """Check if the sentinel service is healthy.""" diff --git a/src/statuses.py b/src/statuses.py index e42b1f4..213054e 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -18,24 +18,11 @@ class CharmStatuses(Enum): status="active", message="", ) - SERVICE_NOT_STARTED = StatusObject( - status="blocked", - message="Service not started", - ) SECRET_ACCESS_ERROR = StatusObject( status="blocked", message="Cannot access configured secret, check permissions", running="async", ) - WAITING_TO_START = StatusObject( - status="maintenance", - message="Waiting for leader to allow service start", - ) - CONFIGURATION_ERROR = StatusObject( - status="blocked", - message="Configuration error, check logs for details", - running="async", - ) class ClusterStatuses(Enum): @@ -47,6 +34,26 @@ class ClusterStatuses(Enum): running="async", ) + +class StartStatuses(Enum): + """Collection of possible statuses related to the service start.""" + + SERVICE_NOT_STARTED = StatusObject( + status="maintenance", + message="Service not started", + ) + WAITING_TO_START = StatusObject( + status="maintenance", + message="Waiting for leader to allow service start", + ) + CONFIGURATION_ERROR = StatusObject( + status="blocked", + message="Configuration error, check logs for details", + ) + SERVICE_STARTING = StatusObject( + status="maintenance", + message="Waiting for Valkey to start...", + ) WAITING_FOR_SENTINEL_DISCOVERY = StatusObject( status="maintenance", message="Waiting for sentinel to start and be discovered by other units...", @@ -61,18 +68,7 @@ class ClusterStatuses(Enum): status="maintenance", message="Waiting for the primary unit to start...", ) - - -class ValkeyServiceStatuses(Enum): - """Collection of possible Valkey service related statuses.""" - - SERVICE_STARTING = StatusObject( - status="maintenance", - message="Waiting for Valkey to start...", - 
running="async", - ) - SERVICE_NOT_RUNNING = StatusObject( + ERROR_ON_START = StatusObject( status="blocked", - message="Valkey service not running", - running="async", + message="Error occurred during service start, check logs for details", ) diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 31a959a..97f0dac 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -11,7 +11,11 @@ from charmlibs import pathops from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed -from common.exceptions import ValkeyWorkloadCommandError +from common.exceptions import ( + ValkeyServiceNotAliveError, + ValkeyServicesFailedToStartError, + ValkeyWorkloadCommandError, +) from core.base_workload import WorkloadBase from literals import ( ACL_FILE, @@ -86,19 +90,23 @@ def pebble_layer(self) -> ops.pebble.Layer: return ops.pebble.Layer(layer_config) @override - def start(self) -> bool: - self.container.add_layer(CHARM, self.pebble_layer, combine=True) - self.container.restart(self.valkey_service, self.sentinel_service, self.metric_service) - return self.alive() + def start(self) -> None: + try: + self.container.add_layer(CHARM, self.pebble_layer, combine=True) + self.container.restart(self.valkey_service, self.sentinel_service, self.metric_service) + except ops.pebble.ChangeError as e: + raise ValkeyServicesFailedToStartError(f"Failed to start Valkey services: {e}") from e + if not self.alive(): + raise ValkeyServiceNotAliveError("Valkey service is not alive after start.") @override @retry( stop=stop_after_attempt(3), wait=wait_fixed(1), retry=retry_if_result(lambda healthy: not healthy), + retry_error_callback=lambda _: False, ) def alive(self) -> bool: - """Check if the Valkey service is running.""" for service_name in [ self.valkey_service, self.sentinel_service, diff --git a/src/workload_vm.py b/src/workload_vm.py index 1edf192..2c3a043 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -6,6 +6,7 @@ import logging import subprocess +import 
time from typing import List, override from charmlibs import pathops, snap @@ -18,7 +19,11 @@ wait_fixed, ) -from common.exceptions import ValkeyWorkloadCommandError +from common.exceptions import ( + ValkeyServiceNotAliveError, + ValkeyServicesFailedToStartError, + ValkeyWorkloadCommandError, +) from core.base_workload import WorkloadBase from literals import ( SNAP_ACL_FILE, @@ -95,13 +100,19 @@ def install(self, revision: str | None = None, retry_and_raise: bool = True) -> return False @override - def start(self) -> bool: + def start(self) -> None: try: self.valkey.start(services=[SNAP_SERVICE, SNAP_SENTINEL_SERVICE]) - return self.alive() except snap.SnapError as e: logger.exception(str(e)) - return False + raise ValkeyServicesFailedToStartError(f"Failed to start Valkey services: {e}") from e + + # The service might start but fail to load and die immediately + # On k8s starting the services will wait (poll) for them to be started. + # We do the same here to make sure the services are alive after start. 
+ if not self.wait_for_services_to_be_alive(duration=3): + logger.error("Valkey service is not alive after start.") + raise ValkeyServiceNotAliveError("Valkey service is not alive after start.") @override def exec(self, command: List[str]) -> tuple[str, str | None]: @@ -126,12 +137,36 @@ def exec(self, command: List[str]) -> tuple[str, str | None]: stop=stop_after_attempt(3), wait=wait_fixed(1), retry=retry_if_result(lambda healthy: not healthy), + retry_error_callback=lambda _: False, ) def alive(self) -> bool: - """Check if the Valkey service is running.""" try: return bool(self.valkey.services[SNAP_SERVICE]["active"]) and bool( self.valkey.services[SNAP_SENTINEL_SERVICE]["active"] ) except KeyError: return False + + @retry( + stop=stop_after_attempt(3), + wait=wait_fixed(1), + retry=retry_if_result(lambda healthy: not healthy), + retry_error_callback=lambda _: False, + ) + def wait_for_services_to_be_alive(self, duration: float = 30, delay: float = 0.1) -> bool: + """Poll until the Valkey services are alive for at least `duration` seconds. + + Args: + duration (float): The maximum duration to poll for the services to be alive. Default is 30 seconds. + delay (float): The delay between each poll attempt in seconds. Default is 0.1 seconds. + + Returns: + bool: True if the services are alive within the poll duration, False otherwise. 
+ """ + deadline = time.time() + duration + while time.time() < deadline: + if not self.alive(): + return False + + time.sleep(delay) + return True From d4aa771932f5611a560ece7d32e32f30aaec1e45 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Feb 2026 07:26:24 +0000 Subject: [PATCH 090/159] remove unnecessary state and fix unit tests --- src/literals.py | 1 - src/managers/cluster.py | 4 --- tests/unit/test_charm.py | 62 ++++++++++++++++++++++++++-------------- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/src/literals.py b/src/literals.py index 665b182..b2ea675 100644 --- a/src/literals.py +++ b/src/literals.py @@ -76,7 +76,6 @@ class StartState(StrEnum): NOT_STARTED = "not_started" WAITING_TO_START = "waiting_to_start" - WAITING_FOR_PRIMARY_START = "waiting_for_primary_start" CONFIGURATION_ERROR = "configuration_error" STARTING_WAITING_VALKEY = "starting_waiting_valkey" STARTING_WAITING_SENTINEL = "starting_waiting_sentinel" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index b8c7817..ab6bbfd 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -130,10 +130,6 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje status_list.append( StartStatuses.WAITING_TO_START.value, ) - case StartState.WAITING_FOR_PRIMARY_START.value: - status_list.append( - StartStatuses.WAITING_FOR_PRIMARY_START.value, - ) case StartState.CONFIGURATION_ERROR.value: status_list.append( StartStatuses.CONFIGURATION_ERROR.value, diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 4816d92..ef81314 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -9,7 +9,7 @@ import yaml from ops import ActiveStatus, pebble, testing -from common.exceptions import ValkeyWorkloadCommandError +from common.exceptions import ValkeyServiceNotAliveError, ValkeyWorkloadCommandError from src.charm import ValkeyCharm from src.literals import ( INTERNAL_USERS_PASSWORD_CONFIG, @@ -19,7 +19,7 @@ 
CharmUsers, StartState, ) -from src.statuses import CharmStatuses, ClusterStatuses, ValkeyServiceStatuses +from src.statuses import CharmStatuses, ClusterStatuses, StartStatuses from .helpers import status_is @@ -86,25 +86,25 @@ def test_start_leader_unit(cloud_spec): state_in = ctx.run(ctx.on.leader_elected(), state_in) # start event - with patch("common.client.ValkeyClient.ping", return_value=False): - state_out = ctx.run(ctx.on.start(), state_in) - assert state_out.get_container(container.name).plan == expected_plan - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] - == pebble.ServiceStatus.ACTIVE - ) - assert ( - state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] - == pebble.ServiceStatus.ACTIVE - ) - assert status_is(state_out, ValkeyServiceStatuses.SERVICE_STARTING.value) + state_out = ctx.run(ctx.on.start(), state_in) + assert state_out.get_container(container.name).plan == expected_plan + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_VALKEY] + == pebble.ServiceStatus.ACTIVE + ) + assert ( + state_out.get_container(container.name).service_statuses[SERVICE_METRIC_EXPORTER] + == pebble.ServiceStatus.ACTIVE + ) + assert status_is(state_out, StartStatuses.SERVICE_STARTING.value) + with ( patch("common.client.ValkeyClient.ping", return_value=True), patch("common.client.ValkeyClient.get_persistence_info", return_value={"loading": "0"}), patch("common.client.ValkeyClient.set_value", return_value=True), ): state_out = ctx.run(ctx.on.start(), state_out) - assert status_is(state_out, ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) + assert status_is(state_out, StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) with ( patch("common.client.ValkeyClient.ping", return_value=True), @@ -116,6 +116,24 @@ def test_start_leader_unit(cloud_spec): assert state_out.unit_status == ActiveStatus() assert state_out.app_status == ActiveStatus() + with ( + patch( + 
"managers.config.ConfigManager.set_config_properties", + side_effect=ValkeyWorkloadCommandError, + ), + ): + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, StartStatuses.CONFIGURATION_ERROR.value) + + with ( + patch( + "workload_k8s.ValkeyK8sWorkload.start", + side_effect=ValkeyServiceNotAliveError, + ), + ): + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, StartStatuses.ERROR_ON_START.value) + # container not ready container = testing.Container(name=CONTAINER, can_connect=False) state_in = testing.State( @@ -126,8 +144,8 @@ def test_start_leader_unit(cloud_spec): ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value) - assert status_is(state_out, CharmStatuses.SERVICE_NOT_STARTED.value, is_app=True) + assert status_is(state_out, StartStatuses.SERVICE_NOT_STARTED.value) + assert status_is(state_out, StartStatuses.SERVICE_NOT_STARTED.value, is_app=True) def test_start_non_leader_unit(cloud_spec): @@ -161,7 +179,7 @@ def test_start_non_leader_unit(cloud_spec): ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, ClusterStatuses.WAITING_FOR_PRIMARY_START.value) + assert status_is(state_out, StartStatuses.WAITING_TO_START.value) relation = testing.PeerRelation( id=1, @@ -178,7 +196,7 @@ def test_start_non_leader_unit(cloud_spec): ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, CharmStatuses.WAITING_TO_START.value) + assert status_is(state_out, StartStatuses.WAITING_TO_START.value) # health check with patch("common.client.ValkeyClient.is_replica_synced", return_value=False): @@ -196,7 +214,7 @@ def test_start_non_leader_unit(cloud_spec): containers={container}, ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, ValkeyServiceStatuses.SERVICE_STARTING.value) + assert status_is(state_out, StartStatuses.SERVICE_STARTING.value) # replica syncing with ( @@ -218,7 +236,7 @@ 
def test_start_non_leader_unit(cloud_spec): containers={container}, ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, ClusterStatuses.WAITING_FOR_REPLICA_SYNC.value) + assert status_is(state_out, StartStatuses.WAITING_FOR_REPLICA_SYNC.value) # sentinel not yet discovered with ( @@ -240,7 +258,7 @@ def test_start_non_leader_unit(cloud_spec): containers={container}, ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, ClusterStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) + assert status_is(state_out, StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) # Happy path with sentinel discovered and replica synced with ( From 3beea80d648665c0d9a274e34a9296d77183d656 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Feb 2026 09:22:59 +0000 Subject: [PATCH 091/159] only leader starts priomary if num of units is 0 --- src/events/base_events.py | 17 ++++++++++++++++- src/literals.py | 1 + src/managers/cluster.py | 4 ++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 7f06c7d..e033fe6 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -119,7 +119,19 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return - primary_ip = self.charm.sentinel_manager.get_primary_ip() or self.charm.state.bind_address + primary_ip = self.charm.sentinel_manager.get_primary_ip() + if not primary_ip: + if self.charm.state.number_units_started == 0 and self.charm.unit.is_leader(): + primary_ip = self.charm.state.bind_address + else: + logger.debug( + "Primary IP not available yet or other units have already started, deferring start event until leader starts the primary" + ) + self.charm.state.unit_server.update( + {"start_state": StartState.WAITING_FOR_PRIMARY_START.value} + ) + event.defer() + return try: self._configure_services(primary_ip) @@ -244,6 +256,9 @@ def _process_lock_requests(self) -> None: 
self.charm.state.cluster.update( {"starting_member": units_requesting_start[0] if units_requesting_start else ""} ) + logger.debug( + f"Updated starting member to {units_requesting_start[0] if units_requesting_start else ''}" + ) def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: """Handle the update-status event.""" diff --git a/src/literals.py b/src/literals.py index b2ea675..665b182 100644 --- a/src/literals.py +++ b/src/literals.py @@ -76,6 +76,7 @@ class StartState(StrEnum): NOT_STARTED = "not_started" WAITING_TO_START = "waiting_to_start" + WAITING_FOR_PRIMARY_START = "waiting_for_primary_start" CONFIGURATION_ERROR = "configuration_error" STARTING_WAITING_VALKEY = "starting_waiting_valkey" STARTING_WAITING_SENTINEL = "starting_waiting_sentinel" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index ab6bbfd..e1b13cb 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -126,6 +126,10 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje status_list.append( StartStatuses.SERVICE_NOT_STARTED.value, ) + case StartState.WAITING_FOR_PRIMARY_START.value: + status_list.append( + StartStatuses.WAITING_FOR_PRIMARY_START.value, + ) case StartState.WAITING_TO_START.value: status_list.append( StartStatuses.WAITING_TO_START.value, From eeddaadab188ea20448bb6795c69e5c031acd978 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Feb 2026 11:47:27 +0000 Subject: [PATCH 092/159] clean the cases where primary ip is None and set a blocked status if there are started flags but no primary ip --- src/events/base_events.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index e033fe6..8dd5ed7 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -119,16 +119,18 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return - primary_ip = self.charm.sentinel_manager.get_primary_ip() - 
if not primary_ip: - if self.charm.state.number_units_started == 0 and self.charm.unit.is_leader(): + if (primary_ip := self.charm.sentinel_manager.get_primary_ip()) is None: + if self.charm.state.number_units_started == 0: + logger.debug( + "No primary discovered, but this is the first unit starting, proceeding with start." + ) primary_ip = self.charm.state.bind_address else: - logger.debug( - "Primary IP not available yet or other units have already started, deferring start event until leader starts the primary" + logger.error( + "Cannot get primary IP address from sentinel but there are already units started." ) self.charm.state.unit_server.update( - {"start_state": StartState.WAITING_FOR_PRIMARY_START.value} + {"start_state": StartState.ERROR_ON_START.value} ) event.defer() return From f2e80b06104de0a1ac1ff591f27653105f7bd264 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Feb 2026 11:55:34 +0000 Subject: [PATCH 093/159] extend unit test coverage and rename unit tests to reflect business logic --- tests/unit/test_charm.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index ef81314..0a781d3 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -39,7 +39,7 @@ ) -def test_start_leader_unit(cloud_spec): +def test_start_primary(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) @@ -148,7 +148,32 @@ def test_start_leader_unit(cloud_spec): assert status_is(state_out, StartStatuses.SERVICE_NOT_STARTED.value, is_app=True) -def test_start_non_leader_unit(cloud_spec): +def test_start_primary_started_flag_set(cloud_spec): + + ctx = testing.Context(ValkeyCharm, app_trusted=True) + # no primary but started flag set + relation = testing.PeerRelation( + id=1, endpoint=PEER_RELATION, 
peers_data={1: {"start-state": "started"}} + ) + status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) + + # happy path + container = testing.Container(name=CONTAINER, can_connect=True) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=True, + relations={relation, status_peer_relation}, + containers={container}, + ) + + # generate passwords + state_out = ctx.run(ctx.on.leader_elected(), state_in) + # start event + state_out = ctx.run(ctx.on.start(), state_out) + assert status_is(state_out, StartStatuses.ERROR_ON_START.value) + + +def test_start_non_primary(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) From 68b89a4b13a15e65009f8a77b376b2e2da10639c Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Feb 2026 15:10:40 +0000 Subject: [PATCH 094/159] leader has to start primary because non leaders might not see all units in peer relation --- src/events/base_events.py | 14 ++++++-------- tests/unit/test_charm.py | 25 ------------------------- 2 files changed, 6 insertions(+), 33 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 8dd5ed7..e033fe6 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -119,18 +119,16 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return - if (primary_ip := self.charm.sentinel_manager.get_primary_ip()) is None: - if self.charm.state.number_units_started == 0: - logger.debug( - "No primary discovered, but this is the first unit starting, proceeding with start." 
- ) + primary_ip = self.charm.sentinel_manager.get_primary_ip() + if not primary_ip: + if self.charm.state.number_units_started == 0 and self.charm.unit.is_leader(): primary_ip = self.charm.state.bind_address else: - logger.error( - "Cannot get primary IP address from sentinel but there are already units started." + logger.debug( + "Primary IP not available yet or other units have already started, deferring start event until leader starts the primary" ) self.charm.state.unit_server.update( - {"start_state": StartState.ERROR_ON_START.value} + {"start_state": StartState.WAITING_FOR_PRIMARY_START.value} ) event.defer() return diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 0a781d3..35b4275 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -148,31 +148,6 @@ def test_start_primary(cloud_spec): assert status_is(state_out, StartStatuses.SERVICE_NOT_STARTED.value, is_app=True) -def test_start_primary_started_flag_set(cloud_spec): - - ctx = testing.Context(ValkeyCharm, app_trusted=True) - # no primary but started flag set - relation = testing.PeerRelation( - id=1, endpoint=PEER_RELATION, peers_data={1: {"start-state": "started"}} - ) - status_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) - - # happy path - container = testing.Container(name=CONTAINER, can_connect=True) - state_in = testing.State( - model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), - leader=True, - relations={relation, status_peer_relation}, - containers={container}, - ) - - # generate passwords - state_out = ctx.run(ctx.on.leader_elected(), state_in) - # start event - state_out = ctx.run(ctx.on.start(), state_out) - assert status_is(state_out, StartStatuses.ERROR_ON_START.value) - - def test_start_non_primary(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) From 250e39bd1d684a3cf86ff53b37e0c3ee734bb95a Mon Sep 17 00:00:00 2001 
From: Smail Kourta Date: Thu, 19 Feb 2026 03:29:11 +0000 Subject: [PATCH 095/159] add running status for better UX --- src/events/base_events.py | 9 ++++++--- src/managers/cluster.py | 4 +--- src/statuses.py | 3 ++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index e033fe6..9d94aa6 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -27,7 +27,7 @@ StartState, Substrate, ) -from statuses import CharmStatuses, ClusterStatuses +from statuses import CharmStatuses, ClusterStatuses, StartStatuses if TYPE_CHECKING: from charm import ValkeyCharm @@ -150,8 +150,11 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return - self.charm.state.unit_server.update( - {"start_state": StartState.STARTING_WAITING_VALKEY.value} + self.charm.status.set_running_status( + StartStatuses.SERVICE_STARTING.value, + scope="unit", + statuses_state=self.charm.state.statuses, + component_name=self.charm.cluster_manager.name, ) self.unit_fully_started.emit(is_primary=primary_ip == self.charm.state.bind_address) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index e1b13cb..5076ff0 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -113,9 +113,7 @@ def is_healthy(self, is_primary: bool = False, check_replica_sync: bool = True) def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" - status_list: list[StatusObject] = self.state.statuses.get( - scope=scope, component=self.name, running_status_only=True, running_status_type="async" - ).root + status_list: list[StatusObject] = [] # Peer relation not established yet, or model not built yet for unit or app if not self.state.cluster.model or not self.state.unit_server.model: diff --git a/src/statuses.py b/src/statuses.py index 213054e..f0a677b 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -53,6 +53,7 @@ class 
StartStatuses(Enum): SERVICE_STARTING = StatusObject( status="maintenance", message="Waiting for Valkey to start...", + running="async", ) WAITING_FOR_SENTINEL_DISCOVERY = StatusObject( status="maintenance", @@ -66,7 +67,7 @@ class StartStatuses(Enum): WAITING_FOR_PRIMARY_START = StatusObject( status="maintenance", - message="Waiting for the primary unit to start...", + message="Waiting to discover the primary unit...", ) ERROR_ON_START = StatusObject( status="blocked", From 147240f1e09b22e6e5929e263742941be52aa220 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 19 Feb 2026 10:55:11 +0000 Subject: [PATCH 096/159] move to glide and wrap client requests in helpers --- poetry.lock | 120 ++++++++- pyproject.toml | 4 +- tests/integration/continuous_writes.py | 294 ++++++++++++++--------- tests/integration/cw_helpers.py | 58 ++--- tests/integration/helpers.py | 245 +++++++++++++------ tests/integration/k8s/ha/test_scaling.py | 39 ++- tests/integration/k8s/test_charm.py | 152 +++++++----- tests/integration/vm/ha/test_scaling.py | 52 ++-- tests/integration/vm/test_charm.py | 152 +++++++----- 9 files changed, 697 insertions(+), 419 deletions(-) diff --git a/poetry.lock b/poetry.lock index d1bf741..7960d81 100644 --- a/poetry.lock +++ b/poetry.lock @@ -60,6 +60,25 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] +[[package]] +name = "anyio" +version = "4.12.1" +description = "High-level concurrency and networking framework on top of asyncio or Trio" +optional = false +python-versions = ">=3.9" +groups = ["integration"] +files = [ + {file = "anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c"}, + {file = "anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703"}, +] + +[package.dependencies] +idna = ">=2.8" +typing_extensions = {version = ">=4.5", markers = "python_version < 
\"3.13\""} + +[package.extras] +trio = ["trio (>=0.31.0) ; python_version < \"3.10\"", "trio (>=0.32.0) ; python_version >= \"3.10\""] + [[package]] name = "attrs" version = "25.4.0" @@ -259,6 +278,21 @@ rich = "*" all = ["pytest_operator (==0.36.0)"] tests = ["pytest_operator (==0.36.0)"] +[[package]] +name = "idna" +version = "3.11" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.8" +groups = ["integration"] +files = [ + {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, + {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + [[package]] name = "importlib-metadata" version = "8.7.1" @@ -443,6 +477,26 @@ files = [ dev = ["pre-commit", "tox"] testing = ["coverage", "pytest", "pytest-benchmark"] +[[package]] +name = "protobuf" +version = "6.33.5" +description = "" +optional = false +python-versions = ">=3.9" +groups = ["integration"] +files = [ + {file = "protobuf-6.33.5-cp310-abi3-win32.whl", hash = "sha256:d71b040839446bac0f4d162e758bea99c8251161dae9d0983a3b88dee345153b"}, + {file = "protobuf-6.33.5-cp310-abi3-win_amd64.whl", hash = "sha256:3093804752167bcab3998bec9f1048baae6e29505adaf1afd14a37bddede533c"}, + {file = "protobuf-6.33.5-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:a5cb85982d95d906df1e2210e58f8e4f1e3cdc088e52c921a041f9c9a0386de5"}, + {file = "protobuf-6.33.5-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:9b71e0281f36f179d00cbcb119cb19dec4d14a81393e5ea220f64b286173e190"}, + {file = "protobuf-6.33.5-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:8afa18e1d6d20af15b417e728e9f60f3aa108ee76f23c3b2c07a2c3b546d3afd"}, + {file = "protobuf-6.33.5-cp39-abi3-manylinux2014_x86_64.whl", hash = 
"sha256:cbf16ba3350fb7b889fca858fb215967792dc125b35c7976ca4818bee3521cf0"}, + {file = "protobuf-6.33.5-cp39-cp39-win32.whl", hash = "sha256:a3157e62729aafb8df6da2c03aa5c0937c7266c626ce11a278b6eb7963c4e37c"}, + {file = "protobuf-6.33.5-cp39-cp39-win_amd64.whl", hash = "sha256:8f04fa32763dcdb4973d537d6b54e615cc61108c7cb38fe59310c3192d29510a"}, + {file = "protobuf-6.33.5-py3-none-any.whl", hash = "sha256:69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02"}, + {file = "protobuf-6.33.5.tar.gz", hash = "sha256:6ddcac2a081f8b7b9642c09406bc6a4290128fce5f471cddd165960bb9119e5c"}, +] + [[package]] name = "pydantic" version = "2.12.5" @@ -846,6 +900,18 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +groups = ["integration"] +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + [[package]] name = "tenacity" version = "9.1.2" @@ -890,20 +956,56 @@ files = [ typing-extensions = ">=4.12.0" [[package]] -name = "valkey" -version = "6.1.1" -description = "Python client for Valkey forked from redis-py" +name = "valkey-glide" +version = "2.2.7" +description = "Valkey GLIDE Async client. Supports Valkey and Redis OSS." 
optional = false python-versions = ">=3.9" groups = ["integration"] files = [ - {file = "valkey-6.1.1-py3-none-any.whl", hash = "sha256:e2691541c6e1503b53c714ad9a35551ac9b7c0bbac93865f063dbc859a46de92"}, - {file = "valkey-6.1.1.tar.gz", hash = "sha256:5880792990c6c2b5eb604a5ed5f98f300880b6dd92d123819b66ed54bb259731"}, + {file = "valkey_glide-2.2.7-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:701b6ee036a54598ba63d7e6ecdee8f6ddd5b460cef67491f29414447deb7407"}, + {file = "valkey_glide-2.2.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:556dd3a906f61ff2d53f540fa782eee5c67a2048ed434f87089bb4f62cbd2564"}, + {file = "valkey_glide-2.2.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6622536445b7c78ae3f0f497ae449efac6a627f7c607b92c9ef934c5dd046c4b"}, + {file = "valkey_glide-2.2.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9bd390f66dc324ce3e937a6ac7592bfbd4e6cf9eb5d4c28838fc766645f149b"}, + {file = "valkey_glide-2.2.7-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:e39a1db18d08f5a9995d87158b070af1a625a612dc7e57e27a9becee40f6144c"}, + {file = "valkey_glide-2.2.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:180aa1ee0cdfbcf34ae7322838fd063a720a6dae9e97a8e9462b8a12b1f65138"}, + {file = "valkey_glide-2.2.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44a9a6e85e8320220604468c35e0a84bea392dddbab2dcdf9cce9ece01b4a041"}, + {file = "valkey_glide-2.2.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7905c5f3efb67058c5f52b7906aa2d114288eff4aa76a5379107b312af6b8ec8"}, + {file = "valkey_glide-2.2.7-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:4db4ff570c0a63cc8a4551b780dd00069d61c8841a6e6eeaf2dda05d89ec0221"}, + {file = "valkey_glide-2.2.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:05f5ebe701f18b22d331a12af120e1250927391665b66fd78c273d563b2523c6"}, + {file = 
"valkey_glide-2.2.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca7aab86a175c678bb0573db29050d49d692adcf87c7dd01e2ff9da94bdac68f"}, + {file = "valkey_glide-2.2.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c81c7cb8bbae7a75c3efcfe9b05ebd97db6f332128606e5464e518ba5a7b8e02"}, + {file = "valkey_glide-2.2.7-cp313-cp313-macosx_10_7_x86_64.whl", hash = "sha256:1d40da535a77ce318367ac255b1d5de95cf0ca669b8cac79a158f678feed9fb3"}, + {file = "valkey_glide-2.2.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0f435ed9c14d7de72df04322300034931aba528d1183770b2f7624dd8fc18d7c"}, + {file = "valkey_glide-2.2.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6cad26daa0775ab6dd7ad5a1d8300c4b97ed4b39401c1f130200456f9f9b5234"}, + {file = "valkey_glide-2.2.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:340a9bdf31e811121e9ea7d95cb75161125c78690334581d4be08aae9c824f29"}, + {file = "valkey_glide-2.2.7-cp314-cp314-macosx_10_7_x86_64.whl", hash = "sha256:085c81403600555a7672cf45d68f2c786d1fac12d5759d8e6e3a3f7d5a79d8b7"}, + {file = "valkey_glide-2.2.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d2470a704f463600a0c12000b48adbcc888210be38fbb39fd33c7f36fe84bd66"}, + {file = "valkey_glide-2.2.7-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d1985f7c579c7b37bf7fc42125b141295dded29257d7b811d318bb5343343c8"}, + {file = "valkey_glide-2.2.7-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba5149f2019164024958778e5b314f05dc61187731e2c23411498cb884a9181b"}, + {file = "valkey_glide-2.2.7-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:20b586d2702a71cd90bb7c85380155f92585129f9534396450e2a64896e5b00c"}, + {file = "valkey_glide-2.2.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8316673b56632ae92b4cf22a990b8fc510fe87cbb29d3aac242496cf7a44d96c"}, + {file = 
"valkey_glide-2.2.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d3c5ae19adbb299c212c0011c1934ea3769b1dc364126a6fb5b443842678c2ec"}, + {file = "valkey_glide-2.2.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec85da03bd00402df90152c5e647cade29c0e539311839c844e135e945f84dbc"}, + {file = "valkey_glide-2.2.7-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:47949c900e08de0e64fb5b59abfa069e09a62a9a4db2ba6756ca3a6b440f012a"}, + {file = "valkey_glide-2.2.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8d6ba5b86d8910545dcd8429807780bae705def558ce38ca8f2a10ee13aa7021"}, + {file = "valkey_glide-2.2.7-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ce02ce683b42687b72fc21a70b7dfe3597c79cb1594c6e707b464fa37e8f3a3"}, + {file = "valkey_glide-2.2.7-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b02900b8e6ea539a5a158c0a74e63d92043b4487dd43f33cc1b0bb03a0aeac0"}, + {file = "valkey_glide-2.2.7-pp311-pypy311_pp73-macosx_10_7_x86_64.whl", hash = "sha256:33e6a21430580499943f29d30c3d74bc9b53f421bb76ea190e43cead428fc832"}, + {file = "valkey_glide-2.2.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c2eff5bf9e30bb2e2efb4bad09ecf2568a7ca722e39b37f8a10d5244a512b3a"}, + {file = "valkey_glide-2.2.7-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:589e52f909bc7e7736e35af6e4b3d91e7dfcbf26b3bf13fca79668ad633d9ed4"}, + {file = "valkey_glide-2.2.7-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b8c9beaff220439b10906e8b84c5a141d4b6515ea28db38f076191777e26c05"}, + {file = "valkey_glide-2.2.7-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:1e353efd6b7d6b511be246e0376be0176869b2a7bde4ba7c4d8d0e25c3bda07b"}, + {file = "valkey_glide-2.2.7-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:ba90316717570f550ffbacdad36bc023ca404468c35c997f2ee4bbd8b1cbb634"}, 
+ {file = "valkey_glide-2.2.7-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:170ab03fa9fb958bb1c9ed467a4e173444d7b23886d5be01b8719d7c4d8ced8d"}, + {file = "valkey_glide-2.2.7-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6b1ad67ff44d23850713c10191a701c19b8bd4d800ca3ef1a442267563ad92f"}, + {file = "valkey_glide-2.2.7.tar.gz", hash = "sha256:2cd05b8c871c7878cb89679ac34f294f100481b64f79d797cde325a1d051cdc9"}, ] -[package.extras] -libvalkey = ["libvalkey (>=4.0.1)"] -ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==23.2.1)", "requests (>=2.31.0)"] +[package.dependencies] +anyio = ">=4.9.0" +protobuf = ">=6.20" +sniffio = "*" [[package]] name = "websocket-client" @@ -945,4 +1047,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "6710246ac0750c8538cb34d54f3465ad67023241c3cc2af36836b9f0a4d11354" +content-hash = "032d9f2c93fef6791d3a007057822223681c365d153fb1cd2573b2fa34bfd2f7" diff --git a/pyproject.toml b/pyproject.toml index f5441fe..ee849b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,8 +12,6 @@ charmlibs-pathops = "^1.2.0" charmlibs-snap = "^1.0.1" tenacity = "*" data-platform-helpers = ">=0.1.7" -# TODO replace with official release once build from source is possible -# https://github.com/valkey-io/valkey-glide/pull/5202 [tool.poetry.requires-plugins] poetry-plugin-export = ">=1.8" @@ -52,8 +50,8 @@ allure-pytest-default-results = "^0.1.2" data-platform-helpers = ">=0.1.7" jubilant = "^1.6.0" python-dateutil = "*" -valkey = "^6.1.1" tenacity = "^9.1.2" +valkey-glide = "^2.2.7" [tool.coverage.run] branch = true diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index ea1ae44..d0ea9fb 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -2,22 +2,21 @@ # Copyright 2026 Canonical Ltd. # See LICENSE file for licensing details. 
+import asyncio import logging -import os -import time -from contextlib import contextmanager -from multiprocessing import Event, Process, Queue, log_to_stderr +import multiprocessing +import queue +from contextlib import asynccontextmanager +from multiprocessing import log_to_stderr +from pathlib import Path from types import SimpleNamespace -from typing import Generator +from typing import Optional import jubilant -import valkey +from glide import GlideClient, GlideClientConfiguration, NodeAddress, ServerCredentials from tenacity import ( - RetryError, - Retrying, retry, stop_after_attempt, - stop_after_delay, wait_fixed, wait_random, ) @@ -33,19 +32,14 @@ class WriteFailedError(Exception): class ContinuousWrites: - """Utility class for managing continuous writes to Valkey.""" + """Utility class for managing continuous async writes to Valkey using GLIDE.""" KEY = "cw_key" LAST_WRITTEN_VAL_PATH = "last_written_value" - SENTINEL_PORT = 26379 + VALKEY_PORT = 6379 def __init__( - self, - juju: jubilant.Juju, - app: str, - initial_count: int = 0, - log_written_values: bool = False, - in_between_sleep: float = 1, + self, juju: jubilant.Juju, app: str, initial_count: int = 0, in_between_sleep: float = 1.0 ): self._juju = juju self._app = app @@ -54,60 +48,56 @@ def __init__( self._queue = None self._process = None self._initial_count = initial_count - self._log_written_values = log_written_values self._in_between_sleep = in_between_sleep + self._mp_ctx = multiprocessing.get_context("spawn") def _get_config(self) -> SimpleNamespace: """Fetch current cluster configuration from Juju.""" return SimpleNamespace( endpoints=",".join(get_cluster_hostnames(self._juju, app_name=self._app)), valkey_password=get_password(self._juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_password=get_password(self._juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), ) - @contextmanager - def _get_client(self) -> Generator[valkey.Valkey, None, None]: - """Context manager to provide a master client and 
ensure cleanup.""" - conf = self._get_config() - sentinel = valkey.Sentinel( - [(host, self.SENTINEL_PORT) for host in conf.endpoints.split(",")], - username=CharmUsers.VALKEY_ADMIN.value, - password=conf.valkey_password, - sentinel_kwargs={ - "password": conf.sentinel_password, - "username": CharmUsers.SENTINEL_CHARM_ADMIN.value, - }, + async def _create_glide_client(self, config: Optional[SimpleNamespace] = None) -> GlideClient: + """Asynchronously create and return a configured GlideClient.""" + conf = config or self._get_config() + addresses = [NodeAddress(host, self.VALKEY_PORT) for host in conf.endpoints.split(",")] + + credentials = ServerCredentials( + username=CharmUsers.VALKEY_ADMIN.value, password=conf.valkey_password ) - master = sentinel.master_for("primary") - try: - yield master - finally: - # Valkey clients use connection pools, but we ensure logical separation - master.close() + + glide_config = GlideClientConfiguration( + addresses=addresses, + client_name="continuous_writes_client", + request_timeout=5000, + credentials=credentials, + ) + + return await GlideClient.create(glide_config) @retry(wait=wait_fixed(5) + wait_random(0, 5), stop=stop_after_attempt(5)) def start(self) -> None: """Run continuous writes in the background.""" if not self._is_stopped: - self.stop() + self.clear() self._is_stopped = False - self._event = Event() - self._queue = Queue() + # Create primitives using the spawn context + self._event = self._mp_ctx.Event() + self._queue = self._mp_ctx.Queue() + + last_written_file = Path(self.LAST_WRITTEN_VAL_PATH) + if not last_written_file.exists(): + last_written_file.write_text(str(self._initial_count)) - self._process = Process( - target=self._run_wrapper, + self._process = self._mp_ctx.Process( + target=self._run_process, name="continuous_writes", - args=( - self._event, - self._queue, - self._initial_count, - self._log_written_values, - self._in_between_sleep, - ), + args=(self._event, self._queue, self._initial_count, 
self._in_between_sleep), ) - self.update() # Load initial config into queue + self.update() self._process.start() def update(self) -> None: @@ -122,24 +112,56 @@ def clear(self) -> SimpleNamespace | None: if not self._is_stopped: result = self.stop() - with self._get_client() as client: - client.delete(self.KEY) + asyncio.run(self._async_delete()) - if os.path.exists(self.LAST_WRITTEN_VAL_PATH): - os.remove(self.LAST_WRITTEN_VAL_PATH) + last_written_file = Path(self.LAST_WRITTEN_VAL_PATH) + if last_written_file.exists(): + last_written_file.unlink() + return result + + @retry(wait=wait_fixed(5) + wait_random(0, 5), stop=stop_after_attempt(5)) + async def async_clear(self) -> SimpleNamespace | None: + """Stop writes and delete the tracking key/file.""" + result = None + if not self._is_stopped: + result = await self.async_stop() + await self._async_delete() + + last_written_file = Path(self.LAST_WRITTEN_VAL_PATH) + if last_written_file.exists(): + last_written_file.unlink() return result + async def _async_delete(self) -> None: + client = await self._create_glide_client() + try: + await client.delete([self.KEY]) + finally: + await client.close() + def count(self) -> int: """Return number of items in the list.""" - with self._get_client() as client: - return client.llen(self.KEY) + return asyncio.run(self._async_count()) + + async def _async_count(self) -> int: + client = await self._create_glide_client() + try: + return await client.llen(self.KEY) + finally: + await client.close() def max_stored_id(self) -> int: """Return the most recently inserted ID (top of list).""" - with self._get_client() as client: - val = client.lindex(self.KEY, 0) - return int(val) if val else 0 + return asyncio.run(self._async_max_stored_id()) + + async def _async_max_stored_id(self) -> int: + client = await self._create_glide_client() + try: + val = await client.lindex(self.KEY, 0) + return int(val.decode()) if val else 0 + finally: + await client.close() @retry(wait=wait_fixed(5) + 
wait_random(0, 5), stop=stop_after_attempt(5)) def stop(self) -> SimpleNamespace: @@ -153,91 +175,125 @@ def stop(self) -> SimpleNamespace: result = SimpleNamespace() result.max_stored_id = self.max_stored_id() result.count = self.count() + result.last_expected_id = int(Path(self.LAST_WRITTEN_VAL_PATH).read_text().strip()) - # Retrieve the last ID the worker attempted to write - try: - for attempt in Retrying(stop=stop_after_delay(10), wait=wait_fixed(2)): - with attempt: - with open(self.LAST_WRITTEN_VAL_PATH, "r") as f: - result.last_expected_id = int(f.read().strip()) - except (RetryError, FileNotFoundError, ValueError): - result.last_expected_id = -1 + return result + + @retry(wait=wait_fixed(5) + wait_random(0, 5), stop=stop_after_attempt(5)) + async def async_stop(self) -> SimpleNamespace: + """Stop the background process and return summary statistics.""" + if not self._is_stopped and self._process: + self._event.set() + self._process.join(timeout=30) + self._process.terminate() + self._is_stopped = True + + result = SimpleNamespace() + result.max_stored_id = await self._async_max_stored_id() + result.count = await self._async_count() + result.last_expected_id = int(Path(self.LAST_WRITTEN_VAL_PATH).read_text().strip()) return result @staticmethod - def _run_wrapper( - event: Event, - data_queue: Queue, - starting_number: int, - log_written_values: bool = False, - in_between_sleep: float = 1, - ) -> None: - """Entry point for the Process; simplified without unnecessary asyncio.""" + def _run_process(event, data_queue, starting_number: int, in_between_sleep: float): + """Start synchronously the asyncio event loop.""" proc_logger = log_to_stderr() proc_logger.setLevel(logging.INFO) - def _make_client(conf): - s = valkey.Sentinel( - [(h, ContinuousWrites.SENTINEL_PORT) for h in conf.endpoints.split(",")], + # FIX 2: Do the blocking read synchronously BEFORE starting the async loop + initial_config = data_queue.get(block=True) + + asyncio.run( + 
ContinuousWrites._async_run( + event, data_queue, starting_number, initial_config, in_between_sleep, proc_logger + ) + ) + + @staticmethod + async def _async_run( + event, + data_queue, + starting_number: int, + initial_config: SimpleNamespace, + in_between_sleep: float, + proc_logger: logging.Logger, + ): + """Async loop for writing data continuously.""" + + async def _make_client(conf: SimpleNamespace) -> GlideClient: + addresses = [ + NodeAddress(host, ContinuousWrites.VALKEY_PORT) + for host in conf.endpoints.split(",") + ] + credentials = ServerCredentials( username=CharmUsers.VALKEY_ADMIN.value, password=conf.valkey_password, - sentinel_kwargs={ - "password": conf.sentinel_password, - "username": CharmUsers.SENTINEL_CHARM_ADMIN.value, - }, ) - return s.master_for("primary") + glide_config = GlideClientConfiguration( + addresses=addresses, + client_name="continuous_writes_worker", + request_timeout=5000, + credentials=credentials, + ) + return await GlideClient.create(glide_config) + + @asynccontextmanager + async def with_client(conf: SimpleNamespace): + client = await _make_client(conf) + try: + yield client + finally: + await client.close() current_val = starting_number - config = data_queue.get(block=True) - client = _make_client(config) + config = initial_config + # client = await _make_client(config) - proc_logger.info(f"Starting continuous writes from {current_val}") + proc_logger.info(f"Starting continuous async writes from {current_val}") try: while not event.is_set(): - # Check for config updates (e.g. 
cluster scaling) - if not data_queue.empty(): - config = data_queue.get(block=False) - client = _make_client(config) + try: + config = data_queue.get_nowait() + # await client.close() + # client = await _make_client(config) + proc_logger.info("Configuration updated, client reconnected.") + except queue.Empty: + pass try: - # note LPUSH returns the length of the list after the push - if client.lpush(ContinuousWrites.KEY, current_val): - if log_written_values: - proc_logger.info(f"Wrote value: {current_val}") - current_val += 1 - # Throttle to avoid flooding small test runners - time.sleep(in_between_sleep) - else: - raise WriteFailedError("LPUSH returned 0/None") + proc_logger.info(f"Writing value: {current_val}") + async with with_client(config) as client: + if not ( + res := await asyncio.wait_for( + client.lpush(ContinuousWrites.KEY, [str(current_val)]), timeout=5 + ) + ): + raise WriteFailedError("LPUSH returned 0/None") + proc_logger.info(f"Length after write: {res}") + await asyncio.sleep(in_between_sleep) except Exception as e: proc_logger.warning(f"Write failed at {current_val}: {e}") - time.sleep(2) - continue + finally: + if event.is_set(): + break + + current_val += 1 + finally: - # Persistent where we stopped - with open(ContinuousWrites.LAST_WRITTEN_VAL_PATH, "w") as f: - f.write(str(current_val - 1)) - os.fsync(f) + Path(ContinuousWrites.LAST_WRITTEN_VAL_PATH).write_text(str(current_val)) + proc_logger.info("Continuous writes process exiting.") if __name__ == "__main__": - # Example usage + import jubilant + juju_env = jubilant.Juju(model="testing") - cw = ContinuousWrites( - juju=juju_env, - app="valkey", - initial_count=100, - log_written_values=True, - in_between_sleep=1, - ) + cw = ContinuousWrites(juju=juju_env, app="valkey", in_between_sleep=0.5) cw.clear() cw.start() - # continue until manually stopped by ctrl+c or by calling cw.stop() from another process - try: - while True: - time.sleep(1) - except KeyboardInterrupt: - print(f"Stats: 
{cw.clear()}") + print("Continuous writes started. Press Enter to stop...") + input() + stats = cw.clear() + print(f"Stopped. Stats: {stats}") diff --git a/tests/integration/cw_helpers.py b/tests/integration/cw_helpers.py index 1b068d4..022c0b1 100644 --- a/tests/integration/cw_helpers.py +++ b/tests/integration/cw_helpers.py @@ -2,15 +2,13 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. +import asyncio import logging import subprocess -import time +from pathlib import Path -import valkey -from tenacity import Retrying, stop_after_attempt, wait_fixed - -from literals import CLIENT_PORT, SENTINEL_PORT from tests.integration.continuous_writes import ContinuousWrites +from tests.integration.helpers import create_valkey_client, exec_valkey_cli logger = logging.getLogger(__name__) @@ -48,53 +46,39 @@ def stop_continuous_writes() -> None: proc.communicate() -def assert_continuous_writes_increasing( - endpoints: str, - valkey_user: str, - valkey_password: str, - sentinel_user: str, - sentinel_password: str, +async def assert_continuous_writes_increasing( + hostnames: list[str], + username: str, + password: str, ) -> None: """Assert that the continuous writes are increasing.""" - client = valkey.Sentinel( - [(host, SENTINEL_PORT) for host in endpoints.split(",")], - username=valkey_user, - password=valkey_password, - sentinel_kwargs={"password": sentinel_password, "username": sentinel_user}, + client = await create_valkey_client( + hostnames, + username=username, + password=password, ) - master = client.master_for("primary") - writes_count = int(master.llen(KEY)) - time.sleep(10) - more_writes = int(master.llen(KEY)) + writes_count = await client.llen(KEY) + await asyncio.sleep(10) + more_writes = await client.llen(KEY) assert more_writes > writes_count, "Writes not continuing to DB" logger.info("Continuous writes are increasing.") def assert_continuous_writes_consistent( - endpoints: str, - valkey_user: str, - valkey_password: str, + 
hostnames: list[str], + username: str, + password: str, ) -> None: """Assert that the continuous writes are consistent.""" last_written_value = None - for attempt in Retrying(stop=stop_after_attempt(5), wait=wait_fixed(5)): - with attempt: - with open(WRITES_LAST_WRITTEN_VAL_PATH, "r") as f: - last_written_value = int(f.read().rstrip()) + last_written_value = int(Path(WRITES_LAST_WRITTEN_VAL_PATH).read_text()) if not last_written_value: raise ValueError("Could not read last written value from file.") - for endpoint in endpoints.split(","): - client = valkey.Valkey( - host=endpoint, - port=CLIENT_PORT, - username=valkey_user, - password=valkey_password, - decode_responses=True, - ) - last_value = int(client.lrange(KEY, 0, 0)[0]) - count = int(client.llen(KEY)) + for endpoint in hostnames: + last_value = int(exec_valkey_cli(endpoint, username, password, f"LRANGE {KEY} 0 0")[0]) + count = int(exec_valkey_cli(endpoint, username, password, f"LLEN {KEY}")[0]) assert last_written_value == last_value, ( f"endpoint: {endpoint}, expected value: {last_written_value}, current value: {last_value}" ) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 81e9b8e..332b815 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -5,16 +5,24 @@ import contextlib import logging import os +import re +import subprocess import time from datetime import datetime, timedelta from pathlib import Path from typing import List import jubilant -import valkey import yaml from data_platform_helpers.advanced_statuses.models import StatusObject from dateutil.parser import parse +from glide import ( + GlideClient, + GlideClientConfiguration, + InfoSection, + NodeAddress, + ServerCredentials, +) from ops import SecretNotFoundError, StatusBase from literals import ( @@ -237,61 +245,34 @@ def get_secret_by_label(juju: jubilant.Juju, label: str) -> dict[str, str]: raise SecretNotFoundError(f"Secret with label {label} not found") -def create_valkey_client( 
- hostname: str, +async def create_valkey_client( + hostnames: list[str], username: str | None = CharmUsers.VALKEY_ADMIN.value, password: str | None = None, -) -> valkey.Valkey: +): """Create and return a Valkey client connected to the cluster. Args: - hostname: The hostname of the Valkey cluster node. + hostnames: List of hostnames of the Valkey cluster nodes. username: The username for authentication. password: The password for the internal user. + tls_enabled: Whether TLS certificates are needed. Returns: A Valkey client instance connected to the cluster. """ - client = valkey.Valkey( - host=hostname, - port=CLIENT_PORT, - username=username, - password=password, - decode_responses=True, - ) - return client + addresses = [NodeAddress(host=host, port=CLIENT_PORT) for host in hostnames] + credentials = None + if username or password: + credentials = ServerCredentials(username=username, password=password) -def create_sentinel_client( - hostnames: list[str], - valkey_user: str | None = CharmUsers.VALKEY_ADMIN.value, - valkey_password: str | None = None, - sentinel_user: str | None = CharmUsers.SENTINEL_ADMIN.value, - sentinel_password: str | None = None, -) -> valkey.Sentinel: - """Create and return a Valkey Sentinel client connected to the cluster. - - Args: - hostnames: A list of hostnames for the Sentinel nodes. - valkey_user: The username for authentication to Valkey. - valkey_password: The password for the internal user for Valkey authentication. - sentinel_user: The username for authentication to Sentinel. - sentinel_password: The password for the internal user for Sentinel authentication. - - Returns: - A Valkey Sentinel client instance connected to the cluster. 
- """ - sentinel_client = valkey.Sentinel( - [(host, 26379) for host in hostnames], - username=valkey_user, - password=valkey_password, - sentinel_kwargs={ - "password": sentinel_password, - "username": sentinel_user, - }, - decode_responses=True, + client_config = GlideClientConfiguration( + addresses, + credentials=credentials, ) - return sentinel_client + + return await GlideClient.create(client_config) def set_password( @@ -336,21 +317,19 @@ def fast_forward(juju: jubilant.Juju): juju.model_config({"update-status-hook-interval": old}) -def get_primary_ip(juju: jubilant.Juju, app: str) -> str: +async def get_primary_ip(juju: jubilant.Juju, app: str) -> str: """Get the primary node of the Valkey cluster. Returns: The IP address of the primary node. """ hostnames = get_cluster_hostnames(juju, app) - client = create_sentinel_client( - hostnames=hostnames, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_user=CharmUsers.SENTINEL_CHARM_ADMIN.value, - sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), - ) - return client.discover_master("primary")[0] + client = await create_valkey_client([hostnames[0]], password=get_password(juju)) + info = await client.custom_command(["client", "info"]) + match = re.search(r"laddr=([\d\.]+):", info.decode()) + if match: + return match.group(1) + raise RuntimeError("Primary IP not found in client info output") def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN) -> str: @@ -367,15 +346,10 @@ def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN return secret.get(f"{user.value}-password", "") -def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: +async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: # Connect to Valkey - primary_ip = get_primary_ip(juju, APP_NAME) - client = valkey.Valkey( - host=primary_ip, - port=CLIENT_PORT, - 
username=CharmUsers.VALKEY_ADMIN.value,
-        password=get_password(juju, user=CharmUsers.VALKEY_ADMIN),
-    )
+    hostnames = get_cluster_hostnames(juju, APP_NAME)
+    client = await create_valkey_client(hostnames, password=get_password(juju))
 
     # Configuration
     value_size_bytes = 1024  # 1KB per value
@@ -395,17 +369,11 @@ def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None:
 
     try:
         while keys_added < total_keys:
-            pipe = client.pipeline(transaction=False)
-
-            # Fill the batch
-            for i in range(batch_size):
-                key_idx = keys_added + i
-                pipe.set(f"{SEED_KEY_PREFIX}{key_idx}", random_data)
+            data = {f"{SEED_KEY_PREFIX}{key_idx}": random_data for key_idx in range(keys_added, keys_added + batch_size)}
 
-            if keys_added + i >= total_keys:
-                break
+            if await client.mset(data) != "OK":
+                raise RuntimeError("Failed to set data in Valkey cluster")
 
-            pipe.execute()
             keys_added += batch_size
 
             # Progress reporting
@@ -420,3 +388,144 @@
     finally:
         total_time = time.time() - start_time
         logger.info(f"\nSeeding complete! Added {keys_added:,} keys in {total_time:.2f} seconds.")
+
+
+def exec_valkey_cli(hostname: str, username: str, password: str, command: str) -> tuple[str, str]:
+    """Execute a Valkey CLI command and returns the output as a string."""
+    command = f"charmed-valkey.cli -h {hostname} -p {CLIENT_PORT} --user {username} --pass {password} {command}"
+    result = subprocess.run(
+        command.split(), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+    )
+    return result.stdout.strip(), result.stderr.strip()
+
+
+async def set_key(
+    hostnames: list[str],
+    username: str,
+    password: str,
+    key: str,
+    value: str,
+) -> bytes | None:
+    """Write a key-value pair to the Valkey cluster.
+
+    Args:
+        hostnames: List of hostnames of the Valkey cluster nodes.
+        key: The key to write.
+        value: The value to write.
+        username: The username for authentication.
+        password: The password for authentication.
+ """ + client = await create_valkey_client(hostnames=hostnames, username=username, password=password) + return await client.set(key, value) + + +async def get_key( + hostnames: list[str], + username: str, + password: str, + key: str, +) -> bytes | None: + """Read a value from the Valkey cluster by key. + + Args: + hostnames: List of hostnames of the Valkey cluster nodes. + key: The key to read. + username: The username for authentication. + password: The password for authentication. + """ + client = await create_valkey_client(hostnames=hostnames, username=username, password=password) + return await client.get(key) + + +def ping( + hostname: str, + username: str, + password: str, +) -> bool: + """Ping a Valkey cluster node. + + Args: + hostname: The hostname of the Valkey cluster node. + username: The username for authentication. + password: The password for authentication. + + Returns: + True if the node responds to a ping, False otherwise. + """ + return exec_valkey_cli(hostname, username, password, "ping")[0] == "PONG" + + +async def ping_cluster( + hostnames: list[str], + username: str, + password: str, +) -> bool: + """Ping all nodes in the Valkey cluster. + + Args: + hostnames: List of hostnames of the Valkey cluster nodes. + username: The username for authentication. + password: The password for authentication. + + Returns: + True if all nodes respond to a ping, False otherwise. + """ + client = await create_valkey_client(hostnames=hostnames, username=username, password=password) + return await client.ping() == "PONG".encode() + + +async def get_nbr_connected_slaves( + hostnames: list[str], + username: str, + password: str, +) -> int: + """Get the number of connected slaves in the Valkey cluster. + + Args: + hostnames: List of hostnames of the Valkey cluster nodes. + username: The username for authentication. + password: The password for authentication. + + Returns: + The number of connected slaves. 
+ """ + client = await create_valkey_client(hostnames=hostnames, username=username, password=password) + info = (await client.info([InfoSection.REPLICATION])).decode() + search_result = re.search(r"connected_slaves:([\d+])", info) + if not search_result: + raise ValueError("Could not parse number of connected slaves from info output") + return int(search_result.group(1)) + + +class NoAuthError(Exception): + """Raised when authentication fails due to missing credentials.""" + + +class WrongPassError(Exception): + """Raised when authentication fails due to incorrect credentials.""" + + +async def auth_test(hostnames: list[str], username: str | None, password: str | None) -> bool: + """Test authentication to the Valkey cluster by attempting to ping it. + + Args: + hostnames: List of hostnames of the Valkey cluster nodes. + username: The username for authentication. + password: The password for authentication. + + Returns: + True if authentication is successful and the cluster responds to a ping, False otherwise. 
+ """ + try: + client = await create_valkey_client( + hostnames=hostnames, username=username, password=password + ) + return await client.ping() == "PONG".encode() + except Exception as e: + error_message = str(e) + if "NOAUTH" in error_message: + raise NoAuthError("Authentication failed: NOAUTH error") from e + elif "WRONGPASS" in error_message: + raise WrongPassError("Authentication failed: WRONGPASS error") from e + else: + raise e diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index e55530f..586b585 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -4,7 +4,6 @@ import logging import jubilant -import valkey from literals import CharmUsers from tests.integration.cw_helpers import ( @@ -16,6 +15,7 @@ IMAGE_RESOURCE, are_apps_active_and_agents_idle, get_cluster_hostnames, + get_nbr_connected_slaves, get_password, seed_valkey, ) @@ -40,12 +40,12 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: ) -def test_seed_data(juju: jubilant.Juju) -> None: +async def test_seed_data(juju: jubilant.Juju) -> None: """Seed some data to the cluster.""" - seed_valkey(juju, target_gb=1) + await seed_valkey(juju, target_gb=1) -def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: +async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: """Make sure new units are added to the valkey downtime.""" init_units_count = len(juju.status().apps[APP_NAME].units) @@ -61,35 +61,26 @@ def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: assert num_units == NUM_UNITS, f"Expected {NUM_UNITS} units, got {num_units}." 
# check if all units have been added to the cluster - endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) + hostnames = get_cluster_hostnames(juju, APP_NAME) - sentinel_client = valkey.Sentinel( - [(host, 26379) for host in endpoints.split(",")], + connected_slaves = await get_nbr_connected_slaves( + hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_kwargs={ - "password": get_password(juju, user=CharmUsers.SENTINEL_ADMIN), - "username": CharmUsers.SENTINEL_ADMIN.value, - }, ) - master = sentinel_client.master_for("primary") - info = master.info("replication") - connected_slaves = info.get("connected_slaves", 0) assert connected_slaves == NUM_UNITS - 1, ( f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." ) - assert_continuous_writes_increasing( - endpoints=endpoints, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_user=CharmUsers.SENTINEL_ADMIN.value, - sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + await assert_continuous_writes_increasing( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) logger.info("Stopping continuous writes after scale up test.") - logger.info(c_writes.stop()) + logger.info(await c_writes.async_stop()) assert_continuous_writes_consistent( - endpoints=endpoints, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 021a195..9721ebb 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -5,7 +5,6 @@ import jubilant import pytest -from valkey import 
AuthenticationError from literals import ( INTERNAL_USERS_PASSWORD_CONFIG, @@ -16,14 +15,19 @@ APP_NAME, IMAGE_RESOURCE, INTERNAL_USERS_SECRET_LABEL, + NoAuthError, + WrongPassError, are_apps_active_and_agents_idle, - create_valkey_client, + auth_test, does_status_match, + exec_valkey_cli, fast_forward, get_cluster_hostnames, get_password, - get_primary_ip, get_secret_by_label, + ping, + ping_cluster, + set_key, set_password, ) @@ -45,23 +49,20 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: async def test_authentication(juju: jubilant.Juju) -> None: """Assert that we can authenticate to valkey.""" - primary = get_primary_ip(juju, APP_NAME) hostnames = get_cluster_hostnames(juju, APP_NAME) # try without authentication - with pytest.raises(AuthenticationError): - unauth_client = create_valkey_client(hostname=primary, username=None, password=None) - await unauth_client.ping() + with pytest.raises(NoAuthError): + await auth_test(hostnames, username=None, password=None) # Authenticate with internal user password = get_password(juju, user=CharmUsers.VALKEY_ADMIN) assert password is not None, "Admin password secret not found" for hostname in hostnames: - client = create_valkey_client(hostname=hostname, password=password) - assert client.ping() is True, ( - f"Authentication to Valkey cluster failed for host {hostname}" - ) + assert ( + "PONG" in exec_valkey_cli(hostname, CharmUsers.VALKEY_ADMIN.value, password, "ping")[0] + ), "Failed to authenticate with Valkey cluster using CLI" async def test_update_admin_password(juju: jubilant.Juju) -> None: @@ -81,22 +82,28 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: new_password_secret = get_password(juju, user=CharmUsers.VALKEY_ADMIN) assert new_password_secret == new_password, "Admin password not updated in secret" - primary = get_primary_ip(juju, APP_NAME) - + hostnames = get_cluster_hostnames(juju, APP_NAME) # confirm old password no longer works - with 
pytest.raises(AuthenticationError): - create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=old_password - ).ping() - # ping with new password - client = create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, "Failed to authenticate with new admin password" + with pytest.raises(WrongPassError): + await auth_test(hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=old_password) - assert client.set(TEST_KEY, TEST_VALUE) is True, ( - "Failed to write data after admin password update" - ) + assert ( + await ping_cluster( + hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + is True + ), "Failed to authenticate with new admin password" + + assert ( + await set_key( + hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + key=TEST_KEY, + value=TEST_VALUE, + ) + == "OK" + ), "Failed to write data after admin password update" # update the config again and remove the option `admin-password` logger.info("Ensure access is still possible after removing config option") @@ -109,15 +116,17 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: ) for hostname in get_cluster_hostnames(juju, APP_NAME): - client = create_valkey_client( - hostname=hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, ( + assert ( + ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ), ( f"Failed to authenticate with admin password after removing user secret on host {hostname}" ) - assert client.get(TEST_KEY) == TEST_VALUE, ( - f"Failed to read data after admin password update on host {hostname}" - ) + assert ( + exec_valkey_cli( + hostname, CharmUsers.VALKEY_ADMIN.value, new_password, f"get {TEST_KEY}" + )[0] + == TEST_VALUE + ), f"Failed to read data after admin password update on host {hostname}" async def 
test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None: @@ -151,14 +160,25 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None ) # perform read operation with the updated password - primary = get_primary_ip(juju, APP_NAME) - client = create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, "Failed to authenticate with new admin password" - assert client.set(TEST_KEY, TEST_VALUE) is True, ( - "Failed to write data after admin password update" - ) + assert ( + await ping_cluster( + get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + ) + is True + ), "Failed to authenticate with new admin password" + + assert ( + await set_key( + get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + key=TEST_KEY, + value=TEST_VALUE, + ) + == "OK" + ), "Failed to write data after admin password update" logger.info("Comparing other users passwords to previously") updated_secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) @@ -201,26 +221,33 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform read operation with the updated password hostnames = get_cluster_hostnames(juju, APP_NAME) - primary = get_primary_ip(juju, APP_NAME) - client = create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, "Failed to authenticate with new admin password" - assert client.set(TEST_KEY, TEST_VALUE) is True, ( - "Failed to write data after admin password update" - ) - for hostname in hostnames: - client = create_valkey_client( - hostname=hostname, + assert ping_cluster( + hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ), "Failed to authenticate with new admin password" + + assert ( + await set_key( + hostnames, 
username=CharmUsers.VALKEY_ADMIN.value, password=new_password, + key=TEST_KEY, + value=TEST_VALUE, ) - assert client.ping() is True, ( - f"Failed to authenticate with new admin password on host {hostname}" - ) - assert client.get(TEST_KEY) == TEST_VALUE, ( - f"Failed to read data after admin password update on host {hostname}" + == "OK" + ), "Failed to write data after admin password update" + + for hostname in hostnames: + assert ( + ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ), ( + f"Failed to authenticate with admin password after removing user secret on host {hostname}" ) + assert ( + exec_valkey_cli( + hostname, CharmUsers.VALKEY_ADMIN.value, new_password, f"get {TEST_KEY}" + )[0] + == TEST_VALUE + ), f"Failed to read data after admin password update on host {hostname}" logger.info("Password update successful after secret was granted") @@ -240,12 +267,9 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: ) # perform pings with the updated replica password - for hostname in hostnames: - client = create_valkey_client( - hostname=hostname, - username=CharmUsers.VALKEY_REPLICA.value, - password=replica_password, - ) - assert client.ping() is True, ( - f"Failed to authenticate with new replica password on host {hostname}" + for hostname in get_cluster_hostnames(juju, APP_NAME): + assert ( + ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ), ( + f"Failed to authenticate with admin password after removing user secret on host {hostname}" ) diff --git a/tests/integration/vm/ha/test_scaling.py b/tests/integration/vm/ha/test_scaling.py index fbd977e..ca7254a 100644 --- a/tests/integration/vm/ha/test_scaling.py +++ b/tests/integration/vm/ha/test_scaling.py @@ -4,7 +4,6 @@ import logging import jubilant -import valkey from literals import CharmUsers from tests.integration.cw_helpers import ( @@ -15,6 +14,7 @@ APP_NAME, are_apps_active_and_agents_idle, 
get_cluster_hostnames, + get_nbr_connected_slaves, get_password, seed_valkey, ) @@ -39,57 +39,47 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: ) -def test_seed_data(juju: jubilant.Juju) -> None: +async def test_seed_data(juju: jubilant.Juju) -> None: """Seed some data to the cluster.""" - seed_valkey(juju, target_gb=1) + await seed_valkey(juju, target_gb=1) -def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: +async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: """Make sure new units are added to the valkey downtime.""" init_units_count = len(juju.status().apps[APP_NAME].units) + # scale up - juju.add_unit(APP_NAME, num_units=2) + juju.add_unit(APP_NAME, num_units=NUM_UNITS - init_units_count) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, idle_period=10, unit_count=init_units_count + 2 + status, APP_NAME, idle_period=10, unit_count=NUM_UNITS ), timeout=1200, ) num_units = len(juju.status().apps[APP_NAME].units) - assert num_units == init_units_count + 2, ( - f"Expected {init_units_count + 2} units, got {num_units}." - ) + assert num_units == NUM_UNITS, f"Expected {NUM_UNITS} units, got {num_units}." 
# check if all units have been added to the cluster - endpoints = ",".join(get_cluster_hostnames(juju, APP_NAME)) + hostnames = get_cluster_hostnames(juju, APP_NAME) - sentinel_client = valkey.Sentinel( - [(host, 26379) for host in endpoints.split(",")], + connected_slaves = await get_nbr_connected_slaves( + hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_kwargs={ - "password": get_password(juju, user=CharmUsers.SENTINEL_ADMIN), - "username": CharmUsers.SENTINEL_ADMIN.value, - }, ) - master = sentinel_client.master_for("primary") - info = master.info("replication") - connected_slaves = info.get("connected_slaves", 0) - assert connected_slaves == num_units - 1, ( - f"Expected {num_units - 1} connected slaves, got {connected_slaves}." + assert connected_slaves == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." ) - assert_continuous_writes_increasing( - endpoints=endpoints, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), - sentinel_user=CharmUsers.SENTINEL_ADMIN.value, - sentinel_password=get_password(juju, user=CharmUsers.SENTINEL_ADMIN), + await assert_continuous_writes_increasing( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) logger.info("Stopping continuous writes after scale up test.") - logger.info(c_writes.stop()) + logger.info(await c_writes.async_stop()) assert_continuous_writes_consistent( - endpoints=endpoints, - valkey_user=CharmUsers.VALKEY_ADMIN.value, - valkey_password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) diff --git a/tests/integration/vm/test_charm.py b/tests/integration/vm/test_charm.py index eb22aa3..41bdebd 100644 --- 
a/tests/integration/vm/test_charm.py +++ b/tests/integration/vm/test_charm.py @@ -5,7 +5,6 @@ import jubilant import pytest -from valkey import AuthenticationError from literals import ( INTERNAL_USERS_PASSWORD_CONFIG, @@ -15,14 +14,19 @@ from tests.integration.helpers import ( APP_NAME, INTERNAL_USERS_SECRET_LABEL, + NoAuthError, + WrongPassError, are_apps_active_and_agents_idle, - create_valkey_client, + auth_test, does_status_match, + exec_valkey_cli, fast_forward, get_cluster_hostnames, get_password, - get_primary_ip, get_secret_by_label, + ping, + ping_cluster, + set_key, set_password, ) @@ -44,23 +48,20 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: async def test_authentication(juju: jubilant.Juju) -> None: """Assert that we can authenticate to valkey.""" - primary = get_primary_ip(juju, APP_NAME) hostnames = get_cluster_hostnames(juju, APP_NAME) # try without authentication - with pytest.raises(AuthenticationError): - unauth_client = create_valkey_client(hostname=primary, username=None, password=None) - await unauth_client.ping() + with pytest.raises(NoAuthError): + await auth_test(hostnames, username=None, password=None) # Authenticate with internal user password = get_password(juju, user=CharmUsers.VALKEY_ADMIN) assert password is not None, "Admin password secret not found" for hostname in hostnames: - client = create_valkey_client(hostname=hostname, password=password) - assert client.ping() is True, ( - f"Authentication to Valkey cluster failed for host {hostname}" - ) + assert ( + "PONG" in exec_valkey_cli(hostname, CharmUsers.VALKEY_ADMIN.value, password, "ping")[0] + ), "Failed to authenticate with Valkey cluster using CLI" async def test_update_admin_password(juju: jubilant.Juju) -> None: @@ -80,22 +81,28 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: new_password_secret = get_password(juju, user=CharmUsers.VALKEY_ADMIN) assert new_password_secret == new_password, "Admin password not updated in secret" - 
primary = get_primary_ip(juju, APP_NAME) - + hostnames = get_cluster_hostnames(juju, APP_NAME) # confirm old password no longer works - with pytest.raises(AuthenticationError): - create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=old_password - ).ping() - # ping with new password - client = create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, "Failed to authenticate with new admin password" + with pytest.raises(WrongPassError): + await auth_test(hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=old_password) - assert client.set(TEST_KEY, TEST_VALUE) is True, ( - "Failed to write data after admin password update" - ) + assert ( + await ping_cluster( + hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ) + is True + ), "Failed to authenticate with new admin password" + + assert ( + await set_key( + hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + key=TEST_KEY, + value=TEST_VALUE, + ) + == "OK" + ), "Failed to write data after admin password update" # update the config again and remove the option `admin-password` logger.info("Ensure access is still possible after removing config option") @@ -108,15 +115,17 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: ) for hostname in get_cluster_hostnames(juju, APP_NAME): - client = create_valkey_client( - hostname=hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, ( + assert ( + ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ), ( f"Failed to authenticate with admin password after removing user secret on host {hostname}" ) - assert client.get(TEST_KEY) == TEST_VALUE, ( - f"Failed to read data after admin password update on host {hostname}" - ) + assert ( + exec_valkey_cli( + hostname, CharmUsers.VALKEY_ADMIN.value, 
new_password, f"get {TEST_KEY}" + )[0] + == TEST_VALUE + ), f"Failed to read data after admin password update on host {hostname}" async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None: @@ -150,14 +159,25 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None ) # perform read operation with the updated password - primary = get_primary_ip(juju, APP_NAME) - client = create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, "Failed to authenticate with new admin password" - assert client.set(TEST_KEY, TEST_VALUE) is True, ( - "Failed to write data after admin password update" - ) + assert ( + await ping_cluster( + get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + ) + is True + ), "Failed to authenticate with new admin password" + + assert ( + await set_key( + get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=new_password, + key=TEST_KEY, + value=TEST_VALUE, + ) + == "OK" + ), "Failed to write data after admin password update" logger.info("Comparing other users passwords to previously") updated_secret = get_secret_by_label(juju, label=INTERNAL_USERS_SECRET_LABEL) @@ -200,26 +220,33 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform read operation with the updated password hostnames = get_cluster_hostnames(juju, APP_NAME) - primary = get_primary_ip(juju, APP_NAME) - client = create_valkey_client( - hostname=primary, username=CharmUsers.VALKEY_ADMIN.value, password=new_password - ) - assert client.ping() is True, "Failed to authenticate with new admin password" - assert client.set(TEST_KEY, TEST_VALUE) is True, ( - "Failed to write data after admin password update" - ) - for hostname in hostnames: - client = create_valkey_client( - hostname=hostname, + assert ping_cluster( + hostnames, 
username=CharmUsers.VALKEY_ADMIN.value, password=new_password + ), "Failed to authenticate with new admin password" + + assert ( + await set_key( + hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=new_password, + key=TEST_KEY, + value=TEST_VALUE, ) - assert client.ping() is True, ( - f"Failed to authenticate with new admin password on host {hostname}" - ) - assert client.get(TEST_KEY) == TEST_VALUE, ( - f"Failed to read data after admin password update on host {hostname}" + == "OK" + ), "Failed to write data after admin password update" + + for hostname in hostnames: + assert ( + ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ), ( + f"Failed to authenticate with admin password after removing user secret on host {hostname}" ) + assert ( + exec_valkey_cli( + hostname, CharmUsers.VALKEY_ADMIN.value, new_password, f"get {TEST_KEY}" + )[0] + == TEST_VALUE + ), f"Failed to read data after admin password update on host {hostname}" logger.info("Password update successful after secret was granted") @@ -239,12 +266,9 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: ) # perform pings with the updated replica password - for hostname in hostnames: - client = create_valkey_client( - hostname=hostname, - username=CharmUsers.VALKEY_REPLICA.value, - password=replica_password, - ) - assert client.ping() is True, ( - f"Failed to authenticate with new replica password on host {hostname}" + for hostname in get_cluster_hostnames(juju, APP_NAME): + assert ( + ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ), ( + f"Failed to authenticate with admin password after removing user secret on host {hostname}" ) From cb3e0ecfd7ca69dc36a57f8af9c1c89cc6cc2ad7 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 19 Feb 2026 12:00:26 +0000 Subject: [PATCH 097/159] install charmed-valkey snap --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 
f50b41d..c955bd5 100644 --- a/tox.ini +++ b/tox.ini @@ -66,6 +66,7 @@ allowlist_externals = sh commands_pre = poetry install --only integration + sudo snap install charmed-valkey --channel 9/edge commands = # on CI, concierge will setup the model `testing` - locally we need to do it ourselves sh -c "if [ -z "$CI" ]; then juju add-model testing && juju model-config logging-config='=INFO;unit=DEBUG'; fi;" From 95abc33c109ad907d77cb272bdcea8ce91da76c8 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 19 Feb 2026 12:39:41 +0000 Subject: [PATCH 098/159] add sudo and snap to allowlist --- tox.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tox.ini b/tox.ini index c955bd5..7e8a2f0 100644 --- a/tox.ini +++ b/tox.ini @@ -64,6 +64,8 @@ pass_env = allowlist_externals = {[testenv]allowlist_externals} sh + sudo + snap commands_pre = poetry install --only integration sudo snap install charmed-valkey --channel 9/edge From 7d51cb4e1487f30c2a99b66eceb2a1562b3dca09 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 19 Feb 2026 14:02:01 +0000 Subject: [PATCH 099/159] mv from snap to downloading cli --- tests/integration/helpers.py | 4 +++- tox.ini | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 332b815..802cde7 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -392,7 +392,9 @@ async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: def exec_valkey_cli(hostname: str, username: str, password: str, command: str) -> tuple[str, str]: """Execute a Valkey CLI command and returns the output as a string.""" - command = f"charmed-valkey.cli -h {hostname} -p {CLIENT_PORT} --user {username} --pass {password} {command}" + command = ( + f"valkey-cli -h {hostname} -p {CLIENT_PORT} --user {username} --pass {password} {command}" + ) result = subprocess.run( command.split(), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, 
text=True ) diff --git a/tox.ini b/tox.ini index 7e8a2f0..3d98276 100644 --- a/tox.ini +++ b/tox.ini @@ -65,10 +65,17 @@ allowlist_externals = {[testenv]allowlist_externals} sh sudo - snap + apt + mv + wget + tar commands_pre = poetry install --only integration - sudo snap install charmed-valkey --channel 9/edge + sudo apt install wget -y + sh -c "mkdir -p /tmp/valkey_cli" + sh -c 'if [ "$(uname -m)" = "aarch64" ]; then wget https://download.valkey.io/releases/valkey-9.0.2-jammy-arm64.tar.gz -O /tmp/valkey_cli/valkey.tar.gz; else wget https://download.valkey.io/releases/valkey-9.0.2-jammy-x86_64.tar.gz -O /tmp/valkey_cli/valkey.tar.gz; fi' + tar -xvf /tmp/valkey_cli/valkey.tar.gz -C /tmp/valkey_cli + sh -c 'sudo mv /tmp/valkey_cli/valkey-9.0.2-jammy-*/bin/valkey-cli /usr/local/bin' commands = # on CI, concierge will setup the model `testing` - locally we need to do it ourselves sh -c "if [ -z "$CI" ]; then juju add-model testing && juju model-config logging-config='=INFO;unit=DEBUG'; fi;" From c1fa74e67d490905e0a1a53349c486189ff2dabc Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 20 Feb 2026 03:22:53 +0000 Subject: [PATCH 100/159] switch creating glie client to context manager to close connection automatically --- tests/integration/continuous_writes.py | 2 - tests/integration/cw_helpers.py | 14 ++-- tests/integration/helpers.py | 92 +++++++++++++++----------- tests/integration/k8s/test_charm.py | 7 +- tests/integration/vm/test_charm.py | 7 +- 5 files changed, 69 insertions(+), 53 deletions(-) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index d0ea9fb..ed87368 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -256,8 +256,6 @@ async def with_client(conf: SimpleNamespace): while not event.is_set(): try: config = data_queue.get_nowait() - # await client.close() - # client = await _make_client(config) proc_logger.info("Configuration updated, client reconnected.") 
except queue.Empty: pass diff --git a/tests/integration/cw_helpers.py b/tests/integration/cw_helpers.py index 022c0b1..150a399 100644 --- a/tests/integration/cw_helpers.py +++ b/tests/integration/cw_helpers.py @@ -52,16 +52,16 @@ async def assert_continuous_writes_increasing( password: str, ) -> None: """Assert that the continuous writes are increasing.""" - client = await create_valkey_client( + async with create_valkey_client( hostnames, username=username, password=password, - ) - writes_count = await client.llen(KEY) - await asyncio.sleep(10) - more_writes = await client.llen(KEY) - assert more_writes > writes_count, "Writes not continuing to DB" - logger.info("Continuous writes are increasing.") + ) as client: + writes_count = await client.llen(KEY) + await asyncio.sleep(10) + more_writes = await client.llen(KEY) + assert more_writes > writes_count, "Writes not continuing to DB" + logger.info("Continuous writes are increasing.") def assert_continuous_writes_consistent( diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 802cde7..5a8afa2 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -2,12 +2,12 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. -import contextlib import logging import os import re import subprocess import time +from contextlib import asynccontextmanager, contextmanager from datetime import datetime, timedelta from pathlib import Path from typing import List @@ -245,6 +245,7 @@ def get_secret_by_label(juju: jubilant.Juju, label: str) -> dict[str, str]: raise SecretNotFoundError(f"Secret with label {label} not found") +@asynccontextmanager async def create_valkey_client( hostnames: list[str], username: str | None = CharmUsers.VALKEY_ADMIN.value, @@ -256,7 +257,6 @@ async def create_valkey_client( hostnames: List of hostnames of the Valkey cluster nodes. username: The username for authentication. password: The password for the internal user. 
- tls_enabled: Whether TLS certificates are needed. Returns: A Valkey client instance connected to the cluster. @@ -272,7 +272,11 @@ async def create_valkey_client( credentials=credentials, ) - return await GlideClient.create(client_config) + client = await GlideClient.create(client_config) + try: + yield client + finally: + await client.close() def set_password( @@ -306,7 +310,7 @@ def set_password( juju.config(app=application, values={INTERNAL_USERS_PASSWORD_CONFIG: secret_id}) -@contextlib.contextmanager +@contextmanager def fast_forward(juju: jubilant.Juju): """Context manager that temporarily speeds up update-status hooks to fire every 10s.""" old = juju.model_config()["update-status-hook-interval"] @@ -324,8 +328,8 @@ async def get_primary_ip(juju: jubilant.Juju, app: str) -> str: The IP address of the primary node. """ hostnames = get_cluster_hostnames(juju, app) - client = await create_valkey_client([hostnames[0]], password=get_password(juju)) - info = await client.custom_command(["client", "info"]) + async with create_valkey_client([hostnames[0]], password=get_password(juju)) as client: + info = await client.custom_command(["client", "info"]) match = re.search(r"laddr=([\d\.]+):", info.decode()) if match: return match.group(1) @@ -349,7 +353,6 @@ def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: # Connect to Valkey hostnames = get_cluster_hostnames(juju, APP_NAME) - client = await create_valkey_client(hostnames, password=get_password(juju)) # Configuration value_size_bytes = 1024 # 1KB per value @@ -366,29 +369,34 @@ async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: # Generate a fixed random block to reuse (saves CPU cycles on generation) random_data = os.urandom(value_size_bytes).hex()[:value_size_bytes] - - try: - while keys_added < total_keys: - data = {f"{SEED_KEY_PREFIX}{key_idx}": random_data for key_idx in 
range(batch_size)} - - if await client.mset(data) != "OK": - raise RuntimeError("Failed to set data in Valkey cluster") - - keys_added += batch_size - - # Progress reporting - elapsed = time.time() - start_time - percent = (keys_added / total_keys) * 100 + async with create_valkey_client(hostnames, password=get_password(juju)) as client: + try: + while keys_added < total_keys: + data = { + f"{SEED_KEY_PREFIX}{key_idx}": random_data + for key_idx in range(keys_added, keys_added + batch_size) + } + + if await client.mset(data) != "OK": + raise RuntimeError("Failed to set data in Valkey cluster") + + keys_added += batch_size + + # Progress reporting + elapsed = time.time() - start_time + percent = (keys_added / total_keys) * 100 + logger.info( + f"Progress: {percent:.1f}% | Keys: {keys_added:,} | Elapsed: {elapsed:.1f}s", + ) + + except Exception as e: + logger.error(f"\nError: {e}") + finally: + total_time = time.time() - start_time logger.info( - f"Progress: {percent:.1f}% | Keys: {keys_added:,} | Elapsed: {elapsed:.1f}s", + f"\nSeeding complete! Added {keys_added:,} keys in {total_time:.2f} seconds." ) - except Exception as e: - logger.error(f"\nError: {e}") - finally: - total_time = time.time() - start_time - logger.info(f"\nSeeding complete! Added {keys_added:,} keys in {total_time:.2f} seconds.") - def exec_valkey_cli(hostname: str, username: str, password: str, command: str) -> tuple[str, str]: """Execute a Valkey CLI command and returns the output as a string.""" @@ -417,8 +425,10 @@ async def set_key( username: The username for authentication. password: The password for authentication. 
""" - client = await create_valkey_client(hostnames=hostnames, username=username, password=password) - return await client.set(key, value) + async with create_valkey_client( + hostnames=hostnames, username=username, password=password + ) as client: + return await client.set(key, value) async def get_key( @@ -435,8 +445,10 @@ async def get_key( username: The username for authentication. password: The password for authentication. """ - client = await create_valkey_client(hostnames=hostnames, username=username, password=password) - return await client.get(key) + async with create_valkey_client( + hostnames=hostnames, username=username, password=password + ) as client: + return await client.get(key) def ping( @@ -472,8 +484,10 @@ async def ping_cluster( Returns: True if all nodes respond to a ping, False otherwise. """ - client = await create_valkey_client(hostnames=hostnames, username=username, password=password) - return await client.ping() == "PONG".encode() + async with create_valkey_client( + hostnames=hostnames, username=username, password=password + ) as client: + return await client.ping() == "PONG".encode() async def get_nbr_connected_slaves( @@ -491,8 +505,10 @@ async def get_nbr_connected_slaves( Returns: The number of connected slaves. """ - client = await create_valkey_client(hostnames=hostnames, username=username, password=password) - info = (await client.info([InfoSection.REPLICATION])).decode() + async with create_valkey_client( + hostnames=hostnames, username=username, password=password + ) as client: + info = (await client.info([InfoSection.REPLICATION])).decode() search_result = re.search(r"connected_slaves:([\d+])", info) if not search_result: raise ValueError("Could not parse number of connected slaves from info output") @@ -519,10 +535,10 @@ async def auth_test(hostnames: list[str], username: str | None, password: str | True if authentication is successful and the cluster responds to a ping, False otherwise. 
""" try: - client = await create_valkey_client( + async with create_valkey_client( hostnames=hostnames, username=username, password=password - ) - return await client.ping() == "PONG".encode() + ) as client: + return await client.ping() == "PONG".encode() except Exception as e: error_message = str(e) if "NOAUTH" in error_message: diff --git a/tests/integration/k8s/test_charm.py b/tests/integration/k8s/test_charm.py index 9721ebb..23f6345 100644 --- a/tests/integration/k8s/test_charm.py +++ b/tests/integration/k8s/test_charm.py @@ -221,7 +221,7 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform read operation with the updated password hostnames = get_cluster_hostnames(juju, APP_NAME) - assert ping_cluster( + assert await ping_cluster( hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=new_password ), "Failed to authenticate with new admin password" @@ -269,7 +269,8 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform pings with the updated replica password for hostname in get_cluster_hostnames(juju, APP_NAME): assert ( - ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ping(hostname, username=CharmUsers.VALKEY_REPLICA.value, password=replica_password) + is True ), ( - f"Failed to authenticate with admin password after removing user secret on host {hostname}" + f"Failed to authenticate with replica password after removing user secret on host {hostname}" ) diff --git a/tests/integration/vm/test_charm.py b/tests/integration/vm/test_charm.py index 41bdebd..dfcf05d 100644 --- a/tests/integration/vm/test_charm.py +++ b/tests/integration/vm/test_charm.py @@ -220,7 +220,7 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform read operation with the updated password hostnames = get_cluster_hostnames(juju, APP_NAME) - assert ping_cluster( + assert await ping_cluster( hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=new_password ), 
"Failed to authenticate with new admin password" @@ -268,7 +268,8 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: # perform pings with the updated replica password for hostname in get_cluster_hostnames(juju, APP_NAME): assert ( - ping(hostname, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) is True + ping(hostname, username=CharmUsers.VALKEY_REPLICA.value, password=replica_password) + is True ), ( - f"Failed to authenticate with admin password after removing user secret on host {hostname}" + f"Failed to authenticate with replica password after removing user secret on host {hostname}" ) From dfbde4192cc1efccbac0046382c6f05c11766413 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 20 Feb 2026 09:34:08 +0000 Subject: [PATCH 101/159] wip scale down --- metadata.yaml | 10 +++ src/common/client.py | 101 +++++++++++++++++++++++++- src/common/exceptions.py | 12 ++++ src/common/locks.py | 143 ++++++++++++++++++++++++++++++++++++ src/core/base_workload.py | 5 ++ src/core/models.py | 5 +- src/events/base_events.py | 147 ++++++++++++++++++++++++-------------- src/literals.py | 13 ++++ src/managers/sentinel.py | 121 +++++++++++++++++++++++++++++++ src/workload_k8s.py | 11 +++ src/workload_vm.py | 17 +++++ 11 files changed, 531 insertions(+), 54 deletions(-) create mode 100644 src/common/locks.py diff --git a/metadata.yaml b/metadata.yaml index 69e11f4..0da524c 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -22,6 +22,9 @@ website: containers: valkey: resource: valkey-image + mounts: + - storage: data + location: /var/lib/valkey/ resources: valkey-image: @@ -34,3 +37,10 @@ peers: interface: valkey_peers status-peers: interface: status_peers + +storage: + data: + type: filesystem + location: /var/snap/charmed-valkey/common/var/lib/charmed-valkey + description: storage for valkey data + minimum-size: 1G diff --git a/src/common/client.py b/src/common/client.py index 75ae51d..b72a9f6 100644 --- a/src/common/client.py +++ 
b/src/common/client.py @@ -6,6 +6,8 @@ import logging from typing import Literal +from tenacity import retry, stop_after_attempt, wait_fixed + from common.exceptions import ValkeyWorkloadCommandError from core.base_workload import WorkloadBase from literals import CLIENT_PORT, PRIMARY_NAME, SENTINEL_PORT @@ -57,8 +59,9 @@ def exec_cli_command( "--pass", self.password, ] + command + logger.debug(f"Executing CLI command on {hostname}: {cli_command}") output, error = self.workload.exec(cli_command) - return output, error + return output.strip(), error def ping(self, hostname: str) -> bool: """Ping the Valkey server to check if it's responsive. @@ -273,3 +276,99 @@ def sentinel_get_master_info(self, hostname: str) -> dict[str, str] | None: except ValkeyWorkloadCommandError as e: logger.error(f"Failed to get master info from sentinel at {hostname}: {e}") return None + + def sentinel_failover(self, hostname: str): + """Trigger a failover through the sentinel. + + Args: + hostname (str): The hostname to connect to. + + Returns: + bool: True if the failover command was executed successfully, False otherwise. + """ + if not self.connect_to == "sentinel": + logger.error( + "Attempted to trigger failover through sentinel while client is configured to connect to valkey." 
+ ) + raise ValueError("Client is not configured to connect to sentinel.") + try: + output, err = self.exec_cli_command( + command=["sentinel", "failover", PRIMARY_NAME, "coordinated"], + hostname=hostname, + ) + if "OK" not in output.strip(): + logger.error( + "Failed to trigger failover through sentinel at %s: stdout: %s, stderr: %s", + hostname, + output, + err, + ) + raise ValkeyWorkloadCommandError( + f"Failed to trigger failover through sentinel at {hostname}: stdout, stderr: {(output, err)}" + ) + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to trigger failover through sentinel at {hostname}: {e}") + raise + + def sentinel_reset_state(self, hostname: str) -> None: + """Reset the sentinel state for the primary. + + Args: + hostname (str): The hostname to connect to. + """ + if not self.connect_to == "sentinel": + logger.error( + "Attempted to reset sentinel state through sentinel while client is configured to connect to valkey." + ) + raise ValueError("Client is not configured to connect to sentinel.") + try: + output, err = self.exec_cli_command( + command=["sentinel", "reset", PRIMARY_NAME], + hostname=hostname, + ) + if output != "1": + raise ValkeyWorkloadCommandError( + f"Failed to reset sentinel state through sentinel at {hostname}: stdout, stderr: {(output, err)}" + ) + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to reset sentinel state through sentinel at {hostname}: {e}") + raise + + @retry( + stop=stop_after_attempt(3), + wait=wait_fixed(1), + reraise=True, + ) + def sentinel_get_replica_info(self, hostname: str) -> str: + """Get the replicas information of the primary from sentinel. + + Args: + hostname (str): The hostname to connect to. + + Returns: + str | None: The output of the "sentinel replicas" command if retrieved successfully, None otherwise. + """ + if not self.connect_to == "sentinel": + logger.error( + "Attempted to get replica info from sentinel while client is configured to connect to valkey." 
+ ) + raise ValueError("Client is not configured to connect to sentinel.") + try: + output, err = self.exec_cli_command( + command=["sentinel", "replicas", PRIMARY_NAME], + hostname=hostname, + ) + logger.debug( + "Output of 'sentinel replicas' command from sentinel at %s: stdout, stderr: %s", + hostname, + (output, err), + ) + if not output.strip(): + logger.warning(f"No replica info found in sentinel at {hostname}.") + raise ValkeyWorkloadCommandError( + f"No replica info found in sentinel at {hostname}." + ) + return output.strip() + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to get replica info from sentinel at {hostname}: {e}") + raise diff --git a/src/common/exceptions.py b/src/common/exceptions.py index 756f285..2936558 100644 --- a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -34,3 +34,15 @@ class ValkeyServiceNotAliveError(Exception): class ValkeyConfigurationError(Exception): """Custom Exception if Valkey configuration fails to be set.""" + + +class SentinelFailoverError(Exception): + """Custom Exception if triggering sentinel failover fails.""" + + +class ValkeyServicesCouldNotBeStoppedError(Exception): + """Custom Exception if Valkey services could not be stopped.""" + + +class CannotSeeAllActiveSentinelsError(Exception): + """Custom Exception if the local sentinel cannot see all active sentinels in the cluster.""" diff --git a/src/common/locks.py b/src/common/locks.py new file mode 100644 index 0000000..d945532 --- /dev/null +++ b/src/common/locks.py @@ -0,0 +1,143 @@ +# Copyright 2026 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +"""Collection of lock names for cluster operations.""" + +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from core.cluster_state import ClusterState + from core.models import ValkeyServer + + +logger = logging.getLogger(__name__) + + +class Lock(ABC): + """Base class for locks.""" + + unit_request_lock_atr_name: str + member_with_lock_atr_name: str + + def __init__(self, state: "ClusterState") -> None: + self.state = state + + @property + def name(self) -> str: + """Get the name of the lock.""" + return self.__class__.__name__.lower() + + @property + def units_requesting_lock(self) -> list[str]: + """Get the list of units requesting the start lock.""" + return [ + unit.unit_name + for unit in self.state.servers + if unit.model and getattr(unit.model, self.unit_request_lock_atr_name, False) + ] + + @property + def next_unit_to_give_lock(self) -> str | None: + """Get the next unit to give the start lock to.""" + return self.units_requesting_lock[0] if self.units_requesting_lock else None + + @property + def unit_with_lock(self) -> "ValkeyServer | None": + """Get the unit that currently holds the start lock.""" + return next( + ( + unit + for unit in self.state.servers + if unit.unit_name + == getattr(self.state.cluster.model, self.member_with_lock_atr_name, "") + ), + None, + ) + + @property + @abstractmethod + def is_lock_free_to_give(self) -> bool: + """Check if the unit with the lock has completed its operation.""" + pass + + def do_i_hold_lock(self) -> bool: + """Check if the local unit holds the start lock.""" + return self.state.unit_server.unit_name == getattr( + self.state.cluster.model, self.member_with_lock_atr_name, "" + ) + + def request_lock(self) -> None: + """Request the lock for the local unit.""" + self.state.unit_server.update( + { + self.unit_request_lock_atr_name: True, + } + ) + if self.state.unit_server.unit.is_leader(): + logger.info( + f"Leader unit requesting {self.name} lock. 
Triggering lock request processing." + ) + self.process() + + def release_lock(self) -> None: + """Release the lock from the local unit.""" + self.state.unit_server.update( + { + self.unit_request_lock_atr_name: False, + } + ) + if self.state.unit_server.unit.is_leader(): + logger.info( + f"Leader unit releasing {self.name} lock. Triggering lock request processing." + ) + self.process() + + def process(self) -> None: + """Process the lock requests and update the unit with the lock.""" + if not self.state.unit_server.unit.is_leader(): + logger.info(f"Only the leader can process {self.name} lock requests.") + return + + if self.is_lock_free_to_give: + next_unit = self.next_unit_to_give_lock + self.state.cluster.update({self.member_with_lock_atr_name: next_unit}) + logger.debug(f"Gave {self.name} lock to {next_unit}") + logger.debug( + f"{self.name} lock is currently held by {getattr(self.state.cluster.model, self.member_with_lock_atr_name)}" + ) + + +class StartLock(Lock): + """Lock for starting operations.""" + + unit_request_lock_atr_name = "request_start_lock" + member_with_lock_atr_name = "start_member" + + @property + def is_lock_free_to_give(self) -> bool: + """Check if the unit with the start lock has completed its operation.""" + starting_unit = self.unit_with_lock + return ( + not self.state.cluster.model.start_member + or not starting_unit + or starting_unit.is_started + ) + + +class ScaleDownLock(Lock): + """Lock for scale down operations.""" + + unit_request_lock_atr_name = "request_scale_down_lock" + member_with_lock_atr_name = "scale_down_member" + + @property + def is_lock_free_to_give(self) -> bool: + """Check if the unit with the scale down lock has completed its operation.""" + scaling_down_unit = self.unit_with_lock + return ( + not self.state.cluster.model.scale_down_member + or not scaling_down_unit + or scaling_down_unit.model.request_scale_down_lock is False + ) diff --git a/src/core/base_workload.py b/src/core/base_workload.py index 
1f97310..f7ead8f 100644 --- a/src/core/base_workload.py +++ b/src/core/base_workload.py @@ -42,6 +42,11 @@ def start(self) -> None: """ pass + @abstractmethod + def stop(self) -> None: + """Stop the workload service.""" + pass + @abstractmethod def exec(self, command: list[str]) -> tuple[str, str | None]: """Run a command on the workload substrate.""" diff --git a/src/core/models.py b/src/core/models.py index fcf79bc..5a01972 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -36,7 +36,8 @@ class PeerAppModel(PeerModel): charmed_stats_password: InternalUsersSecret = Field(default="") charmed_sentinel_peers_password: InternalUsersSecret = Field(default="") charmed_sentinel_operator_password: InternalUsersSecret = Field(default="") - starting_member: str = Field(default="") + start_member: str = Field(default="") + scale_down_member: str = Field(default="") class PeerUnitModel(PeerModel): @@ -47,6 +48,8 @@ class PeerUnitModel(PeerModel): hostname: str = Field(default="") private_ip: str = Field(default="") request_start_lock: bool = Field(default=False) + request_scale_down_lock: bool = Field(default=False) + scale_down_state: str = Field(default="") class RelationState: diff --git a/src/events/base_events.py b/src/events/base_events.py index 9d94aa6..8da55df 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -11,19 +11,25 @@ import ops from common.exceptions import ( + CannotSeeAllActiveSentinelsError, + SentinelFailoverError, ValkeyACLLoadError, ValkeyConfigSetError, ValkeyConfigurationError, ValkeyServiceNotAliveError, + ValkeyServicesCouldNotBeStoppedError, ValkeyServicesFailedToStartError, ValkeyWorkloadCommandError, ) +from common.locks import ScaleDownLock, StartLock from literals import ( CLIENT_PORT, + DATA_STORAGE, INTERNAL_USERS_PASSWORD_CONFIG, INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, CharmUsers, + ScaleDownState, StartState, Substrate, ) @@ -75,6 +81,9 @@ def __init__(self, charm: "ValkeyCharm"): 
self.framework.observe(self.charm.on.config_changed, self._on_config_changed) self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed) self.framework.observe(self.unit_fully_started, self._on_unit_fully_started) + self.framework.observe( + self.charm.on[DATA_STORAGE].storage_detaching, self._on_storage_detaching + ) def _on_install(self, event: ops.InstallEvent) -> None: """Handle install event.""" @@ -89,7 +98,14 @@ def _on_install(self, event: ops.InstallEvent) -> None: def _on_start(self, event: ops.StartEvent) -> None: """Handle the on start event.""" - self.charm.state.unit_server.update({"start_state": StartState.NOT_STARTED.value}) + self.charm.state.unit_server.update( + { + "start_state": StartState.NOT_STARTED.value, + "hostname": socket.gethostname(), + "private_ip": self.charm.state.bind_address, + } + ) + start_lock = StartLock(self.charm.state) if not self.charm.workload.can_connect: logger.warning("Workload not ready yet") @@ -103,18 +119,10 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return - self.charm.state.unit_server.update( - {"start_state": StartState.WAITING_TO_START.value, "request_start_lock": True} - ) + self.charm.state.unit_server.update({"start_state": StartState.WAITING_TO_START.value}) + start_lock.request_lock() - if self.charm.unit.is_leader(): - logger.info( - "Leader unit requesting lock to start services. Triggering lock request processing." 
- ) - self._process_lock_requests() - - # TODO unit.name would not work across models we need to switch to using `model.unit.name + model_uuid` - if self.charm.state.cluster.model.starting_member != self.charm.unit.name: + if not start_lock.do_i_hold_lock(): logger.info("Waiting for lock to start") event.defer() return @@ -195,7 +203,9 @@ def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: event.defer() return - if not event.is_primary and not self.charm.sentinel_manager.is_sentinel_discovered(): + if not event.is_primary and not self.charm.sentinel_manager.is_sentinel_discovered( + self.charm.state.bind_address + ): logger.info("Sentinel service not yet discovered by other units. Deferring event.") self.charm.state.unit_server.update( {"start_state": StartState.STARTING_WAITING_SENTINEL.value} @@ -223,45 +233,8 @@ def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: if not self.charm.unit.is_leader(): return - self._process_lock_requests() - - def _process_lock_requests(self) -> None: - """Process start lock requests. - - The leader unit will choose one of the units that requested the lock to start, and update the cluster model with that unit as the starting member. - """ - units_requesting_start = [ - unit.unit_name - for unit in self.charm.state.servers - if unit.model and unit.model.request_start_lock - ] - starting_unit = next( - ( - unit - for unit in self.charm.state.servers - if unit.unit_name == self.charm.state.cluster.model.starting_member - ), - None, - ) - if ( - # if the starting member has not started yet, we want to wait for it to start instead of choosing another unit that requested start - self.charm.state.cluster.model.starting_member - and starting_unit - and not starting_unit.is_started - ): - logger.debug( - "Starting member %s has not started yet. Units requesting start: %s. 
", - self.charm.state.cluster.model.starting_member, - units_requesting_start, - ) - return - - self.charm.state.cluster.update( - {"starting_member": units_requesting_start[0] if units_requesting_start else ""} - ) - logger.debug( - f"Updated starting member to {units_requesting_start[0] if units_requesting_start else ''}" - ) + for lock in [StartLock(self.charm.state), ScaleDownLock(self.charm.state)]: + lock.process() def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: """Handle the update-status event.""" @@ -463,3 +436,73 @@ def _update_internal_users_password(self, secret_id: str) -> None: scope="app", component=self.charm.cluster_manager.name, ) + + def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: + """Handle removal of the data storage mount, e.g. when removing a unit.""" + # get scale down lock + scale_down_lock = ScaleDownLock(self.charm.state) + + self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.WAIT_FOR_LOCK}) + scale_down_lock.request_lock() + if not scale_down_lock.do_i_hold_lock(): + logger.debug("Waiting for lock to scale down") + event.defer() + return + + # Consider scaling to 0 if we need to clean databag + + # consider quorom when removing unit + + # if unit has primary then failover + if self.charm.sentinel_manager.get_primary_ip() == self.charm.state.bind_address: + self.charm.state.unit_server.update( + {"scale_down_state": ScaleDownState.WAIT_TO_FAILOVER} + ) + logger.debug( + "Unit with IP %s is primary, triggering failover before scale down", + self.charm.state.bind_address, + ) + try: + self.charm.sentinel_manager.failover() + logger.debug( + "Failover completed, new primary ip %s", + self.charm.sentinel_manager.get_primary_ip(), + ) + except SentinelFailoverError: + logger.error("Failed to trigger failover before scale down") + event.defer() + return + + # stop valkey and sentinel processes + self.charm.state.unit_server.update({"scale_down_state": 
ScaleDownState.STOP_SERVICES}) + try: + self.charm.workload.stop() + except ValkeyServicesCouldNotBeStoppedError: + logger.error("Failed to stop Valkey services before scale down") + event.defer() + return + + # reset sentinel states on other units + self.charm.state.unit_server.update( + { + "scale_down_state": ScaleDownState.RESET_SENTINEL, + "start_state": StartState.NOT_STARTED.value, + } + ) + try: + self.charm.sentinel_manager.reset_sentinel_states() + except (ValkeyWorkloadCommandError, CannotSeeAllActiveSentinelsError): + logger.error("Failed to reset sentinel states before scale down") + event.defer() + return + + # check health after scale down + self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) + + if not self.charm.sentinel_manager.verify_expected_replica_count(): + logger.error("Not all sentinels see the expected number of replicas after scale down") + event.defer() + return + + # release lock + scale_down_lock.release_lock() diff --git a/src/literals.py b/src/literals.py index 665b182..ab41959 100644 --- a/src/literals.py +++ b/src/literals.py @@ -38,6 +38,8 @@ INTERNAL_USERS_PASSWORD_CONFIG = "system-users" INTERNAL_USERS_SECRET_LABEL_SUFFIX = "internal_users_secret" +DATA_STORAGE = "data" + # As per the valkey users spec # https://docs.google.com/document/d/1EImKKHK3wLY73-D1M2ItpHe88NHeB-Iq2M3lz7AQB7E @@ -83,3 +85,14 @@ class StartState(StrEnum): STARTING_WAITING_REPLICA_SYNC = "starting_waiting_replica_sync" ERROR_ON_START = "error_on_start" STARTED = "started" + + +class ScaleDownState(StrEnum): + """Scale down states for the service.""" + + NO_SCALE_DOWN = "" + WAIT_FOR_LOCK = "wait_for_lock" + WAIT_TO_FAILOVER = "wait_to_failover" + STOP_SERVICES = "stopped_services" + RESET_SENTINEL = "reset_sentinel" + HEALTH_CHECK = "health_check" diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 04b3cc7..250234b 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -13,6 +13,8 @@ 
from common.client import ValkeyClient from common.exceptions import ( + CannotSeeAllActiveSentinelsError, + SentinelFailoverError, ValkeyWorkloadCommandError, ) from core.base_workload import WorkloadBase @@ -123,6 +125,125 @@ def is_healthy(self) -> bool: return True + def failover(self) -> None: + """Trigger a failover in the cluster.""" + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + connect_to="sentinel", + ) + try: + client.sentinel_failover(self.state.bind_address) + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to trigger failover: {e}") + raise SentinelFailoverError from e + + def reset_sentinel_states(self) -> None: + """Reset the sentinel states on all sentinels in the cluster.""" + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + connect_to="sentinel", + ) + + active_sentinels = [unit for unit in self.state.servers if unit.is_started] + logger.debug( + "Resetting sentinel states on %s", str([unit.unit_name for unit in active_sentinels]) + ) + for unit in active_sentinels: + try: + client.sentinel_reset_state(hostname=unit.model.private_ip) + except ValkeyWorkloadCommandError: + logger.warning( + f"Could not reset sentinel state on {unit.unit_name} ({unit.model.private_ip})." + ) + raise + + if not self.sentinel_sees_all_others(target_sentinel_ip=unit.model.private_ip): + logger.warning( + f"Sentinel at {unit.model.private_ip} does not see all other sentinels after reset." + ) + raise CannotSeeAllActiveSentinelsError( + f"Sentinel at {unit.model.private_ip} does not see all other sentinels after reset." 
+ ) + + @retry( + wait=wait_fixed(1), + stop=stop_after_attempt(5), + retry=retry_if_result(lambda result: result is False), + retry_error_callback=lambda _: False, + ) + def sentinel_sees_all_others(self, target_sentinel_ip: str) -> bool: + """Check if the sentinel of the local unit sees all the other sentinels in the cluster.""" + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + connect_to="sentinel", + ) + + other_active_sentinels = [ + unit.model.private_ip + for unit in self.state.servers + if unit.is_started and unit.model.private_ip != target_sentinel_ip + ] + + logger.debug( + "Checking if sentinel at %s sees all other sentinels: %s", + target_sentinel_ip, + other_active_sentinels, + ) + + for sentinel_ip in other_active_sentinels: + try: + output, _ = client.exec_cli_command( + command=["sentinel", "sentinels", PRIMARY_NAME], + hostname=target_sentinel_ip, + ) + if sentinel_ip not in output: + logger.debug( + f"Sentinel at {target_sentinel_ip} does not see sentinel at {sentinel_ip}" + ) + return False + except ValkeyWorkloadCommandError: + logger.warning( + f"Could not query sentinel at {target_sentinel_ip} for sentinel discovery." + ) + return False + return True + + def verify_expected_replica_count(self) -> bool: + """Verify that the sentinels in the cluster see the expected number of replicas.""" + client = ValkeyClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + connect_to="sentinel", + ) + + units_started = [unit for unit in self.state.servers if unit.is_started] + # all started servers except primary are expected to be replicas + expected_replicas = len(units_started) - 1 + logger.debug( + "Verifying expected replica count. 
Expected replicas: %d, started servers: %s", + expected_replicas, + str([unit.unit_name for unit in units_started]), + ) + try: + for unit in units_started: + replica_info = client.sentinel_get_replica_info(hostname=unit.model.private_ip) + if expected_replicas != (nbr_replicas := replica_info.count("name")): + logger.warning( + f"Sentinel at {unit.model.private_ip} sees {nbr_replicas} replicas, expected {expected_replicas}." + ) + return False + except ValkeyWorkloadCommandError: + logger.warning("Could not query sentinel for replica information.") + return False + return True + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the sentinel manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 97f0dac..91f0a3f 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -13,6 +13,7 @@ from common.exceptions import ( ValkeyServiceNotAliveError, + ValkeyServicesCouldNotBeStoppedError, ValkeyServicesFailedToStartError, ValkeyWorkloadCommandError, ) @@ -127,3 +128,13 @@ def exec(self, command: list[str]) -> tuple[str, str | None]: except ops.pebble.ExecError as e: logger.error("Command failed with %s, %s", e.exit_code, e.stdout) raise ValkeyWorkloadCommandError(e) + + @override + def stop(self) -> None: + try: + self.container.stop(self.valkey_service, self.sentinel_service, self.metric_service) + except ops.pebble.ChangeError as e: + logger.error("Failed to stop Valkey services: %s", e) + raise ValkeyServicesCouldNotBeStoppedError( + f"Failed to stop Valkey services: {e}" + ) from e diff --git a/src/workload_vm.py b/src/workload_vm.py index 2c3a043..b956284 100644 --- a/src/workload_vm.py +++ b/src/workload_vm.py @@ -21,6 +21,7 @@ from common.exceptions import ( ValkeyServiceNotAliveError, + ValkeyServicesCouldNotBeStoppedError, ValkeyServicesFailedToStartError, ValkeyWorkloadCommandError, ) @@ -170,3 +171,19 @@ def 
wait_for_services_to_be_alive(self, duration: float = 30, delay: float = 0.1 time.sleep(delay) return True + + @override + def stop(self) -> None: + try: + self.valkey.stop(services=[SNAP_SERVICE, SNAP_SENTINEL_SERVICE]) + except snap.SnapError as e: + logger.error("Failed to stop Valkey services: %s", e) + raise ValkeyServicesCouldNotBeStoppedError( + f"Failed to stop Valkey services: {e}" + ) from e + + if self.alive(): + logger.error("Valkey services are still alive after stop.") + raise ValkeyServicesCouldNotBeStoppedError( + "Valkey services are still alive after stop." + ) From ec578b7fd8505ea7ea0581efd8771e8437593142 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 20 Feb 2026 10:36:20 +0000 Subject: [PATCH 102/159] revert back is sentinel discovered argument --- src/events/base_events.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 8da55df..803639d 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -203,9 +203,7 @@ def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: event.defer() return - if not event.is_primary and not self.charm.sentinel_manager.is_sentinel_discovered( - self.charm.state.bind_address - ): + if not event.is_primary and not self.charm.sentinel_manager.is_sentinel_discovered(): logger.info("Sentinel service not yet discovered by other units. 
Deferring event.") self.charm.state.unit_server.update( {"start_state": StartState.STARTING_WAITING_SENTINEL.value} @@ -449,9 +447,7 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: event.defer() return - # Consider scaling to 0 if we need to clean databag - - # consider quorom when removing unit + # TODO consider quorom when removing unit # if unit has primary then failover if self.charm.sentinel_manager.get_primary_ip() == self.charm.state.bind_address: @@ -498,7 +494,6 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: # check health after scale down self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) - if not self.charm.sentinel_manager.verify_expected_replica_count(): logger.error("Not all sentinels see the expected number of replicas after scale down") event.defer() From a14839d9137a0859ef8e8e5d7568ccf0d8c279c6 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 24 Feb 2026 05:56:30 +0000 Subject: [PATCH 103/159] statuses for scale down --- src/common/client.py | 22 ++++++- src/common/locks.py | 117 +++++++++++++++++++++++++++++++------- src/core/models.py | 16 +++++- src/events/base_events.py | 45 ++++++++++----- src/literals.py | 1 + src/managers/cluster.py | 60 ++++++++++--------- src/managers/config.py | 2 +- src/managers/sentinel.py | 14 ++--- src/statuses.py | 19 +++++++ 9 files changed, 224 insertions(+), 72 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index b72a9f6..b3b4ff6 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -136,6 +136,26 @@ def set_value(self, hostname: str, key: str, value: str) -> bool: logger.error(f"Failed to set key {key} on Valkey server at {hostname}: {e}") return False + def get_value(self, hostname: str, key: str) -> str | None: + """Get the value of a key from the Valkey server. + + Args: + hostname (str): The hostname to connect to. + key (str): The key to retrieve. 
+ + Returns: + str | None: The value of the key if retrieved successfully, None otherwise. + """ + try: + output, err = self.exec_cli_command(["get", key], hostname=hostname) + if not output.strip(): + logger.warning(f"Key {key} not found on Valkey server at {hostname}.") + return None + return output.strip() + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to get key {key} from Valkey server at {hostname}: {e}") + return None + def is_replica_synced(self, hostname: str) -> bool: """Check if the replica is synced with the primary. @@ -335,7 +355,7 @@ def sentinel_reset_state(self, hostname: str) -> None: raise @retry( - stop=stop_after_attempt(3), + stop=stop_after_attempt(5), wait=wait_fixed(1), reraise=True, ) diff --git a/src/common/locks.py b/src/common/locks.py index d945532..6ef0ce4 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -4,10 +4,15 @@ """Collection of lock names for cluster operations.""" import logging -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING +from abc import abstractmethod +from typing import TYPE_CHECKING, Protocol, override + +from common.client import ValkeyClient +from core.cluster_state import ClusterState +from literals import CharmUsers if TYPE_CHECKING: + from charm import ValkeyCharm from core.cluster_state import ClusterState from core.models import ValkeyServer @@ -15,7 +20,32 @@ logger = logging.getLogger(__name__) -class Lock(ABC): +class Lockable(Protocol): + """Protocol for lockable operations.""" + + @property + def name(self) -> str: + """Get the name of the lock.""" + return self.__class__.__name__.lower() + + @abstractmethod + def request_lock(self) -> None: + """Request the lock for the local unit.""" + raise NotImplementedError + + @abstractmethod + def release_lock(self) -> None: + """Release the lock from the local unit.""" + raise NotImplementedError + + @property + @abstractmethod + def do_i_hold_lock(self) -> bool: + """Check if the local unit holds the lock.""" 
+ raise NotImplementedError + + +class DataBagLock(Lockable): """Base class for locks.""" unit_request_lock_atr_name: str @@ -24,11 +54,6 @@ class Lock(ABC): def __init__(self, state: "ClusterState") -> None: self.state = state - @property - def name(self) -> str: - """Get the name of the lock.""" - return self.__class__.__name__.lower() - @property def units_requesting_lock(self) -> list[str]: """Get the list of units requesting the start lock.""" @@ -60,8 +85,9 @@ def unit_with_lock(self) -> "ValkeyServer | None": @abstractmethod def is_lock_free_to_give(self) -> bool: """Check if the unit with the lock has completed its operation.""" - pass + raise NotImplementedError + @property def do_i_hold_lock(self) -> bool: """Check if the local unit holds the start lock.""" return self.state.unit_server.unit_name == getattr( @@ -109,7 +135,7 @@ def process(self) -> None: ) -class StartLock(Lock): +class StartLock(DataBagLock): """Lock for starting operations.""" unit_request_lock_atr_name = "request_start_lock" @@ -126,18 +152,69 @@ def is_lock_free_to_give(self) -> bool: ) -class ScaleDownLock(Lock): - """Lock for scale down operations.""" +class ScaleDownLock(Lockable): + """Lock for scale down operations. 
- unit_request_lock_atr_name = "request_scale_down_lock" - member_with_lock_atr_name = "scale_down_member" + This will use valkey to store the lock state and will check if the unit with the lock has completed its scale down operation + """ + + lock_key = "scale_down_lock" + + def __init__(self, charm: "ValkeyCharm") -> None: + self.charm = charm @property - def is_lock_free_to_give(self) -> bool: - """Check if the unit with the scale down lock has completed its operation.""" - scaling_down_unit = self.unit_with_lock + def client(self) -> ValkeyClient: + """Get a ValkeyClient instance.""" + return ValkeyClient( + username=CharmUsers.VALKEY_ADMIN.value, + password=self.charm.state.unit_server.valkey_admin_password, + workload=self.charm.workload, + ) + + @property + def unit_with_lock(self) -> str | None: + """Get the unit that currently holds the start lock.""" + return self.client.get_value(self.charm.sentinel_manager.get_primary_ip(), self.lock_key) + + @override + def request_lock(self) -> None: + """Request the lock for the local unit.""" + if not self.unit_with_lock: + self.client.set_value( + hostname=self.charm.sentinel_manager.get_primary_ip(), + key=self.lock_key, + value=self.charm.state.unit_server.unit_name, + ) + logger.info(f"{self.charm.state.unit_server.unit_name} requested {self.name} lock.") + else: + logger.info( + f"{self.charm.state.unit_server.unit_name} attempted to request {self.name} lock, but it is currently held by {self.unit_with_lock}." 
+ ) + + @property + def do_i_hold_lock(self) -> bool: + """Check if the local unit holds the lock.""" return ( - not self.state.cluster.model.scale_down_member - or not scaling_down_unit - or scaling_down_unit.model.request_scale_down_lock is False + self.unit_with_lock is not None + and self.unit_with_lock == self.charm.state.unit_server.unit_name ) + + def release_lock(self) -> None: + """Release the lock from the local unit.""" + if self.do_i_hold_lock: + self.client.set_value( + hostname=self.charm.sentinel_manager.get_primary_ip(), + key=self.lock_key, + value="", + ) + logger.info(f"{self.charm.state.unit_server.unit_name} released {self.name} lock.") + else: + logger.info( + f"{self.charm.state.unit_server.unit_name} attempted to release {self.name} lock, but it is currently held by {self.unit_with_lock if self.unit_with_lock else 'no one'}." + ) + + @property + def is_lock_free_to_give(self) -> bool: + """Check if the unit with the lock has completed its operation.""" + return not self.unit_with_lock diff --git a/src/core/models.py b/src/core/models.py index 5a01972..3946b7b 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -18,7 +18,7 @@ from pydantic import Field from typing_extensions import Annotated -from literals import CharmUsers, StartState +from literals import CharmUsers, ScaleDownState, StartState logger = logging.getLogger(__name__) @@ -123,6 +123,20 @@ def is_started(self) -> bool: """Check if the unit has started.""" return self.model.start_state == StartState.STARTED.value if self.model else False + @property + def is_being_removed(self) -> bool: + """Check if the unit is being removed from the cluster.""" + return ( + self.model.scale_down_state != ScaleDownState.NO_SCALE_DOWN.value + if self.model + else False + ) + + @property + def is_active(self) -> bool: + """Check if the unit is started and not being removed.""" + return self.is_started and not self.is_being_removed + @property def valkey_admin_password(self) -> str: 
"""Retrieve the password for the valkey admin user.""" diff --git a/src/events/base_events.py b/src/events/base_events.py index 803639d..57ff01b 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -6,6 +6,7 @@ import logging import socket +import time from typing import TYPE_CHECKING import ops @@ -33,7 +34,7 @@ StartState, Substrate, ) -from statuses import CharmStatuses, ClusterStatuses, StartStatuses +from statuses import CharmStatuses, ClusterStatuses, ScaleDownStatuses, StartStatuses if TYPE_CHECKING: from charm import ValkeyCharm @@ -122,7 +123,7 @@ def _on_start(self, event: ops.StartEvent) -> None: self.charm.state.unit_server.update({"start_state": StartState.WAITING_TO_START.value}) start_lock.request_lock() - if not start_lock.do_i_hold_lock(): + if not start_lock.do_i_hold_lock: logger.info("Waiting for lock to start") event.defer() return @@ -231,7 +232,7 @@ def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: if not self.charm.unit.is_leader(): return - for lock in [StartLock(self.charm.state), ScaleDownLock(self.charm.state)]: + for lock in [StartLock(self.charm.state)]: lock.process() def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: @@ -438,17 +439,34 @@ def _update_internal_users_password(self, secret_id: str) -> None: def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: """Handle removal of the data storage mount, e.g. 
when removing a unit.""" # get scale down lock - scale_down_lock = ScaleDownLock(self.charm.state) + scale_down_lock = ScaleDownLock(self.charm) self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.WAIT_FOR_LOCK}) + self.charm.status.set_running_status( + ScaleDownStatuses.WAIT_FOR_LOCK.value, + scope="unit", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) scale_down_lock.request_lock() - if not scale_down_lock.do_i_hold_lock(): + while not scale_down_lock.do_i_hold_lock: logger.debug("Waiting for lock to scale down") - event.defer() - return + time.sleep(5) + self.charm.state.statuses.delete( + ScaleDownStatuses.WAIT_FOR_LOCK.value, + scope="unit", + component=self.charm.cluster_manager.name, + ) # TODO consider quorom when removing unit + self.charm.status.set_running_status( + ScaleDownStatuses.SCALING_DOWN.value, + scope="unit", + component_name=self.charm.cluster_manager.name, + statuses_state=self.charm.state.statuses, + ) + # if unit has primary then failover if self.charm.sentinel_manager.get_primary_ip() == self.charm.state.bind_address: self.charm.state.unit_server.update( @@ -466,8 +484,7 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: ) except SentinelFailoverError: logger.error("Failed to trigger failover before scale down") - event.defer() - return + raise # stop valkey and sentinel processes self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.STOP_SERVICES}) @@ -475,8 +492,7 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: self.charm.workload.stop() except ValkeyServicesCouldNotBeStoppedError: logger.error("Failed to stop Valkey services before scale down") - event.defer() - return + raise # reset sentinel states on other units self.charm.state.unit_server.update( @@ -489,15 +505,14 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: 
self.charm.sentinel_manager.reset_sentinel_states() except (ValkeyWorkloadCommandError, CannotSeeAllActiveSentinelsError): logger.error("Failed to reset sentinel states before scale down") - event.defer() - return + raise # check health after scale down self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) if not self.charm.sentinel_manager.verify_expected_replica_count(): logger.error("Not all sentinels see the expected number of replicas after scale down") - event.defer() - return + raise # release lock + self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) scale_down_lock.release_lock() diff --git a/src/literals.py b/src/literals.py index ab41959..dbe7383 100644 --- a/src/literals.py +++ b/src/literals.py @@ -96,3 +96,4 @@ class ScaleDownState(StrEnum): STOP_SERVICES = "stopped_services" RESET_SENTINEL = "reset_sentinel" HEALTH_CHECK = "health_check" + GOING_AWAY = "going_away" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 5076ff0..3b6d2b4 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -18,8 +18,8 @@ ) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import CharmUsers, StartState -from statuses import CharmStatuses, StartStatuses +from literals import CharmUsers, ScaleDownState, StartState +from statuses import CharmStatuses, ScaleDownStatuses, StartStatuses logger = logging.getLogger(__name__) @@ -119,38 +119,44 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje if not self.state.cluster.model or not self.state.unit_server.model: return status_list or [CharmStatuses.ACTIVE_IDLE.value] + if start_status := self._get_start_status(): + status_list.append(start_status) + + if scale_down_status := self._get_scale_down_status(): + status_list.append(scale_down_status) + + return status_list or [CharmStatuses.ACTIVE_IDLE.value] + + def _get_start_status(self) -> 
StatusObject | None: + """Get the current start status of the unit.""" match self.state.unit_server.model.start_state: case StartState.NOT_STARTED.value: - status_list.append( - StartStatuses.SERVICE_NOT_STARTED.value, - ) + if ( + self.state.unit_server.model.scale_down_state + == ScaleDownState.NO_SCALE_DOWN.value + ): + return StartStatuses.SERVICE_NOT_STARTED.value case StartState.WAITING_FOR_PRIMARY_START.value: - status_list.append( - StartStatuses.WAITING_FOR_PRIMARY_START.value, - ) + return StartStatuses.WAITING_FOR_PRIMARY_START.value case StartState.WAITING_TO_START.value: - status_list.append( - StartStatuses.WAITING_TO_START.value, - ) + return StartStatuses.WAITING_TO_START.value case StartState.CONFIGURATION_ERROR.value: - status_list.append( - StartStatuses.CONFIGURATION_ERROR.value, - ) + return StartStatuses.CONFIGURATION_ERROR.value case StartState.STARTING_WAITING_VALKEY.value: - status_list.append( - StartStatuses.SERVICE_STARTING.value, - ) + return StartStatuses.SERVICE_STARTING.value case StartState.STARTING_WAITING_SENTINEL.value: - status_list.append( - StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value, - ) + return StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value case StartState.STARTING_WAITING_REPLICA_SYNC.value: - status_list.append( - StartStatuses.WAITING_FOR_REPLICA_SYNC.value, - ) + return StartStatuses.WAITING_FOR_REPLICA_SYNC.value case StartState.ERROR_ON_START.value: - status_list.append( - StartStatuses.ERROR_ON_START.value, - ) + return StartStatuses.ERROR_ON_START.value - return status_list or [CharmStatuses.ACTIVE_IDLE.value] + return None + + def _get_scale_down_status(self) -> StatusObject | None: + """Get the current scale down status of the unit.""" + match self.state.unit_server.model.scale_down_state: + case ScaleDownState.GOING_AWAY.value: + return ScaleDownStatuses.GOING_AWAY.value + + return None diff --git a/src/managers/config.py b/src/managers/config.py index 5c74c80..6966e66 100644 --- 
a/src/managers/config.py +++ b/src/managers/config.py @@ -94,7 +94,7 @@ def _generate_replica_config(self, primary_ip: str) -> dict[str, str]: CharmUsers.VALKEY_REPLICA.value, "" ), } - if primary_ip != self.state.unit_server.model.private_ip: + if primary_ip != self.state.bind_address: # set replicaof logger.debug("Setting replicaof to primary %s", primary_ip) replica_config["replicaof"] = f"{primary_ip} {CLIENT_PORT}" diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 250234b..ec311dc 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -51,11 +51,11 @@ def admin_password(self) -> str: ) def is_sentinel_discovered(self) -> bool: """Check if the sentinel of the local unit was discovered by the other sentinels in the cluster.""" - # list of active sentinels: units with started flag true + # list of active sentinels: units with started flag true and not being removed active_sentinels = [ unit.model.private_ip for unit in self.state.servers - if unit.is_started and unit.model.private_ip != self.state.unit_server.model.private_ip + if unit.is_active and unit.model.private_ip != self.state.bind_address ] client = ValkeyClient( @@ -71,7 +71,7 @@ def is_sentinel_discovered(self) -> bool: command=["sentinel", "sentinels", PRIMARY_NAME], hostname=sentinel_ip, ) - if self.state.unit_server.model.private_ip not in output: + if self.state.bind_address not in output: logger.info(f"Sentinel at {sentinel_ip} has not discovered this sentinel") return False except ValkeyWorkloadCommandError: @@ -81,7 +81,7 @@ def is_sentinel_discovered(self) -> bool: def get_primary_ip(self) -> str | None: """Get the IP address of the primary node in the cluster.""" - started_servers = [unit for unit in self.state.servers if unit.is_started] + started_servers = [unit for unit in self.state.servers if unit.is_active] client = ValkeyClient( username=self.admin_user, @@ -148,7 +148,7 @@ def reset_sentinel_states(self) -> None: connect_to="sentinel", ) - 
active_sentinels = [unit for unit in self.state.servers if unit.is_started] + active_sentinels = [unit for unit in self.state.servers if unit.is_active] logger.debug( "Resetting sentinel states on %s", str([unit.unit_name for unit in active_sentinels]) ) @@ -187,7 +187,7 @@ def sentinel_sees_all_others(self, target_sentinel_ip: str) -> bool: other_active_sentinels = [ unit.model.private_ip for unit in self.state.servers - if unit.is_started and unit.model.private_ip != target_sentinel_ip + if unit.is_active and unit.model.private_ip != target_sentinel_ip ] logger.debug( @@ -223,7 +223,7 @@ def verify_expected_replica_count(self) -> bool: connect_to="sentinel", ) - units_started = [unit for unit in self.state.servers if unit.is_started] + units_started = [unit for unit in self.state.servers if unit.is_active] # all started servers except primary are expected to be replicas expected_replicas = len(units_started) - 1 logger.debug( diff --git a/src/statuses.py b/src/statuses.py index f0a677b..d7faa1e 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -73,3 +73,22 @@ class StartStatuses(Enum): status="blocked", message="Error occurred during service start, check logs for details", ) + + +class ScaleDownStatuses(Enum): + """Collection of possible statuses related to scale down operations.""" + + WAIT_FOR_LOCK = StatusObject( + status="maintenance", + message="Waiting for lock to perform scale down operations...", + running="async", + ) + SCALING_DOWN = StatusObject( + status="maintenance", + message="Performing scale down operations...", + running="async", + ) + GOING_AWAY = StatusObject( + status="maintenance", + message="Waiting for unit to be removed by juju...", + ) From 487ec644bc0bbf5e7b072316f472959bd32e0487 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 08:32:34 +0000 Subject: [PATCH 104/159] refactor client tp separate valkey and sentinel and use json where possible --- src/common/client.py | 395 ++++++++++++++++++++----------------------- 
1 file changed, 185 insertions(+), 210 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index b3b4ff6..5e43c3b 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -3,10 +3,11 @@ """ValkeyClient utility class to connect to valkey servers.""" +import json import logging -from typing import Literal +from typing import Any -from tenacity import retry, stop_after_attempt, wait_fixed +from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed from common.exceptions import ValkeyWorkloadCommandError from core.base_workload import WorkloadBase @@ -15,53 +16,96 @@ logger = logging.getLogger(__name__) -class ValkeyClient: +class CliClient: """Handle valkey client connections.""" + port: int = CLIENT_PORT + def __init__( self, username: str, password: str, workload: WorkloadBase, - connect_to: Literal["valkey", "sentinel"] = "valkey", ): self.username = username self.password = password self.workload = workload - self.connect_to = connect_to def exec_cli_command( self, command: list[str], hostname: str, - ) -> tuple[str, str | None]: + json_output: bool = True, + ) -> Any: """Execute a Valkey CLI command on the server. Args: command (list[str]): The CLI command to execute, as a list of arguments. hostname (str): The hostname to connect to. + json_output (bool): Whether to parse the output as JSON. Returns: - tuple[str, str | None]: The standard output and standard error from the command execution. + Any: The output from the command execution, parsed as JSON if requested. Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute. 
""" - port = CLIENT_PORT if self.connect_to == "valkey" else SENTINEL_PORT - cli_command: list[str] = [ - self.workload.cli, - "-h", - hostname, - "-p", - str(port), - "--user", - self.username, - "--pass", - self.password, - ] + command - logger.debug(f"Executing CLI command on {hostname}: {cli_command}") + port = self.port + cli_command: list[str] = ( + [ + self.workload.cli, + "--no-auth-warning", + "-h", + hostname, + "-p", + str(port), + "--user", + self.username, + "--pass", + self.password, + ] + + (["--json"] if json_output else []) + + command + ) output, error = self.workload.exec(cli_command) - return output.strip(), error + output = output.strip() + if error: + logger.error( + "Error executing CLI command on Valkey server at %s: stderr: %s", + hostname, + error, + ) + raise ValkeyWorkloadCommandError( + f"Error executing CLI command on Valkey server at {hostname}: stderr: {error}" + ) + + if json_output: + try: + output = json.loads(output) + except json.JSONDecodeError as e: + logger.error( + "Failed to parse JSON output from CLI command on Valkey server at %s: %s", + hostname, + output, + ) + raise ValkeyWorkloadCommandError( + f"Failed to parse JSON output from CLI command on Valkey server at {hostname}: {output}" + ) from e + return output + + +class ValkeyClient(CliClient): + """Handle valkey client connections.""" + + port: int = CLIENT_PORT + + def __init__( + self, + username: str, + password: str, + workload: WorkloadBase, + ): + super().__init__(username, password, workload) def ping(self, hostname: str) -> bool: """Ping the Valkey server to check if it's responsive. @@ -73,8 +117,7 @@ def ping(self, hostname: str) -> bool: bool: True if the server responds to the ping command, False otherwise. 
""" try: - output, _ = self.exec_cli_command(["ping"], hostname=hostname) - return "PONG" in output + return "PONG" in self.exec_cli_command(["ping"], hostname=hostname, json_output=False) except ValkeyWorkloadCommandError: return False @@ -90,7 +133,10 @@ def get_persistence_info(self, hostname: str) -> dict[str, str] | None: Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute. """ - output, _ = self.exec_cli_command(["info", "persistence"], hostname=hostname) + # command does not have a JSON output format, so we need to parse the raw output + output = self.exec_cli_command( + ["info", "persistence"], hostname=hostname, json_output=False + ) values = {} if not output.strip(): logger.warning(f"No persistence info found on Valkey server at {hostname}.") @@ -119,24 +165,13 @@ def set_value(self, hostname: str, key: str, value: str) -> bool: Returns: bool: True if the command executed successfully, False otherwise. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - try: - output, err = self.exec_cli_command(["set", key, value], hostname=hostname) - if output.strip() == "OK": - return True - logger.error( - "Failed to set key %s on Valkey server at %s: stdout: %s, stderr: %s", - key, - hostname, - output, - err, - ) - return False - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to set key {key} on Valkey server at {hostname}: {e}") - return False + return self.exec_cli_command(["set", key, value], hostname=hostname) == "OK" - def get_value(self, hostname: str, key: str) -> str | None: + def get_value(self, hostname: str, key: str) -> str: """Get the value of a key from the Valkey server. Args: @@ -144,17 +179,12 @@ def get_value(self, hostname: str, key: str) -> str | None: key (str): The key to retrieve. Returns: - str | None: The value of the key if retrieved successfully, None otherwise. + str: The value of the key if retrieved successfully. 
+ + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - try: - output, err = self.exec_cli_command(["get", key], hostname=hostname) - if not output.strip(): - logger.warning(f"Key {key} not found on Valkey server at {hostname}.") - return None - return output.strip() - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to get key {key} from Valkey server at {hostname}: {e}") - return None + return self.exec_cli_command(["get", key], hostname=hostname) def is_replica_synced(self, hostname: str) -> bool: """Check if the replica is synced with the primary. @@ -164,20 +194,12 @@ def is_replica_synced(self, hostname: str) -> bool: Returns: bool: True if the replica is synced with the primary, False otherwise. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - try: - output, _ = self.exec_cli_command(["role"], hostname=hostname) - output_parts = output.strip().split() - return ( - bool(output_parts) - and output_parts[0] == "slave" - and output_parts[3] == "connected" - ) - except ValkeyWorkloadCommandError: - logger.warning( - "Could not determine replica sync status from Valkey server at %s.", hostname - ) - return False + output = self.exec_cli_command(["role"], hostname=hostname) + return output[0] == "slave" and output[3] == "connected" def config_set(self, hostname: str, parameter: str, value: str) -> bool: """Set a runtime configuration parameter on the Valkey server. @@ -189,26 +211,15 @@ def config_set(self, hostname: str, parameter: str, value: str) -> bool: Returns: bool: True if the command executed successfully, False otherwise. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. 
""" - try: - output, err = self.exec_cli_command( - ["config", "set", parameter, value], hostname=hostname - ) - if output.strip() == "OK": - return True - logger.error( - "Failed to set config %s on Valkey server at %s: stdout: %s, stderr: %s", - parameter, - hostname, - output, - err, - ) - return False - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to set config {parameter} on Valkey server at {hostname}: {e}") - return False + return ( + self.exec_cli_command(["config", "set", parameter, value], hostname=hostname) == "OK" + ) - def load_acl(self, hostname: str) -> bool: + def acl_load(self, hostname: str) -> bool: """Load the ACL file into the Valkey server. Args: @@ -216,88 +227,59 @@ def load_acl(self, hostname: str) -> bool: Returns: bool: True if the ACL file was loaded successfully, False otherwise. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - try: - output, err = self.exec_cli_command(["acl", "load"], hostname=hostname) - if output.strip() == "OK": - return True - logger.error( - "Failed to load ACL file on Valkey server at %s: stdout: %s, stderr: %s", - hostname, - output, - err, - ) - return False - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to load ACL file on Valkey server at {hostname}: {e}") - return False + return self.exec_cli_command(["acl", "load"], hostname=hostname) == "OK" + + +class SentinelClient(CliClient): + """Handle sentinel-specific client connections.""" + + port: int = SENTINEL_PORT + + def __init__( + self, + username: str, + password: str, + workload: WorkloadBase, + ): + super().__init__(username, password, workload) - def sentinel_get_primary_ip(self, hostname: str) -> str | None: + def get_primary_ip(self, hostname: str) -> str: """Get the primary IP address from the sentinel. Args: hostname (str): The hostname to connect to. Returns: - str | None: The primary IP address if retrieved successfully, None otherwise. 
+ str: The primary IP address if retrieved successfully. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - if not self.connect_to == "sentinel": - logger.error( - "Attempted to get primary IP from sentinel while client is configured to connect to valkey." - ) - raise ValueError("Client is not configured to connect to sentinel.") - try: - output, _ = self.exec_cli_command( - command=["sentinel", "get-master-addr-by-name", PRIMARY_NAME], hostname=hostname - ) - output_parts = output.strip().split() - if len(output_parts) != 2: - logger.error( - "Unexpected output format when getting primary IP from sentinel at %s: %s", - hostname, - output, - ) - return None - return output_parts[0] - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to get primary IP from sentinel at {hostname}: {e}") - return None + return self.exec_cli_command( + command=["sentinel", "get-primary-addr-by-name", PRIMARY_NAME], hostname=hostname + )[0] - def sentinel_get_master_info(self, hostname: str) -> dict[str, str] | None: - """Get the master info from the sentinel. + def get_primary_info(self, hostname: str) -> dict[str, str]: + r"""Get the primary info from the sentinel. Args: hostname (str): The hostname to connect to. Returns: - dict[str, str] | None: The master info if retrieved successfully, None otherwise. + dict[str, str]: The primary info if retrieved successfully. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - if not self.connect_to == "sentinel": - logger.error( - "Attempted to get master info from sentinel while client is configured to connect to valkey." 
- ) - raise ValueError("Client is not configured to connect to sentinel.") - try: - output, _ = self.exec_cli_command( - command=["sentinel", "master", PRIMARY_NAME], hostname=hostname - ) - if not output.strip(): - logger.warning(f"No master info found in sentinel at {hostname}.") - return None - info_parts = output.strip().split() - if len(info_parts) % 2 != 0: - logger.error( - "Unexpected output format when getting master info from sentinel at %s: %s", - hostname, - output, - ) - return None - return {info_parts[i]: info_parts[i + 1] for i in range(0, len(info_parts), 2)} - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to get master info from sentinel at {hostname}: {e}") - return None + return self.exec_cli_command( + command=["sentinel", "primary", PRIMARY_NAME], hostname=hostname + ) - def sentinel_failover(self, hostname: str): + def trigger_failover(self, hostname: str) -> bool: """Trigger a failover through the sentinel. Args: @@ -305,90 +287,83 @@ def sentinel_failover(self, hostname: str): Returns: bool: True if the failover command was executed successfully, False otherwise. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - if not self.connect_to == "sentinel": - logger.error( - "Attempted to trigger failover through sentinel while client is configured to connect to valkey." 
- ) - raise ValueError("Client is not configured to connect to sentinel.") - try: - output, err = self.exec_cli_command( + return ( + self.exec_cli_command( command=["sentinel", "failover", PRIMARY_NAME, "coordinated"], hostname=hostname, ) - if "OK" not in output.strip(): - logger.error( - "Failed to trigger failover through sentinel at %s: stdout: %s, stderr: %s", - hostname, - output, - err, - ) - raise ValkeyWorkloadCommandError( - f"Failed to trigger failover through sentinel at {hostname}: stdout, stderr: {(output, err)}" - ) - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to trigger failover through sentinel at {hostname}: {e}") - raise + == "OK" + ) - def sentinel_reset_state(self, hostname: str) -> None: + @retry( + stop=stop_after_attempt(5), + wait=wait_fixed(1), + retry=retry_if_result(lambda in_progress: in_progress), + retry_error_callback=lambda _: True, + ) + def is_failover_in_progress(self, hostname: str) -> bool: + """Check if a failover is in progress through the sentinel. + + Args: + hostname (str): The hostname to connect to. + + Returns: + bool: True if a failover is in progress, False otherwise. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. + """ + return "failover_in_progress" in self.get_primary_info(hostname=hostname).get("flags", "") + + def reset(self, hostname: str) -> None: """Reset the sentinel state for the primary. Args: hostname (str): The hostname to connect to. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output """ - if not self.connect_to == "sentinel": - logger.error( - "Attempted to reset sentinel state through sentinel while client is configured to connect to valkey." 
- ) - raise ValueError("Client is not configured to connect to sentinel.") - try: - output, err = self.exec_cli_command( - command=["sentinel", "reset", PRIMARY_NAME], - hostname=hostname, - ) - if output != "1": - raise ValkeyWorkloadCommandError( - f"Failed to reset sentinel state through sentinel at {hostname}: stdout, stderr: {(output, err)}" - ) - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to reset sentinel state through sentinel at {hostname}: {e}") - raise + self.exec_cli_command( + command=["sentinel", "reset", PRIMARY_NAME], + hostname=hostname, + ) @retry( stop=stop_after_attempt(5), wait=wait_fixed(1), reraise=True, ) - def sentinel_get_replica_info(self, hostname: str) -> str: + def replicas_primary(self, hostname: str) -> list[dict[str, str]]: """Get the replicas information of the primary from sentinel. Args: hostname (str): The hostname to connect to. Returns: - str | None: The output of the "sentinel replicas" command if retrieved successfully, None otherwise. + (list[dict[str, str]]): The list of replicas with their information. """ - if not self.connect_to == "sentinel": - logger.error( - "Attempted to get replica info from sentinel while client is configured to connect to valkey." - ) - raise ValueError("Client is not configured to connect to sentinel.") - try: - output, err = self.exec_cli_command( - command=["sentinel", "replicas", PRIMARY_NAME], - hostname=hostname, - ) - logger.debug( - "Output of 'sentinel replicas' command from sentinel at %s: stdout, stderr: %s", - hostname, - (output, err), - ) - if not output.strip(): - logger.warning(f"No replica info found in sentinel at {hostname}.") - raise ValkeyWorkloadCommandError( - f"No replica info found in sentinel at {hostname}." 
- ) - return output.strip() - except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to get replica info from sentinel at {hostname}: {e}") - raise + return self.exec_cli_command( + command=["sentinel", "replicas", PRIMARY_NAME], + hostname=hostname, + ) + + def sentinels_primary(self, hostname: str) -> list[dict[str, str]]: + """Get the list of sentinels that see the same primary from the sentinel. + + Args: + hostname (str): The hostname to connect to. + + Returns: + (list[dict[str, str]]): result of `sentinel sentinels primary` structured into a list of dicts + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. + """ + return self.exec_cli_command( + command=["sentinel", "sentinels", PRIMARY_NAME], hostname=hostname + ) From b300ccd63aa7682567ce7314135e64741debc7ff Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 08:32:53 +0000 Subject: [PATCH 105/159] only recompute model when writing to databag --- src/core/models.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/core/models.py b/src/core/models.py index 3946b7b..d8655ef 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -66,6 +66,7 @@ def __init__( self.relation = relation self.data_interface = data_interface self.component = component + self.model = self.data_interface.build_model(self.relation.id) if self.relation else None def update(self, items: dict[str, Any]) -> None: """Write to relation data.""" @@ -78,14 +79,13 @@ def update(self, items: dict[str, Any]) -> None: delete_fields = [key for key in items if not items[key]] update_content = {k: items[k] for k in items if k not in delete_fields} - model = self.data_interface.build_model(self.relation.id) for field, value in update_content.items(): - setattr(model, field.replace("-", "_"), value) + setattr(self.model, field.replace("-", "_"), value) for field in delete_fields: - setattr(model, field.replace("-", "_"), None) + 
setattr(self.model, field.replace("-", "_"), None) - self.data_interface.write_model(self.relation.id, model) + self.data_interface.write_model(self.relation.id, self.model) @final @@ -103,11 +103,6 @@ def __init__( self.data_interface = data_interface self.unit = component - @property - def model(self) -> PeerUnitModel | None: - """The peer relation model for this unit.""" - return self.data_interface.build_model(self.relation.id) if self.relation else None - @property def unit_id(self) -> int: """The id of the unit from the unit name.""" @@ -159,11 +154,6 @@ def __init__( self.app = component self.data_interface = data_interface - @property - def model(self) -> PeerAppModel | None: - """The peer relation model for this application.""" - return self.data_interface.build_model(self.relation.id) if self.relation else None - @property def internal_users_credentials(self) -> dict[str, str]: """Retrieve the credentials for the internal admin users.""" From 40789bfe6dd89649e751f85f8b44d6b405c7fdaa Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 11:53:07 +0000 Subject: [PATCH 106/159] client refactoring and added delief --- src/common/client.py | 48 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 5e43c3b..0ad7e5d 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -155,13 +155,14 @@ def get_persistence_info(self, hostname: str) -> dict[str, str] | None: values[values_parts[0]] = values_parts[1] return values - def set_value(self, hostname: str, key: str, value: str) -> bool: + def set(self, hostname: str, key: str, value: str, additional_args: list[str] = []) -> bool: """Set a key-value pair on the Valkey server. Args: hostname (str): The hostname to connect to. key (str): The key to set. value (str): The value to set for the key. + additional_args (list[str]): Additional arguments to include in the CLI command. 
Default is an empty list. Returns: bool: True if the command executed successfully, False otherwise. @@ -169,9 +170,11 @@ def set_value(self, hostname: str, key: str, value: str) -> bool: Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - return self.exec_cli_command(["set", key, value], hostname=hostname) == "OK" + return ( + self.exec_cli_command(["set", key, value] + additional_args, hostname=hostname) == "OK" + ) - def get_value(self, hostname: str, key: str) -> str: + def get(self, hostname: str, key: str) -> str: """Get the value of a key from the Valkey server. Args: @@ -186,6 +189,22 @@ def get_value(self, hostname: str, key: str) -> str: """ return self.exec_cli_command(["get", key], hostname=hostname) + def delifeq(self, hostname: str, key: str, value: str) -> str: + """Delete a key from the Valkey server if it is equal to a specific value. + + Args: + hostname (str): The hostname to connect to. + key (str): The key to delete if the value matches. + value (str): The value to compare against before deleting the key. + + Returns: + str: The result of the delifeq command. + + Raises: + ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. + """ + return self.exec_cli_command(["delifeq", key, value], hostname=hostname, json_output=False) + def is_replica_synced(self, hostname: str) -> bool: """Check if the replica is synced with the primary. @@ -247,6 +266,20 @@ def __init__( ): super().__init__(username, password, workload) + def ping(self, hostname: str) -> bool: + """Ping the Valkey server to check if it's responsive. + + Args: + hostname (str): The hostname to connect to. + + Returns: + bool: True if the server responds to the ping command, False otherwise. 
+ """ + try: + return "PONG" in self.exec_cli_command(["ping"], hostname=hostname, json_output=False) + except ValkeyWorkloadCommandError: + return False + def get_primary_ip(self, hostname: str) -> str: """Get the primary IP address from the sentinel. @@ -270,7 +303,7 @@ def get_primary_info(self, hostname: str) -> dict[str, str]: hostname (str): The hostname to connect to. Returns: - dict[str, str]: The primary info if retrieved successfully. + (dict[str, str]): The primary info if retrieved successfully. Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. @@ -347,10 +380,11 @@ def replicas_primary(self, hostname: str) -> list[dict[str, str]]: Returns: (list[dict[str, str]]): The list of replicas with their information. """ - return self.exec_cli_command( - command=["sentinel", "replicas", PRIMARY_NAME], - hostname=hostname, + replicas = self.exec_cli_command( + command=["sentinel", "replicas", PRIMARY_NAME], hostname=hostname ) + logger.debug("Retrieved replicas information from sentinel at %s: %s", hostname, replicas) + return replicas def sentinels_primary(self, hostname: str) -> list[dict[str, str]]: """Get the list of sentinels that see the same primary from the sentinel. 
From 46cb2a93ab18c745423519e253a1d0f8ec12bf44 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 11:53:35 +0000 Subject: [PATCH 107/159] refactor locks to adhere to ux of advanced rollingops --- src/common/exceptions.py | 4 ++ src/common/locks.py | 97 +++++++++++++++++++++++++++------------- 2 files changed, 71 insertions(+), 30 deletions(-) diff --git a/src/common/exceptions.py b/src/common/exceptions.py index 2936558..ed8fa4d 100644 --- a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -20,6 +20,10 @@ class ValkeyConfigSetError(ValkeyClientError): """Custom Exception if setting configuration on valkey cluster fails.""" +class ValkeyCannotGetPrimaryIPError(ValkeyClientError): + """Custom Exception if the primary IP cannot be determined from the sentinels.""" + + class ValkeyWorkloadCommandError(Exception): """Custom Exception if any workload-related command fails.""" diff --git a/src/common/locks.py b/src/common/locks.py index 6ef0ce4..9b31a4b 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -4,10 +4,12 @@ """Collection of lock names for cluster operations.""" import logging +import time from abc import abstractmethod from typing import TYPE_CHECKING, Protocol, override from common.client import ValkeyClient +from common.exceptions import ValkeyWorkloadCommandError from core.cluster_state import ClusterState from literals import CharmUsers @@ -29,12 +31,12 @@ def name(self) -> str: return self.__class__.__name__.lower() @abstractmethod - def request_lock(self) -> None: + def request_lock(self) -> bool: """Request the lock for the local unit.""" raise NotImplementedError @abstractmethod - def release_lock(self) -> None: + def release_lock(self) -> bool: """Release the lock from the local unit.""" raise NotImplementedError @@ -94,7 +96,7 @@ def do_i_hold_lock(self) -> bool: self.state.cluster.model, self.member_with_lock_atr_name, "" ) - def request_lock(self) -> None: + def request_lock(self) -> bool: """Request the lock for 
the local unit.""" self.state.unit_server.update( { @@ -107,7 +109,9 @@ def request_lock(self) -> None: ) self.process() - def release_lock(self) -> None: + return self.do_i_hold_lock + + def release_lock(self) -> bool: """Release the lock from the local unit.""" self.state.unit_server.update( { @@ -120,6 +124,8 @@ def release_lock(self) -> None: ) self.process() + return True + def process(self) -> None: """Process the lock requests and update the unit with the lock.""" if not self.state.unit_server.unit.is_leader(): @@ -172,49 +178,80 @@ def client(self) -> ValkeyClient: workload=self.charm.workload, ) - @property - def unit_with_lock(self) -> str | None: + def get_unit_with_lock(self, primary_ip: str | None = None) -> str | None: """Get the unit that currently holds the start lock.""" - return self.client.get_value(self.charm.sentinel_manager.get_primary_ip(), self.lock_key) + return self.client.get( + primary_ip or self.charm.sentinel_manager.get_primary_ip(), self.lock_key + ) @override - def request_lock(self) -> None: + def request_lock(self, timeout: int | None = None) -> bool: """Request the lock for the local unit.""" - if not self.unit_with_lock: - self.client.set_value( - hostname=self.charm.sentinel_manager.get_primary_ip(), - key=self.lock_key, - value=self.charm.state.unit_server.unit_name, + logger.debug(f"{self.charm.state.unit_server.unit_name} is requesting {self.name} lock.") + retry_until = time.time() + timeout if timeout else None + primary_ip = self.charm.sentinel_manager.get_primary_ip() + if self.get_unit_with_lock(primary_ip) == self.charm.state.unit_server.unit_name: + logger.debug( + f"{self.charm.state.unit_server.unit_name} already holds {self.name} lock. No need to request it again." 
) - logger.info(f"{self.charm.state.unit_server.unit_name} requested {self.name} lock.") - else: + return True + + while True: + try: + if self.client.set( + hostname=primary_ip, + key=self.lock_key, + value=self.charm.state.unit_server.unit_name, + additional_args=[ + "NX", + "PX", + str( + 5 * 60 * 1000 + ), # Set the lock with a TTL of 5 minutes to prevent deadlocks + ], + ): + logger.debug( + f"{self.charm.state.unit_server.unit_name} acquired {self.name} lock." + ) + return True + except ValkeyWorkloadCommandError: + logger.warning( + f"{self.charm.state.unit_server.unit_name} failed to acquire {self.name} lock due to a workload command error. Retrying..." + ) + if retry_until and time.time() > retry_until: + logger.warning( + f"{self.charm.state.unit_server.unit_name} failed to acquire {self.name} lock within timeout. Giving up." + ) + return False logger.info( - f"{self.charm.state.unit_server.unit_name} attempted to request {self.name} lock, but it is currently held by {self.unit_with_lock}." + f"{self.charm.state.unit_server.unit_name} failed to acquire {self.name} lock. Retrying in 5 seconds." 
) + time.sleep(5) + # update the primary ip in case a failover happens when we are waiting to acquire the lock + primary_ip = self.charm.sentinel_manager.get_primary_ip() @property def do_i_hold_lock(self) -> bool: """Check if the local unit holds the lock.""" + unit_with_lock = self.get_unit_with_lock() return ( - self.unit_with_lock is not None - and self.unit_with_lock == self.charm.state.unit_server.unit_name + unit_with_lock is not None and unit_with_lock == self.charm.state.unit_server.unit_name ) - def release_lock(self) -> None: + def release_lock(self) -> bool: """Release the lock from the local unit.""" - if self.do_i_hold_lock: - self.client.set_value( + if ( + self.client.delifeq( hostname=self.charm.sentinel_manager.get_primary_ip(), key=self.lock_key, - value="", + value=self.charm.state.unit_server.unit_name, ) - logger.info(f"{self.charm.state.unit_server.unit_name} released {self.name} lock.") + == "1" + ): + logger.debug(f"{self.charm.state.unit_server.unit_name} released {self.name} lock.") + return True else: - logger.info( - f"{self.charm.state.unit_server.unit_name} attempted to release {self.name} lock, but it is currently held by {self.unit_with_lock if self.unit_with_lock else 'no one'}." + logger.warning( + f"{self.charm.state.unit_server.unit_name} failed to release {self.name} lock. It may not have held the lock or it may have already been released." 
) - - @property - def is_lock_free_to_give(self) -> bool: - """Check if the unit with the lock has completed its operation.""" - return not self.unit_with_lock + return False From 01e8a73963fd253fa9ba6807deb0946cd4e905b1 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 11:53:59 +0000 Subject: [PATCH 108/159] refactor managers to use the new clients --- src/managers/cluster.py | 2 +- src/managers/sentinel.py | 161 ++++++++++++++++++++++----------------- 2 files changed, 91 insertions(+), 72 deletions(-) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 3b6d2b4..5f7dc84 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -47,7 +47,7 @@ def reload_acl_file(self) -> None: password=self.admin_password, workload=self.workload, ) - if not client.load_acl(hostname=self.state.bind_address): + if not client.acl_load(hostname=self.state.bind_address): raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") def update_primary_auth(self) -> None: diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index ec311dc..e9f4269 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -11,15 +11,16 @@ from data_platform_helpers.advanced_statuses.types import Scope from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed -from common.client import ValkeyClient +from common.client import SentinelClient from common.exceptions import ( CannotSeeAllActiveSentinelsError, SentinelFailoverError, + ValkeyCannotGetPrimaryIPError, ValkeyWorkloadCommandError, ) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import PRIMARY_NAME, CharmUsers +from literals import CharmUsers from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -45,7 +46,7 @@ def admin_password(self) -> str: @retry( wait=wait_fixed(5), - stop=stop_after_attempt(5), + stop=stop_after_attempt(6), retry=retry_if_result(lambda result: result is 
False), retry_error_callback=lambda _: False, ) @@ -58,68 +59,80 @@ def is_sentinel_discovered(self) -> bool: if unit.is_active and unit.model.private_ip != self.state.bind_address ] - client = ValkeyClient( + client = SentinelClient( username=self.admin_user, password=self.admin_password, workload=self.workload, - connect_to="sentinel", ) for sentinel_ip in active_sentinels: try: - output, _ = client.exec_cli_command( - command=["sentinel", "sentinels", PRIMARY_NAME], - hostname=sentinel_ip, - ) - if self.state.bind_address not in output: - logger.info(f"Sentinel at {sentinel_ip} has not discovered this sentinel") + discovered_sentinels = { + sentinel["ip"] for sentinel in client.sentinels_primary(hostname=sentinel_ip) + } + if self.state.bind_address not in discovered_sentinels: + logger.warning( + f"Sentinel at {sentinel_ip} does not see local sentinel at {self.state.bind_address}." + ) return False + except ValkeyWorkloadCommandError: logger.warning(f"Could not query sentinel at {sentinel_ip} for primary discovery.") return False return True - def get_primary_ip(self) -> str | None: - """Get the IP address of the primary node in the cluster.""" - started_servers = [unit for unit in self.state.servers if unit.is_active] + def get_primary_ip(self) -> str: + """Get the IP address of the primary node in the cluster. - client = ValkeyClient( + This method queries the sentinels in the cluster for the primary information and returns the primary's IP address. + + Raises: + ValkeyWorkloadCommandError: If the CLI command to get primary information fails on all sentinels. 
+ """ + started_servers = [unit.model.private_ip for unit in self.state.servers if unit.is_active] + + client = SentinelClient( username=self.admin_user, password=self.admin_password, workload=self.workload, - connect_to="sentinel", ) - for unit in started_servers: - if primary_ip := client.sentinel_get_primary_ip(hostname=unit.model.private_ip): - logger.info(f"Primary IP address is {primary_ip}") - return primary_ip + for unit_ip in started_servers: + try: + return client.get_primary_ip(hostname=unit_ip) + except ValkeyWorkloadCommandError: + logger.warning( + "Could not query sentinel for primary information from server at %s.", + unit_ip, + ) + continue logger.error( - "Could not determine primary IP from sentinels. Number of started servers: %d.", - len(started_servers), + "Could not determine primary IP from sentinels: %s.", + started_servers, ) - return None + raise ValkeyCannotGetPrimaryIPError("Could not determine primary IP from sentinels.") @retry( wait=wait_fixed(5), - stop=stop_after_attempt(5), + stop=stop_after_attempt(6), retry=retry_if_result(lambda result: result is False), retry_error_callback=lambda _: False, ) def is_healthy(self) -> bool: """Check if the sentinel service is healthy.""" - client = ValkeyClient( + client = SentinelClient( username=self.admin_user, password=self.admin_password, workload=self.workload, - connect_to="sentinel", ) if not client.ping(hostname=self.state.bind_address): logger.warning("Health check failed: Sentinel did not respond to ping.") return False - if not client.sentinel_get_master_info(hostname=self.state.bind_address): + try: + client.get_primary_info(hostname=self.state.bind_address) + except ValkeyWorkloadCommandError: logger.warning("Health check failed: Could not query sentinel for master information.") return False @@ -127,68 +140,59 @@ def is_healthy(self) -> bool: def failover(self) -> None: """Trigger a failover in the cluster.""" - client = ValkeyClient( + client = SentinelClient( 
username=self.admin_user, password=self.admin_password, workload=self.workload, - connect_to="sentinel", ) try: - client.sentinel_failover(self.state.bind_address) + client.trigger_failover(self.state.bind_address) + # check if failover is in progress every second for 5 seconds, if it is not then assume failover failed + client.is_failover_in_progress(hostname=self.state.bind_address) except ValkeyWorkloadCommandError as e: logger.error(f"Failed to trigger failover: {e}") raise SentinelFailoverError from e - def reset_sentinel_states(self) -> None: + def reset_sentinel_states(self, sentinel_ips: list[str]) -> None: """Reset the sentinel states on all sentinels in the cluster.""" - client = ValkeyClient( + client = SentinelClient( username=self.admin_user, password=self.admin_password, workload=self.workload, - connect_to="sentinel", ) - active_sentinels = [unit for unit in self.state.servers if unit.is_active] - logger.debug( - "Resetting sentinel states on %s", str([unit.unit_name for unit in active_sentinels]) - ) - for unit in active_sentinels: + for sentinel_ip in sentinel_ips: try: - client.sentinel_reset_state(hostname=unit.model.private_ip) + client.reset(hostname=sentinel_ip) except ValkeyWorkloadCommandError: - logger.warning( - f"Could not reset sentinel state on {unit.unit_name} ({unit.model.private_ip})." - ) + logger.warning("Could not reset sentinel state on %s.", sentinel_ip) raise - if not self.sentinel_sees_all_others(target_sentinel_ip=unit.model.private_ip): + if not self.target_sees_all_others( + target_sentinel_ip=sentinel_ip, sentinel_ips=sentinel_ips + ): logger.warning( - f"Sentinel at {unit.model.private_ip} does not see all other sentinels after reset." + "Sentinel at %s does not see all other sentinels after reset.", sentinel_ip ) raise CannotSeeAllActiveSentinelsError( - f"Sentinel at {unit.model.private_ip} does not see all other sentinels after reset." + "Sentinel at %s does not see all other sentinels after reset." 
% sentinel_ip ) @retry( - wait=wait_fixed(1), - stop=stop_after_attempt(5), + wait=wait_fixed(5), + stop=stop_after_attempt(6), retry=retry_if_result(lambda result: result is False), retry_error_callback=lambda _: False, ) - def sentinel_sees_all_others(self, target_sentinel_ip: str) -> bool: + def target_sees_all_others(self, target_sentinel_ip: str, sentinel_ips: list[str]) -> bool: """Check if the sentinel of the local unit sees all the other sentinels in the cluster.""" - client = ValkeyClient( + client = SentinelClient( username=self.admin_user, password=self.admin_password, workload=self.workload, - connect_to="sentinel", ) - other_active_sentinels = [ - unit.model.private_ip - for unit in self.state.servers - if unit.is_active and unit.model.private_ip != target_sentinel_ip - ] + other_active_sentinels = [ip for ip in sentinel_ips if ip != target_sentinel_ip] logger.debug( "Checking if sentinel at %s sees all other sentinels: %s", @@ -198,11 +202,10 @@ def sentinel_sees_all_others(self, target_sentinel_ip: str) -> bool: for sentinel_ip in other_active_sentinels: try: - output, _ = client.exec_cli_command( - command=["sentinel", "sentinels", PRIMARY_NAME], - hostname=target_sentinel_ip, - ) - if sentinel_ip not in output: + if sentinel_ip not in { + sentinel["ip"] + for sentinel in client.sentinels_primary(hostname=target_sentinel_ip) + }: logger.debug( f"Sentinel at {target_sentinel_ip} does not see sentinel at {sentinel_ip}" ) @@ -214,29 +217,34 @@ def sentinel_sees_all_others(self, target_sentinel_ip: str) -> bool: return False return True - def verify_expected_replica_count(self) -> bool: + @retry( + wait=wait_fixed(5), + stop=stop_after_attempt(6), + retry=retry_if_result(lambda result: result is False), + retry_error_callback=lambda _: False, + ) + def verify_expected_replica_count(self, sentinel_ips: list[str]) -> bool: """Verify that the sentinels in the cluster see the expected number of replicas.""" - client = ValkeyClient( + client = 
SentinelClient( username=self.admin_user, password=self.admin_password, workload=self.workload, - connect_to="sentinel", ) - units_started = [unit for unit in self.state.servers if unit.is_active] # all started servers except primary are expected to be replicas - expected_replicas = len(units_started) - 1 + expected_replicas = len(sentinel_ips) - 1 logger.debug( - "Verifying expected replica count. Expected replicas: %d, started servers: %s", + "Verifying expected replica count. Expected replicas: %d, active servers: %s", expected_replicas, - str([unit.unit_name for unit in units_started]), + sentinel_ips, ) try: - for unit in units_started: - replica_info = client.sentinel_get_replica_info(hostname=unit.model.private_ip) - if expected_replicas != (nbr_replicas := replica_info.count("name")): + for sentinel_ip in sentinel_ips: + if expected_replicas != ( + number_replicas := len(client.replicas_primary(hostname=sentinel_ip)) + ): logger.warning( - f"Sentinel at {unit.model.private_ip} sees {nbr_replicas} replicas, expected {expected_replicas}." + f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." 
) return False except ValkeyWorkloadCommandError: @@ -244,6 +252,17 @@ def verify_expected_replica_count(self) -> bool: return False return True + def get_active_sentinelips(self, hostname: str) -> list[str]: + """Get a list of IP addresses of the active sentinels in the cluster.""" + client = SentinelClient( + username=self.admin_user, + password=self.admin_password, + workload=self.workload, + ) + return [client.get_primary_ip(hostname=hostname)] + [ + sentinel["ip"] for sentinel in client.sentinels_primary(hostname=hostname) + ] + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the sentinel manager's statuses.""" status_list: list[StatusObject] = self.state.statuses.get( From c82522312c95789275d2729b5152ca90c61a64e8 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 12:04:23 +0000 Subject: [PATCH 109/159] refactor verify_expected_replica_count --- src/common/exceptions.py | 4 ++++ src/managers/sentinel.py | 33 +++++++++++++++++++++------------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/common/exceptions.py b/src/common/exceptions.py index ed8fa4d..2bd603a 100644 --- a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -50,3 +50,7 @@ class ValkeyServicesCouldNotBeStoppedError(Exception): class CannotSeeAllActiveSentinelsError(Exception): """Custom Exception if the local sentinel cannot see all active sentinels in the cluster.""" + + +class SentinelIncorrectReplicaCountError(Exception): + """Custom Exception if the sentinel sees an incorrect number of replicas.""" diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index e9f4269..a17b2a6 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -15,6 +15,7 @@ from common.exceptions import ( CannotSeeAllActiveSentinelsError, SentinelFailoverError, + SentinelIncorrectReplicaCountError, ValkeyCannotGetPrimaryIPError, ValkeyWorkloadCommandError, ) @@ -220,11 +221,17 @@ def 
target_sees_all_others(self, target_sentinel_ip: str, sentinel_ips: list[str @retry( wait=wait_fixed(5), stop=stop_after_attempt(6), - retry=retry_if_result(lambda result: result is False), - retry_error_callback=lambda _: False, + reraise=True, ) - def verify_expected_replica_count(self, sentinel_ips: list[str]) -> bool: - """Verify that the sentinels in the cluster see the expected number of replicas.""" + def verify_expected_replica_count(self, sentinel_ips: list[str]) -> None: + """Verify that the sentinels in the cluster see the expected number of replicas. + + The expected number of replicas is the number of active sentinels minus one (the primary). + + Raises: + SentinelIncorrectReplicaCountError: If any sentinel sees an incorrect number of replicas. + ValkeyWorkloadCommandError: If the CLI command to get replica information fails on any sentinel. + """ client = SentinelClient( username=self.admin_user, password=self.admin_password, @@ -238,21 +245,23 @@ def verify_expected_replica_count(self, sentinel_ips: list[str]) -> bool: expected_replicas, sentinel_ips, ) - try: - for sentinel_ip in sentinel_ips: + + for sentinel_ip in sentinel_ips: + try: if expected_replicas != ( number_replicas := len(client.replicas_primary(hostname=sentinel_ip)) ): logger.warning( f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." ) - return False - except ValkeyWorkloadCommandError: - logger.warning("Could not query sentinel for replica information.") - return False - return True + raise SentinelIncorrectReplicaCountError( + f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." 
+ ) + except ValkeyWorkloadCommandError: + logger.warning("Could not query sentinel for replica information.") + raise - def get_active_sentinelips(self, hostname: str) -> list[str]: + def get_active_sentinel_ips(self, hostname: str) -> list[str]: """Get a list of IP addresses of the active sentinels in the cluster.""" client = SentinelClient( username=self.admin_user, From f9c37f88e1e87f07d96a4b99612f61af9d404ea8 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 12:04:48 +0000 Subject: [PATCH 110/159] update base events with new refactoring --- src/events/base_events.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 57ff01b..cd9d315 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -6,15 +6,14 @@ import logging import socket -import time from typing import TYPE_CHECKING import ops from common.exceptions import ( - CannotSeeAllActiveSentinelsError, SentinelFailoverError, ValkeyACLLoadError, + ValkeyCannotGetPrimaryIPError, ValkeyConfigSetError, ValkeyConfigurationError, ValkeyServiceNotAliveError, @@ -127,9 +126,9 @@ def _on_start(self, event: ops.StartEvent) -> None: logger.info("Waiting for lock to start") event.defer() return - - primary_ip = self.charm.sentinel_manager.get_primary_ip() - if not primary_ip: + try: + primary_ip = self.charm.sentinel_manager.get_primary_ip() + except ValkeyCannotGetPrimaryIPError: if self.charm.state.number_units_started == 0 and self.charm.unit.is_leader(): primary_ip = self.charm.state.bind_address else: @@ -448,10 +447,8 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) + # blocks until the lock is acquired scale_down_lock.request_lock() - while not scale_down_lock.do_i_hold_lock: - logger.debug("Waiting for lock to scale down") - time.sleep(5) 
self.charm.state.statuses.delete( ScaleDownStatuses.WAIT_FOR_LOCK.value, @@ -466,9 +463,10 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) - # if unit has primary then failover - if self.charm.sentinel_manager.get_primary_ip() == self.charm.state.bind_address: + if ( + primary_ip := self.charm.sentinel_manager.get_primary_ip() + ) == self.charm.state.bind_address: self.charm.state.unit_server.update( {"scale_down_state": ScaleDownState.WAIT_TO_FAILOVER} ) @@ -477,10 +475,12 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: self.charm.state.bind_address, ) try: + logger.debug("Triggering sentinel failover on primary IP %s", primary_ip) self.charm.sentinel_manager.failover() + primary_ip = self.charm.sentinel_manager.get_primary_ip() logger.debug( "Failover completed, new primary ip %s", - self.charm.sentinel_manager.get_primary_ip(), + primary_ip, ) except SentinelFailoverError: logger.error("Failed to trigger failover before scale down") @@ -501,17 +501,17 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: "start_state": StartState.NOT_STARTED.value, } ) - try: - self.charm.sentinel_manager.reset_sentinel_states() - except (ValkeyWorkloadCommandError, CannotSeeAllActiveSentinelsError): - logger.error("Failed to reset sentinel states before scale down") - raise + active_units = [ + ip + for ip in self.charm.sentinel_manager.get_active_sentinel_ips(primary_ip) + if ip != self.charm.state.bind_address + ] + logger.debug("Resetting sentinel states on active units: %s", active_units) + self.charm.sentinel_manager.reset_sentinel_states(active_units) # check health after scale down self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) - if not self.charm.sentinel_manager.verify_expected_replica_count(): - logger.error("Not all sentinels see the expected number of 
replicas after scale down") - raise + self.charm.sentinel_manager.verify_expected_replica_count(active_units) # release lock self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) From 674b96ffced3fcc89c378fd14c38e8662efd356e Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 25 Feb 2026 13:01:26 +0000 Subject: [PATCH 111/159] remove unnecessary debug log --- src/common/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/client.py b/src/common/client.py index 0ad7e5d..d0a9234 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -383,7 +383,6 @@ def replicas_primary(self, hostname: str) -> list[dict[str, str]]: replicas = self.exec_cli_command( command=["sentinel", "replicas", PRIMARY_NAME], hostname=hostname ) - logger.debug("Retrieved replicas information from sentinel at %s: %s", hostname, replicas) return replicas def sentinels_primary(self, hostname: str) -> list[dict[str, str]]: From 733dbd1ba9cbae877918364d0dba165d6420e139 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 06:46:35 +0000 Subject: [PATCH 112/159] shorten statuses --- src/common/client.py | 1 + src/statuses.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index b15aed0..9354b32 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -5,6 +5,7 @@ import json import logging +from typing import Any from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed diff --git a/src/statuses.py b/src/statuses.py index d7faa1e..b6cd779 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -80,15 +80,15 @@ class ScaleDownStatuses(Enum): WAIT_FOR_LOCK = StatusObject( status="maintenance", - message="Waiting for lock to perform scale down operations...", + message="Waiting for lock to scale down ...", running="async", ) SCALING_DOWN = StatusObject( status="maintenance", - message="Performing scale down operations...", + message="Scaling 
down ...", running="async", ) GOING_AWAY = StatusObject( status="maintenance", - message="Waiting for unit to be removed by juju...", + message="Waiting for juju to remove the unit ...", ) From b1258b4529c9d9cd39edfc2522f866349d08f589 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 09:03:07 +0000 Subject: [PATCH 113/159] fix unit tests and change some function names on client --- src/common/client.py | 19 ++++---- src/common/exceptions.py | 4 ++ src/common/locks.py | 11 ++++- src/events/base_events.py | 4 +- src/managers/cluster.py | 5 +- src/managers/sentinel.py | 8 ++-- tests/unit/helpers.py | 6 +++ tests/unit/test_charm.py | 96 +++++++++++++-------------------------- 8 files changed, 70 insertions(+), 83 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 9354b32..7c5e4b0 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -116,7 +116,7 @@ def ping(self, hostname: str) -> bool: except ValkeyWorkloadCommandError: return False - def get_persistence_info(self, hostname: str) -> dict[str, str] | None: + def info_persistence(self, hostname: str) -> dict[str, str] | None: """Get the persistence information of the Valkey server. Args: @@ -168,7 +168,7 @@ def set(self, hostname: str, key: str, value: str, additional_args: list[str] = self.exec_cli_command(["set", key, value] + additional_args, hostname=hostname) == "OK" ) - def get(self, hostname: str, key: str) -> str: + def get(self, hostname: str, key: str) -> Any: """Get the value of a key from the Valkey server. Args: @@ -176,7 +176,7 @@ def get(self, hostname: str, key: str) -> str: key (str): The key to retrieve. Returns: - str: The value of the key if retrieved successfully. + Any: The value of the key if retrieved successfully. Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. 
@@ -199,7 +199,7 @@ def delifeq(self, hostname: str, key: str, value: str) -> str: """ return self.exec_cli_command(["delifeq", key, value], hostname=hostname, json_output=False) - def is_replica_synced(self, hostname: str) -> bool: + def role(self, hostname: str) -> list[str | Any]: """Check if the replica is synced with the primary. Args: @@ -211,8 +211,7 @@ def is_replica_synced(self, hostname: str) -> bool: Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ - output = self.exec_cli_command(["role"], hostname=hostname) - return output[0] == "slave" and output[3] == "connected" + return self.exec_cli_command(["role"], hostname=hostname) def config_set(self, hostname: str, parameter: str, value: str) -> bool: """Set a runtime configuration parameter on the Valkey server. @@ -274,7 +273,7 @@ def ping(self, hostname: str) -> bool: except ValkeyWorkloadCommandError: return False - def get_primary_ip(self, hostname: str) -> str: + def get_primary_addr_by_name(self, hostname: str) -> str: """Get the primary IP address from the sentinel. Args: @@ -290,7 +289,7 @@ def get_primary_ip(self, hostname: str) -> str: command=["sentinel", "get-primary-addr-by-name", PRIMARY_NAME], hostname=hostname )[0] - def get_primary_info(self, hostname: str) -> dict[str, str]: + def primary(self, hostname: str) -> dict[str, str]: r"""Get the primary info from the sentinel. Args: @@ -306,7 +305,7 @@ def get_primary_info(self, hostname: str) -> dict[str, str]: command=["sentinel", "primary", PRIMARY_NAME], hostname=hostname ) - def trigger_failover(self, hostname: str) -> bool: + def failover_primary_coordinated(self, hostname: str) -> bool: """Trigger a failover through the sentinel. Args: @@ -344,7 +343,7 @@ def is_failover_in_progress(self, hostname: str) -> bool: Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. 
""" - return "failover_in_progress" in self.get_primary_info(hostname=hostname).get("flags", "") + return "failover_in_progress" in self.primary(hostname=hostname).get("flags", "") def reset(self, hostname: str) -> None: """Reset the sentinel state for the primary. diff --git a/src/common/exceptions.py b/src/common/exceptions.py index 2bd603a..14e47c9 100644 --- a/src/common/exceptions.py +++ b/src/common/exceptions.py @@ -54,3 +54,7 @@ class CannotSeeAllActiveSentinelsError(Exception): class SentinelIncorrectReplicaCountError(Exception): """Custom Exception if the sentinel sees an incorrect number of replicas.""" + + +class RequestingLockTimedOutError(Exception): + """Custom Exception if requesting a lock times out.""" diff --git a/src/common/locks.py b/src/common/locks.py index 9b31a4b..2de3ae3 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -186,7 +186,16 @@ def get_unit_with_lock(self, primary_ip: str | None = None) -> str | None: @override def request_lock(self, timeout: int | None = None) -> bool: - """Request the lock for the local unit.""" + """Request the lock for the local unit. + + This method will keep trying to acquire the lock until it is acquired or until the timeout is reached (if provided). + + Args: + timeout (int | None): The maximum time to keep trying to acquire the lock, in seconds. If None, it will keep trying indefinitely. + + Returns: + bool: True if the lock was acquired, False if the timeout was reached before acquiring the lock. 
+ """ logger.debug(f"{self.charm.state.unit_server.unit_name} is requesting {self.name} lock.") retry_until = time.time() + timeout if timeout else None primary_ip = self.charm.sentinel_manager.get_primary_ip() diff --git a/src/events/base_events.py b/src/events/base_events.py index 5cddac0..ba77bbe 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -11,6 +11,7 @@ import ops from common.exceptions import ( + RequestingLockTimedOutError, SentinelFailoverError, ValkeyACLLoadError, ValkeyCannotGetPrimaryIPError, @@ -432,7 +433,8 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: statuses_state=self.charm.state.statuses, ) # blocks until the lock is acquired - scale_down_lock.request_lock() + if not scale_down_lock.request_lock(): + raise RequestingLockTimedOutError("Failed to acquire scale down lock within timeout") self.charm.state.statuses.delete( ScaleDownStatuses.WAIT_FOR_LOCK.value, diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 5f7dc84..da6febf 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -79,7 +79,8 @@ def is_replica_synced(self) -> bool: password=self.admin_password, workload=self.workload, ) - return client.is_replica_synced(hostname=self.state.bind_address) + role_info = client.role(hostname=self.state.bind_address) + return role_info[0] == "slave" and role_info[3] == "connected" @retry( wait=wait_fixed(5), @@ -100,7 +101,7 @@ def is_healthy(self, is_primary: bool = False, check_replica_sync: bool = True) return False if ( - persistence_info := client.get_persistence_info(hostname=self.state.bind_address) + persistence_info := client.info_persistence(hostname=self.state.bind_address) ) and persistence_info.get("loading", "") != "0": logger.warning("Health check failed: Valkey server is still loading data.") return False diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index cc6e999..dda1016 100644 --- a/src/managers/sentinel.py +++ 
b/src/managers/sentinel.py @@ -100,7 +100,7 @@ def get_primary_ip(self) -> str: for unit_ip in started_servers: try: - return client.get_primary_ip(hostname=unit_ip) + return client.get_primary_addr_by_name(hostname=unit_ip) except ValkeyWorkloadCommandError: logger.warning( "Could not query sentinel for primary information from server at %s.", @@ -132,7 +132,7 @@ def is_healthy(self) -> bool: return False try: - client.get_primary_info(hostname=self.state.bind_address) + client.primary(hostname=self.state.bind_address) except ValkeyWorkloadCommandError: logger.warning("Health check failed: Could not query sentinel for master information.") return False @@ -154,7 +154,7 @@ def failover(self) -> None: workload=self.workload, ) try: - client.trigger_failover(self.state.bind_address) + client.failover_primary_coordinated(self.state.bind_address) client.is_failover_in_progress(hostname=self.state.bind_address) except ValkeyWorkloadCommandError as e: logger.error(f"Failed to trigger failover: {e}") @@ -300,7 +300,7 @@ def get_active_sentinel_ips(self, hostname: str) -> list[str]: password=self.admin_password, workload=self.workload, ) - return [client.get_primary_ip(hostname=hostname)] + [ + return [client.get_primary_addr_by_name(hostname=hostname)] + [ sentinel["ip"] for sentinel in client.sentinels_primary(hostname=hostname) ] diff --git a/tests/unit/helpers.py b/tests/unit/helpers.py index afd9eef..75876bf 100644 --- a/tests/unit/helpers.py +++ b/tests/unit/helpers.py @@ -2,10 +2,16 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. 
+from pathlib import Path + +import yaml from data_platform_helpers.advanced_statuses.models import StatusObject from data_platform_helpers.advanced_statuses.utils import as_status from ops import testing +METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) +APP_NAME = METADATA["name"] + def status_is(state_out: testing.State, to_status: StatusObject, is_app: bool = False) -> bool: """Check if the status is set to the given status.""" diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 35b4275..95bce93 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -2,11 +2,9 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. -from pathlib import Path from unittest.mock import patch import pytest -import yaml from ops import ActiveStatus, pebble, testing from common.exceptions import ValkeyServiceNotAliveError, ValkeyWorkloadCommandError @@ -21,7 +19,7 @@ ) from src.statuses import CharmStatuses, ClusterStatuses, StartStatuses -from .helpers import status_is +from .helpers import APP_NAME, status_is CHARM_USER = "_daemon_" CONTAINER = "valkey" @@ -29,8 +27,6 @@ SERVICE_METRIC_EXPORTER = "metric_exporter" SERVICE_SENTINEL = "valkey-sentinel" -METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) -APP_NAME = METADATA["name"] internal_passwords_secret = testing.Secret( tracked_content={f"{user.value}-password": "secure-password" for user in CharmUsers}, @@ -100,17 +96,18 @@ def test_start_primary(cloud_spec): with ( patch("common.client.ValkeyClient.ping", return_value=True), - patch("common.client.ValkeyClient.get_persistence_info", return_value={"loading": "0"}), - patch("common.client.ValkeyClient.set_value", return_value=True), + patch("common.client.ValkeyClient.info_persistence", return_value={"loading": "0"}), + patch("common.client.ValkeyClient.set", return_value=True), ): state_out = ctx.run(ctx.on.start(), state_out) assert status_is(state_out, 
StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) with ( patch("common.client.ValkeyClient.ping", return_value=True), - patch("common.client.ValkeyClient.get_persistence_info", return_value={"loading": "0"}), - patch("common.client.ValkeyClient.set_value", return_value=True), - patch("common.client.ValkeyClient.sentinel_get_master_info", return_value={"ip": "test"}), + patch("common.client.SentinelClient.ping", return_value=True), + patch("common.client.ValkeyClient.info_persistence", return_value={"loading": "0"}), + patch("common.client.ValkeyClient.set", return_value=True), + patch("common.client.SentinelClient.primary", return_value={"ip": "test"}), ): state_out = ctx.run(ctx.on.start(), state_out) assert state_out.unit_status == ActiveStatus() @@ -199,11 +196,14 @@ def test_start_non_primary(cloud_spec): assert status_is(state_out, StartStatuses.WAITING_TO_START.value) # health check - with patch("common.client.ValkeyClient.is_replica_synced", return_value=False): + with patch( + "common.client.ValkeyClient.role", + return_value=["slave", "ip", 6379, "sync", 467184], + ): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"starting-member": "valkey/0"}, + local_app_data={"start-member": "valkey/0"}, peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( @@ -216,16 +216,16 @@ def test_start_non_primary(cloud_spec): state_out = ctx.run(ctx.on.start(), state_in) assert status_is(state_out, StartStatuses.SERVICE_STARTING.value) - # replica syncing + # sentinel not yet discovered with ( - patch("managers.cluster.ClusterManager.is_replica_synced", return_value=False), + patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=False), patch("managers.cluster.ClusterManager.is_healthy", return_value=True), patch("managers.sentinel.SentinelManager.is_healthy", return_value=True), ): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"starting-member": "valkey/0"}, + 
local_app_data={"start-member": "valkey/0"}, peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( @@ -236,18 +236,19 @@ def test_start_non_primary(cloud_spec): containers={container}, ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, StartStatuses.WAITING_FOR_REPLICA_SYNC.value) + assert status_is(state_out, StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) - # sentinel not yet discovered + # replica syncing with ( - patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=False), + patch("managers.cluster.ClusterManager.is_replica_synced", return_value=False), + patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=True), patch("managers.cluster.ClusterManager.is_healthy", return_value=True), patch("managers.sentinel.SentinelManager.is_healthy", return_value=True), ): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"starting-member": "valkey/0"}, + local_app_data={"start-member": "valkey/0"}, peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( @@ -258,8 +259,7 @@ def test_start_non_primary(cloud_spec): containers={container}, ) state_out = ctx.run(ctx.on.start(), state_in) - assert status_is(state_out, StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) - + assert status_is(state_out, StartStatuses.WAITING_FOR_REPLICA_SYNC.value) # Happy path with sentinel discovered and replica synced with ( patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=True), @@ -270,7 +270,7 @@ def test_start_non_primary(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"starting-member": "valkey/0"}, + local_app_data={"start-member": "valkey/0"}, peers_data={1: {"start-state": "started"}}, ) state_in = testing.State( @@ -476,12 +476,12 @@ def test_config_changed_leader_unit(cloud_spec): ) with ( patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - 
patch("common.client.ValkeyClient.load_acl") as mock_load_acl, + patch("common.client.ValkeyClient.acl_load") as mock_acl_load, patch("common.client.ValkeyClient.config_set") as mock_config_set, ): state_out = ctx.run(ctx.on.config_changed(), state_in) mock_set_acl_file.assert_called_once() - mock_load_acl.assert_called_once() + mock_acl_load.assert_called_once() mock_config_set.assert_called_once() secret_out = state_out.get_secret( label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" @@ -492,40 +492,6 @@ def test_config_changed_leader_unit(cloud_spec): ) -# def test_config_changed_leader_unit_primary(cloud_spec): -# ctx = testing.Context(ValkeyCharm, app_trusted=True) -# relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) -# container = testing.Container(name=CONTAINER, can_connect=True) - -# password_secret = testing.Secret( -# tracked_content={user.value: "secure-password" for user in CharmUsers}, -# remote_grants=APP_NAME, -# ) -# state_in = testing.State( -# leader=True, -# relations={relation}, -# containers={container}, -# secrets={password_secret}, -# config={INTERNAL_USERS_PASSWORD_CONFIG: password_secret.id}, -# model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), -# ) -# with ( -# patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, -# patch("common.client.ValkeyClient.load_acl") as mock_load_acl, -# patch("common.client.ValkeyClient.config_set") as mock_config_set, -# patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), -# ): -# state_out = ctx.run(ctx.on.config_changed(), state_in) -# mock_set_acl_file.assert_called_once() -# secret_out = state_out.get_secret( -# label=f"{PEER_RELATION}.{APP_NAME}.app.{INTERNAL_USERS_SECRET_LABEL_SUFFIX}" -# ) -# assert ( -# secret_out.latest_content.get(f"{CharmUsers.VALKEY_ADMIN.value}-password") -# == "secure-password" -# ) - - def test_config_changed_leader_unit_wrong_username(cloud_spec): ctx = 
testing.Context(ValkeyCharm, app_trusted=True) relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) @@ -585,14 +551,14 @@ def test_change_password_secret_changed_non_leader_unit(cloud_spec): "events.base_events.BaseEvents._update_internal_users_password" ) as mock_update_password, patch("managers.config.ConfigManager.set_acl_file") as mock_set_acl_file, - patch("common.client.ValkeyClient.load_acl") as mock_load_acl, + patch("common.client.ValkeyClient.acl_load") as mock_acl_load, patch("common.client.ValkeyClient.config_set") as mock_config_set, patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.0.1.1"), ): ctx.run(ctx.on.secret_changed(password_secret), state_in) mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_load_acl.assert_called_once() + mock_acl_load.assert_called_once() mock_config_set.assert_called_once() @@ -683,7 +649,7 @@ def test_relation_changed_event_leader_setting_starting_member(cloud_spec): model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) state_out = ctx.run(ctx.on.relation_changed(relation), state_in) - assert state_out.get_relation(1).local_app_data.get("starting-member") == "valkey/1" + assert state_out.get_relation(1).local_app_data.get("start-member") == "valkey/1" def test_relation_changed_event_leader_clears_starting_member(cloud_spec): @@ -691,7 +657,7 @@ def test_relation_changed_event_leader_clears_starting_member(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"starting-member": "valkey/1"}, + local_app_data={"start-member": "valkey/1"}, local_unit_data={"start-state": "started"}, peers_data={1: {"start-state": "started"}}, ) @@ -704,7 +670,7 @@ def test_relation_changed_event_leader_clears_starting_member(cloud_spec): model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) state_out = ctx.run(ctx.on.relation_changed(relation), state_in) - assert 
state_out.get_relation(1).local_app_data.get("starting-member") is None + assert state_out.get_relation(1).local_app_data.get("start-member") is None def test_relation_changed_event_leader_leaves_starting_member_as_is(cloud_spec): @@ -712,7 +678,7 @@ def test_relation_changed_event_leader_leaves_starting_member_as_is(cloud_spec): relation = testing.PeerRelation( id=1, endpoint=PEER_RELATION, - local_app_data={"starting-member": "valkey/1"}, + local_app_data={"start-member": "valkey/1"}, local_unit_data={"start-state": StartState.STARTED.value}, peers_data={ 1: { @@ -730,4 +696,4 @@ def test_relation_changed_event_leader_leaves_starting_member_as_is(cloud_spec): model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) state_out = ctx.run(ctx.on.relation_changed(relation), state_in) - assert state_out.get_relation(1).local_app_data.get("starting-member") == "valkey/1" + assert state_out.get_relation(1).local_app_data.get("start-member") == "valkey/1" From cbe8f661bd4b76bace2000c5110f652d664d7f56 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 09:18:32 +0000 Subject: [PATCH 114/159] remove unnecessary catches --- src/events/base_events.py | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index ba77bbe..269312b 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -12,13 +12,11 @@ from common.exceptions import ( RequestingLockTimedOutError, - SentinelFailoverError, ValkeyACLLoadError, ValkeyCannotGetPrimaryIPError, ValkeyConfigSetError, ValkeyConfigurationError, ValkeyServiceNotAliveError, - ValkeyServicesCouldNotBeStoppedError, ValkeyServicesFailedToStartError, ValkeyWorkloadCommandError, ) @@ -456,29 +454,17 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: self.charm.state.unit_server.update( {"scale_down_state": ScaleDownState.WAIT_TO_FAILOVER} ) + logger.debug("Triggering sentinel 
failover on primary IP %s", primary_ip) + self.charm.sentinel_manager.failover() + primary_ip = self.charm.sentinel_manager.get_primary_ip() logger.debug( - "Unit with IP %s is primary, triggering failover before scale down", - self.charm.state.bind_address, + "Failover completed, new primary ip %s", + primary_ip, ) - try: - logger.debug("Triggering sentinel failover on primary IP %s", primary_ip) - self.charm.sentinel_manager.failover() - primary_ip = self.charm.sentinel_manager.get_primary_ip() - logger.debug( - "Failover completed, new primary ip %s", - primary_ip, - ) - except SentinelFailoverError: - logger.error("Failed to trigger failover before scale down") - raise # stop valkey and sentinel processes self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.STOP_SERVICES}) - try: - self.charm.workload.stop() - except ValkeyServicesCouldNotBeStoppedError: - logger.error("Failed to stop Valkey services before scale down") - raise + self.charm.workload.stop() # reset sentinel states on other units self.charm.state.unit_server.update( From b30e1e973f0811dedbdd466394f655afa5975e21 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 09:19:37 +0000 Subject: [PATCH 115/159] add scale down unit tests --- tests/unit/test_scaledown.py | 137 +++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 tests/unit/test_scaledown.py diff --git a/tests/unit/test_scaledown.py b/tests/unit/test_scaledown.py new file mode 100644 index 0000000..46db2d4 --- /dev/null +++ b/tests/unit/test_scaledown.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# Copyright 2025 Canonical Ltd. +# See LICENSE file for licensing details. 
+
+from unittest.mock import PropertyMock, patch
+
+import pytest
+from ops import testing
+
+from charm import ValkeyCharm
+from literals import CONTAINER, PEER_RELATION
+from statuses import ScaleDownStatuses
+from tests.unit.helpers import status_is
+
+
+def get_3_unit_peer_relation():
+    """Helper function to create a peer relation with 3 units."""
+    return testing.PeerRelation(
+        id=1,
+        endpoint=PEER_RELATION,
+        local_unit_data={
+            "hostname": "valkey-0",
+            "private-ip": "10.0.1.0",
+            "start-state": "started",
+        },
+        peers_data={
+            unit_id: {
+                "hostname": f"valkey-{unit_id}",
+                "private-ip": f"10.0.1.{unit_id}",
+                "start-state": "started",
+            }
+            for unit_id in range(1, 4)
+        },
+    )
+
+
+def test_other_unit_has_lock(cloud_spec):
+    """Test that if another unit has the lock, then the lock is not acquired."""
+    ctx = testing.Context(ValkeyCharm, app_trusted=True)
+    relation = get_3_unit_peer_relation()
+    container = testing.Container(name=CONTAINER, can_connect=True)
+    data_stroage = testing.Storage(name="data")
+    state_in = testing.State(
+        model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec),
+        relations={relation},
+        leader=True,
+        containers={container},
+        storages={data_stroage},
+    )
+
+    with (
+        patch("common.locks.ScaleDownLock.request_lock", return_value=False),
+    ):
+        # expect raised exception due to lock not being acquired
+        with pytest.raises(testing.errors.UncaughtCharmError) as exc_info:
+            ctx.run(ctx.on.storage_detaching(data_stroage), state_in)
+        assert "RequestingLockTimedOutError" in str(exc_info.value)
+
+
+def test_non_primary(cloud_spec):
+    """Test that a non-primary unit stops services and resets sentinel state on scale down."""
+    ctx = testing.Context(ValkeyCharm, app_trusted=True)
+    relation = get_3_unit_peer_relation()
+    container = testing.Container(name=CONTAINER, can_connect=True)
+    data_stroage = testing.Storage(name="data")
+    state_in = testing.State(
+        model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec),
+ relations={relation}, + leader=True, + containers={container}, + storages={data_stroage}, + ) + + with ( + patch("common.locks.ScaleDownLock.request_lock", return_value=True), + patch("common.locks.ScaleDownLock.release_lock", return_value=True), + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="10.0.1.1"), + patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, + patch( + "managers.sentinel.SentinelManager.reset_sentinel_states" + ) as mock_reset_sentinel_states, + patch( + "managers.sentinel.SentinelManager.verify_expected_replica_count" + ) as mock_verify_expected_replica_count, + patch( + "managers.sentinel.SentinelManager.get_active_sentinel_ips", + return_value=["10.0.1.1", "10.0.1.2", "10.0.1.3"], + ), + ): + state_out = ctx.run(ctx.on.storage_detaching(data_stroage), state_in) + mock_stop.assert_called_once() + mock_reset_sentinel_states.assert_called_once() + mock_verify_expected_replica_count.assert_called_once() + status_is(state_out, ScaleDownStatuses.GOING_AWAY.value) + + +def test_primary(cloud_spec): + """Test that if another unit has the lock, then the lock is not acquired.""" + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = get_3_unit_peer_relation() + container = testing.Container(name=CONTAINER, can_connect=True) + data_stroage = testing.Storage(name="data") + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + relations={relation}, + leader=True, + containers={container}, + storages={data_stroage}, + ) + + with ( + patch( + "core.cluster_state.ClusterState.bind_address", + new_callable=PropertyMock(return_value="10.0.1.0"), + ), + patch("common.locks.ScaleDownLock.request_lock", return_value=True), + patch("common.locks.ScaleDownLock.release_lock", return_value=True), + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="10.0.1.0"), + patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, + 
patch("managers.sentinel.SentinelManager.failover") as mock_failover, + patch( + "managers.sentinel.SentinelManager.reset_sentinel_states" + ) as mock_reset_sentinel_states, + patch( + "managers.sentinel.SentinelManager.verify_expected_replica_count" + ) as mock_verify_expected_replica_count, + patch( + "managers.sentinel.SentinelManager.get_active_sentinel_ips", + return_value=["10.0.1.1", "10.0.1.2", "10.0.1.3"], + ), + ): + state_out = ctx.run(ctx.on.storage_detaching(data_stroage), state_in) + mock_failover.assert_called_once() + mock_stop.assert_called_once() + mock_reset_sentinel_states.assert_called_once() + mock_verify_expected_replica_count.assert_called_once() + status_is(state_out, ScaleDownStatuses.GOING_AWAY.value) From e2ba6ef319fde8d62c5480c2178aa2f166f8815a Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 10:43:25 +0000 Subject: [PATCH 116/159] only try to update passwords on valkey if it is started --- src/events/base_events.py | 12 ++++++++---- src/statuses.py | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 269312b..ee6ead9 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -321,10 +321,12 @@ def _on_secret_changed(self, event: ops.SecretChangedEvent) -> None: # leader unit processed the secret change from user, non-leader units can replicate try: self.charm.config_manager.set_acl_file() - self.charm.cluster_manager.reload_acl_file() + if self.charm.state.unit_server.is_started: + self.charm.cluster_manager.reload_acl_file() # update the local unit admin password to match the leader self.charm.config_manager.update_local_valkey_admin_password() - self.charm.cluster_manager.update_primary_auth() + if self.charm.state.unit_server.is_started: + self.charm.cluster_manager.update_primary_auth() except (ValkeyACLLoadError, ValkeyConfigSetError, ValkeyWorkloadCommandError) as e: logger.error(e) 
self.charm.status.set_running_status( @@ -382,7 +384,8 @@ def _update_internal_users_password(self, secret_id: str) -> None: logger.info("Password(s) for internal users have changed") try: self.charm.config_manager.set_acl_file(passwords=new_passwords) - self.charm.cluster_manager.reload_acl_file() + if self.charm.state.unit_server.is_started: + self.charm.cluster_manager.reload_acl_file() self.charm.state.cluster.update( { f"{user.value.replace('-', '_')}_password": new_passwords[user.value] @@ -391,7 +394,8 @@ def _update_internal_users_password(self, secret_id: str) -> None: ) # update the local unit admin password self.charm.config_manager.update_local_valkey_admin_password() - self.charm.cluster_manager.update_primary_auth() + if self.charm.state.unit_server.is_started: + self.charm.cluster_manager.update_primary_auth() except ( ValkeyACLLoadError, ValueError, diff --git a/src/statuses.py b/src/statuses.py index b6cd779..adf8c65 100644 --- a/src/statuses.py +++ b/src/statuses.py @@ -80,15 +80,15 @@ class ScaleDownStatuses(Enum): WAIT_FOR_LOCK = StatusObject( status="maintenance", - message="Waiting for lock to scale down ...", + message="Waiting for lock to scale down...", running="async", ) SCALING_DOWN = StatusObject( status="maintenance", - message="Scaling down ...", + message="Scaling down...", running="async", ) GOING_AWAY = StatusObject( status="maintenance", - message="Waiting for juju to remove the unit ...", + message="Waiting for juju to remove the unit...", ) From ba0ccc0f83fb63daae1c12ca2e0125dd2bbeb800 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 10:43:34 +0000 Subject: [PATCH 117/159] add k8s scaledown tests --- tests/integration/k8s/ha/test_scaling.py | 71 ++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 11daee8..128f4a0 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ 
b/tests/integration/k8s/ha/test_scaling.py @@ -84,3 +84,74 @@ async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) + + +async def test_scale_down(juju: jubilant.Juju) -> None: + """Make sure scale down operations complete successfully.""" + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected slaves, got {number_of_slaves}." + ) + + # scale down + juju.remove_unit(APP_NAME, num_units=1) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=60 + ) + ) + num_units = len(juju.status().get_units(APP_NAME)) + assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." + + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS - 2, ( + f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." 
+ ) + + +async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: + """Make sure multiple scale down operations complete successfully.""" + number_current_units = len(juju.status().apps[APP_NAME].units) + juju.add_unit(APP_NAME, num_units=(NUM_UNITS + 1) - number_current_units) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, idle_period=10, unit_count=NUM_UNITS + 1 + ), + timeout=1200, + ) + + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS, ( + f"Expected {NUM_UNITS} connected slaves, got {number_of_slaves}." + ) + + # scale down multiple units + juju.remove_unit(APP_NAME, num_units=2) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=60 + ) + ) + num_units = len(juju.status().get_units(APP_NAME)) + assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." + + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS - 2, ( + f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." 
+ ) From 9949da3fbd53cd9191cef2d70f9789d9f1f7e1e0 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 11:06:02 +0000 Subject: [PATCH 118/159] fix unit tests --- tests/unit/test_charm.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 95bce93..446cec7 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -437,7 +437,9 @@ def test_config_changed_non_leader_unit(cloud_spec): def test_config_changed_leader_unit_valkey_update_fails(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) - relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + relation = testing.PeerRelation( + id=1, endpoint=PEER_RELATION, local_unit_data={"start-state": "started"} + ) container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( @@ -459,7 +461,9 @@ def test_config_changed_leader_unit_valkey_update_fails(cloud_spec): def test_config_changed_leader_unit(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) - relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + relation = testing.PeerRelation( + id=1, endpoint=PEER_RELATION, local_unit_data={"start-state": "started"} + ) container = testing.Container(name=CONTAINER, can_connect=True) password_secret = testing.Secret( @@ -564,7 +568,9 @@ def test_change_password_secret_changed_non_leader_unit(cloud_spec): def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) - relation = testing.PeerRelation(id=1, endpoint=PEER_RELATION) + relation = testing.PeerRelation( + id=1, endpoint=PEER_RELATION, local_unit_data={"start-state": "started"} + ) statuses_peer_relation = testing.PeerRelation(id=2, endpoint=STATUS_PEERS_RELATION) container = testing.Container(name=CONTAINER, can_connect=True) From d4cfb59aae23e3234c5f761d09b037438e94d994 Mon Sep 17 00:00:00 2001 From: Smail 
Kourta Date: Thu, 26 Feb 2026 12:23:14 +0000 Subject: [PATCH 119/159] handle scale down to 0 --- src/common/locks.py | 4 ++++ src/core/models.py | 10 +++++----- src/events/base_events.py | 27 ++++++++++++--------------- src/managers/cluster.py | 9 +++++---- src/managers/sentinel.py | 29 +++++++++++++++-------------- 5 files changed, 41 insertions(+), 38 deletions(-) diff --git a/src/common/locks.py b/src/common/locks.py index 2de3ae3..1aeb850 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -205,6 +205,10 @@ def request_lock(self, timeout: int | None = None) -> bool: ) return True + if len(self.charm.sentinel_manager.get_active_sentinel_ips(primary_ip)) == 1: + logger.debug("Last unit in the cluster scaling down. Lock will be skipped.") + return True + while True: try: if self.client.set( diff --git a/src/core/models.py b/src/core/models.py index ef307a3..223f734 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -122,11 +122,11 @@ def is_started(self) -> bool: @property def is_being_removed(self) -> bool: """Check if the unit is being removed from the cluster.""" - return ( - self.model.scale_down_state != ScaleDownState.NO_SCALE_DOWN.value - if self.model - else False - ) + return self.model.scale_down_state not in { + ScaleDownState.NO_SCALE_DOWN.value, + ScaleDownState.WAIT_FOR_LOCK.value, + ScaleDownState.WAIT_TO_FAILOVER.value, + } @property def is_active(self) -> bool: diff --git a/src/events/base_events.py b/src/events/base_events.py index ee6ead9..9c962e5 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -452,9 +452,9 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: statuses_state=self.charm.state.statuses, ) # if unit has primary then failover - if ( - primary_ip := self.charm.sentinel_manager.get_primary_ip() - ) == self.charm.state.bind_address: + primary_ip = self.charm.sentinel_manager.get_primary_ip() + active_sentinels = 
self.charm.sentinel_manager.get_active_sentinel_ips(primary_ip) + if primary_ip == self.charm.state.bind_address and len(active_sentinels) > 1: self.charm.state.unit_server.update( {"scale_down_state": ScaleDownState.WAIT_TO_FAILOVER} ) @@ -469,6 +469,7 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: # stop valkey and sentinel processes self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.STOP_SERVICES}) self.charm.workload.stop() + active_sentinels = [ip for ip in active_sentinels if ip != self.charm.state.bind_address] # reset sentinel states on other units self.charm.state.unit_server.update( @@ -477,18 +478,14 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: "start_state": StartState.NOT_STARTED.value, } ) - active_units = [ - ip - for ip in self.charm.sentinel_manager.get_active_sentinel_ips(primary_ip) - if ip != self.charm.state.bind_address - ] - logger.debug("Resetting sentinel states on active units: %s", active_units) - self.charm.sentinel_manager.reset_sentinel_states(active_units) - - # check health after scale down - self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) - self.charm.sentinel_manager.verify_expected_replica_count(active_units) + if active_sentinels: + logger.debug("Resetting sentinel states on active units: %s", active_sentinels) + self.charm.sentinel_manager.reset_sentinel_states(active_sentinels) + + # check health after scale down + self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) + self.charm.sentinel_manager.verify_expected_replica_count(active_sentinels) + scale_down_lock.release_lock() # release lock self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) - scale_down_lock.release_lock() diff --git a/src/managers/cluster.py b/src/managers/cluster.py index da6febf..7bbf9be 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -120,11 
+120,12 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje if not self.state.cluster.model or not self.state.unit_server.model: return status_list or [CharmStatuses.ACTIVE_IDLE.value] - if start_status := self._get_start_status(): - status_list.append(start_status) + if scope == "unit": + if start_status := self._get_start_status(): + status_list.append(start_status) - if scale_down_status := self._get_scale_down_status(): - status_list.append(scale_down_status) + if scale_down_status := self._get_scale_down_status(): + status_list.append(scale_down_status) return status_list or [CharmStatuses.ACTIVE_IDLE.value] diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index dda1016..315d4c4 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -175,6 +175,7 @@ def reset_sentinel_states(self, sentinel_ips: list[str]) -> None: for sentinel_ip in sentinel_ips: try: + logger.debug("Resetting sentinel state on %s.", sentinel_ip) client.reset(hostname=sentinel_ip) except ValkeyWorkloadCommandError: logger.warning("Could not reset sentinel state on %s.", sentinel_ip) @@ -212,29 +213,29 @@ def target_sees_all_others(self, target_sentinel_ip: str, sentinel_ips: list[str workload=self.workload, ) - other_active_sentinels = [ip for ip in sentinel_ips if ip != target_sentinel_ip] + sentinel_ips_set = set(sentinel_ips) - {target_sentinel_ip} logger.debug( "Checking if sentinel at %s sees all other sentinels: %s", target_sentinel_ip, - other_active_sentinels, + sentinel_ips_set, ) - for sentinel_ip in other_active_sentinels: - try: - if sentinel_ip not in { - sentinel["ip"] - for sentinel in client.sentinels_primary(hostname=target_sentinel_ip) - }: - logger.debug( - f"Sentinel at {target_sentinel_ip} does not see sentinel at {sentinel_ip}" - ) - return False - except ValkeyWorkloadCommandError: + try: + discovered_sentinels = { + sentinel["ip"] + for sentinel in client.sentinels_primary(hostname=target_sentinel_ip) + } 
+ if discovered_sentinels != sentinel_ips_set: logger.warning( - f"Could not query sentinel at {target_sentinel_ip} for sentinel discovery." + f"Sentinel at {target_sentinel_ip} sees sentinels {discovered_sentinels}, expected {sentinel_ips_set}." ) return False + except ValkeyWorkloadCommandError: + logger.warning( + f"Could not query sentinel at {target_sentinel_ip} for sentinel discovery." + ) + return False return True @retry( From ac4348b2ea68d602e1a9b5607481ef5808a51774 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 12:32:15 +0000 Subject: [PATCH 120/159] fix unit test --- tests/unit/test_charm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 446cec7..a5acf2a 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -142,7 +142,6 @@ def test_start_primary(cloud_spec): state_out = ctx.run(ctx.on.start(), state_in) assert status_is(state_out, StartStatuses.SERVICE_NOT_STARTED.value) - assert status_is(state_out, StartStatuses.SERVICE_NOT_STARTED.value, is_app=True) def test_start_non_primary(cloud_spec): From 3fed063aaf38bf77cc2fb954fdc808e27da227e7 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 12:44:40 +0000 Subject: [PATCH 121/159] add scaling down to 0 and back --- src/events/base_events.py | 5 --- tests/integration/k8s/ha/test_scaling.py | 47 +++++++++++++++++++++++- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 9c962e5..e726e3f 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -267,11 +267,6 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: ) # update local unit admin password self.charm.config_manager.update_local_valkey_admin_password() - try: - self.charm.config_manager.set_acl_file() - except ValkeyWorkloadCommandError: - logger.error("Failed to write acl file") - raise def _on_config_changed(self, event: 
ops.ConfigChangedEvent) -> None: """Handle the config_changed event.""" diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 128f4a0..74ce2d2 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # Copyright 2026 Canonical Ltd. # See LICENSE file for licensing details. +import asyncio import logging import jubilant @@ -101,7 +102,7 @@ async def test_scale_down(juju: jubilant.Juju) -> None: juju.remove_unit(APP_NAME, num_units=1) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=60 + status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 ) ) num_units = len(juju.status().get_units(APP_NAME)) @@ -141,7 +142,7 @@ async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: juju.remove_unit(APP_NAME, num_units=2) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=60 + status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 ) ) num_units = len(juju.status().get_units(APP_NAME)) @@ -155,3 +156,45 @@ async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: assert number_of_slaves == NUM_UNITS - 2, ( f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." 
) + + +async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: + """Make sure that removing all units and then adding them again works.""" + # remove all remaining units + juju.remove_unit(APP_NAME, num_units=len(juju.status().apps[APP_NAME].units)) + juju.wait(lambda status: len(juju.status().get_units(APP_NAME)) == 0) + + # scale up again + juju.add_unit(APP_NAME, num_units=NUM_UNITS) + + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, unit_count=NUM_UNITS, idle_period=10 + ), + timeout=1200, + ) + + hostnames = get_cluster_hostnames(juju, APP_NAME) + + connected_slaves = await get_number_connected_slaves( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert connected_slaves == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." + ) + c_writes.start() + await asyncio.sleep(10) # let the continuous writes write some data + await assert_continuous_writes_increasing( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + logger.info("Stopping continuous writes after scale up test.") + logger.info(await c_writes.async_stop()) + assert_continuous_writes_consistent( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) From e2a49641d637cc79db819b6478050fb45f88f3bd Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 13:04:56 +0000 Subject: [PATCH 122/159] clear cw --- tests/integration/k8s/ha/test_scaling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 74ce2d2..c3c5381 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -184,6 +184,7 @@ async def test_scale_to_zero_and_back(juju: 
jubilant.Juju, c_writes) -> None: assert connected_slaves == NUM_UNITS - 1, ( f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." ) + await c_writes.async_clear() c_writes.start() await asyncio.sleep(10) # let the continuous writes write some data await assert_continuous_writes_increasing( @@ -198,3 +199,4 @@ async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) + await c_writes.async_clear() From 72882924d89eafdafb9ff5c764e22142316aaafa Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 13:06:49 +0000 Subject: [PATCH 123/159] fix linter --- tests/unit/test_scaledown.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_scaledown.py b/tests/unit/test_scaledown.py index 46db2d4..6ddc1c6 100644 --- a/tests/unit/test_scaledown.py +++ b/tests/unit/test_scaledown.py @@ -14,7 +14,6 @@ def get_3_unit_peer_relation(): - """Helper function to create a peer relation with 3 units.""" return testing.PeerRelation( id=1, endpoint=PEER_RELATION, From b5210892acd154056eafe2c2305c3e4c14cf6baa Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 26 Feb 2026 13:47:38 +0000 Subject: [PATCH 124/159] add remove app test --- tests/integration/k8s/ha/test_scaling.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index c3c5381..5a16116 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -200,3 +200,14 @@ async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) await c_writes.async_clear() + + +def test_remove_application(juju: jubilant.Juju) -> None: + """Make sure the application can be removed.""" + juju.remove_application(APP_NAME) + + juju.wait( + lambda status: APP_NAME not in 
status.apps, + timeout=600, + delay=5, + ) From e6267cb94f8762a8026ecb1763b215f9494872d1 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 27 Feb 2026 07:32:48 +0000 Subject: [PATCH 125/159] copilot feedback --- src/common/client.py | 12 ++++++++---- src/core/models.py | 1 - src/events/base_events.py | 2 +- tests/unit/test_scaledown.py | 22 +++++++++++----------- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 7c5e4b0..a731018 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -149,7 +149,9 @@ def info_persistence(self, hostname: str) -> dict[str, str] | None: values[values_parts[0]] = values_parts[1] return values - def set(self, hostname: str, key: str, value: str, additional_args: list[str] = []) -> bool: + def set( + self, hostname: str, key: str, value: str, additional_args: list[str] | None = None + ) -> bool: """Set a key-value pair on the Valkey server. Args: @@ -164,6 +166,8 @@ def set(self, hostname: str, key: str, value: str, additional_args: list[str] = Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. """ + if additional_args is None: + additional_args = [] return ( self.exec_cli_command(["set", key, value] + additional_args, hostname=hostname) == "OK" ) @@ -200,13 +204,13 @@ def delifeq(self, hostname: str, key: str, value: str) -> str: return self.exec_cli_command(["delifeq", key, value], hostname=hostname, json_output=False) def role(self, hostname: str) -> list[str | Any]: - """Check if the replica is synced with the primary. + """Get the role information of the Valkey server. Args: hostname (str): The hostname to connect to. Returns: - bool: True if the replica is synced with the primary, False otherwise. + list[str | Any]: The role information retrieved from the server. Raises: ValkeyWorkloadCommandError: If the CLI command fails to execute or returns unexpected output. 
@@ -290,7 +294,7 @@ def get_primary_addr_by_name(self, hostname: str) -> str: )[0] def primary(self, hostname: str) -> dict[str, str]: - r"""Get the primary info from the sentinel. + """Get the primary info from the sentinel. Args: hostname (str): The hostname to connect to. diff --git a/src/core/models.py b/src/core/models.py index 223f734..0a185e3 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -37,7 +37,6 @@ class PeerAppModel(PeerModel): charmed_sentinel_peers_password: InternalUsersSecret = Field(default="") charmed_sentinel_operator_password: InternalUsersSecret = Field(default="") start_member: str = Field(default="") - scale_down_member: str = Field(default="") class PeerUnitModel(PeerModel): diff --git a/src/events/base_events.py b/src/events/base_events.py index e726e3f..cc74d4e 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -438,7 +438,7 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: scope="unit", component=self.charm.cluster_manager.name, ) - # TODO consider quorom when removing unit + # TODO consider quorum when removing unit self.charm.status.set_running_status( ScaleDownStatuses.SCALING_DOWN.value, diff --git a/tests/unit/test_scaledown.py b/tests/unit/test_scaledown.py index 6ddc1c6..e6e3f16 100644 --- a/tests/unit/test_scaledown.py +++ b/tests/unit/test_scaledown.py @@ -38,13 +38,13 @@ def test_other_unit_has_lock(cloud_spec): ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = get_3_unit_peer_relation() container = testing.Container(name=CONTAINER, can_connect=True) - data_stroage = testing.Storage(name="data") + data_storage = testing.Storage(name="data") state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), relations={relation}, leader=True, containers={container}, - storages={data_stroage}, + storages={data_storage}, ) with ( @@ -52,22 +52,22 @@ def test_other_unit_has_lock(cloud_spec): ): # expect raised exception 
due to lock not being acquired with pytest.raises(testing.errors.UncaughtCharmError) as exc_info: - ctx.run(ctx.on.storage_detaching(data_stroage), state_in) + ctx.run(ctx.on.storage_detaching(data_storage), state_in) assert "RequestingLockTimedOutError" in str(exc_info.value) def test_non_primary(cloud_spec): - """Test that if another unit has the lock, then the lock is not acquired.""" + """Test scale-down behavior when this unit is not the primary but successfully acquires the lock.""" ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = get_3_unit_peer_relation() container = testing.Container(name=CONTAINER, can_connect=True) - data_stroage = testing.Storage(name="data") + data_strorage = testing.Storage(name="data") state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), relations={relation}, leader=True, containers={container}, - storages={data_stroage}, + storages={data_strorage}, ) with ( @@ -86,7 +86,7 @@ def test_non_primary(cloud_spec): return_value=["10.0.1.1", "10.0.1.2", "10.0.1.3"], ), ): - state_out = ctx.run(ctx.on.storage_detaching(data_stroage), state_in) + state_out = ctx.run(ctx.on.storage_detaching(data_strorage), state_in) mock_stop.assert_called_once() mock_reset_sentinel_states.assert_called_once() mock_verify_expected_replica_count.assert_called_once() @@ -94,17 +94,17 @@ def test_non_primary(cloud_spec): def test_primary(cloud_spec): - """Test that if another unit has the lock, then the lock is not acquired.""" + """Test scale-down behavior when this unit is the primary and successfully acquires the lock.""" ctx = testing.Context(ValkeyCharm, app_trusted=True) relation = get_3_unit_peer_relation() container = testing.Container(name=CONTAINER, can_connect=True) - data_stroage = testing.Storage(name="data") + data_strorage = testing.Storage(name="data") state_in = testing.State( model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), relations={relation}, 
leader=True, containers={container}, - storages={data_stroage}, + storages={data_strorage}, ) with ( @@ -128,7 +128,7 @@ def test_primary(cloud_spec): return_value=["10.0.1.1", "10.0.1.2", "10.0.1.3"], ), ): - state_out = ctx.run(ctx.on.storage_detaching(data_stroage), state_in) + state_out = ctx.run(ctx.on.storage_detaching(data_strorage), state_in) mock_failover.assert_called_once() mock_stop.assert_called_once() mock_reset_sentinel_states.assert_called_once() From f1030918579fc1fc56a9bcb82245ed00253ab835 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Fri, 27 Feb 2026 10:31:56 +0000 Subject: [PATCH 126/159] port fix from tls for leader elected event --- src/events/base_events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index cc74d4e..ef8b7cd 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -224,7 +224,7 @@ def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: """Handle the leader-elected event.""" - if not self.charm.state.peer_relation: + if not (self.charm.state.peer_relation and self.charm.workload.can_connect): event.defer() return From a8f8912e4d8437f5c36994e08fad1b1fe0a2a480 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 3 Mar 2026 07:05:00 +0000 Subject: [PATCH 127/159] cw use databag to filter units and use helper to remove units on both substrates --- tests/integration/continuous_writes.py | 44 +++++-- tests/integration/helpers.py | 61 +++++++++- tests/integration/k8s/ha/test_scaling.py | 23 ++-- tests/integration/vm/ha/test_scaling.py | 142 ++++++++++++++++++++++- tests/integration/vm/test_charm.py | 3 +- 5 files changed, 254 insertions(+), 19 deletions(-) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index ed87368..267c3d7 100644 --- a/tests/integration/continuous_writes.py +++ 
b/tests/integration/continuous_writes.py @@ -6,6 +6,7 @@ import logging import multiprocessing import queue +import time from contextlib import asynccontextmanager from multiprocessing import log_to_stderr from pathlib import Path @@ -13,7 +14,12 @@ from typing import Optional import jubilant -from glide import GlideClient, GlideClientConfiguration, NodeAddress, ServerCredentials +from glide import ( + GlideClient, + GlideClientConfiguration, + NodeAddress, + ServerCredentials, +) from tenacity import ( retry, stop_after_attempt, @@ -22,7 +28,7 @@ ) from literals import CharmUsers -from tests.integration.helpers import get_cluster_hostnames, get_password +from tests.integration.helpers import get_data_bag, get_password logger = logging.getLogger(__name__) @@ -31,6 +37,18 @@ class WriteFailedError(Exception): """Raised when a single write operation has failed.""" +def get_active_hostnames(juju: jubilant.Juju, app_name: str) -> str: + """Get hostnames of units in started state and not marked for scale down.""" + return ",".join( + [ + unit["private-ip"] + for unit in get_data_bag(juju, app_name, "valkey-peers").values() + if unit.get("start-state", "") == "started" + and unit.get("scale-down-state", None) is None + ] + ) + + class ContinuousWrites: """Utility class for managing continuous async writes to Valkey using GLIDE.""" @@ -54,7 +72,7 @@ def __init__( def _get_config(self) -> SimpleNamespace: """Fetch current cluster configuration from Juju.""" return SimpleNamespace( - endpoints=",".join(get_cluster_hostnames(self._juju, app_name=self._app)), + endpoints=get_active_hostnames(self._juju, self._app), valkey_password=get_password(self._juju, user=CharmUsers.VALKEY_ADMIN), ) @@ -70,7 +88,7 @@ async def _create_glide_client(self, config: Optional[SimpleNamespace] = None) - glide_config = GlideClientConfiguration( addresses=addresses, client_name="continuous_writes_client", - request_timeout=5000, + request_timeout=250, credentials=credentials, ) @@ -233,7 +251,7 
@@ async def _make_client(conf: SimpleNamespace) -> GlideClient:
         glide_config = GlideClientConfiguration(
             addresses=addresses,
             client_name="continuous_writes_worker",
-            request_timeout=5000,
+            request_timeout=250,
             credentials=credentials,
         )
         return await GlideClient.create(glide_config)
@@ -262,6 +280,7 @@ async def with_client(conf: SimpleNamespace):
 
         try:
             proc_logger.info(f"Writing value: {current_val}")
+            proc_logger.info(f"Current endpoints={config.endpoints}")
             async with with_client(config) as client:
                 if not (
                     res := await asyncio.wait_for(
@@ -291,7 +310,18 @@ async def with_client(conf: SimpleNamespace):
     cw = ContinuousWrites(juju=juju_env, app="valkey", in_between_sleep=0.5)
     cw.clear()
     cw.start()
-    print("Continuous writes started. Press Enter to stop...")
-    input()
+    # stop on ctrl + C or after some time
+    hostnames = get_active_hostnames(juju_env, "valkey")
+    try:
+        while True:
+            time.sleep(1)
+            if (new_hostnames := get_active_hostnames(juju_env, "valkey")) != hostnames:
+                logger.info(
+                    f"Hostnames changed from {hostnames} to {new_hostnames}, updating continuous writes client."
+                )
+                hostnames = new_hostnames
+                cw.update()
+    except KeyboardInterrupt:
+        pass
 
     stats = cw.clear()
     print(f"Stopped. Stats: {stats}")
diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py
index f6b677c..a87879d 100644
--- a/tests/integration/helpers.py
+++ b/tests/integration/helpers.py
@@ -2,6 +2,7 @@
 # Copyright 2025 Canonical Ltd.
 # See LICENSE file for licensing details.
 
+import json import logging import os import re @@ -10,7 +11,7 @@ from contextlib import asynccontextmanager, contextmanager from datetime import datetime, timedelta from pathlib import Path -from typing import List, NamedTuple +from typing import List, Literal, NamedTuple import jubilant import yaml @@ -31,6 +32,7 @@ INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, CharmUsers, + Substrate, ) logger = logging.getLogger(__name__) @@ -556,3 +558,60 @@ async def auth_test(hostnames: list[str], username: str | None, password: str | raise WrongPassError("Authentication failed: WRONGPASS error") from e else: raise e + + +def remove_number_units( + juju: jubilant.Juju, app: str, num_units: int, substrate: Substrate +) -> None: + """Remove a specified number of units from an application. + + Args: + juju: An instance of Jubilant's Juju class on which to run Juju commands + app: The name of the application from which to remove units + num_units: The number of units to remove + substrate: The substrate type ("k8s" or "vm") + """ + match substrate: + case "k8s": + juju.remove_unit(app, num_units=num_units) + case "vm": + # get units names + unit_names = list(juju.status().get_units(app)) + # remove units by name until num_units have been removed + juju.remove_unit(*unit_names[:num_units]) + + +def get_data_bag( + juju: jubilant.Juju, + app_name: str, + relation_name: str, + scope: Literal["app", "unit"] = "unit", +) -> dict: + """Get the data bag for a given unit. + + Args: + juju: An instance of Jubilant's Juju class on which to run Juju commands + app_name: The name of the application whose data bag to retrieve + relation_name: The name of the relation for which to retrieve the data bag + scope: Specify whether to get the data bag for the app or unit + = + Returns: + The data bag for the specified unit. 
+ """ + unit_name = next(iter(juju.status().get_units(app_name))) + unit_info = juju.cli("show-unit", unit_name, "--format", "json") + json_info = json.loads(unit_info) + relation = next( + rel for rel in json_info[unit_name]["relation-info"] if rel["endpoint"] == relation_name + ) + if not relation: + raise ValueError(f"Relation {relation_name} not found for unit {unit_name}") + if scope == "app": + return relation["application-data"] + local_data = relation["local-unit"]["data"] + remote_data = ( + {u_name: data["data"] for u_name, data in relation["related-units"].items()} + if relation.get("related-units") + else {} + ) + return {unit_name: local_data} | remote_data diff --git a/tests/integration/k8s/ha/test_scaling.py b/tests/integration/k8s/ha/test_scaling.py index 5a16116..464ea4a 100644 --- a/tests/integration/k8s/ha/test_scaling.py +++ b/tests/integration/k8s/ha/test_scaling.py @@ -6,7 +6,7 @@ import jubilant -from literals import CharmUsers +from literals import CharmUsers, Substrate from tests.integration.cw_helpers import ( assert_continuous_writes_consistent, assert_continuous_writes_increasing, @@ -18,6 +18,7 @@ get_cluster_hostnames, get_number_connected_slaves, get_password, + remove_number_units, seed_valkey, ) @@ -28,9 +29,14 @@ TEST_VALUE = "test_value" -def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: +def test_build_and_deploy(charm: str, juju: jubilant.Juju, substrate: Substrate) -> None: """Build the charm-under-test and deploy it with three units.""" - juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=1, trust=True) + juju.deploy( + charm, + resources=IMAGE_RESOURCE if substrate == Substrate.K8S else None, + num_units=1, + trust=True, + ) juju.wait( lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=600, @@ -87,7 +93,7 @@ async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: ) -async def test_scale_down(juju: jubilant.Juju) -> None: +async def 
test_scale_down(juju: jubilant.Juju, substrate: Substrate) -> None: """Make sure scale down operations complete successfully.""" number_of_slaves = await get_number_connected_slaves( hostnames=get_cluster_hostnames(juju, APP_NAME), @@ -99,7 +105,7 @@ async def test_scale_down(juju: jubilant.Juju) -> None: ) # scale down - juju.remove_unit(APP_NAME, num_units=1) + remove_number_units(juju, APP_NAME, num_units=1, substrate=substrate) juju.wait( lambda status: are_apps_active_and_agents_idle( status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 @@ -139,7 +145,8 @@ async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: ) # scale down multiple units - juju.remove_unit(APP_NAME, num_units=2) + remove_number_units(juju, APP_NAME, num_units=2, substrate=Substrate.K8S) + juju.wait( lambda status: are_apps_active_and_agents_idle( status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 @@ -161,7 +168,9 @@ async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: """Make sure that removing all units and then adding them again works.""" # remove all remaining units - juju.remove_unit(APP_NAME, num_units=len(juju.status().apps[APP_NAME].units)) + remove_number_units( + juju, APP_NAME, num_units=len(juju.status().apps[APP_NAME].units), substrate=Substrate.K8S + ) juju.wait(lambda status: len(juju.status().get_units(APP_NAME)) == 0) # scale up again diff --git a/tests/integration/vm/ha/test_scaling.py b/tests/integration/vm/ha/test_scaling.py index 3b33fd5..c6ddc57 100644 --- a/tests/integration/vm/ha/test_scaling.py +++ b/tests/integration/vm/ha/test_scaling.py @@ -1,21 +1,24 @@ #!/usr/bin/env python3 # Copyright 2026 Canonical Ltd. # See LICENSE file for licensing details. 
+import asyncio import logging import jubilant -from literals import CharmUsers +from literals import CharmUsers, Substrate from tests.integration.cw_helpers import ( assert_continuous_writes_consistent, assert_continuous_writes_increasing, ) from tests.integration.helpers import ( APP_NAME, + IMAGE_RESOURCE, are_apps_active_and_agents_idle, get_cluster_hostnames, get_number_connected_slaves, get_password, + remove_number_units, seed_valkey, ) @@ -26,9 +29,14 @@ TEST_VALUE = "test_value" -def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: +def test_build_and_deploy(charm: str, juju: jubilant.Juju, substrate: Substrate) -> None: """Build the charm-under-test and deploy it with three units.""" - juju.deploy(charm, num_units=1, trust=True) + juju.deploy( + charm, + resources=IMAGE_RESOURCE if substrate == Substrate.K8S else None, + num_units=1, + trust=True, + ) juju.wait( lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=600, @@ -83,3 +91,131 @@ async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) + + +async def test_scale_down(juju: jubilant.Juju, substrate: Substrate) -> None: + """Make sure scale down operations complete successfully.""" + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected slaves, got {number_of_slaves}." 
+ ) + + # scale down + remove_number_units(juju, APP_NAME, num_units=1, substrate=substrate) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 + ) + ) + num_units = len(juju.status().get_units(APP_NAME)) + assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." + + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS - 2, ( + f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." + ) + + +async def test_scale_down_multiple_units(juju: jubilant.Juju, substrate: Substrate) -> None: + """Make sure multiple scale down operations complete successfully.""" + number_current_units = len(juju.status().apps[APP_NAME].units) + juju.add_unit(APP_NAME, num_units=(NUM_UNITS + 1) - number_current_units) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, idle_period=10, unit_count=NUM_UNITS + 1 + ), + timeout=1200, + ) + + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS, ( + f"Expected {NUM_UNITS} connected slaves, got {number_of_slaves}." + ) + + # scale down multiple units + remove_number_units(juju, APP_NAME, num_units=2, substrate=substrate) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 + ) + ) + num_units = len(juju.status().get_units(APP_NAME)) + assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." 
+ + number_of_slaves = await get_number_connected_slaves( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert number_of_slaves == NUM_UNITS - 2, ( + f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." + ) + + +async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes, substrate: Substrate) -> None: + """Make sure that removing all units and then adding them again works.""" + # remove all remaining units + remove_number_units( + juju, APP_NAME, num_units=len(juju.status().apps[APP_NAME].units), substrate=substrate + ) + juju.wait(lambda status: len(juju.status().get_units(APP_NAME)) == 0) + + # scale up again + juju.add_unit(APP_NAME, num_units=NUM_UNITS) + + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, unit_count=NUM_UNITS, idle_period=10 + ), + timeout=1200, + ) + + hostnames = get_cluster_hostnames(juju, APP_NAME) + + connected_slaves = await get_number_connected_slaves( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + assert connected_slaves == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." 
+ ) + await c_writes.async_clear() + c_writes.start() + await asyncio.sleep(10) # let the continuous writes write some data + await assert_continuous_writes_increasing( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + logger.info("Stopping continuous writes after scale up test.") + logger.info(await c_writes.async_stop()) + assert_continuous_writes_consistent( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + await c_writes.async_clear() + + +def test_remove_application(juju: jubilant.Juju) -> None: + """Make sure the application can be removed.""" + juju.remove_application(APP_NAME) + + juju.wait( + lambda status: APP_NAME not in status.apps, + timeout=600, + delay=5, + ) diff --git a/tests/integration/vm/test_charm.py b/tests/integration/vm/test_charm.py index df4b6ef..576eee2 100644 --- a/tests/integration/vm/test_charm.py +++ b/tests/integration/vm/test_charm.py @@ -13,6 +13,7 @@ from statuses import CharmStatuses, ClusterStatuses from tests.integration.helpers import ( APP_NAME, + IMAGE_RESOURCE, INTERNAL_USERS_SECRET_LABEL, NoAuthError, WrongPassError, @@ -39,7 +40,7 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju) -> None: """Build the charm-under-test and deploy it with three units.""" - juju.deploy(charm, num_units=NUM_UNITS, trust=True) + juju.deploy(charm, resources=IMAGE_RESOURCE, num_units=NUM_UNITS, trust=True) juju.wait( lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=600, From 7ea81752fb3a46e3c10137c72501229afbb5ee5f Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 3 Mar 2026 07:36:16 +0000 Subject: [PATCH 128/159] add c_writes to scale down --- tests/integration/ha/test_scaling.py | 51 ++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/tests/integration/ha/test_scaling.py 
b/tests/integration/ha/test_scaling.py index 464ea4a..33e15ad 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -93,7 +93,7 @@ async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: ) -async def test_scale_down(juju: jubilant.Juju, substrate: Substrate) -> None: +async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_writes) -> None: """Make sure scale down operations complete successfully.""" number_of_slaves = await get_number_connected_slaves( hostnames=get_cluster_hostnames(juju, APP_NAME), @@ -104,6 +104,10 @@ async def test_scale_down(juju: jubilant.Juju, substrate: Substrate) -> None: f"Expected {NUM_UNITS - 1} connected slaves, got {number_of_slaves}." ) + await c_writes.async_clear() + c_writes.start() + await asyncio.sleep(10) # let the continuous writes write some data + # scale down remove_number_units(juju, APP_NAME, num_units=1, substrate=substrate) juju.wait( @@ -123,8 +127,28 @@ async def test_scale_down(juju: jubilant.Juju, substrate: Substrate) -> None: f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." 
) + # update hostnames after scale down + c_writes.update() + + await assert_continuous_writes_increasing( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + + logger.info("Stopping continuous writes after scale up test.") + logger.info(await c_writes.async_stop()) + + assert_continuous_writes_consistent( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + -async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: +async def test_scale_down_multiple_units( + juju: jubilant.Juju, substrate: Substrate, c_writes +) -> None: """Make sure multiple scale down operations complete successfully.""" number_current_units = len(juju.status().apps[APP_NAME].units) juju.add_unit(APP_NAME, num_units=(NUM_UNITS + 1) - number_current_units) @@ -144,8 +168,12 @@ async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: f"Expected {NUM_UNITS} connected slaves, got {number_of_slaves}." ) + await c_writes.async_clear() + c_writes.start() + await asyncio.sleep(10) # let the continuous writes write some data + # scale down multiple units - remove_number_units(juju, APP_NAME, num_units=2, substrate=Substrate.K8S) + remove_number_units(juju, APP_NAME, num_units=2, substrate=substrate) juju.wait( lambda status: are_apps_active_and_agents_idle( @@ -164,6 +192,23 @@ async def test_scale_down_multiple_units(juju: jubilant.Juju) -> None: f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." 
) + c_writes.update() + + await assert_continuous_writes_increasing( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + + logger.info("Stopping continuous writes after scale down test.") + logger.info(await c_writes.async_stop()) + + assert_continuous_writes_consistent( + hostnames=get_cluster_hostnames(juju, APP_NAME), + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: """Make sure that removing all units and then adding them again works.""" From 64bb344378f6c57949a2a2d9908203977821dc27 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 3 Mar 2026 07:36:58 +0000 Subject: [PATCH 129/159] fail faster if any hostname is down --- tests/integration/continuous_writes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index 267c3d7..129daaa 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -15,6 +15,7 @@ import jubilant from glide import ( + BackoffStrategy, GlideClient, GlideClientConfiguration, NodeAddress, @@ -90,6 +91,7 @@ async def _create_glide_client(self, config: Optional[SimpleNamespace] = None) - client_name="continuous_writes_client", request_timeout=250, credentials=credentials, + reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) return await GlideClient.create(glide_config) @@ -253,6 +255,7 @@ async def _make_client(conf: SimpleNamespace) -> GlideClient: client_name="continuous_writes_worker", request_timeout=250, credentials=credentials, + reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) return await GlideClient.create(glide_config) From 320d17d3901f78fdb8bbccb11dfb7cfa3aef7378 Mon Sep 17 00:00:00 2001 From: 
Smail Kourta Date: Tue, 3 Mar 2026 07:39:03 +0000 Subject: [PATCH 130/159] rename tests so we can easily run all scale down tests using -k --- tests/integration/ha/test_scaling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 33e15ad..8ee6645 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -210,7 +210,7 @@ async def test_scale_down_multiple_units( ) -async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: +async def test_scale_down_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: """Make sure that removing all units and then adding them again works.""" # remove all remaining units remove_number_units( @@ -256,7 +256,7 @@ async def test_scale_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: await c_writes.async_clear() -def test_remove_application(juju: jubilant.Juju) -> None: +def test_scale_down_remove_application(juju: jubilant.Juju) -> None: """Make sure the application can be removed.""" juju.remove_application(APP_NAME) From ce045e41278d915ce1e719611622e51b3f018656 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 3 Mar 2026 08:18:47 +0000 Subject: [PATCH 131/159] vm agnostic test --- tests/integration/ha/test_scaling.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 8ee6645..bad6ec7 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -210,11 +210,13 @@ async def test_scale_down_multiple_units( ) -async def test_scale_down_to_zero_and_back(juju: jubilant.Juju, c_writes) -> None: +async def test_scale_down_to_zero_and_back( + juju: jubilant.Juju, substrate: Substrate, c_writes +) -> None: """Make sure that removing all units and then adding them again works.""" # remove all remaining units remove_number_units( - juju, 
APP_NAME, num_units=len(juju.status().apps[APP_NAME].units), substrate=Substrate.K8S + juju, APP_NAME, num_units=len(juju.status().apps[APP_NAME].units), substrate=substrate ) juju.wait(lambda status: len(juju.status().get_units(APP_NAME)) == 0) From 71008c567a34d337e1705bd6c2daac0946edee4e Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 3 Mar 2026 14:24:54 +0000 Subject: [PATCH 132/159] add scale down primary test on vm --- tests/integration/ha/test_scaling.py | 45 ++++++++++++++++++++++++++++ tests/integration/helpers.py | 26 +++++++++------- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index bad6ec7..c992103 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -5,6 +5,7 @@ import logging import jubilant +import pytest from literals import CharmUsers, Substrate from tests.integration.cw_helpers import ( @@ -18,6 +19,7 @@ get_cluster_hostnames, get_number_connected_slaves, get_password, + get_primary_ip, remove_number_units, seed_valkey, ) @@ -258,6 +260,49 @@ async def test_scale_down_to_zero_and_back( await c_writes.async_clear() +async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_writes) -> None: + """Make sure that removing the primary unit triggers a new primary to be elected and the cluster remains available.""" + if substrate == Substrate.K8S: + pytest.skip("Primary unit can only targeted on VM") + + await c_writes.async_clear() + c_writes.start() + primary_ip = get_primary_ip(juju, APP_NAME) + primary_unit = next( + unit + for unit, unit_value in juju.status().get_units(APP_NAME).items() + if unit_value.public_address == primary_ip + ) + assert primary_unit is not None, "Failed to identify primary unit for scale down test." + logger.debug( + f"Identified primary unit {primary_unit} with IP {primary_ip} for scale down test." 
+ ) + juju.remove_unit(primary_unit) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 + ) + ) + c_writes.update() + new_primary_ip = get_primary_ip(juju, APP_NAME) + assert new_primary_ip != primary_ip, "Primary IP did not change after removing primary unit." + logger.debug(f"New primary IP after scale down is {new_primary_ip}.") + hostnames = get_cluster_hostnames(juju, APP_NAME) + await assert_continuous_writes_increasing( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + logger.info("Stopping continuous writes after primary scale down test.") + logger.info(await c_writes.async_stop()) + assert_continuous_writes_consistent( + hostnames=hostnames, + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), + ) + await c_writes.async_clear() + + def test_scale_down_remove_application(juju: jubilant.Juju) -> None: """Make sure the application can be removed.""" juju.remove_application(APP_NAME) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index a87879d..0378b33 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -323,19 +323,27 @@ def fast_forward(juju: jubilant.Juju): juju.model_config({"update-status-hook-interval": old}) -async def get_primary_ip(juju: jubilant.Juju, app: str) -> str: +def get_primary_ip(juju: jubilant.Juju, app: str) -> str: """Get the primary node of the Valkey cluster. Returns: The IP address of the primary node. 
""" hostnames = get_cluster_hostnames(juju, app) - async with create_valkey_client([hostnames[0]], password=get_password(juju)) as client: - info = await client.custom_command(["client", "info"]) - match = re.search(r"laddr=([\d\.]+):", info.decode()) - if match: - return match.group(1) - raise RuntimeError("Primary IP not found in client info output") + replication_info = exec_valkey_cli( + hostnames[0], + username=CharmUsers.VALKEY_ADMIN.value, + password=get_password(juju), + command="info replication", + ).stdout + # if master then we return the hostname + if "role:master" in replication_info: + return hostnames[0] + # extract ip + match = re.search(r"master_host:([^\s]+)", replication_info) + if not match: + raise ValueError("Could not find master_host in replication info") + return match.group(1) def get_password(juju: jubilant.Juju, user: CharmUsers = CharmUsers.VALKEY_ADMIN) -> str: @@ -409,9 +417,7 @@ def exec_valkey_cli( hostname: str, username: str, password: str, command: str ) -> valkey_cli_result: """Execute a Valkey CLI command and returns the output as a string.""" - command = ( - f"valkey-cli -h {hostname} -p {CLIENT_PORT} --user {username} --pass {password} {command}" - ) + command = f"valkey-cli --no-auth-warning -h {hostname} -p {CLIENT_PORT} --user {username} --pass {password} {command}" result = subprocess.run( command.split(), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True ) From 3fef2bf5a7be2fd57063aeafa22608968a0a0e2c Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Mar 2026 04:39:26 +0000 Subject: [PATCH 133/159] feedback from rene --- src/common/client.py | 2 +- src/common/locks.py | 21 ++++++------- src/core/models.py | 9 +++--- src/events/base_events.py | 3 +- src/literals.py | 2 +- src/workload_k8s.py | 7 ++++- tests/integration/conftest.py | 2 +- tests/integration/continuous_writes.py | 1 - tests/integration/ha/test_scaling.py | 42 +++++++++++++------------- tests/integration/helpers.py | 10 +++--- 10 
files changed, 52 insertions(+), 47 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index a731018..66760f1 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -264,7 +264,7 @@ def __init__( super().__init__(username, password, workload) def ping(self, hostname: str) -> bool: - """Ping the Valkey server to check if it's responsive. + """Ping the Sentinel server to check if it's responsive. Args: hostname (str): The hostname to connect to. diff --git a/src/common/locks.py b/src/common/locks.py index 1aeb850..dd03b48 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -1,7 +1,7 @@ # Copyright 2026 Canonical Ltd. # See LICENSE file for licensing details. -"""Collection of lock names for cluster operations.""" +"""Collection of locks for cluster operations.""" import logging import time @@ -42,7 +42,7 @@ def release_lock(self) -> bool: @property @abstractmethod - def do_i_hold_lock(self) -> bool: + def is_held_by_this_unit(self) -> bool: """Check if the local unit holds the lock.""" raise NotImplementedError @@ -90,7 +90,7 @@ def is_lock_free_to_give(self) -> bool: raise NotImplementedError @property - def do_i_hold_lock(self) -> bool: + def is_held_by_this_unit(self) -> bool: """Check if the local unit holds the start lock.""" return self.state.unit_server.unit_name == getattr( self.state.cluster.model, self.member_with_lock_atr_name, "" @@ -109,7 +109,7 @@ def request_lock(self) -> bool: ) self.process() - return self.do_i_hold_lock + return self.is_held_by_this_unit def release_lock(self) -> bool: """Release the lock from the local unit.""" @@ -135,10 +135,10 @@ def process(self) -> None: if self.is_lock_free_to_give: next_unit = self.next_unit_to_give_lock self.state.cluster.update({self.member_with_lock_atr_name: next_unit}) - logger.debug(f"Gave {self.name} lock to {next_unit}") - logger.debug( - f"{self.name} lock is currently held by {getattr(self.state.cluster.model, self.member_with_lock_atr_name)}" - ) + 
logger.debug("Gave %s to %s", self.name, next_unit) + + if unit_with_lock := self.state.cluster.model[self.member_with_lock_atr_name]: + logger.debug("%s is currently held by %s", self.name, unit_with_lock) class StartLock(DataBagLock): @@ -164,10 +164,9 @@ class ScaleDownLock(Lockable): This will use valkey to store the lock state and will check if the unit with the lock has completed its scale down operation """ - lock_key = "scale_down_lock" - def __init__(self, charm: "ValkeyCharm") -> None: self.charm = charm + self.lock_key = f"scale_down_lock_{self.charm.app.name}" @property def client(self) -> ValkeyClient: @@ -244,7 +243,7 @@ def request_lock(self, timeout: int | None = None) -> bool: primary_ip = self.charm.sentinel_manager.get_primary_ip() @property - def do_i_hold_lock(self) -> bool: + def is_held_by_this_unit(self) -> bool: """Check if the local unit holds the lock.""" unit_with_lock = self.get_unit_with_lock() return ( diff --git a/src/core/models.py b/src/core/models.py index 0a185e3..864b0d7 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -121,10 +121,11 @@ def is_started(self) -> bool: @property def is_being_removed(self) -> bool: """Check if the unit is being removed from the cluster.""" - return self.model.scale_down_state not in { - ScaleDownState.NO_SCALE_DOWN.value, - ScaleDownState.WAIT_FOR_LOCK.value, - ScaleDownState.WAIT_TO_FAILOVER.value, + return self.model.scale_down_state in { + ScaleDownState.STOP_SERVICES.value, + ScaleDownState.RESET_SENTINEL.value, + ScaleDownState.HEALTH_CHECK.value, + ScaleDownState.GOING_AWAY.value, } @property diff --git a/src/events/base_events.py b/src/events/base_events.py index ef8b7cd..63cc9cf 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -121,7 +121,7 @@ def _on_start(self, event: ops.StartEvent) -> None: self.charm.state.unit_server.update({"start_state": StartState.WAITING_TO_START.value}) start_lock.request_lock() - if not start_lock.do_i_hold_lock: + if not 
start_lock.is_held_by_this_unit: logger.info("Waiting for lock to start") event.defer() return @@ -137,6 +137,7 @@ def _on_start(self, event: ops.StartEvent) -> None: self.charm.state.unit_server.update( {"start_state": StartState.WAITING_FOR_PRIMARY_START.value} ) + start_lock.release_lock() event.defer() return diff --git a/src/literals.py b/src/literals.py index dbe7383..75a947f 100644 --- a/src/literals.py +++ b/src/literals.py @@ -93,7 +93,7 @@ class ScaleDownState(StrEnum): NO_SCALE_DOWN = "" WAIT_FOR_LOCK = "wait_for_lock" WAIT_TO_FAILOVER = "wait_to_failover" - STOP_SERVICES = "stopped_services" + STOP_SERVICES = "stopping_services" RESET_SENTINEL = "reset_sentinel" HEALTH_CHECK = "health_check" GOING_AWAY = "going_away" diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 91f0a3f..7e017d0 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ -133,7 +133,12 @@ def exec(self, command: list[str]) -> tuple[str, str | None]: def stop(self) -> None: try: self.container.stop(self.valkey_service, self.sentinel_service, self.metric_service) - except ops.pebble.ChangeError as e: + except ( + ops.pebble.ChangeError, + ops.pebble.TimeoutError, + ops.pebble.ConnectionError, + ops.pebble.APIError, + ) as e: logger.error("Failed to stop Valkey services: %s", e) raise ValkeyServicesCouldNotBeStoppedError( f"Failed to stop Valkey services: {e}" diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index fef0088..423654a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -18,7 +18,7 @@ def c_writes(juju: jubilant.Juju): """Create instance of the ContinuousWrites.""" app = APP_NAME - logger.debug(f"Creating ContinuousWrites instance for app with name {app}") + logger.info(f"Creating ContinuousWrites instance for app with name {app}") return ContinuousWrites(juju, app) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index 129daaa..dae9fef 100644 --- 
a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -269,7 +269,6 @@ async def with_client(conf: SimpleNamespace): current_val = starting_number config = initial_config - # client = await _make_client(config) proc_logger.info(f"Starting continuous async writes from {current_val}") diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index c992103..5e7b886 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -17,7 +17,7 @@ IMAGE_RESOURCE, are_apps_active_and_agents_idle, get_cluster_hostnames, - get_number_connected_slaves, + get_number_connected_replicas, get_password, get_primary_ip, remove_number_units, @@ -72,13 +72,13 @@ async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: # check if all units have been added to the cluster hostnames = get_cluster_hostnames(juju, APP_NAME) - connected_slaves = await get_number_connected_slaves( + connected_replicas = await get_number_connected_replicas( hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert connected_slaves == NUM_UNITS - 1, ( - f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." + assert connected_replicas == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected replicas, got {connected_replicas}." 
) await assert_continuous_writes_increasing( @@ -97,13 +97,13 @@ async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_writes) -> None: """Make sure scale down operations complete successfully.""" - number_of_slaves = await get_number_connected_slaves( + number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, APP_NAME), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_slaves == NUM_UNITS - 1, ( - f"Expected {NUM_UNITS - 1} connected slaves, got {number_of_slaves}." + assert number_of_replicas == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected replicas, got {number_of_replicas}." ) await c_writes.async_clear() @@ -120,13 +120,13 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ num_units = len(juju.status().get_units(APP_NAME)) assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." - number_of_slaves = await get_number_connected_slaves( + number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, APP_NAME), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_slaves == NUM_UNITS - 2, ( - f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." + assert number_of_replicas == NUM_UNITS - 2, ( + f"Expected {NUM_UNITS - 2} connected replicas, got {number_of_replicas}." 
) # update hostnames after scale down @@ -161,13 +161,13 @@ async def test_scale_down_multiple_units( timeout=1200, ) - number_of_slaves = await get_number_connected_slaves( + number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, APP_NAME), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_slaves == NUM_UNITS, ( - f"Expected {NUM_UNITS} connected slaves, got {number_of_slaves}." + assert number_of_replicas == NUM_UNITS, ( + f"Expected {NUM_UNITS} connected replicas, got {number_of_replicas}." ) await c_writes.async_clear() @@ -185,13 +185,13 @@ async def test_scale_down_multiple_units( num_units = len(juju.status().get_units(APP_NAME)) assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." - number_of_slaves = await get_number_connected_slaves( + number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, APP_NAME), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_slaves == NUM_UNITS - 2, ( - f"Expected {NUM_UNITS - 2} connected slaves, got {number_of_slaves}." + assert number_of_replicas == NUM_UNITS - 2, ( + f"Expected {NUM_UNITS - 2} connected replicas, got {number_of_replicas}." ) c_writes.update() @@ -234,13 +234,13 @@ async def test_scale_down_to_zero_and_back( hostnames = get_cluster_hostnames(juju, APP_NAME) - connected_slaves = await get_number_connected_slaves( + connected_replicas = await get_number_connected_replicas( hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert connected_slaves == NUM_UNITS - 1, ( - f"Expected {NUM_UNITS - 1} connected slaves, got {connected_slaves}." + assert connected_replicas == NUM_UNITS - 1, ( + f"Expected {NUM_UNITS - 1} connected replicas, got {connected_replicas}." 
) await c_writes.async_clear() c_writes.start() @@ -274,7 +274,7 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w if unit_value.public_address == primary_ip ) assert primary_unit is not None, "Failed to identify primary unit for scale down test." - logger.debug( + logger.info( f"Identified primary unit {primary_unit} with IP {primary_ip} for scale down test." ) juju.remove_unit(primary_unit) @@ -286,7 +286,7 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w c_writes.update() new_primary_ip = get_primary_ip(juju, APP_NAME) assert new_primary_ip != primary_ip, "Primary IP did not change after removing primary unit." - logger.debug(f"New primary IP after scale down is {new_primary_ip}.") + logger.info(f"New primary IP after scale down is {new_primary_ip}.") hostnames = get_cluster_hostnames(juju, APP_NAME) await assert_continuous_writes_increasing( hostnames=hostnames, diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 0378b33..0d4687b 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -370,7 +370,7 @@ async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: total_bytes_target = target_gb * 1024 * 1024 * 1024 total_keys = total_bytes_target // value_size_bytes - logger.debug( + logger.info( f"Targeting ~{target_gb}GB ({total_keys:,} keys of {value_size_bytes} bytes each)" ) @@ -507,12 +507,12 @@ async def ping_cluster( return await client.ping() == "PONG".encode() -async def get_number_connected_slaves( +async def get_number_connected_replicas( hostnames: list[str], username: str, password: str, ) -> int: - """Get the number of connected slaves in the Valkey cluster. + """Get the number of connected replicas in the Valkey cluster. Args: hostnames: List of hostnames of the Valkey cluster nodes. @@ -520,7 +520,7 @@ async def get_number_connected_slaves( password: The password for authentication. 
Returns: - The number of connected slaves. + The number of connected replicas. """ async with create_valkey_client( hostnames=hostnames, username=username, password=password @@ -528,7 +528,7 @@ async def get_number_connected_slaves( info = (await client.info([InfoSection.REPLICATION])).decode() search_result = re.search(r"connected_slaves:([\d+])", info) if not search_result: - raise ValueError("Could not parse number of connected slaves from info output") + raise ValueError("Could not parse number of connected replicas from info output") return int(search_result.group(1)) From f500b434d16f68d85beb4c242292e150c49cccbe Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Mar 2026 04:43:04 +0000 Subject: [PATCH 134/159] add a todo comment --- src/common/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/client.py b/src/common/client.py index 66760f1..4026d8b 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -92,6 +92,7 @@ def exec_cli_command( class ValkeyClient(CliClient): """Handle valkey client connections.""" + # TODO Handle TLS port when TLS is merged port: int = CLIENT_PORT def __init__( From 49f8826bd189c56e2db69798ed270140c1efbca3 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Mar 2026 04:50:14 +0000 Subject: [PATCH 135/159] lint and add clearing c_writes --- tests/integration/ha/test_scaling.py | 7 ++++++- tests/integration/helpers.py | 4 +--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 5e7b886..4880709 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -54,9 +54,11 @@ async def test_seed_data(juju: jubilant.Juju) -> None: await seed_valkey(juju, target_gb=1) -async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: +async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: """Make sure new units are added to the valkey downtime.""" 
init_units_count = len(juju.status().apps[APP_NAME].units) + await c_writes.async_clear() + c_writes.start() # scale up juju.add_unit(APP_NAME, num_units=NUM_UNITS - init_units_count) @@ -93,6 +95,7 @@ async def test_scale_up(juju: jubilant.Juju, c_writes, c_writes_runner) -> None: username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) + await c_writes.async_clear() async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_writes) -> None: @@ -146,6 +149,7 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) + await c_writes.async_clear() async def test_scale_down_multiple_units( @@ -210,6 +214,7 @@ async def test_scale_down_multiple_units( username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) + await c_writes.async_clear() async def test_scale_down_to_zero_and_back( diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 0d4687b..09a95c7 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -370,9 +370,7 @@ async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: total_bytes_target = target_gb * 1024 * 1024 * 1024 total_keys = total_bytes_target // value_size_bytes - logger.info( - f"Targeting ~{target_gb}GB ({total_keys:,} keys of {value_size_bytes} bytes each)" - ) + logger.info(f"Targeting ~{target_gb}GB ({total_keys:,} keys of {value_size_bytes} bytes each)") start_time = time.time() keys_added = 0 From fde927fcca572cb39230b6e6b75a258ea3f9ebc3 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Mar 2026 06:04:39 +0000 Subject: [PATCH 136/159] increase cw request timeout to 1s --- src/common/client.py | 11 ++--------- src/managers/sentinel.py | 4 ++-- tests/integration/continuous_writes.py | 4 ++-- 3 files changed, 6 insertions(+), 13 deletions(-) 
diff --git a/src/common/client.py b/src/common/client.py index 4026d8b..f3b4073 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -141,12 +141,6 @@ def info_persistence(self, hostname: str) -> dict[str, str] | None: if line.startswith("#"): continue values_parts = line.split(":", 1) - if len(values_parts) != 2: - logger.error( - "Unexpected output format when getting persistence info from Valkey server at %s", - hostname, - ) - return None values[values_parts[0]] = values_parts[1] return values @@ -292,7 +286,7 @@ def get_primary_addr_by_name(self, hostname: str) -> str: """ return self.exec_cli_command( command=["sentinel", "get-primary-addr-by-name", PRIMARY_NAME], hostname=hostname - )[0] + ) def primary(self, hostname: str) -> dict[str, str]: """Get the primary info from the sentinel. @@ -378,10 +372,9 @@ def replicas_primary(self, hostname: str) -> list[dict[str, str]]: Returns: (list[dict[str, str]]): The list of replicas with their information. """ - replicas = self.exec_cli_command( + return self.exec_cli_command( command=["sentinel", "replicas", PRIMARY_NAME], hostname=hostname ) - return replicas def sentinels_primary(self, hostname: str) -> list[dict[str, str]]: """Get the list of sentinels that see the same primary from the sentinel. 
diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 315d4c4..f564986 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -100,7 +100,7 @@ def get_primary_ip(self) -> str: for unit_ip in started_servers: try: - return client.get_primary_addr_by_name(hostname=unit_ip) + return client.get_primary_addr_by_name(hostname=unit_ip)[0] except ValkeyWorkloadCommandError: logger.warning( "Could not query sentinel for primary information from server at %s.", @@ -301,7 +301,7 @@ def get_active_sentinel_ips(self, hostname: str) -> list[str]: password=self.admin_password, workload=self.workload, ) - return [client.get_primary_addr_by_name(hostname=hostname)] + [ + return [client.get_primary_addr_by_name(hostname=hostname)[0]] + [ sentinel["ip"] for sentinel in client.sentinels_primary(hostname=hostname) ] diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index dae9fef..c6d1096 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -89,7 +89,7 @@ async def _create_glide_client(self, config: Optional[SimpleNamespace] = None) - glide_config = GlideClientConfiguration( addresses=addresses, client_name="continuous_writes_client", - request_timeout=250, + request_timeout=1000, credentials=credentials, reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) @@ -253,7 +253,7 @@ async def _make_client(conf: SimpleNamespace) -> GlideClient: glide_config = GlideClientConfiguration( addresses=addresses, client_name="continuous_writes_worker", - request_timeout=250, + request_timeout=1000, credentials=credentials, reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) From f56dd748682c851a8f6993fe7dd19fe157288e19 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Mar 2026 09:31:11 +0000 Subject: [PATCH 137/159] remove unneeded raises and augment unit test coverage for sentinel manager --- 
src/managers/cluster.py | 6 +++- src/managers/sentinel.py | 32 +++++++----------- tests/unit/test_charm.py | 42 +++++++++++++++++++++-- tests/unit/test_scaledown.py | 65 +++++++++++++++++++++++------------- 4 files changed, 98 insertions(+), 47 deletions(-) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 7bbf9be..0b71e93 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -80,7 +80,11 @@ def is_replica_synced(self) -> bool: workload=self.workload, ) role_info = client.role(hostname=self.state.bind_address) - return role_info[0] == "slave" and role_info[3] == "connected" + try: + return role_info[0] == "slave" and role_info[3] == "connected" + except IndexError as e: + logger.warning(f"Unexpected role information format: {role_info}. Error: {e}") + return False @retry( wait=wait_fixed(5), diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index f564986..e0fafa3 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -174,12 +174,8 @@ def reset_sentinel_states(self, sentinel_ips: list[str]) -> None: ) for sentinel_ip in sentinel_ips: - try: - logger.debug("Resetting sentinel state on %s.", sentinel_ip) - client.reset(hostname=sentinel_ip) - except ValkeyWorkloadCommandError: - logger.warning("Could not reset sentinel state on %s.", sentinel_ip) - raise + logger.debug("Resetting sentinel state on %s.", sentinel_ip) + client.reset(hostname=sentinel_ip) if not self.target_sees_all_others( target_sentinel_ip=sentinel_ip, sentinel_ips=sentinel_ips @@ -270,19 +266,15 @@ def verify_expected_replica_count(self, sentinel_ips: list[str]) -> None: ) for sentinel_ip in sentinel_ips: - try: - if expected_replicas != ( - number_replicas := len(client.replicas_primary(hostname=sentinel_ip)) - ): - logger.warning( - f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." 
- ) - raise SentinelIncorrectReplicaCountError( - f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." - ) - except ValkeyWorkloadCommandError: - logger.warning("Could not query sentinel for replica information.") - raise + if expected_replicas != ( + number_replicas := len(client.replicas_primary(hostname=sentinel_ip)) + ): + logger.warning( + f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." + ) + raise SentinelIncorrectReplicaCountError( + f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." + ) def get_active_sentinel_ips(self, hostname: str) -> list[str]: """Get a list of IP addresses of the active sentinels in the cluster. @@ -301,7 +293,7 @@ def get_active_sentinel_ips(self, hostname: str) -> list[str]: password=self.admin_password, workload=self.workload, ) - return [client.get_primary_addr_by_name(hostname=hostname)[0]] + [ + return [hostname] + [ sentinel["ip"] for sentinel in client.sentinels_primary(hostname=hostname) ] diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index a5acf2a..d1ddbfb 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -2,7 +2,7 @@ # Copyright 2025 Canonical Ltd. # See LICENSE file for licensing details. 
-from unittest.mock import patch +from unittest.mock import PropertyMock, patch import pytest from ops import ActiveStatus, pebble, testing @@ -215,9 +215,45 @@ def test_start_non_primary(cloud_spec): state_out = ctx.run(ctx.on.start(), state_in) assert status_is(state_out, StartStatuses.SERVICE_STARTING.value) - # sentinel not yet discovered + # sentinel not yet discovered error raised with ( - patch("managers.sentinel.SentinelManager.is_sentinel_discovered", return_value=False), + patch( + "core.cluster_state.ClusterState.bind_address", + new_callable=PropertyMock(return_value="10.0.1.0"), + ), + patch( + "common.client.SentinelClient.sentinels_primary", + side_effect=ValkeyWorkloadCommandError("errored out"), + ), + patch("managers.cluster.ClusterManager.is_healthy", return_value=True), + patch("managers.sentinel.SentinelManager.is_healthy", return_value=True), + ): + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"start-member": "valkey/0"}, + peers_data={1: {"start-state": "started"}}, + ) + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + leader=False, + relations={relation, status_peer_relation}, + secrets={internal_passwords_secret}, + containers={container}, + ) + state_out = ctx.run(ctx.on.start(), state_in) + assert status_is(state_out, StartStatuses.WAITING_FOR_SENTINEL_DISCOVERY.value) + + # sentinel not yet discovered sentinel not seeing other sentinel + with ( + patch( + "core.cluster_state.ClusterState.bind_address", + new_callable=PropertyMock(return_value="10.0.1.0"), + ), + patch( + "common.client.SentinelClient.sentinels_primary", + return_value=[{"ip": "10.0.1.1"}, {"ip": "10.0.1.2"}], + ), patch("managers.cluster.ClusterManager.is_healthy", return_value=True), patch("managers.sentinel.SentinelManager.is_healthy", return_value=True), ): diff --git a/tests/unit/test_scaledown.py b/tests/unit/test_scaledown.py index e6e3f16..dc6e4f7 100644 --- 
a/tests/unit/test_scaledown.py +++ b/tests/unit/test_scaledown.py @@ -8,6 +8,7 @@ from ops import testing from charm import ValkeyCharm +from common.exceptions import ValkeyWorkloadCommandError from literals import CONTAINER, PEER_RELATION from statuses import ScaleDownStatuses from tests.unit.helpers import status_is @@ -28,7 +29,7 @@ def get_3_unit_peer_relation(): "private-ip": f"10.0.1.{unit_id}", "start-state": "started", } - for unit_id in range(1, 4) + for unit_id in range(1, 3) }, ) @@ -71,25 +72,36 @@ def test_non_primary(cloud_spec): ) with ( + patch( + "core.cluster_state.ClusterState.bind_address", + new_callable=PropertyMock(return_value="10.0.1.0"), + ), patch("common.locks.ScaleDownLock.request_lock", return_value=True), patch("common.locks.ScaleDownLock.release_lock", return_value=True), - patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="10.0.1.1"), - patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, - patch( - "managers.sentinel.SentinelManager.reset_sentinel_states" - ) as mock_reset_sentinel_states, patch( - "managers.sentinel.SentinelManager.verify_expected_replica_count" - ) as mock_verify_expected_replica_count, + "common.client.SentinelClient.get_primary_addr_by_name", + side_effect=[ + ValkeyWorkloadCommandError("errored out"), + ("10.0.1.1", 6379), + ], + ), + patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, + patch("common.client.SentinelClient.reset") as mock_reset, patch( - "managers.sentinel.SentinelManager.get_active_sentinel_ips", - return_value=["10.0.1.1", "10.0.1.2", "10.0.1.3"], + "common.client.SentinelClient.sentinels_primary", + side_effect=[ + [{"ip": "10.0.1.0"}, {"ip": "10.0.1.2"}], # for get_active_sentinel_ips + [{"ip": "10.0.1.2"}], # for target_sees_all_others unit 10.0.1.1 + [{"ip": "10.0.1.1"}], # for target_sees_all_others unit 10.0.1.2 + ], ), + patch( + "common.client.SentinelClient.replicas_primary", return_value=[{"ip": "ip"}] + ), # we need the len to be 1 ): state_out = 
ctx.run(ctx.on.storage_detaching(data_strorage), state_in) mock_stop.assert_called_once() - mock_reset_sentinel_states.assert_called_once() - mock_verify_expected_replica_count.assert_called_once() + assert mock_reset.call_count == 2 status_is(state_out, ScaleDownStatuses.GOING_AWAY.value) @@ -116,21 +128,28 @@ def test_primary(cloud_spec): patch("common.locks.ScaleDownLock.release_lock", return_value=True), patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="10.0.1.0"), patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, - patch("managers.sentinel.SentinelManager.failover") as mock_failover, + patch("common.client.SentinelClient.failover_primary_coordinated") as mock_failover, + patch("common.client.SentinelClient.is_failover_in_progress") as mock_failover_in_progress, + patch("common.client.SentinelClient.reset") as mock_reset, patch( - "managers.sentinel.SentinelManager.reset_sentinel_states" - ) as mock_reset_sentinel_states, - patch( - "managers.sentinel.SentinelManager.verify_expected_replica_count" - ) as mock_verify_expected_replica_count, - patch( - "managers.sentinel.SentinelManager.get_active_sentinel_ips", - return_value=["10.0.1.1", "10.0.1.2", "10.0.1.3"], + "common.client.SentinelClient.sentinels_primary", + side_effect=[ + [{"ip": "10.0.1.1"}, {"ip": "10.0.1.2"}], # for get_active_sentinel_ips + [], # for target_sees_all_others unit 10.0.1.1 not yet + ValkeyWorkloadCommandError( + "errored out" + ), # for target_sees_all_others unit 10.0.1.1 network mishap + [{"ip": "10.0.1.2"}], # for target_sees_all_others unit 10.0.1.1 + [{"ip": "10.0.1.1"}], # for target_sees_all_others unit 10.0.1.2 + ], ), + patch( + "common.client.SentinelClient.replicas_primary", return_value=[{"ip": "ip"}] + ), # we need the len to be 1 ): state_out = ctx.run(ctx.on.storage_detaching(data_strorage), state_in) mock_failover.assert_called_once() + mock_failover_in_progress.assert_called_once() mock_stop.assert_called_once() - 
mock_reset_sentinel_states.assert_called_once() - mock_verify_expected_replica_count.assert_called_once() + assert mock_reset.call_count == 2 status_is(state_out, ScaleDownStatuses.GOING_AWAY.value) From 2dee78ce7e82d48b80243c0ac1355e05e6ba3cac Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Mar 2026 12:04:28 +0000 Subject: [PATCH 138/159] reduce request timeout --- tests/integration/continuous_writes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index c6d1096..b15c41c 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -89,7 +89,7 @@ async def _create_glide_client(self, config: Optional[SimpleNamespace] = None) - glide_config = GlideClientConfiguration( addresses=addresses, client_name="continuous_writes_client", - request_timeout=1000, + request_timeout=500, credentials=credentials, reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) @@ -253,7 +253,7 @@ async def _make_client(conf: SimpleNamespace) -> GlideClient: glide_config = GlideClientConfiguration( addresses=addresses, client_name="continuous_writes_worker", - request_timeout=1000, + request_timeout=500, credentials=credentials, reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) From 5fe97f01ab8968a6084c84dcfa3478ce26074c2f Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 4 Mar 2026 12:59:53 +0000 Subject: [PATCH 139/159] fix conflicts --- src/common/client.py | 46 ++++++++++++++++++++++++++--------------- src/managers/cluster.py | 2 +- src/workload_k8s.py | 7 +++++-- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 2867d9b..737e684 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -53,23 +53,19 @@ def exec_cli_command( ValkeyWorkloadCommandError: If the CLI command fails to execute. 
""" port = self.port - cli_command: list[str] = ( - [ - self.workload.cli, - "--no-auth-warning", - "-h", - hostname, - "-p", - str(port), - "--user", - self.username, - "--pass", - self.password, - ] - + ["--json"] - if json_output - else [] - ) + cli_command: list[str] = [ + self.workload.cli, + "--no-auth-warning", + "-h", + hostname, + "-p", + str(port), + "--user", + self.username, + "--pass", + self.password, + ] + (["--json"] if json_output else []) + if self.tls: cli_command.append("--tls") cli_command.append("--cert") @@ -256,6 +252,22 @@ def acl_load(self, hostname: str) -> bool: """ return self.exec_cli_command(["acl", "load"], hostname=hostname) == "OK" + def reload_tls(self, tls_config: dict[str, str], hostname: str) -> None: + """Trigger to load the TLS settings.""" + cmd = ["CONFIG", "SET"] + + for key, value in tls_config.items(): + cmd.append(key) + cmd.append(value) + logger.debug("Loading TLS settings: %s", cmd) + + try: + result = self.exec_cli_command(command=cmd, hostname=hostname) + logger.debug("Loading TLS settings: %s", result) + except ValkeyWorkloadCommandError: + logger.error("Error loading TLS settings") + raise ValkeyTLSLoadError("Could not load TLS settings") + class SentinelClient(CliClient): """Handle sentinel-specific client connections.""" diff --git a/src/managers/cluster.py b/src/managers/cluster.py index d4614f5..fa755f9 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -54,7 +54,7 @@ def _get_valkey_client(self) -> ValkeyClient: def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" client = self._get_valkey_client() - if not client.load_acl(hostname=self.state.bind_address): + if not client.acl_load(hostname=self.state.bind_address): raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") def update_primary_auth(self) -> None: diff --git a/src/workload_k8s.py b/src/workload_k8s.py index 6703253..40a0f3d 100644 --- a/src/workload_k8s.py +++ b/src/workload_k8s.py @@ 
-136,8 +136,11 @@ def exec(self, command: list[str]) -> tuple[str, str | None]: command=command, ) return process.wait_output() - except (ops.pebble.ExecError, ops.pebble.APIError) as e: - logger.error("Command failed with %s, %s", e.exit_code, e.stdout) + except ops.pebble.APIError as e: + logger.error("Command failed with %s, %s", e.code, e.body) + raise ValkeyWorkloadCommandError(e) + except ops.pebble.ExecError as e: + logger.error("Command failed with: %s, %s", e.exit_code, e.stdout) raise ValkeyWorkloadCommandError(e) @override From f15a45a9bf7a1acf4872ffc8396abb5bc9147fbd Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Mar 2026 07:59:40 +0000 Subject: [PATCH 140/159] add is_tls_enabled property --- src/common/locks.py | 1 + src/core/models.py | 5 +++++ src/events/tls.py | 2 +- src/managers/cluster.py | 6 ++---- src/managers/sentinel.py | 6 ++---- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/common/locks.py b/src/common/locks.py index dd03b48..700fb50 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -174,6 +174,7 @@ def client(self) -> ValkeyClient: return ValkeyClient( username=CharmUsers.VALKEY_ADMIN.value, password=self.charm.state.unit_server.valkey_admin_password, + tls=self.charm.state.unit_server.is_tls_enabled, workload=self.charm.workload, ) diff --git a/src/core/models.py b/src/core/models.py index 5b3d098..15e8cc6 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -166,6 +166,11 @@ def tls_client_state(self) -> TLSState: return TLSState(self.model.tls_client_state or TLSState.NO_TLS.value) + @property + def is_tls_enabled(self) -> bool: + """Check if TLS is enabled for client connections.""" + return self.tls_client_state in [TLSState.TLS, TLSState.TO_NO_TLS] + @final class ValkeyCluster(RelationState): diff --git a/src/events/tls.py b/src/events/tls.py index 485e586..8d8407d 100644 --- a/src/events/tls.py +++ b/src/events/tls.py @@ -153,7 +153,7 @@ def _on_tls_relation_broken(self, event: 
ops.RelationBrokenEvent) -> None: event.defer() return - if self.charm.state.unit_server.tls_client_state in [TLSState.TLS, TLSState.TO_NO_TLS]: + if self.charm.state.unit_server.is_tls_enabled: logger.info("Disabling client TLS") self.charm.tls_manager.set_tls_state(TLSState.TO_NO_TLS) try: diff --git a/src/managers/cluster.py b/src/managers/cluster.py index fa755f9..b0c0c93 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -18,7 +18,7 @@ ) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import CharmUsers, ScaleDownState, StartState, TLSState +from literals import CharmUsers, ScaleDownState, StartState from statuses import CharmStatuses, ScaleDownStatuses, StartStatuses logger = logging.getLogger(__name__) @@ -45,9 +45,7 @@ def _get_valkey_client(self) -> ValkeyClient: return ValkeyClient( username=self.admin_user, password=self.admin_password, - tls=True - if self.state.unit_server.tls_client_state in [TLSState.TLS, TLSState.TO_NO_TLS] - else False, + tls=self.state.unit_server.is_tls_enabled, workload=self.workload, ) diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 527e0f0..5e6e8b6 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -21,7 +21,7 @@ ) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import CharmUsers, TLSState +from literals import CharmUsers from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -50,9 +50,7 @@ def _get_sentinel_client(self) -> SentinelClient: return SentinelClient( username=self.admin_user, password=self.admin_password, - tls=True - if self.state.unit_server.tls_client_state in [TLSState.TLS, TLSState.TO_NO_TLS] - else False, + tls=self.state.unit_server.is_tls_enabled, workload=self.workload, ) From f87db8269551f6bbf967bf7b9f85daa52d3c2efb Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Mar 2026 12:11:01 +0000 Subject: [PATCH 
141/159] add primary ip to valkey lock --- src/common/locks.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/common/locks.py b/src/common/locks.py index 700fb50..7b9ab17 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -185,20 +185,21 @@ def get_unit_with_lock(self, primary_ip: str | None = None) -> str | None: ) @override - def request_lock(self, timeout: int | None = None) -> bool: + def request_lock(self, timeout: int | None = None, primary_ip: str | None = None) -> bool: """Request the lock for the local unit. This method will keep trying to acquire the lock until it is acquired or until the timeout is reached (if provided). Args: timeout (int | None): The maximum time to keep trying to acquire the lock, in seconds. If None, it will keep trying indefinitely. + primary_ip (str | None): The primary IP to use for the lock. If None, it will get the current primary IP from the sentinel manager. Returns: bool: True if the lock was acquired, False if the timeout was reached before acquiring the lock. """ logger.debug(f"{self.charm.state.unit_server.unit_name} is requesting {self.name} lock.") retry_until = time.time() + timeout if timeout else None - primary_ip = self.charm.sentinel_manager.get_primary_ip() + primary_ip = primary_ip or self.charm.sentinel_manager.get_primary_ip() if self.get_unit_with_lock(primary_ip) == self.charm.state.unit_server.unit_name: logger.debug( f"{self.charm.state.unit_server.unit_name} already holds {self.name} lock. No need to request it again." 
@@ -251,11 +252,12 @@ def is_held_by_this_unit(self) -> bool: unit_with_lock is not None and unit_with_lock == self.charm.state.unit_server.unit_name ) - def release_lock(self) -> bool: + def release_lock(self, primary_ip: str | None = None) -> bool: """Release the lock from the local unit.""" + primary_ip = primary_ip or self.charm.sentinel_manager.get_primary_ip() if ( self.client.delifeq( - hostname=self.charm.sentinel_manager.get_primary_ip(), + hostname=primary_ip, key=self.lock_key, value=self.charm.state.unit_server.unit_name, ) From 10940e4e55605d7b7c73b028f18d8adf8166ceeb Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Mar 2026 12:12:12 +0000 Subject: [PATCH 142/159] try to get primary ip for 40s and clean certicicates on leader going out --- src/events/base_events.py | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 1df88b3..9d5f4b6 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -9,6 +9,7 @@ from typing import TYPE_CHECKING import ops +import tenacity from common.exceptions import ( RequestingLockTimedOutError, @@ -443,8 +444,26 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: component_name=self.charm.cluster_manager.name, statuses_state=self.charm.state.statuses, ) + + # retry to get the primary ip until 2x restart delay is reached. 
+ # Pebble uses backoff and is maxed at 30s + # Snap delay is set at 20s + # 40s should be enough to cover both substrates + try: + primary_ip = self._get_primary_ip_for_scale_down() + except ValkeyCannotGetPrimaryIPError as e: + logger.error(e) + self.charm.state.cluster.update( + { + "internal_ca_certificate": None, + "internal_ca_private_key": None, + } + ) + self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) + return + # blocks until the lock is acquired - if not scale_down_lock.request_lock(): + if not scale_down_lock.request_lock(primary_ip=primary_ip): raise RequestingLockTimedOutError("Failed to acquire scale down lock within timeout") self.charm.state.statuses.delete( @@ -494,7 +513,21 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: # check health after scale down self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) self.charm.sentinel_manager.verify_expected_replica_count(active_sentinels) - scale_down_lock.release_lock() + # release lock + scale_down_lock.release_lock(primary_ip=primary_ip) + + if self.charm.app.planned_units() == 0 and self.charm.unit.is_leader(): + # clear app data bag + self.charm.state.cluster.update( + { + "internal_ca_certificate": None, + "internal_ca_private_key": None, + } + ) - # release lock self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) + + @tenacity.retry(wait=tenacity.wait_fixed(5), stop=tenacity.stop_after_delay(40), reraise=True) + def _get_primary_ip_for_scale_down(self) -> str: + """Get the primary IP to use for scale down operations.""" + return self.charm.sentinel_manager.get_primary_ip() From d6a0bcec7c19b31540e75649ef7525777fb1e3f5 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Mar 2026 12:47:14 +0000 Subject: [PATCH 143/159] fix and increase unit tests --- src/events/base_events.py | 17 ++++--- tests/unit/test_scaledown.py | 98 ++++++++++++++++++++++++++++++++++-- 
tests/unit/test_tls.py | 24 +++++++++ 3 files changed, 127 insertions(+), 12 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index 9d5f4b6..ec993bf 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -453,12 +453,15 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: primary_ip = self._get_primary_ip_for_scale_down() except ValkeyCannotGetPrimaryIPError as e: logger.error(e) - self.charm.state.cluster.update( - { - "internal_ca_certificate": None, - "internal_ca_private_key": None, - } - ) + if self.charm.app.planned_units() == 0 and self.charm.unit.is_leader(): + # clear app data bag + self.charm.state.cluster.update( + { + "internal_ca_certificate": None, + "internal_ca_private_key": None, + } + ) + self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) return @@ -527,7 +530,7 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) - @tenacity.retry(wait=tenacity.wait_fixed(5), stop=tenacity.stop_after_delay(40), reraise=True) + @tenacity.retry(wait=tenacity.wait_fixed(5), stop=tenacity.stop_after_attempt(8), reraise=True) def _get_primary_ip_for_scale_down(self) -> str: """Get the primary IP to use for scale down operations.""" return self.charm.sentinel_manager.get_primary_ip() diff --git a/tests/unit/test_scaledown.py b/tests/unit/test_scaledown.py index dc6e4f7..3195ed4 100644 --- a/tests/unit/test_scaledown.py +++ b/tests/unit/test_scaledown.py @@ -8,7 +8,7 @@ from ops import testing from charm import ValkeyCharm -from common.exceptions import ValkeyWorkloadCommandError +from common.exceptions import ValkeyCannotGetPrimaryIPError, ValkeyWorkloadCommandError from literals import CONTAINER, PEER_RELATION from statuses import ScaleDownStatuses from tests.unit.helpers import status_is @@ -50,6 +50,13 @@ def test_other_unit_has_lock(cloud_spec): with 
( patch("common.locks.ScaleDownLock.request_lock", return_value=False), + patch( + "common.client.SentinelClient.get_primary_addr_by_name", + side_effect=[ + ValkeyWorkloadCommandError("errored out"), + ("10.0.1.1", 6379), + ], + ), ): # expect raised exception due to lock not being acquired with pytest.raises(testing.errors.UncaughtCharmError) as exc_info: @@ -80,10 +87,7 @@ def test_non_primary(cloud_spec): patch("common.locks.ScaleDownLock.release_lock", return_value=True), patch( "common.client.SentinelClient.get_primary_addr_by_name", - side_effect=[ - ValkeyWorkloadCommandError("errored out"), - ("10.0.1.1", 6379), - ], + return_value=("10.0.1.1", 6379), ), patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, patch("common.client.SentinelClient.reset") as mock_reset, @@ -153,3 +157,87 @@ def test_primary(cloud_spec): mock_stop.assert_called_once() assert mock_reset.call_count == 2 status_is(state_out, ScaleDownStatuses.GOING_AWAY.value) + + +def test_last_leader_unit_going_down(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_unit_data={ + "hostname": "valkey-0", + "private-ip": "10.0.1.0", + "start-state": "started", + }, + ) + container = testing.Container(name=CONTAINER, can_connect=True) + data_strorage = testing.Storage(name="data") + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + relations={relation}, + leader=True, + containers={container}, + storages={data_strorage}, + ) + + with ( + patch( + "core.cluster_state.ClusterState.bind_address", + new_callable=PropertyMock(return_value="10.0.1.0"), + ), + patch("common.locks.ScaleDownLock.request_lock", return_value=True), + patch("common.locks.ScaleDownLock.release_lock", return_value=True), + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="10.0.1.0"), + patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, + 
patch("common.client.SentinelClient.sentinels_primary", return_value=[]), + patch("core.models.ValkeyCluster.update") as cluster_update, + patch("ops.model.Application.planned_units", return_value=0), + ): + state_out = ctx.run(ctx.on.storage_detaching(data_strorage), state_in) + mock_stop.assert_called_once() + status_is(state_out, ScaleDownStatuses.GOING_AWAY.value) + cluster_update.assert_called_once_with( + {"internal_ca_certificate": None, "internal_ca_private_key": None} + ) + + +def test_cannot_get_primary_ip_leader(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_unit_data={ + "hostname": "valkey-0", + "private-ip": "10.0.1.0", + "start-state": "started", + }, + ) + container = testing.Container(name=CONTAINER, can_connect=True) + data_strorage = testing.Storage(name="data") + state_in = testing.State( + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + relations={relation}, + leader=True, + containers={container}, + storages={data_strorage}, + ) + + with ( + patch( + "core.cluster_state.ClusterState.bind_address", + new_callable=PropertyMock(return_value="10.0.1.0"), + ), + patch( + "managers.sentinel.SentinelManager.get_primary_ip", + side_effect=ValkeyCannotGetPrimaryIPError("errored out"), + ), + patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, + patch("core.models.ValkeyCluster.update") as cluster_update, + patch("ops.model.Application.planned_units", return_value=0), + ): + state_out = ctx.run(ctx.on.storage_detaching(data_strorage), state_in) + mock_stop.assert_not_called() + status_is(state_out, ScaleDownStatuses.GOING_AWAY.value) + cluster_update.assert_called_once_with( + {"internal_ca_certificate": None, "internal_ca_private_key": None} + ) diff --git a/tests/unit/test_tls.py b/tests/unit/test_tls.py index 0263207..0395de6 100644 --- a/tests/unit/test_tls.py +++ b/tests/unit/test_tls.py @@ -116,6 +116,10 @@ def 
test_client_tls_relation_broken(cloud_spec): patch("managers.tls.TLSManager.rehash_ca_certificates"), patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.sentinel.SentinelManager.restart_service"), + patch( + "common.client.SentinelClient.get_primary_addr_by_name", + return_value=("10.0.1.1", 6379), + ), ): state_out = ctx.run(ctx.on.relation_broken(relation=client_tls_relation), state_in) assert reload_tls.call_count == 2 @@ -158,6 +162,10 @@ def test_client_tls_relation_broken_disabling_tls_fails(cloud_spec): "managers.config.ConfigManager.set_config_properties", side_effect=ValueError("failed") ), patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, + patch( + "common.client.SentinelClient.get_primary_addr_by_name", + return_value=("10.0.1.1", 6379), + ), ): state_out = ctx.run(ctx.on.relation_broken(relation=client_tls_relation), state_in) reload_tls.assert_not_called() @@ -225,6 +233,10 @@ def test_client_tls_relation_broken_writing_internal_cert_fails(cloud_spec): patch("core.base_workload.WorkloadBase.write_file", side_effect=PermissionError("failed")), patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.sentinel.SentinelManager.restart_service"), + patch( + "common.client.SentinelClient.get_primary_addr_by_name", + return_value=("10.0.1.1", 6379), + ), ): state_out = ctx.run(ctx.on.relation_broken(relation=client_tls_relation), state_in) reload_tls.assert_called_once() @@ -259,6 +271,10 @@ def test_client_tls_relation_broken_run_deferred_event(cloud_spec): patch("managers.cluster.ClusterManager.reload_tls_settings"), patch("managers.sentinel.SentinelManager.restart_service"), patch("charmlibs.pathops.ContainerPath.mkdir"), + patch( + "common.client.SentinelClient.get_primary_addr_by_name", + return_value=("10.0.1.1", 6379), + ), ): state_out = ctx.run(ctx.on.relation_broken(relation=client_tls_relation), state_in) assert 
state_out.get_relation(1).local_unit_data.get("client-cert-ready") == "false" @@ -303,6 +319,10 @@ def test_client_certificate_available(cloud_spec): patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.sentinel.SentinelManager.restart_service"), patch("managers.tls.TLSManager.write_certificate"), + patch( + "common.client.SentinelClient.get_primary_addr_by_name", + return_value=("10.0.1.1", 6379), + ), ): event.certificate = certificate.certificate charm.tls_events._on_certificate_available(event) @@ -354,6 +374,10 @@ def test_client_certificate_available_enabling_fails(cloud_spec): ), patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.tls.TLSManager.write_certificate"), + patch( + "common.client.SentinelClient.get_primary_addr_by_name", + return_value=("10.0.1.1", 6379), + ), ): event.certificate = certificate.certificate charm.tls_events._on_certificate_available(event) From 75a90d3495827eb06e3c69df9d3a59e8c6dd3b30 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 10 Mar 2026 12:53:07 +0000 Subject: [PATCH 144/159] lint --- src/common/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/client.py b/src/common/client.py index 5686e3f..731c981 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -9,7 +9,7 @@ from tenacity import retry, retry_if_result, stop_after_attempt, wait_fixed -from common.exceptions import ValkeyWorkloadCommandError +from common.exceptions import ValkeyTLSLoadError, ValkeyWorkloadCommandError from core.base_workload import WorkloadBase from literals import CLIENT_PORT, PRIMARY_NAME, SENTINEL_PORT, SENTINEL_TLS_PORT, TLS_PORT From 867e699d849ccab29bcd7c0ee3bff9bb3d83f891 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 11 Mar 2026 09:39:57 +0000 Subject: [PATCH 145/159] feedback from rene --- src/events/base_events.py | 35 +++++++++++++---------------------- src/events/tls.py | 2 +- src/managers/sentinel.py 
| 12 ++++++++++++ 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/events/base_events.py b/src/events/base_events.py index ec993bf..def6e55 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -9,7 +9,6 @@ from typing import TYPE_CHECKING import ops -import tenacity from common.exceptions import ( RequestingLockTimedOutError, @@ -445,24 +444,11 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: statuses_state=self.charm.state.statuses, ) - # retry to get the primary ip until 2x restart delay is reached. - # Pebble uses backoff and is maxed at 30s - # Snap delay is set at 20s - # 40s should be enough to cover both substrates try: - primary_ip = self._get_primary_ip_for_scale_down() + primary_ip = self.charm.sentinel_manager.get_primary_ip_for_scale_down() except ValkeyCannotGetPrimaryIPError as e: logger.error(e) - if self.charm.app.planned_units() == 0 and self.charm.unit.is_leader(): - # clear app data bag - self.charm.state.cluster.update( - { - "internal_ca_certificate": None, - "internal_ca_private_key": None, - } - ) - - self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) + self._set_state_for_going_away() return # blocks until the lock is acquired @@ -483,7 +469,13 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: statuses_state=self.charm.state.statuses, ) # if unit has primary then failover - primary_ip = self.charm.sentinel_manager.get_primary_ip() + try: + primary_ip = self.charm.sentinel_manager.get_primary_ip_for_scale_down() + except ValkeyCannotGetPrimaryIPError as e: + logger.error(e) + self._set_state_for_going_away() + return + active_sentinels = self.charm.sentinel_manager.get_active_sentinel_ips(primary_ip) if primary_ip == self.charm.state.bind_address and len(active_sentinels) > 1: self.charm.state.unit_server.update( @@ -519,6 +511,10 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: # 
release lock scale_down_lock.release_lock(primary_ip=primary_ip) + self._set_state_for_going_away() + + def _set_state_for_going_away(self) -> None: + """Set the state to going away when the unit is going down.""" if self.charm.app.planned_units() == 0 and self.charm.unit.is_leader(): # clear app data bag self.charm.state.cluster.update( @@ -529,8 +525,3 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: ) self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) - - @tenacity.retry(wait=tenacity.wait_fixed(5), stop=tenacity.stop_after_attempt(8), reraise=True) - def _get_primary_ip_for_scale_down(self) -> str: - """Get the primary IP to use for scale down operations.""" - return self.charm.sentinel_manager.get_primary_ip() diff --git a/src/events/tls.py b/src/events/tls.py index 8d8407d..789a309 100644 --- a/src/events/tls.py +++ b/src/events/tls.py @@ -142,7 +142,7 @@ def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: def _on_tls_relation_broken(self, event: ops.RelationBrokenEvent) -> None: """Handle the `relation-broken` event.""" - if self.charm.app.planned_units() == 0: + if self.charm.app.planned_units() == 0 or self.charm.state.unit_server.is_being_removed: return if not self.charm.state.cluster.internal_ca_certificate: diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 5e6e8b6..335b13b 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -6,6 +6,7 @@ import logging +import tenacity from data_platform_helpers.advanced_statuses.models import StatusObject from data_platform_helpers.advanced_statuses.protocol import ManagerStatusProtocol from data_platform_helpers.advanced_statuses.types import Scope @@ -285,3 +286,14 @@ def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObje ).root return status_list or [CharmStatuses.ACTIVE_IDLE.value] + + @tenacity.retry(wait=tenacity.wait_fixed(5), 
stop=tenacity.stop_after_attempt(8), reraise=True) + def get_primary_ip_for_scale_down(self) -> str: + """Get the primary IP to use for scale down operations. + + Retry to get the primary ip until 2x restart delay is reached. + Pebble uses backoff and is maxed at 30s + Snap delay is set at 20s + 40s covers both substrates + """ + return self.get_primary_ip() From e094a15d118e5741eede0388777c2465ea28cfe3 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 12 Mar 2026 11:24:35 +0000 Subject: [PATCH 146/159] add quorum reconfig based on cluster size --- src/common/client.py | 18 +++++++++++++++++ src/events/base_events.py | 30 ++++++++++++++++++++++++++++ src/managers/config.py | 10 +++++++--- src/managers/sentinel.py | 12 ++++++++++- tests/integration/ha/test_scaling.py | 15 ++++++++++++++ tests/integration/helpers.py | 29 +++++++++++++++++++++++++-- 6 files changed, 108 insertions(+), 6 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 731c981..b7a885e 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -415,3 +415,21 @@ def sentinels_primary(self, hostname: str) -> list[dict[str, str]]: return self.exec_cli_command( command=["sentinel", "sentinels", PRIMARY_NAME], hostname=hostname ) + + def set(self, hostname: str, *args: str) -> bool: + """Set a sentinel configuration parameter through the CLI. + + Args: + hostname (str): The hostname to connect to. + *args (str): The sentinel configuration parameters to set, as a variable list of strings. + + Returns: + bool: True if the command executed successfully, False otherwise. 
+ """ + return ( + self.exec_cli_command( + command=["sentinel", "set"] + list(args), + hostname=hostname, + ) + == "OK" + ) diff --git a/src/events/base_events.py b/src/events/base_events.py index def6e55..13fad8d 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -77,6 +77,9 @@ def __init__(self, charm: "ValkeyCharm"): self.framework.observe( self.charm.on[PEER_RELATION].relation_changed, self._on_peer_relation_changed ) + self.framework.observe( + self.charm.on[PEER_RELATION].relation_departed, self._on_peer_relation_departed + ) self.framework.observe(self.charm.on.update_status, self._on_update_status) self.framework.observe(self.charm.on.leader_elected, self._on_leader_elected) self.framework.observe(self.charm.on.config_changed, self._on_config_changed) @@ -224,12 +227,18 @@ def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: """Handle event received by all units when a unit's relation data changes.""" + self._reconfigure_quorum_if_necessary() + if not self.charm.unit.is_leader(): return for lock in [StartLock(self.charm.state)]: lock.process() + def _on_peer_relation_departed(self, event: ops.RelationDepartedEvent) -> None: + """Handle event received by all units when a unit departs.""" + self._reconfigure_quorum_if_necessary() + def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: """Handle the update-status event.""" if not self.charm.state.unit_server.is_started: @@ -525,3 +534,24 @@ def _set_state_for_going_away(self) -> None: ) self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) + + def _reconfigure_quorum_if_necessary(self) -> None: + """Reconfigure the sentinel quorum if it does not match the current cluster size.""" + # if the unit / all units are being removed, we do not need to reconfigure the quorum + if ( + not self.charm.state.unit_server.is_active + or 
self.charm.state.unit_server.is_being_removed + or self.model.app.planned_units() == 0 + ): + return + + if self.charm.sentinel_manager.get_configured_quorum() != self.charm.config_manager.quorum: + logger.debug("Updating sentinel quorum to match current cluster size") + try: + self.charm.sentinel_manager.set_quorum(self.charm.config_manager.quorum) + self.charm.config_manager.set_sentinel_config_properties( + self.charm.sentinel_manager.get_primary_ip() + ) + except ValkeyWorkloadCommandError as e: + logger.error(f"Failed to update sentinel quorum: {e}") + # not critical to defer here, we can wait for the next relation change or config change to try again diff --git a/src/managers/config.py b/src/managers/config.py index b962530..d6c7fde 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -21,7 +21,6 @@ CHARM_USERS_ROLE_MAP, CLIENT_PORT, PRIMARY_NAME, - QUORUM_NUMBER, SENTINEL_PORT, SENTINEL_TLS_PORT, TLS_PORT, @@ -253,8 +252,7 @@ def get_sentinel_config_properties(self, primary_ip: str) -> dict[str, str | dic def _generate_sentinel_configs(self, primary_ip: str) -> dict[str, str]: """Generate the sentinel config properties based on the current cluster state.""" sentinel_configs = {} - # TODO consider adding quorum calculation based on number of planned_units and the parity of the number of units - sentinel_configs["monitor"] = f"{PRIMARY_NAME} {primary_ip} {TLS_PORT} {QUORUM_NUMBER}" + sentinel_configs["monitor"] = f"{PRIMARY_NAME} {primary_ip} {TLS_PORT} {self.quorum}" # auth settings # auth-user is used by sentinel to authenticate to the valkey primary sentinel_configs["auth-user"] = f"{PRIMARY_NAME} {CharmUsers.VALKEY_SENTINEL.value}" @@ -354,6 +352,12 @@ def configure_services(self, primary_ip: str) -> None: ) raise ValkeyConfigurationError("Failed to set configuration") from e + @property + def quorum(self) -> int: + """Calculate the quorum based on the number of units in the cluster.""" + num_units = len([server for server in 
self.state.servers if server.is_active]) + return (num_units // 2) + 1 + def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the config manager's statuses.""" status_list: list[StatusObject] = [] diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 335b13b..788253a 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -22,7 +22,7 @@ ) from core.base_workload import WorkloadBase from core.cluster_state import ClusterState -from literals import CharmUsers +from literals import PRIMARY_NAME, CharmUsers from statuses import CharmStatuses logger = logging.getLogger(__name__) @@ -297,3 +297,13 @@ def get_primary_ip_for_scale_down(self) -> str: 40s covers both substrates """ return self.get_primary_ip() + + def get_configured_quorum(self) -> int: + """Get the currently configured quorum for the sentinel cluster.""" + client = self._get_sentinel_client() + return int(client.primary(self.state.bind_address)["quorum"]) + + def set_quorum(self, quorum: int) -> None: + """Set the quorum for the sentinel cluster.""" + client = self._get_sentinel_client() + client.set(self.state.bind_address, PRIMARY_NAME, "quorum", str(quorum)) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 4880709..cb74a56 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -20,6 +20,7 @@ get_number_connected_replicas, get_password, get_primary_ip, + get_quorum, remove_number_units, seed_valkey, ) @@ -54,6 +55,11 @@ async def test_seed_data(juju: jubilant.Juju) -> None: await seed_valkey(juju, target_gb=1) +async def test_check_quorum(juju: jubilant.Juju) -> None: + """Check quorum value.""" + assert get_quorum(juju, f"{APP_NAME}/0") == 1, "Unexpected quorum value after initial deploy" + + async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: """Make sure new units are added to the valkey downtime.""" init_units_count = 
len(juju.status().apps[APP_NAME].units) @@ -71,6 +77,9 @@ async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: num_units = len(juju.status().apps[APP_NAME].units) assert num_units == NUM_UNITS, f"Expected {NUM_UNITS} units, got {num_units}." + for unit in juju.status().apps[APP_NAME].units: + assert get_quorum(juju, unit) == (NUM_UNITS // 2) + 1 + # check if all units have been added to the cluster hostnames = get_cluster_hostnames(juju, APP_NAME) @@ -123,6 +132,9 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ num_units = len(juju.status().get_units(APP_NAME)) assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." + for unit in juju.status().apps[APP_NAME].units: + assert get_quorum(juju, unit) == (num_units // 2) + 1 + number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, APP_NAME), username=CharmUsers.VALKEY_ADMIN.value, @@ -198,6 +210,9 @@ async def test_scale_down_multiple_units( f"Expected {NUM_UNITS - 2} connected replicas, got {number_of_replicas}." 
) + for unit in juju.status().apps[APP_NAME].units: + assert get_quorum(juju, unit) == (num_units // 2) + 1 + c_writes.update() await assert_continuous_writes_increasing( diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 4de3c02..3149cab 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -33,6 +33,7 @@ INTERNAL_USERS_PASSWORD_CONFIG, INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, + SENTINEL_PORT, TLS_PORT, CharmUsers, Substrate, @@ -458,10 +459,15 @@ async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: def exec_valkey_cli( - hostname: str, username: str, password: str, command: str + hostname: str, + username: str, + password: str, + command: str, + port: int = CLIENT_PORT, + json: bool = False, ) -> valkey_cli_result: """Execute a Valkey CLI command and returns the output as a string.""" - command = f"valkey-cli --no-auth-warning -h {hostname} -p {CLIENT_PORT} --user {username} --pass {password} {command}" + command = f"valkey-cli --no-auth-warning -h {hostname} -p {port} --user {username} --pass {password} {'--json' if json else ''} {command}" result = subprocess.run( command.split(), check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True ) @@ -470,6 +476,25 @@ def exec_valkey_cli( ) +def get_quorum(juju: jubilant.Juju, unit_name: str) -> int: + """Get the currently configured sentinel quorum.""" + status = juju.status() + model_info = juju.show_model() + units = status.get_units(APP_NAME) + unit_endpoint = ( + units[unit_name].public_address if model_info.type != "kubernetes" else units[unit_name].address + ) + result = exec_valkey_cli( + hostname=unit_endpoint, + username=CharmUsers.SENTINEL_CHARM_ADMIN.value, + password=get_password(juju, user=CharmUsers.SENTINEL_CHARM_ADMIN), + command="SENTINEL primary primary", + port=SENTINEL_PORT, + json=True, + ) + return int(json.loads(result.stdout)["quorum"]) + + async def set_key( hostnames: list[str], username: str, 
From d2826dd8cf88aa12ea90ca83a173c111ef9bad5a Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 12 Mar 2026 11:34:49 +0000 Subject: [PATCH 147/159] format --- tests/integration/helpers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 3149cab..792ab0d 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -482,7 +482,9 @@ def get_quorum(juju: jubilant.Juju, unit_name: str) -> int: model_info = juju.show_model() units = status.get_units(APP_NAME) unit_endpoint = ( - units[unit_name].public_address if model_info.type != "kubernetes" else units[unit_name].address + units[unit_name].public_address + if model_info.type != "kubernetes" + else units[unit_name].address ) result = exec_valkey_cli( hostname=unit_endpoint, From b3ff254e39b317ffc82580f86bf64babb96ea2bf Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 12 Mar 2026 11:49:27 +0000 Subject: [PATCH 148/159] fix and add unit tests --- tests/unit/test_charm.py | 70 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index e945a69..a4844cf 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -13,6 +13,7 @@ INTERNAL_USERS_PASSWORD_CONFIG, INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, + PRIMARY_NAME, STATUS_PEERS_RELATION, CharmUsers, StartState, @@ -692,8 +693,9 @@ def test_relation_changed_event_leader_setting_starting_member(cloud_spec): containers={container}, model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) - state_out = ctx.run(ctx.on.relation_changed(relation), state_in) - assert state_out.get_relation(1).local_app_data.get("start-member") == "valkey/1" + with patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}): + state_out = ctx.run(ctx.on.relation_changed(relation), state_in) + assert 
state_out.get_relation(1).local_app_data.get("start-member") == "valkey/1" def test_relation_changed_event_leader_clears_starting_member(cloud_spec): @@ -713,8 +715,9 @@ def test_relation_changed_event_leader_clears_starting_member(cloud_spec): containers={container}, model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) - state_out = ctx.run(ctx.on.relation_changed(relation), state_in) - assert state_out.get_relation(1).local_app_data.get("start-member") is None + with patch("common.client.SentinelClient.primary", return_value={"quorum": "2"}): + state_out = ctx.run(ctx.on.relation_changed(relation), state_in) + assert state_out.get_relation(1).local_app_data.get("start-member") is None def test_relation_changed_event_leader_leaves_starting_member_as_is(cloud_spec): @@ -739,5 +742,60 @@ def test_relation_changed_event_leader_leaves_starting_member_as_is(cloud_spec): containers={container}, model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), ) - state_out = ctx.run(ctx.on.relation_changed(relation), state_in) - assert state_out.get_relation(1).local_app_data.get("start-member") == "valkey/1" + with patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}): + state_out = ctx.run(ctx.on.relation_changed(relation), state_in) + assert state_out.get_relation(1).local_app_data.get("start-member") == "valkey/1" + + +def test_relation_changed_event_update_quorum(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"start-member": "valkey/1"}, + local_unit_data={"start-state": StartState.STARTED.value}, + peers_data={1: {"start-state": StartState.STARTED.value}}, + ) + container = testing.Container(name=CONTAINER, can_connect=True) + + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + ) + 
with ( + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), + patch("common.client.SentinelClient.set") as mock_set, + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="127.1.0.1"), + ): + ctx.run(ctx.on.relation_changed(relation), state_in) + mock_set.assert_called_once_with("127.1.1.1", PRIMARY_NAME, "quorum", "2") + + +def test_relation_changed_event_do_not_update_quorum(cloud_spec): + ctx = testing.Context(ValkeyCharm, app_trusted=True) + relation = testing.PeerRelation( + id=1, + endpoint=PEER_RELATION, + local_app_data={"start-member": "valkey/1"}, + local_unit_data={"start-state": StartState.STARTED.value}, + peers_data={ + 1: {"start-state": StartState.STARTED.value}, + 2: {"start-state": StartState.STARTED.value}, + }, + ) + container = testing.Container(name=CONTAINER, can_connect=True) + + state_in = testing.State( + leader=True, + relations={relation}, + containers={container}, + model=testing.Model(name="my-vm-model", type="lxd", cloud_spec=cloud_spec), + ) + with ( + patch("common.client.SentinelClient.primary", return_value={"quorum": "2"}), + patch("common.client.SentinelClient.set") as mock_set, + ): + ctx.run(ctx.on.relation_changed(relation), state_in) + mock_set.assert_not_called() From cace9f5ec5ad3d3fe0b1608b3e5c17a896d0d134 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Thu, 12 Mar 2026 13:19:53 +0000 Subject: [PATCH 149/159] give enough time to the client to try other addresses --- tests/integration/continuous_writes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index b15c41c..c6d1096 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -89,7 +89,7 @@ async def _create_glide_client(self, config: Optional[SimpleNamespace] = None) - glide_config = GlideClientConfiguration( addresses=addresses, client_name="continuous_writes_client", - 
request_timeout=500, + request_timeout=1000, credentials=credentials, reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) @@ -253,7 +253,7 @@ async def _make_client(conf: SimpleNamespace) -> GlideClient: glide_config = GlideClientConfiguration( addresses=addresses, client_name="continuous_writes_worker", - request_timeout=500, + request_timeout=1000, credentials=credentials, reconnect_strategy=BackoffStrategy(num_of_retries=1, factor=50, exponent_base=2), ) From 0bf408ef5adb74550512a13382068e525d9ecb93 Mon Sep 17 00:00:00 2001 From: Smail KOURTA Date: Fri, 13 Mar 2026 15:16:37 +0400 Subject: [PATCH 150/159] [DPE-9373]: Use hostnames instead of IPs for k8s (#19) This pull request introduces a significant refactor to how the codebase handles primary endpoint addressing, shifting from using only IP addresses to supporting both IP addresses and hostnames, depending on the deployment substrate. It updates configuration management and event handling to use the new `primary_endpoint` concept, adds hostname resolution to Sentinel management, and ensures better compatibility with non-VM substrates. Additionally, it introduces a utility function for IP validation and improves Sentinel configuration for hostname-based communication. Key changes include: **Primary Endpoint Refactor and Configuration Management:** - Refactored all configuration and service management logic in `config.py` and related event handlers to use a new `primary_endpoint` parameter (which can be a hostname or IP) instead of just `primary_ip`. This includes updating method signatures, internal logic, and how endpoints are determined based on the substrate (VM vs. Kubernetes). 
[[1]](diffhunk://#diff-977a7d8c04cab4f9aaa5659892eadd9b991f51cd8f5814ecb51e3162bf23e31eL106-R106) [[2]](diffhunk://#diff-977a7d8c04cab4f9aaa5659892eadd9b991f51cd8f5814ecb51e3162bf23e31eL140-R147) [[3]](diffhunk://#diff-977a7d8c04cab4f9aaa5659892eadd9b991f51cd8f5814ecb51e3162bf23e31eL156-R160) [[4]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cL50-R50) [[5]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cL85-R88) [[6]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cL97-R122) [[7]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cL196-R205) [[8]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cL244-R253) [[9]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cL253-R268) [[10]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cR284-R292) [[11]](diffhunk://#diff-a24e7472936c291b5c95f9c56ccc25cccef3adf4c4cbd1de135e2dde31ae592cL338-R361) [[12]](diffhunk://#diff-3dfb4a104b407f7d4b2e07795a63745a09cde228a8dbe86602e7b72ea600fbe4L161-R166) [[13]](diffhunk://#diff-3dfb4a104b407f7d4b2e07795a63745a09cde228a8dbe86602e7b72ea600fbe4L209-R212) - Updated event-driven updates to always use FQDN hostnames instead of short hostnames, improving consistency and DNS compatibility. [[1]](diffhunk://#diff-977a7d8c04cab4f9aaa5659892eadd9b991f51cd8f5814ecb51e3162bf23e31eL106-R106) [[2]](diffhunk://#diff-977a7d8c04cab4f9aaa5659892eadd9b991f51cd8f5814ecb51e3162bf23e31eL248-R256) [[3]](diffhunk://#diff-977a7d8c04cab4f9aaa5659892eadd9b991f51cd8f5814ecb51e3162bf23e31eL290-R298) **Sentinel Management Improvements:** - Enhanced Sentinel management to resolve hostnames to IPs when necessary, using the new `is_valid_ip` helper, and improved logic for retrieving active Sentinel IPs. 
[[1]](diffhunk://#diff-b226109a257f8cc9cb6b0a4a1eb4c1c730d2cc9620744b6855e4f3a96ca3041dR22) [[2]](diffhunk://#diff-b226109a257f8cc9cb6b0a4a1eb4c1c730d2cc9620744b6855e4f3a96ca3041dL272-R285) - Added `resolve-hostnames` and `announce-hostnames` options to the generated Sentinel configuration to enable hostname-based communication within the cluster. **Utility Enhancements:** - Introduced a new `is_valid_ip` helper function in `common/helpers.py` to reliably check if a string is a valid IP address, supporting the above refactors and Sentinel logic. These changes collectively improve the charm's flexibility and reliability in heterogeneous environments, especially for Kubernetes and other non-VM substrates. --- src/core/cluster_state.py | 28 ++++++++++++++++++++ src/core/models.py | 9 +++++++ src/events/base_events.py | 32 ++++++++++++++-------- src/events/tls.py | 10 ++++--- src/managers/cluster.py | 12 ++++----- src/managers/config.py | 51 +++++++++++++++++++++--------------- src/managers/sentinel.py | 23 +++++++++------- tests/unit/test_charm.py | 4 ++- tests/unit/test_scaledown.py | 12 ++++----- 9 files changed, 123 insertions(+), 58 deletions(-) diff --git a/src/core/cluster_state.py b/src/core/cluster_state.py index 773a760..f09336e 100644 --- a/src/core/cluster_state.py +++ b/src/core/cluster_state.py @@ -133,6 +133,20 @@ def ingress_address(self) -> str | None: return str(address) + @property + def hostname(self) -> str: + """The hostname of the unit.""" + return self.get_unit_hostname(self.model.unit.name) + + @property + def endpoint(self) -> str: + """The endpoint to be used by other units to connect to this unit. + + On VM-based substrates, this should be the bind address. + On Kubernetes, this should be the fully qualified domain name of the unit. 
+ """ + return self.bind_address if self.substrate == Substrate.VM else self.hostname + def get_secret_from_id(self, secret_id: str) -> dict[str, str]: """Resolve the given id of a Juju secret and return the content as a dict. @@ -151,6 +165,20 @@ def get_secret_from_id(self, secret_id: str) -> dict[str, str]: return secret_content + def get_unit_hostname(self, unit_name: str | None = None) -> str: + """Get the hostname.localdomain for a unit. + + Translate juju unit name to hostname.localdomain, necessary + for correct name resolution under k8s. + + Args: + unit_name: unit name + Returns: + A string representing the hostname.localdomain of the unit. + """ + unit_name = unit_name or self.charm.unit.name + return f"{unit_name.replace('/', '-')}.{self.charm.app.name}-endpoints" + @property def number_units_started(self) -> int: """Return the number of units in the cluster that have their Valkey server started.""" diff --git a/src/core/models.py b/src/core/models.py index 15e8cc6..697e5a5 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -28,6 +28,7 @@ CharmUsers, ScaleDownState, StartState, + Substrate, TLSState, ) @@ -171,6 +172,14 @@ def is_tls_enabled(self) -> bool: """Check if TLS is enabled for client connections.""" return self.tls_client_state in [TLSState.TLS, TLSState.TO_NO_TLS] + def get_endpoint(self, substrate: Substrate) -> str: + """Return the endpoint to be used by other units to connect to this unit. + + On VM-based substrates, this should be the private IP address. + On Kubernetes, this should be the hostname of the unit. 
+ """ + return self.model.private_ip if substrate == Substrate.VM else self.model.hostname + @final class ValkeyCluster(RelationState): diff --git a/src/events/base_events.py b/src/events/base_events.py index def6e55..b8f8c08 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -5,7 +5,6 @@ """Valkey base event handlers.""" import logging -import socket from typing import TYPE_CHECKING import ops @@ -102,7 +101,7 @@ def _on_start(self, event: ops.StartEvent) -> None: self.charm.state.unit_server.update( { "start_state": StartState.NOT_STARTED.value, - "hostname": socket.gethostname(), + "hostname": self.charm.state.hostname, "private_ip": self.charm.state.bind_address, } ) @@ -136,10 +135,12 @@ def _on_start(self, event: ops.StartEvent) -> None: event.defer() return try: - primary_ip = self.charm.sentinel_manager.get_primary_ip() + primary_endpoint = self.charm.sentinel_manager.get_primary_ip() except ValkeyCannotGetPrimaryIPError: if self.charm.state.number_units_started == 0 and self.charm.unit.is_leader(): - primary_ip = self.charm.state.bind_address + primary_endpoint = self.charm.state.unit_server.get_endpoint( + self.charm.state.substrate + ) else: logger.debug( "Primary IP not available yet or other units have already started, deferring start event until leader starts the primary" @@ -152,7 +153,7 @@ def _on_start(self, event: ops.StartEvent) -> None: return try: - self.charm.config_manager.configure_services(primary_ip) + self.charm.config_manager.configure_services(primary_endpoint) self.charm.workload.start() except ValkeyConfigurationError: self.charm.state.unit_server.update( @@ -174,8 +175,10 @@ def _on_start(self, event: ops.StartEvent) -> None: statuses_state=self.charm.state.statuses, component_name=self.charm.cluster_manager.name, ) - - self.unit_fully_started.emit(is_primary=primary_ip == self.charm.state.bind_address) + self.unit_fully_started.emit( + is_primary=primary_endpoint + == 
self.charm.state.unit_server.get_endpoint(self.charm.state.substrate) + ) # TODO check how to trigger if deferred without update status event def _on_unit_fully_started(self, event: UnitFullyStarted) -> None: @@ -244,7 +247,7 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: self.charm.state.unit_server.update( { - "hostname": socket.gethostname(), + "hostname": self.charm.state.hostname, "private_ip": self.charm.state.bind_address, } ) @@ -286,7 +289,7 @@ def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: """Handle the config_changed event.""" self.charm.state.unit_server.update( { - "hostname": socket.gethostname(), + "hostname": self.charm.state.hostname, "private_ip": self.charm.state.bind_address, } ) @@ -477,7 +480,10 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: return active_sentinels = self.charm.sentinel_manager.get_active_sentinel_ips(primary_ip) - if primary_ip == self.charm.state.bind_address and len(active_sentinels) > 1: + if ( + primary_ip == self.charm.state.unit_server.get_endpoint(self.charm.state.substrate) + and len(active_sentinels) > 1 + ): self.charm.state.unit_server.update( {"scale_down_state": ScaleDownState.WAIT_TO_FAILOVER} ) @@ -492,7 +498,11 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: # stop valkey and sentinel processes self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.STOP_SERVICES}) self.charm.workload.stop() - active_sentinels = [ip for ip in active_sentinels if ip != self.charm.state.bind_address] + active_sentinels = [ + ip + for ip in active_sentinels + if ip != self.charm.state.unit_server.get_endpoint(self.charm.state.substrate) + ] # reset sentinel states on other units self.charm.state.unit_server.update( diff --git a/src/events/tls.py b/src/events/tls.py index 789a309..8889856 100644 --- a/src/events/tls.py +++ b/src/events/tls.py @@ -158,10 +158,12 @@ def _on_tls_relation_broken(self, event: 
ops.RelationBrokenEvent) -> None: self.charm.tls_manager.set_tls_state(TLSState.TO_NO_TLS) try: primary_ip = self.charm.sentinel_manager.get_primary_ip() - self.charm.config_manager.set_config_properties(primary_ip=primary_ip) + self.charm.config_manager.set_config_properties(primary_endpoint=primary_ip) tls_config = self.charm.config_manager.generate_tls_config() self.charm.cluster_manager.reload_tls_settings(tls_config) - self.charm.config_manager.set_sentinel_config_properties(primary_ip=primary_ip) + self.charm.config_manager.set_sentinel_config_properties( + primary_endpoint=primary_ip + ) self.charm.sentinel_manager.restart_service() except ( ValkeyWorkloadCommandError, @@ -206,8 +208,8 @@ def _enable_client_tls(self) -> None: logger.info("Enabling client TLS in Valkey") primary_ip = self.charm.sentinel_manager.get_primary_ip() - self.charm.config_manager.set_config_properties(primary_ip=primary_ip) - self.charm.config_manager.set_sentinel_config_properties(primary_ip=primary_ip) + self.charm.config_manager.set_config_properties(primary_endpoint=primary_ip) + self.charm.config_manager.set_sentinel_config_properties(primary_endpoint=primary_ip) tls_config = self.charm.config_manager.generate_tls_config() self.charm.cluster_manager.reload_tls_settings(tls_config) self.charm.sentinel_manager.restart_service() diff --git a/src/managers/cluster.py b/src/managers/cluster.py index b0c0c93..bbfae71 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -52,14 +52,14 @@ def _get_valkey_client(self) -> ValkeyClient: def reload_acl_file(self) -> None: """Reload the ACL file into the cluster.""" client = self._get_valkey_client() - if not client.acl_load(hostname=self.state.bind_address): + if not client.acl_load(hostname=self.state.endpoint): raise ValkeyACLLoadError("Could not load ACL file into Valkey cluster.") def update_primary_auth(self) -> None: """Update the primaryauth runtime configuration on the Valkey server.""" client = 
self._get_valkey_client() if not client.config_set( - hostname=self.state.bind_address, + hostname=self.state.endpoint, parameter="primaryauth", value=self.state.cluster.internal_users_credentials.get( CharmUsers.VALKEY_REPLICA.value, "" @@ -76,7 +76,7 @@ def update_primary_auth(self) -> None: def is_replica_synced(self) -> bool: """Check if the replica is synced with the primary.""" client = self._get_valkey_client() - role_info = client.role(hostname=self.state.bind_address) + role_info = client.role(hostname=self.state.endpoint) try: return role_info[0] == "slave" and role_info[3] == "connected" except IndexError as e: @@ -93,12 +93,12 @@ def is_healthy(self, is_primary: bool = False, check_replica_sync: bool = True) """Check if a valkey instance is healthy.""" client = self._get_valkey_client() - if not client.ping(hostname=self.state.bind_address): + if not client.ping(hostname=self.state.endpoint): logger.warning("Health check failed: Valkey server did not respond to ping.") return False if ( - persistence_info := client.info_persistence(hostname=self.state.bind_address) + persistence_info := client.info_persistence(hostname=self.state.endpoint) ) and persistence_info.get("loading", "") != "0": logger.warning("Health check failed: Valkey server is still loading data.") return False @@ -112,7 +112,7 @@ def is_healthy(self, is_primary: bool = False, check_replica_sync: bool = True) def reload_tls_settings(self, tls_config: dict[str, str]) -> None: """Update TLS by loading the TLS settings.""" client = self._get_valkey_client() - client.reload_tls(tls_config, hostname=self.state.bind_address) + client.reload_tls(tls_config, hostname=self.state.endpoint) def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]: """Compute the cluster manager's statuses.""" diff --git a/src/managers/config.py b/src/managers/config.py index b962530..9290c1d 100644 --- a/src/managers/config.py +++ b/src/managers/config.py @@ -47,7 +47,7 @@ def 
__init__(self, state: ClusterState, workload: WorkloadBase): self.state = state self.workload = workload - def get_config_properties(self, primary_ip: str) -> dict[str, str]: + def get_config_properties(self, primary_endpoint: str) -> dict[str, str]: """Assemble the config properties. Returns: @@ -79,13 +79,10 @@ def get_config_properties(self, primary_ip: str) -> dict[str, str]: config_properties["dir"] = self.workload.working_dir.as_posix() # bind to all interfaces - if self.state.substrate == Substrate.VM: - config_properties["bind"] = self.state.bind_address - else: - config_properties["bind"] = "0.0.0.0 -::1" + config_properties["bind"] = self.state.endpoint # replica related config - replica_config = self._generate_replica_config(primary_ip=primary_ip) + replica_config = self._generate_replica_config(primary_endpoint=primary_endpoint) config_properties.update(replica_config) # TLS related configuration @@ -94,25 +91,29 @@ def get_config_properties(self, primary_ip: str) -> dict[str, str]: return config_properties - def _generate_replica_config(self, primary_ip: str) -> dict[str, str]: + def _generate_replica_config(self, primary_endpoint: str) -> dict[str, str]: """Generate the config properties related to replica configuration based on the current cluster state.""" + local_unit_endpoint = self.state.unit_server.get_endpoint(self.state.substrate) replica_config = { "primaryuser": CharmUsers.VALKEY_REPLICA.value, "primaryauth": self.state.cluster.internal_users_credentials.get( CharmUsers.VALKEY_REPLICA.value, "" ), + "replica-announce-ip": local_unit_endpoint, } - if primary_ip != self.state.bind_address: + if primary_endpoint != local_unit_endpoint: # set replicaof - logger.debug("Setting replicaof to primary %s", primary_ip) + logger.debug("Setting replicaof to primary %s", primary_endpoint) # internal communication always uses peer TLS (`tls-replication=yes`) - replica_config["replicaof"] = f"{primary_ip} {TLS_PORT}" + replica_config["replicaof"] = 
f"{primary_endpoint} {TLS_PORT}" return replica_config - def set_config_properties(self, primary_ip: str) -> None: + def set_config_properties(self, primary_endpoint: str) -> None: """Write the config properties to the config file.""" logger.debug("Writing configuration") - self.workload.write_config_file(config=self.get_config_properties(primary_ip=primary_ip)) + self.workload.write_config_file( + config=self.get_config_properties(primary_endpoint=primary_endpoint) + ) def generate_tls_config(self) -> dict[str, str]: """Return the TLS configuration based on the current state.""" @@ -193,7 +194,9 @@ def _get_user_acl_line(self, user: CharmUsers, passwords: dict[str, str] | None password_hash = hashlib.sha256(password.encode("utf-8")).hexdigest() return f"user {user.value} on #{password_hash} {CHARM_USERS_ROLE_MAP[user]}\n" - def get_sentinel_config_properties(self, primary_ip: str) -> dict[str, str | dict[str, str]]: + def get_sentinel_config_properties( + self, primary_endpoint: str + ) -> dict[str, str | dict[str, str]]: """Assemble the sentinel config properties. 
Returns: @@ -241,7 +244,7 @@ def get_sentinel_config_properties(self, primary_ip: str) -> dict[str, str | dic # sentinel configs config_properties["sentinel"] = sentinel_properties | self._generate_sentinel_configs( - primary_ip=primary_ip + primary_endpoint=primary_endpoint ) # tls config @@ -250,11 +253,13 @@ def get_sentinel_config_properties(self, primary_ip: str) -> dict[str, str | dic return config_properties - def _generate_sentinel_configs(self, primary_ip: str) -> dict[str, str]: + def _generate_sentinel_configs(self, primary_endpoint: str) -> dict[str, str]: """Generate the sentinel config properties based on the current cluster state.""" sentinel_configs = {} # TODO consider adding quorum calculation based on number of planned_units and the parity of the number of units - sentinel_configs["monitor"] = f"{PRIMARY_NAME} {primary_ip} {TLS_PORT} {QUORUM_NUMBER}" + sentinel_configs["monitor"] = ( + f"{PRIMARY_NAME} {primary_endpoint} {TLS_PORT} {QUORUM_NUMBER}" + ) # auth settings # auth-user is used by sentinel to authenticate to the valkey primary sentinel_configs["auth-user"] = f"{PRIMARY_NAME} {CharmUsers.VALKEY_SENTINEL.value}" @@ -270,13 +275,17 @@ def _generate_sentinel_configs(self, primary_ip: str) -> dict[str, str]: sentinel_configs["down-after-milliseconds"] = f"{PRIMARY_NAME} 30000" sentinel_configs["failover-timeout"] = f"{PRIMARY_NAME} 180000" sentinel_configs["parallel-syncs"] = f"{PRIMARY_NAME} 1" + if self.state.substrate == Substrate.K8S: + sentinel_configs["resolve-hostnames"] = "yes" + sentinel_configs["announce-hostnames"] = "yes" + sentinel_configs["announce-ip"] = self.state.unit_server.model.hostname return sentinel_configs - def set_sentinel_config_properties(self, primary_ip: str) -> None: + def set_sentinel_config_properties(self, primary_endpoint: str) -> None: """Write sentinel configuration file.""" logger.debug("Writing Sentinel configuration") - sentinel_config = self.get_sentinel_config_properties(primary_ip=primary_ip) + 
sentinel_config = self.get_sentinel_config_properties(primary_endpoint=primary_endpoint) sentinel_config_string = "\n".join( f"sentinel {key} {value}" for key, value in sentinel_config["sentinel"].items() @@ -335,7 +344,7 @@ def update_local_valkey_admin_password(self) -> None: } ) - def configure_services(self, primary_ip: str) -> None: + def configure_services(self, primary_endpoint: str) -> None: """Start Valkey and Sentinel services. Raises: @@ -343,9 +352,9 @@ def configure_services(self, primary_ip: str) -> None: """ try: self.update_local_valkey_admin_password() - self.set_config_properties(primary_ip=primary_ip) + self.set_config_properties(primary_endpoint=primary_endpoint) self.set_acl_file() - self.set_sentinel_config_properties(primary_ip=primary_ip) + self.set_sentinel_config_properties(primary_endpoint=primary_endpoint) self.set_sentinel_acl_file() except (ValkeyWorkloadCommandError, ValueError) as e: logger.error("Failed to set configuration properties: %s", e) diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 335b13b..7a04900 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -65,9 +65,9 @@ def is_sentinel_discovered(self) -> bool: """Check if the sentinel of the local unit was discovered by the other sentinels in the cluster.""" # list of active sentinels: units with started flag true and not being removed active_sentinels = [ - unit.model.private_ip + unit.get_endpoint(self.state.substrate) for unit in self.state.servers - if unit.is_active and unit.model.private_ip != self.state.bind_address + if unit.is_active and unit.get_endpoint(self.state.substrate) != self.state.endpoint ] client = self._get_sentinel_client() @@ -77,9 +77,9 @@ def is_sentinel_discovered(self) -> bool: discovered_sentinels = { sentinel["ip"] for sentinel in client.sentinels_primary(hostname=sentinel_ip) } - if self.state.bind_address not in discovered_sentinels: + if self.state.endpoint not in discovered_sentinels: logger.warning( - 
f"Sentinel at {sentinel_ip} does not see local sentinel at {self.state.bind_address}." + f"Sentinel at {sentinel_ip} does not see local sentinel at {self.state.endpoint}." ) return False @@ -96,7 +96,11 @@ def get_primary_ip(self) -> str: Raises: ValkeyWorkloadCommandError: If the CLI command to get primary information fails on all sentinels. """ - started_servers = [unit.model.private_ip for unit in self.state.servers if unit.is_active] + started_servers = [ + unit.get_endpoint(self.state.substrate) + for unit in self.state.servers + if unit.is_active + ] client = self._get_sentinel_client() @@ -125,12 +129,12 @@ def is_healthy(self) -> bool: """Check if the sentinel service is healthy.""" client = self._get_sentinel_client() - if not client.ping(hostname=self.state.bind_address): + if not client.ping(hostname=self.state.endpoint): logger.warning("Health check failed: Sentinel did not respond to ping.") return False try: - client.primary(hostname=self.state.bind_address) + client.primary(hostname=self.state.endpoint) except ValkeyWorkloadCommandError: logger.warning("Health check failed: Could not query sentinel for master information.") return False @@ -147,8 +151,8 @@ def failover(self) -> None: """ client = self._get_sentinel_client() try: - client.failover_primary_coordinated(self.state.bind_address) - client.is_failover_in_progress(hostname=self.state.bind_address) + client.failover_primary_coordinated(self.state.endpoint) + client.is_failover_in_progress(self.state.endpoint) except ValkeyWorkloadCommandError as e: logger.error(f"Failed to trigger failover: {e}") raise SentinelFailoverError from e @@ -270,6 +274,7 @@ def get_active_sentinel_ips(self, hostname: str) -> list[str]: ValkeyWorkloadCommandError: If the CLI command to get sentinel information fails. 
""" client = self._get_sentinel_client() + return [hostname] + [ sentinel["ip"] for sentinel in client.sentinels_primary(hostname=hostname) ] diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index e945a69..90b1932 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -641,7 +641,9 @@ def test_change_password_secret_changed_non_leader_unit_not_successful(cloud_spe state_out = manager.run() mock_update_password.assert_not_called() mock_set_acl_file.assert_called_once() - mock_exec_command.assert_called_once_with(["acl", "load"], hostname="127.1.1.1") + mock_exec_command.assert_called_once_with( + ["acl", "load"], hostname="valkey-0.valkey-endpoints" + ) cluster_statuses = charm.state.statuses.get( scope="unit", component=charm.cluster_manager.name, diff --git a/tests/unit/test_scaledown.py b/tests/unit/test_scaledown.py index 3195ed4..d4cd7cc 100644 --- a/tests/unit/test_scaledown.py +++ b/tests/unit/test_scaledown.py @@ -87,16 +87,16 @@ def test_non_primary(cloud_spec): patch("common.locks.ScaleDownLock.release_lock", return_value=True), patch( "common.client.SentinelClient.get_primary_addr_by_name", - return_value=("10.0.1.1", 6379), + return_value=("valkey-1", 6379), ), patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, patch("common.client.SentinelClient.reset") as mock_reset, patch( "common.client.SentinelClient.sentinels_primary", side_effect=[ - [{"ip": "10.0.1.0"}, {"ip": "10.0.1.2"}], # for get_active_sentinel_ips - [{"ip": "10.0.1.2"}], # for target_sees_all_others unit 10.0.1.1 - [{"ip": "10.0.1.1"}], # for target_sees_all_others unit 10.0.1.2 + [{"ip": "valkey-0"}, {"ip": "valkey-2"}], # for get_active_sentinel_ips + [{"ip": "valkey-2"}], # for target_sees_all_others unit valkey-1 + [{"ip": "valkey-1"}], # for target_sees_all_others unit valkey-2 ], ), patch( @@ -130,7 +130,7 @@ def test_primary(cloud_spec): ), patch("common.locks.ScaleDownLock.request_lock", return_value=True), 
patch("common.locks.ScaleDownLock.release_lock", return_value=True), - patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="10.0.1.0"), + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="valkey-0"), patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, patch("common.client.SentinelClient.failover_primary_coordinated") as mock_failover, patch("common.client.SentinelClient.is_failover_in_progress") as mock_failover_in_progress, @@ -187,7 +187,7 @@ def test_last_leader_unit_going_down(cloud_spec): ), patch("common.locks.ScaleDownLock.request_lock", return_value=True), patch("common.locks.ScaleDownLock.release_lock", return_value=True), - patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="10.0.1.0"), + patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="valkey-0"), patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, patch("common.client.SentinelClient.sentinels_primary", return_value=[]), patch("core.models.ValkeyCluster.update") as cluster_update, From d00a189019df5f986afb6f2573bdc4f11ba6cce3 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Mar 2026 08:52:24 +0000 Subject: [PATCH 151/159] remove f strings in loggers --- src/common/client.py | 2 +- src/common/locks.py | 44 +++++++++++++++++--------- src/core/models.py | 3 +- src/events/base_events.py | 17 ++-------- src/managers/cluster.py | 2 +- src/managers/sentinel.py | 27 ++++++++++++---- src/managers/tls.py | 2 +- tests/integration/conftest.py | 2 +- tests/integration/continuous_writes.py | 14 ++++---- tests/integration/cw_helpers.py | 2 +- tests/integration/ha/test_scaling.py | 4 ++- tests/integration/helpers.py | 20 +++++++++--- 12 files changed, 85 insertions(+), 54 deletions(-) diff --git a/src/common/client.py b/src/common/client.py index 731c981..9fdbc4b 100644 --- a/src/common/client.py +++ b/src/common/client.py @@ -143,7 +143,7 @@ def info_persistence(self, hostname: str) -> dict[str, str] | None: ) 
values = {} if not output.strip(): - logger.warning(f"No persistence info found on Valkey server at {hostname}.") + logger.warning("No persistence info found on Valkey server at %s.", hostname) return None for line in output.strip().splitlines(): if line.startswith("#"): diff --git a/src/common/locks.py b/src/common/locks.py index 7b9ab17..04fff90 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -105,7 +105,8 @@ def request_lock(self) -> bool: ) if self.state.unit_server.unit.is_leader(): logger.info( - f"Leader unit requesting {self.name} lock. Triggering lock request processing." + "Leader unit requesting %s lock. Triggering lock request processing.", + self.name, ) self.process() @@ -120,7 +121,8 @@ def release_lock(self) -> bool: ) if self.state.unit_server.unit.is_leader(): logger.info( - f"Leader unit releasing {self.name} lock. Triggering lock request processing." + "Leader unit releasing %s lock. Triggering lock request processing.", + self.name, ) self.process() @@ -129,7 +131,7 @@ def release_lock(self) -> bool: def process(self) -> None: """Process the lock requests and update the unit with the lock.""" if not self.state.unit_server.unit.is_leader(): - logger.info(f"Only the leader can process {self.name} lock requests.") + logger.info("Only the leader can process lock requests.") return if self.is_lock_free_to_give: @@ -197,12 +199,16 @@ def request_lock(self, timeout: int | None = None, primary_ip: str | None = None Returns: bool: True if the lock was acquired, False if the timeout was reached before acquiring the lock. 
""" - logger.debug(f"{self.charm.state.unit_server.unit_name} is requesting {self.name} lock.") + logger.debug( + "%s is requesting %s lock.", self.charm.state.unit_server.unit_name, self.name + ) retry_until = time.time() + timeout if timeout else None primary_ip = primary_ip or self.charm.sentinel_manager.get_primary_ip() if self.get_unit_with_lock(primary_ip) == self.charm.state.unit_server.unit_name: logger.debug( - f"{self.charm.state.unit_server.unit_name} already holds {self.name} lock. No need to request it again." + "%s already holds %s lock. No need to request it again.", + self.charm.state.unit_server.unit_name, + self.name, ) return True @@ -225,20 +231,26 @@ def request_lock(self, timeout: int | None = None, primary_ip: str | None = None ], ): logger.debug( - f"{self.charm.state.unit_server.unit_name} acquired {self.name} lock." + "%s acquired %s lock.", self.charm.state.unit_server.unit_name, self.name ) return True except ValkeyWorkloadCommandError: logger.warning( - f"{self.charm.state.unit_server.unit_name} failed to acquire {self.name} lock due to a workload command error. Retrying..." + "%s failed to acquire %s lock due to a workload command error. Retrying...", + self.charm.state.unit_server.unit_name, + self.name, ) if retry_until and time.time() > retry_until: logger.warning( - f"{self.charm.state.unit_server.unit_name} failed to acquire {self.name} lock within timeout. Giving up." + "%s failed to acquire %s lock within timeout. Giving up.", + self.charm.state.unit_server.unit_name, + self.name, ) return False logger.info( - f"{self.charm.state.unit_server.unit_name} failed to acquire {self.name} lock. Retrying in 5 seconds." + "%s failed to acquire %s lock. 
Retrying in 5 seconds.", + self.charm.state.unit_server.unit_name, + self.name, ) time.sleep(5) # update the primary ip in case a failover happens when we are waiting to acquire the lock @@ -263,10 +275,12 @@ def release_lock(self, primary_ip: str | None = None) -> bool: ) == "1" ): - logger.debug(f"{self.charm.state.unit_server.unit_name} released {self.name} lock.") + logger.debug("%s released %s lock.", self.charm.state.unit_server.unit_name, self.name) return True - else: - logger.warning( - f"{self.charm.state.unit_server.unit_name} failed to release {self.name} lock. It may not have held the lock or it may have already been released." - ) - return False + + logger.warning( + "%s failed to release %s lock. It may not have held the lock or it may have already been released.", + self.charm.state.unit_server.unit_name, + self.name, + ) + return False diff --git a/src/core/models.py b/src/core/models.py index 697e5a5..e4d6148 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -89,7 +89,8 @@ def update(self, items: dict[str, Any]) -> None: """Write to relation data.""" if not self.relation: logger.warning( - f"Fields {list(items.keys())} were attempted to be written on the relation before it exists." 
+ "Fields %s were attempted to be written on the relation before it exists.", + list(items.keys()), ) return diff --git a/src/events/base_events.py b/src/events/base_events.py index b8f8c08..e63e6c2 100644 --- a/src/events/base_events.py +++ b/src/events/base_events.py @@ -267,7 +267,7 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: str(admin_secret_id) ) except (ops.ModelError, ops.SecretNotFoundError) as e: - logger.error(f"Could not access secret {admin_secret_id}: {e}") + logger.error("Could not access secret %s: %s", admin_secret_id, e) raise # generate passwords for all internal users if not specified in the user secret @@ -380,7 +380,7 @@ def _update_internal_users_password(self, secret_id: str) -> None: ) if any(key not in CharmUsers for key in secret_content.keys()): - logger.error(f"Invalid username in secret {secret_id}.") + logger.error("Invalid username in secret %s.", secret_id) self.charm.status.set_running_status( ClusterStatuses.PASSWORD_UPDATE_FAILED.value, scope="app", @@ -439,7 +439,6 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: # get scale down lock scale_down_lock = ScaleDownLock(self.charm) - self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.WAIT_FOR_LOCK}) self.charm.status.set_running_status( ScaleDownStatuses.WAIT_FOR_LOCK.value, scope="unit", @@ -484,9 +483,6 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: primary_ip == self.charm.state.unit_server.get_endpoint(self.charm.state.substrate) and len(active_sentinels) > 1 ): - self.charm.state.unit_server.update( - {"scale_down_state": ScaleDownState.WAIT_TO_FAILOVER} - ) logger.debug("Triggering sentinel failover on primary IP %s", primary_ip) self.charm.sentinel_manager.failover() primary_ip = self.charm.sentinel_manager.get_primary_ip() @@ -496,7 +492,6 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: ) # stop valkey and sentinel processes - 
self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.STOP_SERVICES}) self.charm.workload.stop() active_sentinels = [ ip @@ -505,18 +500,12 @@ def _on_storage_detaching(self, event: ops.StorageDetachingEvent) -> None: ] # reset sentinel states on other units - self.charm.state.unit_server.update( - { - "scale_down_state": ScaleDownState.RESET_SENTINEL, - "start_state": StartState.NOT_STARTED.value, - } - ) + self.charm.state.unit_server.update({"start_state": StartState.NOT_STARTED.value}) if active_sentinels: logger.debug("Resetting sentinel states on active units: %s", active_sentinels) self.charm.sentinel_manager.reset_sentinel_states(active_sentinels) # check health after scale down - self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.HEALTH_CHECK}) self.charm.sentinel_manager.verify_expected_replica_count(active_sentinels) # release lock scale_down_lock.release_lock(primary_ip=primary_ip) diff --git a/src/managers/cluster.py b/src/managers/cluster.py index bbfae71..74574dc 100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -80,7 +80,7 @@ def is_replica_synced(self) -> bool: try: return role_info[0] == "slave" and role_info[3] == "connected" except IndexError as e: - logger.warning(f"Unexpected role information format: {role_info}. Error: {e}") + logger.warning("Unexpected role information format: %s. Error: %s", role_info, e) return False @retry( diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 7a04900..30a65e3 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -79,12 +79,16 @@ def is_sentinel_discovered(self) -> bool: } if self.state.endpoint not in discovered_sentinels: logger.warning( - f"Sentinel at {sentinel_ip} does not see local sentinel at {self.state.endpoint}." 
+ "Sentinel at %s does not see local sentinel at %s.", + sentinel_ip, + self.state.endpoint, ) return False except ValkeyWorkloadCommandError: - logger.warning(f"Could not query sentinel at {sentinel_ip} for primary discovery.") + logger.warning( + "Could not query sentinel at %s for primary discovery.", sentinel_ip + ) return False return True @@ -154,7 +158,7 @@ def failover(self) -> None: client.failover_primary_coordinated(self.state.endpoint) client.is_failover_in_progress(self.state.endpoint) except ValkeyWorkloadCommandError as e: - logger.error(f"Failed to trigger failover: {e}") + logger.error("Failed to trigger failover: %s", e) raise SentinelFailoverError from e def reset_sentinel_states(self, sentinel_ips: list[str]) -> None: @@ -213,12 +217,15 @@ def target_sees_all_others(self, target_sentinel_ip: str, sentinel_ips: list[str } if discovered_sentinels != sentinel_ips_set: logger.warning( - f"Sentinel at {target_sentinel_ip} sees sentinels {discovered_sentinels}, expected {sentinel_ips_set}." + "Sentinel at %s sees sentinels %s, expected %s.", + target_sentinel_ip, + discovered_sentinels, + sentinel_ips_set, ) return False except ValkeyWorkloadCommandError: logger.warning( - f"Could not query sentinel at {target_sentinel_ip} for sentinel discovery." + "Could not query sentinel at %s for sentinel discovery.", target_sentinel_ip ) return False return True @@ -255,10 +262,16 @@ def verify_expected_replica_count(self, sentinel_ips: list[str]) -> None: number_replicas := len(client.replicas_primary(hostname=sentinel_ip)) ): logger.warning( - f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." + "Sentinel at %s sees %d replicas, expected %d.", + sentinel_ip, + number_replicas, + expected_replicas, ) raise SentinelIncorrectReplicaCountError( - f"Sentinel at {sentinel_ip} sees {number_replicas} replicas, expected {expected_replicas}." 
+ "Sentinel at %s sees %d replicas, expected %d.", + sentinel_ip, + number_replicas, + expected_replicas, ) def get_active_sentinel_ips(self, hostname: str) -> list[str]: diff --git a/src/managers/tls.py b/src/managers/tls.py index 1e61a59..82e5a20 100644 --- a/src/managers/tls.py +++ b/src/managers/tls.py @@ -42,7 +42,7 @@ def set_tls_state(self, state: TLSState) -> None: Args: state (TLSState): The TLS state. """ - logger.debug(f"Setting TLS state to {state}") + logger.debug("Setting TLS state to %s", state) self.state.unit_server.update({"tls_client_state": state.value}) def set_cert_state(self, is_ready: bool) -> None: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 423654a..36269e4 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -18,7 +18,7 @@ def c_writes(juju: jubilant.Juju): """Create instance of the ContinuousWrites.""" app = APP_NAME - logger.info(f"Creating ContinuousWrites instance for app with name {app}") + logger.info("Creating ContinuousWrites instance for app with name %s", app) return ContinuousWrites(juju, app) diff --git a/tests/integration/continuous_writes.py b/tests/integration/continuous_writes.py index b15c41c..3cc44bc 100644 --- a/tests/integration/continuous_writes.py +++ b/tests/integration/continuous_writes.py @@ -270,7 +270,7 @@ async def with_client(conf: SimpleNamespace): current_val = starting_number config = initial_config - proc_logger.info(f"Starting continuous async writes from {current_val}") + proc_logger.info("Starting continuous async writes from %s", current_val) try: while not event.is_set(): @@ -281,8 +281,8 @@ async def with_client(conf: SimpleNamespace): pass try: - proc_logger.info(f"Writing value: {current_val}") - proc_logger.info(f"Current endpoints={config.endpoints}") + proc_logger.info("Writing value: %s", current_val) + proc_logger.info("Current endpoints=%s", config.endpoints) async with with_client(config) as client: if not ( res := await 
asyncio.wait_for( @@ -290,10 +290,10 @@ async def with_client(conf: SimpleNamespace): ) ): raise WriteFailedError("LPUSH returned 0/None") - proc_logger.info(f"Length after write: {res}") + proc_logger.info("Length after write: %s", res) await asyncio.sleep(in_between_sleep) except Exception as e: - proc_logger.warning(f"Write failed at {current_val}: {e}") + proc_logger.warning("Write failed at %s: %s", current_val, e) finally: if event.is_set(): break @@ -319,7 +319,9 @@ async def with_client(conf: SimpleNamespace): time.sleep(1) if new_hostnames := get_active_hostnames(juju_env, "valkey") != hostnames: logger.info( - f"Hostnames changed from {hostnames} to {new_hostnames}, updating continuous writes client." + "Hostnames changed from %s to %s, updating continuous writes client.", + hostnames, + new_hostnames, ) hostnames = new_hostnames cw.update() diff --git a/tests/integration/cw_helpers.py b/tests/integration/cw_helpers.py index d19d773..0756328 100644 --- a/tests/integration/cw_helpers.py +++ b/tests/integration/cw_helpers.py @@ -85,4 +85,4 @@ def assert_continuous_writes_consistent( assert count == last_written_value + 1, ( f"endpoint: {endpoint}, expected count: {last_written_value + 1}, current count: {count}" ) - logger.info(f"Continuous writes are consistent on {endpoint}.") + logger.info("Continuous writes are consistent on %s.", endpoint) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 4880709..cf45599 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -280,7 +280,9 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w ) assert primary_unit is not None, "Failed to identify primary unit for scale down test." logger.info( - f"Identified primary unit {primary_unit} with IP {primary_ip} for scale down test." 
+ "Identified primary unit %s with IP %s for scale down test.", + primary_unit, + primary_ip, ) juju.remove_unit(primary_unit) juju.wait( diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 4de3c02..aaf1c3f 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -135,7 +135,7 @@ def does_message_match(expected_status_message: str, status: StatusObject) -> bo ) ) except KeyError as e: - logger.error(f"Error attempting to convert StatusObject to ops.StatusBase: {e}") + logger.error("Error attempting to convert StatusObject to ops.StatusBase: %s", e) return False @@ -416,7 +416,12 @@ async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: total_bytes_target = target_gb * 1024 * 1024 * 1024 total_keys = total_bytes_target // value_size_bytes - logger.info(f"Targeting ~{target_gb}GB ({total_keys:,} keys of {value_size_bytes} bytes each)") + logger.info( + "Targeting ~%sGB (%s keys of %s bytes each)", + target_gb, + total_keys, + value_size_bytes, + ) start_time = time.time() keys_added = 0 @@ -440,15 +445,20 @@ async def seed_valkey(juju: jubilant.Juju, target_gb: float = 1.0) -> None: elapsed = time.time() - start_time percent = (keys_added / total_keys) * 100 logger.info( - f"Progress: {percent:.1f}% | Keys: {keys_added:,} | Elapsed: {elapsed:.1f}s", + "Progress: %.1f%% | Keys: %s | Elapsed: %.1f s", + percent, + keys_added, + elapsed, ) except Exception as e: - logger.error(f"\nError: {e}") + logger.error("Error: %s", e) finally: total_time = time.time() - start_time logger.info( - f"\nSeeding complete! Added {keys_added:,} keys in {total_time:.2f} seconds." + "Seeding complete! 
Added %s keys in %.2f seconds.", keys_added, total_time, ) From abe43b9dfebf722cec536e064ce818ca08b673e4 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Mar 2026 09:20:23 +0000 Subject: [PATCH 152/159] charm level feedback --- src/common/locks.py | 44 +++++++++++++++--------------- src/core/models.py | 9 +++----- src/managers/cluster.py | 5 ++-- src/managers/sentinel.py | 11 +++++---- tests/integration/helpers.py | 3 +-- 5 files changed, 30 insertions(+), 42 deletions(-) diff --git a/src/common/locks.py b/src/common/locks.py index 04fff90..b1593e8 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -4,12 +4,12 @@ """Collection of locks for cluster operations.""" import logging -import time from abc import abstractmethod from typing import TYPE_CHECKING, Protocol, override +from tenacity import Retrying, stop_after_attempt, wait_fixed + from common.client import ValkeyClient -from common.exceptions import ValkeyWorkloadCommandError from core.cluster_state import ClusterState from literals import CharmUsers @@ -202,7 +202,6 @@ def request_lock(self, timeout: int | None = None, primary_ip: str | None = None Returns: bool: True if the lock was acquired, False if the timeout was reached before acquiring the lock. """ logger.debug( "%s is requesting %s lock.", self.charm.state.unit_server.unit_name, self.name ) - retry_until = time.time() + timeout if timeout else None primary_ip = primary_ip or self.charm.sentinel_manager.get_primary_ip() if self.get_unit_with_lock(primary_ip) == self.charm.state.unit_server.unit_name: logger.debug( @@ -216,8 +215,22 @@ def request_lock(self, timeout: int | None = None, primary_ip: str | None = None logger.debug("Last unit in the cluster scaling down. Lock will be skipped.") return True - while True: - try: + number_of_retries = max(timeout // 5 if timeout else 1, 1) + + for attempt in Retrying( wait=wait_fixed(5), stop=stop_after_attempt(number_of_retries), retry_error_callback=lambda _: False, after=lambda retry_state: logger.info( "%s failed to acquire %s lock on attempt %d. 
Retrying in 5 seconds.", + self.charm.state.unit_server.unit_name, + self.name, + retry_state.attempt_number, + ), + ): + with attempt: + # update the primary ip in case a failover happens when we are waiting to acquire the lock + primary_ip = self.charm.sentinel_manager.get_primary_ip() if self.client.set( hostname=primary_ip, key=self.lock_key, @@ -234,27 +247,6 @@ def request_lock(self, timeout: int | None = None, primary_ip: str | None = None "%s acquired %s lock.", self.charm.state.unit_server.unit_name, self.name ) return True - except ValkeyWorkloadCommandError: - logger.warning( - "%s failed to acquire %s lock due to a workload command error. Retrying...", - self.charm.state.unit_server.unit_name, - self.name, - ) - if retry_until and time.time() > retry_until: - logger.warning( - "%s failed to acquire %s lock within timeout. Giving up.", - self.charm.state.unit_server.unit_name, - self.name, - ) - return False - logger.info( - "%s failed to acquire %s lock. Retrying in 5 seconds.", - self.charm.state.unit_server.unit_name, - self.name, - ) - time.sleep(5) - # update the primary ip in case a failover happens when we are waiting to acquire the lock - primary_ip = self.charm.sentinel_manager.get_primary_ip() @property def is_held_by_this_unit(self) -> bool: diff --git a/src/core/models.py b/src/core/models.py index e4d6148..77cfdd9 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -141,12 +141,9 @@ def is_started(self) -> bool: @property def is_being_removed(self) -> bool: """Check if the unit is being removed from the cluster.""" - return self.model.scale_down_state in { - ScaleDownState.STOP_SERVICES.value, - ScaleDownState.RESET_SENTINEL.value, - ScaleDownState.HEALTH_CHECK.value, - ScaleDownState.GOING_AWAY.value, - } + return ( + self.model.scale_down_state == ScaleDownState.GOING_AWAY.value if self.model else False + ) @property def is_active(self) -> bool: diff --git a/src/managers/cluster.py b/src/managers/cluster.py index 74574dc..b6b4009 
100644 --- a/src/managers/cluster.py +++ b/src/managers/cluster.py @@ -159,8 +159,7 @@ def _get_start_status(self) -> StatusObject | None: def _get_scale_down_status(self) -> StatusObject | None: """Get the current scale down status of the unit.""" - match self.state.unit_server.model.scale_down_state: - case ScaleDownState.GOING_AWAY.value: - return ScaleDownStatuses.GOING_AWAY.value + if self.state.unit_server.model.scale_down_state == ScaleDownState.WAIT_FOR_LOCK.value: + return ScaleDownStatuses.GOING_AWAY.value return None diff --git a/src/managers/sentinel.py b/src/managers/sentinel.py index 30a65e3..41cfe41 100644 --- a/src/managers/sentinel.py +++ b/src/managers/sentinel.py @@ -72,22 +72,22 @@ def is_sentinel_discovered(self) -> bool: client = self._get_sentinel_client() - for sentinel_ip in active_sentinels: + for sentinel_host in active_sentinels: try: discovered_sentinels = { - sentinel["ip"] for sentinel in client.sentinels_primary(hostname=sentinel_ip) + sentinel["ip"] for sentinel in client.sentinels_primary(hostname=sentinel_host) } if self.state.endpoint not in discovered_sentinels: logger.warning( "Sentinel at %s does not see local sentinel at %s.", - sentinel_ip, + sentinel_host, self.state.endpoint, ) return False except ValkeyWorkloadCommandError: logger.warning( - "Could not query sentinel at %s for primary discovery.", sentinel_ip + "Could not query sentinel at %s for primary discovery.", sentinel_host ) return False return True @@ -156,7 +156,8 @@ def failover(self) -> None: client = self._get_sentinel_client() try: client.failover_primary_coordinated(self.state.endpoint) - client.is_failover_in_progress(self.state.endpoint) + if client.is_failover_in_progress(self.state.endpoint): + raise SentinelFailoverError("Failover is in progress after triggering failover.") except ValkeyWorkloadCommandError as e: logger.error("Failed to trigger failover: %s", e) raise SentinelFailoverError from e diff --git a/tests/integration/helpers.py 
b/tests/integration/helpers.py index aaf1c3f..c13d6a6 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -386,8 +386,7 @@ def get_primary_ip(juju: jubilant.Juju, app: str) -> str: if "role:master" in replication_info: return hostnames[0] # extract ip - match = re.search(r"master_host:([^\s]+)", replication_info) - if not match: + if not (match := re.search(r"master_host:([^\s]+)", replication_info)): raise ValueError("Could not find master_host in replication info") return match.group(1) From d801de9d2bfac44379d23f6b55314c24d9305b8d Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Mar 2026 09:35:20 +0000 Subject: [PATCH 153/159] rename ip to endpoint and add existing app --- tests/integration/ha/test_scaling.py | 93 ++++++++++++++++------------ tests/integration/helpers.py | 13 ++++ 2 files changed, 66 insertions(+), 40 deletions(-) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index cf45599..1aff2d0 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -16,6 +16,7 @@ APP_NAME, IMAGE_RESOURCE, are_apps_active_and_agents_idle, + existing_app, get_cluster_hostnames, get_number_connected_replicas, get_password, @@ -33,6 +34,9 @@ def test_build_and_deploy(charm: str, juju: jubilant.Juju, substrate: Substrate) -> None: """Build the charm-under-test and deploy it with three units.""" + if existing_app(juju): + return + juju.deploy( charm, resources=IMAGE_RESOURCE if substrate == Substrate.K8S else None, @@ -56,23 +60,24 @@ async def test_seed_data(juju: jubilant.Juju) -> None: async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: """Make sure new units are added to the valkey downtime.""" - init_units_count = len(juju.status().apps[APP_NAME].units) + app_name = existing_app(juju) or APP_NAME + init_units_count = len(juju.status().apps[app_name].units) await c_writes.async_clear() c_writes.start() # scale up - juju.add_unit(APP_NAME, 
num_units=NUM_UNITS - init_units_count) + juju.add_unit(app_name, num_units=NUM_UNITS - init_units_count) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, idle_period=10, unit_count=NUM_UNITS + status, app_name, idle_period=10, unit_count=NUM_UNITS ), timeout=1200, ) - num_units = len(juju.status().apps[APP_NAME].units) + num_units = len(juju.status().apps[app_name].units) assert num_units == NUM_UNITS, f"Expected {NUM_UNITS} units, got {num_units}." # check if all units have been added to the cluster - hostnames = get_cluster_hostnames(juju, APP_NAME) + hostnames = get_cluster_hostnames(juju, app_name) connected_replicas = await get_number_connected_replicas( hostnames=hostnames, @@ -100,8 +105,9 @@ async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_writes) -> None: """Make sure scale down operations complete successfully.""" + app_name = existing_app(juju) or APP_NAME number_of_replicas = await get_number_connected_replicas( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) @@ -114,17 +120,17 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ await asyncio.sleep(10) # let the continuous writes write some data # scale down - remove_number_units(juju, APP_NAME, num_units=1, substrate=substrate) + remove_number_units(juju, app_name, num_units=1, substrate=substrate) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 + status, app_name, unit_count=NUM_UNITS - 1, idle_period=10 ) ) - num_units = len(juju.status().get_units(APP_NAME)) + num_units = len(juju.status().get_units(app_name)) assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." 
number_of_replicas = await get_number_connected_replicas( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) @@ -136,7 +142,7 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ c_writes.update() await assert_continuous_writes_increasing( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) @@ -145,7 +151,7 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ logger.info(await c_writes.async_stop()) assert_continuous_writes_consistent( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) @@ -156,17 +162,18 @@ async def test_scale_down_multiple_units( juju: jubilant.Juju, substrate: Substrate, c_writes ) -> None: """Make sure multiple scale down operations complete successfully.""" - number_current_units = len(juju.status().apps[APP_NAME].units) - juju.add_unit(APP_NAME, num_units=(NUM_UNITS + 1) - number_current_units) + app_name = existing_app(juju) or APP_NAME + number_current_units = len(juju.status().apps[app_name].units) + juju.add_unit(app_name, num_units=(NUM_UNITS + 1) - number_current_units) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, idle_period=10, unit_count=NUM_UNITS + 1 + status, app_name, idle_period=10, unit_count=NUM_UNITS + 1 ), timeout=1200, ) number_of_replicas = await get_number_connected_replicas( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) @@ 
-179,18 +186,18 @@ async def test_scale_down_multiple_units( await asyncio.sleep(10) # let the continuous writes write some data # scale down multiple units - remove_number_units(juju, APP_NAME, num_units=2, substrate=substrate) + remove_number_units(juju, app_name, num_units=2, substrate=substrate) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 + status, app_name, unit_count=NUM_UNITS - 1, idle_period=10 ) ) - num_units = len(juju.status().get_units(APP_NAME)) + num_units = len(juju.status().get_units(app_name)) assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." number_of_replicas = await get_number_connected_replicas( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) @@ -201,7 +208,7 @@ async def test_scale_down_multiple_units( c_writes.update() await assert_continuous_writes_increasing( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) @@ -210,34 +217,35 @@ async def test_scale_down_multiple_units( logger.info(await c_writes.async_stop()) assert_continuous_writes_consistent( - hostnames=get_cluster_hostnames(juju, APP_NAME), + hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) await c_writes.async_clear() -async def test_scale_down_to_zero_and_back( +async def test_scale_down_to_zero_and_back_up( juju: jubilant.Juju, substrate: Substrate, c_writes ) -> None: """Make sure that removing all units and then adding them again works.""" + app_name = existing_app(juju) or APP_NAME # remove all remaining units remove_number_units( - juju, APP_NAME, 
num_units=len(juju.status().apps[APP_NAME].units), substrate=substrate + juju, app_name, num_units=len(juju.status().apps[app_name].units), substrate=substrate ) - juju.wait(lambda status: len(juju.status().get_units(APP_NAME)) == 0) + juju.wait(lambda status: len(juju.status().get_units(app_name)) == 0) # scale up again - juju.add_unit(APP_NAME, num_units=NUM_UNITS) + juju.add_unit(app_name, num_units=NUM_UNITS) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, unit_count=NUM_UNITS, idle_period=10 + status, app_name, unit_count=NUM_UNITS, idle_period=10 ), timeout=1200, ) - hostnames = get_cluster_hostnames(juju, APP_NAME) + hostnames = get_cluster_hostnames(juju, app_name) connected_replicas = await get_number_connected_replicas( hostnames=hostnames, @@ -270,31 +278,35 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w if substrate == Substrate.K8S: pytest.skip("Primary unit can only targeted on VM") + app_name = existing_app(juju) or APP_NAME + await c_writes.async_clear() c_writes.start() - primary_ip = get_primary_ip(juju, APP_NAME) + primary_endpoint = get_primary_ip(juju, app_name) primary_unit = next( unit - for unit, unit_value in juju.status().get_units(APP_NAME).items() - if unit_value.public_address == primary_ip + for unit, unit_value in juju.status().get_units(app_name).items() + if unit_value.public_address == primary_endpoint ) assert primary_unit is not None, "Failed to identify primary unit for scale down test." 
logger.info( - "Identified primary unit %s with IP %s for scale down test.", + "Identified primary unit %s with endpoint %s for scale down test.", primary_unit, - primary_ip, + primary_endpoint, ) juju.remove_unit(primary_unit) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, APP_NAME, unit_count=NUM_UNITS - 1, idle_period=10 + status, app_name, unit_count=NUM_UNITS - 1, idle_period=10 ) ) c_writes.update() - new_primary_ip = get_primary_ip(juju, APP_NAME) - assert new_primary_ip != primary_ip, "Primary IP did not change after removing primary unit." - logger.info(f"New primary IP after scale down is {new_primary_ip}.") - hostnames = get_cluster_hostnames(juju, APP_NAME) + new_primary_endpoint = get_primary_ip(juju, app_name) + assert new_primary_endpoint != primary_endpoint, ( + "Primary endpoint did not change after removing primary unit." + ) + logger.info(f"New primary endpoint after scale down is {new_primary_endpoint}.") + hostnames = get_cluster_hostnames(juju, app_name) await assert_continuous_writes_increasing( hostnames=hostnames, username=CharmUsers.VALKEY_ADMIN.value, @@ -312,10 +324,11 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w def test_scale_down_remove_application(juju: jubilant.Juju) -> None: """Make sure the application can be removed.""" - juju.remove_application(APP_NAME) + app_name = existing_app(juju) or APP_NAME + juju.remove_application(app_name) juju.wait( - lambda status: APP_NAME not in status.apps, + lambda status: app_name not in status.apps, timeout=600, delay=5, ) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index c13d6a6..d62cba5 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -678,3 +678,16 @@ def get_data_bag( else {} ) return {unit_name: local_data} | remote_data + + +def existing_app(juju: jubilant.Juju) -> str | None: + """Return the name of an existing valkey cluster. 
+ + Returns: + str | None: name of an application deployment for `valkey` if it exists, None otherwise. + """ + for app_name, app_status in juju.status().apps.items(): + if "valkey" == app_status.charm_name: + return app_name + + return None From 4dc634038cf78353540daf13dd19377ed620e177 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Mar 2026 09:47:49 +0000 Subject: [PATCH 154/159] add support for existing app in scale tests --- tests/integration/ha/test_scaling.py | 84 +++++++++++++++++++--------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 1aff2d0..5574b83 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -66,15 +66,17 @@ async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: c_writes.start() # scale up - juju.add_unit(app_name, num_units=NUM_UNITS - init_units_count) + juju.add_unit(app_name, num_units=2) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, app_name, idle_period=10, unit_count=NUM_UNITS + status, app_name, idle_period=10, unit_count=init_units_count + 2 ), timeout=1200, ) num_units = len(juju.status().apps[app_name].units) - assert num_units == NUM_UNITS, f"Expected {NUM_UNITS} units, got {num_units}." + assert num_units == init_units_count + 2, ( + f"Expected {init_units_count + 2} units, got {num_units}." + ) # check if all units have been added to the cluster hostnames = get_cluster_hostnames(juju, app_name) @@ -84,8 +86,8 @@ async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert connected_replicas == NUM_UNITS - 1, ( - f"Expected {NUM_UNITS - 1} connected replicas, got {connected_replicas}." + assert connected_replicas == init_units_count + 1, ( + f"Expected {init_units_count + 1} connected replicas, got {connected_replicas}." 
) await assert_continuous_writes_increasing( @@ -106,13 +108,25 @@ async def test_scale_up(juju: jubilant.Juju, c_writes) -> None: async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_writes) -> None: """Make sure scale down operations complete successfully.""" app_name = existing_app(juju) or APP_NAME + init_units_count = len(juju.status().apps[app_name].units) + + if init_units_count < 1: + juju.add_unit(app_name, num_units=NUM_UNITS - init_units_count) + init_units_count = NUM_UNITS + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, app_name, idle_period=10, unit_count=init_units_count + ), + timeout=1200, + ) + number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_replicas == NUM_UNITS - 1, ( - f"Expected {NUM_UNITS - 1} connected replicas, got {number_of_replicas}." + assert number_of_replicas == init_units_count - 1, ( + f"Expected {init_units_count - 1} connected replicas, got {number_of_replicas}." ) await c_writes.async_clear() @@ -123,19 +137,21 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ remove_number_units(juju, app_name, num_units=1, substrate=substrate) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, app_name, unit_count=NUM_UNITS - 1, idle_period=10 + status, app_name, unit_count=init_units_count - 1, idle_period=10 ) ) num_units = len(juju.status().get_units(app_name)) - assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." + assert num_units == init_units_count - 1, ( + f"Expected {init_units_count - 1} units, got {num_units}." 
+ ) number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_replicas == NUM_UNITS - 2, ( - f"Expected {NUM_UNITS - 2} connected replicas, got {number_of_replicas}." + assert number_of_replicas == init_units_count - 2, ( + f"Expected {init_units_count - 2} connected replicas, got {number_of_replicas}." ) # update hostnames after scale down @@ -147,7 +163,7 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - logger.info("Stopping continuous writes after scale up test.") + logger.info("Stopping continuous writes after scale down test.") logger.info(await c_writes.async_stop()) assert_continuous_writes_consistent( @@ -163,22 +179,24 @@ async def test_scale_down_multiple_units( ) -> None: """Make sure multiple scale down operations complete successfully.""" app_name = existing_app(juju) or APP_NAME - number_current_units = len(juju.status().apps[app_name].units) - juju.add_unit(app_name, num_units=(NUM_UNITS + 1) - number_current_units) - juju.wait( - lambda status: are_apps_active_and_agents_idle( - status, app_name, idle_period=10, unit_count=NUM_UNITS + 1 - ), - timeout=1200, - ) + init_units_count = len(juju.status().apps[app_name].units) + if init_units_count < NUM_UNITS + 1: + juju.add_unit(app_name, num_units=(NUM_UNITS + 1) - init_units_count) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, app_name, idle_period=10, unit_count=NUM_UNITS + 1 + ), + timeout=1200, + ) + init_units_count = NUM_UNITS + 1 number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_replicas == NUM_UNITS, ( - f"Expected {NUM_UNITS} 
connected replicas, got {number_of_replicas}." + assert number_of_replicas == init_units_count - 1, ( + f"Expected {init_units_count - 1} connected replicas, got {number_of_replicas}." ) await c_writes.async_clear() @@ -190,19 +208,21 @@ async def test_scale_down_multiple_units( juju.wait( lambda status: are_apps_active_and_agents_idle( - status, app_name, unit_count=NUM_UNITS - 1, idle_period=10 + status, app_name, unit_count=init_units_count - 2, idle_period=10 ) ) num_units = len(juju.status().get_units(app_name)) - assert num_units == NUM_UNITS - 1, f"Expected {NUM_UNITS - 1} units, got {num_units}." + assert num_units == init_units_count - 2, ( + f"Expected {init_units_count - 2} units, got {num_units}." + ) number_of_replicas = await get_number_connected_replicas( hostnames=get_cluster_hostnames(juju, app_name), username=CharmUsers.VALKEY_ADMIN.value, password=get_password(juju, user=CharmUsers.VALKEY_ADMIN), ) - assert number_of_replicas == NUM_UNITS - 2, ( - f"Expected {NUM_UNITS - 2} connected replicas, got {number_of_replicas}." + assert number_of_replicas == init_units_count - 3, ( + f"Expected {init_units_count - 3} connected replicas, got {number_of_replicas}." 
) c_writes.update() @@ -279,6 +299,16 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w pytest.skip("Primary unit can only targeted on VM") app_name = existing_app(juju) or APP_NAME + init_units_count = len(juju.status().apps[app_name].units) + if init_units_count < NUM_UNITS: + juju.add_unit(app_name, num_units=NUM_UNITS - init_units_count) + juju.wait( + lambda status: are_apps_active_and_agents_idle( + status, app_name, idle_period=10, unit_count=NUM_UNITS + ), + timeout=1200, + ) + init_units_count = NUM_UNITS await c_writes.async_clear() c_writes.start() @@ -297,7 +327,7 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w juju.remove_unit(primary_unit) juju.wait( lambda status: are_apps_active_and_agents_idle( - status, app_name, unit_count=NUM_UNITS - 1, idle_period=10 + status, app_name, unit_count=init_units_count - 1, idle_period=10 ) ) c_writes.update() From 4e399a874ec896d756633f2bbc1ad8676c2ef603 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Tue, 17 Mar 2026 10:04:07 +0000 Subject: [PATCH 155/159] patch is_failover_in_progress --- tests/unit/test_scaledown.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_scaledown.py b/tests/unit/test_scaledown.py index d4cd7cc..81f472b 100644 --- a/tests/unit/test_scaledown.py +++ b/tests/unit/test_scaledown.py @@ -133,7 +133,9 @@ def test_primary(cloud_spec): patch("managers.sentinel.SentinelManager.get_primary_ip", return_value="valkey-0"), patch("workload_k8s.ValkeyK8sWorkload.stop") as mock_stop, patch("common.client.SentinelClient.failover_primary_coordinated") as mock_failover, - patch("common.client.SentinelClient.is_failover_in_progress") as mock_failover_in_progress, + patch( + "common.client.SentinelClient.is_failover_in_progress", return_value=False + ) as mock_failover_in_progress, patch("common.client.SentinelClient.reset") as mock_reset, patch( "common.client.SentinelClient.sentinels_primary", From 
a931e7ca9b258b647a8f545d759736dbf57d638c Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Mar 2026 08:26:45 +0000 Subject: [PATCH 156/159] only remove APP_NAME in tests --- tests/integration/ha/test_scaling.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 5574b83..5531fa6 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -354,11 +354,10 @@ async def test_scale_down_primary(juju: jubilant.Juju, substrate: Substrate, c_w def test_scale_down_remove_application(juju: jubilant.Juju) -> None: """Make sure the application can be removed.""" - app_name = existing_app(juju) or APP_NAME - juju.remove_application(app_name) + juju.remove_application(APP_NAME) juju.wait( - lambda status: app_name not in status.apps, + lambda status: APP_NAME not in status.apps, timeout=600, delay=5, ) From d0aeff68e3ecf90503ab2050d38e890502154354 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Mar 2026 08:30:16 +0000 Subject: [PATCH 157/159] minor feedback --- src/common/locks.py | 2 ++ tests/integration/ha/test_scaling.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/common/locks.py b/src/common/locks.py index b1593e8..02e87e8 100644 --- a/src/common/locks.py +++ b/src/common/locks.py @@ -248,6 +248,8 @@ def request_lock(self, timeout: int | None = None, primary_ip: str | None = None ) return True + return False + @property def is_held_by_this_unit(self) -> bool: """Check if the local unit holds the lock.""" diff --git a/tests/integration/ha/test_scaling.py b/tests/integration/ha/test_scaling.py index 5531fa6..4121e96 100644 --- a/tests/integration/ha/test_scaling.py +++ b/tests/integration/ha/test_scaling.py @@ -110,7 +110,7 @@ async def test_scale_down_one_unit(juju: jubilant.Juju, substrate: Substrate, c_ app_name = existing_app(juju) or APP_NAME init_units_count = 
len(juju.status().apps[app_name].units) - if init_units_count < 1: + if init_units_count < NUM_UNITS: juju.add_unit(app_name, num_units=NUM_UNITS - init_units_count) init_units_count = NUM_UNITS juju.wait( From 449bce48602058c971a3898cb5925b8915a0c861 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Mar 2026 08:48:22 +0000 Subject: [PATCH 158/159] fix unit tests --- tests/unit/test_charm.py | 11 +++++++++-- tests/unit/test_tls.py | 6 ++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index 6e5374d..5e18ef6 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -13,6 +13,7 @@ INTERNAL_USERS_PASSWORD_CONFIG, INTERNAL_USERS_SECRET_LABEL_SUFFIX, PEER_RELATION, + PRIMARY_NAME, STATUS_PEERS_RELATION, CharmUsers, StartState, @@ -344,7 +345,10 @@ def test_update_status_leader_unit(cloud_spec): containers={container}, ) - with patch("managers.tls.TLSManager.will_certificate_expire"): + with ( + patch("managers.tls.TLSManager.will_certificate_expire"), + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), + ): state_out = ctx.run(ctx.on.update_status(), state_in) assert state_out.unit_status == ActiveStatus() @@ -363,7 +367,10 @@ def test_update_status_non_leader_unit(cloud_spec): relations={relation, status_peer_relation}, containers={container}, ) - with patch("managers.tls.TLSManager.will_certificate_expire"): + with ( + patch("managers.tls.TLSManager.will_certificate_expire"), + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), + ): state_out = ctx.run(ctx.on.update_status(), state_in) assert state_out.unit_status == ActiveStatus() diff --git a/tests/unit/test_tls.py b/tests/unit/test_tls.py index 2f49a08..6b7296d 100644 --- a/tests/unit/test_tls.py +++ b/tests/unit/test_tls.py @@ -698,6 +698,7 @@ def test_internal_peer_ca_rotation_single_unit(cloud_spec): patch("managers.tls.TLSManager.rehash_ca_certificates"), 
patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.sentinel.SentinelManager.restart_service"), + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), ): state_out = ctx.run(ctx.on.relation_changed(peer_relation, remote_unit=1), state_in) @@ -739,6 +740,7 @@ def test_internal_peer_ca_rotation_started(cloud_spec): patch("managers.tls.TLSManager.rehash_ca_certificates"), patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.sentinel.SentinelManager.restart_service"), + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), ): state_out = ctx.run(ctx.on.relation_changed(peer_relation, remote_unit=1), state_in) @@ -783,6 +785,7 @@ def test_ca_rotation_not_all_units_added(cloud_spec): ) with ( patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), ): state_out = ctx.run(ctx.on.relation_changed(peer_relation), state_in) @@ -828,6 +831,7 @@ def test_ca_rotation_all_units_added(cloud_spec): with ( patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.sentinel.SentinelManager.restart_service"), + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), ): state_out = ctx.run(ctx.on.relation_changed(peer_relation), state_in) @@ -872,6 +876,7 @@ def test_ca_rotation_not_all_units_ca_updated(cloud_spec): ) with ( patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, + patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), ): state_out = ctx.run(ctx.on.relation_changed(peer_relation), state_in) @@ -918,6 +923,7 @@ def test_ca_rotation_all_units_ca_updated(cloud_spec): patch("managers.cluster.ClusterManager.reload_tls_settings") as reload_tls, patch("managers.sentinel.SentinelManager.restart_service"), patch("managers.tls.TLSManager.rehash_ca_certificates"), 
+ patch("common.client.SentinelClient.primary", return_value={"quorum": "1"}), ): state_out = ctx.run(ctx.on.relation_changed(peer_relation), state_in) From 0358c3b3b9862b1df3d968f2fe4726a938319244 Mon Sep 17 00:00:00 2001 From: Smail Kourta Date: Wed, 18 Mar 2026 21:31:13 +0000 Subject: [PATCH 159/159] increase idle times to stabilize tests --- tests/integration/test_charm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index a89ab1e..5f27728 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -84,7 +84,7 @@ async def test_update_admin_password(juju: jubilant.Juju) -> None: # wait for config-changed hook to finish executing juju.wait( - lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=1200, ) @@ -164,7 +164,7 @@ async def test_update_admin_password_wrong_username(juju: jubilant.Juju) -> None set_password(juju, username=CharmUsers.VALKEY_ADMIN.value, password=new_password) # wait for config-changed hook to finish executing juju.wait( - lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=1200, ) @@ -224,7 +224,7 @@ async def test_user_secret_permissions(juju: jubilant.Juju) -> None: with fast_forward(juju): juju.grant_secret(identifier=secret_name, app=APP_NAME) juju.wait( - lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=10), + lambda status: are_apps_active_and_agents_idle(status, APP_NAME, idle_period=30), timeout=1200, )