Skip to content

Commit 4f0bab6

Browse files
[DPE-8050] Backup action cluster checks (#679)
1 parent f122090 commit 4f0bab6

File tree

5 files changed

+134
-79
lines changed

5 files changed

+134
-79
lines changed

actions.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,13 @@ set-tls-private-key:
4545
create-backup:
4646
description: Create a database backup using xtrabackup.
4747
S3 credentials are retrieved from a relation with the S3 integrator charm.
48+
params:
49+
force:
50+
type: boolean
51+
default: False
52+
description: |
53+
Whether to ignore cluster health concerns and create the backup regardless.
54+
Use it with caution, as it can potentially create a backup from stale data.
4855
4956
list-backups:
5057
description: List available backup_ids in the S3 bucket and path provided by the S3 integrator charm.

docs/how-to/back-up-and-restore/create-a-backup.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,18 @@ Once `juju status` shows Charmed MySQL as `active` and `idle`, you can create yo
1515
juju run mysql/leader create-backup
1616
```
1717

18-
If you have a cluster of one unit, you can run the `create-backup` action on `mysql-k8s/leader` (which will also be the primary unit).
18+
If you have a cluster of one unit, you can run the `create-backup` action on the leader (which will also be the primary unit).
19+
Otherwise, you must run the `create-backup` action on a non-primary unit. To find the primary, see `juju status` or
20+
run `juju run mysql/leader get-cluster-status`.
1921

20-
Otherwise, you must run the `create-backup` action on a non-primary unit. To find the primary, see `juju status` or run `juju run mysql-k8s/leader get-cluster-status` to find the primary unit).
22+
The `create-backup` action validates that both the cluster and the unit in charge of the backup are healthy by:
23+
- Checking that the MySQL cluster is in a valid state (`OK` or `OK_PARTIAL` from the InnoDB [cluster status](https://dev.mysql.com/doc/mysql-shell/8.0/en/monitoring-innodb-cluster.html))
24+
- Checking that the MySQL instance is in a valid state (`ONLINE` from the Group Replication [member states](https://dev.mysql.com/doc/refman/8.0/en/group-replication-server-states.html))
25+
26+
In order to override these precautions, use the `force` flag:
27+
```shell
28+
juju run mysql/leader create-backup force=True
29+
```
2130

2231
## List backups
2332
You can list your available, failed, and in progress backups by running the `list-backups` command:

lib/charms/mysql/v0/backups.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def is_unit_blocked(self) -> bool:
5757
S3Requirer,
5858
)
5959
from charms.mysql.v0.mysql import (
60+
MySQLClusterState,
6061
MySQLConfigureInstanceError,
6162
MySQLCreateClusterError,
6263
MySQLCreateClusterSetError,
@@ -66,6 +67,7 @@ def is_unit_blocked(self) -> bool:
6667
MySQLExecuteBackupCommandsError,
6768
MySQLInitializeJujuOperationsTableError,
6869
MySQLKillSessionError,
70+
MySQLMemberState,
6971
MySQLNoMemberStateError,
7072
MySQLOfflineModeAndHiddenInstanceExistsError,
7173
MySQLPrepareBackupForRestoreError,
@@ -111,7 +113,7 @@ def is_unit_blocked(self) -> bool:
111113

112114
# Increment this PATCH version before using `charmcraft publish-lib` or reset
113115
# to 0 if you are raising the major API version
114-
LIBPATCH = 16
116+
LIBPATCH = 17
115117

116118
ANOTHER_S3_CLUSTER_REPOSITORY_ERROR_MESSAGE = "S3 repository claimed by another cluster"
117119
MOVE_RESTORED_CLUSTER_TO_ANOTHER_S3_REPOSITORY_ERROR = (
@@ -280,6 +282,7 @@ def _pre_create_backup_checks(self, event: ActionEvent) -> bool:
280282
def _on_create_backup(self, event: ActionEvent) -> None:
281283
"""Handle the create backup action."""
282284
logger.info("A backup has been requested on unit")
285+
force = event.params.get("force", False)
283286

284287
if not self._pre_create_backup_checks(event):
285288
return
@@ -295,9 +298,16 @@ def _on_create_backup(self, event: ActionEvent) -> None:
295298

296299
backup_path = str(pathlib.Path(s3_parameters["path"]) / datetime_backup_requested)
297300

301+
# Check if this cluster can perform backup
302+
can_cluster_perform_backup, validation_message = self._can_cluster_perform_backup()
303+
if not (can_cluster_perform_backup or force):
304+
logger.error(f"Backup failed: {validation_message}")
305+
event.fail(validation_message or "")
306+
return
307+
298308
# Check if this unit can perform backup
299309
can_unit_perform_backup, validation_message = self._can_unit_perform_backup()
300-
if not can_unit_perform_backup:
310+
if not (can_unit_perform_backup or force):
301311
logger.error(f"Backup failed: {validation_message}")
302312
event.fail(validation_message or "")
303313
return
@@ -355,6 +365,21 @@ def _on_create_backup(self, event: ActionEvent) -> None:
355365
})
356366
self.charm._on_update_status(None)
357367

368+
def _can_cluster_perform_backup(self) -> tuple[bool, str | None]:
369+
"""Validates whether this cluster can perform a backup.
370+
371+
Returns: tuple of (success, error_message)
372+
"""
373+
cluster_status = self.charm._mysql.get_cluster_status()
374+
if not cluster_status:
375+
return False, "Cluster status unknown"
376+
377+
cluster_status = cluster_status["defaultreplicaset"]["status"]
378+
if cluster_status not in [MySQLClusterState.OK, MySQLClusterState.OK_PARTIAL]:
379+
return False, "Cluster is not in a healthy state"
380+
381+
return True, None
382+
358383
def _can_unit_perform_backup(self) -> tuple[bool, str | None]:
359384
"""Validates whether this unit can perform a backup.
360385
@@ -381,7 +406,7 @@ def _can_unit_perform_backup(self) -> tuple[bool, str | None]:
381406
if role == "primary" and self.charm.app.planned_units() > 1:
382407
return False, "Unit cannot perform backups as it is the cluster primary"
383408

384-
if state in ["recovering", "offline", "error"]:
409+
if state not in [MySQLMemberState.ONLINE]:
385410
return False, f"Unit cannot perform backups as its state is {state}"
386411

387412
return True, None

lib/charms/mysql/v0/mysql.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def wait_until_mysql_connection(self) -> None:
127127
# Increment this major API version when introducing breaking changes
128128
LIBAPI = 0
129129

130-
LIBPATCH = 93
130+
LIBPATCH = 94
131131

132132
UNIT_TEARDOWN_LOCKNAME = "unit-teardown"
133133
UNIT_ADD_LOCKNAME = "unit-add"
@@ -916,11 +916,7 @@ def get_cluster_endpoints(self, relation_name: str) -> tuple[str, str, str]:
916916

917917
return ",".join(rw_endpoints), ",".join(ro_endpoints), ",".join(no_endpoints)
918918

919-
def get_secret(
920-
self,
921-
scope: Scopes,
922-
key: str,
923-
) -> str | None:
919+
def get_secret(self, scope: Scopes, key: str) -> str | None:
924920
"""Get secret from the secret storage.
925921
926922
Retrieve secret from juju secrets backend if secret exists there.
@@ -1012,7 +1008,18 @@ class MySQLMemberState(str, enum.Enum):
10121008
class MySQLClusterState(str, enum.Enum):
10131009
"""MySQL Cluster state."""
10141010

1011+
# TODO: python 3.11 has new enum.StrEnum
1012+
# that can remove str inheritance
1013+
10151014
OK = "ok"
1015+
OK_PARTIAL = "ok_partial"
1016+
OK_NO_TOLERANCE = "ok_no_tolerance"
1017+
OK_NO_TOLERANCE_PARTIAL = "ok_no_tolerance_partial"
1018+
NO_QUORUM = "no_quorum"
1019+
OFFLINE = "offline"
1020+
ERROR = "error"
1021+
UNREACHABLE = "unreachable"
1022+
UNKNOWN = "unknown"
10161023
FENCED = "fenced_writes"
10171024

10181025

0 commit comments

Comments
 (0)