Skip to content

Commit

Permalink
Adding ceph health check bypass in sat bootsys ncn-power
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1787
Reviewer:Ryan
Add a Ceph health check bypass prompt so the user can decide whether to wait
for the health check or skip it after Ceph has been unfrozen.
The health check may take some time, and the next steps may not explicitly
require Ceph to be healthy; it should be healthy by the time it is needed.
  • Loading branch information
Shivaprasad Ashok Metimath committed Jul 15, 2024
1 parent 0e03fd5 commit 693c00a
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 8 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
ncn power stage
- If containers fail to stop, automate the procedure of trying to stop them again
in the `platform-services` stage.
- Added a Ceph health check bypass prompt that asks the user whether to skip the check.
Ceph is still unfrozen; only the wait for Ceph to become healthy is skipped if the user chooses to.

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
Expand Down
50 changes: 42 additions & 8 deletions sat/cli/bootsys/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,34 @@ def do_ceph_freeze():
raise FatalPlatformError(str(err))


def prompt_user_for_health_check():
    """Prompt the user to decide if they want to skip the Ceph health check.

    The question is repeated until a recognized answer is given. Answers are
    compared case-insensitively with surrounding whitespace removed; 'yes'/'y'
    and 'no'/'n' are accepted.

    If standard input is closed or exhausted (input() raises EOFError), no
    answer can be obtained, so the health check is not skipped.

    Returns:
        bool: True if the user wants to skip the health check, False otherwise.
    """
    question = "Do you want to skip the Ceph health check after unfreezing? (yes/no): "
    recognized = {'yes': True, 'y': True, 'no': False, 'n': False}

    while True:
        try:
            response = input(question)
        except EOFError:
            # Without an interactive answer, take the conservative path and
            # let the caller run the health check instead of crashing here.
            print("No input available. The Ceph health check will not be skipped.")
            return False

        skip = recognized.get(response.strip().lower())
        if skip is not None:
            return skip
        print("Invalid response. Please enter 'yes' or 'no'.")


def do_ceph_unfreeze(ncn_groups):
"""Start inactive Ceph services, unfreeze Ceph and wait for it to be healthy.
Expand All @@ -551,14 +579,20 @@ def do_ceph_unfreeze(ncn_groups):
except RuntimeError as err:
raise FatalPlatformError(str(err))

with BeginEndLogger('wait for ceph health'):
ceph_timeout = get_config_value('bootsys.ceph_timeout')
LOGGER.info(f'Waiting up to {ceph_timeout} seconds for Ceph to become healthy after unfreeze')
ceph_waiter = CephHealthWaiter(ceph_timeout, storage_hosts, retries=1)
if not ceph_waiter.wait_for_completion():
raise FatalPlatformError(f'Ceph is not healthy. Please correct Ceph health and try again.')
else:
LOGGER.info('Ceph is healthy.')
# Prompt the user to decide if they want to skip the Ceph health check
skip_health_check = prompt_user_for_health_check()

if skip_health_check:
LOGGER.info("Skipping Ceph health check as per user's request.")
else:
with BeginEndLogger('wait for ceph health'):
ceph_timeout = get_config_value('bootsys.ceph_timeout')
LOGGER.info(f'Waiting up to {ceph_timeout} seconds for Ceph to become healthy after unfreeze')
ceph_waiter = CephHealthWaiter(ceph_timeout, storage_hosts, retries=1)
if not ceph_waiter.wait_for_completion():
raise FatalPlatformError(f'Ceph is not healthy. Please correct Ceph health and try again.')
else:
LOGGER.info('Ceph is healthy.')


def do_etcd_snapshot(ncn_groups):
Expand Down

0 comments on commit 693c00a

Please sign in to comment.