Skip to content

Commit

Permalink
[generic-config-updater] Handle failed service restarts (sonic-net#2020)
Browse files Browse the repository at this point in the history
What I did
During config update, update of certain tables do demand service restart.
With multiple related updates are not grouped together, this might result in too many service restarts, which could fail with "hitting start limit". When that happens, call reset-failed, try to restart. If it fails again, take a pause and try to restart again.

How I did it
When service restart fails, call reset-failed, try, pause and then call service restart again.
  • Loading branch information
renukamanavalan authored Jan 20, 2022
1 parent 48b5e73 commit 4f2773c
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 10 deletions.
35 changes: 32 additions & 3 deletions generic_config_updater/services_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,42 @@ def set_verbose(verbose=False):

def _service_restart(svc_name):
rc = os.system(f"systemctl restart {svc_name}")
logger.log(logger.LOG_PRIORITY_NOTICE,
f"Restarted {svc_name}", print_to_console)
if rc != 0:
# This failure is likely due to too many restarts
#
rc = os.system(f"systemctl reset-failed {svc_name}")
logger.log(logger.LOG_PRIORITY_ERROR,
f"Service has been reset. rc={rc}; Try restart again...",
print_to_console)

rc = os.system(f"systemctl restart {svc_name}")
if rc != 0:
# Even with reset-failed, restart fails.
# Give a pause before retry.
#
logger.log(logger.LOG_PRIORITY_ERROR,
f"Restart failed for {svc_name} rc={rc} after reset; Pause for 10s & retry",
print_to_console)
os.system("sleep 10s")
rc = os.system(f"systemctl restart {svc_name}")

if rc == 0:
logger.log(logger.LOG_PRIORITY_NOTICE,
f"Restart succeeded for {svc_name}",
print_to_console)
else:
logger.log(logger.LOG_PRIORITY_ERROR,
f"Restart failed for {svc_name} rc={rc}",
print_to_console)
return rc == 0


def rsyslog_validator(old_config, upd_config, keys):
return _service_restart("rsyslog-config")
rc = os.system("/usr/bin/rsyslog-config.sh")
if rc != 0:
return _service_restart("rsyslog")
else:
return True


def dhcp_validator(old_config, upd_config, keys):
Expand Down
43 changes: 36 additions & 7 deletions tests/generic_config_updater/service_validator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,25 @@
from collections import defaultdict
from unittest.mock import patch

from generic_config_updater.services_validator import vlan_validator
from generic_config_updater.services_validator import vlan_validator, rsyslog_validator
import generic_config_updater.gu_common


# Mimics os.system call
#
os_system_expected_cmd = ""
os_system_calls = []
os_system_call_index = 0
msg = ""

def os_system_cfggen(cmd):
assert cmd == os_system_expected_cmd, msg
return 0
def mock_os_system_call(cmd):
global os_system_calls, os_system_call_index

assert os_system_call_index < len(os_system_calls)
entry = os_system_calls[os_system_call_index]
os_system_call_index += 1

assert cmd == entry["cmd"], msg
return entry["rc"]


test_data = [
Expand Down Expand Up @@ -53,19 +60,41 @@ def os_system_cfggen(cmd):
}
]

test_rsyslog_fail = [
# Fail the calls, to get the entire fail path calls invoked
#
{ "cmd": "/usr/bin/rsyslog-config.sh", "rc": 1 }, # config update; fails
{ "cmd": "systemctl restart rsyslog", "rc": 1 }, # rsyslog restart; fails
{ "cmd": "systemctl reset-failed rsyslog", "rc": 1 }, # reset; failure here just logs
{ "cmd": "systemctl restart rsyslog", "rc": 1 }, # restart again; fails
{ "cmd": "sleep 10s", "rc": 0 }, # sleep; rc ignored
{ "cmd": "systemctl restart rsyslog", "rc": 1 }, # restart again; fails
]


class TestServiceValidator(unittest.TestCase):

@patch("generic_config_updater.change_applier.os.system")
def test_change_apply(self, mock_os_sys):
global os_system_expected_cmd
global os_system_calls, os_system_call_index

mock_os_sys.side_effect = os_system_cfggen
mock_os_sys.side_effect = mock_os_system_call

i = 0
for entry in test_data:
os_system_expected_cmd = entry["cmd"]
if entry["cmd"]:
os_system_calls.append({"cmd": entry["cmd"], "rc": 0 })
msg = "case failed: {}".format(str(entry))

vlan_validator(entry["old"], entry["upd"], None)


# Test failure case
#
os_system_calls = test_rsyslog_fail
os_system_call_index = 0

rc = rsyslog_validator("", "", "")
assert not rc, "rsyslog_validator expected to fail"

0 comments on commit 4f2773c

Please sign in to comment.