[sanity_checks]: add critical process check in sanity checks (#1617)
- first read the critical process list (/etc/supervisor/critical_processes)
from the container, then check whether any of those processes have crashed.

- add snmp container to the critical service list

- add auto_recover support

Signed-off-by: Guohan Lu <gulv@microsoft.com>

* add comment

Signed-off-by: Guohan Lu <gulv@microsoft.com>

* add auto_recover support if process check failed
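
For reference, a minimal standalone sketch of the flow described above: read the critical process list from the container, then flag any critical process that supervisord does not report as RUNNING. This is illustrative only; it shells out with subprocess instead of the ansible-backed helpers the actual change adds below, and the container name in the usage comment is just an example.

import re
import subprocess

def container_critical_process_check(container):
    # Read the critical process list shipped inside the container.
    cat = subprocess.run(
        ["docker", "exec", container, "cat", "/etc/supervisor/critical_processes"],
        capture_output=True, text=True)
    critical = [p for p in cat.stdout.split() if p]
    if not critical:
        return True, []

    # Ask supervisord for per-process state; flag critical ones not RUNNING.
    status = subprocess.run(
        ["docker", "exec", container, "supervisorctl", "status"],
        capture_output=True, text=True)
    exited = []
    for line in status.stdout.splitlines():
        fields = re.split(r"\s+", line.strip(), maxsplit=2)
        if len(fields) >= 2 and fields[0] in critical and fields[1] != "RUNNING":
            exited.append(fields[0])
    return len(exited) == 0, exited

# Example usage: ok, exited = container_critical_process_check("snmp")
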
lguohan authored Apr 26, 2020
1 parent 9a0e00f commit 2e8d641
Showing 4 changed files with 102 additions and 8 deletions.
62 changes: 56 additions & 6 deletions tests/common/devices.py
@@ -10,6 +10,7 @@
import json
import logging
import os
import re
from multiprocessing.pool import ThreadPool
from datetime import datetime

@@ -91,7 +92,7 @@ class SonicHost(AnsibleHostBase):
For running ansible module on the SONiC switch
"""
CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp"]
CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp", "snmp"]

def __init__(self, ansible_adhoc, hostname, gather_facts=False):
AnsibleHostBase.__init__(self, ansible_adhoc, hostname)
@@ -214,6 +215,55 @@ def critical_services_fully_started(self):
logging.debug("Status of critical services: %s" % str(result))
return all(result.values())

def critical_process_status(self, service):
    """
    @summary: Check the status of the critical processes of a service.
    @param service: Name of the SONiC service
    """
    result = {'status': True}
    result['exited_critical_process'] = []
    result['running_critical_process'] = []
    critical_process_list = []

    # return a False status if the service is not started
    service_status = self.is_service_fully_started(service)
    if service_status == False:
        result['status'] = False
        return result

    # get critical process list for the service
    output = self.command("docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service), module_ignore_errors=True)
    for l in output['stdout'].split():
        critical_process_list.append(l.rstrip())
    if len(critical_process_list) == 0:
        return result

    # get process status for the service
    output = self.command("docker exec {} supervisorctl status".format(service))
    logging.info("====== supervisor process status for service {} ======".format(service))

    for l in output['stdout_lines']:
        (pname, status, info) = re.split("\s+", l, 2)
        if status != "RUNNING":
            if pname in critical_process_list:
                result['exited_critical_process'].append(pname)
                result['status'] = False
        else:
            if pname in critical_process_list:
                result['running_critical_process'].append(pname)

    return result

def all_critical_process_status(self):
    """
    @summary: Check the status of critical processes for all critical services.
    """
    result = {}
    for service in self.CRITICAL_SERVICES:
        result[service] = self.critical_process_status(service)
    return result

def get_crm_resources(self):
"""
@summary: Run the "crm show resources all" command and parse its output
@@ -257,7 +307,7 @@ def get_pmon_daemon_states(self):

daemons = self.shell('docker exec pmon supervisorctl status')['stdout_lines']

daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ]

daemon_ctl_key_prefix = 'skip_'
daemon_config_file_path = os.path.join('/usr/share/sonic/device', self.facts["platform"], 'pmon_daemon_control.json')
@@ -294,7 +344,7 @@ def num_npus(self):
return the number of NPUs on the DUT
"""
return self.facts["num_npu"]

def get_syncd_docker_names(self):
"""
@summary: get the list of syncd dockers names for the number of NPUs present on the DUT
@@ -454,10 +504,10 @@ def get_fanout_os(self):

def get_fanout_type(self):
return self.type

def shutdown(self, interface_name):
return self.host.shutdown(interface_name)[self.hostname]

def no_shutdown(self, interface_name):
return self.host.no_shutdown(interface_name)[self.hostname]

@@ -466,7 +516,7 @@ def command(self, cmd):

def __str__(self):
return "{ os: '%s', hostname: '%s', device_type: '%s' }" % (self.os, self.hostname, self.type)

def __repr__(self):
return self.__str__()

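A hedged usage sketch for the two helpers added above. Here duthost is assumed to be the SonicHost-backed DUT fixture and "snmp" is just an example service; the result keys mirror the dicts built by critical_process_status().

import logging

def test_critical_processes_example(duthost):
    # Per-service check: the result carries 'status',
    # 'exited_critical_process' and 'running_critical_process'.
    snmp_status = duthost.critical_process_status("snmp")
    if not snmp_status['status']:
        logging.error("snmp critical processes exited: %s",
                      snmp_status['exited_critical_process'])

    # DUT-wide check: one entry per service in CRITICAL_SERVICES.
    all_status = duthost.all_critical_process_status()
    failed = [svc for svc, res in all_status.items()
              if not res['status'] or res['exited_critical_process']]
    assert not failed, "services with crashed critical processes: %s" % failed
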
44 changes: 44 additions & 0 deletions tests/common/plugins/sanity_check/checks.py
@@ -97,6 +97,8 @@ def check_interfaces(dut):
return check_result

def check_dbmemory(dut):
logger.info("Checking database memory...")

total_omem = 0
re_omem = re.compile("omem=(\d+)")
res = dut.command("/usr/bin/redis-cli client list")
@@ -115,6 +117,46 @@ def check_dbmemory(dut):
logger.info("Done checking database memory")
return check_result

def check_processes(dut):
    logger.info("Checking process status...")

    networking_uptime = dut.get_networking_uptime().seconds
    timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0)
    interval = 20
    logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \
        (networking_uptime, timeout, interval))

    check_result = {"failed": False, "check_item": "processes"}
    if timeout == 0:    # Check processes status, do not retry.
        processes_status = dut.all_critical_process_status()
        check_result["processes_status"] = processes_status
        check_result["services_status"] = {}
        for k, v in processes_status.items():
            if v['status'] == False or len(v['exited_critical_process']) > 0:
                check_result['failed'] = True
            check_result["services_status"].update({k: v['status']})
    else:               # Retry checking processes status
        start = time.time()
        elapsed = 0
        while elapsed < timeout:
            check_result["failed"] = False    # reset before every retry
            processes_status = dut.all_critical_process_status()
            check_result["processes_status"] = processes_status
            check_result["services_status"] = {}
            for k, v in processes_status.items():
                if v['status'] == False or len(v['exited_critical_process']) > 0:
                    check_result['failed'] = True
                check_result["services_status"].update({k: v['status']})

            if check_result["failed"]:
                wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
                    (interval, int(timeout - elapsed), str(check_result["processes_status"])))
                elapsed = time.time() - start
            else:
                break

    logger.info("Done checking processes status.")
    return check_result

def do_checks(dut, check_items):
results = []
for item in check_items:
@@ -124,6 +166,8 @@ def do_checks(dut, check_items):
results.append(check_interfaces(dut))
elif item == "dbmemory":
results.append(check_dbmemory(dut))
elif item == "processes":
results.append(check_processes(dut))

return results

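A small illustrative sketch of how the new item could be exercised through do_checks(). The dut object and the import path are assumptions; the result keys mirror check_processes() above.

import logging

from tests.common.plugins.sanity_check.checks import do_checks  # import path is an assumption

def log_failed_sanity_checks(dut):
    # Run the supported items, including the new "processes" check.
    results = do_checks(dut, ["services", "interfaces", "dbmemory", "processes"])
    for res in results:
        if res.get("failed"):
            # For "processes", services_status maps each service to a boolean
            # and processes_status carries the per-process detail.
            logging.error("sanity check '%s' failed: %s", res["check_item"], res)
    return results
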
2 changes: 1 addition & 1 deletion tests/common/plugins/sanity_check/constants.py
@@ -20,4 +20,4 @@
"adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30},
} # All supported recover methods

SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory"] # Supported checks
SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes"] # Supported checks
2 changes: 1 addition & 1 deletion tests/common/plugins/sanity_check/recover.py
@@ -58,7 +58,7 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time):
logging.info("Restoring {}".format(result))
if result['check_item'] == 'interfaces':
__recover_interfaces(dut, fanouthosts, result, wait_time)
elif result['check_item'] == 'services':
elif result['check_item'] in ['services', 'processes']:
action = __recover_services(dut, result)
# Only allow outstanding_action be overridden when it is
# None. In case the outstanding_action has already been