[sanity_checks]: add critical process check in sanity checks (#1617)
- first read the critical process list (/etc/supervisor/critical_processes)
from the container, then check whether any of those processes have crashed.

- add snmp container to the critical service list

- add auto_recover support

Signed-off-by: Guohan Lu <gulv@microsoft.com>

* add comment

Signed-off-by: Guohan Lu <gulv@microsoft.com>

* add auto_recover support if process check failed
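
For reference, a minimal standalone sketch of the flow described above: read the critical process list from the container, then flag any critical process that supervisord does not report as RUNNING. This is illustrative only; it shells out with subprocess instead of the ansible-backed helpers the actual change adds below, and the container name in the usage comment is just an example.

import re
import subprocess

def container_critical_process_check(container):
    # Read the critical process list shipped inside the container.
    cat = subprocess.run(
        ["docker", "exec", container, "cat", "/etc/supervisor/critical_processes"],
        capture_output=True, text=True)
    critical = [p for p in cat.stdout.split() if p]
    if not critical:
        return True, []

    # Ask supervisord for per-process state; flag critical ones not RUNNING.
    status = subprocess.run(
        ["docker", "exec", container, "supervisorctl", "status"],
        capture_output=True, text=True)
    exited = []
    for line in status.stdout.splitlines():
        fields = re.split(r"\s+", line.strip(), maxsplit=2)
        if len(fields) >= 2 and fields[0] in critical and fields[1] != "RUNNING":
            exited.append(fields[0])
    return len(exited) == 0, exited

# Example usage: ok, exited = container_critical_process_check("snmp")
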
lguohan authored Apr 26, 2020
1 parent 9a0e00f commit 2e8d641
Showing 4 changed files with 102 additions and 8 deletions.
62 changes: 56 additions & 6 deletions tests/common/devices.py
@@ -10,6 +10,7 @@
import json
import logging
import os
import re
from multiprocessing.pool import ThreadPool
from datetime import datetime

@@ -91,7 +92,7 @@ class SonicHost(AnsibleHostBase):
For running ansible module on the SONiC switch
"""
CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp"]
CRITICAL_SERVICES = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp", "snmp"]

def __init__(self, ansible_adhoc, hostname, gather_facts=False):
AnsibleHostBase.__init__(self, ansible_adhoc, hostname)
@@ -214,6 +215,55 @@ def critical_services_fully_started(self):
logging.debug("Status of critical services: %s" % str(result))
return all(result.values())

def critical_process_status(self, service):
    """
    @summary: Check the status of the critical processes of a service.
    @param service: Name of the SONiC service
    """
    result = {'status': True}
    result['exited_critical_process'] = []
    result['running_critical_process'] = []
    critical_process_list = []

    # return a False status if the service is not started
    service_status = self.is_service_fully_started(service)
    if service_status == False:
        result['status'] = False
        return result

    # get critical process list for the service
    output = self.command("docker exec {} bash -c '[ -f /etc/supervisor/critical_processes ] && cat /etc/supervisor/critical_processes'".format(service), module_ignore_errors=True)
    for l in output['stdout'].split():
        critical_process_list.append(l.rstrip())
    if len(critical_process_list) == 0:
        return result

    # get process status for the service
    output = self.command("docker exec {} supervisorctl status".format(service))
    logging.info("====== supervisor process status for service {} ======".format(service))

    for l in output['stdout_lines']:
        (pname, status, info) = re.split("\s+", l, 2)
        if status != "RUNNING":
            if pname in critical_process_list:
                result['exited_critical_process'].append(pname)
                result['status'] = False
        else:
            if pname in critical_process_list:
                result['running_critical_process'].append(pname)

    return result

def all_critical_process_status(self):
    """
    @summary: Check the status of critical processes for all critical services.
    """
    result = {}
    for service in self.CRITICAL_SERVICES:
        result[service] = self.critical_process_status(service)
    return result

def get_crm_resources(self):
"""
@summary: Run the "crm show resources all" command and parse its output
@@ -257,7 +307,7 @@ def get_pmon_daemon_states(self):

daemons = self.shell('docker exec pmon supervisorctl status')['stdout_lines']

daemon_list = [ line.strip().split()[0] for line in daemons if len(line.strip()) > 0 ]

daemon_ctl_key_prefix = 'skip_'
daemon_config_file_path = os.path.join('/usr/share/sonic/device', self.facts["platform"], 'pmon_daemon_control.json')
@@ -294,7 +344,7 @@ def num_npus(self):
return the number of NPUs on the DUT
"""
return self.facts["num_npu"]

def get_syncd_docker_names(self):
"""
@summary: get the list of syncd dockers names for the number of NPUs present on the DUT
@@ -454,10 +504,10 @@ def get_fanout_os(self):

def get_fanout_type(self):
return self.type

def shutdown(self, interface_name):
return self.host.shutdown(interface_name)[self.hostname]

def no_shutdown(self, interface_name):
return self.host.no_shutdown(interface_name)[self.hostname]

@@ -466,7 +516,7 @@ def command(self, cmd):

def __str__(self):
return "{ os: '%s', hostname: '%s', device_type: '%s' }" % (self.os, self.hostname, self.type)

def __repr__(self):
return self.__str__()

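A hedged usage sketch for the two helpers added above. Here duthost is assumed to be the SonicHost-backed DUT fixture and "snmp" is just an example service; the result keys mirror the dicts built by critical_process_status().

import logging

def test_critical_processes_example(duthost):
    # Per-service check: the result carries 'status',
    # 'exited_critical_process' and 'running_critical_process'.
    snmp_status = duthost.critical_process_status("snmp")
    if not snmp_status['status']:
        logging.error("snmp critical processes exited: %s",
                      snmp_status['exited_critical_process'])

    # DUT-wide check: one entry per service in CRITICAL_SERVICES.
    all_status = duthost.all_critical_process_status()
    failed = [svc for svc, res in all_status.items()
              if not res['status'] or res['exited_critical_process']]
    assert not failed, "services with crashed critical processes: %s" % failed
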
44 changes: 44 additions & 0 deletions tests/common/plugins/sanity_check/checks.py
@@ -97,6 +97,8 @@ def check_interfaces(dut):
return check_result

def check_dbmemory(dut):
logger.info("Checking database memory...")

total_omem = 0
re_omem = re.compile("omem=(\d+)")
res = dut.command("/usr/bin/redis-cli client list")
@@ -115,6 +117,46 @@ def check_dbmemory(dut):
logger.info("Done checking database memory")
return check_result

def check_processes(dut):
    logger.info("Checking process status...")

    networking_uptime = dut.get_networking_uptime().seconds
    timeout = max((SYSTEM_STABILIZE_MAX_TIME - networking_uptime), 0)
    interval = 20
    logger.info("networking_uptime=%d seconds, timeout=%d seconds, interval=%d seconds" % \
        (networking_uptime, timeout, interval))

    check_result = {"failed": False, "check_item": "processes"}
    if timeout == 0:    # Check processes status, do not retry.
        processes_status = dut.all_critical_process_status()
        check_result["processes_status"] = processes_status
        check_result["services_status"] = {}
        for k, v in processes_status.items():
            if v['status'] == False or len(v['exited_critical_process']) > 0:
                check_result['failed'] = True
            check_result["services_status"].update({k: v['status']})
    else:               # Retry checking processes status
        start = time.time()
        elapsed = 0
        while elapsed < timeout:
            check_result["failed"] = False    # reset before every retry
            processes_status = dut.all_critical_process_status()
            check_result["processes_status"] = processes_status
            check_result["services_status"] = {}
            for k, v in processes_status.items():
                if v['status'] == False or len(v['exited_critical_process']) > 0:
                    check_result['failed'] = True
                check_result["services_status"].update({k: v['status']})

            if check_result["failed"]:
                wait(interval, msg="Not all processes are started, wait %d seconds to retry. Remaining time: %d %s" % \
                    (interval, int(timeout - elapsed), str(check_result["processes_status"])))
                elapsed = time.time() - start
            else:
                break

    logger.info("Done checking processes status.")
    return check_result

def do_checks(dut, check_items):
results = []
for item in check_items:
@@ -124,6 +166,8 @@ def do_checks(dut, check_items):
results.append(check_interfaces(dut))
elif item == "dbmemory":
results.append(check_dbmemory(dut))
elif item == "processes":
results.append(check_processes(dut))

return results

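A small illustrative sketch of how the new item could be exercised through do_checks(). The dut object and the import path are assumptions; the result keys mirror check_processes() above.

import logging

from tests.common.plugins.sanity_check.checks import do_checks  # import path is an assumption

def log_failed_sanity_checks(dut):
    # Run the supported items, including the new "processes" check.
    results = do_checks(dut, ["services", "interfaces", "dbmemory", "processes"])
    for res in results:
        if res.get("failed"):
            # For "processes", services_status maps each service to a boolean
            # and processes_status carries the per-process detail.
            logging.error("sanity check '%s' failed: %s", res["check_item"], res)
    return results
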
2 changes: 1 addition & 1 deletion tests/common/plugins/sanity_check/constants.py
@@ -20,4 +20,4 @@
"adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30},
} # All supported recover methods

SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory"] # Supported checks
SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes"] # Supported checks
2 changes: 1 addition & 1 deletion tests/common/plugins/sanity_check/recover.py
@@ -58,7 +58,7 @@ def adaptive_recover(dut, localhost, fanouthosts, check_results, wait_time):
logging.info("Restoring {}".format(result))
if result['check_item'] == 'interfaces':
__recover_interfaces(dut, fanouthosts, result, wait_time)
elif result['check_item'] == 'services':
elif result['check_item'] in ['services', 'processes']:
action = __recover_services(dut, result)
# Only allow outstanding_action be overridden when it is
# None. In case the outstanding_action has already been