From 4fa81b4f8d97281f19bd4cce379cd72eb56fa259 Mon Sep 17 00:00:00 2001 From: yozhao101 <56170650+yozhao101@users.noreply.github.com> Date: Thu, 25 Jun 2020 21:18:21 -0700 Subject: [PATCH] [dockers] Update critical_processes file syntax (#4831) **- Why I did it** Initially, the critical_processes file contained either the name of a critical process or the name of a group. For example, the critical_processes file in the dhcp_relay container contains a single group name `isc-dhcp-relay`. When testing the autorestart feature of each container, we need to get all the critical processes and test whether a container can be restarted correctly if one of its critical processes is killed. However, it is difficult to tell whether the names in the critical_processes file are critical processes or group names. Changing the syntax of this file separates individual processes from groups and also makes the distinction clear to the user. Right now the critical_processes file contains two different kinds of entries. One is "program:xxx", which indicates a critical process. The other is "group:xxx", which indicates a group of critical processes managed by supervisord using the name "xxx". At the same time, I also updated the logic that parses the critical_processes file in the supervisor-proc-exit-listener script. **- How to verify it** We can first enable the autorestart feature of a specified container, for example `dhcp_relay`, by running the command `sudo config container feature autorestart dhcp_relay enabled` on the DUT. Then we can select a critical process from the output of the command `docker top dhcp_relay` and use the command `sudo kill -SIGKILL <pid>` to kill that critical process. The final step is to check whether the container is restarted correctly or not. 
--- dockers/docker-database/critical_processes | 2 +- dockers/docker-dhcp-relay/critical_processes | 2 +- dockers/docker-fpm-frr/critical_processes | 10 ++--- dockers/docker-fpm-gobgp/critical_processes | 4 +- dockers/docker-fpm-quagga/critical_processes | 8 ++-- dockers/docker-lldp/critical_processes | 6 +-- dockers/docker-nat/critical_processes | 4 +- dockers/docker-orchagent/critical_processes | 20 +++++----- .../critical_processes | 6 +-- .../critical_processes | 2 +- dockers/docker-sflow/critical_processes | 2 +- dockers/docker-snmp/critical_processes | 4 +- .../docker-sonic-restapi/critical_processes | 2 +- .../docker-sonic-telemetry/critical_processes | 4 +- dockers/docker-teamd/critical_processes | 4 +- files/scripts/supervisor-proc-exit-listener | 38 ++++++++++++++++--- .../docker-syncd-bfn/critical_processes | 2 +- .../docker-syncd-brcm/critical_processes | 4 +- .../docker-syncd-cavm/critical_processes | 2 +- .../docker-syncd-centec/critical_processes | 2 +- .../docker-syncd-mrvl/critical_processes | 2 +- .../docker-syncd-mrvl/critical_processes | 2 +- .../docker-syncd-mrvl/critical_processes | 2 +- .../docker-syncd-mlnx/critical_processes | 2 +- .../docker-syncd-nephos/critical_processes | 4 +- .../vs/docker-syncd-vs/critical_processes | 2 +- 26 files changed, 84 insertions(+), 58 deletions(-) diff --git a/dockers/docker-database/critical_processes b/dockers/docker-database/critical_processes index 7800f0fad3ff..53a45931dfc9 100644 --- a/dockers/docker-database/critical_processes +++ b/dockers/docker-database/critical_processes @@ -1 +1 @@ -redis +program:redis diff --git a/dockers/docker-dhcp-relay/critical_processes b/dockers/docker-dhcp-relay/critical_processes index ddb183963a67..855851bf2d68 100644 --- a/dockers/docker-dhcp-relay/critical_processes +++ b/dockers/docker-dhcp-relay/critical_processes @@ -1 +1 @@ -isc-dhcp-relay +group:isc-dhcp-relay diff --git a/dockers/docker-fpm-frr/critical_processes b/dockers/docker-fpm-frr/critical_processes 
index 8ea09e1bb538..2631fee15e66 100644 --- a/dockers/docker-fpm-frr/critical_processes +++ b/dockers/docker-fpm-frr/critical_processes @@ -1,5 +1,5 @@ -zebra -staticd -bgpd -fpmsyncd -bgpcfgd +program:zebra +program:staticd +program:bgpd +program:fpmsyncd +program:bgpcfgd diff --git a/dockers/docker-fpm-gobgp/critical_processes b/dockers/docker-fpm-gobgp/critical_processes index 2a9e47831e0d..789f4685244a 100644 --- a/dockers/docker-fpm-gobgp/critical_processes +++ b/dockers/docker-fpm-gobgp/critical_processes @@ -1,2 +1,2 @@ -gobgpd -fpmsyncd +program:gobgpd +program:fpmsyncd diff --git a/dockers/docker-fpm-quagga/critical_processes b/dockers/docker-fpm-quagga/critical_processes index f151af9c4bdd..2dceb501e3e5 100644 --- a/dockers/docker-fpm-quagga/critical_processes +++ b/dockers/docker-fpm-quagga/critical_processes @@ -1,4 +1,4 @@ -zebra -bgpd -fpmsyncd -bgpcfgd +program:zebra +program:bgpd +program:fpmsyncd +program:bgpcfgd diff --git a/dockers/docker-lldp/critical_processes b/dockers/docker-lldp/critical_processes index b845b70bb3f7..7a6b137a76f6 100644 --- a/dockers/docker-lldp/critical_processes +++ b/dockers/docker-lldp/critical_processes @@ -1,3 +1,3 @@ -lldpd -lldp-syncd -lldpmgrd +program:lldpd +program:lldp_syncd +program:lldpmgrd diff --git a/dockers/docker-nat/critical_processes b/dockers/docker-nat/critical_processes index d442976143f1..5edc5eedc3d5 100644 --- a/dockers/docker-nat/critical_processes +++ b/dockers/docker-nat/critical_processes @@ -1,2 +1,2 @@ -natmgrd -natsyncd +program:natmgrd +program:natsyncd diff --git a/dockers/docker-orchagent/critical_processes b/dockers/docker-orchagent/critical_processes index 7fd8a516520c..99f501e2cc03 100644 --- a/dockers/docker-orchagent/critical_processes +++ b/dockers/docker-orchagent/critical_processes @@ -1,10 +1,10 @@ -orchagent -portsyncd -neighsyncd -vlanmgrd -intfmgrd -portmgrd -buffermgrd -vrfmgrd -nbrmgrd -vxlanmgrd +program:orchagent +program:portsyncd +program:neighsyncd +program:vlanmgrd 
+program:intfmgrd +program:portmgrd +program:buffermgrd +program:vrfmgrd +program:nbrmgrd +program:vxlanmgrd diff --git a/dockers/docker-platform-monitor/critical_processes b/dockers/docker-platform-monitor/critical_processes index 9798dffc4c27..3165d13ecdbc 100644 --- a/dockers/docker-platform-monitor/critical_processes +++ b/dockers/docker-platform-monitor/critical_processes @@ -1,3 +1,3 @@ -ledd -xcvrd -psud +program:ledd +program:xcvrd +program:psud diff --git a/dockers/docker-router-advertiser/critical_processes b/dockers/docker-router-advertiser/critical_processes index 238a0346ac9f..a343765f78e0 100644 --- a/dockers/docker-router-advertiser/critical_processes +++ b/dockers/docker-router-advertiser/critical_processes @@ -1 +1 @@ -radvd +program:radvd diff --git a/dockers/docker-sflow/critical_processes b/dockers/docker-sflow/critical_processes index 5b24e2d8e1da..8180f8ad1c37 100644 --- a/dockers/docker-sflow/critical_processes +++ b/dockers/docker-sflow/critical_processes @@ -1 +1 @@ -sflowmgrd +program:sflowmgrd diff --git a/dockers/docker-snmp/critical_processes b/dockers/docker-snmp/critical_processes index e6039c5b9840..cff479d7fa11 100644 --- a/dockers/docker-snmp/critical_processes +++ b/dockers/docker-snmp/critical_processes @@ -1,2 +1,2 @@ -snmpd -snmp-subagent +program:snmpd +program:snmp-subagent diff --git a/dockers/docker-sonic-restapi/critical_processes b/dockers/docker-sonic-restapi/critical_processes index 3106eaa9410a..455b7fb2fa46 100644 --- a/dockers/docker-sonic-restapi/critical_processes +++ b/dockers/docker-sonic-restapi/critical_processes @@ -1 +1 @@ -restapi +program:restapi diff --git a/dockers/docker-sonic-telemetry/critical_processes b/dockers/docker-sonic-telemetry/critical_processes index d6953dd0c883..612a94d9edac 100644 --- a/dockers/docker-sonic-telemetry/critical_processes +++ b/dockers/docker-sonic-telemetry/critical_processes @@ -1,2 +1,2 @@ -telemetry -dialout +program:telemetry +program:dialout diff --git 
a/dockers/docker-teamd/critical_processes b/dockers/docker-teamd/critical_processes index b5c543df050d..286d6aef4c7a 100644 --- a/dockers/docker-teamd/critical_processes +++ b/dockers/docker-teamd/critical_processes @@ -1,2 +1,2 @@ -teammgrd -teamsyncd +program:teammgrd +program:teamsyncd diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index cf154b3a5c10..04a2a5001371 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -10,14 +10,42 @@ import swsssdk from supervisor import childutils -# Contents of file should be the names of critical processes (as defined in -# supervisor.conf file), one per line +# Each line of this file should specify either one critical process or one +# critical process group, (as defined in supervisord.conf file), in the +# following format: +# +# program:<process_name> +# group:<group_name> CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' # This table in databse contains the features for container and each # feature for a row will be configured a state or number. CONTAINER_FEATURE_TABLE_NAME = 'CONTAINER_FEATURE' +# Read the critical processes/group names from CRITICAL_PROCESSES_FILE +def get_critical_group_and_process_list(): + critical_group_list = [] + critical_process_list = [] + + with open(CRITICAL_PROCESSES_FILE, 'r') as file: + for line in file: + line_info = line.strip(' \n').split(':') + if len(line_info) != 2: + syslog.syslog(syslog.LOG_ERR, "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line)) + sys.exit(5) + + identifier_key = line_info[0].strip() + identifier_value = line_info[1].strip() + if identifier_key == "group" and identifier_value: + critical_group_list.append(identifier_value) + elif identifier_key == "program" and identifier_value: + critical_process_list.append(identifier_value) + else: + syslog.syslog(syslog.LOG_ERR, "Syntax of the line {} in critical_processes file is incorrect. 
Exiting...".format(line)) + sys.exit(6) + + return critical_group_list, critical_process_list + def main(argv): container_name = None opts, args = getopt.getopt(argv, "c:", ["container-name="]) @@ -29,9 +57,7 @@ def main(argv): syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...") sys.exit(1) - # Read the list of critical processes from a file - with open(CRITICAL_PROCESSES_FILE, 'r') as f: - critical_processes = [line.rstrip('\n') for line in f] + critical_group_list, critical_process_list = get_critical_group_and_process_list() while True: # Transition from ACKNOWLEDGED to READY @@ -73,7 +99,7 @@ def main(argv): # If container is database or auto-restart feature is enabled and at the same time # a critical process exited unexpectedly, terminate supervisor if ((container_name == 'database' or restart_feature == 'enabled') and expected == 0 and - (processname in critical_processes or groupname in critical_processes)): + (processname in critical_process_list or groupname in critical_group_list)): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." 
msg = MSG_FORMAT_STR.format(payload_headers['processname']) syslog.syslog(syslog.LOG_INFO, msg) diff --git a/platform/barefoot/docker-syncd-bfn/critical_processes b/platform/barefoot/docker-syncd-bfn/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/barefoot/docker-syncd-bfn/critical_processes +++ b/platform/barefoot/docker-syncd-bfn/critical_processes @@ -1 +1 @@ -syncd +program:syncd diff --git a/platform/broadcom/docker-syncd-brcm/critical_processes b/platform/broadcom/docker-syncd-brcm/critical_processes index 489668a89e08..d1163a9c3046 100644 --- a/platform/broadcom/docker-syncd-brcm/critical_processes +++ b/platform/broadcom/docker-syncd-brcm/critical_processes @@ -1,2 +1,2 @@ -dsserve -syncd +program:dsserve +program:syncd diff --git a/platform/cavium/docker-syncd-cavm/critical_processes b/platform/cavium/docker-syncd-cavm/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/cavium/docker-syncd-cavm/critical_processes +++ b/platform/cavium/docker-syncd-cavm/critical_processes @@ -1 +1 @@ -syncd +program:syncd diff --git a/platform/centec/docker-syncd-centec/critical_processes b/platform/centec/docker-syncd-centec/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/centec/docker-syncd-centec/critical_processes +++ b/platform/centec/docker-syncd-centec/critical_processes @@ -1 +1 @@ -syncd +program:syncd diff --git a/platform/marvell-arm64/docker-syncd-mrvl/critical_processes b/platform/marvell-arm64/docker-syncd-mrvl/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/marvell-arm64/docker-syncd-mrvl/critical_processes +++ b/platform/marvell-arm64/docker-syncd-mrvl/critical_processes @@ -1 +1 @@ -syncd +program:syncd diff --git a/platform/marvell-armhf/docker-syncd-mrvl/critical_processes b/platform/marvell-armhf/docker-syncd-mrvl/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/marvell-armhf/docker-syncd-mrvl/critical_processes +++ 
b/platform/marvell-armhf/docker-syncd-mrvl/critical_processes @@ -1 +1 @@ -syncd +program:syncd diff --git a/platform/marvell/docker-syncd-mrvl/critical_processes b/platform/marvell/docker-syncd-mrvl/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/marvell/docker-syncd-mrvl/critical_processes +++ b/platform/marvell/docker-syncd-mrvl/critical_processes @@ -1 +1 @@ -syncd +program:syncd diff --git a/platform/mellanox/docker-syncd-mlnx/critical_processes b/platform/mellanox/docker-syncd-mlnx/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/mellanox/docker-syncd-mlnx/critical_processes +++ b/platform/mellanox/docker-syncd-mlnx/critical_processes @@ -1 +1 @@ -syncd +program:syncd diff --git a/platform/nephos/docker-syncd-nephos/critical_processes b/platform/nephos/docker-syncd-nephos/critical_processes index 489668a89e08..d1163a9c3046 100644 --- a/platform/nephos/docker-syncd-nephos/critical_processes +++ b/platform/nephos/docker-syncd-nephos/critical_processes @@ -1,2 +1,2 @@ -dsserve -syncd +program:dsserve +program:syncd diff --git a/platform/vs/docker-syncd-vs/critical_processes b/platform/vs/docker-syncd-vs/critical_processes index 6082f242b872..bdd6903c5690 100644 --- a/platform/vs/docker-syncd-vs/critical_processes +++ b/platform/vs/docker-syncd-vs/critical_processes @@ -1 +1 @@ -syncd +program:syncd