From 3342a31721707540e0b301a4312be7fcc6dc5a28 Mon Sep 17 00:00:00 2001 From: Scott Pilkey Date: Tue, 24 Jan 2023 13:56:04 -0800 Subject: [PATCH] Add asic presence filtering for container checking in system-health --- .../health_checker/service_checker.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index ed6c7296fde3..31c7f7b9717a 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -71,6 +71,19 @@ def get_expected_running_containers(self, feature_table): """ expected_running_containers = set() container_feature_dict = {} + + # Get current asic presence list. For multi_asic system, multi instance containers + # should be checked only for asics present. + asics_id_presence = multi_asic.get_asic_presence_list() + + # Some services may run all the instances irrespective of asic presence. + # Add those to exception list. + # database service: Currently services have dependency on all database services to + # be up irrespective of asic presence. + # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven, + # it will be removed from exception list. + run_all_instance_list = ['database', 'bgp'] + for feature_name, feature_entry in feature_table.items(): if feature_entry["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): @@ -80,8 +93,9 @@ def get_expected_running_containers(self, feature_table): if feature_entry["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - expected_running_containers.add(feature_name + str(asic_id)) - container_feature_dict[feature_name + str(asic_id)] = feature_name + if asic_id in asics_id_presence or feature_name in run_all_instance_list: + expected_running_containers.add(feature_name + str(asic_id)) + container_feature_dict[feature_name + str(asic_id)] = feature_name else: expected_running_containers.add(feature_name) container_feature_dict[feature_name] = feature_name @@ -343,7 +357,7 @@ def check_process_existence(self, container_name, critical_process_list, config, process_status = utils.run_command(cmd) if process_status is None: for process_name in critical_process_list: - self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) + self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name)) self.publish_events(container_name, critical_process_list) return @@ -356,6 +370,6 @@ def check_process_existence(self, container_name, critical_process_list, config, # and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status. if process_name in process_status: if process_status[process_name] != 'RUNNING': - self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name)) + self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name)) else: self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))