Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

system-health service_checker should check containers based on asic presence #13497

Merged
merged 1 commit into from
Feb 11, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions src/system-health/health_checker/service_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,19 @@ def get_expected_running_containers(self, feature_table):
"""
expected_running_containers = set()
container_feature_dict = {}

# Get current asic presence list. For multi_asic system, multi instance containers
# should be checked only for asics present.
asics_id_presence = multi_asic.get_asic_presence_list()

# Some services may run all the instances irrespective of asic presence.
# Add those to exception list.
# database service: Currently services have dependency on all database services to
# be up irrespective of asic presence.
# bgp service: Currently bgp runs all instances. Once this is fixed to be config driven,
# it will be removed from exception list.
run_all_instance_list = ['database', 'bgp']

for feature_name, feature_entry in feature_table.items():
if feature_entry["state"] not in ["disabled", "always_disabled"]:
if multi_asic.is_multi_asic():
Expand All @@ -80,8 +93,9 @@ def get_expected_running_containers(self, feature_table):
if feature_entry["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
expected_running_containers.add(feature_name + str(asic_id))
container_feature_dict[feature_name + str(asic_id)] = feature_name
if asic_id in asics_id_presence or feature_name in run_all_instance_list:
expected_running_containers.add(feature_name + str(asic_id))
container_feature_dict[feature_name + str(asic_id)] = feature_name
else:
expected_running_containers.add(feature_name)
container_feature_dict[feature_name] = feature_name
Expand Down Expand Up @@ -343,7 +357,7 @@ def check_process_existence(self, container_name, critical_process_list, config,
process_status = utils.run_command(cmd)
if process_status is None:
for process_name in critical_process_list:
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
self.publish_events(container_name, critical_process_list)
return

Expand All @@ -356,6 +370,6 @@ def check_process_existence(self, container_name, critical_process_list, config,
# and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
if process_name in process_status:
if process_status[process_name] != 'RUNNING':
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
else:
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))