diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 217ea181d495..4c1b2b366f13 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -210,12 +210,6 @@ sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install $PLATF sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_PDDF_COMMON_PY2_WHEEL_NAME {% endif %} -# Install system-health Python 2 package -SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}}) -sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME -sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install $SYSTEM_HEALTH_PY2_WHEEL_NAME -sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME - # Install sonic-platform-common Python 3 package PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}}) sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME @@ -228,6 +222,12 @@ sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip2 install thrift sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip3 install thrift==0.13.0 {% endif %} +# Install system-health Python 3 package +SYSTEM_HEALTH_PY3_WHEEL_NAME=$(basename {{system_health_py3_wheel_path}}) +sudo cp {{system_health_py3_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY3_WHEEL_NAME +sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip3 install $SYSTEM_HEALTH_PY3_WHEEL_NAME +sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY3_WHEEL_NAME + # Install prerequisites needed for installing the Python m2crypto package, used by sonic-utilities # These packages can be uninstalled after intallation sudo LANG=C DEBIAN_FRONTEND=noninteractive chroot $FILESYSTEM_ROOT apt-get -y install build-essential libssl-dev swig diff --git a/rules/system-health.mk b/rules/system-health.mk index 16648508b9f4..bafc09daa70f 100644 --- 
a/rules/system-health.mk +++ b/rules/system-health.mk @@ -1,9 +1,9 @@ -# system health python2 wheel +# system health Python wheel -SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl +SYSTEM_HEALTH = system_health-1.0-py3-none-any.whl $(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health -$(SYSTEM_HEALTH)_PYTHON_VERSION = 2 -$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE) +$(SYSTEM_HEALTH)_PYTHON_VERSION = 3 +$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY3) $(SWSSSDK_PY3) $(SONIC_CONFIG_ENGINE_PY3) SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH) -export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))" +export system_health_py3_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))" diff --git a/src/system-health/health_checker/__init__.py b/src/system-health/health_checker/__init__.py index 18c49c8d81a1..1473ad9f3705 100644 --- a/src/system-health/health_checker/__init__.py +++ b/src/system-health/health_checker/__init__.py @@ -1,2 +1,2 @@ -from . import hardware_checker -from . import service_checker +from . import hardware_checker +from . import service_checker diff --git a/src/system-health/health_checker/config.py b/src/system-health/health_checker/config.py index 47b5f82b172b..3d287d778377 100644 --- a/src/system-health/health_checker/config.py +++ b/src/system-health/health_checker/config.py @@ -1,144 +1,144 @@ -import json -import os - -from sonic_py_common import device_info - - -class Config(object): - """ - Manage configuration of system health. - """ - - # Default system health check interval - DEFAULT_INTERVAL = 60 - - # Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work. - DEFAULT_BOOTUP_TIMEOUT = 300 - - # Default LED configuration. Different platform has different LED capability. This configuration allow vendor to - # override the default behavior. 
- DEFAULT_LED_CONFIG = { - 'fault': 'red', - 'normal': 'green', - 'booting': 'orange_blink' - } - - # System health configuration file name - CONFIG_FILE = 'system_health_monitoring_config.json' - - # Monit service configuration file path - MONIT_CONFIG_FILE = '/etc/monit/monitrc' - - # Monit service start delay configuration entry - MONIT_START_DELAY_CONFIG = 'with start delay' - - def __init__(self): - """ - Constructor. Initialize all configuration entry to default value in case there is no configuration file. - """ - self.platform_name = device_info.get_platform() - self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE) - self._last_mtime = None - self.config_data = None - self.interval = Config.DEFAULT_INTERVAL - self.ignore_services = None - self.ignore_devices = None - self.user_defined_checkers = None - - def config_file_exists(self): - return os.path.exists(self._config_file) - - def load_config(self): - """ - Load the configuration file from disk. - 1. If there is no configuration file, current config entries will reset to default value - 2. Only read the configuration file is last_mtime changes for better performance - 3. 
If there is any format issues in configuration file, current config entries will reset to default value - :return: - """ - if not self.config_file_exists(): - if self._last_mtime is not None: - self._reset() - return - - mtime = os.stat(self._config_file) - if mtime != self._last_mtime: - try: - self._last_mtime = mtime - with open(self._config_file, 'r') as f: - self.config_data = json.load(f) - - self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL) - self.ignore_services = self._get_list_data('services_to_ignore') - self.ignore_devices = self._get_list_data('devices_to_ignore') - self.user_defined_checkers = self._get_list_data('user_defined_checkers') - except Exception as e: - self._reset() - - def _reset(self): - """ - Reset current configuration entry to default value - :return: - """ - self._last_mtime = None - self.config_data = None - self.interval = Config.DEFAULT_INTERVAL - self.ignore_services = None - self.ignore_devices = None - self.user_defined_checkers = None - - def get_led_color(self, status): - """ - Get desired LED color according to the input status - :param status: System health status - :return: StringLED color - """ - if self.config_data and 'led_color' in self.config_data: - if status in self.config_data['led_color']: - return self.config_data['led_color'][status] - - return self.DEFAULT_LED_CONFIG[status] - - def get_bootup_timeout(self): - """ - Get boot up timeout from monit configuration file. - 1. If monit configuration file does not exist, return default value - 2. 
If there is any exception while parsing monit config, return default value - :return: Integer timeout value - """ - if not os.path.exists(Config.MONIT_CONFIG_FILE): - return self.DEFAULT_BOOTUP_TIMEOUT - - try: - with open(Config.MONIT_CONFIG_FILE) as f: - lines = f.readlines() - for line in lines: - if not line: - continue - - line = line.strip() - if not line: - continue - - pos = line.find('#') - if pos == 0: - continue - - line = line[:pos] - pos = line.find(Config.MONIT_START_DELAY_CONFIG) - if pos != -1: - return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip()) - except Exception: - return self.DEFAULT_BOOTUP_TIMEOUT - - def _get_list_data(self, key): - """ - Get list type configuration data by key and remove duplicate element. - :param key: Key of the configuration entry - :return: A set of configuration data if key exists - """ - if key in self.config_data: - data = self.config_data[key] - if isinstance(data, list): - return set(data) - return None +import json +import os + +from sonic_py_common import device_info + + +class Config(object): + """ + Manage configuration of system health. + """ + + # Default system health check interval + DEFAULT_INTERVAL = 60 + + # Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work. + DEFAULT_BOOTUP_TIMEOUT = 300 + + # Default LED configuration. Different platform has different LED capability. This configuration allow vendor to + # override the default behavior. + DEFAULT_LED_CONFIG = { + 'fault': 'red', + 'normal': 'green', + 'booting': 'orange_blink' + } + + # System health configuration file name + CONFIG_FILE = 'system_health_monitoring_config.json' + + # Monit service configuration file path + MONIT_CONFIG_FILE = '/etc/monit/monitrc' + + # Monit service start delay configuration entry + MONIT_START_DELAY_CONFIG = 'with start delay' + + def __init__(self): + """ + Constructor. 
Initialize all configuration entry to default value in case there is no configuration file. + """ + self.platform_name = device_info.get_platform() + self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE) + self._last_mtime = None + self.config_data = None + self.interval = Config.DEFAULT_INTERVAL + self.ignore_services = None + self.ignore_devices = None + self.user_defined_checkers = None + + def config_file_exists(self): + return os.path.exists(self._config_file) + + def load_config(self): + """ + Load the configuration file from disk. + 1. If there is no configuration file, current config entries will reset to default value + 2. Only read the configuration file is last_mtime changes for better performance + 3. If there is any format issues in configuration file, current config entries will reset to default value + :return: + """ + if not self.config_file_exists(): + if self._last_mtime is not None: + self._reset() + return + + mtime = os.stat(self._config_file) + if mtime != self._last_mtime: + try: + self._last_mtime = mtime + with open(self._config_file, 'r') as f: + self.config_data = json.load(f) + + self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL) + self.ignore_services = self._get_list_data('services_to_ignore') + self.ignore_devices = self._get_list_data('devices_to_ignore') + self.user_defined_checkers = self._get_list_data('user_defined_checkers') + except Exception as e: + self._reset() + + def _reset(self): + """ + Reset current configuration entry to default value + :return: + """ + self._last_mtime = None + self.config_data = None + self.interval = Config.DEFAULT_INTERVAL + self.ignore_services = None + self.ignore_devices = None + self.user_defined_checkers = None + + def get_led_color(self, status): + """ + Get desired LED color according to the input status + :param status: System health status + :return: StringLED color + """ + if self.config_data and 'led_color' in 
self.config_data: + if status in self.config_data['led_color']: + return self.config_data['led_color'][status] + + return self.DEFAULT_LED_CONFIG[status] + + def get_bootup_timeout(self): + """ + Get boot up timeout from monit configuration file. + 1. If monit configuration file does not exist, return default value + 2. If there is any exception while parsing monit config, return default value + :return: Integer timeout value + """ + if not os.path.exists(Config.MONIT_CONFIG_FILE): + return self.DEFAULT_BOOTUP_TIMEOUT + + try: + with open(Config.MONIT_CONFIG_FILE) as f: + lines = f.readlines() + for line in lines: + if not line: + continue + + line = line.strip() + if not line: + continue + + pos = line.find('#') + if pos == 0: + continue + + line = line[:pos] + pos = line.find(Config.MONIT_START_DELAY_CONFIG) + if pos != -1: + return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip()) + except Exception: + return self.DEFAULT_BOOTUP_TIMEOUT + + def _get_list_data(self, key): + """ + Get list type configuration data by key and remove duplicate element. + :param key: Key of the configuration entry + :return: A set of configuration data if key exists + """ + if key in self.config_data: + data = self.config_data[key] + if isinstance(data, list): + return set(data) + return None diff --git a/src/system-health/health_checker/hardware_checker.py b/src/system-health/health_checker/hardware_checker.py index a04fe2abf43a..cf5b86bea2be 100644 --- a/src/system-health/health_checker/hardware_checker.py +++ b/src/system-health/health_checker/hardware_checker.py @@ -1,248 +1,248 @@ -from natsort import natsorted -from swsssdk import SonicV2Connector - -from .health_checker import HealthChecker - - -class HardwareChecker(HealthChecker): - """ - Check system hardware status. For now, it checks ASIC, PSU and fan status. 
- """ - ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC' - FAN_TABLE_NAME = 'FAN_INFO' - PSU_TABLE_NAME = 'PSU_INFO' - - def __init__(self): - HealthChecker.__init__(self) - self._db = SonicV2Connector(host="127.0.0.1") - self._db.connect(self._db.STATE_DB) - - def get_category(self): - return 'Hardware' - - def check(self, config): - self.reset() - self._check_asic_status(config) - self._check_fan_status(config) - self._check_psu_status(config) - - def _check_asic_status(self, config): - """ - Check if ASIC temperature is in valid range. - :param config: Health checker configuration - :return: - """ - if config.ignore_devices and 'asic' in config.ignore_devices: - return - - temperature = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature') - temperature_threshold = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold') - if not temperature: - self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature') - elif not temperature_threshold: - self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold') - else: - try: - temperature = float(temperature) - temperature_threshold = float(temperature_threshold) - if temperature > temperature_threshold: - self.set_object_not_ok('ASIC', 'ASIC', - 'ASIC temperature is too hot, temperature={}, threshold={}'.format( - temperature, - temperature_threshold)) - else: - self.set_object_ok('ASIC', 'ASIC') - except ValueError as e: - self.set_object_not_ok('ASIC', 'ASIC', - 'Invalid ASIC temperature data, temperature={}, threshold={}'.format(temperature, - temperature_threshold)) - - def _check_fan_status(self, config): - """ - Check fan status including: - 1. Check all fans are present - 2. Check all fans are in good state - 3. 
Check fan speed is in valid range - :param config: Health checker configuration - :return: - """ - if config.ignore_devices and 'fan' in config.ignore_devices: - return - - keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*') - if not keys: - self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information') - return - - for key in natsorted(keys): - key_list = key.split('|') - if len(key_list) != 2: # error data in DB, log it and ignore - self.set_object_not_ok('Fan', key, 'Invalid key for FAN_INFO: {}'.format(key)) - continue - - name = key_list[1] - if config.ignore_devices and name in config.ignore_devices: - continue - data_dict = self._db.get_all(self._db.STATE_DB, key) - presence = data_dict.get('presence', 'false') - if presence.lower() != 'true': - self.set_object_not_ok('Fan', name, '{} is missing'.format(name)) - continue - - status = data_dict.get('status', 'false') - if status.lower() != 'true': - self.set_object_not_ok('Fan', name, '{} is broken'.format(name)) - continue - - if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'): - speed = data_dict.get('speed', None) - speed_target = data_dict.get('speed_target', None) - speed_tolerance = data_dict.get('speed_tolerance', None) - if not speed: - self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name)) - continue - elif not speed_target: - self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name)) - continue - elif not speed_tolerance: - self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name)) - continue - else: - try: - speed = float(speed) - speed_target = float(speed_target) - speed_tolerance = float(speed_tolerance) - speed_min_th = speed_target * (1 - float(speed_tolerance) / 100) - speed_max_th = speed_target * (1 + float(speed_tolerance) / 100) - if speed < speed_min_th or speed > speed_max_th: - self.set_object_not_ok('Fan', name, - '{} speed is out of 
range, speed={}, range=[{},{}]'.format(name, - speed, - speed_min_th, - speed_max_th)) - continue - except ValueError: - self.set_object_not_ok('Fan', name, - 'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format( - name, - speed, - speed_target, - speed_tolerance)) - continue - - self.set_object_ok('Fan', name) - - def _check_psu_status(self, config): - """ - Check PSU status including: - 1. Check all PSUs are present - 2. Check all PSUs are power on - 3. Check PSU temperature is in valid range - 4. Check PSU voltage is in valid range - :param config: Health checker configuration - :return: - """ - if config.ignore_devices and 'psu' in config.ignore_devices: - return - - keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*') - if not keys: - self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information') - return - - for key in natsorted(keys): - key_list = key.split('|') - if len(key_list) != 2: # error data in DB, log it and ignore - self.set_object_not_ok('PSU', key, 'Invalid key for PSU_INFO: {}'.format(key)) - continue - - name = key_list[1] - if config.ignore_devices and name in config.ignore_devices: - continue - - data_dict = self._db.get_all(self._db.STATE_DB, key) - presence = data_dict.get('presence', 'false') - if presence.lower() != 'true': - self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name)) - continue - - status = data_dict.get('status', 'false') - if status.lower() != 'true': - self.set_object_not_ok('PSU', name, '{} is out of power'.format(name)) - continue - - if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'): - temperature = data_dict.get('temp', None) - temperature_threshold = data_dict.get('temp_threshold', None) - if temperature is None: - self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name)) - continue - elif temperature_threshold is None: - self.set_object_not_ok('PSU', name, 'Failed to get 
temperature threshold data for {}'.format(name)) - continue - else: - try: - temperature = float(temperature) - temperature_threshold = float(temperature_threshold) - if temperature > temperature_threshold: - self.set_object_not_ok('PSU', name, - '{} temperature is too hot, temperature={}, threshold={}'.format( - name, temperature, - temperature_threshold)) - continue - except ValueError: - self.set_object_not_ok('PSU', name, - 'Invalid temperature data for {}, temperature={}, threshold={}'.format( - name, temperature, - temperature_threshold)) - continue - - if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'): - voltage = data_dict.get('voltage', None) - voltage_min_th = data_dict.get('voltage_min_threshold', None) - voltage_max_th = data_dict.get('voltage_max_threshold', None) - if voltage is None: - self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name)) - continue - elif voltage_min_th is None: - self.set_object_not_ok('PSU', name, - 'Failed to get voltage minimum threshold data for {}'.format(name)) - continue - elif voltage_max_th is None: - self.set_object_not_ok('PSU', name, - 'Failed to get voltage maximum threshold data for {}'.format(name)) - continue - else: - try: - voltage = float(voltage) - voltage_min_th = float(voltage_min_th) - voltage_max_th = float(voltage_max_th) - if voltage < voltage_min_th or voltage > voltage_max_th: - self.set_object_not_ok('PSU', name, - '{} voltage is out of range, voltage={}, range=[{},{}]'.format(name, - voltage, - voltage_min_th, - voltage_max_th)) - continue - except ValueError: - self.set_object_not_ok('PSU', name, - 'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(name, - voltage, - voltage_min_th, - voltage_max_th)) - continue - self.set_object_ok('PSU', name) - - def reset(self): - self._info = {} - - @classmethod - def _ignore_check(cls, ignore_set, category, object_name, check_point): - if not ignore_set: - return False - - if 
'{}.{}'.format(category, check_point) in ignore_set: - return True - elif '{}.{}'.format(object_name, check_point) in ignore_set: - return True - return False +from natsort import natsorted +from swsssdk import SonicV2Connector + +from .health_checker import HealthChecker + + +class HardwareChecker(HealthChecker): + """ + Check system hardware status. For now, it checks ASIC, PSU and fan status. + """ + ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC' + FAN_TABLE_NAME = 'FAN_INFO' + PSU_TABLE_NAME = 'PSU_INFO' + + def __init__(self): + HealthChecker.__init__(self) + self._db = SonicV2Connector(host="127.0.0.1") + self._db.connect(self._db.STATE_DB) + + def get_category(self): + return 'Hardware' + + def check(self, config): + self.reset() + self._check_asic_status(config) + self._check_fan_status(config) + self._check_psu_status(config) + + def _check_asic_status(self, config): + """ + Check if ASIC temperature is in valid range. + :param config: Health checker configuration + :return: + """ + if config.ignore_devices and 'asic' in config.ignore_devices: + return + + temperature = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature') + temperature_threshold = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold') + if not temperature: + self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature') + elif not temperature_threshold: + self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold') + else: + try: + temperature = float(temperature) + temperature_threshold = float(temperature_threshold) + if temperature > temperature_threshold: + self.set_object_not_ok('ASIC', 'ASIC', + 'ASIC temperature is too hot, temperature={}, threshold={}'.format( + temperature, + temperature_threshold)) + else: + self.set_object_ok('ASIC', 'ASIC') + except ValueError as e: + self.set_object_not_ok('ASIC', 'ASIC', + 'Invalid ASIC temperature data, temperature={}, 
threshold={}'.format(temperature, + temperature_threshold)) + + def _check_fan_status(self, config): + """ + Check fan status including: + 1. Check all fans are present + 2. Check all fans are in good state + 3. Check fan speed is in valid range + :param config: Health checker configuration + :return: + """ + if config.ignore_devices and 'fan' in config.ignore_devices: + return + + keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*') + if not keys: + self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information') + return + + for key in natsorted(keys): + key_list = key.split('|') + if len(key_list) != 2: # error data in DB, log it and ignore + self.set_object_not_ok('Fan', key, 'Invalid key for FAN_INFO: {}'.format(key)) + continue + + name = key_list[1] + if config.ignore_devices and name in config.ignore_devices: + continue + data_dict = self._db.get_all(self._db.STATE_DB, key) + presence = data_dict.get('presence', 'false') + if presence.lower() != 'true': + self.set_object_not_ok('Fan', name, '{} is missing'.format(name)) + continue + + status = data_dict.get('status', 'false') + if status.lower() != 'true': + self.set_object_not_ok('Fan', name, '{} is broken'.format(name)) + continue + + if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'): + speed = data_dict.get('speed', None) + speed_target = data_dict.get('speed_target', None) + speed_tolerance = data_dict.get('speed_tolerance', None) + if not speed: + self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name)) + continue + elif not speed_target: + self.set_object_not_ok('Fan', name, 'Failed to get target speed date for {}'.format(name)) + continue + elif not speed_tolerance: + self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name)) + continue + else: + try: + speed = float(speed) + speed_target = float(speed_target) + speed_tolerance = float(speed_tolerance) + speed_min_th = speed_target * (1 
- float(speed_tolerance) / 100) + speed_max_th = speed_target * (1 + float(speed_tolerance) / 100) + if speed < speed_min_th or speed > speed_max_th: + self.set_object_not_ok('Fan', name, + '{} speed is out of range, speed={}, range=[{},{}]'.format(name, + speed, + speed_min_th, + speed_max_th)) + continue + except ValueError: + self.set_object_not_ok('Fan', name, + 'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format( + name, + speed, + speed_target, + speed_tolerance)) + continue + + self.set_object_ok('Fan', name) + + def _check_psu_status(self, config): + """ + Check PSU status including: + 1. Check all PSUs are present + 2. Check all PSUs are power on + 3. Check PSU temperature is in valid range + 4. Check PSU voltage is in valid range + :param config: Health checker configuration + :return: + """ + if config.ignore_devices and 'psu' in config.ignore_devices: + return + + keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*') + if not keys: + self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information') + return + + for key in natsorted(keys): + key_list = key.split('|') + if len(key_list) != 2: # error data in DB, log it and ignore + self.set_object_not_ok('PSU', key, 'Invalid key for PSU_INFO: {}'.format(key)) + continue + + name = key_list[1] + if config.ignore_devices and name in config.ignore_devices: + continue + + data_dict = self._db.get_all(self._db.STATE_DB, key) + presence = data_dict.get('presence', 'false') + if presence.lower() != 'true': + self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name)) + continue + + status = data_dict.get('status', 'false') + if status.lower() != 'true': + self.set_object_not_ok('PSU', name, '{} is out of power'.format(name)) + continue + + if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'): + temperature = data_dict.get('temp', None) + temperature_threshold = data_dict.get('temp_threshold', None) + if temperature 
is None: + self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name)) + continue + elif temperature_threshold is None: + self.set_object_not_ok('PSU', name, 'Failed to get temperature threshold data for {}'.format(name)) + continue + else: + try: + temperature = float(temperature) + temperature_threshold = float(temperature_threshold) + if temperature > temperature_threshold: + self.set_object_not_ok('PSU', name, + '{} temperature is too hot, temperature={}, threshold={}'.format( + name, temperature, + temperature_threshold)) + continue + except ValueError: + self.set_object_not_ok('PSU', name, + 'Invalid temperature data for {}, temperature={}, threshold={}'.format( + name, temperature, + temperature_threshold)) + continue + + if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'): + voltage = data_dict.get('voltage', None) + voltage_min_th = data_dict.get('voltage_min_threshold', None) + voltage_max_th = data_dict.get('voltage_max_threshold', None) + if voltage is None: + self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name)) + continue + elif voltage_min_th is None: + self.set_object_not_ok('PSU', name, + 'Failed to get voltage minimum threshold data for {}'.format(name)) + continue + elif voltage_max_th is None: + self.set_object_not_ok('PSU', name, + 'Failed to get voltage maximum threshold data for {}'.format(name)) + continue + else: + try: + voltage = float(voltage) + voltage_min_th = float(voltage_min_th) + voltage_max_th = float(voltage_max_th) + if voltage < voltage_min_th or voltage > voltage_max_th: + self.set_object_not_ok('PSU', name, + '{} voltage is out of range, voltage={}, range=[{},{}]'.format(name, + voltage, + voltage_min_th, + voltage_max_th)) + continue + except ValueError: + self.set_object_not_ok('PSU', name, + 'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(name, + voltage, + voltage_min_th, + voltage_max_th)) + continue + 
self.set_object_ok('PSU', name) + + def reset(self): + self._info = {} + + @classmethod + def _ignore_check(cls, ignore_set, category, object_name, check_point): + if not ignore_set: + return False + + if '{}.{}'.format(category, check_point) in ignore_set: + return True + elif '{}.{}'.format(object_name, check_point) in ignore_set: + return True + return False diff --git a/src/system-health/health_checker/health_checker.py b/src/system-health/health_checker/health_checker.py index 59519d0a05c4..7d702a24bd3e 100644 --- a/src/system-health/health_checker/health_checker.py +++ b/src/system-health/health_checker/health_checker.py @@ -1,86 +1,86 @@ -class HealthChecker(object): - """ - Base class for health checker. A checker is an object that performs system health check for a particular category, - it collects and stores information after the check. - """ - INFO_FIELD_OBJECT_TYPE = 'type' - INFO_FIELD_OBJECT_STATUS = 'status' - INFO_FIELD_OBJECT_MSG = 'message' - - STATUS_OK = 'OK' - STATUS_NOT_OK = 'Not OK' - - summary = STATUS_OK - - def __init__(self): - self._info = {} - - def reset(self): - """ - Reset the status of the checker. Called every time before the check. - :return: - """ - pass - - def get_category(self): - """ - Get category of the checker. - :return: String category - """ - pass - - def get_info(self): - """ - Get information of the checker. A checker usually checks a few objects and each object status will be put to - self._info. - :return: Check result. - """ - return self._info - - def check(self, config): - """ - Perform the check. - :param config: Health checker configuration. - :return: - """ - pass - - def __str__(self): - return self.__class__.__name__ - - def add_info(self, object_name, key, value): - """ - Add check result for an object. - :param object_name: Object name. - :param key: Object attribute name. - :param value: Object attribute value. 
- :return: - """ - if object_name not in self._info: - self._info[object_name] = {} - - self._info[object_name][key] = value - - def set_object_not_ok(self, object_type, object_name, message): - """ - Set that an object is not OK. - :param object_type: Object type. - :param object_name: Object name. - :param message: A message to describe what is wrong with the object. - :return: - """ - self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type) - self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, message) - self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK) - HealthChecker.summary = HealthChecker.STATUS_NOT_OK - - def set_object_ok(self, object_type, object_name): - """ - Set that an object is in good state. - :param object_type: Object type. - :param object_name: Object name. - :return: - """ - self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type) - self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, '') - self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK) +class HealthChecker(object): + """ + Base class for health checker. A checker is an object that performs system health check for a particular category, + it collects and stores information after the check. + """ + INFO_FIELD_OBJECT_TYPE = 'type' + INFO_FIELD_OBJECT_STATUS = 'status' + INFO_FIELD_OBJECT_MSG = 'message' + + STATUS_OK = 'OK' + STATUS_NOT_OK = 'Not OK' + + summary = STATUS_OK + + def __init__(self): + self._info = {} + + def reset(self): + """ + Reset the status of the checker. Called every time before the check. + :return: + """ + pass + + def get_category(self): + """ + Get category of the checker. + :return: String category + """ + pass + + def get_info(self): + """ + Get information of the checker. A checker usually checks a few objects and each object status will be put to + self._info. + :return: Check result. + """ + return self._info + + def check(self, config): + """ + Perform the check. 
+ :param config: Health checker configuration. + :return: + """ + pass + + def __str__(self): + return self.__class__.__name__ + + def add_info(self, object_name, key, value): + """ + Add check result for an object. + :param object_name: Object name. + :param key: Object attribute name. + :param value: Object attribute value. + :return: + """ + if object_name not in self._info: + self._info[object_name] = {} + + self._info[object_name][key] = value + + def set_object_not_ok(self, object_type, object_name, message): + """ + Set that an object is not OK. + :param object_type: Object type. + :param object_name: Object name. + :param message: A message to describe what is wrong with the object. + :return: + """ + self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type) + self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, message) + self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK) + HealthChecker.summary = HealthChecker.STATUS_NOT_OK + + def set_object_ok(self, object_type, object_name): + """ + Set that an object is in good state. + :param object_type: Object type. + :param object_name: Object name. + :return: + """ + self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type) + self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, '') + self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK) diff --git a/src/system-health/health_checker/manager.py b/src/system-health/health_checker/manager.py index 933d6a9d543f..f31e482e336a 100644 --- a/src/system-health/health_checker/manager.py +++ b/src/system-health/health_checker/manager.py @@ -1,101 +1,101 @@ -class HealthCheckerManager(object): - """ - Manage all system health checkers and system health configuration. 
- """ - STATE_BOOTING = 'booting' - STATE_RUNNING = 'running' - boot_timeout = None - - def __init__(self): - self._checkers = [] - self._state = self.STATE_BOOTING - - from .config import Config - self.config = Config() - self.initialize() - - def initialize(self): - """ - Initialize the manager. Create service checker and hardware checker by default. - :return: - """ - from .service_checker import ServiceChecker - from .hardware_checker import HardwareChecker - self._checkers.append(ServiceChecker()) - self._checkers.append(HardwareChecker()) - - def check(self, chassis): - """ - Load new configuration if any and perform the system health check for all existing checkers. - :param chassis: A chassis object. - :return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that - contains the status for all objects that was checked. - """ - from .health_checker import HealthChecker - HealthChecker.summary = HealthChecker.STATUS_OK - stats = {} - self.config.load_config() - # check state first to avoid user change boot timeout in configuration file - # after finishing system boot - if self._state == self.STATE_BOOTING and self._is_system_booting(): - self._set_system_led(chassis, self.config, 'booting') - return self._state, stats - - for checker in self._checkers: - self._do_check(checker, stats) - - if self.config.user_defined_checkers: - from .user_defined_checker import UserDefinedChecker - for udc in self.config.user_defined_checkers: - checker = UserDefinedChecker(udc) - self._do_check(checker, stats) - - led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault' - self._set_system_led(chassis, self.config, led_status) - - return self._state, stats - - def _do_check(self, checker, stats): - """ - Do check for a particular checker and collect the check statistic. - :param checker: A checker object. - :param stats: Check statistic. 
- :return: - """ - try: - checker.check(self.config) - category = checker.get_category() - info = checker.get_info() - if category not in stats: - stats[category] = info - else: - stats[category].update(info) - except Exception as e: - from .health_checker import HealthChecker - error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e)) - entry = {str(checker): { - HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK, - HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg - }} - if 'Internal' not in stats: - stats['Internal'] = entry - else: - stats['Internal'].update(entry) - - def _is_system_booting(self): - from .utils import get_uptime - uptime = get_uptime() - if not self.boot_timeout: - self.boot_timeout = self.config.get_bootup_timeout() - booting = uptime < self.boot_timeout - if not booting: - self._state = self.STATE_RUNNING - return booting - - def _set_system_led(self, chassis, config, status): - try: - chassis.set_status_led(config.get_led_color(status)) - except NotImplementedError: - print('chassis.set_status_led is not implemented') - except Exception as e: - print('Failed to set system led due to - {}'.format(repr(e))) +class HealthCheckerManager(object): + """ + Manage all system health checkers and system health configuration. + """ + STATE_BOOTING = 'booting' + STATE_RUNNING = 'running' + boot_timeout = None + + def __init__(self): + self._checkers = [] + self._state = self.STATE_BOOTING + + from .config import Config + self.config = Config() + self.initialize() + + def initialize(self): + """ + Initialize the manager. Create service checker and hardware checker by default. + :return: + """ + from .service_checker import ServiceChecker + from .hardware_checker import HardwareChecker + self._checkers.append(ServiceChecker()) + self._checkers.append(HardwareChecker()) + + def check(self, chassis): + """ + Load new configuration if any and perform the system health check for all existing checkers. 
+ :param chassis: A chassis object. + :return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that + contains the status for all objects that was checked. + """ + from .health_checker import HealthChecker + HealthChecker.summary = HealthChecker.STATUS_OK + stats = {} + self.config.load_config() + # check state first to avoid user change boot timeout in configuration file + # after finishing system boot + if self._state == self.STATE_BOOTING and self._is_system_booting(): + self._set_system_led(chassis, self.config, 'booting') + return self._state, stats + + for checker in self._checkers: + self._do_check(checker, stats) + + if self.config.user_defined_checkers: + from .user_defined_checker import UserDefinedChecker + for udc in self.config.user_defined_checkers: + checker = UserDefinedChecker(udc) + self._do_check(checker, stats) + + led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault' + self._set_system_led(chassis, self.config, led_status) + + return self._state, stats + + def _do_check(self, checker, stats): + """ + Do check for a particular checker and collect the check statistic. + :param checker: A checker object. + :param stats: Check statistic. 
+ :return: + """ + try: + checker.check(self.config) + category = checker.get_category() + info = checker.get_info() + if category not in stats: + stats[category] = info + else: + stats[category].update(info) + except Exception as e: + from .health_checker import HealthChecker + error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e)) + entry = {str(checker): { + HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK, + HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg + }} + if 'Internal' not in stats: + stats['Internal'] = entry + else: + stats['Internal'].update(entry) + + def _is_system_booting(self): + from .utils import get_uptime + uptime = get_uptime() + if not self.boot_timeout: + self.boot_timeout = self.config.get_bootup_timeout() + booting = uptime < self.boot_timeout + if not booting: + self._state = self.STATE_RUNNING + return booting + + def _set_system_led(self, chassis, config, status): + try: + chassis.set_status_led(config.get_led_color(status)) + except NotImplementedError: + print('chassis.set_status_led is not implemented') + except Exception as e: + print('Failed to set system led due to - {}'.format(repr(e))) diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py index 8f18a6d7245e..a98e2d33c3ad 100644 --- a/src/system-health/health_checker/service_checker.py +++ b/src/system-health/health_checker/service_checker.py @@ -1,72 +1,72 @@ -from .health_checker import HealthChecker -from . import utils - - -class ServiceChecker(HealthChecker): - """ - Checker that checks critical system service status via monit service. - """ - - # Command to query the status of monit service. - CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service' - - # Command to get summary of critical system service. - CHECK_CMD = 'monit summary -B' - MIN_CHECK_CMD_LINES = 3 - - # Expect status for different system service category. 
- EXPECT_STATUS_DICT = { - 'System': 'Running', - 'Process': 'Running', - 'Filesystem': 'Accessible', - 'Program': 'Status ok' - } - - def __init__(self): - HealthChecker.__init__(self) - - def reset(self): - self._info = {} - - def get_category(self): - return 'Services' - - def check(self, config): - """ - Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system, - process and file system. - :param config: Health checker configuration. - :return: - """ - self.reset() - output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip() - if output != 'active': - self.set_object_not_ok('Service', 'monit', 'monit service is not running') - return - - output = utils.run_command(ServiceChecker.CHECK_CMD) - lines = output.splitlines() - if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES: - self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible') - return - - status_begin = lines[1].find('Status') - type_begin = lines[1].find('Type') - if status_begin < 0 or type_begin < 0: - self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible') - return - - for line in lines[2:]: - name = line[0:status_begin].strip() - if config.ignore_services and name in config.ignore_services: - continue - status = line[status_begin:type_begin].strip() - service_type = line[type_begin:].strip() - if service_type not in ServiceChecker.EXPECT_STATUS_DICT: - continue - expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type] - if expect_status != status: - self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status)) - else: - self.set_object_ok(service_type, name) - return +from .health_checker import HealthChecker +from . import utils + + +class ServiceChecker(HealthChecker): + """ + Checker that checks critical system service status via monit service. 
+ """ + + # Command to query the status of monit service. + CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service' + + # Command to get summary of critical system service. + CHECK_CMD = 'monit summary -B' + MIN_CHECK_CMD_LINES = 3 + + # Expect status for different system service category. + EXPECT_STATUS_DICT = { + 'System': 'Running', + 'Process': 'Running', + 'Filesystem': 'Accessible', + 'Program': 'Status ok' + } + + def __init__(self): + HealthChecker.__init__(self) + + def reset(self): + self._info = {} + + def get_category(self): + return 'Services' + + def check(self, config): + """ + Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system, + process and file system. + :param config: Health checker configuration. + :return: + """ + self.reset() + output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD).strip() + if output != 'active': + self.set_object_not_ok('Service', 'monit', 'monit service is not running') + return + + output = utils.run_command(ServiceChecker.CHECK_CMD) + lines = output.splitlines() + if not lines or len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES: + self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible') + return + + status_begin = lines[1].find('Status') + type_begin = lines[1].find('Type') + if status_begin < 0 or type_begin < 0: + self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible') + return + + for line in lines[2:]: + name = line[0:status_begin].strip() + if config.ignore_services and name in config.ignore_services: + continue + status = line[status_begin:type_begin].strip() + service_type = line[type_begin:].strip() + if service_type not in ServiceChecker.EXPECT_STATUS_DICT: + continue + expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type] + if expect_status != status: + self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, 
expect_status)) + else: + self.set_object_ok(service_type, name) + return diff --git a/src/system-health/health_checker/system_health_monitoring_config.json b/src/system-health/health_checker/system_health_monitoring_config.json index 0fc475e766e4..654d43d81096 100644 --- a/src/system-health/health_checker/system_health_monitoring_config.json +++ b/src/system-health/health_checker/system_health_monitoring_config.json @@ -1,11 +1,11 @@ -{ - "services_to_ignore": [], - "devices_to_ignore": [], - "user_defined_checkers": [], - "polling_interval": 60, - "led_color": { - "fault": "amber", - "normal": "green", - "booting": "orange_blink" - } -} \ No newline at end of file +{ + "services_to_ignore": [], + "devices_to_ignore": [], + "user_defined_checkers": [], + "polling_interval": 60, + "led_color": { + "fault": "amber", + "normal": "green", + "booting": "orange_blink" + } +} diff --git a/src/system-health/health_checker/user_defined_checker.py b/src/system-health/health_checker/user_defined_checker.py index ed0cdce6194d..0e0237fb5c53 100644 --- a/src/system-health/health_checker/user_defined_checker.py +++ b/src/system-health/health_checker/user_defined_checker.py @@ -1,88 +1,89 @@ -from .health_checker import HealthChecker -from . import utils - - -class UserDefinedChecker(HealthChecker): - """ - User could implement a script or program to perform customize check for particular system. In order to enable a - user defined checker: - 1. Add an element to "user_defined_checkers" in the configuration file. The element must be an command string - that can be executed by shell. For example: "python my_checker.py". - 2. The command output must match the following pattern: - ${UserDefineCategory} - ${Object1}:${ObjectStatusMessage1} - ${Object2}:${ObjectStatusMessage2} - - An example of the command output: - MyCategory - Device1:OK - Device2:OK - Device3:Out of power - """ - def __init__(self, cmd): - """ - Constructor. - :param cmd: Command string of the user defined checker. 
- """ - HealthChecker.__init__(self) - self._cmd = cmd - self._category = None - - def reset(self): - self._category = 'UserDefine' - self._info = {} - - def get_category(self): - return self._category - - def check(self, config): - """ - Execute the user defined command and parse the output. - :param config: Health checker configuration. - :return: - """ - self.reset() - - output = utils.run_command(self._cmd) - if not output: - self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd)) - return - - output = output.strip() - if not output: - self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd)) - return - - raw_lines = output.splitlines() - if not raw_lines: - self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd)) - return - - lines = [] - for line in raw_lines: - line = line.strip() - if not line: - continue - - lines.append(line) - - if not lines: - self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd)) - return - - self._category = lines[0] - if len(lines) > 1: - for line in lines[1:]: - pos = line.find(':') - if pos == -1: - continue - obj_name = line[:pos].strip() - msg = line[pos + 1:].strip() - if msg != 'OK': - self.set_object_not_ok('UserDefine', obj_name, msg) - else: - self.set_object_ok('UserDefine', obj_name) - return - - def __str__(self): - return 'UserDefinedChecker - {}'.format(self._cmd) +from .health_checker import HealthChecker +from . import utils + + +class UserDefinedChecker(HealthChecker): + """ + User could implement a script or program to perform customize check for particular system. In order to enable a + user defined checker: + 1. Add an element to "user_defined_checkers" in the configuration file. The element must be an command string + that can be executed by shell. For example: "python my_checker.py". + 2. 
The command output must match the following pattern: + ${UserDefineCategory} + ${Object1}:${ObjectStatusMessage1} + ${Object2}:${ObjectStatusMessage2} + + An example of the command output: + MyCategory + Device1:OK + Device2:OK + Device3:Out of power + """ + + def __init__(self, cmd): + """ + Constructor. + :param cmd: Command string of the user defined checker. + """ + HealthChecker.__init__(self) + self._cmd = cmd + self._category = None + + def reset(self): + self._category = 'UserDefine' + self._info = {} + + def get_category(self): + return self._category + + def check(self, config): + """ + Execute the user defined command and parse the output. + :param config: Health checker configuration. + :return: + """ + self.reset() + + output = utils.run_command(self._cmd) + if not output: + self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd)) + return + + output = output.strip() + if not output: + self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd)) + return + + raw_lines = output.splitlines() + if not raw_lines: + self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd)) + return + + lines = [] + for line in raw_lines: + line = line.strip() + if not line: + continue + + lines.append(line) + + if not lines: + self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd)) + return + + self._category = lines[0] + if len(lines) > 1: + for line in lines[1:]: + pos = line.find(':') + if pos == -1: + continue + obj_name = line[:pos].strip() + msg = line[pos + 1:].strip() + if msg != 'OK': + self.set_object_not_ok('UserDefine', obj_name, msg) + else: + self.set_object_ok('UserDefine', obj_name) + return + + def __str__(self): + return 'UserDefinedChecker - {}'.format(self._cmd) diff --git a/src/system-health/health_checker/utils.py b/src/system-health/health_checker/utils.py index 
5da8a7346c2a..fe26054e420d 100644 --- a/src/system-health/health_checker/utils.py +++ b/src/system-health/health_checker/utils.py @@ -1,25 +1,25 @@ -import subprocess - - -def run_command(command): - """ - Utility function to run an shell command and return the output. - :param command: Shell command string. - :return: Output of the shell command. - """ - try: - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) - return process.communicate()[0].encode('utf-8') - except Exception: - return None - - -def get_uptime(): - """ - Utility to get the system up time. - :return: System up time in seconds. - """ - with open('/proc/uptime', 'r') as f: - uptime_seconds = float(f.readline().split()[0]) - - return uptime_seconds +import subprocess + + +def run_command(command): + """ + Utility function to run a shell command and return the output. + :param command: Shell command string. + :return: Output of the shell command. + """ + try: + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + return process.communicate()[0].decode('utf-8') + except Exception: + return None + + +def get_uptime(): + """ + Utility to get the system up time. + :return: System up time in seconds.
+ """ + with open('/proc/uptime', 'r') as f: + uptime_seconds = float(f.readline().split()[0]) + + return uptime_seconds diff --git a/src/system-health/scripts/healthd b/src/system-health/scripts/healthd index 77faf5494374..799fa028f2e7 100644 --- a/src/system-health/scripts/healthd +++ b/src/system-health/scripts/healthd @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 """ healthd diff --git a/src/system-health/setup.py b/src/system-health/setup.py index a7decca09a97..e8084701f08c 100644 --- a/src/system-health/setup.py +++ b/src/system-health/setup.py @@ -24,10 +24,10 @@ scripts=[ 'scripts/healthd', ], - setup_requires= [ + setup_requires=[ 'pytest-runner' ], - tests_require = [ + tests_require=[ 'pytest', 'mock>=2.0.0' ], @@ -40,10 +40,9 @@ 'License :: OSI Approved :: Apache Software License', 'Natural Language :: English', 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.7', 'Topic :: System :: Hardware', ], keywords='SONiC sonic HEALTH health', test_suite='setup.get_test_suite' ) - diff --git a/src/system-health/tests/mock_connector.py b/src/system-health/tests/mock_connector.py index c65198dbe145..d32017ff8485 100644 --- a/src/system-health/tests/mock_connector.py +++ b/src/system-health/tests/mock_connector.py @@ -22,4 +22,3 @@ def keys(self, db_id, pattern): def get_all(self, db_id, key): return MockConnector.data[key] - diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index 6a791613a475..8fe8642b1014 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -73,8 +73,8 @@ def mock_run_command(cmd): 'telemetry Does not exist Process\n' \ 'orchagent Running Process\n' \ 'root-overlay Accessible Filesystem\n' \ - 'var-log Is not accessible Filesystem\n' - + 'var-log Is not accessible Filesystem\n' + checker = ServiceChecker() config = Config() 
checker.check(config)