Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
vdahiya12 committed Aug 16, 2023
2 parents dc68973 + 816059b commit 784b9e2
Show file tree
Hide file tree
Showing 14 changed files with 1,025 additions and 155 deletions.
3 changes: 2 additions & 1 deletion sonic-pcied/scripts/pcied
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ class DaemonPcied(daemon_base.DaemonBase):

self.aer_stats = {}
if Id is not None:
self.device_table.set(self.device_name, [('id', Id)])
fvp = swsscommon.FieldValuePairs([('id', Id)])
self.device_table.set(self.device_name, fvp)
self.aer_stats = platform_pcieutil.get_pcie_aer_stats(bus=Bus, dev=Dev, func=Fn)
self.update_aer_to_statedb()

Expand Down
3 changes: 3 additions & 0 deletions sonic-pcied/tests/mocked_libs/swsscommon/swsscommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def get(self, key):
def get_size(self):
return (len(self.mock_dict))

def getKeys(self):
return list(self.mock_dict.keys())


class FieldValuePairs:
fv_dict = {}
Expand Down
11 changes: 0 additions & 11 deletions sonic-pcied/tests/test_DaemonPcied.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,22 +158,18 @@ def test_check_pcie_devices(self):
@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock())
def test_update_pcie_devices_status_db(self):
daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)
daemon_pcied.status_table = mock.MagicMock()
daemon_pcied.log_info = mock.MagicMock()
daemon_pcied.log_error = mock.MagicMock()

# test for pass resultInfo
daemon_pcied.update_pcie_devices_status_db(0)
assert daemon_pcied.status_table.set.call_count == 1
assert daemon_pcied.log_info.call_count == 1
assert daemon_pcied.log_error.call_count == 0

daemon_pcied.status_table.set.reset_mock()
daemon_pcied.log_info.reset_mock()

# test for resultInfo with 1 device failed to detect
daemon_pcied.update_pcie_devices_status_db(1)
assert daemon_pcied.status_table.set.call_count == 1
assert daemon_pcied.log_info.call_count == 0
assert daemon_pcied.log_error.call_count == 1

Expand All @@ -182,28 +178,24 @@ def test_update_pcie_devices_status_db(self):
@mock.patch('pcied.read_id_file')
def test_check_n_update_pcie_aer_stats(self, mock_read):
daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)
daemon_pcied.device_table = mock.MagicMock()
daemon_pcied.update_aer_to_statedb = mock.MagicMock()
pcied.platform_pcieutil.get_pcie_aer_stats = mock.MagicMock()

mock_read.return_value = None
daemon_pcied.check_n_update_pcie_aer_stats(0,1,0)
assert daemon_pcied.update_aer_to_statedb.call_count == 0
assert daemon_pcied.device_table.set.call_count == 0
assert pcied.platform_pcieutil.get_pcie_aer_stats.call_count == 0

mock_read.return_value = '1714'
daemon_pcied.check_n_update_pcie_aer_stats(0,1,0)
assert daemon_pcied.update_aer_to_statedb.call_count == 1
assert daemon_pcied.device_table.set.call_count == 1
assert pcied.platform_pcieutil.get_pcie_aer_stats.call_count == 1


@mock.patch('pcied.load_platform_pcieutil', mock.MagicMock())
def test_update_aer_to_statedb(self):
daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)
daemon_pcied.log_debug = mock.MagicMock()
daemon_pcied.device_table = mock.MagicMock()
daemon_pcied.device_name = mock.MagicMock()
daemon_pcied.aer_stats = pcie_aer_stats_no_err

Expand All @@ -220,6 +212,3 @@ def test_update_aer_to_statedb(self):

daemon_pcied.update_aer_to_statedb()
assert daemon_pcied.log_debug.call_count == 0
assert daemon_pcied.device_table.set.call_count == 1

daemon_pcied.device_table.set.reset_mock()
24 changes: 19 additions & 5 deletions sonic-psud/scripts/psud
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ class DaemonPsud(daemon_base.DaemonBase):
self.psu_tbl = None
self.psu_chassis_info = None
self.first_run = True
self.psu_threshold_exceeded_logged = False

global platform_psuutil
global platform_chassis
Expand Down Expand Up @@ -458,6 +459,7 @@ class DaemonPsud(daemon_base.DaemonBase):
if not platform_chassis:
return

self.psu_threshold_exceeded_logged = False
for index, psu in enumerate(platform_chassis.get_all_psus()):
try:
self._update_single_psu_data(index + 1, psu)
Expand Down Expand Up @@ -535,25 +537,37 @@ class DaemonPsud(daemon_base.DaemonBase):
power_warning_suppress_threshold = try_get(psu.get_psu_power_warning_suppress_threshold, NOT_AVAILABLE)
power_critical_threshold = try_get(psu.get_psu_power_critical_threshold, NOT_AVAILABLE)
if psu_status.check_psu_power_threshold:
# Calculate total power
system_power = float(power)
for _, other_psu in enumerate(platform_chassis.get_all_psus()):
if other_psu is psu:
# Skip the current PSU
continue
power_str = try_get(other_psu.get_power, NOT_AVAILABLE)
if power_str != NOT_AVAILABLE:
system_power += float(power_str)

if power_warning_suppress_threshold == NOT_AVAILABLE or power_critical_threshold == NOT_AVAILABLE:
self.log_error("PSU power thresholds become invalid: threshold {} critical threshold {}".format(power_warning_suppress_threshold, power_critical_threshold))
psu_status.check_psu_power_threshold = False
psu_status.power_exceeded_threshold = False
elif psu_status.power_exceeded_threshold:
# The falling threshold is the warning threshold
if power < power_warning_suppress_threshold:
if system_power < power_warning_suppress_threshold:
# Clear alarm
power_exceeded_threshold = False
else:
# The rising threshold is the critical threshold
if power >= power_critical_threshold:
if system_power >= power_critical_threshold:
# Raise alarm
power_exceeded_threshold = True

if psu_status.set_power_exceed_threshold(power_exceeded_threshold):
if psu_status.set_power_exceed_threshold(power_exceeded_threshold) and not self.psu_threshold_exceeded_logged:
# Since this is a system level PSU power exceeding check, we do not need to log it for each PSU
log_on_status_changed(self, not psu_status.power_exceeded_threshold,
'PSU power warning cleared: {} power {} is back to normal.'.format(name, power),
'PSU power warning: {} power {} exceeds critical threshold {}.'.format(name, power, power_critical_threshold))
'PSU power warning cleared: system power {} is back to normal, below the warning suppress threshold {}.'.format(system_power, power_warning_suppress_threshold),
'PSU power warning: system power {} exceeds the critical threshold {}.'.format(system_power, power_critical_threshold))
self.psu_threshold_exceeded_logged = True

if presence and psu_status.set_voltage(voltage, voltage_high_threshold, voltage_low_threshold):
set_led = True
Expand Down
21 changes: 12 additions & 9 deletions sonic-psud/tests/test_DaemonPsud.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,19 +188,22 @@ def test_power_threshold(self):
psu = MockPsu('PSU 1', 0, True, 'Fake Model', '12345678', '1234')
psud.platform_chassis = MockChassis()
psud.platform_chassis._psu_list.append(psu)
another_psu = MockPsu('PSU 2', 0, True, 'Fake Model', '12345678', '1234')
another_psu.set_power(10.0)
psud.platform_chassis._psu_list.append(another_psu)

daemon_psud = psud.DaemonPsud(SYSLOG_IDENTIFIER)

daemon_psud.psu_tbl = mock.MagicMock()
psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=120.0)
psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(return_value=110.0)
psu.get_psu_power_critical_threshold = mock.MagicMock(return_value=130.0)
psu.get_psu_power_warning_suppress_threshold = mock.MagicMock(return_value=120.0)

# Normal start. All good and all thresholds are supported
# Power is in normal range (below warning threshold)
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(100.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(100.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -213,7 +216,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(115.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(115.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -224,7 +227,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(125.0, 110.0, 120.0, True)
expected_fvp = self._construct_expected_fvp(125.0, 120.0, 130.0, True)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -235,7 +238,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(115.0, 110.0, 120.0, True)
expected_fvp = self._construct_expected_fvp(115.0, 120.0, 130.0, True)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -246,7 +249,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(105.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(105.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
daemon_psud._update_led_color()
Expand All @@ -257,7 +260,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(125.0, 110.0, 120.0, True)
expected_fvp = self._construct_expected_fvp(125.0, 120.0, 130.0, True)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand All @@ -268,7 +271,7 @@ def test_power_threshold(self):
daemon_psud._update_single_psu_data(1, psu)
assert daemon_psud.psu_status_dict[1].check_psu_power_threshold
assert not daemon_psud.psu_status_dict[1].power_exceeded_threshold
expected_fvp = self._construct_expected_fvp(105.0, 110.0, 120.0, False)
expected_fvp = self._construct_expected_fvp(105.0, 120.0, 130.0, False)
daemon_psud.psu_tbl.set.assert_called_with(psud.PSU_INFO_KEY_TEMPLATE.format(1), expected_fvp)
daemon_psud._update_led_color()
assert psu.STATUS_LED_COLOR_GREEN == psu.get_status_led()
Expand Down
79 changes: 30 additions & 49 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -119,57 +119,35 @@ class FanStatus(logger.Logger):
self.status = status
return True

def _check_speed_value_available(self, speed, target_speed, tolerance, current_status):
if speed == NOT_AVAILABLE or target_speed == NOT_AVAILABLE or tolerance == NOT_AVAILABLE:
if isinstance(tolerance, int) and (tolerance > 100 or tolerance < 0):
self.log_warning('Invalid tolerance value: {}'.format(tolerance))
return False

if current_status is True:
self.log_warning('Fan speed or target_speed or tolerance became unavailable, '
'speed={}, target_speed={}, tolerance={}'.format(speed, target_speed, tolerance))
return False
return True

def set_under_speed(self, speed, target_speed, tolerance):
def set_under_speed(self, is_under_speed):
"""
Set and cache Fan under speed status
:param speed: Fan speed
:param target_speed: Fan target speed
:param tolerance: Threshold between Fan speed and target speed
:param is_under_speed: Fan under speed threshold status
:return: True if status changed else False
"""
if not self._check_speed_value_available(speed, target_speed, tolerance, self.under_speed):
old_status = self.under_speed
self.under_speed = False
return old_status != self.under_speed
if is_under_speed == NOT_AVAILABLE:
if self.under_speed:
self.log_warning('Fan under speed threshold check became unavailable')
is_under_speed = False

status = speed < target_speed * (1 - float(tolerance) / 100)
if status == self.under_speed:
return False
old_status = self.under_speed
self.under_speed = is_under_speed
return old_status != self.under_speed

self.under_speed = status
return True

def set_over_speed(self, speed, target_speed, tolerance):
def set_over_speed(self, is_over_speed):
"""
Set and cache Fan over speed status
:param speed: Fan speed
:param target_speed: Fan target speed
:param tolerance: Threshold between Fan speed and target speed
:param is_over_speed: Fan over speed threshold status
:return: True if status changed else False
"""
if not self._check_speed_value_available(speed, target_speed, tolerance, self.over_speed):
old_status = self.over_speed
self.over_speed = False
return old_status != self.over_speed
if is_over_speed == NOT_AVAILABLE:
if self.over_speed:
self.log_warning('Fan over speed threshold check became unavailable')
is_over_speed = False

status = speed > target_speed * (1 + float(tolerance) / 100)
if status == self.over_speed:
return False

self.over_speed = status
return True
old_status = self.over_speed
self.over_speed = is_over_speed
return old_status != self.over_speed

def is_ok(self):
"""
Expand Down Expand Up @@ -315,16 +293,18 @@ class FanUpdater(logger.Logger):
fan_status = self.fan_status_dict[fan_name]

speed = NOT_AVAILABLE
speed_tolerance = NOT_AVAILABLE
speed_target = NOT_AVAILABLE
is_under_speed = NOT_AVAILABLE
is_over_speed = NOT_AVAILABLE
fan_fault_status = NOT_AVAILABLE
fan_direction = NOT_AVAILABLE
is_replaceable = try_get(fan.is_replaceable, False)
presence = try_get(fan.get_presence, False)
if presence:
speed = try_get(fan.get_speed)
speed_tolerance = try_get(fan.get_speed_tolerance)
speed_target = try_get(fan.get_target_speed)
is_under_speed = try_get(fan.is_under_speed)
is_over_speed = try_get(fan.is_over_speed)
fan_fault_status = try_get(fan.get_status, False)
fan_direction = try_get(fan.get_direction)

Expand All @@ -344,20 +324,20 @@ class FanUpdater(logger.Logger):
'Fan fault warning: {} is broken'.format(fan_name)
)

if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_under_speed(is_under_speed):
set_led = True
self._log_on_status_changed(not fan_status.under_speed,
'Fan low speed warning cleared: {} speed is back to normal'.format(fan_name),
'Fan low speed warning: {} current speed={}, target speed={}, tolerance={}'.
format(fan_name, speed, speed_target, speed_tolerance)
'Fan low speed warning: {} current speed={}, target speed={}'.
format(fan_name, speed, speed_target)
)

if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
if presence and fan_status.set_over_speed(is_over_speed):
set_led = True
self._log_on_status_changed(not fan_status.over_speed,
'Fan high speed warning cleared: {} speed is back to normal'.format(fan_name),
'Fan high speed warning: {} target speed={}, current speed={}, tolerance={}'.
format(fan_name, speed_target, speed, speed_tolerance)
'Fan high speed warning: {} current speed={}, target speed={}'.
format(fan_name, speed, speed_target)
)

# We don't set PSU led here, PSU led will be handled in psud
Expand All @@ -376,8 +356,9 @@ class FanUpdater(logger.Logger):
('status', str(fan_fault_status)),
('direction', str(fan_direction)),
('speed', str(speed)),
('speed_tolerance', str(speed_tolerance)),
('speed_target', str(speed_target)),
('is_under_speed', str(is_under_speed)),
('is_over_speed', str(is_over_speed)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])
Expand Down
Loading

0 comments on commit 784b9e2

Please sign in to comment.