Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[chassis][midplane] Modify the chassisd to log expected/unexpected midplane connectivity messages #480

Merged
merged 4 commits into from
May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 52 additions & 2 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ CHASSIS_MIDPLANE_INFO_ACCESS_FIELD = 'access'
CHASSIS_MODULE_HOSTNAME_TABLE = 'CHASSIS_MODULE_TABLE'
CHASSIS_MODULE_INFO_HOSTNAME_FIELD = 'hostname'

# Table (opened on the chassis state DB connection by ModuleUpdater) that
# tracks per-module reboot information, keyed by module name.
CHASSIS_MODULE_REBOOT_INFO_TABLE = 'CHASSIS_MODULE_REBOOT_INFO_TABLE'
# Field holding str(time.time()) recorded when an expected reboot makes the
# module lose midplane connectivity.
CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
# Field whose value "expected" marks the module's reboot as planned.
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'
# Default number of seconds a rebooting line card is given to restore
# midplane connectivity before a warning is logged.
DEFAULT_LINECARD_REBOOT_TIMEOUT = 180
# Optional KEY=VALUE file; a "linecard_reboot_timeout" entry in it overrides
# DEFAULT_LINECARD_REBOOT_TIMEOUT.
PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"

CHASSIS_INFO_UPDATE_PERIOD_SECS = 10
CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD = 30 # Minutes

Expand Down Expand Up @@ -198,9 +204,18 @@ class ModuleUpdater(logger.Logger):
CHASSIS_ASIC_INFO_TABLE)

self.hostname_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_HOSTNAME_TABLE)
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.down_modules = {}
self.chassis_app_db_clean_sha = None

self.linecard_reboot_timeout = DEFAULT_LINECARD_REBOOT_TIMEOUT
if os.path.isfile(PLATFORM_ENV_CONF_FILE):
with open(PLATFORM_ENV_CONF_FILE, 'r') as file:
for line in file:
field = line.split('=')[0].strip()
if field == "linecard_reboot_timeout":
self.linecard_reboot_timeout = int(line.split('=')[1].strip())

self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
if not self.midplane_initialized:
self.log_error("Chassisd midplane intialization failed")
Expand Down Expand Up @@ -362,6 +377,31 @@ class ModuleUpdater(logger.Logger):
else:
return False

def is_module_reboot_expected(self, key):
    """Tell whether the reboot-info table marks the module's reboot as expected.

    Args:
        key: Module name used as the row key in ``self.module_reboot_table``.

    Returns:
        True only when the row lookup succeeds and its ``reboot`` field
        equals ``"expected"``; False in every other case, including a
        missing row or a row without the ``reboot`` field.
    """
    fvs = self.module_reboot_table.get(key)
    # A usable lookup result is a list whose first element is the literal
    # True and whose last element is the field/value pairs.
    if not (isinstance(fvs, list) and fvs[0] is True):
        return False

    fields = dict(fvs[-1])
    # Use .get(): a row can exist without the 'reboot' field (for example
    # when only a timestamp was written), and that must not raise KeyError.
    return fields.get(CHASSIS_MODULE_REBOOT_REBOOT_FIELD) == "expected"

def module_reboot_set_time(self, key):
    """Write the current time into the module's reboot-info row.

    Args:
        key: Module name used as the row key in ``self.module_reboot_table``.
    """
    # The timestamp is stored as the string form of time.time().
    stamp = str(time.time())
    entry = swsscommon.FieldValuePairs(
        [(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, stamp)])
    self.module_reboot_table.set(key, entry)

def is_module_reboot_system_up_expired(self, key):
    """Check whether a rebooting module has exceeded its recovery window.

    Args:
        key: Module name used as the row key in ``self.module_reboot_table``.

    Returns:
        True when the row carries a timestamp that is at least
        ``self.linecard_reboot_timeout`` seconds old; in that case the row
        is also deleted. False otherwise (no usable row, no timestamp, or
        the window has not elapsed yet).
    """
    row = self.module_reboot_table.get(key)
    if not isinstance(row, list) or row[0] is not True:
        return False

    fields = dict(row[-1])
    if CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD not in fields:
        return False

    recorded_at = float(fields[CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD])
    elapsed = time.time() - recorded_at
    if elapsed >= self.linecard_reboot_timeout:
        # Drop the row once the timeout has been detected.
        self.module_reboot_table._del(key)
        return True
    return False

def check_midplane_reachability(self):
if not self.midplane_initialized:
return
Expand Down Expand Up @@ -395,10 +435,20 @@ class ModuleUpdater(logger.Logger):
current_midplane_state = fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]

if midplane_access is False and current_midplane_state == 'True':
self.log_warning("Module {} lost midplane connectivity".format(module_key))
if self.is_module_reboot_expected(module_key):
self.module_reboot_set_time(module_key)
self.log_warning("Expected: Module {} lost midplane connectivity".format(module_key))
else:
self.log_warning("Unexpected: Module {} lost midplane connectivity".format(module_key))
elif midplane_access is True and current_midplane_state == 'False':
self.log_notice("Module {} midplane connectivity is up".format(module_key))

# clean up the reboot_info_table
if self.module_reboot_table.get(module_key) is not None:
self.module_reboot_table._del(module_key)
elif midplane_access is False and current_midplane_state == 'False':
if self.is_module_reboot_system_up_expired(module_key):
self.log_warning("Unexpected: Module {} midplane connectivity is not restored in {} seconds".format(module_key, self.linecard_reboot_timeout))

# Update db with midplane information
fvs = swsscommon.FieldValuePairs([(CHASSIS_MIDPLANE_INFO_IP_FIELD, midplane_ip),
(CHASSIS_MIDPLANE_INFO_ACCESS_FIELD, str(midplane_access))])
Expand Down
124 changes: 124 additions & 0 deletions sonic-chassisd/tests/test_chassisd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import mock
from imp import load_source

from mock import Mock, MagicMock, patch
Expand Down Expand Up @@ -40,6 +41,10 @@
CHASSIS_ASIC_PCI_ADDRESS_FIELD = 'asic_pci_address'
CHASSIS_ASIC_ID_IN_MODULE_FIELD = 'asic_id_in_module'

CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'
PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"

def setup_function():
ModuleUpdater.log_notice = MagicMock()
ModuleUpdater.log_warning = MagicMock()
Expand Down Expand Up @@ -357,6 +362,125 @@ def test_midplane_presence_modules():
fvs = midplane_table.get(name)
assert fvs == None

builtin_open = open  # handle on the real open(), captured before patching


def mock_open(*args, **kwargs):
    """Replacement for ``open`` that fakes only the platform env file.

    Opening PLATFORM_ENV_CONF_FILE yields canned content that sets
    ``linecard_reboot_timeout`` to 240; every other path is passed to the
    real ``open`` unchanged.
    """
    if args[0] != PLATFORM_ENV_CONF_FILE:
        # Not the file under test: defer to the genuine implementation.
        return builtin_open(*args, **kwargs)

    fake_opener = mock.mock_open(read_data="dummy=1\nlinecard_reboot_timeout=240\n")
    return fake_opener(*args, **kwargs)

@patch("builtins.open", mock_open)
@patch('os.path.isfile', MagicMock(return_value=True))
def test_midplane_presence_modules_linecard_reboot():
    """Exercise midplane reachability tracking across line-card reboots.

    ``open`` and ``os.path.isfile`` are patched so that the platform env
    file appears to exist and sets ``linecard_reboot_timeout`` to 240.
    The test then toggles the line card's midplane reachability, with an
    "expected" reboot recorded in the reboot-info table, and checks after
    every poll that the midplane table mirrors the module's state.
    """
    chassis = MockChassis()

    # Supervisor
    index = 0
    name = "SUPERVISOR0"
    desc = "Supervisor card"
    slot = 16
    serial = "RP1000101"
    module_type = ModuleBase.MODULE_TYPE_SUPERVISOR
    supervisor = MockModule(index, name, desc, module_type, slot, serial)
    supervisor.set_midplane_ip()
    chassis.module_list.append(supervisor)

    # Linecard
    index = 1
    name = "LINE-CARD0"
    desc = "36 port 400G card"
    slot = 1
    serial = "LC1000101"
    module_type = ModuleBase.MODULE_TYPE_LINE
    module = MockModule(index, name, desc, module_type, slot, serial)
    module.set_midplane_ip()
    chassis.module_list.append(module)

    # Fabric-card
    index = 1
    name = "FABRIC-CARD0"
    desc = "Switch fabric card"
    slot = 17
    serial = "FC1000101"
    module_type = ModuleBase.MODULE_TYPE_FABRIC
    fabric = MockModule(index, name, desc, module_type, slot, serial)
    chassis.module_list.append(fabric)

    # Run on supervisor
    module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, slot,
                                   module.supervisor_slot)
    module_updater.supervisor_slot = supervisor.get_slot()
    module_updater.my_slot = supervisor.get_slot()
    module_updater.modules_num_update()
    module_updater.module_db_update()
    module_updater.check_midplane_reachability()

    midplane_table = module_updater.midplane_table
    # Check only one entry in database
    assert 1 == midplane_table.size()

    name = "LINE-CARD0"

    def assert_linecard_entry_in_sync():
        # The midplane table row for the line card must exist and carry the
        # module's current midplane IP and reachability.
        fvs = midplane_table.get(name)
        assert fvs is not None
        if isinstance(fvs, list):
            fvs = dict(fvs[-1])
        assert module.get_midplane_ip() == fvs[CHASSIS_MIDPLANE_INFO_IP_FIELD]
        assert str(module.is_midplane_reachable()) == fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]

    # Check fields in database
    assert_linecard_entry_in_sync()

    # Set access of line-card to Up (midplane connectivity is down initially)
    module.set_midplane_reachable(True)
    module_updater.check_midplane_reachability()
    assert_linecard_entry_in_sync()

    # Set access of line-card to Down (to mock midplane connectivity state change)
    module.set_midplane_reachable(False)
    # Set expected reboot of linecard
    module_reboot_table = module_updater.module_reboot_table
    linecard_fvs = swsscommon.FieldValuePairs([("reboot", "expected")])
    module_reboot_table.set(name, linecard_fvs)
    module_updater.check_midplane_reachability()
    assert_linecard_entry_in_sync()

    # Set access of line-card to up on time (to mock midplane connectivity state change)
    module.set_midplane_reachable(True)
    module_updater.check_midplane_reachability()
    assert_linecard_entry_in_sync()

    # Test linecard reboot midplane connectivity restored timeout:
    # set access of line-card to Down (to mock midplane connectivity state change)
    module.set_midplane_reachable(False)
    linecard_fvs = swsscommon.FieldValuePairs([("reboot", "expected")])
    module_reboot_table.set(name, linecard_fvs)
    module_updater.check_midplane_reachability()
    # Back-date the recorded timestamp by exactly the timeout so the next
    # poll sees the recovery window as elapsed.
    time_now = time.time() - module_updater.linecard_reboot_timeout
    linecard_fvs = swsscommon.FieldValuePairs([(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, str(time_now))])
    module_reboot_table.set(name, linecard_fvs)
    module_updater.check_midplane_reachability()
    assert_linecard_entry_in_sync()

    # The timeout must come from the mocked platform env file.
    assert module_updater.linecard_reboot_timeout == 240

def test_midplane_presence_supervisor():
chassis = MockChassis()

Expand Down
Loading