From d73808cead2aaf386ae033d9d571882c92f0377c Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan <93744978+assrinivasan@users.noreply.github.com> Date: Wed, 12 Jul 2023 10:58:39 -0700 Subject: [PATCH] Added PCIe transaction check for all peripherals on the bus (#331) * Added PCIe transaction check for all peripherals on the bus This is to determine if there are any unreachable devices on the PCIe bus. The gold list of PCI peripherals is a static file that varies depending on the platform. BDF information is parsed from this YAML file. Transcations are then carried out to determine the health of each peripheral. * Modified code to read pcie devices from the pre-configured YAML file Modified cmd to be executed to setpci (does pci transaction) Added exception handling * Formalized syslog error messages Added specific check for ASIC missing to differentiate from device mismatch Added yaml support * Made changes to the way setpci command is called per SA review comments * Added comments per prgeor review comments * Added relevant comments per prgeor review comments * Fixed a bug that was causing UnboundLocalError. * Added unit tests for the check_pcie_devices method --- sonic-pcied/scripts/pcied | 53 ++++++++++ sonic-pcied/tests/test_DaemonPcied.py | 138 ++++++++++++++++++++++++-- 2 files changed, 184 insertions(+), 7 deletions(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index f88f159d1..e42748eca 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -9,6 +9,8 @@ import os import signal import sys import threading +import subprocess +import yaml from sonic_py_common import daemon_base, device_info, logger from swsscommon import swsscommon @@ -32,6 +34,7 @@ PCIED_MAIN_THREAD_SLEEP_SECS = 60 PCIEUTIL_CONF_FILE_ERROR = 1 PCIEUTIL_LOAD_ERROR = 2 +PLATFORM_PCIE_YAML_FILE = "/usr/share/sonic/platform/pcie.yaml" platform_pcieutil = None @@ -157,6 +160,56 @@ class DaemonPcied(daemon_base.DaemonBase): if self.resultInfo is None: return + # This block reads the PCIE YAML file, iterates through each device and + # for each device runs a transaction check to verify that the device ID + # matches the one on file. In the event of an invalid device ID such as + # '0xffff', it marks the device as missing as that is the only scenario + # in which such a value is returned. + + # Default, invalid BDF values to begin + bus = None + device = None + func = None + + try: + stream = open(PLATFORM_PCIE_YAML_FILE, 'r') + + # Load contents of the PCIe YAML file into a dictionary object + dictionary = yaml.safe_load(stream) + + # Iterate through each element in dict; extract the BDF information and device ID for that element + for elem in dictionary: + bus = int(elem.get('bus'), 16) + device = int(elem.get('dev'), 16) + func = int(elem.get('fn'), 16) + pcieDeviceID = elem.get('id') + + # Form the PCI device address from the BDF information above + pcie_device = ("{:02x}:{:02x}.{:01d}".format(bus, device, func)) + + # Form and run the command to read the device ID from the PCI device + cmd = ["setpci", "-s", pcie_device, "2.w"] + output = subprocess.check_output(cmd) + pcieDeviceQueryResult = output.decode('utf8').rstrip() + + # verify that the queried device ID matches what we have on file for the current PCI device + # If not, take appropriate action based on the queried value + if pcieDeviceQueryResult is None or "No devices selected" in pcieDeviceQueryResult: + self.log_error("Invalid PCIe Peripheral {:02x}:{:02x}.{:01d} - {}".format(bus, device, func, pcieDeviceQueryResult)) + elif pcieDeviceQueryResult != pcieDeviceID: + if pcieDeviceQueryResult == 'ffff': + self.log_error("PCIe device {:02x}:{:02x}.{:01d} missing.".format(bus, device, func)) + else: + self.log_error("PCIe device {:02x}:{:02x}.{:01d} ID mismatch. Expected {}, received {}".format(bus, device, func, pcieDeviceID, pcieDeviceQueryResult)) + + except Exception as ex: + # Verify that none of the BDF objects are None. If condition evals to False, do not mention BDF information in error log + if not [i for i in (bus, device, func) if i is None]: + self.log_error("PCIe Exception occurred for PCIe device {:02x}:{:02x}.{:01d}: {}".format(bus, device, func, ex)) + else: + self.log_error("PCIe Exception occurred: {}".format(ex)) + pass + for result in self.resultInfo: if result["result"] == "Failed": self.log_warning("PCIe Device: " + result["name"] + " Not Found") diff --git a/sonic-pcied/tests/test_DaemonPcied.py b/sonic-pcied/tests/test_DaemonPcied.py index 2c3c953e7..cfcbf33eb 100644 --- a/sonic-pcied/tests/test_DaemonPcied.py +++ b/sonic-pcied/tests/test_DaemonPcied.py @@ -59,18 +59,23 @@ pcie_check_result_no = [] -pcie_check_result_pass = \ -""" -[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, +pcie_check_result_pass = [{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, {'bus': '00', 'dev': '02', 'fn': '0', 'id': '1f11', 'name': 'PCI B', 'result': 'Passed'}, {'bus': '00', 'dev': '03', 'fn': '0', 'id': '1f12', 'name': 'PCI C', 'result': 'Passed'}] -""" -pcie_check_result_fail = \ -""" -[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, + +pcie_check_result_fail = [{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, {'bus': '00', 'dev': '02', 'fn': '0', 'id': '1f11', 'name': 'PCI B', 'result': 'Passed'}, {'bus': '00', 'dev': '03', 'fn': '0', 'id': '1f12', 'name': 'PCI C', 'result': 'Failed'}] + + +TEST_PLATFORM_PCIE_YAML_FILE = \ +""" +- bus: '00' + dev: '01' + fn: '0' + id: '9170' + name: 'PCI A' """ class TestDaemonPcied(object): @@ -143,6 +148,125 @@ def test_run(self): daemon_pcied.run() assert daemon_pcied.check_pcie_devices.call_count == 1 + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_yaml_file_open_error(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock() + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_result_fail(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(return_value=pcie_check_result_fail) + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_result_pass(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(return_value=pcie_check_result_pass) + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_result_none(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(return_value=None) + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_happy(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "9170" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_mismatch(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "0123" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_missing_device(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "ffff" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_invalid_device(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "No devices selected" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) def test_check_pcie_devices(self): daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)