From bbca9819a8e96ed6d1e69548d7c82acd74de91c5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Mon, 31 Oct 2022 21:46:41 +0000 Subject: [PATCH 1/8] Added PCIe transaction check for all peripherals on the bus This is to determine if there are any unreachable devices on the PCIe bus. The gold list of PCI peripherals is a static file that varies depending on the platform. BDF information is parsed from this YAML file. Transcations are then carried out to determine the health of each peripheral. --- sonic-pcied/scripts/pcied | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index f88f159d1..75937faad 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -9,6 +9,7 @@ import os import signal import sys import threading +import subprocess from sonic_py_common import daemon_base, device_info, logger from swsscommon import swsscommon @@ -157,6 +158,17 @@ class DaemonPcied(daemon_base.DaemonBase): if self.resultInfo is None: return + for result in self.resultInfo: + bus = int(result["bus"], 16) + device = int(result["dev"], 16) + func = int(result["fn"], 16) + + pcieDeviceArgs = ("%02x:%02x.%d" % (bus, device, func)) + pcieDeviceQueryResult = subprocess.run(['lspci', '-s', pcieDeviceArgs], stdout=subprocess.PIPE).stdout.decode('utf-8') + + if pcieDeviceQueryResult is None: + self.log_error("PCIe Peripheral {} -- Not Found".format(pcieDeviceQueryResult)) + for result in self.resultInfo: if result["result"] == "Failed": self.log_warning("PCIe Device: " + result["name"] + " Not Found") From 5fd0686252dd10f357e1080b2ce6c0b6fe3d9929 Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Fri, 11 Nov 2022 21:00:46 +0000 Subject: [PATCH 2/8] Modified code to read pcie devices from the pre-configured YAML file Modified cmd to be executed to setpci (does pci transaction) Added exception handling --- sonic-pcied/scripts/pcied | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index 75937faad..d1e7cf3b0 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -33,6 +33,7 @@ PCIED_MAIN_THREAD_SLEEP_SECS = 60 PCIEUTIL_CONF_FILE_ERROR = 1 PCIEUTIL_LOAD_ERROR = 2 +PLATFORM_PCIE_YAML_FILE = "/usr/share/sonic/platform/pcie.yaml" platform_pcieutil = None @@ -158,16 +159,28 @@ class DaemonPcied(daemon_base.DaemonBase): if self.resultInfo is None: return - for result in self.resultInfo: - bus = int(result["bus"], 16) - device = int(result["dev"], 16) - func = int(result["fn"], 16) - - pcieDeviceArgs = ("%02x:%02x.%d" % (bus, device, func)) - pcieDeviceQueryResult = subprocess.run(['lspci', '-s', pcieDeviceArgs], stdout=subprocess.PIPE).stdout.decode('utf-8') - - if pcieDeviceQueryResult is None: - self.log_error("PCIe Peripheral {} -- Not Found".format(pcieDeviceQueryResult)) + try: + stream = open(PLATFORM_PCIE_YAML_FILE, 'r') + dictionary = yaml.safe_load(stream) + for elem in dictionary: + bus = int(elem.get('bus'), 16) + device = int(elem.get('dev'), 16) + func = int(elem.get('fn'), 16) + pcieDeviceID = elem.get('id') + + cmd = ("setpci -s %02x:%02x.%d 2.w" % (bus, device, func)) + output, errs = subprocess.Popen(cmd, shell=True, executable='/bin/bash', stdout=subprocess.PIPE).communicate() + if errs: + raise ValueError("subprocess command {} returned {}".format(cmd, errs)) + pcieDeviceQueryResult = output.decode('utf8').rstrip() + + if pcieDeviceQueryResult is None or "No devices selected" in pcieDeviceQueryResult: + self.log_error("PCIe Peripheral {} -- Not Found".format(pcieDeviceQueryResult)) + elif pcieDeviceQueryResult != pcieDeviceID: + self.log_error("PCIe device %02x:%02x.%d ID DOES NOT match preconfigured YAML file" % (bus, device, func)) + except Exception as ex: + self.log_error("PCIe Exception occurred: {}".format(ex)) + pass for result in self.resultInfo: if result["result"] == "Failed": From 407193220157b94c196a592375d9748981bf529c Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Wed, 8 Mar 2023 03:49:50 +0000 Subject: [PATCH 3/8] Formalized syslog error messages Added specific check for ASIC missing to differentiate from device mismatch Added yaml support --- sonic-pcied/scripts/pcied | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index d1e7cf3b0..9f939f54e 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -10,6 +10,7 @@ import signal import sys import threading import subprocess +import yaml from sonic_py_common import daemon_base, device_info, logger from swsscommon import swsscommon @@ -177,7 +178,10 @@ class DaemonPcied(daemon_base.DaemonBase): if pcieDeviceQueryResult is None or "No devices selected" in pcieDeviceQueryResult: self.log_error("PCIe Peripheral {} -- Not Found".format(pcieDeviceQueryResult)) elif pcieDeviceQueryResult != pcieDeviceID: - self.log_error("PCIe device %02x:%02x.%d ID DOES NOT match preconfigured YAML file" % (bus, device, func)) + if pcieDeviceQueryResult == 'ffff': + self.log_error("PCIe device {:02x}:{:02x}.{:01d} missing.".format(bus, device, func)) + else: + self.log_error("PCIe device {:02x}:{:02x}.{:01d} ID mismatch. Expected {}, received {}".format(bus, device, func, pcieDeviceID, pcieDeviceQueryResult)) except Exception as ex: self.log_error("PCIe Exception occurred: {}".format(ex)) pass From 29bb6970c9595e557eb1da6b5011f4bb31060b68 Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Fri, 10 Mar 2023 21:52:41 +0000 Subject: [PATCH 4/8] Made changes to the way setpci command is called per SA review comments --- sonic-pcied/scripts/pcied | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index 9f939f54e..1416e90f7 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -169,10 +169,9 @@ class DaemonPcied(daemon_base.DaemonBase): func = int(elem.get('fn'), 16) pcieDeviceID = elem.get('id') - cmd = ("setpci -s %02x:%02x.%d 2.w" % (bus, device, func)) - output, errs = subprocess.Popen(cmd, shell=True, executable='/bin/bash', stdout=subprocess.PIPE).communicate() - if errs: - raise ValueError("subprocess command {} returned {}".format(cmd, errs)) + pcie_device = ("{:02x}:{:02x}.{:01d}".format(bus, device, func)) + cmd = ["setpci", "-s", pcie_device, "2.w"] + output = subprocess.check_output(cmd) pcieDeviceQueryResult = output.decode('utf8').rstrip() if pcieDeviceQueryResult is None or "No devices selected" in pcieDeviceQueryResult: From 5e01bccf2d751929cb3a92dfacb9ae9e48a2bdca Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Fri, 17 Mar 2023 01:20:07 +0000 Subject: [PATCH 5/8] Added comments per prgeor review comments --- sonic-pcied/scripts/pcied | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index 1416e90f7..c24df5460 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -162,20 +162,29 @@ class DaemonPcied(daemon_base.DaemonBase): try: stream = open(PLATFORM_PCIE_YAML_FILE, 'r') + + # Load contents of the PCIe YAML file into a dictionary object dictionary = yaml.safe_load(stream) + + # Iterate through each element in dict; extract the BDF information and device ID for that element for elem in dictionary: bus = int(elem.get('bus'), 16) device = int(elem.get('dev'), 16) func = int(elem.get('fn'), 16) pcieDeviceID = elem.get('id') + # Form the PCI device address from the BDF information above pcie_device = ("{:02x}:{:02x}.{:01d}".format(bus, device, func)) + + # Form and run the command to read the device ID from the PCI device cmd = ["setpci", "-s", pcie_device, "2.w"] output = subprocess.check_output(cmd) pcieDeviceQueryResult = output.decode('utf8').rstrip() + # verify that the queried device ID matches what we have on file for the current PCI device + # If not, take appropriate action based on the queried value if pcieDeviceQueryResult is None or "No devices selected" in pcieDeviceQueryResult: - self.log_error("PCIe Peripheral {} -- Not Found".format(pcieDeviceQueryResult)) + self.log_error("Invalid PCIe Peripheral {:02x}:{:02x}.{:01d} - {}".format(bus, device, func, pcieDeviceQueryResult)) elif pcieDeviceQueryResult != pcieDeviceID: if pcieDeviceQueryResult == 'ffff': self.log_error("PCIe device {:02x}:{:02x}.{:01d} missing.".format(bus, device, func)) From e2e698de2115451c4c4d8ee80c5c9fa445435c4c Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Mon, 20 Mar 2023 18:06:14 +0000 Subject: [PATCH 6/8] Added relevant comments per prgeor review comments --- sonic-pcied/scripts/pcied | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index c24df5460..b07ca1a87 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -160,6 +160,12 @@ class DaemonPcied(daemon_base.DaemonBase): if self.resultInfo is None: return + # This block reads the PCIE YAML file, iterates through each device and + # for each device runs a transaction check to verify that the device ID + # matches the one on file. In the event of an invalid device ID such as + # '0xffff', it marks the device as missing as that is the only scenario + # in which such a value is returned. + try: stream = open(PLATFORM_PCIE_YAML_FILE, 'r') @@ -191,7 +197,7 @@ class DaemonPcied(daemon_base.DaemonBase): else: self.log_error("PCIe device {:02x}:{:02x}.{:01d} ID mismatch. Expected {}, received {}".format(bus, device, func, pcieDeviceID, pcieDeviceQueryResult)) except Exception as ex: - self.log_error("PCIe Exception occurred: {}".format(ex)) + self.log_error("PCIe Exception occurred for PCIe device {:02x}:{:02x}.{:01d}: {}".format(bus, device, func, ex)) pass for result in self.resultInfo: From dc854a81ec793795a25a1595c3915683ffe25908 Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Mon, 20 Mar 2023 23:36:19 +0000 Subject: [PATCH 7/8] Fixed a bug that was causing UnboundLocalError. --- sonic-pcied/scripts/pcied | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sonic-pcied/scripts/pcied b/sonic-pcied/scripts/pcied index b07ca1a87..e42748eca 100644 --- a/sonic-pcied/scripts/pcied +++ b/sonic-pcied/scripts/pcied @@ -166,6 +166,11 @@ class DaemonPcied(daemon_base.DaemonBase): # '0xffff', it marks the device as missing as that is the only scenario # in which such a value is returned. + # Default, invalid BDF values to begin + bus = None + device = None + func = None + try: stream = open(PLATFORM_PCIE_YAML_FILE, 'r') @@ -196,8 +201,13 @@ class DaemonPcied(daemon_base.DaemonBase): self.log_error("PCIe device {:02x}:{:02x}.{:01d} missing.".format(bus, device, func)) else: self.log_error("PCIe device {:02x}:{:02x}.{:01d} ID mismatch. Expected {}, received {}".format(bus, device, func, pcieDeviceID, pcieDeviceQueryResult)) + except Exception as ex: - self.log_error("PCIe Exception occurred for PCIe device {:02x}:{:02x}.{:01d}: {}".format(bus, device, func, ex)) + # Verify that none of the BDF objects are None. If condition evals to False, do not mention BDF information in error log + if not [i for i in (bus, device, func) if i is None]: + self.log_error("PCIe Exception occurred for PCIe device {:02x}:{:02x}.{:01d}: {}".format(bus, device, func, ex)) + else: + self.log_error("PCIe Exception occurred: {}".format(ex)) pass for result in self.resultInfo: From 5727c9f6258146356802bc24ee6b1beac2e4e3f2 Mon Sep 17 00:00:00 2001 From: Ashwin Srinivasan Date: Sat, 24 Jun 2023 03:32:18 +0000 Subject: [PATCH 8/8] Added unit tests for the check_pcie_devices method --- sonic-pcied/tests/test_DaemonPcied.py | 138 ++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 7 deletions(-) diff --git a/sonic-pcied/tests/test_DaemonPcied.py b/sonic-pcied/tests/test_DaemonPcied.py index 2c3c953e7..cfcbf33eb 100644 --- a/sonic-pcied/tests/test_DaemonPcied.py +++ b/sonic-pcied/tests/test_DaemonPcied.py @@ -59,18 +59,23 @@ pcie_check_result_no = [] -pcie_check_result_pass = \ -""" -[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, +pcie_check_result_pass = [{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, {'bus': '00', 'dev': '02', 'fn': '0', 'id': '1f11', 'name': 'PCI B', 'result': 'Passed'}, {'bus': '00', 'dev': '03', 'fn': '0', 'id': '1f12', 'name': 'PCI C', 'result': 'Passed'}] -""" -pcie_check_result_fail = \ -""" -[{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, + +pcie_check_result_fail = [{'bus': '00', 'dev': '01', 'fn': '0', 'id': '1f10', 'name': 'PCI A', 'result': 'Passed'}, {'bus': '00', 'dev': '02', 'fn': '0', 'id': '1f11', 'name': 'PCI B', 'result': 'Passed'}, {'bus': '00', 'dev': '03', 'fn': '0', 'id': '1f12', 'name': 'PCI C', 'result': 'Failed'}] + + +TEST_PLATFORM_PCIE_YAML_FILE = \ +""" +- bus: '00' + dev: '01' + fn: '0' + id: '9170' + name: 'PCI A' """ class TestDaemonPcied(object): @@ -143,6 +148,125 @@ def test_run(self): daemon_pcied.run() assert daemon_pcied.check_pcie_devices.call_count == 1 + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_yaml_file_open_error(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock() + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_result_fail(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(return_value=pcie_check_result_fail) + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_result_pass(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(return_value=pcie_check_result_pass) + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_result_none(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + pcied.platform_pcieutil.get_pcie_check = mock.MagicMock(return_value=None) + + daemon_pcied.check_pcie_devices() + + assert pcied.platform_pcieutil.get_pcie_check.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_happy(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "9170" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_mismatch(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "0123" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_missing_device(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "ffff" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) + def test_check_pcie_devices_load_yaml_invalid_device(self): + daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER) + + with mock.patch('builtins.open', new_callable=mock.mock_open, read_data=TEST_PLATFORM_PCIE_YAML_FILE) as mock_fd: + + class MockOutput: + def decode(self, encodingType): + return self + def rstrip(self): + return "No devices selected" + + mock_output = MockOutput() + with mock.patch('subprocess.check_output', mock.MagicMock()) as mock_check_output: + mock_check_output.return_value = mock_output + + daemon_pcied.check_pcie_devices() + + assert mock_check_output.called + + + @mock.patch('pcied.load_platform_pcieutil', mock.MagicMock()) def test_check_pcie_devices(self): daemon_pcied = pcied.DaemonPcied(SYSLOG_IDENTIFIER)