diff --git a/cfgmgr/buffer_pool_mellanox.lua b/cfgmgr/buffer_pool_mellanox.lua
index ee48fe0403f..15bb81efb21 100644
--- a/cfgmgr/buffer_pool_mellanox.lua
+++ b/cfgmgr/buffer_pool_mellanox.lua
@@ -133,7 +133,7 @@ local function iterate_profile_list(all_items)
     return 0
 end
 
-local function fetch_buffer_pool_size_from_appldb()
+local function fetch_buffer_pool_size_from_appldb(shp_enabled)
     local buffer_pools = {}
     redis.call('SELECT', config_db)
     local buffer_pool_keys = redis.call('KEYS', 'BUFFER_POOL|*')
@@ -158,7 +158,18 @@ local function fetch_buffer_pool_size_from_appldb()
         end
         xoff = redis.call('HGET', 'BUFFER_POOL_TABLE:' .. buffer_pools[i], 'xoff')
         if not xoff then
-            table.insert(result, buffer_pools[i] .. ':' .. size)
+            if shp_enabled and size == "0" and buffer_pools[i] == "ingress_lossless_pool" then
+                -- During initialization, if SHP is enabled:
+                -- 1. the buffer pool sizes and xoff have been initialized to 0, meaning the shared headroom pool is disabled
+                -- 2. but the buffer profiles already indicate that the shared headroom pool is enabled
+                -- 3. later on, the buffer pool sizes are updated with a non-zero xoff
+                -- If the orchagent starts handling the buffer configuration between steps 2 and 3,
+                -- the buffer pools and profiles are inconsistent, which fails the Mellanox SAI sanity check.
+                -- To avoid that, report the shared headroom pool as enabled by returning very small buffer pool and shared headroom pool sizes.
+                table.insert(result, buffer_pools[i] .. ':2048:1024')
+            else
+                table.insert(result, buffer_pools[i] .. ':' .. size)
+            end
         else
             table.insert(result, buffer_pools[i] .. ':' .. size .. ':' .. xoff)
         end
@@ -295,7 +306,7 @@ local fail_count = 0
 fail_count = fail_count + iterate_all_items(all_pgs, true)
 fail_count = fail_count + iterate_all_items(all_tcs, false)
 if fail_count > 0 then
-    fetch_buffer_pool_size_from_appldb()
+    fetch_buffer_pool_size_from_appldb(shp_enabled)
     return result
 end
 
@@ -305,7 +316,7 @@ local all_egress_profile_lists = redis.call('KEYS', 'BUFFER_PORT_EGRESS_PROFILE_
 fail_count = fail_count + iterate_profile_list(all_ingress_profile_lists)
 fail_count = fail_count + iterate_profile_list(all_egress_profile_lists)
 if fail_count > 0 then
-    fetch_buffer_pool_size_from_appldb()
+    fetch_buffer_pool_size_from_appldb(shp_enabled)
     return result
 end
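For context, the plugin returns one `<pool>:<size>[:<xoff>]` string per buffer pool, and the presence of the `xoff` field is what signals that the shared headroom pool is in use. Below is a minimal sketch of how a consumer could parse these result lines and recognize the `2048:1024` placeholder emitted by the new fallback path; `parse_pool_result` is a hypothetical helper for illustration, not part of this PR.

```python
from typing import NamedTuple, Optional

class PoolSize(NamedTuple):
    name: str
    size: int
    xoff: Optional[int]  # a present xoff means SHP is enabled for this pool

def parse_pool_result(line: str) -> PoolSize:
    """Parse one '<pool>:<size>[:<xoff>]' line returned by the plugin."""
    parts = line.split(':')
    if len(parts) == 3:
        return PoolSize(parts[0], int(parts[1]), int(parts[2]))
    return PoolSize(parts[0], int(parts[1]), None)

# The fallback emits this placeholder when SHP is enabled but the real
# pool sizes have not been rendered into APPL_DB yet:
pool = parse_pool_result("ingress_lossless_pool:2048:1024")
assert pool.size == 2048 and pool.xoff == 1024
```

The exact placeholder values are arbitrary small numbers; what matters is that both the pool size and the shared headroom size are non-zero, so pools and profiles stay consistent until the real sizes are computed.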
diff --git a/tests/test_buffer_dynamic.py b/tests/test_buffer_dynamic.py
index 49d36b357ca..1813ebf4309 100644
--- a/tests/test_buffer_dynamic.py
+++ b/tests/test_buffer_dynamic.py
@@ -865,3 +865,41 @@ def test_bufferPortMaxParameter(self, dvs, testlog):
         dvs.port_admin_set('Ethernet0', 'down')
 
         self.cleanup_db(dvs)
+
+
+    def test_bufferPoolInitWithSHP(self, dvs, testlog):
+        self.setup_db(dvs)
+
+        try:
+            # 1. Enable the shared headroom pool
+            default_lossless_buffer_parameter = self.config_db.get_entry('DEFAULT_LOSSLESS_BUFFER_PARAMETER', 'AZURE')
+            default_lossless_buffer_parameter['over_subscribe_ratio'] = '2'
+            self.config_db.update_entry('DEFAULT_LOSSLESS_BUFFER_PARAMETER', 'AZURE', default_lossless_buffer_parameter)
+
+            # 2. Stop the orchagent
+            _, oa_pid = dvs.runcmd("pgrep orchagent")
+            dvs.runcmd("kill -s SIGSTOP {}".format(oa_pid))
+
+            # 3. Remove the size and xoff fields from BUFFER_POOL|ingress_lossless_pool in the CONFIG_DB
+            original_ingress_lossless_pool = self.config_db.get_entry('BUFFER_POOL', 'ingress_lossless_pool')
+            try:
+                self.config_db.delete_field('BUFFER_POOL', 'ingress_lossless_pool', 'size')
+                self.config_db.delete_field('BUFFER_POOL', 'ingress_lossless_pool', 'xoff')
+            except Exception:
+                pass
+
+            # 4. Remove the ingress_lossless_pool from the APPL_DB
+            self.app_db.delete_entry('BUFFER_POOL_TABLE', 'ingress_lossless_pool')
+
+            # 5. Mock a pending PG update by adding a "TABLE_SET" entry, which triggers the fallback logic
+            self.app_db.update_entry("BUFFER_PG_TABLE_SET", "", {"NULL": "NULL"})
+
+            # 6. Invoke the Lua plugin and check the placeholder sizes
+            _, output = dvs.runcmd("redis-cli --eval /usr/share/swss/buffer_pool_vs.lua")
+            assert "ingress_lossless_pool:2048:1024" in output
+
+        finally:
+            self.config_db.update_entry('BUFFER_POOL', 'ingress_lossless_pool', original_ingress_lossless_pool)
+            self.config_db.delete_entry('DEFAULT_LOSSLESS_BUFFER_PARAMETER', 'AZURE')
+            self.app_db.delete_entry("BUFFER_PG_TABLE_SET", "")
+            dvs.runcmd("kill -s SIGCONT {}".format(oa_pid))
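One note on the test's process handling: freezing orchagent with SIGSTOP before mutating the databases is what opens the race window deterministically, and resuming it with SIGCONT in the `finally` block keeps the DVS usable for later tests even when an assertion fails. A standalone sketch of the same pattern, assuming a local `orchagent` process exists (illustrative only, not part of this PR):

```python
import os
import signal
import subprocess

# Find the daemon's PID, as the test does via dvs.runcmd("pgrep orchagent").
pid = int(subprocess.check_output(["pgrep", "orchagent"]).split()[0])

os.kill(pid, signal.SIGSTOP)        # freeze the daemon so it cannot consume updates
try:
    # ... mutate CONFIG_DB / APPL_DB and invoke the Lua plugin here ...
    pass
finally:
    os.kill(pid, signal.SIGCONT)    # always resume, even if an assertion failed
```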