Skip to content

Commit

Permalink
[Mellanox] Fix inconsistence in the shared headroom pool initializati…
Browse files Browse the repository at this point in the history
…on (sonic-net#3057)

* Fix inconsistency in the shared headroom pool initialization

* Why I did it

During initialization, if SHP is enabled

1. The buffer pool sizes and xoff have been initialized to 0, which means SHP is disabled
2. But the buffer profiles already indicate SHP
3. Later on, the buffer pool sizes are updated with xoff being non-zero
In case the orchagent starts handling buffer configuration between steps 2 and 3, the buffer pools and profiles are inconsistent with each other, which fails the Mellanox SAI sanity check.
To avoid this, SHP is indicated as enabled by setting very small (non-zero) buffer pool and SHP sizes.
  • Loading branch information
stephenxs authored and cscarpitta committed Apr 5, 2024
1 parent 2763f85 commit 0ba40c4
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 4 deletions.
19 changes: 15 additions & 4 deletions cfgmgr/buffer_pool_mellanox.lua
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ local function iterate_profile_list(all_items)
return 0
end

local function fetch_buffer_pool_size_from_appldb()
local function fetch_buffer_pool_size_from_appldb(shp_enabled)
local buffer_pools = {}
redis.call('SELECT', config_db)
local buffer_pool_keys = redis.call('KEYS', 'BUFFER_POOL|*')
Expand All @@ -158,7 +158,18 @@ local function fetch_buffer_pool_size_from_appldb()
end
xoff = redis.call('HGET', 'BUFFER_POOL_TABLE:' .. buffer_pools[i], 'xoff')
if not xoff then
table.insert(result, buffer_pools[i] .. ':' .. size)
if shp_enabled and size == "0" and buffer_pools[i] == "ingress_lossless_pool" then
-- During initialization, if SHP is enabled
-- 1. the buffer pool sizes, xoff have initialized to 0, which means the shared headroom pool is disabled
-- 2. but the buffer profiles already indicate the shared headroom pool is enabled
-- 3. later on the buffer pool sizes are updated with xoff being non-zero
-- In case the orchagent starts handling buffer configuration between 2 and 3,
-- It is inconsistent between buffer pools and profiles, which fails Mellanox SAI sanity check
-- To avoid it, it indicates the shared headroom pool is enabled by setting a very small buffer pool and shared headroom pool sizes
table.insert(result, buffer_pools[i] .. ':2048:1024')
else
table.insert(result, buffer_pools[i] .. ':' .. size)
end
else
table.insert(result, buffer_pools[i] .. ':' .. size .. ':' .. xoff)
end
Expand Down Expand Up @@ -295,7 +306,7 @@ local fail_count = 0
fail_count = fail_count + iterate_all_items(all_pgs, true)
fail_count = fail_count + iterate_all_items(all_tcs, false)
if fail_count > 0 then
fetch_buffer_pool_size_from_appldb()
fetch_buffer_pool_size_from_appldb(shp_enabled)
return result
end

Expand All @@ -305,7 +316,7 @@ local all_egress_profile_lists = redis.call('KEYS', 'BUFFER_PORT_EGRESS_PROFILE_
fail_count = fail_count + iterate_profile_list(all_ingress_profile_lists)
fail_count = fail_count + iterate_profile_list(all_egress_profile_lists)
if fail_count > 0 then
fetch_buffer_pool_size_from_appldb()
fetch_buffer_pool_size_from_appldb(shp_enabled)
return result
end

Expand Down
38 changes: 38 additions & 0 deletions tests/test_buffer_dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,3 +865,41 @@ def test_bufferPortMaxParameter(self, dvs, testlog):
dvs.port_admin_set('Ethernet0', 'down')

self.cleanup_db(dvs)


def test_bufferPoolInitWithSHP(self, dvs, testlog):
    """Verify the buffer-pool Lua plugin's fallback path when SHP is enabled.

    When the shared headroom pool (SHP) is enabled but the ingress lossless
    pool has no size/xoff yet (early initialization), the plugin must report
    small non-zero pool/SHP sizes ("ingress_lossless_pool:2048:1024") instead
    of zeros, so buffer pools and profiles stay consistent for the SAI sanity
    check.

    Steps: enable SHP, freeze orchagent so it cannot consume the config,
    strip size/xoff from CONFIG_DB and APPL_DB, trigger the plugin's
    APPL_DB fallback via a *_TABLE_SET marker, and run the plugin directly.
    Cleanup always restores the original configuration and resumes orchagent.
    """
    self.setup_db(dvs)

    # Pre-bind names referenced in the finally block so that a failure in
    # an early step does not raise NameError during cleanup (which would
    # mask the real error and leave orchagent stopped).
    oa_pid = None
    original_ingress_lossless_pool = None

    try:
        # 1. Enable the shared headroom pool via the over-subscribe ratio.
        default_lossless_buffer_parameter = self.config_db.get_entry('DEFAULT_LOSSLESS_BUFFER_PARAMETER', 'AZURE')
        default_lossless_buffer_parameter['over_subscribe_ratio'] = '2'
        self.config_db.update_entry('DEFAULT_LOSSLESS_BUFFER_PARAMETER', 'AZURE', default_lossless_buffer_parameter)

        # 2. Suspend orchagent so it does not process the buffer changes
        #    while we stage the inconsistent initialization state.
        _, oa_pid = dvs.runcmd("pgrep orchagent")
        oa_pid = oa_pid.strip()
        dvs.runcmd("kill -s SIGSTOP {}".format(oa_pid))

        # 3. Remove size/xoff from CONFIG_DB|BUFFER_POOL.ingress_lossless_pool
        #    (save the original entry first so cleanup can restore it).
        original_ingress_lossless_pool = self.config_db.get_entry('BUFFER_POOL', 'ingress_lossless_pool')
        try:
            self.config_db.delete_field('BUFFER_POOL', 'ingress_lossless_pool', 'size')
            self.config_db.delete_field('BUFFER_POOL', 'ingress_lossless_pool', 'xoff')
        except Exception:
            # Best effort: the fields may not exist in this image's default
            # configuration; their absence is exactly the state we want.
            pass

        # 4. Remove the ingress_lossless_pool from the APPL_DB so the plugin
        #    sees no previously-published size.
        self.app_db.delete_entry('BUFFER_POOL_TABLE', 'ingress_lossless_pool')

        # 5. Add a "TABLE_SET" marker entry to trigger the plugin's fallback
        #    (fetch_buffer_pool_size_from_appldb) logic.
        self.app_db.update_entry("BUFFER_PG_TABLE_SET", "", {"NULL": "NULL"})

        # 6. Invoke the Lua plugin directly and check that it reports the
        #    small non-zero pool:xoff pair instead of zero sizes.
        _, output = dvs.runcmd("redis-cli --eval /usr/share/swss/buffer_pool_vs.lua")
        assert "ingress_lossless_pool:2048:1024" in output

    finally:
        # Restore state even on failure; guard against early-step failures
        # where the saved entry or orchagent pid was never captured.
        if original_ingress_lossless_pool is not None:
            self.config_db.update_entry('BUFFER_POOL', 'ingress_lossless_pool', original_ingress_lossless_pool)
        self.config_db.delete_entry('DEFAULT_LOSSLESS_BUFFER_PARAMETER', 'AZURE')
        self.app_db.delete_entry("BUFFER_PG_TABLE_SET", "")
        if oa_pid is not None:
            dvs.runcmd("kill -s SIGCONT {}".format(oa_pid))

0 comments on commit 0ba40c4

Please sign in to comment.