From ff0cc80a77092afb45efa638e18a3dc44e3d70dc Mon Sep 17 00:00:00 2001 From: Longxiang Lyu <35479537+lolyu@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:36:20 +0800 Subject: [PATCH] [active-standby] Fix the oscillation logic (#261) Approach What is the motivation for this PR? Fix the unexpected toggle introduced by the oscillation logic. Signed-off-by: Longxiang Lyu lolv@microsoft.com Work item tracking Microsoft ADO (number only): 28397786 How did you do it? The oscillation is introduced to allow the active side to toggle to standby if no heartbeat is received. The workflow is described as the following: (wait, active, up) ---> set oscillation timer [1] ... (wait, active, up) ---> (wait, wait, up) [2] (wait, wait, up) ---> (wait, active, up) ... (wait, active, up) <--- oscillation timer expires, toggle to standby [3] [1]: the ToR enters (wait, active, up), no heartbeats received, oscillation timer is set. [2]: the ToR consistently probes the mux status, and transits between (wait, active, up) and (wait, wait, up). [3]: when the oscillation timer expires and the ToR is (wait, active, up), make the toggle to standby request. We need to ensure that, the ToR is only allowed to transits between (wait, active, up) and (wait, wait, up) during [2]. So any link prober active/standby, mux standby, or link down events should cancel the oscillation. How did you verify/test it? UT Any platform specific information? Documentation --- .../LinkManagerStateMachineActiveStandby.cpp | 31 +++++++++++++-- .../LinkManagerStateMachineActiveStandby.h | 10 +++++ test/LinkManagerStateMachineTest.cpp | 39 ++++++++++++++++++- 3 files changed, 75 insertions(+), 5 deletions(-) diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp index e7c668c..9045bd4 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.cpp +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.cpp @@ -61,8 +61,6 @@ ActiveStandbyStateMachine::ActiveStandbyStateMachine( mMuxStateMachine.setWaitStateCause(mux_state::WaitState::WaitStateCause::SwssUpdate); mMuxPortPtr->setMuxLinkmgrState(mLabel); initializeTransitionFunctionTable(); - - mOscillationTimer.expires_from_now(boost::posix_time::seconds(1)); } // @@ -476,12 +474,16 @@ void ActiveStandbyStateMachine::handleStateChange(LinkProberEvent &event, link_p mMuxPortPtr->postLinkProberMetricsEvent(link_manager::ActiveStandbyStateMachine::LinkProberMetrics::LinkProberActiveStart); mStandbyUnknownUpCount = 0; + + tryCancelOscillationTimerIfAlive(); } if (state == link_prober::LinkProberState::Label::Standby) { mMuxPortPtr->postLinkProberMetricsEvent(link_manager::ActiveStandbyStateMachine::LinkProberMetrics::LinkProberStandbyStart); mActiveUnknownUpCount = 0; + + tryCancelOscillationTimerIfAlive(); } CompositeState nextState = mCompositeState; @@ -542,6 +544,10 @@ void ActiveStandbyStateMachine::handleStateChange(MuxStateEvent &event, mux_stat mStandbyUnknownUpCount = 0; } + if (state == mux_state::MuxState::Label::Standby) { + tryCancelOscillationTimerIfAlive(); + } + updateMuxLinkmgrState(); } @@ -582,6 +588,8 @@ void ActiveStandbyStateMachine::handleStateChange(LinkStateEvent &event, link_st mWaitActiveUpCount = 0; mWaitStandbyUpBackoffFactor = 1; mUnknownActiveUpBackoffFactor = 1; + + tryCancelOscillationTimerIfAlive(); } else { mStateTransitionHandler[ps(nextState)][ms(nextState)][ls(nextState)](nextState); } @@ -1064,6 +1072,8 @@ void ActiveStandbyStateMachine::handleMuxWaitTimeout(boost::system::error_code e void ActiveStandbyStateMachine::startOscillationTimer() { // Note: This timer is started when Mux state is active and link prober is in wait state. + MUXLOGINFO(boost::format("%s: start the oscillation timer") % mMuxPortConfig.getPortName()); + mOscillationTimerAlive = true; mOscillationTimer.expires_from_now(boost::posix_time::seconds( mMuxPortConfig.getOscillationInterval_sec() )); @@ -1074,6 +1084,20 @@ void ActiveStandbyStateMachine::startOscillationTimer() ))); } +// +// ---> tryCancelOscillationTimerIfAlive(); +// +// cancel the oscillation timer if it is alive +// +void ActiveStandbyStateMachine::tryCancelOscillationTimerIfAlive() +{ + if (mOscillationTimerAlive) { + MUXLOGINFO(boost::format("%s: cancel the oscillation timer") % mMuxPortConfig.getPortName()); + mOscillationTimerAlive = false; + mOscillationTimer.cancel(); + } +} + // // ---> handleOscillationTimeout(boost::system::error_code errorCode); // @@ -1083,6 +1107,7 @@ void ActiveStandbyStateMachine::handleOscillationTimeout(boost::system::error_co { MUXLOGDEBUG(mMuxPortConfig.getPortName()); + mOscillationTimerAlive = false; if (mMuxPortConfig.getIfOscillationEnabled() && errorCode == boost::system::errc::success && ps(mCompositeState) == link_prober::LinkProberState::Label::Wait && @@ -1318,7 +1343,7 @@ void ActiveStandbyStateMachine::LinkProberWaitMuxActiveLinkUpTransitionFunction( mSuspendTxFnPtr(mMuxPortConfig.getLinkWaitTimeout_msec()); } - if (mOscillationTimer.expires_at() < boost::posix_time::microsec_clock::local_time()) { + if (!mOscillationTimerAlive) { startOscillationTimer(); } } diff --git a/src/link_manager/LinkManagerStateMachineActiveStandby.h b/src/link_manager/LinkManagerStateMachineActiveStandby.h index 147b952..c1b1d58 100644 --- a/src/link_manager/LinkManagerStateMachineActiveStandby.h +++ b/src/link_manager/LinkManagerStateMachineActiveStandby.h @@ -464,6 +464,15 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase, */ void startOscillationTimer(); + /** + *@method tryCancelOscillationTimerIfAlive + * + *@brief cancel the oscillation timer if it is alive + * + *@return none + */ + inline void tryCancelOscillationTimerIfAlive(); + /** *@method handleOscillationTimeout * @@ -859,6 +868,7 @@ class ActiveStandbyStateMachine: public LinkManagerStateMachineBase, boost::asio::deadline_timer mDeadlineTimer; boost::asio::deadline_timer mWaitTimer; boost::asio::deadline_timer mOscillationTimer; + bool mOscillationTimerAlive = false; boost::function mInitializeProberFnPtr; boost::function mStartProbingFnPtr; diff --git a/test/LinkManagerStateMachineTest.cpp b/test/LinkManagerStateMachineTest.cpp index b611c9c..64001a2 100644 --- a/test/LinkManagerStateMachineTest.cpp +++ b/test/LinkManagerStateMachineTest.cpp @@ -21,6 +21,9 @@ * Author: Tamer Ahmed */ +#include +#include + #include "LinkManagerStateMachineTest.h" #include "link_prober/LinkProberStateMachineBase.h" #include "common/MuxLogger.h" @@ -50,7 +53,7 @@ LinkManagerStateMachineTest::LinkManagerStateMachineTest() : mMuxConfig.setPositiveStateChangeRetryCount(mPositiveUpdateCount); mMuxConfig.setMuxStateChangeRetryCount(mPositiveUpdateCount); mMuxConfig.setLinkStateChangeRetryCount(mPositiveUpdateCount); - mMuxConfig.setOscillationInterval_sec(1,true); + mMuxConfig.setOscillationInterval_sec(2, true); } void LinkManagerStateMachineTest::runIoService(uint32_t count) @@ -1468,7 +1471,7 @@ TEST_F(LinkManagerStateMachineTest, TimedOscillation) handleMuxState("active", 3); VALIDATE_STATE(Wait, Active, Up); - runIoService(2); + runIoService(1); VALIDATE_STATE(Wait, Wait, Up); handleProbeMuxState("active", 3); @@ -1481,6 +1484,38 @@ TEST_F(LinkManagerStateMachineTest, TimedOscillation) mMuxConfig.setTimeoutIpv4_msec(10); } +TEST_F(LinkManagerStateMachineTest, TimedOscillationMuxProbeStandbyCancel) +{ + setMuxStandby(); + + // set icmp timeout to be 500ms otherwise it will probe mux state endlessly and get no chance to do timed oscillation + mMuxConfig.setTimeoutIpv4_msec(500); + + postLinkProberEvent(link_prober::LinkProberState::Unknown, 2); + VALIDATE_STATE(Wait, Wait, Up); + EXPECT_EQ(mDbInterfacePtr->mSetMuxStateInvokeCount, 1); + EXPECT_EQ(mDbInterfacePtr->mLastPostedSwitchCause, link_manager::ActiveStandbyStateMachine::SwitchCause::PeerHeartbeatMissing); + + // swss notification + handleMuxState("active", 3); + VALIDATE_STATE(Wait, Active, Up); + + runIoService(1); + VALIDATE_STATE(Wait, Wait, Up); + + handleProbeMuxState("standby", 3); + VALIDATE_STATE(Wait, Standby, Up); + + boost::this_thread::sleep(boost::posix_time::seconds(2)); + + // 3 mux probe active notifiations + 1 oscillation timeout + 1 mux probe timeout + handleProbeMuxState("active", 5); + VALIDATE_STATE(Wait, Active, Up); + EXPECT_EQ(mDbInterfacePtr->mLastPostedSwitchCause, link_manager::ActiveStandbyStateMachine::SwitchCause::PeerHeartbeatMissing); + + mMuxConfig.setTimeoutIpv4_msec(10); +} + TEST_F(LinkManagerStateMachineTest, OrchagentRollback) { setMuxStandby();