From 8bb9c5a7d370af8e71c9d2d9f50a8d3e23a2d063 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Tue, 19 Oct 2021 09:22:01 +0800 Subject: [PATCH] Add retry reading/setting mux status to simulated y-cable driver (#221) Description Add retry reading/setting mux status to simulated y-cable driver Motivation and Context When DUT is rebooted, xcvrd may call the simulated y-cable driver to get mux direction before the mgmt interface is up. The simulated y-cable driver needs to send an HTTP request to the mux simulator server to read mux status, so it depends on the mgmt interface being up. This could result in the errors below: Oct 16 03:15:44.029933 sonic-dut ERR pmon#xcvrd[34]: y_cable_port 1: GET http://192.168.1.33:8082/mux/vms21-6/0 for physical_port 1 failed with URLError(OSError(113, 'No route to host')) Oct 16 03:15:44.030306 sonic-dut ERR pmon#xcvrd[34]: Error: Could not establish the active side for Y cable port Ethernet0 to perform read_y_cable update state db This would cause other problems and may result in the same interface on both the upper ToR and the lower ToR remaining in the "standby" state. The fix is to add retry to the simulated y-cable driver for reading or setting mux status. The retry interval is 1 second. Retry timeout is 30 seconds. How Has This Been Tested? * The issue can be reliably reproduced on a 7260 dualtor testbed after running the test_acl::TestAclWithReboot cases. With this fix, the issue cannot be reproduced. * Tested the config mux mode active command with or without icmp responder. * Tested updating mux status by calling the mux simulator API to see if the new status is reflected to DUTs. 
Signed-off-by: Xin Wang --- sonic_y_cable/microsoft/y_cable_simulated.py | 125 ++++++++++++++----- 1 file changed, 91 insertions(+), 34 deletions(-) diff --git a/sonic_y_cable/microsoft/y_cable_simulated.py b/sonic_y_cable/microsoft/y_cable_simulated.py index 6586041a9249..77c6ded0d73c 100644 --- a/sonic_y_cable/microsoft/y_cable_simulated.py +++ b/sonic_y_cable/microsoft/y_cable_simulated.py @@ -9,6 +9,7 @@ import os import urllib.request import urllib.error +import time from sonic_py_common import device_info from portconfig import get_port_config @@ -32,6 +33,10 @@ class YCable(YCableBase): NIC_VOLTAGE = 5.0 LOCAL_VOLTAGE = 5.0 + POLL_TIMEOUT = 30 + POLL_INTERVAL = 1 + URLOPEN_TIMEOUT = 5 + def __init__(self, port, logger): YCableBase.__init__(self, port, logger) if not os.path.exists(self.MUX_SIMULATOR_CONFIG_FILE) or not os.path.isfile(self.MUX_SIMULATOR_CONFIG_FILE): @@ -85,22 +90,45 @@ def _get(self, url=None): else: get_url = self._url - try: + start_time = time.time() + attempt = 1 + while True: try: - req = urllib.request.Request(get_url) - with urllib.request.urlopen(req) as resp: - return json.loads(resp.read().decode('utf-8')) - except urllib.error.HTTPError as e: - self.log_error('GET {} for physical_port {} failed with {}, detail: {}'.format( + try: + req = urllib.request.Request(get_url) + with urllib.request.urlopen(req, timeout=self.URLOPEN_TIMEOUT) as resp: + return json.loads(resp.read().decode('utf-8')) + except urllib.error.HTTPError as e: + self.log_error('attempt={}, GET {} for physical_port {} failed with {}, detail: {}'.format( + attempt, + get_url, + self.port, + repr(e), + e.read())) + except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e: + self.log_error('attempt={}, GET {} for physical_port {} failed with {}'.format( + attempt, + get_url, + self.port, + repr(e))) + + # Retry in case of exception, to workaround 'no route to host' issue after pmon restart + if (time.time() - start_time) > self.POLL_TIMEOUT: + 
self.log_error('Retry GET {} for physical port {} timeout after {} seconds, attempted={}'.format( get_url, self.port, - repr(e), - e.read())) - except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e: - self.log_error('GET {} for physical_port {} failed with {}'.format( - get_url, - self.port, - repr(e))) + self.POLL_TIMEOUT, + attempt + )) + break + else: + self.log_notice('Sleep {} seconds to retry GET {} for physical port {}'.format( + self.POLL_INTERVAL, + get_url, + self.port + )) + attempt += 1 + time.sleep(self.POLL_INTERVAL) return None @@ -118,27 +146,52 @@ def _post(self, url=None, data=None): else: post_data = None - try: + start_time = time.time() + attempt = 1 + while True: try: - headers = {'Accept': 'application/json', 'Content-Type': 'application/json'} - req = urllib.request.Request(post_url, post_data, headers, method='POST') - with urllib.request.urlopen(req) as resp: - return json.loads(resp.read().decode('utf-8')) - except urllib.error.HTTPError as e: - self.log_error('POST {} with data {} for physical_port {} failed with {}, detail: {}'.format( - post_url, + try: + headers = {'Accept': 'application/json', 'Content-Type': 'application/json'} + req = urllib.request.Request(post_url, post_data, headers, method='POST') + with urllib.request.urlopen(req, timeout=self.URLOPEN_TIMEOUT) as resp: + return json.loads(resp.read().decode('utf-8')) + except urllib.error.HTTPError as e: + self.log_error('attempt={}, POST {} with data {} for physical_port {} failed with {}, detail: {}'.format( + attempt, + post_url, + post_data, + self.port, + repr(e), + e.read() + )) + except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e: + self.log_error('attempt={}, POST {} with data {} for physical_port {} failed with {}'.format( + attempt, + post_url, + post_data, + self.port, + repr(e) + )) + + # Retry in case of exception, to workaround 'no route to host' issue after pmon restart + if time.time() - start_time > 
self.POLL_TIMEOUT: + self.log_error('Retry POST {} with data {} for physical port {} timeout after {} seconds, attempted={}'.format( + post_url, post_data, self.port, - repr(e), - e.read() + self.POLL_TIMEOUT, + attempt )) - except (urllib.error.URLError, json.decoder.JSONDecodeError, Exception) as e: - self.log_error('POST {} with data {} for physical_port {} failed with {}'.format( - post_url, + break + else: + self.log_notice('Sleep {} seconds to retry POST {} with data {} for physical port {}'.format( + self.POLL_INTERVAL, + post_url, post_data, - self.port, - repr(e) + self.port )) + attempt += 1 + time.sleep(self.POLL_INTERVAL) return None @@ -244,13 +297,17 @@ def get_mux_direction(self): TARGET_UNKNOWN, if mux direction API fails. """ status = self._get_status() - if not status: + + if not isinstance(status, dict): return self.TARGET_UNKNOWN - if status['active_side'] == self.UPPER_TOR: - return self.TARGET_TOR_A - elif status['active_side'] == self.LOWER_TOR: - return self.TARGET_TOR_B + if 'active_side' in status: + if status['active_side'] == self.UPPER_TOR: + return self.TARGET_TOR_A + elif status['active_side'] == self.LOWER_TOR: + return self.TARGET_TOR_B + else: + return self.TARGET_UNKNOWN else: return self.TARGET_UNKNOWN @@ -1243,4 +1300,4 @@ def debug_dump_registers(self, option=None): which would help diagnose the cable for proper functioning """ - return {} \ No newline at end of file + return {}