Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Reboot test] refactoring reboot test and add continuous reboot test #1503

Merged
merged 3 commits into from
Mar 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 104 additions & 11 deletions tests/common/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,59 @@
SONIC_SSH_PORT = 22
SONIC_SSH_REGEX = 'OpenSSH_[\\w\\.]+ Debian'

# map reboot type -> reboot command
reboot_commands =\
{
'cold': 'reboot',
'fast': 'fast-reboot',
'warm': 'warm-reboot',
# Reboot type identifiers. The first five are used as keys of reboot_ctrl_dict.
REBOOT_TYPE_WARM = "warm"
REBOOT_TYPE_COLD = "cold"
REBOOT_TYPE_FAST = "fast"
REBOOT_TYPE_POWEROFF = "power off"
REBOOT_TYPE_WATCHDOG = "watchdog"
# Returned by get_reboot_cause() when no known cause pattern matches.
REBOOT_TYPE_UNKNOWN = "Unknown"

'''
command : command to reboot the DUT
timeout : timeout waiting for DUT to come back after reboot
wait : time to wait for the switch to stabilize
cause : search string to determine reboot cause
test_reboot_cause_only : indicate if the purpose of test is for reboot cause only
'''
# Per-reboot-type control data, keyed by the REBOOT_TYPE_* constants.
# reboot() falls back to the "timeout" and "wait" values here when the caller
# passes 0 for them, and get_reboot_cause() uses "cause" as a regular
# expression searched for in the output of "show reboot-cause".
reboot_ctrl_dict = {
    # No "command" entry: reboot() performs a power-off reboot through the
    # caller-supplied reboot_helper instead of running a command on the DUT.
    REBOOT_TYPE_POWEROFF: {
        "timeout": 300,
        "wait": 120,
        "cause": "Power Loss",
        "test_reboot_cause_only": True
    },
    REBOOT_TYPE_COLD: {
        "command": "reboot",
        "timeout": 300,
        "wait": 120,
        # The single quotes are part of the pattern.
        # NOTE(review): presumably this keeps the cold pattern from also
        # matching "fast-reboot" / "warm-reboot" output -- confirm.
        "cause": "'reboot'",
        "test_reboot_cause_only": False
    },
    REBOOT_TYPE_FAST: {
        "command": "fast-reboot",
        "timeout": 180,
        "wait": 120,
        "cause": "fast-reboot",
        "test_reboot_cause_only": False
    },
    REBOOT_TYPE_WARM: {
        "command": "warm-reboot",
        "timeout": 210,
        "wait": 90,
        "cause": "warm-reboot",
        "test_reboot_cause_only": False
    },
    REBOOT_TYPE_WATCHDOG: {
        # Arms the platform watchdog via the sonic_platform API and exits.
        # NOTE(review): the argument 5 is presumably a timeout in seconds -- confirm.
        "command": "python -c \"import sonic_platform.platform as P; P.Platform().get_chassis().get_watchdog().arm(5); exit()\"",
        "timeout": 300,
        "wait": 120,
        "cause": "Watchdog",
        "test_reboot_cause_only": True
    }
}


def reboot(duthost, localhost, reboot_type='cold', delay=10, timeout=180, wait=120):
def reboot(duthost, localhost, reboot_type='cold', delay=10, timeout=0, wait=0, reboot_helper=None, reboot_kwargs=None):
"""
reboots DUT
:param duthost: DUT host object
Expand All @@ -27,6 +70,8 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10, timeout=180, wait=1
:param delay: delay between ssh availability checks
:param timeout: timeout for waiting ssh port state change
:param wait: time to wait for DUT to initialize
:param reboot_helper: helper function to execute the power toggling
:param reboot_kwargs: arguments to pass to the reboot_helper
:return:
"""

Expand All @@ -35,15 +80,30 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10, timeout=180, wait=1
dut_ip = duthost.setup()['ansible_facts']['ansible_eth0']['ipv4']['address']

try:
reboot_command = reboot_commands[reboot_type]
reboot_ctrl = reboot_ctrl_dict[reboot_type]
reboot_command = reboot_ctrl['command'] if reboot_type != REBOOT_TYPE_POWEROFF else None
if timeout == 0:
timeout = reboot_ctrl['timeout']
if wait == 0:
wait = reboot_ctrl['wait']
except KeyError:
raise ValueError('invalid reboot type: "{}"'.format(reboot_type))

def execute_reboot():
def execute_reboot_command():
logger.info('rebooting with command "{}"'.format(reboot_command))
return duthost.command(reboot_command)

reboot_res = pool.apply_async(execute_reboot)
def execute_reboot_helper():
logger.info('rebooting with helper "{}"'.format(reboot_helper))
return reboot_helper(reboot_kwargs)

dut_datetime = duthost.get_now_time()

if reboot_type != REBOOT_TYPE_POWEROFF:
reboot_res = pool.apply_async(execute_reboot_command)
else:
assert reboot_helper is not None, "A reboot function must be provided for power off reboot"
reboot_res = pool.apply_async(execute_reboot_helper)

logger.info('waiting for ssh to drop')
res = localhost.wait_for(host=dut_ip,
Expand Down Expand Up @@ -82,15 +142,17 @@ def execute_reboot():
logger.info('waiting for warmboot-finalizer service to finish')
res = duthost.command('systemctl is-active warmboot-finalizer.service',module_ignore_errors=True)
finalizer_state = res['stdout'].strip()
logger.info('warmboot finalizer service state {}'.format(finalizer_state))
assert finalizer_state == 'activating'
count = 0
while finalizer_state == 'activating':
try:
res = duthost.command('systemctl is-active warmboot-finalizer.service')
res = duthost.command('systemctl is-active warmboot-finalizer.service',module_ignore_errors=True)
except AnsibleModuleException as err:
res = err.module_result

finalizer_state = res['stdout'].strip()
logger.info('warmboot finalizer service state {}'.format(finalizer_state))
time.sleep(delay)
if count * delay > timeout:
raise Exception('warmboot-finalizer.service did not finish')
Expand All @@ -100,3 +162,34 @@ def execute_reboot():
logger.info('{} reboot finished'.format(reboot_type))

pool.terminate()

dut_uptime = duthost.get_up_time()
logger.info('DUT up since {}'.format(dut_uptime))
assert float(dut_uptime.strftime("%s")) - float(dut_datetime.strftime("%s")) > 10, "Device did not reboot"


def get_reboot_cause(dut):
    """
    @summary: Map the output of "show reboot-cause" on the DUT to a reboot type.
    @param dut: The AnsibleHost object of DUT.
    @return: The key of the first reboot_ctrl_dict entry whose "cause" pattern
             is found in the command output, or REBOOT_TYPE_UNKNOWN if none is.
    """
    logging.info('Getting reboot cause from dut {}'.format(dut.hostname))
    cause_text = dut.shell('show reboot-cause')['stdout']

    # Lazy generator: patterns are tried in dict iteration order and the
    # search stops at the first hit.
    matching_types = (
        reboot_type
        for reboot_type, ctrl in reboot_ctrl_dict.items()
        if re.search(ctrl['cause'], cause_text)
    )
    return next(matching_types, REBOOT_TYPE_UNKNOWN)


def check_reboot_cause(dut, reboot_cause_expected):
    """
    @summary: Compare the DUT's last reboot cause with the expected one.
              Returns a boolean, so it can be polled with wait_until.
    @param dut: The AnsibleHost object of DUT.
    @param reboot_cause_expected: The expected reboot cause (a reboot type as
                                  returned by get_reboot_cause).
    @return: True if the detected reboot cause equals the expected one.
    """
    detected_cause = get_reboot_cause(dut)
    logging.debug("dut {} last reboot-cause {}".format(dut.hostname, detected_cause))
    return detected_cause == reboot_cause_expected
12 changes: 0 additions & 12 deletions tests/platform/platform_fixtures.py

This file was deleted.

117 changes: 21 additions & 96 deletions tests/platform/test_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,56 +17,20 @@

import pytest

from platform_fixtures import conn_graph_facts
from common.fixtures.conn_graph_facts import conn_graph_facts
from common.utilities import wait_until
from common.reboot import *
from common.platform.interface_utils import check_interface_information
from common.platform.transceiver_utils import check_transceiver_basic

from check_critical_services import check_critical_services
from check_transceiver_status import check_transceiver_basic
from check_daemon_status import check_pmon_daemon_status
from check_all_interface_info import check_interface_information

pytestmark = [pytest.mark.disable_loganalyzer]

MAX_WAIT_TIME_FOR_INTERFACES = 300
MAX_WAIT_TIME_FOR_REBOOT_CAUSE = 120

REBOOT_TYPE_WARM = "warm"
REBOOT_TYPE_COLD = "cold"
REBOOT_TYPE_FAST = "fast"
REBOOT_TYPE_POWEROFF = "power off"
REBOOT_TYPE_WATCHDOG = "watchdog"

# Per-reboot-type control data, keyed by the REBOOT_TYPE_* constants.
#   command                : command run on the DUT to trigger the reboot
#   timeout                : timeout used while waiting for the DUT to come back
#   cause                  : regular expression searched for in "show reboot-cause"
#   test_reboot_cause_only : True if only the reboot cause is verified afterwards
reboot_ctrl_dict = {
    # No "command" entry: a power-off reboot is triggered through the
    # reboot_helper passed to reboot_and_check().
    REBOOT_TYPE_POWEROFF: {
        "timeout": 300,
        "cause": "Power Loss",
        "test_reboot_cause_only": True
    },
    REBOOT_TYPE_COLD: {
        "command": "reboot",
        "timeout": 300,
        "cause": "reboot",
        "test_reboot_cause_only": False
    },
    REBOOT_TYPE_FAST: {
        "command": "fast-reboot",
        "timeout": 180,
        "cause": "fast-reboot",
        "test_reboot_cause_only": False
    },
    REBOOT_TYPE_WARM: {
        "command": "warm-reboot",
        "timeout": 180,
        "cause": "warm-reboot",
        "test_reboot_cause_only": False
    },
    REBOOT_TYPE_WATCHDOG: {
        # Arms the platform watchdog via the sonic_platform API and exits.
        "command": "python -c \"import sonic_platform.platform as P; P.Platform().get_chassis().get_watchdog().arm(5); exit()\"",
        "timeout": 300,
        "cause": "Watchdog",
        "test_reboot_cause_only": True
    }
}


@pytest.fixture(scope="module", autouse=True)
def teardown_module(duthost, conn_graph_facts):
Expand All @@ -78,21 +42,6 @@ def teardown_module(duthost, conn_graph_facts):
check_interfaces_and_services(duthost, interfaces)



def check_reboot_cause(dut, reboot_cause_expected):
    """
    @summary: Tell whether the reboot cause reported by the DUT matches the expected one.
    @param dut: The AnsibleHost object of DUT.
    @param reboot_cause_expected: Regular expression searched for in the
                                  output of "show reboot-cause".
    @return: True if the pattern is found in the output, False otherwise.
    """
    logging.info("Check the reboot cause")
    reported_cause = dut.shell("show reboot-cause")["stdout"]
    logging.debug("show reboot-cause returns {}".format(reported_cause))
    return re.search(reboot_cause_expected, reported_cause) is not None


def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, reboot_helper=None, reboot_kwargs=None):
"""
Perform the specified type of reboot and check platform status.
Expand All @@ -105,45 +54,11 @@ def reboot_and_check(localhost, dut, interfaces, reboot_type=REBOOT_TYPE_COLD, r
"""
logging.info("Run %s reboot on DUT" % reboot_type)

assert reboot_type in reboot_ctrl_dict.keys(), "Unknown reboot type %s" % reboot_type

reboot_timeout = reboot_ctrl_dict[reboot_type]["timeout"]
ansible_host = dut.host.options["inventory_manager"].get_host(dut.hostname).vars["ansible_host"]

dut_datetime = datetime.strptime(dut.command('date -u +"%Y-%m-%d %H:%M:%S"')["stdout"], "%Y-%m-%d %H:%M:%S")

if reboot_type == REBOOT_TYPE_POWEROFF:
assert reboot_helper is not None, "A reboot function must be provided for power off reboot"

reboot_helper(reboot_kwargs)

localhost.wait_for(host=ansible_host, port=22, state="stopped", search_regex="OpenSSH_[\\w\\.]+ Debian", delay=10, timeout=120)
else:
reboot_cmd = reboot_ctrl_dict[reboot_type]["command"]
reboot_task, reboot_res = dut.command(reboot_cmd, module_ignore_errors=True, module_async=True)

logging.info("Wait for DUT to go down")
res = localhost.wait_for(host=ansible_host, port=22, state="stopped", search_regex="OpenSSH_[\\w\\.]+ Debian", timeout=180, module_ignore_errors=True)
if "failed" in res:
try:
logging.error("Wait for switch down failed, try to kill any possible stuck reboot task")
pid = dut.command("pgrep -f '%s'" % reboot_cmd)["stdout"]
dut.command("kill -9 %s" % pid)
reboot_task.terminate()
logging.error("Result of command '%s': " + str(reboot_res.get(timeout=0)))
except Exception as e:
logging.error("Exception raised while cleanup reboot task and get result: " + repr(e))

logging.info("Wait for DUT to come back")
localhost.wait_for(host=ansible_host, port=22, state="started", search_regex="OpenSSH_[\\w\\.]+ Debian", delay=10, timeout=reboot_timeout)


logging.info("Check the uptime to verify whether reboot was performed")
dut_uptime = datetime.strptime(dut.command("uptime -s")["stdout"], "%Y-%m-%d %H:%M:%S")
assert float(dut_uptime.strftime("%s")) - float(dut_datetime.strftime("%s")) > 10, "Device did not reboot"
reboot(dut, localhost, reboot_type=reboot_type, reboot_helper=reboot_helper, reboot_kwargs=reboot_kwargs)

check_interfaces_and_services(dut, interfaces, reboot_type)


def check_interfaces_and_services(dut, interfaces, reboot_type = None):
"""
Perform a further check after reboot-cause, including transceiver status, interface status
Expand All @@ -156,9 +71,8 @@ def check_interfaces_and_services(dut, interfaces, reboot_type = None):

if reboot_type is not None:
logging.info("Check reboot cause")
reboot_cause = reboot_ctrl_dict[reboot_type]["cause"]
assert wait_until(MAX_WAIT_TIME_FOR_REBOOT_CAUSE, 20, check_reboot_cause, dut, reboot_cause), \
"got reboot-cause failed after rebooted by %s" % reboot_cause
assert wait_until(MAX_WAIT_TIME_FOR_REBOOT_CAUSE, 20, check_reboot_cause, dut, reboot_type), \
"got reboot-cause failed after rebooted by %s" % reboot_type

if reboot_ctrl_dict[reboot_type]["test_reboot_cause_only"]:
logging.info("Further checking skipped for %s test which intends to verify reboot-cause only" % reboot_type)
Expand Down Expand Up @@ -305,8 +219,19 @@ def test_watchdog_reboot(testbed_devices, conn_graph_facts):

test_watchdog_supported = "python -c \"import sonic_platform.platform as P; P.Platform().get_chassis().get_watchdog(); exit()\""

watchdog_supported = ans_host.command(test_watchdog_supported)["stderr"]
watchdog_supported = ans_host.command(test_watchdog_supported,module_ignore_errors=True)["stderr"]
if "" != watchdog_supported:
pytest.skip("Watchdog is not supported on this DUT, skip this test case")

reboot_and_check(localhost, ans_host, conn_graph_facts["device_conn"], REBOOT_TYPE_WATCHDOG)


def test_continuous_reboot(testbed_devices, conn_graph_facts):
    """
    @summary: Perform three cold reboots in a row, running reboot_and_check
              for each of them.
    """
    dut = testbed_devices["dut"]
    local = testbed_devices["localhost"]
    device_conn = conn_graph_facts["device_conn"]

    for _ in range(3):
        reboot_and_check(local, dut, device_conn, reboot_type=REBOOT_TYPE_COLD)