Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Auto-Techsupport] Issues related to Multiple Cores crashing handled #1948

Merged
merged 6 commits into from
Dec 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions scripts/coredump_gen_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import time
import argparse
import syslog
import re
from swsscommon.swsscommon import SonicV2Connector
from utilities_common.auto_techsupport_helper import *

Expand Down Expand Up @@ -54,7 +55,6 @@ def __init__(self, core_name, container_name, db):
self.db = db
self.proc_mp = {}
self.core_ts_map = {}
self.curr_ts_list = []

def handle_core_dump_creation_event(self):
file_path = os.path.join(CORE_DUMP_DIR, self.core_name)
Expand Down Expand Up @@ -93,7 +93,7 @@ def handle_core_dump_creation_event(self):
since_cfg = self.get_since_arg()
new_file = self.invoke_ts_cmd(since_cfg)
if new_file:
self.write_to_state_db(int(time.time()), new_file[0])
self.write_to_state_db(int(time.time()), new_file)

def write_to_state_db(self, timestamp, ts_dump):
name = strip_ts_ext(ts_dump)
Expand All @@ -111,26 +111,33 @@ def get_since_arg(self):
return since_cfg
return SINCE_DEFAULT

def parse_ts_dump_name(self, ts_stdout):
""" Figure out the ts_dump name from the techsupport stdout """
matches = re.findall(TS_PTRN, ts_stdout)
if matches:
return matches[-1]
syslog.syslog(syslog.LOG_ERR, "stdout of the 'show techsupport' cmd doesn't have the dump name")
return ""

def invoke_ts_cmd(self, since_cfg):
since_cfg = "'" + since_cfg + "'"
cmd = " ".join(["show", "techsupport", "--since", since_cfg])
rc, _, stderr = subprocess_exec(["show", "techsupport", "--since", since_cfg], env=ENV_VAR)
cmd_opts = ["show", "techsupport", "--silent", "--since", since_cfg]
cmd = " ".join(cmd_opts)
rc, stdout, stderr = subprocess_exec(cmd_opts, env=ENV_VAR)
if not rc:
syslog.syslog(syslog.LOG_ERR, "show techsupport failed with exit code {}, stderr:{}".format(rc, stderr))
new_list = get_ts_dumps(True)
diff = list(set(new_list).difference(set(self.curr_ts_list)))
self.curr_ts_list = new_list
if not diff:
new_dump = self.parse_ts_dump_name(stdout)
if not new_dump:
syslog.syslog(syslog.LOG_ERR, "{} was run, but no techsupport dump is found".format(cmd))
else:
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, diff))
return diff
syslog.syslog(syslog.LOG_INFO, "{} is successful, {} is created".format(cmd, new_dump))
return new_dump

def verify_rate_limit_intervals(self, global_cooloff, container_cooloff):
"""Verify both the global and per-proc rate_limit_intervals have passed"""
self.curr_ts_list = get_ts_dumps(True)
if global_cooloff and self.curr_ts_list:
last_ts_dump_creation = os.path.getmtime(self.curr_ts_list[-1])
curr_ts_list = get_ts_dumps(True)
if global_cooloff and curr_ts_list:
last_ts_dump_creation = os.path.getmtime(curr_ts_list[-1])
if time.time() - last_ts_dump_creation < global_cooloff:
msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
syslog.syslog(syslog.LOG_INFO, msg.format(self.core_name))
Expand Down
3 changes: 1 addition & 2 deletions scripts/generate_dump
Original file line number Diff line number Diff line change
Expand Up @@ -1339,8 +1339,7 @@ main() {
fi

# Invoke the TechSupport Cleanup Hook
setsid $(echo > /tmp/techsupport_cleanup.log;
python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &>> /tmp/techsupport_cleanup.log) &
setsid python3 /usr/local/bin/techsupport_cleanup.py ${TARFILE} &> /tmp/techsupport_cleanup.log &

echo ${TARFILE}

Expand Down
4 changes: 2 additions & 2 deletions scripts/techsupport_cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def handle_techsupport_creation_event(dump_name, db):
file_path = os.path.join(TS_DIR, dump_name)
if not verify_recent_file_creation(file_path):
return
_ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN))
_ , num_bytes = get_stats(os.path.join(TS_DIR, TS_PTRN_GLOB))

if db.get(CFG_DB, AUTO_TS, CFG_STATE) != "enabled":
msg = "techsupport_cleanup is disabled. No cleanup is performed. current size occupied : {}"
Expand All @@ -40,7 +40,7 @@ def handle_techsupport_creation_event(dump_name, db):
syslog.syslog(syslog.LOG_NOTICE, msg.format(pretty_size(num_bytes)))
return

removed_files = cleanup_process(max_ts, TS_PTRN, TS_DIR)
removed_files = cleanup_process(max_ts, TS_PTRN_GLOB, TS_DIR)
clean_state_db_entries(removed_files, db)


Expand Down
57 changes: 36 additions & 21 deletions tests/coredump_gen_handler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
sys.path.append("scripts")
import coredump_gen_handler as cdump_mod

AUTO_TS_STDOUT="""
Techsupport is running with silent option. This command might take a long time.
The SAI dump is generated to /tmp/saisdkdump/sai_sdk_dump_11_22_2021_11_07_PM
/tmp/saisdkdump
"""

def set_auto_ts_cfg(redis_mock, state="disabled",
rate_limit_interval="0",
Expand Down Expand Up @@ -74,12 +79,13 @@ def test_invoc_ts_state_db_update(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand All @@ -105,12 +111,13 @@ def test_global_rate_limit_interval(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand Down Expand Up @@ -138,12 +145,13 @@ def test_per_container_rate_limit_interval(self):
"orchagent;{};swss".format(int(time.time()))})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz")
Expand All @@ -167,12 +175,13 @@ def test_invoc_ts_after_rate_limit_interval(self):
"orchagent;{};swss".format(int(time.time()))})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand All @@ -197,12 +206,13 @@ def test_core_dump_with_invalid_container_name(self):
populate_state_db(redis_mock, {})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/core/snmpd.12345.123.core.gz")
Expand All @@ -225,12 +235,13 @@ def test_feature_table_not_set(self):
populate_state_db(redis_mock, {})
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/core/python3.12345.123.core.gz")
Expand All @@ -251,10 +262,11 @@ def test_since_argument(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport --since '4 days ago'" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
return 0, "", ""
if "--since '4 days ago'" in cmd_str:
patcher.fs.create_file(ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
elif "date --date='4 days ago'" in cmd_str:
return 0, "", ""
else:
Expand Down Expand Up @@ -284,12 +296,13 @@ def test_masic_core_dump(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
patcher.fs.create_file(ts_dump)
else:
return 1, "", "Command Not Found"
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
cdump_mod.subprocess_exec = mock_cmd
patcher.fs.create_file("/var/dump/sonic_dump_random1.tar.gz")
patcher.fs.create_file("/var/dump/sonic_dump_random2.tar.gz")
Expand All @@ -315,10 +328,12 @@ def test_invalid_since_argument(self):
populate_state_db(redis_mock)
with Patcher() as patcher:
def mock_cmd(cmd, env):
ts_dump = "/var/dump/sonic_dump_random3.tar.gz"
cmd_str = " ".join(cmd)
if "show techsupport --since '2 days ago'" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
return 0, "", ""
if "--since '2 days ago'" in cmd_str:
patcher.fs.create_file(ts_dump)
print(AUTO_TS_STDOUT + ts_dump)
return 0, AUTO_TS_STDOUT + ts_dump, ""
elif "date --date='whatever'" in cmd_str:
return 1, "", "Invalid Date Format"
else:
Expand Down Expand Up @@ -370,7 +385,7 @@ def mock_cmd(cmd, env):
cmd_str = " ".join(cmd)
if "show techsupport" in cmd_str:
patcher.fs.create_file("/var/dump/sonic_dump_random3.tar.gz")
return 0, "", ""
return 0, AUTO_TS_STDOUT + ts_dump, ""
patcher.fs.set_disk_usage(2000, path="/var/core/")
patcher.fs.create_file("/var/core/orchagent.12345.123.core.gz", st_size=25)
patcher.fs.create_file("/var/core/lldpmgrd.12345.22.core.gz", st_size=25)
Expand Down
12 changes: 8 additions & 4 deletions utilities_common/auto_techsupport_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"CFG_DB", "AUTO_TS", "CFG_STATE", "CFG_MAX_TS", "COOLOFF",
"CFG_CORE_USAGE", "CFG_SINCE", "FEATURE", "STATE_DB",
"TS_MAP", "CORE_DUMP", "TIMESTAMP", "CONTAINER",
"TIME_BUF", "SINCE_DEFAULT"
"TIME_BUF", "SINCE_DEFAULT", "TS_PTRN_GLOB"
] + [ # Methods
"verify_recent_file_creation",
"get_ts_dumps",
Expand All @@ -30,7 +30,9 @@
CORE_DUMP_PTRN = "*.core.gz"

TS_DIR = "/var/dump"
TS_PTRN = "sonic_dump_*.tar*"
TS_ROOT = "sonic_dump_*"
TS_PTRN = "sonic_dump_.*tar.*" # Regex Exp
TS_PTRN_GLOB = "sonic_dump_*tar*" # Glob Exp

# CONFIG DB Attributes
CFG_DB = "CONFIG_DB"
Expand Down Expand Up @@ -78,8 +80,10 @@ def strip_ts_ext(ts_path):


def get_ts_dumps(full_path=False):
""" Get the list of TS dumps in the TS_DIR, sorted by the creation time """
curr_list = glob.glob(os.path.join(TS_DIR, TS_PTRN))
"""
Get the list of TS dumps in the TS_DIR, sorted by the creation time
"""
curr_list = glob.glob(os.path.join(TS_DIR, TS_ROOT))
curr_list.sort(key=os.path.getmtime)
if full_path:
return curr_list
Expand Down