From 5d437d0ff8624c64314951932ffaf0a0f6ff20e3 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Thu, 1 Sep 2022 14:04:10 +0200 Subject: [PATCH 1/9] Fix and refactor machine issues. --- cmd/machine.go | 234 +++--------- cmd/tableprinters/common.go | 2 - cmd/tableprinters/context.go | 4 +- cmd/tableprinters/machine.go | 33 +- docs/metalctl_machine_issues.md | 28 +- pkg/api/issue.go | 97 ----- pkg/api/issues.go | 612 ++++++++++++++++++++++++++++++++ pkg/api/issues_test.go | 356 +++++++++++++++++++ 8 files changed, 1048 insertions(+), 318 deletions(-) delete mode 100644 pkg/api/issue.go create mode 100644 pkg/api/issues.go create mode 100644 pkg/api/issues_test.go diff --git a/cmd/machine.go b/cmd/machine.go index 32a4a88b..c39fe717 100644 --- a/cmd/machine.go +++ b/cmd/machine.go @@ -21,7 +21,6 @@ import ( "github.com/metal-stack/metal-lib/pkg/genericcli/printers" "github.com/metal-stack/metal-lib/pkg/pointer" "github.com/metal-stack/metalctl/cmd/sorters" - "github.com/metal-stack/metalctl/cmd/tableprinters" "github.com/metal-stack/metalctl/pkg/api" "github.com/spf13/cobra" "github.com/spf13/viper" @@ -323,14 +322,16 @@ In case the machine did not register properly a direct ipmi console access is av RunE: func(cmd *cobra.Command, args []string) error { return w.machineIpmi(args) }, + ValidArgsFunction: c.comp.MachineListCompletion, } machineIssuesCmd := &cobra.Command{ - Use: "issues", + Use: "issues []", Short: `display machines which are in a potential bad state`, Long: `display machines which are in a potential bad state` + "\n" + api.EmojiHelpText(), RunE: func(cmd *cobra.Command, args []string) error { - return w.machineIssues() + return w.machineIssues(args) }, + ValidArgsFunction: c.comp.MachineListCompletion, } machineLogsCmd := &cobra.Command{ Use: "logs ", @@ -354,20 +355,23 @@ In case the machine did not register properly a direct ipmi console access is av w.listCmdFlags(machineIpmiCmd) w.listCmdFlags(machineIssuesCmd) - machineIssuesCmd.Flags().StringSliceP("only", "", []string{}, "issue types to include [optional]") - machineIssuesCmd.Flags().StringSliceP("omit", "", []string{}, "issue types to omit [optional]") + machineIssuesCmd.Flags().StringSlice("only", []string{}, "issue types to include [optional]") + machineIssuesCmd.Flags().StringSlice("omit", []string{}, "issue types to omit [optional]") + machineIssuesCmd.Flags().String("severity", "", "issue severity to include [optional]") + must(machineIssuesCmd.RegisterFlagCompletionFunc("severity", cobra.FixedCompletions([]string{"critical", "major", "minor"}, cobra.ShellCompDirectiveNoFileComp))) + machineIssuesCmd.Flags().Duration("failed-event-threshold", api.DefaultLastErrorThreshold(), "issue severity to include [optional]") must(machineIssuesCmd.RegisterFlagCompletionFunc("omit", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { var shortNames []string - for _, i := range api.AllIssues { - shortNames = append(shortNames, i.ShortName+"\t"+i.Description) + for _, i := range api.AllIssues() { + shortNames = append(shortNames, string(i.Type)+"\t"+i.Description) } return shortNames, cobra.ShellCompDirectiveNoFileComp })) must(machineIssuesCmd.RegisterFlagCompletionFunc("only", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { var shortNames []string - for _, i := range api.AllIssues { - shortNames = append(shortNames, i.ShortName+"\t"+i.Description) + for _, i := range api.AllIssues() { + shortNames = append(shortNames, string(i.Type)+"\t"+i.Description) } return shortNames, cobra.ShellCompDirectiveNoFileComp })) @@ -1107,7 +1111,7 @@ func (c *machineCmd) machineLogs(args []string) error { if pointer.SafeDeref(resp.Events).LastErrorEvent != nil { timeSince := time.Since(time.Time(resp.Events.LastErrorEvent.Time)) - if timeSince > tableprinters.LastErrorEventRelevant { + if timeSince > api.DefaultLastErrorThreshold() { return nil } @@ -1213,7 +1217,7 @@ func (c *machineCmd) machineConsole(args []string) error { } func (c *machineCmd) machineIpmi(args []string) error { - if len(args) == 1 { + if len(args) > 0 { id, err := genericcli.GetExactlyOneArg(args) if err != nil { return err @@ -1243,7 +1247,7 @@ func (c *machineCmd) machineIpmi(args []string) error { return c.listPrinter.Print(resp.Payload) } -func (c *machineCmd) machineIssues() error { +func (c *machineCmd) machineIssues(args []string) error { resp, err := c.client.Machine().FindIPMIMachines(machine.NewFindIPMIMachinesParams().WithBody(machineFindRequestFromCLI()), nil) if err != nil { return err @@ -1255,192 +1259,52 @@ func (c *machineCmd) machineIssues() error { } var ( - only = viper.GetStringSlice("only") - omit = viper.GetStringSlice("omit") - - res = api.MachineIssues{} - asnMap = map[int64][]*models.V1MachineIPMIResponse{} - bmcIPMap = map[string][]*models.V1MachineIPMIResponse{} - - includeThisIssue = func(issue api.Issue) bool { - for _, o := range omit { - if issue.ShortName == o { - return false - } - } - - if len(only) > 0 { - for _, o := range only { - if issue.ShortName == o { - return true - } - } - return false - } - - return true - } - - addIssue = func(m *models.V1MachineIPMIResponse, issue api.Issue) { - if m == nil { - return - } - - if !includeThisIssue(issue) { - return - } - - var mWithIssues *api.MachineWithIssues - for _, machine := range res { - machine := machine - if pointer.SafeDeref(m.ID) == pointer.SafeDeref(machine.Machine.ID) { - mWithIssues = &machine - break - } - } - if mWithIssues == nil { - mWithIssues = &api.MachineWithIssues{ - Machine: *m, - } - res = append(res, *mWithIssues) - } - - mWithIssues.Issues = append(mWithIssues.Issues, issue) - } + severity = api.IssueSeverityMinor + only []api.IssueType + omit []api.IssueType ) - for _, m := range resp.Payload { - if m == nil { - continue - } - - if m.Partition == nil { - addIssue(m, api.IssueNoPartition) - } - - if m.Liveliness != nil { - switch *m.Liveliness { - case "Alive": - case "Dead": - addIssue(m, api.IssueLivelinessDead) - case "Unknown": - addIssue(m, api.IssueLivelinessUnknown) - - default: - addIssue(m, api.IssueLivelinessNotAvailable) - } - } else { - addIssue(m, api.IssueLivelinessNotAvailable) - } - - if pointer.SafeDeref(pointer.SafeDeref(m.Events).FailedMachineReclaim) { - addIssue(m, api.IssueFailedMachineReclaim) - } - - if pointer.SafeDeref(pointer.SafeDeref(m.Events).CrashLoop) { - if m.Events != nil && len(m.Events.Log) > 0 && *m.Events.Log[0].Event == "Waiting" { - // Machine which are waiting are not considered to have issues - } else { - addIssue(m, api.IssueCrashLoop) - } - } - - if pointer.SafeDeref(m.Events).LastErrorEvent != nil { - timeSince := time.Since(time.Time(m.Events.LastErrorEvent.Time)) - if timeSince < tableprinters.LastErrorEventRelevant { - issue := api.IssueLastEventError - issue.Description = fmt.Sprintf("%s (%s ago)", issue.Description, timeSince.String()) - addIssue(m, issue) - } - } - - if m.Ipmi != nil { - if m.Ipmi.Mac == nil || *m.Ipmi.Mac == "" { - addIssue(m, api.IssueBMCWithoutMAC) - } - - if m.Ipmi.Address == nil || *m.Ipmi.Address == "" { - addIssue(m, api.IssueBMCWithoutIP) - } else { - entries := bmcIPMap[*m.Ipmi.Address] - entries = append(entries, m) - bmcIPMap[*m.Ipmi.Address] = entries - } - } - - if m.Allocation != nil && m.Allocation.Role != nil && *m.Allocation.Role == models.V1MachineAllocationRoleFirewall { - // collecting ASN overlaps - for _, n := range m.Allocation.Networks { - if n.Asn == nil { - continue - } - - machines, ok := asnMap[*n.Asn] - if !ok { - machines = []*models.V1MachineIPMIResponse{} - } - - alreadyContained := false - for _, mm := range machines { - if *mm.ID == *m.ID { - alreadyContained = true - break - } - } - - if alreadyContained { - continue - } - - machines = append(machines, m) - asnMap[*n.Asn] = machines - } - } + for _, o := range viper.GetStringSlice("only") { + only = append(only, api.IssueType(o)) + } + for _, o := range viper.GetStringSlice("omit") { + omit = append(omit, api.IssueType(o)) } - for asn, ms := range asnMap { - if len(ms) < 2 { - continue + if viper.IsSet("severity") { + severity, err = api.SeverityFromString(viper.GetString("severity")) + if err != nil { + return err } + } - for _, m := range ms { - var sharedIDs []string - for _, mm := range ms { - if *m.ID == *mm.ID { - continue - } - sharedIDs = append(sharedIDs, *mm.ID) - } - - issue := api.IssueASNUniqueness - issue.Description = fmt.Sprintf("ASN (%d) not unique, shared with %s", asn, sharedIDs) - - addIssue(m, issue) - } + issues, err := api.FindIssues(&api.IssueConfig{ + Machines: resp.Payload, + Severity: severity, + Only: only, + Omit: omit, + LastErrorThreshold: viper.GetDuration("failed-event-threshold"), + }) + if err != nil { + return err } - for ip, ms := range bmcIPMap { - if len(ms) < 2 { - continue + if len(args) > 0 { + id, err := genericcli.GetExactlyOneArg(args) + if err != nil { + return err } - for _, m := range ms { - var sharedIDs []string - for _, mm := range ms { - if *m.ID == *mm.ID { - continue - } - sharedIDs = append(sharedIDs, *mm.ID) - } - - issue := api.IssueNonDistinctBMCIP - issue.Description = fmt.Sprintf("BMC IP (%s) not unique, shared with %s", ip, sharedIDs) - - addIssue(m, issue) + mWithIssues := issues.Get(id) + if mWithIssues == nil { + fmt.Fprintf(c.out, "machine with id %q has no issues\n", id) + return nil } + + return c.describePrinter.Print(mWithIssues.Issues) } - return c.listPrinter.Print(res) + return c.listPrinter.Print(issues) } func (c *machineCmd) machineIpmiEvents(args []string) error { diff --git a/cmd/tableprinters/common.go b/cmd/tableprinters/common.go index 1f2f48d0..467eb8d7 100644 --- a/cmd/tableprinters/common.go +++ b/cmd/tableprinters/common.go @@ -9,8 +9,6 @@ import ( ) const ( - LastErrorEventRelevant = 7 * 24 * time.Hour - dot = "●" nbr = " " ) diff --git a/cmd/tableprinters/context.go b/cmd/tableprinters/context.go index c546e0a9..1680aa34 100644 --- a/cmd/tableprinters/context.go +++ b/cmd/tableprinters/context.go @@ -1,8 +1,6 @@ package tableprinters -import ( - "github.com/metal-stack/metalctl/pkg/api" -) +import "github.com/metal-stack/metalctl/pkg/api" func (t *TablePrinter) ContextTable(data *api.Contexts, wide bool) ([]string, [][]string, error) { var ( diff --git a/cmd/tableprinters/machine.go b/cmd/tableprinters/machine.go index d050a853..f557afcf 100644 --- a/cmd/tableprinters/machine.go +++ b/cmd/tableprinters/machine.go @@ -71,7 +71,7 @@ func (t *TablePrinter) MachineTable(data []*models.V1MachineResponse, wide bool) lastEvent = *machine.Events.Log[0].Event } - emojis, _ := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State) + emojis, _ := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State, 1*time.Hour) if wide { rows = append(rows, []string{machineID, lastEvent, when, age, desc, name, hostname, project, ips, sizeID, image, partitionID, started, tags, reserved}) @@ -83,7 +83,7 @@ func (t *TablePrinter) MachineTable(data []*models.V1MachineResponse, wide bool) return header, rows, nil } -func getMachineStatusEmojis(liveliness *string, events *models.V1MachineRecentProvisioningEvents, state *models.V1MachineState) (string, string) { +func getMachineStatusEmojis(liveliness *string, events *models.V1MachineRecentProvisioningEvents, state *models.V1MachineState, lastErrorThreshold time.Duration) (string, string) { var ( emojis []string wide []string @@ -122,7 +122,7 @@ func getMachineStatusEmojis(liveliness *string, events *models.V1MachineRecentPr wide = append(wide, "FailedReclaim") } - if events.LastErrorEvent != nil && time.Since(time.Time(events.LastErrorEvent.Time)) < LastErrorEventRelevant { + if events.LastErrorEvent != nil && time.Since(time.Time(events.LastErrorEvent.Time)) < lastErrorThreshold { emojis = append(emojis, api.Exclamation) wide = append(wide, "LastEventErrors") } @@ -182,7 +182,7 @@ func (t *TablePrinter) MachineIPMITable(data []*models.V1MachineIPMIResponse, wi biosVersion = pointer.SafeDeref(bios.Version) } - emojis, wideEmojis := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State) + emojis, wideEmojis := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State, 1*time.Hour) if wide { rows = append(rows, []string{id, wideEmojis, powerText, ipAddress, mac, bpn, cs, ps, biosVersion, bmcVersion, size, partition}) @@ -245,7 +245,7 @@ func (t *TablePrinter) MachineIssuesTable(data api.MachineIssues, wide bool) ([] header := []string{"ID", "Power", "Allocated", "", "Lock Reason", "Last Event", "When", "Issues"} if wide { - header = []string{"ID", "Name", "Partition", "Project", "Power", "State", "Lock Reason", "Last Event", "When", "Issues"} + header = []string{"ID", "Name", "Partition", "Project", "Power", "State", "Lock Reason", "Last Event", "When", "Issues", "Ref URL", "Details"} } for _, machineWithIssues := range data { @@ -290,21 +290,18 @@ func (t *TablePrinter) MachineIssuesTable(data api.MachineIssues, wide bool) ([] lastEvent = *machine.Events.Log[0].Event } - var issues []string + emojis, _ := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State, api.DefaultLastErrorThreshold()) + for _, issue := range machineWithIssues.Issues { - text := fmt.Sprintf("- %s (%s)", issue.Description, issue.ShortName) - if wide && issue.RefURL != "" { - text += " (" + issue.RefURL + ")" + text := fmt.Sprintf("%s (%s)", issue.Description, issue.Type) + ref := issue.RefURL + details := issue.Details + + if wide { + rows = append(rows, []string{pointer.SafeDeref(machine.ID), widename, partition, project, powerText, lockText, lockDescWide, lastEvent, when, text, ref, details}) + } else { + rows = append(rows, []string{pointer.SafeDeref(machine.ID), power, allocated, emojis, lockDesc, lastEvent, when, text}) } - issues = append(issues, text) - } - - emojis, _ := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State) - - if wide { - rows = append(rows, []string{pointer.SafeDeref(machine.ID), widename, partition, project, powerText, lockText, lockDescWide, lastEvent, when, strings.Join(issues, "\n")}) - } else { - rows = append(rows, []string{pointer.SafeDeref(machine.ID), power, allocated, emojis, lockDesc, lastEvent, when, strings.Join(issues, "\n")}) } } diff --git a/docs/metalctl_machine_issues.md b/docs/metalctl_machine_issues.md index 2b508104..5b912418 100644 --- a/docs/metalctl_machine_issues.md +++ b/docs/metalctl_machine_issues.md @@ -18,24 +18,26 @@ Meaning of the emojis: ``` -metalctl machine issues [flags] +metalctl machine issues [] [flags] ``` ### Options ``` - -h, --help help for issues - --hostname string allocation hostname to filter [optional] - --id string ID to filter [optional] - --image string allocation image to filter [optional] - --mac string mac to filter [optional] - --name string allocation name to filter [optional] - --omit strings issue types to omit [optional] - --only strings issue types to include [optional] - --partition string partition to filter [optional] - --project string allocation project to filter [optional] - --size string size to filter [optional] - --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". + --failed-event-threshold duration issue severity to include [optional] (default 168h0m0s) + -h, --help help for issues + --hostname string allocation hostname to filter [optional] + --id string ID to filter [optional] + --image string allocation image to filter [optional] + --mac string mac to filter [optional] + --name string allocation name to filter [optional] + --omit strings issue types to omit [optional] + --only strings issue types to include [optional] + --partition string partition to filter [optional] + --project string allocation project to filter [optional] + --severity string issue severity to include [optional] + --size string size to filter [optional] + --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". ``` ### Options inherited from parent commands diff --git a/pkg/api/issue.go b/pkg/api/issue.go deleted file mode 100644 index 5a6a516f..00000000 --- a/pkg/api/issue.go +++ /dev/null @@ -1,97 +0,0 @@ -package api - -import ( - "fmt" - - "github.com/metal-stack/metal-go/api/models" -) - -type ( - // MachineWithIssues summarizes a machine with issues - MachineWithIssues struct { - Machine models.V1MachineIPMIResponse - Issues Issues - } - // MachineIssues is map of a machine response to a list of machine issues - MachineIssues []MachineWithIssues - - // Issue formulates an issue of a machine - Issue struct { - ShortName string - Description string - RefURL string - } - // Issues is a list of machine issues - Issues []Issue -) - -var ( - IssueNoPartition = Issue{ - ShortName: "no-partition", - Description: "machine with no partition", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#no-partition", - } - IssueLivelinessDead = Issue{ - ShortName: "liveliness-dead", - Description: "the machine is not sending events anymore", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#liveliness-dead", - } - IssueLivelinessUnknown = Issue{ - ShortName: "liveliness-unknown", - Description: "the machine is not sending LLDP alive messages anymore", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#liveliness-unknown", - } - IssueLivelinessNotAvailable = Issue{ - ShortName: "liveliness-not-available", - Description: "the machine liveliness is not available", - } - IssueFailedMachineReclaim = Issue{ - ShortName: "failed-machine-reclaim", - Description: "machine phones home but not allocated", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#failed-machine-reclaim", - } - IssueCrashLoop = Issue{ - ShortName: "crashloop", - Description: fmt.Sprintf("machine is in a provisioning crash loop (%s)", Loop), - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop", - } - IssueASNUniqueness = Issue{ - ShortName: "asn-not-unique", - Description: "The ASN is not unique (only impact on firewalls)", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique", - } - IssueBMCWithoutMAC = Issue{ - ShortName: "bmc-without-mac", - Description: "BMC has no mac address", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-mac", - } - IssueBMCWithoutIP = Issue{ - ShortName: "bmc-without-ip", - Description: "BMC has no ip address", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-ip", - } - IssueNonDistinctBMCIP = Issue{ - ShortName: "bmc-no-distinct-ip", - Description: "BMC IP address is not distinct", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-no-distinct-ip", - } - IssueLastEventError = Issue{ - ShortName: "last-event-error", - Description: "the machine had an error during the provisioning lifecycle", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#last-event-error", - } - - AllIssues = Issues{ - IssueNoPartition, - IssueLivelinessDead, - IssueLivelinessUnknown, - IssueLivelinessNotAvailable, - IssueFailedMachineReclaim, - IssueCrashLoop, - IssueASNUniqueness, - IssueBMCWithoutMAC, - IssueBMCWithoutIP, - IssueNonDistinctBMCIP, - IssueLastEventError, - } -) diff --git a/pkg/api/issues.go b/pkg/api/issues.go new file mode 100644 index 00000000..f2dde381 --- /dev/null +++ b/pkg/api/issues.go @@ -0,0 +1,612 @@ +package api + +import ( + "fmt" + "sort" + "strings" + "time" + + "github.com/metal-stack/metal-go/api/models" + "github.com/metal-stack/metal-lib/pkg/pointer" +) + +const ( + // IssueSeverityMinor is an issue that needs is informational and should be checked from time to time. + IssueSeverityMinor IssueSeverity = iota + // IssueSeverityMajor is an issue where resources are wasted and for that may be good workarounds. + // This should be resolved as soon as possible. + IssueSeverityMajor IssueSeverity = iota + // IssueSeverityCritical is an issue that can lead to disfunction of the system and need to be handled as quickly as possible. + IssueSeverityCritical IssueSeverity = iota + + IssueTypeNoPartition IssueType = "no-partition" + IssueTypeLivelinessDead IssueType = "liveliness-dead" + IssueTypeLivelinessUnknown IssueType = "liveliness-unknown" + IssueTypeLivelinessNotAvailable IssueType = "liveliness-not-available" + IssueTypeFailedMachineReclaim IssueType = "failed-machine-reclaim" + IssueTypeCrashLoop IssueType = "crashloop" + IssueTypeLastEventError IssueType = "last-event-error" + IssueTypeBMCWithoutMAC IssueType = "bmc-without-mac" + IssueTypeBMCWithoutIP IssueType = "bmc-without-ip" + IssueTypeASNUniqueness IssueType = "asn-not-unique" + IssueTypeNonDistinctBMCIP IssueType = "bmc-no-distinct-ip" +) + +type ( + IssueSeverity int + IssueType string + + // Issue formulates an issue of a machine + Issue struct { + Type IssueType + Severity IssueSeverity + Description string + RefURL string + Details string + } + // Issues is a list of issues + Issues []Issue + + IssueConfig struct { + Machines []*models.V1MachineIPMIResponse + Severity IssueSeverity + Only []IssueType + Omit []IssueType + LastErrorThreshold time.Duration + } + + issue interface { + // Evaluates whether a given machine has the this issue. + // the second argument "ms" is a list of all the other machines, which can be necessary to determine a machine issue. + Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool + // Spec returns the issue spec of this issue. + Spec() *issueSpec + // Details returns additional information on the issue after the evaluation. + Details() string + } + + issueSpec struct { + Type IssueType + Severity IssueSeverity + Description string + RefURL string + } + + IssueNoPartition struct{} + IssueLivelinessDead struct{} + IssueLivelinessUnknown struct{} + IssueLivelinessNotAvailable struct{} + IssueFailedMachineReclaim struct{} + IssueCrashLoop struct{} + IssueLastEventError struct { + lastEventThreshold time.Duration + details string + } + IssueBMCWithoutMAC struct{} + IssueBMCWithoutIP struct{} + IssueASNUniqueness struct { + details string + } + IssueNonDistinctBMCIP struct { + details string + } + + // MachineWithIssues summarizes a machine with issues + MachineWithIssues struct { + Machine *models.V1MachineIPMIResponse + Issues Issues + } + // MachineIssues is map of a machine response to a list of machine issues + MachineIssues []*MachineWithIssues + + machineIssueMap map[*models.V1MachineIPMIResponse]Issues +) + +func DefaultLastErrorThreshold() time.Duration { + return 7 * 24 * time.Hour +} + +func AllIssueTypes() []IssueType { + return []IssueType{ + IssueTypeNoPartition, + IssueTypeLivelinessDead, + IssueTypeLivelinessUnknown, + IssueTypeLivelinessNotAvailable, + IssueTypeFailedMachineReclaim, + IssueTypeCrashLoop, + IssueTypeLastEventError, + IssueTypeBMCWithoutMAC, + IssueTypeBMCWithoutIP, + IssueTypeASNUniqueness, + IssueTypeNonDistinctBMCIP, + } +} + +func AllIssues() Issues { + var ( + c = &IssueConfig{} + res Issues + ) + + for _, t := range AllIssueTypes() { + i, err := c.NewIssueFromType(t) + if err != nil { + continue + } + + res = append(res, toIssue(i)) + } + + return res +} + +func toIssue(i issue) Issue { + return Issue{ + Type: i.Spec().Type, + Severity: i.Spec().Severity, + Description: i.Spec().Description, + RefURL: i.Spec().RefURL, + Details: i.Details(), + } +} + +func SeverityFromString(s string) (IssueSeverity, error) { + switch strings.ToLower(s) { + case "critical": + return IssueSeverityCritical, nil + case "major": + return IssueSeverityMajor, nil + case "minor": + return IssueSeverityMinor, nil + default: + return IssueSeverity(0), fmt.Errorf("unknown severity: %s", s) + } +} + +func (c *IssueConfig) NewIssueFromType(t IssueType) (issue, error) { + switch t { + case IssueTypeNoPartition: + return &IssueNoPartition{}, nil + case IssueTypeLivelinessDead: + return &IssueLivelinessDead{}, nil + case IssueTypeLivelinessUnknown: + return &IssueLivelinessUnknown{}, nil + case IssueTypeLivelinessNotAvailable: + return &IssueLivelinessNotAvailable{}, nil + case IssueTypeFailedMachineReclaim: + return &IssueFailedMachineReclaim{}, nil + case IssueTypeCrashLoop: + return &IssueCrashLoop{}, nil + case IssueTypeLastEventError: + return &IssueLastEventError{lastEventThreshold: c.LastErrorThreshold}, nil + case IssueTypeBMCWithoutMAC: + return &IssueBMCWithoutMAC{}, nil + case IssueTypeBMCWithoutIP: + return &IssueBMCWithoutIP{}, nil + case IssueTypeASNUniqueness: + return &IssueASNUniqueness{}, nil + case IssueTypeNonDistinctBMCIP: + return &IssueNonDistinctBMCIP{}, nil + default: + return nil, fmt.Errorf("unknown issue type: %s", t) + } +} + +func (mis MachineIssues) Get(id string) *MachineWithIssues { + for _, m := range mis { + m := m + + if m.Machine == nil || m.Machine.ID == nil { + continue + } + + if *m.Machine.ID == id { + return m + } + } + + return nil +} + +func FindIssues(c *IssueConfig) (MachineIssues, error) { + res := machineIssueMap{} + + for _, t := range AllIssueTypes() { + if !c.includeIssue(t) { + continue + } + + for _, m := range c.Machines { + m := m + + i, err := c.NewIssueFromType(t) + if err != nil { + return nil, err + } + + if m.ID == nil { + continue + } + + if i.Evaluate(m, c.Machines) { + res.add(m, toIssue(i)) + } + } + } + + return res.toList(), nil +} + +func (c *IssueConfig) includeIssue(t IssueType) bool { + issue, err := c.NewIssueFromType(t) + if err != nil { + return false + } + + if issue.Spec().Severity < c.Severity { + return false + } + + for _, o := range c.Omit { + if t == o { + return false + } + } + + if len(c.Only) > 0 { + for _, o := range c.Only { + if t == o { + return true + } + } + return false + } + + return true +} + +func (mim machineIssueMap) add(m *models.V1MachineIPMIResponse, issue Issue) { + issues, ok := mim[m] + if !ok { + issues = Issues{} + } + issues = append(issues, issue) + mim[m] = issues +} + +func (mim machineIssueMap) toList() MachineIssues { + var res MachineIssues + + for m, issues := range mim { + res = append(res, &MachineWithIssues{ + Machine: m, + Issues: issues, + }) + } + + sort.Slice(res, func(i, j int) bool { + return pointer.SafeDeref(res[i].Machine.ID) < pointer.SafeDeref(res[j].Machine.ID) + }) + + return res +} + +func (i *IssueNoPartition) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Partition == nil +} + +func (i *IssueNoPartition) Details() string { + return "" +} + +func (i *IssueNoPartition) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeNoPartition, + Severity: IssueSeverityMajor, + Description: "machine with no partition", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#no-partition", + } +} + +func (i *IssueLivelinessDead) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Liveliness != nil && *m.Liveliness == "Dead" +} + +func (i *IssueLivelinessDead) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeLivelinessDead, + Severity: IssueSeverityMajor, + Description: "the machine is not sending events anymore", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#liveliness-dead", + } +} + +func (i *IssueLivelinessDead) Details() string { + return "" +} + +func (i *IssueLivelinessUnknown) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Liveliness != nil && *m.Liveliness == "Unknown" +} + +func (i *IssueLivelinessUnknown) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeLivelinessUnknown, + Severity: IssueSeverityMajor, + Description: "the machine is not sending LLDP alive messages anymore", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#liveliness-unknown", + } +} + +func (i *IssueLivelinessUnknown) Details() string { + return "" +} + +func (i *IssueLivelinessNotAvailable) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + if m.Liveliness == nil { + return true + } + + allowed := map[string]bool{ + "Alive": true, + "Dead": true, + "Unknown": true, + } + + return !allowed[*m.Liveliness] +} + +func (i *IssueLivelinessNotAvailable) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeLivelinessNotAvailable, + Severity: IssueSeverityMinor, + Description: "the machine liveliness is not available", + } +} + +func (i *IssueLivelinessNotAvailable) Details() string { + return "" +} + +func (i *IssueFailedMachineReclaim) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return pointer.SafeDeref(pointer.SafeDeref(m.Events).FailedMachineReclaim) +} + +func (i *IssueFailedMachineReclaim) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeFailedMachineReclaim, + Severity: IssueSeverityCritical, + Description: "machine phones home but not allocated", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#failed-machine-reclaim", + } +} + +func (i *IssueFailedMachineReclaim) Details() string { + return "" +} + +func (i *IssueCrashLoop) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + if pointer.SafeDeref(pointer.SafeDeref(m.Events).CrashLoop) { + if m.Events != nil && len(m.Events.Log) > 0 && *m.Events.Log[0].Event == "Waiting" { + // Machine which are waiting are not considered to have issues + } else { + return true + } + } + return false +} + +func (i *IssueCrashLoop) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeCrashLoop, + Severity: IssueSeverityMajor, + Description: fmt.Sprintf("machine is in a provisioning crash loop (%s)", Loop), + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop", + } +} + +func (i *IssueCrashLoop) Details() string { + return "" +} + +func (i *IssueLastEventError) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + if i.lastEventThreshold == 0 { + return false + } + + if pointer.SafeDeref(m.Events).LastErrorEvent != nil { + timeSince := time.Since(time.Time(m.Events.LastErrorEvent.Time)) + if timeSince < i.lastEventThreshold { + i.details = fmt.Sprintf("occurred %s ago", timeSince.String()) + return true + } + } + return false +} + +func (i *IssueLastEventError) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeLastEventError, + Severity: IssueSeverityMinor, + Description: "the machine had an error during the provisioning lifecycle", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#last-event-error", + } +} + +func (i *IssueLastEventError) Details() string { + return i.details +} + +func (i *IssueBMCWithoutMAC) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Ipmi != nil && (m.Ipmi.Mac == nil || *m.Ipmi.Mac == "") +} + +func (i *IssueBMCWithoutMAC) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeBMCWithoutMAC, + Severity: IssueSeverityMajor, + Description: "BMC has no mac address", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-mac", + } +} + +func (i *IssueBMCWithoutMAC) Details() string { + return "" +} + +func (i *IssueBMCWithoutIP) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Ipmi != nil && (m.Ipmi.Address == nil || *m.Ipmi.Address == "") +} + +func (i *IssueBMCWithoutIP) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeBMCWithoutIP, + Severity: IssueSeverityMajor, + Description: "BMC has no ip address", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-without-ip", + } +} + +func (i *IssueBMCWithoutIP) Details() string { + return "" +} + +func (i *IssueASNUniqueness) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + var ( + machineASNs = map[int64][]*models.V1MachineIPMIResponse{} + overlaps []string + isNoFirewall = func(m *models.V1MachineIPMIResponse) bool { + return m.Allocation == nil || m.Allocation.Role == nil || *m.Allocation.Role != models.V1MachineAllocationRoleFirewall + } + ) + + if isNoFirewall(m) { + return false + } + + for _, n := range m.Allocation.Networks { + if n.Asn == nil { + continue + } + + machineASNs[*n.Asn] = nil + } + + for _, machineFromAll := range ms { + if pointer.SafeDeref(machineFromAll.ID) == pointer.SafeDeref(m.ID) { + continue + } + otherMachine := machineFromAll + + if isNoFirewall(otherMachine) { + continue + } + + for _, n := range otherMachine.Allocation.Networks { + if n.Asn == nil { + continue + } + + _, ok := machineASNs[*n.Asn] + if !ok { + continue + } + + machineASNs[*n.Asn] = append(machineASNs[*n.Asn], otherMachine) + } + } + + var asnList []int64 + for asn := range machineASNs { + asnList = append(asnList, asn) + } + sort.Slice(asnList, func(i, j int) bool { + return asnList[i] < asnList[j] + }) + + for _, asn := range asnList { + overlappingMachines, ok := machineASNs[asn] + if !ok || len(overlappingMachines) == 0 { + continue + } + + var sharedIDs []string + for _, m := range overlappingMachines { + m := m + sharedIDs = append(sharedIDs, *m.ID) + } + + overlaps = append(overlaps, fmt.Sprintf("- ASN (%d) not unique, shared with %s", asn, sharedIDs)) + } + + if len(overlaps) == 0 { + return false + } + + sort.Slice(overlaps, func(i, j int) bool { + return overlaps[i] < overlaps[j] + }) + + i.details = strings.Join(overlaps, "\n") + + return true +} + +func (i *IssueASNUniqueness) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeASNUniqueness, + Severity: IssueSeverityMinor, + Description: "The ASN is not unique (only impact on firewalls)", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique", + } +} + +func (i *IssueASNUniqueness) Details() string { + return i.details +} + +func (i *IssueNonDistinctBMCIP) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + if m.Ipmi == nil || m.Ipmi.Address == nil { + return false + } + + var ( + bmcIP = *m.Ipmi.Address + overlaps []string + ) + + for _, machineFromAll := range ms { + if pointer.SafeDeref(machineFromAll.ID) == pointer.SafeDeref(m.ID) { + continue + } + otherMachine := machineFromAll + + if otherMachine.Ipmi == nil || otherMachine.Ipmi.Address == nil { + continue + } + + if bmcIP == *otherMachine.Ipmi.Address { + overlaps = append(overlaps, *otherMachine.ID) + } + } + + if len(overlaps) == 0 { + return false + } + + i.details = fmt.Sprintf("BMC IP (%s) not unique, shared with %s", bmcIP, overlaps) + + return true +} + +func (i *IssueNonDistinctBMCIP) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeNonDistinctBMCIP, + Description: "BMC IP address is not distinct", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-no-distinct-ip", + } +} + +func (i *IssueNonDistinctBMCIP) Details() string { + return i.details +} diff --git a/pkg/api/issues_test.go b/pkg/api/issues_test.go new file mode 100644 index 00000000..d22b6dee --- /dev/null +++ b/pkg/api/issues_test.go @@ -0,0 +1,356 @@ +package api + +import ( + "fmt" + "testing" + "time" + + "bou.ke/monkey" + "github.com/go-openapi/strfmt" + "github.com/google/go-cmp/cmp" + "github.com/metal-stack/metal-go/api/models" + "github.com/metal-stack/metal-lib/pkg/pointer" + "github.com/stretchr/testify/require" +) + +var ( + testTime = time.Date(2022, time.May, 19, 1, 2, 3, 4, time.UTC) + goodMachine = &models.V1MachineIPMIResponse{ + ID: pointer.Pointer("0"), + Liveliness: pointer.Pointer("Alive"), + Partition: &models.V1PartitionResponse{ID: pointer.Pointer("a")}, + Ipmi: &models.V1MachineIPMI{ + Address: pointer.Pointer("1.2.3.4"), + Mac: pointer.Pointer("aa:bb:..."), + }, + } + noPartitionMachine = &models.V1MachineIPMIResponse{ID: pointer.Pointer("1"), Partition: nil} + deadMachine = &models.V1MachineIPMIResponse{ID: pointer.Pointer("2"), Liveliness: pointer.Pointer("Dead"), Partition: &models.V1PartitionResponse{ID: pointer.Pointer("a")}} + unknownMachine = &models.V1MachineIPMIResponse{ID: pointer.Pointer("3"), Liveliness: pointer.Pointer("Unknown"), Partition: &models.V1PartitionResponse{ID: pointer.Pointer("a")}} + notAvailableMachine1 = &models.V1MachineIPMIResponse{ID: pointer.Pointer("4"), Partition: &models.V1PartitionResponse{ID: pointer.Pointer("a")}} + notAvailableMachine2 = &models.V1MachineIPMIResponse{ID: pointer.Pointer("5"), Liveliness: pointer.Pointer(""), Partition: &models.V1PartitionResponse{ID: pointer.Pointer("a")}} + failedReclaimMachine = &models.V1MachineIPMIResponse{ID: pointer.Pointer("6"), Events: &models.V1MachineRecentProvisioningEvents{FailedMachineReclaim: pointer.Pointer(true)}} + crashingMachine = &models.V1MachineIPMIResponse{ID: pointer.Pointer("7"), Events: &models.V1MachineRecentProvisioningEvents{CrashLoop: pointer.Pointer(true)}} + lastEventErrorMachine = &models.V1MachineIPMIResponse{ + ID: pointer.Pointer("8"), + Events: &models.V1MachineRecentProvisioningEvents{ + LastErrorEvent: &models.V1MachineProvisioningEvent{ + Time: strfmt.DateTime(testTime.Add(-5 * time.Minute)), + }, + }, + } + bmcWithoutMacMachine = &models.V1MachineIPMIResponse{ID: pointer.Pointer("9"), Ipmi: &models.V1MachineIPMI{}} + bmcWithoutIPMachine = &models.V1MachineIPMIResponse{ID: pointer.Pointer("10"), Ipmi: &models.V1MachineIPMI{}} + asnSharedMachine1 = &models.V1MachineIPMIResponse{ + ID: pointer.Pointer("11"), + Allocation: &models.V1MachineAllocation{ + Role: pointer.Pointer(models.V1MachineAllocationRoleFirewall), + Networks: []*models.V1MachineNetwork{ + { + Asn: pointer.Pointer(int64(0)), + }, + { + Asn: pointer.Pointer(int64(100)), + }, + { + Asn: pointer.Pointer(int64(200)), + }, + }, + }, + } + asnSharedMachine2 = &models.V1MachineIPMIResponse{ + ID: pointer.Pointer("12"), + Allocation: &models.V1MachineAllocation{ + Role: pointer.Pointer(models.V1MachineAllocationRoleFirewall), + Networks: []*models.V1MachineNetwork{ + { + Asn: pointer.Pointer(int64(1)), + }, + { + Asn: pointer.Pointer(int64(100)), + }, + { + Asn: pointer.Pointer(int64(200)), + }, + }, + }, + } + nonDistinctBMCMachine1 = &models.V1MachineIPMIResponse{ + ID: pointer.Pointer("13"), + Ipmi: &models.V1MachineIPMI{ + Address: pointer.Pointer("127.0.0.1"), + }, + } + nonDistinctBMCMachine2 = &models.V1MachineIPMIResponse{ + ID: pointer.Pointer("14"), + Ipmi: &models.V1MachineIPMI{ + Address: pointer.Pointer("127.0.0.1"), + }, + } +) + +func init() { + _ = monkey.Patch(time.Now, func() time.Time { return testTime }) +} + +func TestFindIssues(t *testing.T) { + tests := []struct { + name string + c *IssueConfig + want MachineIssues + }{ + { + name: "no partition", + c: &IssueConfig{ + Only: []IssueType{IssueTypeNoPartition}, + Machines: []*models.V1MachineIPMIResponse{noPartitionMachine, goodMachine}, + }, + want: MachineIssues{ + { + Machine: noPartitionMachine, + Issues: Issues{ + toIssue(&IssueNoPartition{}), + }, + }, + }, + }, + { + name: "liveliness dead", + c: &IssueConfig{ + Only: []IssueType{IssueTypeLivelinessDead}, + Machines: []*models.V1MachineIPMIResponse{deadMachine, goodMachine}, + }, + want: MachineIssues{ + { + Machine: deadMachine, + Issues: Issues{ + toIssue(&IssueLivelinessDead{}), + }, + }, + }, + }, + { + name: "liveliness unknown", + c: &IssueConfig{ + Only: []IssueType{IssueTypeLivelinessUnknown}, + Machines: []*models.V1MachineIPMIResponse{unknownMachine, goodMachine}, + }, + want: MachineIssues{ + { + Machine: unknownMachine, + Issues: Issues{ + toIssue(&IssueLivelinessUnknown{}), + }, + }, + }, + }, + { + name: "liveliness not available", + c: &IssueConfig{ + Only: []IssueType{IssueTypeLivelinessNotAvailable}, + Machines: []*models.V1MachineIPMIResponse{notAvailableMachine1, notAvailableMachine2, goodMachine}, + }, + want: MachineIssues{ + { + Machine: notAvailableMachine1, + Issues: Issues{ + toIssue(&IssueLivelinessNotAvailable{}), + }, + }, + { + Machine: notAvailableMachine2, + Issues: Issues{ + toIssue(&IssueLivelinessNotAvailable{}), + }, + }, + }, + }, + { + name: "failed machine reclaim", + c: &IssueConfig{ + Only: []IssueType{IssueTypeFailedMachineReclaim}, + Machines: []*models.V1MachineIPMIResponse{failedReclaimMachine, goodMachine}, + }, + want: MachineIssues{ + { + Machine: failedReclaimMachine, + Issues: Issues{ + toIssue(&IssueFailedMachineReclaim{}), + }, + }, + }, + }, + { + name: "crashloop", + c: &IssueConfig{ + Only: []IssueType{IssueTypeCrashLoop}, + Machines: []*models.V1MachineIPMIResponse{crashingMachine, goodMachine}, + }, + want: MachineIssues{ + { + Machine: crashingMachine, + Issues: Issues{ + toIssue(&IssueCrashLoop{}), + }, + }, + }, + }, + { + name: "last event error", + c: &IssueConfig{ + Only: []IssueType{IssueTypeLastEventError}, + Machines: []*models.V1MachineIPMIResponse{lastEventErrorMachine, goodMachine}, + LastErrorThreshold: 10 * time.Minute, + }, + want: MachineIssues{ + { + Machine: lastEventErrorMachine, + Issues: Issues{ + toIssue(&IssueLastEventError{lastEventThreshold: 10 * time.Minute, details: "occurred 5m0s ago"}), + }, + }, + }, + }, + { + name: "bmc without mac", + c: &IssueConfig{ + Only: []IssueType{IssueTypeBMCWithoutMAC}, + Machines: []*models.V1MachineIPMIResponse{bmcWithoutMacMachine, goodMachine}, + }, + want: MachineIssues{ + { + Machine: bmcWithoutMacMachine, + Issues: Issues{ + toIssue(&IssueBMCWithoutMAC{}), + }, + }, + }, + }, + { + name: "bmc without ip", + c: &IssueConfig{ + Only: []IssueType{IssueTypeBMCWithoutIP}, + Machines: []*models.V1MachineIPMIResponse{bmcWithoutIPMachine, goodMachine}, + }, + want: MachineIssues{ + { + Machine: bmcWithoutIPMachine, + Issues: Issues{ + toIssue(&IssueBMCWithoutIP{}), + }, + }, + }, + }, + { + name: "asn shared", + c: &IssueConfig{ + Only: []IssueType{IssueTypeASNUniqueness}, + Machines: []*models.V1MachineIPMIResponse{asnSharedMachine1, asnSharedMachine2, goodMachine}, + }, + want: MachineIssues{ + { + Machine: asnSharedMachine1, + Issues: Issues{ + toIssue(&IssueASNUniqueness{ + details: fmt.Sprintf("- ASN (100) not unique, shared with [%[1]s]\n- ASN (200) not unique, shared with [%[1]s]", *asnSharedMachine2.ID), + }), + }, + }, + { + Machine: asnSharedMachine2, + Issues: Issues{ + toIssue(&IssueASNUniqueness{ + details: fmt.Sprintf("- ASN (100) not unique, shared with [%[1]s]\n- ASN (200) not unique, shared with [%[1]s]", *asnSharedMachine1.ID), + }), + }, + }, + }, + }, + { + name: "non distinct bmc ip", + c: &IssueConfig{ + Only: []IssueType{IssueTypeNonDistinctBMCIP}, + Machines: []*models.V1MachineIPMIResponse{nonDistinctBMCMachine1, nonDistinctBMCMachine2, goodMachine}, + }, + want: MachineIssues{ + { + Machine: nonDistinctBMCMachine1, + Issues: Issues{ + toIssue(&IssueNonDistinctBMCIP{ + details: fmt.Sprintf("BMC IP (127.0.0.1) not unique, shared with [%[1]s]", *nonDistinctBMCMachine2.ID), + }), + }, + }, + { + Machine: nonDistinctBMCMachine2, + Issues: Issues{ + toIssue(&IssueNonDistinctBMCIP{ + details: fmt.Sprintf("BMC IP (127.0.0.1) not unique, shared with [%[1]s]", *nonDistinctBMCMachine1.ID), + }), + }, + }, + }, + }, + { + name: "severity major", + c: &IssueConfig{ + Severity: IssueSeverityMajor, + Machines: []*models.V1MachineIPMIResponse{deadMachine, failedReclaimMachine, notAvailableMachine1, goodMachine}, + }, + want: MachineIssues{ + { + Machine: deadMachine, + Issues: Issues{ + toIssue(&IssueLivelinessDead{}), + }, + }, + { + Machine: failedReclaimMachine, + Issues: Issues{ + toIssue(&IssueNoPartition{}), + toIssue(&IssueFailedMachineReclaim{}), + }, + }, + }, + }, + { + name: "severity critical", + c: &IssueConfig{ + Severity: IssueSeverityCritical, + Machines: []*models.V1MachineIPMIResponse{deadMachine, failedReclaimMachine, notAvailableMachine1, goodMachine}, + }, + want: MachineIssues{ + { + Machine: failedReclaimMachine, + Issues: Issues{ + toIssue(&IssueFailedMachineReclaim{}), + }, + }, + }, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + got, err := FindIssues(tt.c) + require.NoError(t, err) + + if diff := cmp.Diff(tt.want, got, cmp.AllowUnexported(IssueLastEventError{}, IssueASNUniqueness{}, IssueNonDistinctBMCIP{})); diff != "" { + t.Errorf("diff (+got -want):\n %s", diff) + } + }) + } +} + +func TestAllIssues(t *testing.T) { + issuesTypes := map[IssueType]bool{} + for _, i := range AllIssues() { + issuesTypes[i.Type] = true + } + + for _, ty := range AllIssueTypes() { + if _, ok := issuesTypes[ty]; !ok { + t.Errorf("issue of type %s not contained in all issues", ty) + } + } +} From 467098d606fb86409e95b83cc1389ff393c67f56 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Thu, 1 Sep 2022 16:49:04 +0200 Subject: [PATCH 2/9] A bit reordering. --- cmd/machine.go | 2 +- pkg/api/issues.go | 236 +++++++++++++++++++++++----------------------- 2 files changed, 119 insertions(+), 119 deletions(-) diff --git a/cmd/machine.go b/cmd/machine.go index c39fe717..aec217dd 100644 --- a/cmd/machine.go +++ b/cmd/machine.go @@ -359,7 +359,7 @@ In case the machine did not register properly a direct ipmi console access is av machineIssuesCmd.Flags().StringSlice("omit", []string{}, "issue types to omit [optional]") machineIssuesCmd.Flags().String("severity", "", "issue severity to include [optional]") must(machineIssuesCmd.RegisterFlagCompletionFunc("severity", cobra.FixedCompletions([]string{"critical", "major", "minor"}, cobra.ShellCompDirectiveNoFileComp))) - machineIssuesCmd.Flags().Duration("failed-event-threshold", api.DefaultLastErrorThreshold(), "issue severity to include [optional]") + machineIssuesCmd.Flags().Duration("failed-event-threshold", api.DefaultLastErrorThreshold(), "the duration up to how long in the past a machine last event error will be counted as an issue [optional]") must(machineIssuesCmd.RegisterFlagCompletionFunc("omit", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { var shortNames []string diff --git a/pkg/api/issues.go b/pkg/api/issues.go index f2dde381..1fb93bfe 100644 --- a/pkg/api/issues.go +++ b/pkg/api/issues.go @@ -47,6 +47,7 @@ type ( // Issues is a list of issues Issues []Issue + // IssueConfig contains configuration parameters for finding machine issues IssueConfig struct { Machines []*models.V1MachineIPMIResponse Severity IssueSeverity @@ -55,23 +56,6 @@ type ( LastErrorThreshold time.Duration } - issue interface { - // Evaluates whether a given machine has the this issue. - // the second argument "ms" is a list of all the other machines, which can be necessary to determine a machine issue. - Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool - // Spec returns the issue spec of this issue. - Spec() *issueSpec - // Details returns additional information on the issue after the evaluation. - Details() string - } - - issueSpec struct { - Type IssueType - Severity IssueSeverity - Description string - RefURL string - } - IssueNoPartition struct{} IssueLivelinessDead struct{} IssueLivelinessUnknown struct{} @@ -99,6 +83,23 @@ type ( // MachineIssues is map of a machine response to a list of machine issues MachineIssues []*MachineWithIssues + issue interface { + // Evaluates whether a given machine has the this issue. + // the second argument "ms" is a list of all the other machines, which can be necessary to determine a machine issue. + Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool + // Spec returns the issue spec of this issue. + Spec() *issueSpec + // Details returns additional information on the issue after the evaluation. + Details() string + } + + issueSpec struct { + Type IssueType + Severity IssueSeverity + Description string + RefURL string + } + machineIssueMap map[*models.V1MachineIPMIResponse]Issues ) @@ -129,7 +130,7 @@ func AllIssues() Issues { ) for _, t := range AllIssueTypes() { - i, err := c.NewIssueFromType(t) + i, err := c.newIssueFromType(t) if err != nil { continue } @@ -163,35 +164,6 @@ func SeverityFromString(s string) (IssueSeverity, error) { } } -func (c *IssueConfig) NewIssueFromType(t IssueType) (issue, error) { - switch t { - case IssueTypeNoPartition: - return &IssueNoPartition{}, nil - case IssueTypeLivelinessDead: - return &IssueLivelinessDead{}, nil - case IssueTypeLivelinessUnknown: - return &IssueLivelinessUnknown{}, nil - case IssueTypeLivelinessNotAvailable: - return &IssueLivelinessNotAvailable{}, nil - case IssueTypeFailedMachineReclaim: - return &IssueFailedMachineReclaim{}, nil - case IssueTypeCrashLoop: - return &IssueCrashLoop{}, nil - case IssueTypeLastEventError: - return &IssueLastEventError{lastEventThreshold: c.LastErrorThreshold}, nil - case IssueTypeBMCWithoutMAC: - return &IssueBMCWithoutMAC{}, nil - case IssueTypeBMCWithoutIP: - return &IssueBMCWithoutIP{}, nil - case IssueTypeASNUniqueness: - return &IssueASNUniqueness{}, nil - case IssueTypeNonDistinctBMCIP: - return &IssueNonDistinctBMCIP{}, nil - default: - return nil, fmt.Errorf("unknown issue type: %s", t) - } -} - func (mis MachineIssues) Get(id string) *MachineWithIssues { for _, m := range mis { m := m @@ -219,7 +191,7 @@ func FindIssues(c *IssueConfig) (MachineIssues, error) { for _, m := range c.Machines { m := m - i, err := c.NewIssueFromType(t) + i, err := c.newIssueFromType(t) if err != nil { return nil, err } @@ -238,7 +210,7 @@ func FindIssues(c *IssueConfig) (MachineIssues, error) { } func (c *IssueConfig) includeIssue(t IssueType) bool { - issue, err := c.NewIssueFromType(t) + issue, err := c.newIssueFromType(t) if err != nil { return false } @@ -265,6 +237,35 @@ func (c *IssueConfig) includeIssue(t IssueType) bool { return true } +func (c *IssueConfig) newIssueFromType(t IssueType) (issue, error) { + switch t { + case IssueTypeNoPartition: + return &IssueNoPartition{}, nil + case IssueTypeLivelinessDead: + return &IssueLivelinessDead{}, nil + case IssueTypeLivelinessUnknown: + return &IssueLivelinessUnknown{}, nil + case IssueTypeLivelinessNotAvailable: + return &IssueLivelinessNotAvailable{}, nil + case IssueTypeFailedMachineReclaim: + return &IssueFailedMachineReclaim{}, nil + case IssueTypeCrashLoop: + return &IssueCrashLoop{}, nil + case IssueTypeLastEventError: + return &IssueLastEventError{lastEventThreshold: c.LastErrorThreshold}, nil + case IssueTypeBMCWithoutMAC: + return &IssueBMCWithoutMAC{}, nil + case IssueTypeBMCWithoutIP: + return &IssueBMCWithoutIP{}, nil + case IssueTypeASNUniqueness: + return &IssueASNUniqueness{}, nil + case IssueTypeNonDistinctBMCIP: + return &IssueNonDistinctBMCIP{}, nil + default: + return nil, fmt.Errorf("unknown issue type: %s", t) + } +} + func (mim machineIssueMap) add(m *models.V1MachineIPMIResponse, issue Issue) { issues, ok := mim[m] if !ok { @@ -291,14 +292,6 @@ func (mim machineIssueMap) toList() MachineIssues { return res } -func (i *IssueNoPartition) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { - return m.Partition == nil -} - -func (i *IssueNoPartition) Details() string { - return "" -} - func (i *IssueNoPartition) Spec() *issueSpec { return &issueSpec{ Type: IssueTypeNoPartition, @@ -308,8 +301,12 @@ func (i *IssueNoPartition) Spec() *issueSpec { } } -func (i *IssueLivelinessDead) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { - return m.Liveliness != nil && *m.Liveliness == "Dead" +func (i *IssueNoPartition) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Partition == nil +} + +func (i *IssueNoPartition) Details() string { + return "" } func (i *IssueLivelinessDead) Spec() *issueSpec { @@ -321,12 +318,12 @@ func (i *IssueLivelinessDead) Spec() *issueSpec { } } -func (i *IssueLivelinessDead) Details() string { - return "" +func (i *IssueLivelinessDead) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Liveliness != nil && *m.Liveliness == "Dead" } -func (i *IssueLivelinessUnknown) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { - return m.Liveliness != nil && *m.Liveliness == "Unknown" +func (i *IssueLivelinessDead) Details() string { + return "" } func (i *IssueLivelinessUnknown) Spec() *issueSpec { @@ -338,10 +335,21 @@ func (i *IssueLivelinessUnknown) Spec() *issueSpec { } } +func (i *IssueLivelinessUnknown) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Liveliness != nil && *m.Liveliness == "Unknown" +} + func (i *IssueLivelinessUnknown) Details() string { return "" } +func (i *IssueLivelinessNotAvailable) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeLivelinessNotAvailable, + Severity: IssueSeverityMinor, + Description: "the machine liveliness is not available", + } +} func (i *IssueLivelinessNotAvailable) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { if m.Liveliness == nil { return true @@ -356,22 +364,10 @@ func (i *IssueLivelinessNotAvailable) Evaluate(m *models.V1MachineIPMIResponse, return !allowed[*m.Liveliness] } -func (i *IssueLivelinessNotAvailable) Spec() *issueSpec { - return &issueSpec{ - Type: IssueTypeLivelinessNotAvailable, - Severity: IssueSeverityMinor, - Description: "the machine liveliness is not available", - } -} - func (i *IssueLivelinessNotAvailable) Details() string { return "" } -func (i *IssueFailedMachineReclaim) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { - return pointer.SafeDeref(pointer.SafeDeref(m.Events).FailedMachineReclaim) -} - func (i *IssueFailedMachineReclaim) Spec() *issueSpec { return &issueSpec{ Type: IssueTypeFailedMachineReclaim, @@ -381,10 +377,23 @@ func (i *IssueFailedMachineReclaim) Spec() *issueSpec { } } +func (i *IssueFailedMachineReclaim) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return pointer.SafeDeref(pointer.SafeDeref(m.Events).FailedMachineReclaim) +} + func (i *IssueFailedMachineReclaim) Details() string { return "" } +func (i *IssueCrashLoop) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeCrashLoop, + Severity: IssueSeverityMajor, + Description: fmt.Sprintf("machine is in a provisioning crash loop (%s)", Loop), + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop", + } +} + func (i *IssueCrashLoop) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { if pointer.SafeDeref(pointer.SafeDeref(m.Events).CrashLoop) { if m.Events != nil && len(m.Events.Log) > 0 && *m.Events.Log[0].Event == "Waiting" { @@ -396,19 +405,19 @@ func (i *IssueCrashLoop) Evaluate(m *models.V1MachineIPMIResponse, ms []*models. return false } -func (i *IssueCrashLoop) Spec() *issueSpec { - return &issueSpec{ - Type: IssueTypeCrashLoop, - Severity: IssueSeverityMajor, - Description: fmt.Sprintf("machine is in a provisioning crash loop (%s)", Loop), - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#crashloop", - } -} - func (i *IssueCrashLoop) Details() string { return "" } +func (i *IssueLastEventError) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeLastEventError, + Severity: IssueSeverityMinor, + Description: "the machine had an error during the provisioning lifecycle", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#last-event-error", + } +} + func (i *IssueLastEventError) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { if i.lastEventThreshold == 0 { return false @@ -424,23 +433,10 @@ func (i *IssueLastEventError) Evaluate(m *models.V1MachineIPMIResponse, ms []*mo return false } -func (i *IssueLastEventError) Spec() *issueSpec { - return &issueSpec{ - Type: IssueTypeLastEventError, - Severity: IssueSeverityMinor, - Description: "the machine had an error during the provisioning lifecycle", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#last-event-error", - } -} - func (i *IssueLastEventError) Details() string { return i.details } -func (i *IssueBMCWithoutMAC) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { - return m.Ipmi != nil && (m.Ipmi.Mac == nil || *m.Ipmi.Mac == "") -} - func (i *IssueBMCWithoutMAC) Spec() *issueSpec { return &issueSpec{ Type: IssueTypeBMCWithoutMAC, @@ -450,12 +446,12 @@ func (i *IssueBMCWithoutMAC) Spec() *issueSpec { } } -func (i *IssueBMCWithoutMAC) Details() string { - return "" +func (i *IssueBMCWithoutMAC) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Ipmi != nil && (m.Ipmi.Mac == nil || *m.Ipmi.Mac == "") } -func (i *IssueBMCWithoutIP) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { - return m.Ipmi != nil && (m.Ipmi.Address == nil || *m.Ipmi.Address == "") +func (i *IssueBMCWithoutMAC) Details() string { + return "" } func (i *IssueBMCWithoutIP) Spec() *issueSpec { @@ -467,10 +463,23 @@ func (i *IssueBMCWithoutIP) Spec() *issueSpec { } } +func (i *IssueBMCWithoutIP) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { + return m.Ipmi != nil && (m.Ipmi.Address == nil || *m.Ipmi.Address == "") +} + func (i *IssueBMCWithoutIP) Details() string { return "" } +func (i *IssueASNUniqueness) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeASNUniqueness, + Severity: IssueSeverityMinor, + Description: "The ASN is not unique (only impact on firewalls)", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique", + } +} + func (i *IssueASNUniqueness) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { var ( machineASNs = map[int64][]*models.V1MachineIPMIResponse{} @@ -552,19 +561,18 @@ func (i *IssueASNUniqueness) Evaluate(m *models.V1MachineIPMIResponse, ms []*mod return true } -func (i *IssueASNUniqueness) Spec() *issueSpec { - return &issueSpec{ - Type: IssueTypeASNUniqueness, - Severity: IssueSeverityMinor, - Description: "The ASN is not unique (only impact on firewalls)", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#asn-not-unique", - } -} - func (i *IssueASNUniqueness) Details() string { return i.details } +func (i *IssueNonDistinctBMCIP) Spec() *issueSpec { + return &issueSpec{ + Type: IssueTypeNonDistinctBMCIP, + Description: "BMC IP address is not distinct", + RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-no-distinct-ip", + } +} + func (i *IssueNonDistinctBMCIP) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { if m.Ipmi == nil || m.Ipmi.Address == nil { return false @@ -599,14 +607,6 @@ func (i *IssueNonDistinctBMCIP) Evaluate(m *models.V1MachineIPMIResponse, ms []* return true } -func (i *IssueNonDistinctBMCIP) Spec() *issueSpec { - return &issueSpec{ - Type: IssueTypeNonDistinctBMCIP, - Description: "BMC IP address is not distinct", - RefURL: "https://docs.metal-stack.io/stable/installation/troubleshoot/#bmc-no-distinct-ip", - } -} - func (i *IssueNonDistinctBMCIP) Details() string { return i.details } From 9820671f792efdc0b4d500beb1a1e83cc35cf3b5 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Thu, 1 Sep 2022 16:49:15 +0200 Subject: [PATCH 3/9] Generate. --- docs/metalctl_machine_issues.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/metalctl_machine_issues.md b/docs/metalctl_machine_issues.md index 5b912418..76bc27b0 100644 --- a/docs/metalctl_machine_issues.md +++ b/docs/metalctl_machine_issues.md @@ -24,7 +24,7 @@ metalctl machine issues [] [flags] ### Options ``` - --failed-event-threshold duration issue severity to include [optional] (default 168h0m0s) + --failed-event-threshold duration the duration up to how long in the past a machine last event error will be counted as an issue [optional] (default 168h0m0s) -h, --help help for issues --hostname string allocation hostname to filter [optional] --id string ID to filter [optional] From af832adabe59e89388b4103d170bfaf1efea32f3 Mon Sep 17 00:00:00 2001 From: Gerrit Date: Thu, 1 Sep 2022 17:12:20 +0200 Subject: [PATCH 4/9] Refactor severity. --- cmd/machine.go | 7 +++++- pkg/api/issues.go | 54 +++++++++++++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/cmd/machine.go b/cmd/machine.go index aec217dd..0569408a 100644 --- a/cmd/machine.go +++ b/cmd/machine.go @@ -358,7 +358,12 @@ In case the machine did not register properly a direct ipmi console access is av machineIssuesCmd.Flags().StringSlice("only", []string{}, "issue types to include [optional]") machineIssuesCmd.Flags().StringSlice("omit", []string{}, "issue types to omit [optional]") machineIssuesCmd.Flags().String("severity", "", "issue severity to include [optional]") - must(machineIssuesCmd.RegisterFlagCompletionFunc("severity", cobra.FixedCompletions([]string{"critical", "major", "minor"}, cobra.ShellCompDirectiveNoFileComp))) + + var severities []string + for _, s := range api.AllSevereties() { + severities = append(severities, string(s)) + } + must(machineIssuesCmd.RegisterFlagCompletionFunc("severity", cobra.FixedCompletions(severities, cobra.ShellCompDirectiveNoFileComp))) machineIssuesCmd.Flags().Duration("failed-event-threshold", api.DefaultLastErrorThreshold(), "the duration up to how long in the past a machine last event error will be counted as an issue [optional]") must(machineIssuesCmd.RegisterFlagCompletionFunc("omit", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { diff --git a/pkg/api/issues.go b/pkg/api/issues.go index 1fb93bfe..fd9e46cf 100644 --- a/pkg/api/issues.go +++ b/pkg/api/issues.go @@ -12,12 +12,12 @@ import ( const ( // IssueSeverityMinor is an issue that needs is informational and should be checked from time to time. - IssueSeverityMinor IssueSeverity = iota + IssueSeverityMinor IssueSeverity = "minor" // IssueSeverityMajor is an issue where resources are wasted and for that may be good workarounds. // This should be resolved as soon as possible. - IssueSeverityMajor IssueSeverity = iota + IssueSeverityMajor IssueSeverity = "major" // IssueSeverityCritical is an issue that can lead to disfunction of the system and need to be handled as quickly as possible. - IssueSeverityCritical IssueSeverity = iota + IssueSeverityCritical IssueSeverity = "critical" IssueTypeNoPartition IssueType = "no-partition" IssueTypeLivelinessDead IssueType = "liveliness-dead" @@ -33,7 +33,7 @@ const ( ) type ( - IssueSeverity int + IssueSeverity string IssueType string // Issue formulates an issue of a machine @@ -123,6 +123,37 @@ func AllIssueTypes() []IssueType { } } +func AllSevereties() []IssueSeverity { + return []IssueSeverity{ + IssueSeverityMinor, + IssueSeverityMajor, + IssueSeverityCritical, + } +} + +func SeverityFromString(input string) (IssueSeverity, error) { + switch IssueSeverity(input) { + case IssueSeverityCritical: + return IssueSeverityCritical, nil + case IssueSeverityMajor: + return IssueSeverityMajor, nil + case IssueSeverityMinor: + return IssueSeverityMinor, nil + default: + return "", fmt.Errorf("unknown issue severity: %s", input) + } +} + +func (s IssueSeverity) LowerThan(o IssueSeverity) bool { + smap := map[IssueSeverity]int{ + IssueSeverityCritical: 10, + IssueSeverityMajor: 5, + IssueSeverityMinor: 0, + } + + return smap[s] < smap[o] +} + func AllIssues() Issues { var ( c = &IssueConfig{} @@ -151,19 +182,6 @@ func toIssue(i issue) Issue { } } -func SeverityFromString(s string) (IssueSeverity, error) { - switch strings.ToLower(s) { - case "critical": - return IssueSeverityCritical, nil - case "major": - return IssueSeverityMajor, nil - case "minor": - return IssueSeverityMinor, nil - default: - return IssueSeverity(0), fmt.Errorf("unknown severity: %s", s) - } -} - func (mis MachineIssues) Get(id string) *MachineWithIssues { for _, m := range mis { m := m @@ -215,7 +233,7 @@ func (c *IssueConfig) includeIssue(t IssueType) bool { return false } - if issue.Spec().Severity < c.Severity { + if issue.Spec().Severity.LowerThan(c.Severity) { return false } From 6a9ab69386ceb8cf89e8a3c4ec52f82879b2981b Mon Sep 17 00:00:00 2001 From: Gerrit Date: Fri, 2 Sep 2022 09:29:02 +0200 Subject: [PATCH 5/9] No static usage of last error event duration. --- cmd/machine.go | 18 ++++++++++------- cmd/printers.go | 1 + cmd/tableprinters/machine.go | 10 ++++----- cmd/tableprinters/printer.go | 8 +++++++- docs/metalctl_machine_ipmi.md | 21 ++++++++++--------- docs/metalctl_machine_issues.md | 28 ++++++++++++------------- docs/metalctl_machine_list.md | 23 +++++++++++---------- docs/metalctl_machine_logs.md | 3 ++- pkg/api/issues.go | 36 ++++++++++++++++++--------------- 9 files changed, 83 insertions(+), 65 deletions(-) diff --git a/cmd/machine.go b/cmd/machine.go index 0569408a..f4a3e7ca 100644 --- a/cmd/machine.go +++ b/cmd/machine.go @@ -36,7 +36,7 @@ type machineCmd struct { *config } -func (c *machineCmd) listCmdFlags(cmd *cobra.Command) { +func (c *machineCmd) listCmdFlags(cmd *cobra.Command, lastEventErrorThresholdDefault time.Duration) { listFlagCompletions := []struct { flagName string f func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) @@ -57,10 +57,13 @@ func (c *machineCmd) listCmdFlags(cmd *cobra.Command) { cmd.Flags().String("hostname", "", "allocation hostname to filter [optional]") cmd.Flags().String("mac", "", "mac to filter [optional]") cmd.Flags().StringSlice("tags", []string{}, "tags to filter, use it like: --tags \"tag1,tag2\" or --tags \"tag3\".") + cmd.Flags().Duration("last-event-error-threshold", lastEventErrorThresholdDefault, "the duration up to how long in the past a machine last event error will be counted as an issue [optional]") + for _, c := range listFlagCompletions { c := c must(cmd.RegisterFlagCompletionFunc(c.flagName, c.f)) } + cmd.Long = cmd.Short + "\n" + api.EmojiHelpText() } @@ -126,7 +129,7 @@ Once created the machine installation can not be modified anymore. cmd.Flags().Bool("remove-from-database", false, "remove given machine from the database, is only required for maintenance reasons [optional] (admin only).") }, ListCmdMutateFn: func(cmd *cobra.Command) { - w.listCmdFlags(cmd) + w.listCmdFlags(cmd, 1*time.Hour) }, UpdateCmdMutateFn: func(cmd *cobra.Command) { cmd.Flags().String("description", "", "the description of the machine [optional]") @@ -352,8 +355,8 @@ In case the machine did not register properly a direct ipmi console access is av ValidArgsFunction: c.comp.MachineListCompletion, } - w.listCmdFlags(machineIpmiCmd) - w.listCmdFlags(machineIssuesCmd) + w.listCmdFlags(machineIpmiCmd, 1*time.Hour) + w.listCmdFlags(machineIssuesCmd, api.DefaultLastErrorThreshold()) machineIssuesCmd.Flags().StringSlice("only", []string{}, "issue types to include [optional]") machineIssuesCmd.Flags().StringSlice("omit", []string{}, "issue types to omit [optional]") @@ -364,7 +367,6 @@ In case the machine did not register properly a direct ipmi console access is av severities = append(severities, string(s)) } must(machineIssuesCmd.RegisterFlagCompletionFunc("severity", cobra.FixedCompletions(severities, cobra.ShellCompDirectiveNoFileComp))) - machineIssuesCmd.Flags().Duration("failed-event-threshold", api.DefaultLastErrorThreshold(), "the duration up to how long in the past a machine last event error will be counted as an issue [optional]") must(machineIssuesCmd.RegisterFlagCompletionFunc("omit", func(cmd *cobra.Command, args []string, toComplete string) ([]string, cobra.ShellCompDirective) { var shortNames []string @@ -381,6 +383,8 @@ In case the machine did not register properly a direct ipmi console access is av return shortNames, cobra.ShellCompDirectiveNoFileComp })) + machineLogsCmd.Flags().Duration("last-event-error-threshold", api.DefaultLastErrorThreshold(), "the duration up to how long in the past a machine last event error will be counted as an issue [optional]") + machineConsolePasswordCmd.Flags().StringP("reason", "", "", "a short description why access to the consolepassword is required") machineUpdateBiosCmd.Flags().StringP("revision", "", "", "the BIOS revision") @@ -1116,7 +1120,7 @@ func (c *machineCmd) machineLogs(args []string) error { if pointer.SafeDeref(resp.Events).LastErrorEvent != nil { timeSince := time.Since(time.Time(resp.Events.LastErrorEvent.Time)) - if timeSince > api.DefaultLastErrorThreshold() { + if timeSince > viper.GetDuration("last-event-error-threshold") { return nil } @@ -1288,7 +1292,7 @@ func (c *machineCmd) machineIssues(args []string) error { Severity: severity, Only: only, Omit: omit, - LastErrorThreshold: viper.GetDuration("failed-event-threshold"), + LastErrorThreshold: viper.GetDuration("last-event-error-threshold"), }) if err != nil { return err diff --git a/cmd/printers.go b/cmd/printers.go index b5e9802a..af45ae26 100644 --- a/cmd/printers.go +++ b/cmd/printers.go @@ -28,6 +28,7 @@ func newPrinterFromCLI(out io.Writer) printers.Printer { } tablePrinter := printers.NewTablePrinter(cfg).WithOut(out) tp.SetPrinter(tablePrinter) + tp.SetLastEventErrorThreshold(viper.GetDuration("last-event-error-threshold")) printer = tablePrinter case "template": printer = printers.NewTemplatePrinter(viper.GetString("template")).WithOut(out) diff --git a/cmd/tableprinters/machine.go b/cmd/tableprinters/machine.go index f557afcf..3fd60b99 100644 --- a/cmd/tableprinters/machine.go +++ b/cmd/tableprinters/machine.go @@ -71,7 +71,7 @@ func (t *TablePrinter) MachineTable(data []*models.V1MachineResponse, wide bool) lastEvent = *machine.Events.Log[0].Event } - emojis, _ := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State, 1*time.Hour) + emojis, _ := t.getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State) if wide { rows = append(rows, []string{machineID, lastEvent, when, age, desc, name, hostname, project, ips, sizeID, image, partitionID, started, tags, reserved}) @@ -83,7 +83,7 @@ func (t *TablePrinter) MachineTable(data []*models.V1MachineResponse, wide bool) return header, rows, nil } -func getMachineStatusEmojis(liveliness *string, events *models.V1MachineRecentProvisioningEvents, state *models.V1MachineState, lastErrorThreshold time.Duration) (string, string) { +func (t *TablePrinter) getMachineStatusEmojis(liveliness *string, events *models.V1MachineRecentProvisioningEvents, state *models.V1MachineState) (string, string) { var ( emojis []string wide []string @@ -122,7 +122,7 @@ func getMachineStatusEmojis(liveliness *string, events *models.V1MachineRecentPr wide = append(wide, "FailedReclaim") } - if events.LastErrorEvent != nil && time.Since(time.Time(events.LastErrorEvent.Time)) < lastErrorThreshold { + if events.LastErrorEvent != nil && time.Since(time.Time(events.LastErrorEvent.Time)) < t.lastEventErrorThreshold { emojis = append(emojis, api.Exclamation) wide = append(wide, "LastEventErrors") } @@ -182,7 +182,7 @@ func (t *TablePrinter) MachineIPMITable(data []*models.V1MachineIPMIResponse, wi biosVersion = pointer.SafeDeref(bios.Version) } - emojis, wideEmojis := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State, 1*time.Hour) + emojis, wideEmojis := t.getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State) if wide { rows = append(rows, []string{id, wideEmojis, powerText, ipAddress, mac, bpn, cs, ps, biosVersion, bmcVersion, size, partition}) @@ -290,7 +290,7 @@ func (t *TablePrinter) MachineIssuesTable(data api.MachineIssues, wide bool) ([] lastEvent = *machine.Events.Log[0].Event } - emojis, _ := getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State, api.DefaultLastErrorThreshold()) + emojis, _ := t.getMachineStatusEmojis(machine.Liveliness, machine.Events, machine.State) for _, issue := range machineWithIssues.Issues { text := fmt.Sprintf("%s (%s)", issue.Description, issue.Type) diff --git a/cmd/tableprinters/printer.go b/cmd/tableprinters/printer.go index 1324771b..2c8cbd73 100644 --- a/cmd/tableprinters/printer.go +++ b/cmd/tableprinters/printer.go @@ -2,6 +2,7 @@ package tableprinters import ( "fmt" + "time" "github.com/metal-stack/metal-go/api/models" "github.com/metal-stack/metal-lib/pkg/genericcli/printers" @@ -10,7 +11,8 @@ import ( ) type TablePrinter struct { - t *printers.TablePrinter + t *printers.TablePrinter + lastEventErrorThreshold time.Duration } func New() *TablePrinter { @@ -21,6 +23,10 @@ func (t *TablePrinter) SetPrinter(printer *printers.TablePrinter) { t.t = printer } +func (t *TablePrinter) SetLastEventErrorThreshold(threshold time.Duration) { + t.lastEventErrorThreshold = threshold +} + func (t *TablePrinter) ToHeaderAndRows(data any, wide bool) ([]string, [][]string, error) { switch d := data.(type) { case []*models.V1MachineResponse: diff --git a/docs/metalctl_machine_ipmi.md b/docs/metalctl_machine_ipmi.md index c9b17608..b1ddb2b3 100644 --- a/docs/metalctl_machine_ipmi.md +++ b/docs/metalctl_machine_ipmi.md @@ -24,16 +24,17 @@ metalctl machine ipmi [] [flags] ### Options ``` - -h, --help help for ipmi - --hostname string allocation hostname to filter [optional] - --id string ID to filter [optional] - --image string allocation image to filter [optional] - --mac string mac to filter [optional] - --name string allocation name to filter [optional] - --partition string partition to filter [optional] - --project string allocation project to filter [optional] - --size string size to filter [optional] - --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". + -h, --help help for ipmi + --hostname string allocation hostname to filter [optional] + --id string ID to filter [optional] + --image string allocation image to filter [optional] + --last-event-error-threshold duration the duration up to how long in the past a machine last event error will be counted as an issue [optional] (default 1h0m0s) + --mac string mac to filter [optional] + --name string allocation name to filter [optional] + --partition string partition to filter [optional] + --project string allocation project to filter [optional] + --size string size to filter [optional] + --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". ``` ### Options inherited from parent commands diff --git a/docs/metalctl_machine_issues.md b/docs/metalctl_machine_issues.md index 76bc27b0..ccc827a8 100644 --- a/docs/metalctl_machine_issues.md +++ b/docs/metalctl_machine_issues.md @@ -24,20 +24,20 @@ metalctl machine issues [] [flags] ### Options ``` - --failed-event-threshold duration the duration up to how long in the past a machine last event error will be counted as an issue [optional] (default 168h0m0s) - -h, --help help for issues - --hostname string allocation hostname to filter [optional] - --id string ID to filter [optional] - --image string allocation image to filter [optional] - --mac string mac to filter [optional] - --name string allocation name to filter [optional] - --omit strings issue types to omit [optional] - --only strings issue types to include [optional] - --partition string partition to filter [optional] - --project string allocation project to filter [optional] - --severity string issue severity to include [optional] - --size string size to filter [optional] - --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". + -h, --help help for issues + --hostname string allocation hostname to filter [optional] + --id string ID to filter [optional] + --image string allocation image to filter [optional] + --last-event-error-threshold duration the duration up to how long in the past a machine last event error will be counted as an issue [optional] (default 168h0m0s) + --mac string mac to filter [optional] + --name string allocation name to filter [optional] + --omit strings issue types to omit [optional] + --only strings issue types to include [optional] + --partition string partition to filter [optional] + --project string allocation project to filter [optional] + --severity string issue severity to include [optional] + --size string size to filter [optional] + --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". ``` ### Options inherited from parent commands diff --git a/docs/metalctl_machine_list.md b/docs/metalctl_machine_list.md index c3facfe1..61d7c2f3 100644 --- a/docs/metalctl_machine_list.md +++ b/docs/metalctl_machine_list.md @@ -24,17 +24,18 @@ metalctl machine list [flags] ### Options ``` - -h, --help help for list - --hostname string allocation hostname to filter [optional] - --id string ID to filter [optional] - --image string allocation image to filter [optional] - --mac string mac to filter [optional] - --name string allocation name to filter [optional] - --partition string partition to filter [optional] - --project string allocation project to filter [optional] - --size string size to filter [optional] - --sort-by strings sort by (comma separated) column(s), sort direction can be changed by appending :asc or :desc behind the column identifier. possible values: age|event|id|image|liveliness|partition|project|size|when - --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". + -h, --help help for list + --hostname string allocation hostname to filter [optional] + --id string ID to filter [optional] + --image string allocation image to filter [optional] + --last-event-error-threshold duration the duration up to how long in the past a machine last event error will be counted as an issue [optional] (default 1h0m0s) + --mac string mac to filter [optional] + --name string allocation name to filter [optional] + --partition string partition to filter [optional] + --project string allocation project to filter [optional] + --size string size to filter [optional] + --sort-by strings sort by (comma separated) column(s), sort direction can be changed by appending :asc or :desc behind the column identifier. possible values: age|event|id|image|liveliness|partition|project|size|when + --tags strings tags to filter, use it like: --tags "tag1,tag2" or --tags "tag3". ``` ### Options inherited from parent commands diff --git a/docs/metalctl_machine_logs.md b/docs/metalctl_machine_logs.md index 8dde0874..46a9bda5 100644 --- a/docs/metalctl_machine_logs.md +++ b/docs/metalctl_machine_logs.md @@ -9,7 +9,8 @@ metalctl machine logs [flags] ### Options ``` - -h, --help help for logs + -h, --help help for logs + --last-event-error-threshold duration the duration up to how long in the past a machine last event error will be counted as an issue [optional] (default 168h0m0s) ``` ### Options inherited from parent commands diff --git a/pkg/api/issues.go b/pkg/api/issues.go index fd9e46cf..66345ba4 100644 --- a/pkg/api/issues.go +++ b/pkg/api/issues.go @@ -75,6 +75,10 @@ type ( details string } + IssueNotEvaluableError struct { + err error + } + // MachineWithIssues summarizes a machine with issues MachineWithIssues struct { Machine *models.V1MachineIPMIResponse @@ -85,8 +89,8 @@ type ( issue interface { // Evaluates whether a given machine has the this issue. - // the second argument "ms" is a list of all the other machines, which can be necessary to determine a machine issue. - Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool + // the second argument contains additional information that may be required for the issue evaluation + Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool // Spec returns the issue spec of this issue. Spec() *issueSpec // Details returns additional information on the issue after the evaluation. @@ -218,7 +222,7 @@ func FindIssues(c *IssueConfig) (MachineIssues, error) { continue } - if i.Evaluate(m, c.Machines) { + if i.Evaluate(m, c) { res.add(m, toIssue(i)) } } @@ -319,7 +323,7 @@ func (i *IssueNoPartition) Spec() *issueSpec { } } -func (i *IssueNoPartition) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueNoPartition) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { return m.Partition == nil } @@ -336,7 +340,7 @@ func (i *IssueLivelinessDead) Spec() *issueSpec { } } -func (i *IssueLivelinessDead) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueLivelinessDead) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { return m.Liveliness != nil && *m.Liveliness == "Dead" } @@ -353,7 +357,7 @@ func (i *IssueLivelinessUnknown) Spec() *issueSpec { } } -func (i *IssueLivelinessUnknown) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueLivelinessUnknown) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { return m.Liveliness != nil && *m.Liveliness == "Unknown" } @@ -368,7 +372,7 @@ func (i *IssueLivelinessNotAvailable) Spec() *issueSpec { Description: "the machine liveliness is not available", } } -func (i *IssueLivelinessNotAvailable) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueLivelinessNotAvailable) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { if m.Liveliness == nil { return true } @@ -395,7 +399,7 @@ func (i *IssueFailedMachineReclaim) Spec() *issueSpec { } } -func (i *IssueFailedMachineReclaim) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueFailedMachineReclaim) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { return pointer.SafeDeref(pointer.SafeDeref(m.Events).FailedMachineReclaim) } @@ -412,7 +416,7 @@ func (i *IssueCrashLoop) Spec() *issueSpec { } } -func (i *IssueCrashLoop) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueCrashLoop) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { if pointer.SafeDeref(pointer.SafeDeref(m.Events).CrashLoop) { if m.Events != nil && len(m.Events.Log) > 0 && *m.Events.Log[0].Event == "Waiting" { // Machine which are waiting are not considered to have issues @@ -436,7 +440,7 @@ func (i *IssueLastEventError) Spec() *issueSpec { } } -func (i *IssueLastEventError) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueLastEventError) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { if i.lastEventThreshold == 0 { return false } @@ -464,7 +468,7 @@ func (i *IssueBMCWithoutMAC) Spec() *issueSpec { } } -func (i *IssueBMCWithoutMAC) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueBMCWithoutMAC) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { return m.Ipmi != nil && (m.Ipmi.Mac == nil || *m.Ipmi.Mac == "") } @@ -481,7 +485,7 @@ func (i *IssueBMCWithoutIP) Spec() *issueSpec { } } -func (i *IssueBMCWithoutIP) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueBMCWithoutIP) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { return m.Ipmi != nil && (m.Ipmi.Address == nil || *m.Ipmi.Address == "") } @@ -498,7 +502,7 @@ func (i *IssueASNUniqueness) Spec() *issueSpec { } } -func (i *IssueASNUniqueness) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueASNUniqueness) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { var ( machineASNs = map[int64][]*models.V1MachineIPMIResponse{} overlaps []string @@ -519,7 +523,7 @@ func (i *IssueASNUniqueness) Evaluate(m *models.V1MachineIPMIResponse, ms []*mod machineASNs[*n.Asn] = nil } - for _, machineFromAll := range ms { + for _, machineFromAll := range c.Machines { if pointer.SafeDeref(machineFromAll.ID) == pointer.SafeDeref(m.ID) { continue } @@ -591,7 +595,7 @@ func (i *IssueNonDistinctBMCIP) Spec() *issueSpec { } } -func (i *IssueNonDistinctBMCIP) Evaluate(m *models.V1MachineIPMIResponse, ms []*models.V1MachineIPMIResponse) bool { +func (i *IssueNonDistinctBMCIP) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { if m.Ipmi == nil || m.Ipmi.Address == nil { return false } @@ -601,7 +605,7 @@ func (i *IssueNonDistinctBMCIP) Evaluate(m *models.V1MachineIPMIResponse, ms []* overlaps []string ) - for _, machineFromAll := range ms { + for _, machineFromAll := range c.Machines { if pointer.SafeDeref(machineFromAll.ID) == pointer.SafeDeref(m.ID) { continue } From 42bc8b1ba7f80fe95f429b34cd6ec2ebd5d8180c Mon Sep 17 00:00:00 2001 From: Gerrit Date: Fri, 2 Sep 2022 09:55:24 +0200 Subject: [PATCH 6/9] Cleanup. --- pkg/api/issues.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/api/issues.go b/pkg/api/issues.go index 66345ba4..7b64fa91 100644 --- a/pkg/api/issues.go +++ b/pkg/api/issues.go @@ -75,10 +75,6 @@ type ( details string } - IssueNotEvaluableError struct { - err error - } - // MachineWithIssues summarizes a machine with issues MachineWithIssues struct { Machine *models.V1MachineIPMIResponse From 29f3a7f0983c48a37e8e3b74f1d4441b2a1df11f Mon Sep 17 00:00:00 2001 From: Gerrit Date: Fri, 2 Sep 2022 10:04:27 +0200 Subject: [PATCH 7/9] Improve comments. --- pkg/api/issues.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/api/issues.go b/pkg/api/issues.go index 7b64fa91..1e49e175 100644 --- a/pkg/api/issues.go +++ b/pkg/api/issues.go @@ -11,10 +11,10 @@ import ( ) const ( - // IssueSeverityMinor is an issue that needs is informational and should be checked from time to time. + // IssueSeverityMinor is an issue that should be checked from time to time but has no bad effects for the user. IssueSeverityMinor IssueSeverity = "minor" - // IssueSeverityMajor is an issue where resources are wasted and for that may be good workarounds. - // This should be resolved as soon as possible. + // IssueSeverityMajor is an issue where user experience is affected or provider resources are wasted. + // overall functionality is still maintained though. major issues should be resolved as soon as possible. IssueSeverityMajor IssueSeverity = "major" // IssueSeverityCritical is an issue that can lead to disfunction of the system and need to be handled as quickly as possible. IssueSeverityCritical IssueSeverity = "critical" From dd7794160f5e7161a836249de0ee462b0a7e6f1f Mon Sep 17 00:00:00 2001 From: Gerrit Date: Fri, 2 Sep 2022 10:06:56 +0200 Subject: [PATCH 8/9] Comment. --- pkg/api/issues.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/api/issues.go b/pkg/api/issues.go index 1e49e175..af052533 100644 --- a/pkg/api/issues.go +++ b/pkg/api/issues.go @@ -84,7 +84,7 @@ type ( MachineIssues []*MachineWithIssues issue interface { - // Evaluates whether a given machine has the this issue. + // Evaluate decides whether a given machine has the machine issue. // the second argument contains additional information that may be required for the issue evaluation Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool // Spec returns the issue spec of this issue. From d560872fd50f4e7c2fe6d062ffdc21e62b687f7a Mon Sep 17 00:00:00 2001 From: Gerrit Date: Fri, 2 Sep 2022 10:09:16 +0200 Subject: [PATCH 9/9] Small fixes. --- pkg/api/issues.go | 22 +++++++++------------- pkg/api/issues_test.go | 2 +- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/pkg/api/issues.go b/pkg/api/issues.go index af052533..652b5fd1 100644 --- a/pkg/api/issues.go +++ b/pkg/api/issues.go @@ -63,8 +63,7 @@ type ( IssueFailedMachineReclaim struct{} IssueCrashLoop struct{} IssueLastEventError struct { - lastEventThreshold time.Duration - details string + details string } IssueBMCWithoutMAC struct{} IssueBMCWithoutIP struct{} @@ -155,13 +154,10 @@ func (s IssueSeverity) LowerThan(o IssueSeverity) bool { } func AllIssues() Issues { - var ( - c = &IssueConfig{} - res Issues - ) + var res Issues for _, t := range AllIssueTypes() { - i, err := c.newIssueFromType(t) + i, err := newIssueFromType(t) if err != nil { continue } @@ -209,7 +205,7 @@ func FindIssues(c *IssueConfig) (MachineIssues, error) { for _, m := range c.Machines { m := m - i, err := c.newIssueFromType(t) + i, err := newIssueFromType(t) if err != nil { return nil, err } @@ -228,7 +224,7 @@ func FindIssues(c *IssueConfig) (MachineIssues, error) { } func (c *IssueConfig) includeIssue(t IssueType) bool { - issue, err := c.newIssueFromType(t) + issue, err := newIssueFromType(t) if err != nil { return false } @@ -255,7 +251,7 @@ func (c *IssueConfig) includeIssue(t IssueType) bool { return true } -func (c *IssueConfig) newIssueFromType(t IssueType) (issue, error) { +func newIssueFromType(t IssueType) (issue, error) { switch t { case IssueTypeNoPartition: return &IssueNoPartition{}, nil @@ -270,7 +266,7 @@ func (c *IssueConfig) newIssueFromType(t IssueType) (issue, error) { case IssueTypeCrashLoop: return &IssueCrashLoop{}, nil case IssueTypeLastEventError: - return &IssueLastEventError{lastEventThreshold: c.LastErrorThreshold}, nil + return &IssueLastEventError{}, nil case IssueTypeBMCWithoutMAC: return &IssueBMCWithoutMAC{}, nil case IssueTypeBMCWithoutIP: @@ -437,13 +433,13 @@ func (i *IssueLastEventError) Spec() *issueSpec { } func (i *IssueLastEventError) Evaluate(m *models.V1MachineIPMIResponse, c *IssueConfig) bool { - if i.lastEventThreshold == 0 { + if c.LastErrorThreshold == 0 { return false } if pointer.SafeDeref(m.Events).LastErrorEvent != nil { timeSince := time.Since(time.Time(m.Events.LastErrorEvent.Time)) - if timeSince < i.lastEventThreshold { + if timeSince < c.LastErrorThreshold { i.details = fmt.Sprintf("occurred %s ago", timeSince.String()) return true } diff --git a/pkg/api/issues_test.go b/pkg/api/issues_test.go index d22b6dee..a4f27f59 100644 --- a/pkg/api/issues_test.go +++ b/pkg/api/issues_test.go @@ -206,7 +206,7 @@ func TestFindIssues(t *testing.T) { { Machine: lastEventErrorMachine, Issues: Issues{ - toIssue(&IssueLastEventError{lastEventThreshold: 10 * time.Minute, details: "occurred 5m0s ago"}), + toIssue(&IssueLastEventError{details: "occurred 5m0s ago"}), }, }, },