Skip to content

Commit

Permalink
TaskHandler: add ValidateFirmware action
Browse files Browse the repository at this point in the history
It's not entirely ready for primetime (primarily because bmclib's POST code
retrieval isn't implemented across many of its providers at the moment) and is
potentially vulnerable to race conditions between this code and activity on the
device, but the basic logic is there.
  • Loading branch information
zevweiss committed Aug 14, 2024
1 parent f5d6280 commit f6eb561
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 0 deletions.
10 changes: 10 additions & 0 deletions internal/device/bmc.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"time"

"github.com/bmc-toolbox/bmclib/v2"
"github.com/bmc-toolbox/bmclib/v2/constants"
"github.com/bmc-toolbox/bmclib/v2/providers"
logrusrv2 "github.com/bombsimon/logrusr/v2"
"github.com/metal-toolbox/flipflop/internal/model"
Expand Down Expand Up @@ -110,6 +111,15 @@ func (b *Bmc) PowerCycleBMC(ctx context.Context) error {
return err
}

// HostBooted reports whether the POST status returned by the BMC client equals
// constants.POSTStateOS. Any error from the PostCode query is returned
// unchanged together with false.
func (b *Bmc) HostBooted(ctx context.Context) (bool, error) {
	defer b.tracelog()

	// Only the POST status is needed; the second PostCode result is discarded.
	postState, _, queryErr := b.client.PostCode(ctx)
	if queryErr != nil {
		return false, queryErr
	}

	inOS := postState == constants.POSTStateOS
	return inOS, nil
}

func (b *Bmc) tracelog() {
pc, _, _, _ := runtime.Caller(1)
funcName := path.Base(runtime.FuncForPC(pc).Name())
Expand Down
5 changes: 5 additions & 0 deletions internal/device/dryRun.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,8 @@ func (b *DryRunBMC) SetBootDevice(_ context.Context, _ string, _, _ bool) error
// PowerCycleBMC is the dry-run stand-in for a BMC power cycle: it performs no
// action and always returns a nil error.
func (b *DryRunBMC) PowerCycleBMC(_ context.Context) error {
	return nil
}

// HostBooted is the dry-run stand-in for the host boot check: it ignores its
// context, queries nothing, and always reports the host as booted with a nil
// error.
func (b *DryRunBMC) HostBooted(_ context.Context) (bool, error) {
	const booted = true
	return booted, nil
}
1 change: 1 addition & 0 deletions internal/device/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ type Queryor interface {
SetPowerState(ctx context.Context, state string) error
SetBootDevice(ctx context.Context, device string, persistent, efiBoot bool) error
PowerCycleBMC(ctx context.Context) error
HostBooted(ctx context.Context) (bool, error)
}
51 changes: 51 additions & 0 deletions internal/worker/task_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ func (t *TaskHandler) Run(ctx context.Context) error {
return t.setPowerState(ctx, t.bmc)
case rctypes.SetNextBootDevice:
return t.setNextBootDevice(ctx, t.bmc)
case rctypes.ValidateFirmware:

Check failure on line 82 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

undefined: rctypes.ValidateFirmware

Check failure on line 82 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

undefined: rctypes.ValidateFirmware

Check failure on line 82 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

undefined: rctypes.ValidateFirmware

Check failure on line 82 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

undefined: rctypes.ValidateFirmware
return t.validateFirmware(ctx, t.bmc)
default:
return t.failedWithError(ctx, string(t.task.Parameters.Action), errUnsupportedAction)
}
Expand Down Expand Up @@ -152,6 +154,55 @@ func (t *TaskHandler) setNextBootDevice(ctx context.Context, bmc device.Queryor)
return t.successful(ctx, "next boot device set successfully: "+t.task.Parameters.ActionParameter)
}

func (t *TaskHandler) validateFirmware(ctx context.Context, bmc device.Queryor) error {
t.loggerEntry.Info("starting firmware validation")

deadline := time.Now().Add(t.task.Parameters.ValidateFirmwareTimeout)

Check failure on line 160 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

t.task.Parameters.ValidateFirmwareTimeout undefined (type *condition.ServerControlTaskParameters has no field or method ValidateFirmwareTimeout) (typecheck)

Check failure on line 160 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

t.task.Parameters.ValidateFirmwareTimeout undefined (type *condition.ServerControlTaskParameters has no field or method ValidateFirmwareTimeout)) (typecheck)

Check failure on line 160 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

t.task.Parameters.ValidateFirmwareTimeout undefined (type *condition.ServerControlTaskParameters has no field or method ValidateFirmwareTimeout) (typecheck)

Check failure on line 160 in internal/worker/task_handler.go

View workflow job for this annotation

GitHub Actions / lint-test

t.task.Parameters.ValidateFirmwareTimeout undefined (type *condition.ServerControlTaskParameters has no field or method ValidateFirmwareTimeout)) (typecheck)

// First reboot the BMC to ensure it's running the desired firmware
if err := bmc.PowerCycleBMC(ctx); err != nil {
return t.failedWithError(ctx, "failed to power cycle BMC", err)
}

var err error

// Next we want to cycle the host, but the BMC will take some
// time to reboot, so retry once every 30 seconds up to our
// timeout deadline (ideally we'd have a way to distinguish
// failures that are due to the BMC not being back online yet
// from ones that aren't going to be resolved by waiting and
// retrying...)
for time.Now().Before(deadline) {
time.Sleep(30 * time.Second)
err = bmc.SetPowerState(ctx, "cycle")
if err == nil {
break
}
}

if err != nil {
return t.failedWithError(ctx, "failed to cycle host power after BMC power cycle", err)
}

// Finally, wait for the host to boot successfully
for time.Now().Before(deadline) {
// sleep before checking to (hopefully) avoid seeing a
// stale POST code from a previous boot before the
// power-cycle has actually started happening
time.Sleep(30 * time.Second)
booted, err := bmc.HostBooted(ctx)
if err != nil {
return t.failedWithError(ctx, "failed to retrieve host boot status", err)
}
if booted {
// TODO: record successful result in fleetdb before returning
return nil
}
}

return t.failed(ctx, "host failed to boot successfully before deadline")
}

func (t *TaskHandler) publish(ctx context.Context, status string, state rctypes.State) error {
t.task.State = state
t.task.Status.Append(status)
Expand Down

0 comments on commit f6eb561

Please sign in to comment.