From 24e202ef73222a86c20b3c3d00e06e7c8af51048 Mon Sep 17 00:00:00 2001 From: Adrian Chiris Date: Tue, 7 Apr 2020 14:54:03 +0300 Subject: [PATCH] RDMA device rename - main flow This commit implements the main flow for RDMA device rename when moving the device to/from container On CmdAdd: rename device to the next available rdma device name according to the naming scheme defined in previous commit. On CmdDel: Restore the name to its original name as was saved in cache --- cmd/rdma/main.go | 138 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 116 insertions(+), 22 deletions(-) diff --git a/cmd/rdma/main.go b/cmd/rdma/main.go index e0fcd30..d790192 100644 --- a/cmd/rdma/main.go +++ b/cmd/rdma/main.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "os" + "strings" "github.com/Mellanox/rdma-cni/pkg/cache" "github.com/Mellanox/rdma-cni/pkg/rdma" @@ -103,9 +104,11 @@ func (plugin *rdmaCniPlugin) parseConf(data []byte, envArgs string) (*rdmatypes. return &conf, nil } -// Move RDMA device to namespace -func (plugin *rdmaCniPlugin) moveRdmaDevToNs(rdmaDev string, nsPath string) error { - log.Debug().Msgf("moving RDMA device %s to namespace %s", rdmaDev, nsPath) +// Move RDMA device, sRdmaDev, to namespace and rename RDMA device to cRdmadev +func (plugin *rdmaCniPlugin) moveRdmaDevToNs(sRdmaDev string, cRdmaDev string, nsPath string) error { + log.Debug().Msgf("Moving RDMA device %s to namespace %s", sRdmaDev, nsPath) + var err error + renameReq := sRdmaDev != cRdmaDev targetNs, err := plugin.nsManager.GetNS(nsPath) if err != nil { @@ -113,16 +116,52 @@ func (plugin *rdmaCniPlugin) moveRdmaDevToNs(rdmaDev string, nsPath string) erro } defer targetNs.Close() - err = plugin.rdmaManager.MoveRdmaDevToNs(rdmaDev, targetNs) + tmpName := sRdmaDev + if renameReq { + // set temp name for RDMA dev + tmpName = utils.GetRandomRdmaDevName() + err = plugin.rdmaManager.SetRdmaDevName(sRdmaDev, tmpName) + if err != nil { + return err + } + defer func() { + if err != nil { 
log.Warn().Msgf("Error occured while moving RDMA device to namespace. %v", err) + restoreErr := plugin.rdmaManager.SetRdmaDevName(tmpName, sRdmaDev) + if restoreErr != nil { + log.Warn().Msgf("Failed to restore RDMA device name. %v", restoreErr) + } + } + }() + } + + err = plugin.rdmaManager.MoveRdmaDevToNs(tmpName, targetNs) if err != nil { - return fmt.Errorf("failed to move RDMA device %s to namespace. %v", rdmaDev, err) + return fmt.Errorf("failed to move RDMA device %s to namespace. %v", tmpName, err) } - return nil + + if renameReq { + // rename RDMA dev in container NS to target name + err = targetNs.Do(func(hostNs ns.NetNS) error { + renameErr := plugin.rdmaManager.SetRdmaDevName(tmpName, cRdmaDev) + if renameErr != nil { + // move RDMA device back to host namespace. + restoreErr := plugin.rdmaManager.MoveRdmaDevToNs(tmpName, hostNs) + if restoreErr != nil { + log.Warn().Msgf("Failed to move RDMA device to default namespace after error. %v", restoreErr) + } + } + return renameErr + }) + } + return err } -// Move RDMA device from namespace to current (default) namespace -func (plugin *rdmaCniPlugin) moveRdmaDevFromNs(rdmaDev string, nsPath string) error { - log.Debug().Msgf("INFO: moving RDMA device %s from namespace %s to default namespace", rdmaDev, nsPath) +// Move RDMA device, cRdmaDev, from namespace to current (default) namespace and rename it to sRdmaDev +func (plugin *rdmaCniPlugin) moveRdmaDevFromNs(cRdmaDev string, sRdmaDev string, nsPath string) error { + log.Debug().Msgf("Moving RDMA device %s from namespace %s to default namespace", cRdmaDev, nsPath) + var err error + renameReq := cRdmaDev != sRdmaDev sourceNs, err := plugin.nsManager.GetNS(nsPath) if err != nil { @@ -136,16 +175,69 @@ func (plugin *rdmaCniPlugin) moveRdmaDevFromNs(rdmaDev string, nsPath string) er } defer targetNs.Close() + tmpName := cRdmaDev + if renameReq { + tmpName = utils.GetRandomRdmaDevName() + } err = sourceNs.Do(func(_ ns.NetNS) error { - // Move RDMA device to 
default namespace - return plugin.rdmaManager.MoveRdmaDevToNs(rdmaDev, targetNs) + if renameReq { + // Move RDMA device to default namespace + var sourceNsError error + sourceNsError = plugin.rdmaManager.SetRdmaDevName(cRdmaDev, tmpName) + if sourceNsError != nil { + log.Warn().Msgf("Failed to restore RDMA device name to its original value. %v", sourceNsError) + return plugin.rdmaManager.MoveRdmaDevToNs(cRdmaDev, targetNs) + } + } + return plugin.rdmaManager.MoveRdmaDevToNs(tmpName, targetNs) }) if err != nil { - return fmt.Errorf("failed to move RDMA device %s to default namespace. %v", rdmaDev, err) + return fmt.Errorf("failed to move RDMA device %s to default namespace. %v", cRdmaDev, err) + } + if renameReq { + // set target name + err = targetNs.Do(func(_ ns.NetNS) error { + return plugin.rdmaManager.SetRdmaDevName(tmpName, sRdmaDev) + }) } return err } +func (plugin *rdmaCniPlugin) getContainerRdmaDeviceName(sRdmaDev string, nsPath string) string { + var err error + var sourceNs ns.NetNS + sourceNs, err = plugin.nsManager.GetNS(nsPath) + defer sourceNs.Close() + defer func() { + if err != nil { + log.Warn().Msgf("Failed to generate container RDMA device name, %s. 
Original name will be used.", err) + } + }() + + var cRdmaDevs []string + err = sourceNs.Do(func(_ ns.NetNS) error { + var cErr error + cRdmaDevs, cErr = plugin.rdmaManager.GetRdmaDeviceList() + return cErr + }) + if err != nil { + return sRdmaDev + } + + // Expected format is _ + s := strings.Split(sRdmaDev, "_") + if len(s) != 2 { + err = fmt.Errorf("unexpected RDMA device name %s", sRdmaDev) + return sRdmaDev + } + + cNextRdmaDev, err := utils.GetNextRdmaDeviceName(s[0], cRdmaDevs) + if err != nil { + return sRdmaDev + } + return cNextRdmaDev +} + func (plugin *rdmaCniPlugin) CmdAdd(args *skel.CmdArgs) error { log.Info().Msgf("RDMA-CNI: cmdAdd") conf, err := plugin.parseConf(args.StdinData, args.Args) @@ -197,25 +289,26 @@ func (plugin *rdmaCniPlugin) CmdAdd(args *skel.CmdArgs) error { } // Move RDMA device to container namespace - rdmaDev := rdmaDevs[0] + sRdmaDev := rdmaDevs[0] + cRdmaDev := plugin.getContainerRdmaDeviceName(sRdmaDev, args.Netns) + log.Debug().Msgf("Sandbox RDMA device: %s, Container RDMA device: %s", sRdmaDev, cRdmaDev) - err = plugin.moveRdmaDevToNs(rdmaDev, args.Netns) + err = plugin.moveRdmaDevToNs(sRdmaDev, cRdmaDev, args.Netns) if err != nil { - return fmt.Errorf("failed to move RDMA device %s to namespace. %v", rdmaDev, err) + return fmt.Errorf("failed to move RDMA device %s to namespace. 
%v", sRdmaDev, err) } - // Save RDMA state state := rdmatypes.NewRdmaNetState() state.DeviceID = conf.DeviceID - state.SandboxRdmaDevName = rdmaDev - state.ContainerRdmaDevName = rdmaDev + state.SandboxRdmaDevName = sRdmaDev + state.ContainerRdmaDevName = cRdmaDev pRef := plugin.stateCache.GetStateRef(conf.Name, args.ContainerID, args.IfName) err = plugin.stateCache.Save(pRef, &state) if err != nil { // Move RDMA dev back to current namespace - restoreErr := plugin.moveRdmaDevFromNs(state.ContainerRdmaDevName, args.Netns) + restoreErr := plugin.moveRdmaDevFromNs(state.ContainerRdmaDevName, state.SandboxRdmaDevName, args.Netns) if restoreErr != nil { - return fmt.Errorf("save to cache failed %v, failed while restoring namespace for RDMA device %s. %v", err, rdmaDev, restoreErr) + return fmt.Errorf("save to cache failed %v, failed while restoring namespace for RDMA device %s. %v", err, sRdmaDev, restoreErr) } return err } @@ -249,11 +342,12 @@ func (plugin *rdmaCniPlugin) CmdDel(args *skel.CmdArgs) error { pRef := plugin.stateCache.GetStateRef(conf.Name, args.ContainerID, args.IfName) err = plugin.stateCache.Load(pRef, &rdmaState) if err != nil { - return err + log.Warn().Msgf("Failed to load state from cache, %v. preceding cmdAdd operation may have failed early.", err) + return nil } // Move RDMA device to default namespace - err = plugin.moveRdmaDevFromNs(rdmaState.ContainerRdmaDevName, args.Netns) + err = plugin.moveRdmaDevFromNs(rdmaState.ContainerRdmaDevName, rdmaState.SandboxRdmaDevName, args.Netns) if err != nil { return fmt.Errorf( "failed to restore RDMA device %s to default namespace. %v", rdmaState.ContainerRdmaDevName, err)