Skip to content

Commit

Permalink
fix: slots being filled returned out of order on k8s [RM-42] (#9276)
Browse files Browse the repository at this point in the history
  • Loading branch information
NicholasBlaskey committed May 2, 2024
1 parent a9c8700 commit 21f76e9
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 14 deletions.
7 changes: 7 additions & 0 deletions docs/release-notes/k8s-slots-jumping.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
:orphan:

**Bug Fixes**

- Kubernetes: Fix an issue where the cluster page could display different slots filled on every
page refresh. Now the indication of slots filled should always be filled from the left to the
right.
19 changes: 16 additions & 3 deletions master/internal/authz/obfuscate.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
package authz

import (
"slices"

"github.com/google/uuid"
"github.com/pkg/errors"

"github.com/determined-ai/determined/master/pkg/model"
"github.com/determined-ai/determined/proto/pkg/agentv1"
"github.com/determined-ai/determined/proto/pkg/containerv1"
"github.com/determined-ai/determined/proto/pkg/devicev1"
Expand Down Expand Up @@ -78,13 +81,23 @@ func ObfuscateAgent(agent *agentv1.Agent) error {
}
}

obfuscatedSlots := make(map[string]*agentv1.Slot)
// Retain map lexicographically order so the webui doesn't hop around every refresh.
slotIDToObfuscated := make(map[string]*agentv1.Slot)
var slotIDs []string
for _, slot := range agent.Slots {
if err := ObfuscateSlot(slot); err != nil {
return errors.Errorf("unable to obfuscate agent: %s", err)
}
obfuscatedKey := uuid.New().String()
obfuscatedSlots[obfuscatedKey] = slot

slotIDToObfuscated[slot.Id] = slot
slotIDs = append(slotIDs, slot.Id)
}
slices.Sort(slotIDs)
obfuscatedSlots := make(map[string]*agentv1.Slot)
for i, slotID := range slotIDs {
s := slotIDToObfuscated[slotID]
s.Id = model.SortableSlotIndex(i)
obfuscatedSlots[s.Id] = s
}
agent.Slots = obfuscatedSlots

Expand Down
28 changes: 28 additions & 0 deletions master/internal/authz/obfuscate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (

"github.com/stretchr/testify/require"

"github.com/determined-ai/determined/proto/pkg/agentv1"
"github.com/determined-ai/determined/proto/pkg/containerv1"
"github.com/determined-ai/determined/proto/pkg/devicev1"
)
Expand Down Expand Up @@ -49,3 +50,30 @@ func TestObfuscateContainer(t *testing.T) {
assertContainerDeviceObfuscated(t, device)
}
}

func TestObfuscateAgentSlots(t *testing.T) {
// Loop test so we know that we aren't relying on any random chances.
for i := 0; i < 100; i++ {
agent := &agentv1.Agent{
Slots: map[string]*agentv1.Slot{
"005": {
Id: "005",
Container: &containerv1.Container{
Id: "contID",
Parent: "parentID",
State: containerv1.State_STATE_RUNNING,
},
Device: &devicev1.Device{},
},
"006": {
Id: "006",
Device: &devicev1.Device{},
},
},
}

require.NoError(t, ObfuscateAgent(agent))
require.NotNil(t, agent.Slots["000"].Container)
require.Nil(t, agent.Slots["001"].Container)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"log"
"os"
"slices"
"strconv"
"testing"
"time"
Expand Down Expand Up @@ -187,6 +188,21 @@ func TestGetAgent(t *testing.T) {

auxNode1, auxNode2, compNode1, compNode2 := setupNodes()

largeNode := &k8sV1.Node{
ObjectMeta: metaV1.ObjectMeta{
ResourceVersion: "1",
Name: compNode1Name,
},
Status: k8sV1.NodeStatus{
Allocatable: map[k8sV1.ResourceName]resource.Quantity{
k8sV1.ResourceName(ResourceTypeNvidia): *resource.NewQuantity(
16,
resource.DecimalSI,
),
},
},
}

agentTests := []AgentTestCase{
{
Name: "GetAgent-CPU-NoPodLabels-Aux1",
Expand Down Expand Up @@ -260,6 +276,14 @@ func TestGetAgent(t *testing.T) {
agentExists: false,
wantedAgentID: "",
},
{
Name: "GetAgent-CUDA-Large-Node",
podsService: createMockPodsService(map[string]*k8sV1.Node{
compNode1Name: largeNode,
}, slotTypeGPU, false),
wantedAgentID: compNode1Name,
agentExists: false,
},
}

for _, test := range agentTests {
Expand All @@ -270,6 +294,21 @@ func TestGetAgent(t *testing.T) {
return
}
require.Equal(t, test.wantedAgentID, agentResp.Agent.Id)

// Check all filled slots come before an empty slot.
var slotIDs []string
for _, s := range agentResp.Agent.Slots {
slotIDs = append(slotIDs, s.Id)
}
slices.Sort(slotIDs)
seenEmptySlot := false
for _, s := range slotIDs {
if agentResp.Agent.Slots[s].Container != nil {
require.False(t, seenEmptySlot, "all filled slots must come before an empty slot")
} else {
seenEmptySlot = true
}
}
})
}
}
Expand Down Expand Up @@ -421,6 +460,18 @@ func TestGetSlot(t *testing.T) {
agentID: compNode1Name,
wantedSlotNum: strconv.Itoa(4),
},
{
Name: "GetSlot-GPU-PodLabels-Comp1-Id4",
podsService: createMockPodsService(map[string]*k8sV1.Node{
compNode1Name: compNode1,
compNode2Name: compNode2,
},
slotTypeGPU,
true,
),
agentID: compNode1Name,
wantedSlotNum: "004",
},
{
Name: "GetSlot-GPU-PodLabels-Comp1-Id0",
podsService: createMockPodsService(map[string]*k8sV1.Node{
Expand Down Expand Up @@ -461,14 +512,18 @@ func TestGetSlot(t *testing.T) {

for _, test := range slotTests {
t.Run(test.Name, func(t *testing.T) {
wantedSlotInt, err := strconv.Atoi(test.wantedSlotNum)
require.NoError(t, err)

slotResp := test.podsService.handleGetSlotRequest(test.agentID, test.wantedSlotNum)
if slotResp == nil {
wantedSlotInt, err := strconv.Atoi(test.wantedSlotNum)
require.NoError(t, err)
require.True(t, wantedSlotInt < 0 || wantedSlotInt >= int(nodeNumSlots))
return
}
require.Equal(t, test.wantedSlotNum, slotResp.Slot.Id)

actualSlotID, err := strconv.Atoi(slotResp.Slot.Id)
require.NoError(t, err)
require.Equal(t, wantedSlotInt, actualSlotID)
})
}
}
Expand Down
22 changes: 14 additions & 8 deletions master/internal/rm/kubernetesrm/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -1293,8 +1293,14 @@ func (p *pods) handleGetSlotRequest(agentID string, slotID string) *apiv1.GetSlo
slots := agentResp.Agent.Slots
slot, ok := slots[slotID]
if !ok {
p.syslog.Warnf("no slot with id %s", slotID)
return nil
// Try converting an index input to a slot and see if that exists (1 to 001).
tryIndex, err := strconv.Atoi(slotID)
if s, ok := slots[model.SortableSlotIndex(tryIndex)]; err == nil && ok {
slot = s
} else {
p.syslog.Warnf("no slot with id %s", slotID)
return nil
}
}
return &apiv1.GetSlotResponse{Slot: slot}
}
Expand Down Expand Up @@ -1528,8 +1534,8 @@ func (p *pods) summarizeClusterByNodes() map[string]model.AgentSummary {
continue
}

slotsSummary[strconv.Itoa(curSlot)] = model.SlotSummary{
ID: strconv.Itoa(curSlot),
slotsSummary[model.SortableSlotIndex(curSlot)] = model.SlotSummary{
ID: model.SortableSlotIndex(curSlot),
Device: device.Device{Type: deviceType},
Draining: isDraining,
Enabled: !isDisabled,
Expand All @@ -1546,8 +1552,8 @@ func (p *pods) summarizeClusterByNodes() map[string]model.AgentSummary {
continue
}

slotsSummary[strconv.Itoa(curSlot)] = model.SlotSummary{
ID: strconv.Itoa(curSlot),
slotsSummary[model.SortableSlotIndex(curSlot)] = model.SlotSummary{
ID: model.SortableSlotIndex(curSlot),
Device: device.Device{Type: deviceType},
Draining: isDraining,
Enabled: !isDisabled,
Expand All @@ -1563,8 +1569,8 @@ func (p *pods) summarizeClusterByNodes() map[string]model.AgentSummary {
}

for i := curSlot; i < int(numSlots); i++ {
slotsSummary[strconv.Itoa(i)] = model.SlotSummary{
ID: strconv.Itoa(i),
slotsSummary[model.SortableSlotIndex(i)] = model.SlotSummary{
ID: model.SortableSlotIndex(i),
Device: device.Device{Type: deviceType},
Draining: isDraining,
Enabled: !isDisabled,
Expand Down
16 changes: 16 additions & 0 deletions master/pkg/model/agent.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package model

import (
"fmt"
"time"

"github.com/determined-ai/determined/master/pkg/cproto"
Expand Down Expand Up @@ -129,3 +130,18 @@ type AgentStats struct {
AgentID string `db:"agent_id"`
Slots int `db:"slots"`
}

// SortableSlotIndex returns a slot index that will sort as you want to.
//
// This is a hack to fix a bug seen by the webui.
// The webui displays a list of slots and if they are filled, so they expect that
// the order of what slots are filled in is consistent. In Kubernetes this is an illusion,
// we don't know what slot is running what job.
// Our API returns a map of slot IDs to slots that get returned. This map gets parsed and display
// in the frontend lexicographically. Just doing indexes breaks when there are more than 10 GPUs
// per agent since it will go 1,10,11 instead of 1,2,3,4.
//
// To fix this on our just pad the numbers with 0s so they sort in the response.
func SortableSlotIndex(i int) string {
return fmt.Sprintf("%03d", i)
}
20 changes: 20 additions & 0 deletions master/pkg/model/agent_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package model

import (
"slices"
"testing"

"github.com/stretchr/testify/require"
)

func TestSortableSlotIndex(t *testing.T) {
require.Equal(t, SortableSlotIndex(2), "002")
require.Equal(t, SortableSlotIndex(16), "016")

// Do we actually sort?
var gpuIndexes []string
for i := 0; i <= 999; i++ {
gpuIndexes = append(gpuIndexes, SortableSlotIndex(i))
}
require.True(t, slices.IsSorted(gpuIndexes))
}

0 comments on commit 21f76e9

Please sign in to comment.