
Commit 1fa583e

Fix RemovalSimulation for parallel scale down
1 parent f44fb9f · commit 1fa583e

5 files changed: +40 -17 lines


cluster-autoscaler/core/scaledown/planner/planner.go

Lines changed: 15 additions & 15 deletions
@@ -239,7 +239,6 @@ func (p *Planner) injectPods(pods []*apiv1.Pod) error {
 
 // categorizeNodes determines, for each node, whether it can be eventually
 // removed or if there are reasons preventing that.
-// TODO: Track remaining PDB budget.
 func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCandidates []*apiv1.Node) {
     unremovableTimeout := p.latestUpdate.Add(p.context.AutoscalingOptions.UnremovableNodeRecheckTimeout)
     unremovableCount := 0
@@ -251,26 +250,27 @@ func (p *Planner) categorizeNodes(podDestinations map[string]bool, scaleDownCand
     }
     p.nodeUtilizationMap = utilizationMap
     timer := time.NewTimer(p.context.ScaleDownSimulationTimeout)
+RemovalSimulation:
     for i, node := range currentlyUnneededNodeNames {
         select {
         case <-timer.C:
             klog.Warningf("%d out of %d nodes skipped in scale down simulation due to timeout.", len(currentlyUnneededNodeNames)-i, len(currentlyUnneededNodeNames))
-            break
+            break RemovalSimulation
         default:
-        }
-        removable, unremovable := p.rs.SimulateNodeRemoval(node, podDestinations, p.latestUpdate, p.context.RemainingPdbTracker.GetPdbs())
-        if removable != nil {
-            _, inParallel, _ := p.context.RemainingPdbTracker.CanRemovePods(removable.PodsToReschedule)
-            if !inParallel {
-                removable.IsRisky = true
+            removable, unremovable := p.rs.SimulateNodeRemoval(node, podDestinations, p.latestUpdate, p.context.RemainingPdbTracker.GetPdbs())
+            if removable != nil {
+                _, inParallel, _ := p.context.RemainingPdbTracker.CanRemovePods(removable.PodsToReschedule)
+                if !inParallel {
+                    removable.IsRisky = true
+                }
+                delete(podDestinations, removable.Node.Name)
+                p.context.RemainingPdbTracker.RemovePods(removable.PodsToReschedule)
+                removableList = append(removableList, *removable)
+            }
+            if unremovable != nil {
+                unremovableCount += 1
+                p.unremovableNodes.AddTimeout(unremovable, unremovableTimeout)
             }
-            delete(podDestinations, removable.Node.Name)
-            p.context.RemainingPdbTracker.RemovePods(removable.PodsToReschedule)
-            removableList = append(removableList, *removable)
-        }
-        if unremovable != nil {
-            unremovableCount += 1
-            p.unremovableNodes.AddTimeout(unremovable, unremovableTimeout)
         }
     }
     p.unneededNodes.Update(removableList, p.latestUpdate)
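
The heart of this fix is a Go language detail: inside a select statement, a bare break only leaves the select, not the enclosing for loop, so the ScaleDownSimulationTimeout case above never actually stopped the removal simulation. The commit labels the loop RemovalSimulation, breaks to that label on timeout, and moves the per-node simulation into the default case so it is skipped once the timer fires. A minimal standalone sketch of the difference (not taken from the repository; the durations, loop bounds, and output are made up for illustration):

package main

import (
    "fmt"
    "time"
)

func main() {
    // Buggy pattern: the bare break exits only the select, so the loop
    // still runs to completion after the timer has fired.
    timer := time.NewTimer(10 * time.Millisecond)
    for i := 0; i < 5; i++ {
        select {
        case <-timer.C:
            fmt.Println("timeout hit, but the loop keeps going at i =", i)
            break
        default:
        }
        time.Sleep(5 * time.Millisecond)
    }

    // Fixed pattern, mirroring the planner.go change: the labeled break
    // terminates the loop itself as soon as the timer fires.
    timer = time.NewTimer(10 * time.Millisecond)
RemovalSimulation:
    for i := 0; i < 5; i++ {
        select {
        case <-timer.C:
            fmt.Println("timeout hit, stopping the loop at i =", i)
            break RemovalSimulation
        default:
        }
        time.Sleep(5 * time.Millisecond)
    }
}

Moving the simulation body under default: also means that, on the iteration where the timer fires, no further node is simulated at all, whereas the old code kept simulating every remaining node after logging the timeout.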

cluster-autoscaler/core/scaledown/planner/planner_test.go

Lines changed: 2 additions & 1 deletion
@@ -451,7 +451,8 @@ func TestUpdateClusterState(t *testing.T) {
             assert.NoError(t, p.UpdateClusterState(tc.nodes, tc.nodes, tc.actuationStatus, time.Now()))
             wantUnneeded := asMap(tc.wantUnneeded)
             for _, n := range tc.nodes {
-                assert.Equal(t, wantUnneeded[n.Name], p.unneededNodes.Contains(n.Name), n.Name)
+                assert.Equal(t, wantUnneeded[n.Name], p.unneededNodes.Contains(n.Name), []string{n.Name, "unneeded"})
+                assert.Equal(t, !wantUnneeded[n.Name], p.unremovableNodes.Contains(n.Name), []string{n.Name, "unremovable"})
             }
         })
     }

cluster-autoscaler/core/scaledown/unremovable/nodes.go

Lines changed: 6 additions & 0 deletions
@@ -67,6 +67,12 @@ func (n *Nodes) Update(nodeInfos NodeInfoGetter, timestamp time.Time) {
     n.ttls = newTTLs
 }
 
+// Contains returns true iff a given node is unremovable.
+func (n *Nodes) Contains(nodeName string) bool {
+    _, found := n.reasons[nodeName]
+    return found
+}
+
 // Add adds an unremovable node.
 func (n *Nodes) Add(node *simulator.UnremovableNode) {
     n.reasons[node.Node.Name] = node
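
A short usage sketch of the new Contains accessor (not part of the commit; it reuses the NewNodes, AddReason, and BuildTestNode helpers visible in the diffs on this page and assumes the usual cluster-autoscaler import paths; the test name is hypothetical):

package unremovable

import (
    "testing"

    "k8s.io/autoscaler/cluster-autoscaler/simulator"
    . "k8s.io/autoscaler/cluster-autoscaler/utils/test"
)

func TestContainsSketch(t *testing.T) {
    n := NewNodes()
    // Record n1 as unremovable with an arbitrary reason code.
    n.AddReason(BuildTestNode("n1", 1000, 1000), simulator.UnremovableReason(1))

    if !n.Contains("n1") {
        t.Errorf("n.Contains(n1) = false, want true")
    }
    // Nodes that were never recorded are not reported as unremovable.
    if n.Contains("n2") {
        t.Errorf("n.Contains(n2) = true, want false")
    }
}

The planner_test.go change above relies on exactly this lookup, asserting that after UpdateClusterState every node is tracked as either unneeded or unremovable.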

cluster-autoscaler/core/scaledown/unremovable/nodes_test.go

Lines changed: 16 additions & 0 deletions
@@ -25,6 +25,7 @@ import (
 
     apiv1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    . "k8s.io/autoscaler/cluster-autoscaler/utils/test"
     schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
 )
 
@@ -80,6 +81,21 @@ func TestUpdate(t *testing.T) {
     }
 }
 
+func TestContains(t *testing.T) {
+    n := NewNodes()
+    nodes := []string{"n1", "n2", "n3"}
+
+    n.Add(makeUnremovableNode(nodes[0]))
+    n.AddTimeout(makeUnremovableNode(nodes[1]), time.Now())
+    n.AddReason(BuildTestNode(nodes[2], 0, 0), simulator.UnremovableReason(1))
+
+    for _, node := range nodes {
+        if !n.Contains(node) {
+            t.Errorf("n.Contains(%s) return false, want true", node)
+        }
+    }
+}
+
 type fakeNodeInfoGetter struct {
     names map[string]bool
 }

cluster-autoscaler/main.go

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ var (
     skipNodesWithLocalStorage = flag.Bool("skip-nodes-with-local-storage", true, "If true cluster autoscaler will never delete nodes with pods with local storage, e.g. EmptyDir or HostPath")
     minReplicaCount = flag.Int("min-replica-count", 0, "Minimum number or replicas that a replica set or replication controller should have to allow their pods deletion in scale down")
     nodeDeleteDelayAfterTaint = flag.Duration("node-delete-delay-after-taint", 5*time.Second, "How long to wait before deleting a node after tainting it")
-    scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 5*time.Minute, "How long should we run scale down simulation.")
+    scaleDownSimulationTimeout = flag.Duration("scale-down-simulation-timeout", 30*time.Second, "How long should we run scale down simulation.")
     parallelDrain = flag.Bool("parallel-drain", false, "Whether to allow parallel drain of nodes.")
     maxCapacityMemoryDifferenceRatio = flag.Float64("memory-difference-ratio", config.DefaultMaxCapacityMemoryDifferenceRatio, "Maximum difference in memory capacity between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's memory capacity.")
    maxFreeDifferenceRatio = flag.Float64("max-free-difference-ratio", config.DefaultMaxFreeDifferenceRatio, "Maximum difference in free resources between two similar node groups to be considered for balancing. Value is a ratio of the smaller node group's free resource.")
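
The last change lowers the default --scale-down-simulation-timeout from 5 minutes to 30 seconds. Operators who need a longer simulation budget can still set the flag explicitly; a hypothetical invocation (binary name and flag values chosen only for illustration):

./cluster-autoscaler --parallel-drain=true --scale-down-simulation-timeout=5m

Both flags come from the var block above; flag.Duration accepts standard Go duration strings such as 30s, 5m, or 1h.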
