Skip to content

Commit b9bbed2

Browse files
authored
Merge pull request #5502 from yaroslava-serdiuk/min-size-fix
Check min size of node group and resource limits for set of nodes
2 parents 1238e1d + 60a264b commit b9bbed2

File tree

2 files changed

+171
-19
lines changed

2 files changed

+171
-19
lines changed

cluster-autoscaler/core/scaledown/unneeded/nodes.go

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -119,19 +119,24 @@ func (n *Nodes) Drop(node string) {
119119
// unneeded, but are not removable, annotated by reason.
120120
func (n *Nodes) RemovableAt(context *context.AutoscalingContext, ts time.Time, resourcesLeft resource.Limits, resourcesWithLimits []string, as scaledown.ActuationStatus) (empty, needDrain []simulator.NodeToBeRemoved, unremovable []*simulator.UnremovableNode) {
121121
nodeGroupSize := utils.GetNodeGroupSizeMap(context.CloudProvider)
122-
for nodeName, v := range n.byName {
123-
klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
122+
resourcesLeftCopy := resourcesLeft.DeepCopy()
123+
emptyNodes, drainNodes := n.splitEmptyAndNonEmptyNodes()
124124

125-
if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeft, resourcesWithLimits, as); r != simulator.NoReason {
125+
for nodeName, v := range emptyNodes {
126+
klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
127+
if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeftCopy, resourcesWithLimits, as); r != simulator.NoReason {
126128
unremovable = append(unremovable, &simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
127129
continue
128130
}
129-
130-
if len(v.ntbr.PodsToReschedule) > 0 {
131-
needDrain = append(needDrain, v.ntbr)
132-
} else {
133-
empty = append(empty, v.ntbr)
131+
empty = append(empty, v.ntbr)
132+
}
133+
for nodeName, v := range drainNodes {
134+
klog.V(2).Infof("%s was unneeded for %s", nodeName, ts.Sub(v.since).String())
135+
if r := n.unremovableReason(context, v, ts, nodeGroupSize, resourcesLeftCopy, resourcesWithLimits, as); r != simulator.NoReason {
136+
unremovable = append(unremovable, &simulator.UnremovableNode{Node: v.ntbr.Node, Reason: r})
137+
continue
134138
}
139+
needDrain = append(needDrain, v.ntbr)
135140
}
136141
return
137142
}
@@ -177,25 +182,18 @@ func (n *Nodes) unremovableReason(context *context.AutoscalingContext, v *node,
177182
}
178183
}
179184

180-
size, found := nodeGroupSize[nodeGroup.Id()]
181-
if !found {
182-
klog.Errorf("Error while checking node group size %s: group size not found in cache", nodeGroup.Id())
183-
return simulator.UnexpectedError
184-
}
185-
186-
deletionsInProgress := as.DeletionsCount(nodeGroup.Id())
187-
if size-deletionsInProgress <= nodeGroup.MinSize() {
188-
klog.V(1).Infof("Skipping %s - node group min size reached", node.Name)
189-
return simulator.NodeGroupMinSizeReached
185+
if reason := verifyMinSize(node.Name, nodeGroup, nodeGroupSize, as); reason != simulator.NoReason {
186+
return reason
190187
}
188+
nodeGroupSize[nodeGroup.Id()]--
191189

192190
resourceDelta, err := n.limitsFinder.DeltaForNode(context, node, nodeGroup, resourcesWithLimits)
193191
if err != nil {
194192
klog.Errorf("Error getting node resources: %v", err)
195193
return simulator.UnexpectedError
196194
}
197195

198-
checkResult := resourcesLeft.CheckDeltaWithinLimits(resourceDelta)
196+
checkResult := resourcesLeft.TryDecrementBy(resourceDelta)
199197
if checkResult.Exceeded() {
200198
klog.V(4).Infof("Skipping %s - minimal limit exceeded for %v", node.Name, checkResult.ExceededResources)
201199
for _, resource := range checkResult.ExceededResources {
@@ -213,3 +211,30 @@ func (n *Nodes) unremovableReason(context *context.AutoscalingContext, v *node,
213211

214212
return simulator.NoReason
215213
}
214+
215+
func (n *Nodes) splitEmptyAndNonEmptyNodes() (empty, needDrain map[string]*node) {
216+
empty = make(map[string]*node)
217+
needDrain = make(map[string]*node)
218+
for name, v := range n.byName {
219+
if len(v.ntbr.PodsToReschedule) > 0 {
220+
needDrain[name] = v
221+
} else {
222+
empty[name] = v
223+
}
224+
}
225+
return
226+
}
227+
228+
func verifyMinSize(nodeName string, nodeGroup cloudprovider.NodeGroup, nodeGroupSize map[string]int, as scaledown.ActuationStatus) simulator.UnremovableReason {
229+
size, found := nodeGroupSize[nodeGroup.Id()]
230+
if !found {
231+
klog.Errorf("Error while checking node group size %s: group size not found in cache", nodeGroup.Id())
232+
return simulator.UnexpectedError
233+
}
234+
deletionsInProgress := as.DeletionsCount(nodeGroup.Id())
235+
if size-deletionsInProgress <= nodeGroup.MinSize() {
236+
klog.V(1).Infof("Skipping %s - node group min size reached", nodeName)
237+
return simulator.NodeGroupMinSizeReached
238+
}
239+
return simulator.NoReason
240+
}

cluster-autoscaler/core/scaledown/unneeded/nodes_test.go

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,22 @@ limitations under the License.
1717
package unneeded
1818

1919
import (
20+
"fmt"
2021
"testing"
2122
"time"
2223

24+
apiv1 "k8s.io/api/core/v1"
25+
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
26+
testprovider "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/test"
27+
"k8s.io/autoscaler/cluster-autoscaler/config"
28+
"k8s.io/autoscaler/cluster-autoscaler/context"
29+
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/resource"
30+
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
31+
. "k8s.io/autoscaler/cluster-autoscaler/core/test"
2332
"k8s.io/autoscaler/cluster-autoscaler/simulator"
33+
kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
2434
. "k8s.io/autoscaler/cluster-autoscaler/utils/test"
35+
"k8s.io/client-go/kubernetes/fake"
2536

2637
"github.com/stretchr/testify/assert"
2738
)
@@ -115,3 +126,119 @@ func makeNode(name, version string) simulator.NodeToBeRemoved {
115126
func version(n simulator.NodeToBeRemoved) string {
116127
return n.Node.Annotations[testVersion]
117128
}
129+
130+
func TestRemovableAt(t *testing.T) {
131+
132+
testCases := []struct {
133+
name string
134+
numEmpty int
135+
numDrain int
136+
minSize int
137+
targetSize int
138+
numOngoingDeletions int
139+
numEmptyToRemove int
140+
numDrainToRemove int
141+
}{
142+
{
143+
name: "Node group min size is not reached",
144+
numEmpty: 3,
145+
numDrain: 2,
146+
minSize: 1,
147+
targetSize: 10,
148+
numOngoingDeletions: 2,
149+
numEmptyToRemove: 3,
150+
numDrainToRemove: 2,
151+
},
152+
{
153+
name: "Node group min size is reached for drain nodes",
154+
numEmpty: 3,
155+
numDrain: 5,
156+
minSize: 1,
157+
targetSize: 10,
158+
numOngoingDeletions: 2,
159+
numEmptyToRemove: 3,
160+
numDrainToRemove: 4,
161+
},
162+
{
163+
name: "Node group min size is reached for empty and drain nodes",
164+
numEmpty: 3,
165+
numDrain: 5,
166+
minSize: 1,
167+
targetSize: 5,
168+
numOngoingDeletions: 2,
169+
numEmptyToRemove: 2,
170+
numDrainToRemove: 0,
171+
},
172+
}
173+
for _, tc := range testCases {
174+
t.Run(tc.name, func(t *testing.T) {
175+
ng := testprovider.NewTestNodeGroup("ng", 100, tc.minSize, tc.targetSize, true, false, "", nil, nil)
176+
empty := []simulator.NodeToBeRemoved{}
177+
for i := 0; i < tc.numEmpty; i++ {
178+
empty = append(empty, simulator.NodeToBeRemoved{
179+
Node: BuildTestNode(fmt.Sprintf("empty-%d", i), 10, 100),
180+
})
181+
}
182+
drain := []simulator.NodeToBeRemoved{}
183+
for i := 0; i < tc.numDrain; i++ {
184+
drain = append(drain, simulator.NodeToBeRemoved{
185+
Node: BuildTestNode(fmt.Sprintf("drain-%d", i), 10, 100),
186+
PodsToReschedule: []*apiv1.Pod{BuildTestPod(fmt.Sprintf("pod-%d", i), 1, 1)},
187+
})
188+
}
189+
190+
nodes := append(empty, drain...)
191+
provider := testprovider.NewTestCloudProvider(nil, nil)
192+
provider.InsertNodeGroup(ng)
193+
for _, node := range nodes {
194+
provider.AddNode("ng", node.Node)
195+
}
196+
197+
as := &fakeActuationStatus{deletionCount: map[string]int{"ng": tc.numOngoingDeletions}}
198+
199+
rsLister, err := kube_util.NewTestReplicaSetLister(nil)
200+
assert.NoError(t, err)
201+
registry := kube_util.NewListerRegistry(nil, nil, nil, nil, nil, nil, nil, nil, rsLister, nil)
202+
ctx, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{ScaleDownSimulationTimeout: 5 * time.Minute}, &fake.Clientset{}, registry, provider, nil, nil)
203+
assert.NoError(t, err)
204+
205+
n := NewNodes(&fakeScaleDownTimeGetter{}, &resource.LimitsFinder{})
206+
n.Update(nodes, time.Now())
207+
gotEmptyToRemove, gotDrainToRemove, _ := n.RemovableAt(&ctx, time.Now(), resource.Limits{}, []string{}, as)
208+
if len(gotDrainToRemove) != tc.numDrainToRemove || len(gotEmptyToRemove) != tc.numEmptyToRemove {
209+
t.Errorf("%s: getNodesToRemove() return %d, %d, want %d, %d", tc.name, len(gotEmptyToRemove), len(gotDrainToRemove), tc.numEmptyToRemove, tc.numDrainToRemove)
210+
}
211+
})
212+
}
213+
}
214+
215+
type fakeActuationStatus struct {
216+
recentEvictions []*apiv1.Pod
217+
deletionCount map[string]int
218+
}
219+
220+
func (f *fakeActuationStatus) RecentEvictions() []*apiv1.Pod {
221+
return f.recentEvictions
222+
}
223+
224+
func (f *fakeActuationStatus) DeletionsInProgress() ([]string, []string) {
225+
return nil, nil
226+
}
227+
228+
func (f *fakeActuationStatus) DeletionResults() (map[string]status.NodeDeleteResult, time.Time) {
229+
return nil, time.Time{}
230+
}
231+
232+
func (f *fakeActuationStatus) DeletionsCount(nodeGroup string) int {
233+
return f.deletionCount[nodeGroup]
234+
}
235+
236+
type fakeScaleDownTimeGetter struct{}
237+
238+
func (f *fakeScaleDownTimeGetter) GetScaleDownUnneededTime(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (time.Duration, error) {
239+
return 0 * time.Second, nil
240+
}
241+
242+
func (f *fakeScaleDownTimeGetter) GetScaleDownUnreadyTime(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (time.Duration, error) {
243+
return 0 * time.Second, nil
244+
}

0 commit comments

Comments
 (0)