
Commit bb046ee

CA: stop passing NotStarted nodes as scale-down candidates
Without this, scale-down with aggressive settings could remove NotStarted nodes before they have a chance to become ready (and how long that takes should be unrelated to the scale-down settings).
1 parent: 6978ff8
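
In essence, the change (shown in full in the static_autoscaler.go diff below) filters the NotStarted node names reported by the cluster state registry out of the node list that scale-down sees, and removes the same nodes from the cluster snapshot so the two stay consistent. A condensed sketch paraphrased from the diff (the committed code additionally returns an InternalError when snapshot removal fails):

    // Condensed from the diff below; early return on error omitted.
    notStarted := a.clusterStateRegistry.GetClusterReadiness().NotStarted
    allNodes = subtractNodesByName(allNodes, notStarted)
    for _, name := range notStarted {
        if err := a.ClusterSnapshot.RemoveNode(name); err != nil {
            klog.Errorf("Failed to remove NotStarted node %s from cluster snapshot: %v", name, err)
        }
    }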

File tree

cluster-autoscaler/core/static_autoscaler.go
cluster-autoscaler/core/static_autoscaler_test.go

2 files changed: +199 / -14 lines changed

cluster-autoscaler/core/static_autoscaler.go

Lines changed: 32 additions & 6 deletions
@@ -467,6 +467,20 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
             return errors.ToAutoscalerError(errors.InternalError, err)
         }
     }
+    // Nodes that are registered but not yet ready are classified as NotStarted by CSR. Above, we inject "upcoming" replacement
+    // nodes for them that are faked to appear ready, so that we can pack unschedulable pods on them and not trigger another scale-up.
+    // The initial nodes have to be filtered out of the all nodes list so that scale-down can't consider them as candidates. Otherwise, with
+    // aggressive scale-down settings, we could be removing the nodes before they have a chance to first become ready (the duration
+    // of which should be unrelated to the scale-down settings).
+    allNodes = subtractNodesByName(allNodes, a.clusterStateRegistry.GetClusterReadiness().NotStarted)
+    // Remove the nodes from the snapshot as well so that the state is consistent.
+    for _, notStartedNodeName := range a.clusterStateRegistry.GetClusterReadiness().NotStarted {
+        err := a.ClusterSnapshot.RemoveNode(notStartedNodeName)
+        if err != nil {
+            klog.Errorf("Failed to remove NotStarted node %s from cluster snapshot: %v", notStartedNodeName, err)
+            return errors.ToAutoscalerError(errors.InternalError, err)
+        }
+    }
 
     l, err := a.ClusterSnapshot.NodeInfos().List()
     if err != nil {
@@ -921,17 +935,29 @@ func countsByReason(nodes []*simulator.UnremovableNode) map[simulator.Unremovabl
     return counts
 }
 
-func subtractNodes(a []*apiv1.Node, b []*apiv1.Node) []*apiv1.Node {
+func subtractNodesByName(nodes []*apiv1.Node, namesToRemove []string) []*apiv1.Node {
     var c []*apiv1.Node
-    namesToDrop := make(map[string]bool)
-    for _, n := range b {
-        namesToDrop[n.Name] = true
+    removeSet := make(map[string]bool)
+    for _, name := range namesToRemove {
+        removeSet[name] = true
     }
-    for _, n := range a {
-        if namesToDrop[n.Name] {
+    for _, n := range nodes {
+        if removeSet[n.Name] {
             continue
         }
         c = append(c, n)
     }
     return c
 }
+
+func subtractNodes(a []*apiv1.Node, b []*apiv1.Node) []*apiv1.Node {
+    return subtractNodesByName(a, nodeNames(b))
+}
+
+func nodeNames(ns []*apiv1.Node) []string {
+    names := make([]string, len(ns))
+    for i, node := range ns {
+        names[i] = node.Name
+    }
+    return names
+}
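
For reference, a minimal usage sketch of the new subtractNodesByName helper. This is a hypothetical standalone test, not part of this commit, and it assumes it runs in the same core package as the helper:

    package core

    import (
        "testing"

        "github.com/stretchr/testify/assert"
        apiv1 "k8s.io/api/core/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    )

    // Hypothetical example: only the nodes whose names appear in the second
    // argument are dropped; the relative order of the remaining nodes is preserved.
    func TestSubtractNodesByNameSketch(t *testing.T) {
        node := func(name string) *apiv1.Node {
            return &apiv1.Node{ObjectMeta: metav1.ObjectMeta{Name: name}}
        }
        all := []*apiv1.Node{node("ready-1"), node("ready-2"), node("not-started-1")}
        got := subtractNodesByName(all, []string{"not-started-1"})
        assert.Len(t, got, 2)
        assert.Equal(t, "ready-1", got[0].Name)
        assert.Equal(t, "ready-2", got[1].Name)
    }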

cluster-autoscaler/core/static_autoscaler_test.go

Lines changed: 167 additions & 8 deletions
@@ -42,6 +42,9 @@ import (
     "k8s.io/autoscaler/cluster-autoscaler/estimator"
     ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
     "k8s.io/autoscaler/cluster-autoscaler/simulator"
+    "k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
+    "k8s.io/autoscaler/cluster-autoscaler/simulator/utilization"
+    "k8s.io/autoscaler/cluster-autoscaler/utils/errors"
     "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
     kube_util "k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"
     . "k8s.io/autoscaler/cluster-autoscaler/utils/test"
@@ -1273,6 +1276,167 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
     nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)
 }
 
+type candidateTrackingFakePlanner struct {
+    lastCandidateNodes map[string]bool
+}
+
+func (f *candidateTrackingFakePlanner) UpdateClusterState(podDestinations, scaleDownCandidates []*apiv1.Node, as scaledown.ActuationStatus, pdb []*policyv1.PodDisruptionBudget, currentTime time.Time) errors.AutoscalerError {
+    f.lastCandidateNodes = map[string]bool{}
+    for _, node := range scaleDownCandidates {
+        f.lastCandidateNodes[node.Name] = true
+    }
+    return nil
+}
+
+func (f *candidateTrackingFakePlanner) CleanUpUnneededNodes() {
+}
+
+func (f *candidateTrackingFakePlanner) NodesToDelete(currentTime time.Time) (empty, needDrain []*apiv1.Node) {
+    return nil, nil
+}
+
+func (f *candidateTrackingFakePlanner) UnneededNodes() []*apiv1.Node {
+    return nil
+}
+
+func (f *candidateTrackingFakePlanner) UnremovableNodes() []*simulator.UnremovableNode {
+    return nil
+}
+
+func (f *candidateTrackingFakePlanner) NodeUtilizationMap() map[string]utilization.Info {
+    return nil
+}
+
+func assertSnapshotNodeCount(t *testing.T, snapshot clustersnapshot.ClusterSnapshot, wantCount int) {
+    nodeInfos, err := snapshot.NodeInfos().List()
+    assert.NoError(t, err)
+    assert.Len(t, nodeInfos, wantCount)
+}
+
+func assertNodesNotInSnapshot(t *testing.T, snapshot clustersnapshot.ClusterSnapshot, nodeNames map[string]bool) {
+    nodeInfos, err := snapshot.NodeInfos().List()
+    assert.NoError(t, err)
+    for _, nodeInfo := range nodeInfos {
+        assert.NotContains(t, nodeNames, nodeInfo.Node().Name)
+    }
+}
+
+func assertNodesInSnapshot(t *testing.T, snapshot clustersnapshot.ClusterSnapshot, nodeNames map[string]bool) {
+    nodeInfos, err := snapshot.NodeInfos().List()
+    assert.NoError(t, err)
+    snapshotNodeNames := map[string]bool{}
+    for _, nodeInfo := range nodeInfos {
+        snapshotNodeNames[nodeInfo.Node().Name] = true
+    }
+    for nodeName := range nodeNames {
+        assert.Contains(t, snapshotNodeNames, nodeName)
+    }
+}
+
+func TestStaticAutoscalerUpcomingScaleDownCandidates(t *testing.T) {
+    startTime := time.Time{}
+
+    // Generate a number of ready and unready nodes created at startTime, spread across multiple node groups.
+    provider := testprovider.NewTestCloudProvider(nil, nil)
+    allNodeNames := map[string]bool{}
+    readyNodeNames := map[string]bool{}
+    notReadyNodeNames := map[string]bool{}
+    var allNodes []*apiv1.Node
+    var readyNodes []*apiv1.Node
+
+    readyNodesCount := 4
+    unreadyNodesCount := 2
+    nodeGroupCount := 2
+    for ngNum := 0; ngNum < nodeGroupCount; ngNum++ {
+        ngName := fmt.Sprintf("ng-%d", ngNum)
+        provider.AddNodeGroup(ngName, 0, 1000, readyNodesCount+unreadyNodesCount)
+
+        for i := 0; i < readyNodesCount; i++ {
+            node := BuildTestNode(fmt.Sprintf("%s-ready-node-%d", ngName, i), 2000, 1000)
+            node.CreationTimestamp = metav1.NewTime(startTime)
+            SetNodeReadyState(node, true, startTime)
+            provider.AddNode(ngName, node)
+
+            allNodes = append(allNodes, node)
+            allNodeNames[node.Name] = true
+
+            readyNodes = append(readyNodes, node)
+            readyNodeNames[node.Name] = true
+        }
+        for i := 0; i < unreadyNodesCount; i++ {
+            node := BuildTestNode(fmt.Sprintf("%s-unready-node-%d", ngName, i), 2000, 1000)
+            node.CreationTimestamp = metav1.NewTime(startTime)
+            SetNodeReadyState(node, false, startTime)
+            provider.AddNode(ngName, node)
+
+            allNodes = append(allNodes, node)
+            allNodeNames[node.Name] = true
+
+            notReadyNodeNames[node.Name] = true
+        }
+    }
+
+    // Create fake listers for the generated nodes, nothing returned by the rest (but the ones used in the tested path have to be defined).
+    allNodeLister := kubernetes.NewTestNodeLister(allNodes)
+    readyNodeLister := kubernetes.NewTestNodeLister(readyNodes)
+    daemonSetLister, err := kubernetes.NewTestDaemonSetLister(nil)
+    assert.NoError(t, err)
+    listerRegistry := kube_util.NewListerRegistry(allNodeLister, readyNodeLister, kubernetes.NewTestPodLister(nil), kubernetes.NewTestPodLister(nil), kubernetes.NewTestPodDisruptionBudgetLister(nil), daemonSetLister, nil, nil, nil, nil)
+
+    // Create context with minimal options that guarantee we reach the tested logic.
+    // We're only testing the input to UpdateClusterState which should be called whenever scale-down is enabled, other options shouldn't matter.
+    options := config.AutoscalingOptions{ScaleDownEnabled: true}
+    processorCallbacks := newStaticAutoscalerProcessorCallbacks()
+    ctx, err := NewScaleTestAutoscalingContext(options, &fake.Clientset{}, listerRegistry, provider, processorCallbacks, nil)
+    assert.NoError(t, err)
+
+    // Create CSR with unhealthy cluster protection effectively disabled, to guarantee we reach the tested logic.
+    csrConfig := clusterstate.ClusterStateRegistryConfig{OkTotalUnreadyCount: nodeGroupCount * unreadyNodesCount}
+    csr := clusterstate.NewClusterStateRegistry(provider, csrConfig, ctx.LogRecorder, NewBackoff())
+
+    // Setting the Actuator is necessary for testing any scale-down logic, it shouldn't have anything to do in this test.
+    actuator := actuation.NewActuator(&ctx, csr, deletiontracker.NewNodeDeletionTracker(0*time.Second), simulator.NodeDeleteOptions{})
+    ctx.ScaleDownActuator = actuator
+
+    // Fake planner that keeps track of the scale-down candidates passed to UpdateClusterState.
+    planner := &candidateTrackingFakePlanner{}
+
+    autoscaler := &StaticAutoscaler{
+        AutoscalingContext:   &ctx,
+        clusterStateRegistry: csr,
+        scaleDownActuator:    actuator,
+        scaleDownPlanner:     planner,
+        processors:           NewTestProcessors(&ctx),
+        processorCallbacks:   processorCallbacks,
+    }
+
+    // RunOnce run right when the nodes are created. Ready nodes should be passed as scale-down candidates, unready nodes should be classified as
+    // NotStarted and not passed as scale-down candidates (or inserted into the cluster snapshot). The fake upcoming nodes also shouldn't be passed,
+    // but they should be inserted into the snapshot.
+    err = autoscaler.RunOnce(startTime)
+    assert.NoError(t, err)
+    assert.Equal(t, readyNodeNames, planner.lastCandidateNodes)
+    assertNodesInSnapshot(t, autoscaler.ClusterSnapshot, readyNodeNames)
+    assertNodesNotInSnapshot(t, autoscaler.ClusterSnapshot, notReadyNodeNames)
+    assertSnapshotNodeCount(t, autoscaler.ClusterSnapshot, len(allNodeNames)) // Ready nodes + fake upcoming copies for unready nodes.
+
+    // RunOnce run in the last moment when unready nodes are still classified as NotStarted - assertions the same as above.
+    err = autoscaler.RunOnce(startTime.Add(clusterstate.MaxNodeStartupTime).Add(-time.Second))
+    assert.NoError(t, err)
+    assert.Equal(t, readyNodeNames, planner.lastCandidateNodes)
+    assertNodesInSnapshot(t, autoscaler.ClusterSnapshot, readyNodeNames)
+    assertNodesNotInSnapshot(t, autoscaler.ClusterSnapshot, notReadyNodeNames)
+    assertSnapshotNodeCount(t, autoscaler.ClusterSnapshot, len(allNodeNames)) // Ready nodes + fake upcoming copies for unready nodes.
+
+    // RunOnce run in the first moment when unready nodes exceed the startup threshold, stop being classified as NotStarted, and start being classified
+    // Unready instead. The unready nodes should be passed as scale-down candidates at this point, and inserted into the snapshot. Fake upcoming
+    // nodes should no longer be inserted.
+    err = autoscaler.RunOnce(startTime.Add(clusterstate.MaxNodeStartupTime).Add(time.Second))
+    assert.Equal(t, allNodeNames, planner.lastCandidateNodes)
+    assertNodesInSnapshot(t, autoscaler.ClusterSnapshot, allNodeNames)
+    assertSnapshotNodeCount(t, autoscaler.ClusterSnapshot, len(allNodeNames)) // Ready nodes + actual unready nodes.
+}
+
 func TestStaticAutoscalerProcessorCallbacks(t *testing.T) {
     processorCallbacks := newStaticAutoscalerProcessorCallbacks()
     assert.Equal(t, false, processorCallbacks.disableScaleDownForLoop)
@@ -1426,6 +1590,9 @@ func TestSubtractNodes(t *testing.T) {
     for _, tc := range testCases {
         got := subtractNodes(tc.a, tc.b)
         assert.Equal(t, nodeNames(got), nodeNames(tc.c))
+
+        got = subtractNodesByName(tc.a, nodeNames(tc.b))
+        assert.Equal(t, nodeNames(got), nodeNames(tc.c))
     }
 }
 
@@ -1526,14 +1693,6 @@ func TestFilterOutYoungPods(t *testing.T) {
     }
 }
 
-func nodeNames(ns []*apiv1.Node) []string {
-    names := make([]string, len(ns))
-    for i, node := range ns {
-        names[i] = node.Name
-    }
-    return names
-}
-
 func waitForDeleteToFinish(t *testing.T, deleteFinished <-chan bool) {
     select {
     case <-deleteFinished:
