diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go index 0aa2bd4e05b8..441362b37301 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup_test.go @@ -1413,8 +1413,6 @@ func TestNodeGroupTemplateNodeInfo(t *testing.T) { } else { t.Errorf("Expected node label %q to exist in node", key) } - if value != config.expectedNodeLabels[key] { - } } } } diff --git a/cluster-autoscaler/core/scale_up.go b/cluster-autoscaler/core/scale_up.go index a68c35e8d37a..624d9050f7b2 100644 --- a/cluster-autoscaler/core/scale_up.go +++ b/cluster-autoscaler/core/scale_up.go @@ -209,7 +209,6 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto } now := time.Now() - gpuLabel := context.CloudProvider.GPULabel() availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes() expansionOptions := make(map[string]expander.Option, 0) skippedNodeGroups := map[string]status.Reasons{} @@ -405,7 +404,9 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos) for _, info := range scaleUpInfos { - typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil), now) + gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node()) + metricResourceName, metricGpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil) + typedErr := executeScaleUp(context, clusterStateRegistry, info, metricResourceName, metricGpuType, now) if typedErr != nil { return scaleUpError( &status.ScaleUpStatus{ @@ -444,7 +445,6 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors * nodes []*apiv1.Node, nodeInfos map[string]*schedulerframework.NodeInfo) 
(*status.ScaleUpStatus, errors.AutoscalerError) { now := time.Now() nodeGroups := context.CloudProvider.NodeGroups() - gpuLabel := context.CloudProvider.GPULabel() availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes() scaleUpInfos := make([]nodegroupset.ScaleUpInfo, 0) @@ -521,9 +521,9 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors * klog.Warningf("ScaleUpToNodeGroupMinSize: failed to get node info for node group %s", info.Group.Id()) continue } - - gpuType := gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil) - if err := executeScaleUp(context, clusterStateRegistry, info, gpuType, now); err != nil { + gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node()) + metricResourceName, metricGpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil) + if err := executeScaleUp(context, clusterStateRegistry, info, metricResourceName, metricGpuType, now); err != nil { return scaleUpError( &status.ScaleUpStatus{ FailedResizeNodeGroups: []cloudprovider.NodeGroup{info.Group}, @@ -605,7 +605,7 @@ func filterNodeGroupsByPods( return result } -func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string, now time.Time) errors.AutoscalerError { +func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, resourceName, gpuType string, now time.Time) errors.AutoscalerError { klog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize) context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup", "Scale-up: setting group %s size to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize) @@ -620,7 +620,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c info.Group, increase, time.Now()) - 
metrics.RegisterScaleUp(increase, gpuType) + metrics.RegisterScaleUp(increase, resourceName, gpuType) context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup", "Scale-up: group %s size set to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize) return nil diff --git a/cluster-autoscaler/core/scale_up_test.go b/cluster-autoscaler/core/scale_up_test.go index 407167dad610..66caa7639fb0 100644 --- a/cluster-autoscaler/core/scale_up_test.go +++ b/cluster-autoscaler/core/scale_up_test.go @@ -1053,7 +1053,7 @@ func TestAuthError(t *testing.T) { clusterStateRegistry := clusterstate.NewClusterStateRegistry(nil, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff()) - aerr := executeScaleUp(&context, clusterStateRegistry, info, "", time.Now()) + aerr := executeScaleUp(&context, clusterStateRegistry, info, "", "", time.Now()) assert.Error(t, aerr) req, err := http.NewRequest("GET", "/", nil) diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator.go b/cluster-autoscaler/core/scaledown/actuation/actuator.go index fd6c89c9287a..d60629c6581e 100644 --- a/cluster-autoscaler/core/scaledown/actuation/actuator.go +++ b/cluster-autoscaler/core/scaledown/actuation/actuator.go @@ -437,7 +437,9 @@ func RegisterAndRecordSuccessfulScaleDownEvent(ctx *context.AutoscalingContext, Time: time.Now(), ExpectedDeleteTime: time.Now().Add(MaxCloudProviderNodeDeletionTime), }) - metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(ctx.CloudProvider.GPULabel(), ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup), nodeScaleDownReason(node, drain)) + gpuConfig := ctx.CloudProvider.GetNodeGpuConfig(node) + metricResourceName, metricGpuType := gpu.GetGpuInfoForMetrics(gpuConfig, ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup) + metrics.RegisterScaleDown(1, metricResourceName, metricGpuType, nodeScaleDownReason(node, drain)) if drain { ctx.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDown", "Scale-down: 
node %s removed with drain", node.Name) } else { diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go index 0d9736549a33..b92b6e913f69 100644 --- a/cluster-autoscaler/metrics/metrics.go +++ b/cluster-autoscaler/metrics/metrics.go @@ -246,7 +246,7 @@ var ( Namespace: caNamespace, Name: "scaled_up_gpu_nodes_total", Help: "Number of GPU nodes added by CA, by GPU name.", - }, []string{"gpu_name"}, + }, []string{"gpu_resource_name", "gpu_name"}, ) failedScaleUpCount = k8smetrics.NewCounterVec( @@ -270,7 +270,7 @@ var ( Namespace: caNamespace, Name: "scaled_down_gpu_nodes_total", Help: "Number of GPU nodes removed by CA, by reason and GPU name.", - }, []string{"reason", "gpu_name"}, + }, []string{"reason", "gpu_resource_name", "gpu_name"}, ) evictionsCount = k8smetrics.NewCounter( @@ -490,10 +490,10 @@ func RegisterError(err errors.AutoscalerError) { } // RegisterScaleUp records number of nodes added by scale up -func RegisterScaleUp(nodesCount int, gpuType string) { +func RegisterScaleUp(nodesCount int, resourceName, gpuType string) { scaleUpCount.Add(float64(nodesCount)) if gpuType != gpu.MetricsNoGPU { - gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount)) + gpuScaleUpCount.WithLabelValues(resourceName, gpuType).Add(float64(nodesCount)) } } @@ -503,10 +503,10 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) { } // RegisterScaleDown records number of nodes removed by scale down -func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) { +func RegisterScaleDown(nodesCount int, resourceName, gpuType string, reason NodeScaleDownReason) { scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount)) if gpuType != gpu.MetricsNoGPU { - gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount)) + gpuScaleDownCount.WithLabelValues(string(reason), resourceName, gpuType).Add(float64(nodesCount)) } } diff --git a/cluster-autoscaler/utils/gpu/gpu.go 
b/cluster-autoscaler/utils/gpu/gpu.go index fd53863448c4..6ca265aeaa5c 100644 --- a/cluster-autoscaler/utils/gpu/gpu.go +++ b/cluster-autoscaler/utils/gpu/gpu.go @@ -47,27 +47,24 @@ const ( MetricsNoGPU = "" ) -// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU +// GetGpuInfoForMetrics returns the name of the custom resource and the type of the GPU used on the node, or empty strings if there's no GPU // if the GPU type is unknown, "generic" is returned // NOTE: current implementation is GKE/GCE-specific -func GetGpuTypeForMetrics(GPULabel string, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) string { - // we use the GKE label if there is one - gpuType, labelFound := node.Labels[GPULabel] - capacity, capacityFound := node.Status.Capacity[ResourceNvidiaGPU] - - if !labelFound { - // no label, fallback to generic solution - if capacityFound && !capacity.IsZero() { - return MetricsGenericGPU - } - - // no signs of GPU - return MetricsNoGPU +func GetGpuInfoForMetrics(gpuConfig *cloudprovider.GpuConfig, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (gpuResource string, gpuType string) { + // There is no sign of GPU + if gpuConfig == nil { + return "", MetricsNoGPU + } + resourceName := gpuConfig.ResourceName + capacity, capacityFound := node.Status.Capacity[resourceName] + // There is no label value, fallback to generic solution + if gpuConfig.Type == "" && capacityFound && !capacity.IsZero() { + return resourceName.String(), MetricsGenericGPU } // GKE-specific label & capacity are present - consistent state if capacityFound { - return validateGpuType(availableGPUTypes, gpuType) + return resourceName.String(), validateGpuType(availableGPUTypes, gpuConfig.Type) } // GKE-specific label present but no capacity (yet?) 
- check the node template @@ -75,19 +72,19 @@ func GetGpuTypeForMetrics(GPULabel string, availableGPUTypes map[string]struct{} template, err := nodeGroup.TemplateNodeInfo() if err != nil { klog.Warningf("Failed to build template for getting GPU metrics for node %v: %v", node.Name, err) - return MetricsErrorGPU + return resourceName.String(), MetricsErrorGPU } - if _, found := template.Node().Status.Capacity[ResourceNvidiaGPU]; found { - return MetricsMissingGPU + if _, found := template.Node().Status.Capacity[resourceName]; found { + return resourceName.String(), MetricsMissingGPU } // if template does not define GPUs we assume node will not have any even if it has gpu label klog.Warningf("Template does not define GPUs even though node from its node group does; node=%v", node.Name) - return MetricsUnexpectedLabelGPU + return resourceName.String(), MetricsUnexpectedLabelGPU } - return MetricsUnexpectedLabelGPU + return resourceName.String(), MetricsUnexpectedLabelGPU } func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string {