Skip to content

Commit c7ad47b

Browse files
authored
Merge pull request #6375 from walidghallab/status
Convert status in cluster-autoscaler-status to yaml and add error info for scale-up backoff
2 parents 0f7fe5c + 4b63993 commit c7ad47b

File tree

11 files changed

+506
-511
lines changed

11 files changed

+506
-511
lines changed

cluster-autoscaler/clusterstate/api/types.go

Lines changed: 130 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,14 @@ import (
2222
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2323
)
2424

25-
// ClusterAutoscalerConditionType is the type of ClusterAutoscalerCondition.
26-
type ClusterAutoscalerConditionType string
25+
// ClusterAutoscalerStatusCondition is the status of the cluster autoscaler.
26+
type ClusterAutoscalerStatusCondition string
2727

2828
const (
29-
// ClusterAutoscalerHealth - is a condition that explains what is the current health
30-
// of ClusterAutoscaler or its node groups.
31-
ClusterAutoscalerHealth ClusterAutoscalerConditionType = "Health"
32-
// ClusterAutoscalerScaleDown is a condition that explains what is the current status
33-
// of a node group with regard to scale down activities.
34-
ClusterAutoscalerScaleDown ClusterAutoscalerConditionType = "ScaleDown"
35-
// ClusterAutoscalerScaleUp is a condition that explains what is the current status
36-
// of a node group with regard to scale up activities.
37-
ClusterAutoscalerScaleUp ClusterAutoscalerConditionType = "ScaleUp"
29+
// ClusterAutoscalerRunning status means that the cluster autoscaler has been initialized and is running.
30+
ClusterAutoscalerRunning ClusterAutoscalerStatusCondition = "Running"
31+
// ClusterAutoscalerInitializing status means that cluster autoscaler is currently being initialized.
32+
ClusterAutoscalerInitializing ClusterAutoscalerStatusCondition = "Initializing"
3833
)
3934

4035
// ClusterAutoscalerConditionStatus is a status of ClusterAutoscalerCondition.
@@ -69,36 +64,135 @@ const (
6964
ClusterAutoscalerBackoff ClusterAutoscalerConditionStatus = "Backoff"
7065
)
7166

72-
// ClusterAutoscalerCondition describes some aspect of ClusterAutoscaler work.
73-
type ClusterAutoscalerCondition struct {
74-
// Type defines the aspect that the condition describes. For example, it can be Health or ScaleUp/Down activity.
75-
Type ClusterAutoscalerConditionType `json:"type,omitempty"`
76-
// Status of the condition.
77-
Status ClusterAutoscalerConditionStatus `json:"status,omitempty"`
78-
// Message is a free text extra information about the condition. It may contain some
79-
// extra debugging data, like why the cluster is unhealthy.
80-
Message string `json:"message,omitempty"`
81-
// Reason is a unique, one-word, CamelCase reason for the condition's last transition.
82-
Reason string `json:"reason,omitempty"`
67+
// RegisteredUnreadyNodeCount contains node counts of registered but unready nodes.
68+
type RegisteredUnreadyNodeCount struct {
69+
// Total number of registered but unready nodes.
70+
Total int `json:"total" yaml:"total"`
71+
// ResourceUnready is the number of registered but unready nodes due to a missing resource (e.g. GPU).
72+
ResourceUnready int `json:"resourceUnready" yaml:"resourceUnready"`
73+
}
74+
75+
// RegisteredNodeCount contains node counts of registered nodes.
76+
type RegisteredNodeCount struct {
77+
Total int `json:"total" yaml:"total"`
78+
Ready int `json:"ready" yaml:"ready"`
79+
NotStarted int `json:"notStarted" yaml:"notStarted"`
80+
// BeingDeleted is the number of nodes that are currently being deleted. They exist in K8S but are not included in NodeGroup.TargetSize().
81+
BeingDeleted int `json:"beingDeleted,omitempty" yaml:"beingDeleted,omitempty"`
82+
Unready RegisteredUnreadyNodeCount `json:"unready,omitempty" yaml:"unready,omitempty"`
83+
}
84+
85+
// NodeCount contains number of nodes that satisfy different criteria.
86+
type NodeCount struct {
87+
Registered RegisteredNodeCount `json:"registered,omitempty" yaml:"registered,omitempty"`
88+
LongUnregistered int `json:"longUnregistered" yaml:"longUnregistered"`
89+
Unregistered int `json:"unregistered" yaml:"unregistered"`
90+
}
91+
92+
// ClusterHealthCondition contains information about health condition for the whole cluster.
93+
type ClusterHealthCondition struct {
94+
// Status of cluster health.
95+
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
96+
// NodeCounts contains number of nodes that satisfy different criteria in the cluster.
97+
NodeCounts NodeCount `json:"nodeCounts,omitempty" yaml:"nodeCounts,omitempty"`
8398
// LastProbeTime is the last time we probed the condition.
84-
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"`
99+
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
85100
// LastTransitionTime is the time since when the condition was in the given state.
86-
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
101+
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
87102
}
88103

89-
// ClusterAutoscalerStatus contains ClusterAutoscaler status.
90-
type ClusterAutoscalerStatus struct {
91-
// NodeGroupStatuses contains status information of individual node groups on which CA works.
92-
NodeGroupStatuses []NodeGroupStatus `json:"nodeGroupStatuses,omitempty"`
93-
// ClusterwideConditions contains conditions that apply to the whole autoscaler.
94-
ClusterwideConditions []ClusterAutoscalerCondition `json:"clusterwideConditions,omitempty"`
104+
// NodeGroupHealthCondition contains information about health condition for a node group.
105+
type NodeGroupHealthCondition struct {
106+
// Status of node group health.
107+
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
108+
// NodeCounts contains number of nodes that satisfy different criteria in the node group.
109+
NodeCounts NodeCount `json:"nodeCounts,omitempty" yaml:"nodeCounts,omitempty"`
110+
// CloudProviderTarget is the target size set by cloud provider.
111+
CloudProviderTarget int `json:"cloudProviderTarget" yaml:"cloudProviderTarget"`
112+
// MinSize is the CA min size of a node group.
113+
MinSize int `json:"minSize" yaml:"minSize"`
114+
// MaxSize is the CA max size of a node group.
115+
MaxSize int `json:"maxSize" yaml:"maxSize"`
116+
// LastProbeTime is the last time we probed the condition.
117+
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
118+
// LastTransitionTime is the time since when the condition was in the given state.
119+
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
120+
}
121+
122+
// ClusterScaleUpCondition contains information about scale up condition for the whole cluster.
123+
type ClusterScaleUpCondition struct {
124+
// Status of the scale up.
125+
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
126+
// LastProbeTime is the last time we probed the condition.
127+
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
128+
// LastTransitionTime is the time since when the condition was in the given state.
129+
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
130+
}
131+
132+
// BackoffInfo contains error information that caused the backoff.
133+
type BackoffInfo struct {
134+
// ErrorCode is a specific error code for the error condition.
135+
ErrorCode string `json:"errorCode,omitempty" yaml:"errorCode,omitempty"`
136+
// ErrorMessage is a human-readable description of the error condition.
137+
ErrorMessage string `json:"errorMessage,omitempty" yaml:"errorMessage,omitempty"`
95138
}
96139

97-
// NodeGroupStatus contains status of a group of nodes controlled by ClusterAutoscaler.
140+
// NodeGroupScaleUpCondition contains information about scale up condition for a node group.
141+
type NodeGroupScaleUpCondition struct {
142+
// Status of the scale up.
143+
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
144+
// BackoffInfo contains error information that caused the backoff.
145+
BackoffInfo BackoffInfo `json:"backoffInfo,omitempty" yaml:"backoffInfo,omitempty"`
146+
// LastProbeTime is the last time we probed the condition.
147+
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
148+
// LastTransitionTime is the time since when the condition was in the given state.
149+
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
150+
}
151+
152+
// ScaleDownCondition contains information about scale down condition for a node group or the whole cluster.
153+
type ScaleDownCondition struct {
154+
// Status of the scale down.
155+
Status ClusterAutoscalerConditionStatus `json:"status,omitempty" yaml:"status,omitempty"`
156+
// Candidates is the number of candidates for the scale down.
157+
Candidates int `json:"candidates,omitempty" yaml:"candidates,omitempty"`
158+
// LastProbeTime is the last time we probed the condition.
159+
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty" yaml:"lastProbeTime,omitempty"`
160+
// LastTransitionTime is the time since when the condition was in the given state.
161+
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty" yaml:"lastTransitionTime,omitempty"`
162+
}
163+
164+
// ClusterWideStatus contains status that applies to the whole cluster.
165+
type ClusterWideStatus struct {
166+
// Health contains information about health condition of the cluster.
167+
Health ClusterHealthCondition `json:"health,omitempty" yaml:"health,omitempty"`
168+
// ScaleUp contains information about scale up condition of the cluster.
169+
ScaleUp ClusterScaleUpCondition `json:"scaleUp,omitempty" yaml:"scaleUp,omitempty"`
170+
// ScaleDown contains information about scale down condition of the cluster.
171+
ScaleDown ScaleDownCondition `json:"scaleDown,omitempty" yaml:"scaleDown,omitempty"`
172+
}
173+
174+
// NodeGroupStatus contains status of an individual node group on which CA works.
98175
type NodeGroupStatus struct {
99-
// ProviderID is the cloud-provider-specific name of the node group. On GCE it will be equal
100-
// to MIG url, on AWS it will be ASG name, etc.
101-
ProviderID string `json:"providerID,omitempty"`
102-
// Conditions is a list of conditions that describe the state of the node group.
103-
Conditions []ClusterAutoscalerCondition `json:"conditions,omitempty"`
176+
// Name of the node group.
177+
Name string `json:"name,omitempty" yaml:"name,omitempty"`
178+
// Health contains information about health condition of the node group.
179+
Health NodeGroupHealthCondition `json:"health,omitempty" yaml:"health,omitempty"`
180+
// ScaleUp contains information about scale up condition of the node group.
181+
ScaleUp NodeGroupScaleUpCondition `json:"scaleUp,omitempty" yaml:"scaleUp,omitempty"`
182+
// ScaleDown contains information about scale down condition of the node group.
183+
ScaleDown ScaleDownCondition `json:"scaleDown,omitempty" yaml:"scaleDown,omitempty"`
184+
}
185+
186+
// ClusterAutoscalerStatus contains ClusterAutoscaler status.
187+
type ClusterAutoscalerStatus struct {
188+
// Time of the cluster autoscaler status.
189+
Time string `json:"time,omitempty" yaml:"time,omitempty"`
190+
// AutoscalerStatus contains status of ClusterAutoscaler (e.g. 'Initializing' & 'Running').
191+
AutoscalerStatus ClusterAutoscalerStatusCondition `json:"autoscalerStatus,omitempty" yaml:"autoscalerStatus,omitempty"`
192+
// Message contains extra information about the status.
193+
Message string `json:"message,omitempty" yaml:"message,omitempty"`
194+
// ClusterWide contains conditions that apply to the whole cluster.
195+
ClusterWide ClusterWideStatus `json:"clusterWide,omitempty" yaml:"clusterWide,omitempty"`
196+
// NodeGroups contains status information of individual node groups on which CA works.
197+
NodeGroups []NodeGroupStatus `json:"nodeGroups,omitempty" yaml:"nodeGroups,omitempty"`
104198
}

cluster-autoscaler/clusterstate/api/utils.go

Lines changed: 0 additions & 92 deletions
This file was deleted.

cluster-autoscaler/clusterstate/api/utils_test.go

Lines changed: 0 additions & 91 deletions
This file was deleted.

0 commit comments

Comments
 (0)