Skip to content

Commit 66d006f

Browse files
authored
Merge pull request #7021 from Shubham82/cherry-picked-of-#6750-upstream-cluster-autoscaler-release-1.28
Backport #6750 ([CA] fix(hetzner): missing error return in scale up/down) into CA 1.28
2 parents b35497d + c8a0959 commit 66d006f

File tree

1 file changed

+58
-21
lines changed

1 file changed

+58
-21
lines changed

cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package hetzner
1818

1919
import (
2020
"context"
21+
"errors"
2122
"fmt"
2223
"math/rand"
2324
"strings"
@@ -90,12 +91,14 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
9091
return fmt.Errorf("delta must be positive, have: %d", delta)
9192
}
9293

93-
targetSize := n.targetSize + delta
94-
if targetSize > n.MaxSize() {
95-
return fmt.Errorf("size increase is too large. current: %d desired: %d max: %d", n.targetSize, targetSize, n.MaxSize())
94+
desiredTargetSize := n.targetSize + delta
95+
if desiredTargetSize > n.MaxSize() {
96+
return fmt.Errorf("size increase is too large. current: %d desired: %d max: %d", n.targetSize, desiredTargetSize, n.MaxSize())
9697
}
9798

98-
klog.V(4).Infof("Scaling Instance Pool %s to %d", n.id, targetSize)
99+
actualDelta := delta
100+
101+
klog.V(4).Infof("Scaling Instance Pool %s to %d", n.id, desiredTargetSize)
99102

100103
n.clusterUpdateMutex.Lock()
101104
defer n.clusterUpdateMutex.Unlock()
@@ -108,25 +111,43 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
108111
return fmt.Errorf("server type %s not available in region %s", n.instanceType, n.region)
109112
}
110113

114+
defer func() {
115+
// refresh the servers cache
116+
if _, err := n.manager.cachedServers.servers(); err != nil {
117+
klog.Errorf("failed to update servers cache: %v", err)
118+
}
119+
120+
// Update target size
121+
n.resetTargetSize(actualDelta)
122+
}()
123+
124+
// There is no "Server Group" in Hetzner Cloud, so we need to create every
125+
// server manually. This operation might fail for some of the servers
126+
// because of quotas, rate limiting or server type availability. We need to
127+
// collect the errors and inform cluster-autoscaler about this, so it can
128+
// try other node groups if configured.
111129
waitGroup := sync.WaitGroup{}
130+
errsCh := make(chan error, delta)
112131
for i := 0; i < delta; i++ {
113132
waitGroup.Add(1)
114133
go func() {
115134
defer waitGroup.Done()
116135
err := createServer(n)
117136
if err != nil {
118-
targetSize--
119-
klog.Errorf("failed to create error: %v", err)
137+
actualDelta--
138+
errsCh <- err
120139
}
121140
}()
122141
}
123142
waitGroup.Wait()
143+
close(errsCh)
124144

125-
n.targetSize = targetSize
126-
127-
// create new servers cache
128-
if _, err := n.manager.cachedServers.servers(); err != nil {
129-
klog.Errorf("failed to get servers: %v", err)
145+
errs := make([]error, 0, delta)
146+
for err = range errsCh {
147+
errs = append(errs, err)
148+
}
149+
if len(errs) > 0 {
150+
return fmt.Errorf("failed to create all servers: %w", errors.Join(errs...))
130151
}
131152

132153
return nil
@@ -140,34 +161,50 @@ func (n *hetznerNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
140161
n.clusterUpdateMutex.Lock()
141162
defer n.clusterUpdateMutex.Unlock()
142163

143-
targetSize := n.targetSize - len(nodes)
164+
delta := len(nodes)
165+
166+
targetSize := n.targetSize - delta
144167
if targetSize < n.MinSize() {
145168
return fmt.Errorf("size decrease is too large. current: %d desired: %d min: %d", n.targetSize, targetSize, n.MinSize())
146169
}
147170

148-
waitGroup := sync.WaitGroup{}
171+
actualDelta := delta
172+
173+
defer func() {
174+
// refresh the servers cache
175+
if _, err := n.manager.cachedServers.servers(); err != nil {
176+
klog.Errorf("failed to update servers cache: %v", err)
177+
}
178+
179+
n.resetTargetSize(-actualDelta)
180+
}()
149181

182+
waitGroup := sync.WaitGroup{}
183+
errsCh := make(chan error, len(nodes))
150184
for _, node := range nodes {
151185
waitGroup.Add(1)
152186
go func(node *apiv1.Node) {
153187
klog.Infof("Evicting server %s", node.Name)
154188

155189
err := n.manager.deleteByNode(node)
156190
if err != nil {
157-
klog.Errorf("failed to delete server ID %d error: %v", node.Name, err)
191+
actualDelta--
192+
errsCh <- fmt.Errorf("failed to delete server for node %q: %w", node.Name, err)
158193
}
159194

160195
waitGroup.Done()
161196
}(node)
162197
}
163198
waitGroup.Wait()
199+
close(errsCh)
164200

165-
// create new servers cache
166-
if _, err := n.manager.cachedServers.servers(); err != nil {
167-
klog.Errorf("failed to get servers: %v", err)
201+
errs := make([]error, 0, len(nodes))
202+
for err := range errsCh {
203+
errs = append(errs, err)
204+
}
205+
if len(errs) > 0 {
206+
return fmt.Errorf("failed to delete all nodes: %w", errors.Join(errs...))
168207
}
169-
170-
n.resetTargetSize(-len(nodes))
171208

172209
return nil
173210
}
@@ -519,8 +556,8 @@ func waitForServerAction(m *hetznerManager, serverName string, action *hcloud.Ac
519556
func (n *hetznerNodeGroup) resetTargetSize(expectedDelta int) {
520557
servers, err := n.manager.allServers(n.id)
521558
if err != nil {
522-
klog.Errorf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err)
523-
n.targetSize = n.targetSize - expectedDelta
559+
klog.Warningf("failed to set node pool %s size, using delta %d error: %v", n.id, expectedDelta, err)
560+
n.targetSize = n.targetSize + expectedDelta
524561
} else {
525562
klog.Infof("Set node group %s size from %d to %d, expected delta %d", n.id, n.targetSize, len(servers), expectedDelta)
526563
n.targetSize = len(servers)

0 commit comments

Comments
 (0)