@@ -18,6 +18,7 @@ package hetzner
1818
1919import (
2020 "context"
21+ "errors"
2122 "fmt"
2223 "math/rand"
2324 "strings"
@@ -90,12 +91,14 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
9091 return fmt .Errorf ("delta must be positive, have: %d" , delta )
9192 }
9293
93- targetSize := n .targetSize + delta
94- if targetSize > n .MaxSize () {
95- return fmt .Errorf ("size increase is too large. current: %d desired: %d max: %d" , n .targetSize , targetSize , n .MaxSize ())
94+ desiredTargetSize := n .targetSize + delta
95+ if desiredTargetSize > n .MaxSize () {
96+ return fmt .Errorf ("size increase is too large. current: %d desired: %d max: %d" , n .targetSize , desiredTargetSize , n .MaxSize ())
9697 }
9798
98- klog .V (4 ).Infof ("Scaling Instance Pool %s to %d" , n .id , targetSize )
99+ actualDelta := delta
100+
101+ klog .V (4 ).Infof ("Scaling Instance Pool %s to %d" , n .id , desiredTargetSize )
99102
100103 n .clusterUpdateMutex .Lock ()
101104 defer n .clusterUpdateMutex .Unlock ()
@@ -108,25 +111,43 @@ func (n *hetznerNodeGroup) IncreaseSize(delta int) error {
108111 return fmt .Errorf ("server type %s not available in region %s" , n .instanceType , n .region )
109112 }
110113
114+ defer func () {
115+ // create new servers cache
116+ if _ , err := n .manager .cachedServers .servers (); err != nil {
117+ klog .Errorf ("failed to update servers cache: %v" , err )
118+ }
119+
120+ // Update target size
121+ n .resetTargetSize (actualDelta )
122+ }()
123+
124+ // There is no "Server Group" in Hetzner Cloud, we need to create every
125+ // server manually. This operation might fail for some of the servers
126+ // because of quotas, rate limiting or server type availability. We need to
127+ // collect the errors and inform cluster-autoscaler about this, so it can
128+ // try other node groups if configured.
111129 waitGroup := sync.WaitGroup {}
130+ errsCh := make (chan error , delta )
112131 for i := 0 ; i < delta ; i ++ {
113132 waitGroup .Add (1 )
114133 go func () {
115134 defer waitGroup .Done ()
116135 err := createServer (n )
117136 if err != nil {
118- targetSize --
119- klog . Errorf ( "failed to create error: %v" , err )
137+ actualDelta --
138+ errsCh <- err
120139 }
121140 }()
122141 }
123142 waitGroup .Wait ()
143+ close (errsCh )
124144
125- n .targetSize = targetSize
126-
127- // create new servers cache
128- if _ , err := n .manager .cachedServers .servers (); err != nil {
129- klog .Errorf ("failed to get servers: %v" , err )
145+ errs := make ([]error , 0 , delta )
146+ for err = range errsCh {
147+ errs = append (errs , err )
148+ }
149+ if len (errs ) > 0 {
150+ return fmt .Errorf ("failed to create all servers: %w" , errors .Join (errs ... ))
130151 }
131152
132153 return nil
@@ -140,34 +161,50 @@ func (n *hetznerNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
140161 n .clusterUpdateMutex .Lock ()
141162 defer n .clusterUpdateMutex .Unlock ()
142163
143- targetSize := n .targetSize - len (nodes )
164+ delta := len (nodes )
165+
166+ targetSize := n .targetSize - delta
144167 if targetSize < n .MinSize () {
145168 return fmt .Errorf ("size decrease is too large. current: %d desired: %d min: %d" , n .targetSize , targetSize , n .MinSize ())
146169 }
147170
148- waitGroup := sync.WaitGroup {}
171+ actualDelta := delta
172+
173+ defer func () {
174+ // create new servers cache
175+ if _ , err := n .manager .cachedServers .servers (); err != nil {
176+ klog .Errorf ("failed to update servers cache: %v" , err )
177+ }
178+
179+ n .resetTargetSize (- actualDelta )
180+ }()
149181
182+ waitGroup := sync.WaitGroup {}
183+ errsCh := make (chan error , len (nodes ))
150184 for _ , node := range nodes {
151185 waitGroup .Add (1 )
152186 go func (node * apiv1.Node ) {
153187 klog .Infof ("Evicting server %s" , node .Name )
154188
155189 err := n .manager .deleteByNode (node )
156190 if err != nil {
157- klog .Errorf ("failed to delete server ID %d error: %v" , node .Name , err )
191+ actualDelta --
192+ errsCh <- fmt .Errorf ("failed to delete server for node %q: %w" , node .Name , err )
158193 }
159194
160195 waitGroup .Done ()
161196 }(node )
162197 }
163198 waitGroup .Wait ()
199+ close (errsCh )
164200
165- // create new servers cache
166- if _ , err := n .manager .cachedServers .servers (); err != nil {
167- klog .Errorf ("failed to get servers: %v" , err )
201+ errs := make ([]error , 0 , len (nodes ))
202+ for err := range errsCh {
203+ errs = append (errs , err )
204+ }
205+ if len (errs ) > 0 {
206+ return fmt .Errorf ("failed to delete all nodes: %w" , errors .Join (errs ... ))
168207 }
169-
170- n .resetTargetSize (- len (nodes ))
171208
172209 return nil
173210}
@@ -519,8 +556,8 @@ func waitForServerAction(m *hetznerManager, serverName string, action *hcloud.Ac
519556func (n * hetznerNodeGroup ) resetTargetSize (expectedDelta int ) {
520557 servers , err := n .manager .allServers (n .id )
521558 if err != nil {
522- klog .Errorf ("failed to set node pool %s size, using delta %d error: %v" , n .id , expectedDelta , err )
523- n .targetSize = n .targetSize - expectedDelta
559+ klog .Warningf ("failed to set node pool %s size, using delta %d error: %v" , n .id , expectedDelta , err )
560+ n .targetSize = n .targetSize + expectedDelta
524561 } else {
525562 klog .Infof ("Set node group %s size from %d to %d, expected delta %d" , n .id , n .targetSize , len (servers ), expectedDelta )
526563 n .targetSize = len (servers )
0 commit comments