Skip to content

Commit

Permalink
on stop timeout, kill then wait for services (#2169)
Browse files Browse the repository at this point in the history
also improve the handling of services that fail to stop
by starting them again
  • Loading branch information
muhamadazmy authored Dec 18, 2023
1 parent fc3562a commit ded4029
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 6 deletions.
12 changes: 11 additions & 1 deletion pkg/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -379,17 +379,27 @@ func (u *Upgrader) ensureRestarted(service ...string) error {

log.Debug().Strs("services", service).Msg("restarting services")
if err := u.zinit.StopMultiple(20*time.Second, service...); err != nil {
return err
// we log here so we don't leave the node in a bad state!
// by just trying to start as much services as we can
log.Error().Err(err).Msg("failed to stop all services")
}

for _, name := range service {
log.Info().Str("service", name).Msg("starting service")
if err := u.zinit.Forget(name); err != nil {
log.Warn().Err(err).Str("service", name).Msg("could not forget service")
}

if err := u.zinit.Monitor(name); err != nil && err != zinit.ErrAlreadyMonitored {
log.Error().Err(err).Str("service", name).Msg("could not monitor service")
}

// this has no effect if Monitor already worked with no issue
// but we do it anyway for services that could not be forgotten (did not stop)
// so we start them again
if err := u.zinit.Start(name); err != nil {
log.Error().Err(err).Str("service", name).Msg("could not start service")
}
}

return nil
Expand Down
16 changes: 11 additions & 5 deletions pkg/zinit/commands.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package zinit

import (
"context"
"fmt"
"os"
"os/exec"
Expand Down Expand Up @@ -496,8 +497,10 @@ func (c *Client) StopMultiple(timeout time.Duration, service ...string) error {
services[name] = struct{}{}
}

deadline := time.After(timeout)
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()

killCount := 0
for len(services) > 0 {
var stopped []string
for service := range services {
Expand Down Expand Up @@ -531,17 +534,20 @@ func (c *Client) StopMultiple(timeout time.Duration, service ...string) error {
}

select {
case <-deadline:
case <-ctx.Done():
for service := range services {
log.Warn().Str("service", service).Msg("service didn't stop in time. use SIGKILL")
if err := c.Kill(service, SIGKILL); err != nil {
log.Error().Err(err).Msgf("failed to send SIGKILL to service %s", service)
}
}
// after a kill we wait 1 second to make sure
// services are really dead before we move on
// we do kill -9 only 10 times before we give up
killCount += 1
if killCount == 10 {
return fmt.Errorf("not all services are dead in time")
}
// we wait 1 second between each kill
<-time.After(1 * time.Second)
return nil
case <-time.After(1 * time.Second):
}
}
Expand Down

0 comments on commit ded4029

Please sign in to comment.