Skip to content

Commit

Permalink
on stop timeout, kill then wait for services
Browse files Browse the repository at this point in the history
Also improve the handling of services that fail to stop
by starting them again.
  • Loading branch information
muhamadazmy committed Dec 18, 2023
1 parent fc3562a commit 42a3d58
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 6 deletions.
12 changes: 11 additions & 1 deletion pkg/upgrade/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -379,17 +379,27 @@ func (u *Upgrader) ensureRestarted(service ...string) error {

log.Debug().Strs("services", service).Msg("restarting services")
if err := u.zinit.StopMultiple(20*time.Second, service...); err != nil {
return err
// we only log the error here (instead of returning) so we don't leave the node
// in a bad state: we still try to start as many services as we can
log.Error().Err(err).Msg("failed to stop all services")
}

for _, name := range service {
log.Info().Str("service", name).Msg("starting service")
if err := u.zinit.Forget(name); err != nil {
log.Warn().Err(err).Str("service", name).Msg("could not forget service")
}

if err := u.zinit.Monitor(name); err != nil && err != zinit.ErrAlreadyMonitored {
log.Error().Err(err).Str("service", name).Msg("could not monitor service")
}

// this has no effect if Monitor already worked with no issue
// but we do it anyway for services that could not be forgotten (did not stop)
// so we start them again
if err := u.zinit.Start(name); err != nil {
log.Error().Err(err).Str("service", name).Msg("could not start service")
}
}

return nil
Expand Down
16 changes: 11 additions & 5 deletions pkg/zinit/commands.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package zinit

import (
"context"
"fmt"
"os"
"os/exec"
Expand Down Expand Up @@ -496,8 +497,10 @@ func (c *Client) StopMultiple(timeout time.Duration, service ...string) error {
services[name] = struct{}{}
}

deadline := time.After(timeout)
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()

killCount := 0
for len(services) > 0 {
var stopped []string
for service := range services {
Expand Down Expand Up @@ -531,17 +534,20 @@ func (c *Client) StopMultiple(timeout time.Duration, service ...string) error {
}

select {
case <-deadline:
case <-ctx.Done():
for service := range services {
log.Warn().Str("service", service).Msg("service didn't stop in time. use SIGKILL")
if err := c.Kill(service, SIGKILL); err != nil {
log.Error().Err(err).Msgf("failed to send SIGKILL to service %s", service)
}
}
// after a kill we wait 1 second to make sure
// services are really dead before we move on
// we do kill -9 only 10 times before we give up
killCount += 1
if killCount == 10 {
return fmt.Errorf("not all services are dead in time")
}
// we wait 1 second between each kill
<-time.After(1 * time.Second)
return nil
case <-time.After(1 * time.Second):
}
}
Expand Down

0 comments on commit 42a3d58

Please sign in to comment.