From ded4029c32613dcd9e03d524f5d9007928b0a566 Mon Sep 17 00:00:00 2001 From: Muhamad Awad Date: Mon, 18 Dec 2023 11:58:29 +0100 Subject: [PATCH] on stop timeout, kill then wait for services (#2169) also improve on the handling of services not stopping by starting them over --- pkg/upgrade/upgrade.go | 12 +++++++++++- pkg/zinit/commands.go | 16 +++++++++++----- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/pkg/upgrade/upgrade.go b/pkg/upgrade/upgrade.go index ba55b0f0a..48ebeac2d 100644 --- a/pkg/upgrade/upgrade.go +++ b/pkg/upgrade/upgrade.go @@ -379,10 +379,13 @@ func (u *Upgrader) ensureRestarted(service ...string) error { log.Debug().Strs("services", service).Msg("restarting services") if err := u.zinit.StopMultiple(20*time.Second, service...); err != nil { - return err + // we log here so we don't leave the node in a bad state! + // by just trying to start as much services as we can + log.Error().Err(err).Msg("failed to stop all services") } for _, name := range service { + log.Info().Str("service", name).Msg("starting service") if err := u.zinit.Forget(name); err != nil { log.Warn().Err(err).Str("service", name).Msg("could not forget service") } @@ -390,6 +393,13 @@ func (u *Upgrader) ensureRestarted(service ...string) error { if err := u.zinit.Monitor(name); err != nil && err != zinit.ErrAlreadyMonitored { log.Error().Err(err).Str("service", name).Msg("could not monitor service") } + + // this has no effect if Monitor already worked with no issue + // but we do it anyway for services that could not be forgotten (did not stop) + // so we start them again + if err := u.zinit.Start(name); err != nil { + log.Error().Err(err).Str("service", name).Msg("could not start service") + } } return nil diff --git a/pkg/zinit/commands.go b/pkg/zinit/commands.go index ab9b0c911..6efd3847c 100644 --- a/pkg/zinit/commands.go +++ b/pkg/zinit/commands.go @@ -1,6 +1,7 @@ package zinit import ( + "context" "fmt" "os" "os/exec" @@ -496,8 +497,10 @@ func (c *Client) StopMultiple(timeout time.Duration, service ...string) error { services[name] = struct{}{} } - deadline := time.After(timeout) + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + killCount := 0 for len(services) > 0 { var stopped []string for service := range services { @@ -531,17 +534,20 @@ func (c *Client) StopMultiple(timeout time.Duration, service ...string) error { } select { - case <-deadline: + case <-ctx.Done(): for service := range services { log.Warn().Str("service", service).Msg("service didn't stop in time. use SIGKILL") if err := c.Kill(service, SIGKILL); err != nil { log.Error().Err(err).Msgf("failed to send SIGKILL to service %s", service) } } - // after a kill we wait 1 second to make sure - // services are really dead before we move on + // we do kill -9 only 10 times before we give up + killCount += 1 + if killCount == 10 { + return fmt.Errorf("not all services are dead in time") + } + // we wait 1 second between each kill <-time.After(1 * time.Second) - return nil case <-time.After(1 * time.Second): } }