Skip to content

Commit

Permalink
Fix for health controller race condition marking healthy pods as unhe…
Browse files Browse the repository at this point in the history
…althy (googleforgames#2554)

The HealthController processes events asynchronously and is rate limited.
It is possible for pods to not be unhealthy anymore after some time
spent in the queue being rate limited. This check ensures the pod is
still unhealthy when processing the event.
  • Loading branch information
Thiryn authored Apr 28, 2022
1 parent a216e4d commit f9c333d
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 22 deletions.
45 changes: 27 additions & 18 deletions pkg/gameservers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,27 @@ func (hc *HealthController) syncGameServer(ctx context.Context, key string) erro
return nil
}

if skip, err := hc.skipUnhealthy(gs); err != nil || skip {
return err
// retrieve the pod for the gameserver
pod, err := hc.podLister.Pods(gs.ObjectMeta.Namespace).Get(gs.ObjectMeta.Name)
if err != nil {
if !k8serrors.IsNotFound(err) {
// If the pod exists but there is an error, go back into the queue.
return errors.Wrapf(err, "error retrieving Pod %s for GameServer to check status", gs.ObjectMeta.Name)
}
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).Debug("Could not find Pod")
}

// Make sure that the pod has to be marked unhealthy
if pod != nil {
if skip, err := hc.skipUnhealthyGameContainer(gs, pod); err != nil || skip {
return err
}

// If the pod is not unhealthy anymore, go back in the queue
if !hc.isUnhealthy(pod) {
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("podStatus", pod.Status).Debug("GameServer is not unhealthy anymore")
return nil
}
}

hc.loggerForGameServer(gs).Debug("Issue with GameServer pod, marking as GameServerStateUnhealthy")
Expand All @@ -215,24 +234,14 @@ func (hc *HealthController) syncGameServer(ctx context.Context, key string) erro
return nil
}

// skipUnhealthy determines if it's appropriate to not move to Unhealthy when a Pod's
// skipUnhealthyGameContainer determines if it's appropriate to not move to Unhealthy when a Pod's
// gameserver container has crashed, or let it restart as per usual K8s operations.
// It does this by checking a combination of the current GameServer state and annotation data that stores
// which container instance was live if the GameServer has been marked as Ready.
// The logic is as follows:
// - If the GameServer is not yet Ready, allow to restart (return true)
// - If the GameServer is in a state past Ready, move to Unhealthy
func (hc *HealthController) skipUnhealthy(gs *agonesv1.GameServer) (bool, error) {
pod, err := hc.podLister.Pods(gs.ObjectMeta.Namespace).Get(gs.ObjectMeta.Name)
if err != nil {
// Pod doesn't exist, so the GameServer is definitely not healthy
if k8serrors.IsNotFound(err) {
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).Debug("skipUnhealthy: Could not find Pod")
return false, nil
}
// if it's something else, go back into the queue
return false, errors.Wrapf(err, "error retrieving Pod %s for GameServer to check status", gs.ObjectMeta.Name)
}
func (hc *HealthController) skipUnhealthyGameContainer(gs *agonesv1.GameServer, pod *corev1.Pod) (bool, error) {
if !metav1.IsControlledBy(pod, gs) {
// This is not the Pod we are looking for 🤖
return false, nil
Expand All @@ -248,7 +257,7 @@ func (hc *HealthController) skipUnhealthy(gs *agonesv1.GameServer) (bool, error)
}

if gs.IsBeforeReady() {
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("state", gs.Status.State).Debug("skipUnhealthy: Is Before Ready. Checking failed container")
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("state", gs.Status.State).Debug("skipUnhealthyGameContainer: Is Before Ready. Checking failed container")
// If the reason for failure was a container failure, then we can skip moving to Unhealthy.
// otherwise, we know it was one of the other reasons (eviction, lack of ports), so we should definitely go to Unhealthy.
return hc.failedContainer(pod), nil
Expand All @@ -258,7 +267,7 @@ func (hc *HealthController) skipUnhealthy(gs *agonesv1.GameServer) (bool, error)
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == gs.Spec.Container {
if cs.State.Terminated != nil {
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("podStatus", pod.Status).Debug("skipUnhealthy: Container is terminated, returning false")
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Container is terminated, returning false")
return false, nil
}
if cs.LastTerminationState.Terminated != nil {
Expand All @@ -267,14 +276,14 @@ func (hc *HealthController) skipUnhealthy(gs *agonesv1.GameServer) (bool, error)
// shouldn't move to Unhealthy.
check := cs.ContainerID == gsReadyContainerID
if !check {
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthy: Container crashed after Ready, returning false")
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Container crashed after Ready, returning false")
}
return check, nil
}
break
}
}

hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthy: Should not reach here")
hc.baseLogger.WithField("gs", gs.ObjectMeta.Name).WithField("gsMeta", gs.ObjectMeta).WithField("podStatus", pod.Status).Debug("skipUnhealthyGameContainer: Game Container has not crashed, game container may be healthy")
return false, nil
}
24 changes: 20 additions & 4 deletions pkg/gameservers/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func TestHealthUnschedulableWithNoFreePorts(t *testing.T) {
assert.False(t, hc.unschedulableWithNoFreePorts(pod))
}

func TestHealthControllerSkipUnhealthy(t *testing.T) {
func TestHealthControllerSkipUnhealthyGameContainer(t *testing.T) {
t.Parallel()

type expected struct {
Expand Down Expand Up @@ -191,10 +191,8 @@ func TestHealthControllerSkipUnhealthy(t *testing.T) {
return true, &corev1.PodList{Items: []corev1.Pod{*pod}}, nil
})

_, cancel := agtesting.StartInformers(m, hc.podSynced)
defer cancel()
result, err := hc.skipUnhealthyGameContainer(gs, pod)

result, err := hc.skipUnhealthy(gs)
if len(v.expected.err) > 0 {
require.EqualError(t, err, v.expected.err)
} else {
Expand Down Expand Up @@ -258,6 +256,24 @@ func TestHealthControllerSyncGameServer(t *testing.T) {
{Name: "container", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{}}}}},
expected: expected{updated: true},
},
"container recovered and starting after queueing": {
state: agonesv1.GameServerStateStarting,
podStatus: &corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{
{Name: "container", State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{}}}}},
expected: expected{updated: false},
},
"container recovered and ready after queueing": {
state: agonesv1.GameServerStateReady,
podStatus: &corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{
{Name: "container", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}}}},
expected: expected{updated: false},
},
"container recovered and allocated after queueing": {
state: agonesv1.GameServerStateAllocated,
podStatus: &corev1.PodStatus{ContainerStatuses: []corev1.ContainerStatus{
{Name: "container", State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}}}}},
expected: expected{updated: false},
},
}

for name, test := range fixtures {
Expand Down

0 comments on commit f9c333d

Please sign in to comment.