Skip to content

Commit

Permalink
chore: Add pod metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
jigisha620 committed Oct 7, 2024
1 parent 7e01f70 commit f521e80
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 11 deletions.
96 changes: 85 additions & 11 deletions pkg/controllers/metrics/pod/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/samber/lo"
Expand All @@ -40,7 +41,7 @@ import (

const (
podName = "name"
podNameSpace = "namespace"
podNamespace = "namespace"
ownerSelfLink = "owner"
podHostName = "node"
podNodePool = "nodepool"
Expand Down Expand Up @@ -71,24 +72,52 @@ var (
Objectives: metrics.SummaryObjectives(),
},
)
podBoundDurationSeconds = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "karpenter",
Subsystem: metrics.PodSubsystem,
Name: "bound_duration_seconds",
Help: "The time from pod creation until the pod is bound.",
Buckets: metrics.DurationBuckets(),
},
)
podCurrentUnboundTimeSeconds = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: metrics.PodSubsystem,
Name: "current_unbound_time_seconds",
Help: "The time from pod creation until the pod is bound.",
},
[]string{podName, podNamespace},
)
podUnstartedTimeSeconds = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "karpenter",
Subsystem: metrics.PodSubsystem,
Name: "unstarted_time_seconds",
Help: "The time from pod creation until the pod is running.",
},
[]string{podName, podNamespace},
)
)

// Controller for the resource
type Controller struct {
kubeClient client.Client
metricStore *metrics.Store

pendingPods sets.Set[string]
pendingPods sets.Set[string]
unscheduledPods sets.Set[string]
}

func init() {
crmetrics.Registry.MustRegister(podState, podStartupDurationSeconds)
crmetrics.Registry.MustRegister(podState, podStartupDurationSeconds, podBoundDurationSeconds, podCurrentUnboundTimeSeconds, podUnstartedTimeSeconds)
}

func labelNames() []string {
return []string{
podName,
podNameSpace,
podNamespace,
ownerSelfLink,
podHostName,
podNodePool,
Expand All @@ -103,9 +132,10 @@ func labelNames() []string {
// NewController constructs a podController instance
func NewController(kubeClient client.Client) *Controller {
return &Controller{
kubeClient: kubeClient,
metricStore: metrics.NewStore(),
pendingPods: sets.New[string](),
kubeClient: kubeClient,
metricStore: metrics.NewStore(),
pendingPods: sets.New[string](),
unscheduledPods: sets.New[string](),
}
}

Expand All @@ -117,6 +147,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco
if err := c.kubeClient.Get(ctx, req.NamespacedName, pod); err != nil {
if errors.IsNotFound(err) {
c.pendingPods.Delete(req.NamespacedName.String())
c.unscheduledPods.Delete(req.NamespacedName.String())
c.metricStore.Delete(req.NamespacedName.String())
}
return reconcile.Result{}, client.IgnoreNotFound(err)
Expand All @@ -133,29 +164,72 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco
},
})
c.recordPodStartupMetric(pod)
c.recordPodBoundMetric(pod)
return reconcile.Result{}, nil
}

func (c *Controller) recordPodStartupMetric(pod *corev1.Pod) {
key := client.ObjectKeyFromObject(pod).String()
if pod.Status.Phase == phasePending {
podUnstartedTimeSeconds.With(map[string]string{
podName: pod.Name,
podNamespace: pod.Namespace,
}).Set(time.Since(pod.CreationTimestamp.Time).Seconds())
c.pendingPods.Insert(key)
return
}
cond, ok := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool {
return c.Type == corev1.PodReady
})
if c.pendingPods.Has(key) && ok {
podStartupDurationSeconds.Observe(cond.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds())
c.pendingPods.Delete(key)
if c.pendingPods.Has(key) {
if !ok || cond.Status != corev1.ConditionTrue {
podUnstartedTimeSeconds.With(map[string]string{
podName: pod.Name,
podNamespace: pod.Namespace,
}).Set(time.Since(pod.CreationTimestamp.Time).Seconds())
} else {
// Delete the unstarted metric since the pod is now started
podUnstartedTimeSeconds.Delete(map[string]string{
podName: pod.Name,
podNamespace: pod.Namespace,
})
podStartupDurationSeconds.Observe(cond.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds())
c.pendingPods.Delete(key)
}
}
}
func (c *Controller) recordPodBoundMetric(pod *corev1.Pod) {
key := client.ObjectKeyFromObject(pod).String()
condScheduled, ok := lo.Find(pod.Status.Conditions, func(c corev1.PodCondition) bool {
return c.Type == corev1.PodScheduled
})
if pod.Status.Phase == phasePending {
// If the podScheduled condition does not exist, or it exists and is not set to true, we emit pod_current_unbound_time_seconds metric.
if !ok || condScheduled.Status != corev1.ConditionTrue {
podCurrentUnboundTimeSeconds.With(map[string]string{
podName: pod.Name,
podNamespace: pod.Namespace,
}).Set(time.Since(pod.CreationTimestamp.Time).Seconds())
}
c.unscheduledPods.Insert(key)
return
}
if c.unscheduledPods.Has(key) && ok && condScheduled.Status == corev1.ConditionTrue {
// Delete the unbound metric since the pod is now bound
podCurrentUnboundTimeSeconds.Delete(map[string]string{
podName: pod.Name,
podNamespace: pod.Namespace,
})
podBoundDurationSeconds.Observe(condScheduled.LastTransitionTime.Sub(pod.CreationTimestamp.Time).Seconds())
c.unscheduledPods.Delete(key)
}
}

// makeLabels creates the makeLabels using the current state of the pod
func (c *Controller) makeLabels(ctx context.Context, pod *corev1.Pod) (prometheus.Labels, error) {
metricLabels := prometheus.Labels{}
metricLabels[podName] = pod.Name
metricLabels[podNameSpace] = pod.Namespace
metricLabels[podNamespace] = pod.Namespace
// Selflink has been deprecated after v.1.20
// Manually generate the selflink for the first owner reference
// Currently we do not support multiple owner references
Expand Down
93 changes: 93 additions & 0 deletions pkg/controllers/metrics/pod/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"context"
"testing"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -84,6 +86,97 @@ var _ = Describe("Pod Metrics", func() {
})
Expect(found).To(BeTrue())
})
It("should update the pod bound and unbound time metrics", func() {
p := test.Pod()
p.Status.Phase = corev1.PodPending

// PodScheduled condition does not exist, emit pods_current_unbound_time_seconds metric
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will add pod to pending pods and unscheduled pods set
_, found := FindMetricWithLabelValues("karpenter_pods_current_unbound_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeTrue())

p.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodScheduled, Status: corev1.ConditionUnknown, LastTransitionTime: metav1.Now()}}
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will add pod to pending pods and unscheduled pods set
metric, found := FindMetricWithLabelValues("karpenter_pods_current_unbound_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeTrue())
unboundTime := metric.GetGauge().Value

// Pod is still pending but has bound. At this step pods_unbound_duration should not change.
p.Status.Phase = corev1.PodPending
p.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodScheduled, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}}
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled or not
metric, found = FindMetricWithLabelValues("karpenter_pods_current_unbound_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeTrue())
Expect(metric.GetGauge().Value).To(Equal(unboundTime))

// Pod is still running and has bound. At this step pods_bound_duration should be fired and pods_current_unbound_time_seconds should be deleted
p.Status.Phase = corev1.PodRunning
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled or not
_, found = FindMetricWithLabelValues("karpenter_pods_current_unbound_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeFalse())
_, found = FindMetricWithLabelValues("karpenter_pods_bound_duration_seconds", map[string]string{})
Expect(found).To(BeTrue())
})
It("should update the pod startup and unstarted time metrics", func() {
p := test.Pod()
p.Status.Phase = corev1.PodPending
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will add pod to pending pods and unscheduled pods set
_, found := FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeTrue())

// Pod is now running but readiness condition is not set
p.Status.Phase = corev1.PodRunning
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled or not
_, found = FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeTrue())

// Pod is now running but readiness is unknown
p.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionUnknown, LastTransitionTime: metav1.Now()}}
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled or not
_, found = FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeTrue())

// Pod is now running and ready. At this step pods_startup_duration should be fired and pods_unstarted_time should be deleted
p.Status.Phase = corev1.PodRunning
p.Status.Conditions = []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now()}}
ExpectApplied(ctx, env.Client, p)
ExpectReconcileSucceeded(ctx, podController, client.ObjectKeyFromObject(p)) //This will check if the pod was scheduled or not
_, found = FindMetricWithLabelValues("karpenter_pods_unstarted_time_seconds", map[string]string{
"name": p.GetName(),
"namespace": p.GetNamespace(),
})
Expect(found).To(BeFalse())
_, found = FindMetricWithLabelValues("karpenter_pods_startup_duration_seconds", nil)
Expect(found).To(BeTrue())
})
It("should delete the pod state metric on pod delete", func() {
p := test.Pod()
ExpectApplied(ctx, env.Client, p)
Expand Down

0 comments on commit f521e80

Please sign in to comment.