From 4dcadb410cdb4192432ce8d8da4c8d1df6398c51 Mon Sep 17 00:00:00 2001
From: Ryan Leung
Date: Fri, 7 Feb 2025 16:39:48 +0800
Subject: [PATCH] add metrics for the internal pd client calls

Signed-off-by: Ryan Leung
---
 client/metrics/metrics.go                    | 42 ++++++++++++++++++--
 client/servicediscovery/service_discovery.go |  9 +++++
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/client/metrics/metrics.go b/client/metrics/metrics.go
index da7637b19be..9e1981446ee 100644
--- a/client/metrics/metrics.go
+++ b/client/metrics/metrics.go
@@ -64,9 +64,11 @@ func InitAndRegisterMetrics(constLabels prometheus.Labels) {
 }
 
 var (
-    cmdDuration       *prometheus.HistogramVec
-    cmdFailedDuration *prometheus.HistogramVec
-    requestDuration   *prometheus.HistogramVec
+    cmdDuration               *prometheus.HistogramVec
+    cmdFailedDuration         *prometheus.HistogramVec
+    internalCmdDuration       *prometheus.HistogramVec
+    internalCmdFailedDuration *prometheus.HistogramVec
+    requestDuration           *prometheus.HistogramVec
 
     // TSOBestBatchSize is the histogram of the best batch size of TSO requests.
     TSOBestBatchSize prometheus.Histogram
@@ -105,6 +107,26 @@ func initMetrics(constLabels prometheus.Labels) {
             Buckets:     prometheus.ExponentialBuckets(0.0005, 2, 13),
         }, []string{"type"})
 
+    internalCmdDuration = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Namespace:   "pd_client",
+            Subsystem:   "internal_cmd",
+            Name:        "handle_cmds_duration_seconds",
+            Help:        "Bucketed histogram of processing time (s) of handled success internal cmds.",
+            ConstLabels: constLabels,
+            Buckets:     prometheus.ExponentialBuckets(0.0005, 2, 13),
+        }, []string{"type"})
+
+    internalCmdFailedDuration = prometheus.NewHistogramVec(
+        prometheus.HistogramOpts{
+            Namespace:   "pd_client",
+            Subsystem:   "internal_cmd",
+            Name:        "handle_failed_cmds_duration_seconds",
+            Help:        "Bucketed histogram of processing time (s) of failed handled internal cmds.",
+            ConstLabels: constLabels,
+            Buckets:     prometheus.ExponentialBuckets(0.0005, 2, 13),
+        }, []string{"type"})
+
     requestDuration = prometheus.NewHistogramVec(
         prometheus.HistogramOpts{
             Namespace:   "pd_client",
@@ -228,6 +250,12 @@ var (
     CmdFailedDurationUpdateGCSafePointV2      prometheus.Observer
     CmdFailedDurationUpdateServiceSafePointV2 prometheus.Observer
 
+    InternalCmdDurationGetClusterInfo prometheus.Observer
+    InternalCmdDurationGetMembers     prometheus.Observer
+
+    InternalCmdFailedDurationGetClusterInfo prometheus.Observer
+    InternalCmdFailedDurationGetMembers     prometheus.Observer
+
     // RequestDurationTSO records the durations of the successful TSO requests.
     RequestDurationTSO prometheus.Observer
     // RequestFailedDurationTSO records the durations of the failed TSO requests.
@@ -281,6 +309,12 @@ func initCmdDurations() {
     CmdFailedDurationUpdateGCSafePointV2 = cmdFailedDuration.WithLabelValues("update_gc_safe_point_v2")
     CmdFailedDurationUpdateServiceSafePointV2 = cmdFailedDuration.WithLabelValues("update_service_safe_point_v2")
 
+    InternalCmdDurationGetClusterInfo = internalCmdDuration.WithLabelValues("get_cluster_info")
+    InternalCmdDurationGetMembers = internalCmdDuration.WithLabelValues("get_members")
+
+    InternalCmdFailedDurationGetClusterInfo = internalCmdFailedDuration.WithLabelValues("get_cluster_info")
+    InternalCmdFailedDurationGetMembers = internalCmdFailedDuration.WithLabelValues("get_members")
+
     RequestDurationTSO = requestDuration.WithLabelValues("tso")
     RequestFailedDurationTSO = requestDuration.WithLabelValues("tso-failed")
 }
@@ -288,6 +322,8 @@ func initCmdDurations() {
 func registerMetrics() {
     prometheus.MustRegister(cmdDuration)
     prometheus.MustRegister(cmdFailedDuration)
+    prometheus.MustRegister(internalCmdDuration)
+    prometheus.MustRegister(internalCmdFailedDuration)
     prometheus.MustRegister(requestDuration)
     prometheus.MustRegister(TSOBestBatchSize)
     prometheus.MustRegister(TSOBatchSize)
diff --git a/client/servicediscovery/service_discovery.go b/client/servicediscovery/service_discovery.go
index 146a08aa381..e26c081cea7 100644
--- a/client/servicediscovery/service_discovery.go
+++ b/client/servicediscovery/service_discovery.go
@@ -38,6 +38,7 @@ import (
 
     "github.com/tikv/pd/client/constants"
     "github.com/tikv/pd/client/errs"
+    "github.com/tikv/pd/client/metrics"
     "github.com/tikv/pd/client/opt"
     "github.com/tikv/pd/client/pkg/retry"
     "github.com/tikv/pd/client/pkg/utils/grpcutil"
@@ -909,12 +910,16 @@ func (c *serviceDiscovery) getClusterInfo(ctx context.Context, url string, timeo
     if err != nil {
         return nil, err
     }
+    start := time.Now()
+    defer func() { metrics.InternalCmdDurationGetClusterInfo.Observe(time.Since(start).Seconds()) }()
     clusterInfo, err := pdpb.NewPDClient(cc).GetClusterInfo(ctx, &pdpb.GetClusterInfoRequest{})
     if err != nil {
+        metrics.InternalCmdFailedDurationGetClusterInfo.Observe(time.Since(start).Seconds())
         attachErr := errors.Errorf("error:%s target:%s status:%s", err, cc.Target(), cc.GetState().String())
         return nil, errs.ErrClientGetClusterInfo.Wrap(attachErr).GenWithStackByCause()
     }
     if clusterInfo.GetHeader().GetError() != nil {
+        metrics.InternalCmdFailedDurationGetClusterInfo.Observe(time.Since(start).Seconds())
         attachErr := errors.Errorf("error:%s target:%s status:%s", clusterInfo.GetHeader().GetError().String(), cc.Target(), cc.GetState().String())
         return nil, errs.ErrClientGetClusterInfo.Wrap(attachErr).GenWithStackByCause()
     }
@@ -928,12 +933,16 @@ func (c *serviceDiscovery) getMembers(ctx context.Context, url string, timeout t
     if err != nil {
         return nil, err
     }
+    start := time.Now()
+    defer func() { metrics.InternalCmdDurationGetMembers.Observe(time.Since(start).Seconds()) }()
     members, err := pdpb.NewPDClient(cc).GetMembers(ctx, &pdpb.GetMembersRequest{})
     if err != nil {
+        metrics.InternalCmdFailedDurationGetMembers.Observe(time.Since(start).Seconds())
         attachErr := errors.Errorf("error:%s target:%s status:%s", err, cc.Target(), cc.GetState().String())
         return nil, errs.ErrClientGetMember.Wrap(attachErr).GenWithStackByCause()
     }
     if members.GetHeader().GetError() != nil {
+        metrics.InternalCmdFailedDurationGetMembers.Observe(time.Since(start).Seconds())
         attachErr := errors.Errorf("error:%s target:%s status:%s", members.GetHeader().GetError().String(), cc.Target(), cc.GetState().String())
         return nil, errs.ErrClientGetMember.Wrap(attachErr).GenWithStackByCause()
     }
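
As a reference for how the new observers are driven, below is a minimal, self-contained sketch of the instrumentation pattern this patch applies in service_discovery.go: the overall histogram is observed in a deferred call so it fires on every return path, while the failure histogram is observed only on the error branches. The package layout, the example_client namespace, and the getMembers wrapper with its call parameter are illustrative placeholders, not identifiers from the patch.

package main

import (
    "errors"
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

// Illustrative histograms mirroring the two added in the patch: one records
// every internal call's duration, the other records only failed calls.
var (
    internalCmdDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Namespace: "example_client",
            Subsystem: "internal_cmd",
            Name:      "handle_cmds_duration_seconds",
            Help:      "Duration of internal cmds.",
            Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
        }, []string{"type"})

    internalCmdFailedDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Namespace: "example_client",
            Subsystem: "internal_cmd",
            Name:      "handle_failed_cmds_duration_seconds",
            Help:      "Duration of failed internal cmds.",
            Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
        }, []string{"type"})

    durationGetMembers       = internalCmdDuration.WithLabelValues("get_members")
    failedDurationGetMembers = internalCmdFailedDuration.WithLabelValues("get_members")
)

// getMembers stands in for the real RPC; it only shows where the success and
// failure observers fire relative to the call.
func getMembers(call func() error) error {
    start := time.Now()
    // The deferred observation covers every return path, so the overall
    // histogram includes failed calls as well.
    defer func() { durationGetMembers.Observe(time.Since(start).Seconds()) }()
    if err := call(); err != nil {
        // The failure histogram only sees calls that returned an error.
        failedDurationGetMembers.Observe(time.Since(start).Seconds())
        return err
    }
    return nil
}

func main() {
    prometheus.MustRegister(internalCmdDuration, internalCmdFailedDuration)
    _ = getMembers(func() error { return nil })
    _ = getMembers(func() error { return errors.New("rpc failed") })
}

Note that, as in the patch, the deferred observation of the overall histogram also fires when the call fails, so failed calls contribute to both the overall and the failed-duration histograms.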