diff --git a/docs/deployment/observability.md b/docs/deployment/observability.md index ebce6bb30..8f77a7323 100644 --- a/docs/deployment/observability.md +++ b/docs/deployment/observability.md @@ -5,13 +5,17 @@ Envoy Control uses [SLF4J](https://www.slf4j.org/) with [Logback](https://logback.qos.ch/) for logging. To override the default settings, point a file via environment variable + ```bash export ENVOY_CONTROL_RUNNER_OPTS="-Dlogging.config=/path/to/logback/logback.xml" ``` + and then run the `bin/envoy-control-runner` created from `distZip` task. `java-control-plane` produces quite a lot of logging on `INFO` level. Consider switching it to `WARN` + ```xml + ``` @@ -25,13 +29,12 @@ Sample logger configuration is available here. ### Envoy Control -Metric | Description ------------------------------| ----------------------------------- -**services.added** | Counter of added services events -**services.removed** | Counter of removed services events -**services.instanceChanged** | Counter of instance change events +Metric | Description | Labels +----------------------|------------------------------------|-------- +**watched-services** | Counter of watched services events | status (added/removed/instance-changed/snapshot-changed) -Standard [Spring metrics](https://docs.spring.io/spring-boot/docs/current/reference/html/production-ready-metrics.html#production-ready-metrics-meter) (JVM, CPU, HTTP server) are also included. +Standard [Spring metrics](https://docs.spring.io/spring-boot/docs/current/reference/html/production-ready-metrics.html#production-ready-metrics-meter) +(JVM, CPU, HTTP server) are also included. 
### Envoy Control Runner @@ -39,41 +42,24 @@ Envoy Control Runner exposes a set of metrics on standard Spring Actuator's `/ac #### xDS connections -Metric | Description ------------------------------| -------------------------------------------------------- -**grpc.connections.ads** | Number of running gRPC ADS connections -**grpc.connections.cds** | Number of running gRPC CDS connections -**grpc.connections.eds** | Number of running gRPC EDS connections -**grpc.connections.lds** | Number of running gRPC LDS connections -**grpc.connections.rds** | Number of running gRPC RDS connections -**grpc.connections.sds** | Number of running gRPC SDS connections -**grpc.connections.unknown** | Number of running gRPC connections for unknown resource + Metric | Description | Labels +----------------------|----------------------------------------------------|------------------------------------ + **grpc.connections** | Number of running gRPC connections of a given type | type (all/ads/cds/eds/lds/rds/sds/unknown) #### xDS requests -Metric | Description -------------------------------- | -------------------------------------------------------- -**grpc.requests.cds** | Counter of received gRPC CDS requests -**grpc.requests.eds** | Counter of received gRPC EDS requests -**grpc.requests.lds** | Counter of received gRPC LDS requests -**grpc.requests.rds** | Counter of received gRPC RDS requests -**grpc.requests.sds** | Counter of received gRPC SDS requests -**grpc.requests.unknown** | Counter of received gRPC requests for unknown resource -**grpc.requests.cds.delta** | Counter of received gRPC delta CDS requests -**grpc.requests.eds.delta** | Counter of received gRPC delta EDS requests -**grpc.requests.lds.delta** | Counter of received gRPC delta LDS requests -**grpc.requests.rds.delta** | Counter of received gRPC delta RDS requests -**grpc.requests.sds.delta** | Counter of received gRPC delta SDS requests -**grpc.requests.unknown.delta** | Counter of received gRPC delta requests for unknown 
resource + Metric | Description | Labels +-------------------------|---------------------------------------------------|-------------------------------------------------------------- + **grpc.requests.total** | Counter of received gRPC requests of a given type | type (cds/eds/lds/rds/sds/unknown), metric-type (total/delta) #### Snapshot -Metric | Description --------------------------| ---------------------------------- -**cache.groupCount** | Number of unique groups in SnapshotCache + Metric | Description | Labels +------------------------|------------------------------------------|-------- + **cache.groups.count** | Number of unique groups in SnapshotCache | - #### Synchronization -Metric | Description -----------------------------------------| ------------------------------------------------- -**cross-dc-synchronization.$dc.errors** | Counter of synchronization errors for given DC + Metric | Description | Labels +-------------------------------------------|----------------------------------------------------------------|---------------------------------------------- + **cross-dc-synchronization.errors.total** | Counter of synchronization errors for a given DC and operation | cluster, operation (get-instances/get-state) diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt index 8cb6ff604..6afc62fe1 100644 --- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt +++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt @@ -36,9 +36,9 @@ class MetricsDiscoveryServerCallbacks(private val meterRegistry: MeterRegistry) .map { type -> type to AtomicInteger(0) } .toMap() - 
meterRegistry.gauge("grpc.connections", Tags.of("connection-type", "all"), connections) + meterRegistry.gauge("grpc.connections", Tags.of("type", "all"), connections) connectionsByType.forEach { (type, typeConnections) -> - meterRegistry.gauge("grpc.connections", Tags.of("connection-type", type.name.lowercase()), typeConnections) + meterRegistry.gauge("grpc.connections", Tags.of("type", type.name.lowercase()), typeConnections) } } diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt index 34aa618eb..c2aebe6f9 100644 --- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt +++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt @@ -62,8 +62,8 @@ class RemoteServices( .orTimeout(interval, TimeUnit.SECONDS) .exceptionally { meterRegistry.counter( - "cross-dc-synchronization.errors", - Tags.of("cluster", cluster, "operation", "get-cluster-state") + "cross-dc-synchronization.errors.total", + Tags.of("cluster", cluster, "operation", "get-state") ).increment() logger.warn("Error synchronizing instances ${it.message}", it) clusterStateCache[cluster] @@ -76,8 +76,8 @@ class RemoteServices( cluster to instances } catch (e: Exception) { meterRegistry.counter( - "cross-dc-synchronization.errors", - Tags.of("cluster", cluster, "operation", "get-cluster-state") + "cross-dc-synchronization.errors.total", + Tags.of("cluster", cluster, "operation", "get-instances") ).increment() logger.warn("Failed fetching instances from $cluster", e) cluster to emptyList() @@ -89,8 +89,7 @@ class RemoteServices( state: ServicesState ): ClusterState { meterRegistry.counter( - "cross-dc-synchronization", - Tags.of("operation", "service-update", "cluster", cluster) + "cross-dc-synchronization.total", 
Tags.of("cluster", cluster) ) .increment() val clusterState = ClusterState( diff --git a/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt b/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt index a62835a5a..033037043 100644 --- a/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt +++ b/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt @@ -173,14 +173,14 @@ class ControlPlaneConfig { ConsulClient(properties.host, properties.port).agentSelf.value?.config?.datacenter ?: "local" fun controlPlaneMetrics(meterRegistry: MeterRegistry): DefaultEnvoyControlMetrics { - val metricName = "services" + val metricName = "watched-services" return DefaultEnvoyControlMetrics(meterRegistry = meterRegistry).also { meterRegistry.gauge(metricName, Tags.of("status", "added"), it.servicesAdded) meterRegistry.gauge(metricName, Tags.of("status", "removed"), it.servicesRemoved) - meterRegistry.gauge(metricName, Tags.of("status", "instanceChanged"), it.instanceChanges) - meterRegistry.gauge(metricName, Tags.of("status", "snapshotChanged"), it.snapshotChanges) - meterRegistry.gauge("cache.groups.total", it.cacheGroupsCount) - it.meterRegistry.more().counter("services.watch.errors", listOf(), it.errorWatchingServices) + meterRegistry.gauge(metricName, Tags.of("status", "instance-changed"), it.instanceChanges) + meterRegistry.gauge(metricName, Tags.of("status", "snapshot-changed"), it.snapshotChanges) + meterRegistry.gauge("cache.groups.count", it.cacheGroupsCount) + it.meterRegistry.more().counter("services.watch.errors.total", listOf(), it.errorWatchingServices) } } diff --git a/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt 
b/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt index f743cb81e..f8e12ba44 100644 --- a/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt +++ b/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt @@ -236,11 +236,11 @@ interface MetricsDiscoveryServerCallbacksTest { val metric = "grpc.connections" assertThat( meterRegistry.find(metric) - .tags(Tags.of("connection-type", type.name.lowercase())).gauge() + .tags(Tags.of("type", type.name.lowercase())).gauge() ).isNotNull assertThat( meterRegistry.get(metric) - .tags(Tags.of("connection-type", type.name.lowercase())).gauge().value().toInt() + .tags(Tags.of("type", type.name.lowercase())).gauge().value().toInt() ).isEqualTo(value) } } @@ -261,7 +261,7 @@ interface MetricsDiscoveryServerCallbacksTest { private fun assertCondition(type: String, condition: Predicate, metricType: String) { val counterValue = - envoyControl().app.meterRegistry().find("grpc.requests.count") + envoyControl().app.meterRegistry().find("grpc.requests.total") .tags(Tags.of("type", type, "metric-type", metricType)) .counter()?.count()?.toInt() logger.info("$type $counterValue")