diff --git a/docs/deployment/observability.md b/docs/deployment/observability.md
index ebce6bb30..8f77a7323 100644
--- a/docs/deployment/observability.md
+++ b/docs/deployment/observability.md
@@ -5,13 +5,17 @@
Envoy Control uses [SLF4J](https://www.slf4j.org/) with [Logback](https://logback.qos.ch/) for logging.
To override the default settings, point a file via environment variable
+
```bash
export ENVOY_CONTROL_RUNNER_OPTS="-Dlogging.config=/path/to/logback/logback.xml"
```
+
and then run the `bin/envoy-control-runner` created from `distZip` task.
`java-control-plane` produces quite a lot of logging on `INFO` level. Consider switching it to `WARN`
+
```xml
+
```
@@ -25,13 +29,12 @@ Sample logger configuration is available here.
### Envoy Control
-Metric | Description
------------------------------| -----------------------------------
-**services.added** | Counter of added services events
-**services.removed** | Counter of removed services events
-**services.instanceChanged** | Counter of instance change events
+Metric | Description | Labels
+----------------------|------------------------------------|--------
+**watched-services** | Counter of watched services events | status (added/removed/instance-changed/snapshot-changed)
-Standard [Spring metrics](https://docs.spring.io/spring-boot/docs/current/reference/html/production-ready-metrics.html#production-ready-metrics-meter) (JVM, CPU, HTTP server) are also included.
+Standard [Spring metrics](https://docs.spring.io/spring-boot/docs/current/reference/html/production-ready-metrics.html#production-ready-metrics-meter)
+(JVM, CPU, HTTP server) are also included.
### Envoy Control Runner
@@ -39,41 +42,24 @@ Envoy Control Runner exposes a set of metrics on standard Spring Actuator's `/ac
#### xDS connections
-Metric | Description
------------------------------| --------------------------------------------------------
-**grpc.connections.ads** | Number of running gRPC ADS connections
-**grpc.connections.cds** | Number of running gRPC CDS connections
-**grpc.connections.eds** | Number of running gRPC EDS connections
-**grpc.connections.lds** | Number of running gRPC LDS connections
-**grpc.connections.rds** | Number of running gRPC RDS connections
-**grpc.connections.sds** | Number of running gRPC SDS connections
-**grpc.connections.unknown** | Number of running gRPC connections for unknown resource
+ Metric | Description | Labels
+----------------------|----------------------------------------------------|------------------------------------
+ **grpc.connections** | Number of running gRPC connections of a given type | type (all/ads/cds/eds/lds/rds/sds/unknown)
#### xDS requests
-Metric | Description
-------------------------------- | --------------------------------------------------------
-**grpc.requests.cds** | Counter of received gRPC CDS requests
-**grpc.requests.eds** | Counter of received gRPC EDS requests
-**grpc.requests.lds** | Counter of received gRPC LDS requests
-**grpc.requests.rds** | Counter of received gRPC RDS requests
-**grpc.requests.sds** | Counter of received gRPC SDS requests
-**grpc.requests.unknown** | Counter of received gRPC requests for unknown resource
-**grpc.requests.cds.delta** | Counter of received gRPC delta CDS requests
-**grpc.requests.eds.delta** | Counter of received gRPC delta EDS requests
-**grpc.requests.lds.delta** | Counter of received gRPC delta LDS requests
-**grpc.requests.rds.delta** | Counter of received gRPC delta RDS requests
-**grpc.requests.sds.delta** | Counter of received gRPC delta SDS requests
-**grpc.requests.unknown.delta** | Counter of received gRPC delta requests for unknown resource
+ Metric | Description | Labels
+-------------------------|---------------------------------------------------|--------------------------------------------------------------
+ **grpc.requests.total** | Counter of received gRPC requests of a given type | type (cds/eds/lds/rds/sds/unknown), metric-type (total/delta)
#### Snapshot
-Metric | Description
--------------------------| ----------------------------------
-**cache.groupCount** | Number of unique groups in SnapshotCache
+ Metric | Description | Labels
+------------------------|------------------------------------------|--------
+ **cache.groups.count** | Number of unique groups in SnapshotCache | -
#### Synchronization
-Metric | Description
-----------------------------------------| -------------------------------------------------
-**cross-dc-synchronization.$dc.errors** | Counter of synchronization errors for given DC
+ Metric | Description | Labels
+-------------------------------------------|----------------------------------------------------------------|----------------------------------------------
+ **cross-dc-synchronization.errors.total** | Counter of synchronization errors for a given DC and operation | cluster, operation (get-instances/get-state)
diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt
index 8cb6ff604..6afc62fe1 100644
--- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt
+++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/server/callbacks/MetricsDiscoveryServerCallbacks.kt
@@ -36,9 +36,9 @@ class MetricsDiscoveryServerCallbacks(private val meterRegistry: MeterRegistry)
.map { type -> type to AtomicInteger(0) }
.toMap()
- meterRegistry.gauge("grpc.connections", Tags.of("connection-type", "all"), connections)
+ meterRegistry.gauge("grpc.connections", Tags.of("type", "all"), connections)
connectionsByType.forEach { (type, typeConnections) ->
- meterRegistry.gauge("grpc.connections", Tags.of("connection-type", type.name.lowercase()), typeConnections)
+ meterRegistry.gauge("grpc.connections", Tags.of("type", type.name.lowercase()), typeConnections)
}
}
diff --git a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt
index 34aa618eb..c2aebe6f9 100644
--- a/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt
+++ b/envoy-control-core/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/synchronization/RemoteServices.kt
@@ -62,8 +62,8 @@ class RemoteServices(
.orTimeout(interval, TimeUnit.SECONDS)
.exceptionally {
meterRegistry.counter(
- "cross-dc-synchronization.errors",
- Tags.of("cluster", cluster, "operation", "get-cluster-state")
+ "cross-dc-synchronization.errors.total",
+ Tags.of("cluster", cluster, "operation", "get-state")
).increment()
logger.warn("Error synchronizing instances ${it.message}", it)
clusterStateCache[cluster]
@@ -76,8 +76,8 @@ class RemoteServices(
cluster to instances
} catch (e: Exception) {
meterRegistry.counter(
- "cross-dc-synchronization.errors",
- Tags.of("cluster", cluster, "operation", "get-cluster-state")
+ "cross-dc-synchronization.errors.total",
+ Tags.of("cluster", cluster, "operation", "get-instances")
).increment()
logger.warn("Failed fetching instances from $cluster", e)
cluster to emptyList()
@@ -89,8 +89,7 @@ class RemoteServices(
state: ServicesState
): ClusterState {
meterRegistry.counter(
- "cross-dc-synchronization",
- Tags.of("operation", "service-update", "cluster", cluster)
+ "cross-dc-synchronization.total", Tags.of("cluster", cluster)
)
.increment()
val clusterState = ClusterState(
diff --git a/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt b/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt
index a62835a5a..033037043 100644
--- a/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt
+++ b/envoy-control-runner/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/infrastructure/ControlPlaneConfig.kt
@@ -173,14 +173,14 @@ class ControlPlaneConfig {
ConsulClient(properties.host, properties.port).agentSelf.value?.config?.datacenter ?: "local"
fun controlPlaneMetrics(meterRegistry: MeterRegistry): DefaultEnvoyControlMetrics {
- val metricName = "services"
+ val metricName = "watched-services"
return DefaultEnvoyControlMetrics(meterRegistry = meterRegistry).also {
meterRegistry.gauge(metricName, Tags.of("status", "added"), it.servicesAdded)
meterRegistry.gauge(metricName, Tags.of("status", "removed"), it.servicesRemoved)
- meterRegistry.gauge(metricName, Tags.of("status", "instanceChanged"), it.instanceChanges)
- meterRegistry.gauge(metricName, Tags.of("status", "snapshotChanged"), it.snapshotChanges)
- meterRegistry.gauge("cache.groups.total", it.cacheGroupsCount)
- it.meterRegistry.more().counter("services.watch.errors", listOf(), it.errorWatchingServices)
+ meterRegistry.gauge(metricName, Tags.of("status", "instance-changed"), it.instanceChanges)
+ meterRegistry.gauge(metricName, Tags.of("status", "snapshot-changed"), it.snapshotChanges)
+ meterRegistry.gauge("cache.groups.count", it.cacheGroupsCount)
+ it.meterRegistry.more().counter("services.watch.errors.total", listOf(), it.errorWatchingServices)
}
}
diff --git a/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt b/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt
index f743cb81e..f8e12ba44 100644
--- a/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt
+++ b/envoy-control-tests/src/main/kotlin/pl/allegro/tech/servicemesh/envoycontrol/MetricsDiscoveryServerCallbacksTest.kt
@@ -236,11 +236,11 @@ interface MetricsDiscoveryServerCallbacksTest {
val metric = "grpc.connections"
assertThat(
meterRegistry.find(metric)
- .tags(Tags.of("connection-type", type.name.lowercase())).gauge()
+ .tags(Tags.of("type", type.name.lowercase())).gauge()
).isNotNull
assertThat(
meterRegistry.get(metric)
- .tags(Tags.of("connection-type", type.name.lowercase())).gauge().value().toInt()
+ .tags(Tags.of("type", type.name.lowercase())).gauge().value().toInt()
).isEqualTo(value)
}
}
@@ -261,7 +261,7 @@ interface MetricsDiscoveryServerCallbacksTest {
private fun assertCondition(type: String, condition: Predicate, metricType: String) {
val counterValue =
- envoyControl().app.meterRegistry().find("grpc.requests.count")
+ envoyControl().app.meterRegistry().find("grpc.requests.total")
.tags(Tags.of("type", type, "metric-type", metricType))
.counter()?.count()?.toInt()
logger.info("$type $counterValue")