diff --git a/api/v1alpha1/healthcheck_types.go b/api/v1alpha1/healthcheck_types.go index 9ee90aa0c8a..0cfb1663028 100644 --- a/api/v1alpha1/healthcheck_types.go +++ b/api/v1alpha1/healthcheck_types.go @@ -9,11 +9,6 @@ import metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" // HealthCheck configuration to decide which endpoints // are healthy and can be used for routing. -// -// Note: Once the overall health of the backendRef drops below 50% (e.g. a backendRef having 10 endpoints -// with more than 5 unhealthy endpoints), Envoy will disregard health status and balance across all endpoints. -// This is called "panic mode". It's designed to prevent a situation in which host failures cascade throughout the cluster -// as load increases. type HealthCheck struct { // Active health check configuration // +optional @@ -22,6 +17,15 @@ type HealthCheck struct { // Passive passive check configuration // +optional Passive *PassiveHealthCheck `json:"passive,omitempty"` + + // When the percentage of healthy endpoints for a backend drops below this threshold, + // Envoy will disregard health status and balance across all endpoints (panic mode). + // It's designed to prevent a situation in which host failures cascade throughout the cluster + // as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. 
+ // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Maximum=100 + // +optional + PanicThreshold *uint32 `json:"panicThreshold,omitempty"` } // PassiveHealthCheck defines the configuration for passive health checks in the context of Envoy's Outlier Detection, diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index ee36c4ef023..de1bb429588 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -3209,6 +3209,11 @@ func (in *HealthCheck) DeepCopyInto(out *HealthCheck) { *out = new(PassiveHealthCheck) (*in).DeepCopyInto(*out) } + if in.PanicThreshold != nil { + in, out := &in.PanicThreshold, &out.PanicThreshold + *out = new(uint32) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HealthCheck. diff --git a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_backendtrafficpolicies.yaml b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_backendtrafficpolicies.yaml index e0c56b6406d..86389fabdf6 100644 --- a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_backendtrafficpolicies.yaml +++ b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_backendtrafficpolicies.yaml @@ -423,6 +423,16 @@ spec: - message: The grpc field can only be set if the Health Checker type is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. 
+ format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: diff --git a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyextensionpolicies.yaml b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyextensionpolicies.yaml index a0740148a16..fde62179020 100644 --- a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyextensionpolicies.yaml +++ b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyextensionpolicies.yaml @@ -526,6 +526,16 @@ spec: - message: The grpc field can only be set if the Health Checker type is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. + format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: diff --git a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyproxies.yaml b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyproxies.yaml index d6dcc22d9c5..e463204ee1e 100644 --- a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyproxies.yaml +++ b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_envoyproxies.yaml @@ -10992,6 +10992,16 @@ spec: is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. 
To disable panic mode, set value to `0`. + format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration @@ -11969,6 +11979,16 @@ spec: is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. + format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration @@ -13025,6 +13045,16 @@ spec: if the Health Checker type is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. + format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: @@ -14008,6 +14038,16 @@ spec: Health Checker type is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. 
+ format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: diff --git a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_securitypolicies.yaml b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_securitypolicies.yaml index 103d1fb7caa..6b5e5d363ce 100644 --- a/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_securitypolicies.yaml +++ b/charts/gateway-helm/crds/generated/gateway.envoyproxy.io_securitypolicies.yaml @@ -940,6 +940,16 @@ spec: Checker type is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. + format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: @@ -1821,6 +1831,16 @@ spec: Checker type is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. + format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: @@ -2843,6 +2863,16 @@ spec: the Health Checker type is GRPC. rule: 'has(self.grpc) ? 
self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. + format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: @@ -3852,6 +3882,16 @@ spec: Checker type is GRPC. rule: 'has(self.grpc) ? self.type == ''GRPC'' : true' + panicThreshold: + description: |- + When the percentage of healthy endpoints for a backend drops below this threshold, + Envoy will disregard health status and balance across all endpoints (panic mode). + It's designed to prevent a situation in which host failures cascade throughout the cluster + as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. 
+ format: int32 + maximum: 100 + minimum: 0 + type: integer passive: description: Passive passive check configuration properties: diff --git a/internal/gatewayapi/clustersettings.go b/internal/gatewayapi/clustersettings.go index 76d3e1aeaff..1b77b8adb9b 100644 --- a/internal/gatewayapi/clustersettings.go +++ b/internal/gatewayapi/clustersettings.go @@ -360,7 +360,7 @@ func buildHealthCheck(policy egv1a1.ClusterSettings) *ir.HealthCheck { irhc := &ir.HealthCheck{} irhc.Passive = buildPassiveHealthCheck(*policy.HealthCheck) irhc.Active = buildActiveHealthCheck(*policy.HealthCheck) - + irhc.PanicThreshold = policy.HealthCheck.PanicThreshold return irhc } diff --git a/internal/gatewayapi/testdata/backendtrafficpolicy-with-panic-threshold.in.yaml b/internal/gatewayapi/testdata/backendtrafficpolicy-with-panic-threshold.in.yaml new file mode 100644 index 00000000000..7ed59e6832c --- /dev/null +++ b/internal/gatewayapi/testdata/backendtrafficpolicy-with-panic-threshold.in.yaml @@ -0,0 +1,124 @@ +gateways: + - apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + namespace: envoy-gateway + name: gateway-1 + spec: + gatewayClassName: envoy-gateway-class + listeners: + - name: http + protocol: HTTP + port: 80 + allowedRoutes: + namespaces: + from: All + - apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + namespace: envoy-gateway + name: gateway-2 + spec: + gatewayClassName: envoy-gateway-class + listeners: + - name: http + protocol: HTTP + port: 80 + allowedRoutes: + namespaces: + from: All +httpRoutes: + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + namespace: default + name: httproute-1 + spec: + hostnames: + - gateway.envoyproxy.io + parentRefs: + - namespace: envoy-gateway + name: gateway-2 + sectionName: http + rules: + - matches: + - path: + value: "/" + backendRefs: + - name: service-1 + port: 8080 + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + namespace: default + name: 
httproute-2 + spec: + hostnames: + - gateway.envoyproxy.io + parentRefs: + - namespace: envoy-gateway + name: gateway-2 + sectionName: http + rules: + - matches: + - path: + value: "/v2" + backendRefs: + - name: service-2 + port: 8080 + - apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + namespace: default + name: httproute-3 + spec: + hostnames: + - gateway.envoyproxy.io + parentRefs: + - namespace: envoy-gateway + name: gateway-1 + sectionName: http + rules: + - matches: + - path: + value: "/v3" + backendRefs: + - name: service-3 + port: 8080 +backendTrafficPolicies: + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + metadata: + namespace: default + name: policy-for-route-1 + spec: + targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: httproute-1 + healthCheck: + panicThreshold: 66 + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + metadata: + namespace: default + name: policy-for-route-2 + spec: + targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: httproute-2 + healthCheck: + panicThreshold: 10 + - apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + metadata: + namespace: envoy-gateway + name: policy-for-all-routes-in-gateway-1 + spec: + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: gateway-1 + healthCheck: + panicThreshold: 80 diff --git a/internal/gatewayapi/testdata/backendtrafficpolicy-with-panic-threshold.out.yaml b/internal/gatewayapi/testdata/backendtrafficpolicy-with-panic-threshold.out.yaml new file mode 100644 index 00000000000..caab297462b --- /dev/null +++ b/internal/gatewayapi/testdata/backendtrafficpolicy-with-panic-threshold.out.yaml @@ -0,0 +1,421 @@ +backendTrafficPolicies: +- apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + metadata: + creationTimestamp: null + name: policy-for-route-1 + namespace: default + spec: + healthCheck: + panicThreshold: 66 + 
targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: httproute-1 + status: + ancestors: + - ancestorRef: + group: gateway.networking.k8s.io + kind: Gateway + name: gateway-2 + namespace: envoy-gateway + sectionName: http + conditions: + - lastTransitionTime: null + message: Policy has been accepted. + reason: Accepted + status: "True" + type: Accepted + controllerName: gateway.envoyproxy.io/gatewayclass-controller +- apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + metadata: + creationTimestamp: null + name: policy-for-route-2 + namespace: default + spec: + healthCheck: + panicThreshold: 10 + targetRef: + group: gateway.networking.k8s.io + kind: HTTPRoute + name: httproute-2 + status: + ancestors: + - ancestorRef: + group: gateway.networking.k8s.io + kind: Gateway + name: gateway-2 + namespace: envoy-gateway + sectionName: http + conditions: + - lastTransitionTime: null + message: Policy has been accepted. + reason: Accepted + status: "True" + type: Accepted + controllerName: gateway.envoyproxy.io/gatewayclass-controller +- apiVersion: gateway.envoyproxy.io/v1alpha1 + kind: BackendTrafficPolicy + metadata: + creationTimestamp: null + name: policy-for-all-routes-in-gateway-1 + namespace: envoy-gateway + spec: + healthCheck: + panicThreshold: 80 + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: gateway-1 + status: + ancestors: + - ancestorRef: + group: gateway.networking.k8s.io + kind: Gateway + name: gateway-1 + namespace: envoy-gateway + conditions: + - lastTransitionTime: null + message: Policy has been accepted. 
+ reason: Accepted + status: "True" + type: Accepted + controllerName: gateway.envoyproxy.io/gatewayclass-controller +gateways: +- apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + creationTimestamp: null + name: gateway-1 + namespace: envoy-gateway + spec: + gatewayClassName: envoy-gateway-class + listeners: + - allowedRoutes: + namespaces: + from: All + name: http + port: 80 + protocol: HTTP + status: + listeners: + - attachedRoutes: 1 + conditions: + - lastTransitionTime: null + message: Sending translated listener configuration to the data plane + reason: Programmed + status: "True" + type: Programmed + - lastTransitionTime: null + message: Listener has been successfully translated + reason: Accepted + status: "True" + type: Accepted + - lastTransitionTime: null + message: Listener references have been resolved + reason: ResolvedRefs + status: "True" + type: ResolvedRefs + name: http + supportedKinds: + - group: gateway.networking.k8s.io + kind: HTTPRoute + - group: gateway.networking.k8s.io + kind: GRPCRoute +- apiVersion: gateway.networking.k8s.io/v1 + kind: Gateway + metadata: + creationTimestamp: null + name: gateway-2 + namespace: envoy-gateway + spec: + gatewayClassName: envoy-gateway-class + listeners: + - allowedRoutes: + namespaces: + from: All + name: http + port: 80 + protocol: HTTP + status: + listeners: + - attachedRoutes: 2 + conditions: + - lastTransitionTime: null + message: Sending translated listener configuration to the data plane + reason: Programmed + status: "True" + type: Programmed + - lastTransitionTime: null + message: Listener has been successfully translated + reason: Accepted + status: "True" + type: Accepted + - lastTransitionTime: null + message: Listener references have been resolved + reason: ResolvedRefs + status: "True" + type: ResolvedRefs + name: http + supportedKinds: + - group: gateway.networking.k8s.io + kind: HTTPRoute + - group: gateway.networking.k8s.io + kind: GRPCRoute +httpRoutes: +- apiVersion: 
gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + creationTimestamp: null + name: httproute-1 + namespace: default + spec: + hostnames: + - gateway.envoyproxy.io + parentRefs: + - name: gateway-2 + namespace: envoy-gateway + sectionName: http + rules: + - backendRefs: + - name: service-1 + port: 8080 + matches: + - path: + value: / + status: + parents: + - conditions: + - lastTransitionTime: null + message: Route is accepted + reason: Accepted + status: "True" + type: Accepted + - lastTransitionTime: null + message: Resolved all the Object references for the Route + reason: ResolvedRefs + status: "True" + type: ResolvedRefs + controllerName: gateway.envoyproxy.io/gatewayclass-controller + parentRef: + name: gateway-2 + namespace: envoy-gateway + sectionName: http +- apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + creationTimestamp: null + name: httproute-2 + namespace: default + spec: + hostnames: + - gateway.envoyproxy.io + parentRefs: + - name: gateway-2 + namespace: envoy-gateway + sectionName: http + rules: + - backendRefs: + - name: service-2 + port: 8080 + matches: + - path: + value: /v2 + status: + parents: + - conditions: + - lastTransitionTime: null + message: Route is accepted + reason: Accepted + status: "True" + type: Accepted + - lastTransitionTime: null + message: Resolved all the Object references for the Route + reason: ResolvedRefs + status: "True" + type: ResolvedRefs + controllerName: gateway.envoyproxy.io/gatewayclass-controller + parentRef: + name: gateway-2 + namespace: envoy-gateway + sectionName: http +- apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + creationTimestamp: null + name: httproute-3 + namespace: default + spec: + hostnames: + - gateway.envoyproxy.io + parentRefs: + - name: gateway-1 + namespace: envoy-gateway + sectionName: http + rules: + - backendRefs: + - name: service-3 + port: 8080 + matches: + - path: + value: /v3 + status: + parents: + - conditions: + - 
lastTransitionTime: null + message: Route is accepted + reason: Accepted + status: "True" + type: Accepted + - lastTransitionTime: null + message: Resolved all the Object references for the Route + reason: ResolvedRefs + status: "True" + type: ResolvedRefs + controllerName: gateway.envoyproxy.io/gatewayclass-controller + parentRef: + name: gateway-1 + namespace: envoy-gateway + sectionName: http +infraIR: + envoy-gateway/gateway-1: + proxy: + listeners: + - address: null + name: envoy-gateway/gateway-1/http + ports: + - containerPort: 10080 + name: http-80 + protocol: HTTP + servicePort: 80 + metadata: + labels: + gateway.envoyproxy.io/owning-gateway-name: gateway-1 + gateway.envoyproxy.io/owning-gateway-namespace: envoy-gateway + name: envoy-gateway/gateway-1 + envoy-gateway/gateway-2: + proxy: + listeners: + - address: null + name: envoy-gateway/gateway-2/http + ports: + - containerPort: 10080 + name: http-80 + protocol: HTTP + servicePort: 80 + metadata: + labels: + gateway.envoyproxy.io/owning-gateway-name: gateway-2 + gateway.envoyproxy.io/owning-gateway-namespace: envoy-gateway + name: envoy-gateway/gateway-2 +xdsIR: + envoy-gateway/gateway-1: + accessLog: + text: + - path: /dev/stdout + http: + - address: 0.0.0.0 + hostnames: + - '*' + isHTTP2: false + metadata: + kind: Gateway + name: gateway-1 + namespace: envoy-gateway + sectionName: http + name: envoy-gateway/gateway-1/http + path: + escapedSlashesAction: UnescapeAndRedirect + mergeSlashes: true + port: 10080 + routes: + - destination: + name: httproute/default/httproute-3/rule/0 + settings: + - addressType: IP + endpoints: + - host: 7.7.7.7 + port: 8080 + protocol: HTTP + weight: 1 + hostname: gateway.envoyproxy.io + isHTTP2: false + metadata: + kind: HTTPRoute + name: httproute-3 + namespace: default + name: httproute/default/httproute-3/rule/0/match/0/gateway_envoyproxy_io + pathMatch: + distinct: false + name: "" + prefix: /v3 + traffic: + healthCheck: + panicThreshold: 80 + envoy-gateway/gateway-2: 
+ accessLog: + text: + - path: /dev/stdout + http: + - address: 0.0.0.0 + hostnames: + - '*' + isHTTP2: false + metadata: + kind: Gateway + name: gateway-2 + namespace: envoy-gateway + sectionName: http + name: envoy-gateway/gateway-2/http + path: + escapedSlashesAction: UnescapeAndRedirect + mergeSlashes: true + port: 10080 + routes: + - destination: + name: httproute/default/httproute-2/rule/0 + settings: + - addressType: IP + endpoints: + - host: 7.7.7.7 + port: 8080 + protocol: HTTP + weight: 1 + hostname: gateway.envoyproxy.io + isHTTP2: false + metadata: + kind: HTTPRoute + name: httproute-2 + namespace: default + name: httproute/default/httproute-2/rule/0/match/0/gateway_envoyproxy_io + pathMatch: + distinct: false + name: "" + prefix: /v2 + traffic: + healthCheck: + panicThreshold: 10 + - destination: + name: httproute/default/httproute-1/rule/0 + settings: + - addressType: IP + endpoints: + - host: 7.7.7.7 + port: 8080 + protocol: HTTP + weight: 1 + hostname: gateway.envoyproxy.io + isHTTP2: false + metadata: + kind: HTTPRoute + name: httproute-1 + namespace: default + name: httproute/default/httproute-1/rule/0/match/0/gateway_envoyproxy_io + pathMatch: + distinct: false + name: "" + prefix: / + traffic: + healthCheck: + panicThreshold: 66 diff --git a/internal/ir/xds.go b/internal/ir/xds.go index f428297ace5..bbec9abdf5b 100644 --- a/internal/ir/xds.go +++ b/internal/ir/xds.go @@ -77,6 +77,7 @@ var ( ErrOutlierDetectionIntervalInvalid = errors.New("field OutlierDetection.Interval must be specified") ErrBothXForwardedForAndCustomHeaderInvalid = errors.New("only one of ClientIPDetection.XForwardedFor and ClientIPDetection.CustomHeader must be set") ErrBothNumTrustedHopsAndTrustedCIDRsInvalid = errors.New("only one of ClientIPDetection.XForwardedFor.NumTrustedHops and ClientIPDetection.XForwardedFor.TrustedCIDRs must be set") + ErrPanicThresholdInvalid = errors.New("PanicThreshold value is outside of 0-100 range") redacted = []byte("[redacted]") ) @@ -2357,6 
+2358,8 @@ type HealthCheck struct { Active *ActiveHealthCheck `json:"active,omitempty" yaml:"active,omitempty"` Passive *OutlierDetection `json:"passive,omitempty" yaml:"passive,omitempty"` + + PanicThreshold *uint32 `json:"panicThreshold,omitempty" yaml:"panicThreshold,omitempty"` } // OutlierDetection defines passive health check settings @@ -2456,6 +2459,12 @@ func (h *HealthCheck) Validate() error { } } + if h.PanicThreshold != nil { + if *h.PanicThreshold > 100 { + errs = errors.Join(errs, ErrPanicThresholdInvalid) + } + } + return errs } diff --git a/internal/ir/xds_test.go b/internal/ir/xds_test.go index 40e3e5ba7d8..d25996bbcf3 100644 --- a/internal/ir/xds_test.go +++ b/internal/ir/xds_test.go @@ -1400,11 +1400,29 @@ func TestValidateHealthCheck(t *testing.T) { Path: "/healthz", ExpectedStatuses: []HTTPStatus{200, 400}, }, - }, - &OutlierDetection{}, + }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHealthCheckTimeoutInvalid, }, + { + name: "invalid panic threshold", + input: HealthCheck{ + &ActiveHealthCheck{ + Timeout: &metav1.Duration{Duration: time.Duration(3)}, + Interval: &metav1.Duration{Duration: time.Second}, + UnhealthyThreshold: ptr.To[uint32](3), + HealthyThreshold: ptr.To[uint32](3), + HTTP: &HTTPHealthChecker{ + Host: "*", + Path: "/healthz", + ExpectedStatuses: []HTTPStatus{200, 400}, + }, + }, &OutlierDetection{}, + ptr.To[uint32](200), + }, + want: ErrPanicThresholdInvalid, + }, { name: "invalid interval", input: HealthCheck{ @@ -1421,6 +1439,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHealthCheckIntervalInvalid, }, @@ -1440,6 +1459,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHealthCheckUnhealthyThresholdInvalid, }, @@ -1459,6 +1479,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHealthCheckHealthyThresholdInvalid, }, @@ -1477,6 +1498,7 
@@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHCHTTPHostInvalid, }, @@ -1496,6 +1518,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHCHTTPPathInvalid, }, @@ -1515,6 +1538,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHCHTTPMethodInvalid, }, @@ -1534,6 +1558,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHCHTTPExpectedStatusesInvalid, }, @@ -1553,6 +1578,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHTTPStatusInvalid, }, @@ -1576,6 +1602,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHealthCheckPayloadInvalid, }, @@ -1598,6 +1625,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHealthCheckPayloadInvalid, }, @@ -1620,6 +1648,7 @@ func TestValidateHealthCheck(t *testing.T) { }, }, &OutlierDetection{}, + ptr.To[uint32](10), }, want: ErrHealthCheckPayloadInvalid, }, @@ -1631,6 +1660,7 @@ func TestValidateHealthCheck(t *testing.T) { Interval: &metav1.Duration{Duration: time.Duration(0)}, BaseEjectionTime: &metav1.Duration{Duration: time.Second}, }, + ptr.To[uint32](10), }, want: ErrOutlierDetectionIntervalInvalid, }, @@ -1642,6 +1672,7 @@ func TestValidateHealthCheck(t *testing.T) { Interval: &metav1.Duration{Duration: time.Second}, BaseEjectionTime: &metav1.Duration{Duration: time.Duration(0)}, }, + ptr.To[uint32](10), }, want: ErrOutlierDetectionBaseEjectionTimeInvalid, }, diff --git a/internal/ir/zz_generated.deepcopy.go b/internal/ir/zz_generated.deepcopy.go index 3c42375daa5..1856e1da8ae 100644 --- a/internal/ir/zz_generated.deepcopy.go +++ b/internal/ir/zz_generated.deepcopy.go @@ -1793,6 +1793,11 @@ func (in *HealthCheck) 
DeepCopyInto(out *HealthCheck) { *out = new(OutlierDetection) (*in).DeepCopyInto(*out) } + if in.PanicThreshold != nil { + in, out := &in.PanicThreshold, &out.PanicThreshold + *out = new(uint32) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HealthCheck. diff --git a/internal/xds/translator/cluster.go b/internal/xds/translator/cluster.go index 39a1f9a4f27..c3b6ef87c86 100644 --- a/internal/xds/translator/cluster.go +++ b/internal/xds/translator/cluster.go @@ -95,6 +95,7 @@ func buildXdsCluster(args *xdsClusterArgs) *clusterv3.Cluster { dnsLookupFamily = clusterv3.Cluster_ALL } } + cluster := &clusterv3.Cluster{ Name: args.name, DnsLookupFamily: dnsLookupFamily, @@ -106,6 +107,13 @@ func buildXdsCluster(args *xdsClusterArgs) *clusterv3.Cluster { PerConnectionBufferLimitBytes: buildBackandConnectionBufferLimitBytes(args.backendConnection), } + // 50% is the Envoy default value for panic threshold. No need to explicitly set it in this case. 
+ if args.healthCheck != nil && args.healthCheck.PanicThreshold != nil && *args.healthCheck.PanicThreshold != 50 { + cluster.CommonLbConfig.HealthyPanicThreshold = &xdstype.Percent{ + Value: float64(*args.healthCheck.PanicThreshold), + } + } + cluster.ConnectTimeout = buildConnectTimeout(args.timeout) // Initialize TrackClusterStats if any metrics are enabled diff --git a/internal/xds/translator/cluster_test.go b/internal/xds/translator/cluster_test.go index f4b71c59b44..131a37fef39 100644 --- a/internal/xds/translator/cluster_test.go +++ b/internal/xds/translator/cluster_test.go @@ -14,6 +14,7 @@ import ( "github.com/stretchr/testify/require" "google.golang.org/protobuf/encoding/protojson" "google.golang.org/protobuf/proto" + "k8s.io/utils/ptr" "sigs.k8s.io/yaml" "github.com/envoyproxy/gateway/internal/ir" @@ -32,6 +33,9 @@ func TestBuildXdsCluster(t *testing.T) { name: bootstrapXdsCluster.Name, tSocket: bootstrapXdsCluster.TransportSocket, endpointType: EndpointTypeDNS, + healthCheck: &ir.HealthCheck{ + PanicThreshold: ptr.To[uint32](66), + }, } dynamicXdsCluster := buildXdsCluster(args) diff --git a/internal/xds/translator/testdata/in/xds-ir/panic-threshold.yaml b/internal/xds/translator/testdata/in/xds-ir/panic-threshold.yaml new file mode 100644 index 00000000000..781d07ca8a2 --- /dev/null +++ b/internal/xds/translator/testdata/in/xds-ir/panic-threshold.yaml @@ -0,0 +1,66 @@ +http: +- name: "first-listener" + address: "::" + path: + mergeSlashes: true + escapedSlashesAction: UnescapeAndRedirect + port: 10080 + hostnames: + - "*" + routes: + - name: "first-route" + hostname: "*" + traffic: + healthCheck: + panicThreshold: 66 + destination: + name: "first-route-dest" + settings: + - endpoints: + - host: "1.2.3.4" + port: 50000 + - name: "second-route" + hostname: "*" + traffic: + healthCheck: + panicThreshold: 80 + destination: + name: "second-route-dest" + settings: + - endpoints: + - host: "1.2.3.4" + port: 50000 + - name: "third-route" + hostname: "*" + 
traffic: + healthCheck: + panicThreshold: 20 + destination: + name: "third-route-dest" + settings: + - endpoints: + - host: "1.2.3.4" + port: 50000 + - name: "fourth-route" + hostname: "*" + traffic: + healthCheck: + panicThreshold: 50 + destination: + name: "fourth-route-dest" + settings: + - endpoints: + - host: "1.2.3.4" + port: 50000 + - name: "fifth-route" + hostname: "*" + traffic: + healthCheck: + panicThreshold: 80 + destination: + name: "fifth-route-dest" + protocol: GRPC + settings: + - endpoints: + - host: "1.2.3.4" + port: 50000 diff --git a/internal/xds/translator/testdata/out/xds-ir/panic-threshold.clusters.yaml b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.clusters.yaml new file mode 100644 index 00000000000..6c478efffde --- /dev/null +++ b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.clusters.yaml @@ -0,0 +1,93 @@ +- circuitBreakers: + thresholds: + - maxRetries: 1024 + commonLbConfig: + healthyPanicThreshold: + value: 66 + localityWeightedLbConfig: {} + connectTimeout: 10s + dnsLookupFamily: V4_PREFERRED + edsClusterConfig: + edsConfig: + ads: {} + resourceApiVersion: V3 + serviceName: first-route-dest + ignoreHealthOnHostRemoval: true + lbPolicy: LEAST_REQUEST + name: first-route-dest + perConnectionBufferLimitBytes: 32768 + type: EDS +- circuitBreakers: + thresholds: + - maxRetries: 1024 + commonLbConfig: + healthyPanicThreshold: + value: 80 + localityWeightedLbConfig: {} + connectTimeout: 10s + dnsLookupFamily: V4_PREFERRED + edsClusterConfig: + edsConfig: + ads: {} + resourceApiVersion: V3 + serviceName: second-route-dest + ignoreHealthOnHostRemoval: true + lbPolicy: LEAST_REQUEST + name: second-route-dest + perConnectionBufferLimitBytes: 32768 + type: EDS +- circuitBreakers: + thresholds: + - maxRetries: 1024 + commonLbConfig: + healthyPanicThreshold: + value: 20 + localityWeightedLbConfig: {} + connectTimeout: 10s + dnsLookupFamily: V4_PREFERRED + edsClusterConfig: + edsConfig: + ads: {} + resourceApiVersion: V3 
+ serviceName: third-route-dest + ignoreHealthOnHostRemoval: true + lbPolicy: LEAST_REQUEST + name: third-route-dest + perConnectionBufferLimitBytes: 32768 + type: EDS +- circuitBreakers: + thresholds: + - maxRetries: 1024 + commonLbConfig: + localityWeightedLbConfig: {} + connectTimeout: 10s + dnsLookupFamily: V4_PREFERRED + edsClusterConfig: + edsConfig: + ads: {} + resourceApiVersion: V3 + serviceName: fourth-route-dest + ignoreHealthOnHostRemoval: true + lbPolicy: LEAST_REQUEST + name: fourth-route-dest + perConnectionBufferLimitBytes: 32768 + type: EDS +- circuitBreakers: + thresholds: + - maxRetries: 1024 + commonLbConfig: + healthyPanicThreshold: + value: 80 + localityWeightedLbConfig: {} + connectTimeout: 10s + dnsLookupFamily: V4_PREFERRED + edsClusterConfig: + edsConfig: + ads: {} + resourceApiVersion: V3 + serviceName: fifth-route-dest + ignoreHealthOnHostRemoval: true + lbPolicy: LEAST_REQUEST + name: fifth-route-dest + perConnectionBufferLimitBytes: 32768 + type: EDS diff --git a/internal/xds/translator/testdata/out/xds-ir/panic-threshold.endpoints.yaml b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.endpoints.yaml new file mode 100644 index 00000000000..b93d9b43bde --- /dev/null +++ b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.endpoints.yaml @@ -0,0 +1,60 @@ +- clusterName: first-route-dest + endpoints: + - lbEndpoints: + - endpoint: + address: + socketAddress: + address: 1.2.3.4 + portValue: 50000 + loadBalancingWeight: 1 + loadBalancingWeight: 1 + locality: + region: first-route-dest/backend/0 +- clusterName: second-route-dest + endpoints: + - lbEndpoints: + - endpoint: + address: + socketAddress: + address: 1.2.3.4 + portValue: 50000 + loadBalancingWeight: 1 + loadBalancingWeight: 1 + locality: + region: second-route-dest/backend/0 +- clusterName: third-route-dest + endpoints: + - lbEndpoints: + - endpoint: + address: + socketAddress: + address: 1.2.3.4 + portValue: 50000 + loadBalancingWeight: 1 + 
loadBalancingWeight: 1 + locality: + region: third-route-dest/backend/0 +- clusterName: fourth-route-dest + endpoints: + - lbEndpoints: + - endpoint: + address: + socketAddress: + address: 1.2.3.4 + portValue: 50000 + loadBalancingWeight: 1 + loadBalancingWeight: 1 + locality: + region: fourth-route-dest/backend/0 +- clusterName: fifth-route-dest + endpoints: + - lbEndpoints: + - endpoint: + address: + socketAddress: + address: 1.2.3.4 + portValue: 50000 + loadBalancingWeight: 1 + loadBalancingWeight: 1 + locality: + region: fifth-route-dest/backend/0 diff --git a/internal/xds/translator/testdata/out/xds-ir/panic-threshold.listeners.yaml b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.listeners.yaml new file mode 100644 index 00000000000..80ae84fd104 --- /dev/null +++ b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.listeners.yaml @@ -0,0 +1,34 @@ +- address: + socketAddress: + address: '::' + portValue: 10080 + defaultFilterChain: + filters: + - name: envoy.filters.network.http_connection_manager + typedConfig: + '@type': type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + commonHttpProtocolOptions: + headersWithUnderscoresAction: REJECT_REQUEST + http2ProtocolOptions: + initialConnectionWindowSize: 1048576 + initialStreamWindowSize: 65536 + maxConcurrentStreams: 100 + httpFilters: + - name: envoy.filters.http.router + typedConfig: + '@type': type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + suppressEnvoyHeaders: true + mergeSlashes: true + normalizePath: true + pathWithEscapedSlashesAction: UNESCAPE_AND_REDIRECT + rds: + configSource: + ads: {} + resourceApiVersion: V3 + routeConfigName: first-listener + serverHeaderTransformation: PASS_THROUGH + statPrefix: http-10080 + useRemoteAddress: true + name: first-listener + name: first-listener + perConnectionBufferLimitBytes: 32768 diff --git a/internal/xds/translator/testdata/out/xds-ir/panic-threshold.routes.yaml 
b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.routes.yaml new file mode 100644 index 00000000000..2f5c4977b24 --- /dev/null +++ b/internal/xds/translator/testdata/out/xds-ir/panic-threshold.routes.yaml @@ -0,0 +1,42 @@ +- ignorePortInHostMatching: true + name: first-listener + virtualHosts: + - domains: + - '*' + name: first-listener/* + routes: + - match: + prefix: / + name: first-route + route: + cluster: first-route-dest + upgradeConfigs: + - upgradeType: websocket + - match: + prefix: / + name: second-route + route: + cluster: second-route-dest + upgradeConfigs: + - upgradeType: websocket + - match: + prefix: / + name: third-route + route: + cluster: third-route-dest + upgradeConfigs: + - upgradeType: websocket + - match: + prefix: / + name: fourth-route + route: + cluster: fourth-route-dest + upgradeConfigs: + - upgradeType: websocket + - match: + prefix: / + name: fifth-route + route: + cluster: fifth-route-dest + upgradeConfigs: + - upgradeType: websocket diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 71fedaf1d1b..12181f7cc3a 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -2242,12 +2242,6 @@ _Appears in:_ HealthCheck configuration to decide which endpoints are healthy and can be used for routing. - -Note: Once the overall health of the backendRef drops below 50% (e.g. a backendRef having 10 endpoints -with more than 5 unhealthy endpoints), Envoy will disregard health status and balance across all endpoints. -This is called "panic mode". It's designed to prevent a situation in which host failures cascade throughout the cluster -as load increases. 
- _Appears in:_ - [BackendTrafficPolicySpec](#backendtrafficpolicyspec) - [ClusterSettings](#clustersettings) @@ -2256,6 +2250,7 @@ _Appears in:_ | --- | --- | --- | --- | --- | | `active` | _[ActiveHealthCheck](#activehealthcheck)_ | false | | Active health check configuration | | `passive` | _[PassiveHealthCheck](#passivehealthcheck)_ | false | | Passive passive check configuration | +| `panicThreshold` | _integer_ | false | | When the percentage of healthy endpoints for a backend drops below this threshold,
Envoy will disregard health status and balance across all endpoints.
It's designed to prevent a situation in which host failures cascade throughout the cluster
as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. | #### HealthCheckSettings diff --git a/site/content/zh/latest/api/extension_types.md b/site/content/zh/latest/api/extension_types.md index 71fedaf1d1b..12181f7cc3a 100644 --- a/site/content/zh/latest/api/extension_types.md +++ b/site/content/zh/latest/api/extension_types.md @@ -2242,12 +2242,6 @@ _Appears in:_ HealthCheck configuration to decide which endpoints are healthy and can be used for routing. - -Note: Once the overall health of the backendRef drops below 50% (e.g. a backendRef having 10 endpoints -with more than 5 unhealthy endpoints), Envoy will disregard health status and balance across all endpoints. -This is called "panic mode". It's designed to prevent a situation in which host failures cascade throughout the cluster -as load increases. - _Appears in:_ - [BackendTrafficPolicySpec](#backendtrafficpolicyspec) - [ClusterSettings](#clustersettings) @@ -2256,6 +2250,7 @@ _Appears in:_ | --- | --- | --- | --- | --- | | `active` | _[ActiveHealthCheck](#activehealthcheck)_ | false | | Active health check configuration | | `passive` | _[PassiveHealthCheck](#passivehealthcheck)_ | false | | Passive passive check configuration | +| `panicThreshold` | _integer_ | false | | When the percentage of healthy endpoints for a backend drops below this threshold,
Envoy will disregard health status and balance across all endpoints.
It's designed to prevent a situation in which host failures cascade throughout the cluster
as load increases. If not set, the default value is 50%. To disable panic mode, set value to `0`. | #### HealthCheckSettings diff --git a/test/cel-validation/backendtrafficpolicy_test.go b/test/cel-validation/backendtrafficpolicy_test.go index a75c7fa8845..41e3c75ac0e 100644 --- a/test/cel-validation/backendtrafficpolicy_test.go +++ b/test/cel-validation/backendtrafficpolicy_test.go @@ -1633,6 +1633,50 @@ func TestBackendTrafficPolicyTarget(t *testing.T) { }, wantErrors: []string{`response cost is not supported for Local Rate Limits`}, }, + { + desc: "panicThreshold is set", + mutate: func(btp *egv1a1.BackendTrafficPolicy) { + btp.Spec = egv1a1.BackendTrafficPolicySpec{ + PolicyTargetReferences: egv1a1.PolicyTargetReferences{ + TargetRef: &gwapiv1a2.LocalPolicyTargetReferenceWithSectionName{ + LocalPolicyTargetReference: gwapiv1a2.LocalPolicyTargetReference{ + Group: gwapiv1a2.Group("gateway.networking.k8s.io"), + Kind: gwapiv1a2.Kind("Gateway"), + Name: gwapiv1a2.ObjectName("eg"), + }, + }, + }, + ClusterSettings: egv1a1.ClusterSettings{ + HealthCheck: &egv1a1.HealthCheck{ + PanicThreshold: ptr.To[uint32](80), + }, + }, + } + }, + wantErrors: []string{}, + }, + { + desc: "panicThreshold fails validation", + mutate: func(btp *egv1a1.BackendTrafficPolicy) { + btp.Spec = egv1a1.BackendTrafficPolicySpec{ + PolicyTargetReferences: egv1a1.PolicyTargetReferences{ + TargetRef: &gwapiv1a2.LocalPolicyTargetReferenceWithSectionName{ + LocalPolicyTargetReference: gwapiv1a2.LocalPolicyTargetReference{ + Group: gwapiv1a2.Group("gateway.networking.k8s.io"), + Kind: gwapiv1a2.Kind("Gateway"), + Name: gwapiv1a2.ObjectName("eg"), + }, + }, + }, + ClusterSettings: egv1a1.ClusterSettings{ + HealthCheck: &egv1a1.HealthCheck{ + PanicThreshold: ptr.To[uint32](200), + }, + }, + } + }, + wantErrors: []string{`Invalid value: 200: spec.healthCheck.panicThreshold in body should be less than or equal to 100`}, + }, } for _, tc := range cases {