From ee3e1dc4c721a7bf15c13c45b4174f5ccaa56302 Mon Sep 17 00:00:00 2001 From: Peter Wilcsinszky Date: Tue, 17 Dec 2024 14:29:19 +0100 Subject: [PATCH] feat: force HotReload after grace period instead of blocking indefinitely Signed-off-by: Peter Wilcsinszky --- ...ogging.banzaicloud.io_fluentbitagents.yaml | 2 + .../crds/logging.banzaicloud.io_loggings.yaml | 2 + ...ogging.banzaicloud.io_fluentbitagents.yaml | 2 + .../logging.banzaicloud.io_loggings.yaml | 2 + .../logging/tenant-infra-logging.yaml | 15 +++++- .../crds/v1beta1/fluentbit_types.md | 5 ++ pkg/resources/fluentbit/config.go | 3 ++ pkg/resources/fluentbit/configsecret.go | 52 ++++++++++--------- .../logging/api/v1beta1/fluentbit_types.go | 5 ++ 9 files changed, 61 insertions(+), 27 deletions(-) diff --git a/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml b/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml index f104a8afd..eb1ee38ba 100644 --- a/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml +++ b/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml @@ -1386,6 +1386,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml b/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml index f78911431..18b48d72e 100644 --- a/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml +++ b/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml @@ -2433,6 +2433,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml b/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml index f104a8afd..eb1ee38ba 100644 --- a/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml +++ 
b/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml @@ -1386,6 +1386,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/config/crd/bases/logging.banzaicloud.io_loggings.yaml b/config/crd/bases/logging.banzaicloud.io_loggings.yaml index f78911431..18b48d72e 100644 --- a/config/crd/bases/logging.banzaicloud.io_loggings.yaml +++ b/config/crd/bases/logging.banzaicloud.io_loggings.yaml @@ -2433,6 +2433,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml b/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml index 2332fb827..bdbd6ddbd 100644 --- a/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml +++ b/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml @@ -11,7 +11,8 @@ metadata: tenant: infra spec: loggingRef: infra - fluentd: {} + fluentd: + metrics: {} controlNamespace: infra --- apiVersion: logging.banzaicloud.io/v1beta1 @@ -49,8 +50,12 @@ metadata: name: infra spec: loggingRef: infra + # this is required to reload even if there are pending tasks in one of the queues + # requires grace to be set, which is 5 by default + forceHotReloadAfterGrace: true inputTail: storage.type: filesystem + storage.pause_on_chunks_overlimit: "off" positiondb: hostPath: path: "" @@ -59,9 +64,15 @@ spec: path: "" network: connectTimeout: 2 + keepaliveMaxRecycle: 20 metrics: {} + bufferStorage: + storage.max_chunks_up: 10 + forwardOptions: + storage.total_limit_size: 50MB image: - tag: 2.2.2-debug + tag: 3.1.10-debug + configHotReload: {} --- apiVersion: logging.banzaicloud.io/v1beta1 kind: LoggingRoute diff --git a/docs/configuration/crds/v1beta1/fluentbit_types.md b/docs/configuration/crds/v1beta1/fluentbit_types.md index d58b705d9..4ab34baa6 100644 --- 
a/docs/configuration/crds/v1beta1/fluentbit_types.md +++ b/docs/configuration/crds/v1beta1/fluentbit_types.md @@ -122,6 +122,11 @@ Set the flush time in seconds.nanoseconds. The engine loop uses a Flush timeout Default: 1 +### forceHotReloadAfterGrace (bool, optional) {#fluentbitspec-forcehotreloadaftergrace} + +HotReload pauses all inputs and waits until they finish. In certain situations this is unacceptable, for example if an output is down for an extended period. An undocumented option called "Hot_Reload.Ensure_Thread_Safety Off" can be set in the [SERVICE] section of the config to force the hot reload after the grace period. Please note that it might result in a SIGSEGV, but in the worst case the kubelet will restart the container. See https://github.com/fluent/fluent-bit/pull/7509 + + ### forwardOptions (*ForwardOptions, optional) {#fluentbitspec-forwardoptions} diff --git a/pkg/resources/fluentbit/config.go b/pkg/resources/fluentbit/config.go index fd76c7e00..ca86d020e 100644 --- a/pkg/resources/fluentbit/config.go +++ b/pkg/resources/fluentbit/config.go @@ -26,6 +26,9 @@ var fluentBitConfigTemplate = ` [SERVICE] Flush {{ .Flush }} Grace {{ .Grace }} + {{- if .ForceHotReloadAfterGrace }} + Hot_Reload.Ensure_Thread_Safety off + {{- end }} Daemon Off Log_Level {{ .LogLevel }} Parsers_File {{ .DefaultParsers }} diff --git a/pkg/resources/fluentbit/configsecret.go b/pkg/resources/fluentbit/configsecret.go index 7bc12bdca..b3636c6d6 100644 --- a/pkg/resources/fluentbit/configsecret.go +++ b/pkg/resources/fluentbit/configsecret.go @@ -64,23 +64,24 @@ type fluentBitConfig struct { Port int32 Path string } - Flush int32 - Grace int32 - LogLevel string - CoroStackSize int32 - Output map[string]string - Input fluentbitInputConfig - Inputs []fluentbitInputConfigWithTenant - DisableKubernetesFilter bool - KubernetesFilter map[string]string - AwsFilter map[string]string - BufferStorage map[string]string - FilterModify []v1beta1.FilterModify - FluentForwardOutput *fluentForwardOutputConfig - SyslogNGOutput
*syslogNGOutputConfig - DefaultParsers string - CustomParsers string - HealthCheck *v1beta1.HealthCheck + Flush int32 + Grace int32 + LogLevel string + CoroStackSize int32 + Output map[string]string + ForceHotReloadAfterGrace bool + Input fluentbitInputConfig + Inputs []fluentbitInputConfigWithTenant + DisableKubernetesFilter bool + KubernetesFilter map[string]string + AwsFilter map[string]string + BufferStorage map[string]string + FilterModify []v1beta1.FilterModify + FluentForwardOutput *fluentForwardOutputConfig + SyslogNGOutput *syslogNGOutputConfig + DefaultParsers string + CustomParsers string + HealthCheck *v1beta1.HealthCheck } type fluentForwardOutputConfig struct { @@ -213,14 +214,15 @@ func (r *Reconciler) configSecret() (runtime.Object, reconciler.DesiredState, er } input := fluentBitConfig{ - Flush: r.fluentbitSpec.Flush, - Grace: r.fluentbitSpec.Grace, - LogLevel: r.fluentbitSpec.LogLevel, - CoroStackSize: r.fluentbitSpec.CoroStackSize, - Namespace: r.Logging.Spec.ControlNamespace, - DisableKubernetesFilter: disableKubernetesFilter, - FilterModify: r.fluentbitSpec.FilterModify, - HealthCheck: r.fluentbitSpec.HealthCheck, + Flush: r.fluentbitSpec.Flush, + Grace: r.fluentbitSpec.Grace, + ForceHotReloadAfterGrace: r.fluentbitSpec.ForceHotReloadAfterGrace, + LogLevel: r.fluentbitSpec.LogLevel, + CoroStackSize: r.fluentbitSpec.CoroStackSize, + Namespace: r.Logging.Spec.ControlNamespace, + DisableKubernetesFilter: disableKubernetesFilter, + FilterModify: r.fluentbitSpec.FilterModify, + HealthCheck: r.fluentbitSpec.HealthCheck, } input.DefaultParsers = fmt.Sprintf("%s/%s", StockConfigPath, "parsers.conf") diff --git a/pkg/sdk/logging/api/v1beta1/fluentbit_types.go b/pkg/sdk/logging/api/v1beta1/fluentbit_types.go index d57ff5449..dd7262685 100644 --- a/pkg/sdk/logging/api/v1beta1/fluentbit_types.go +++ b/pkg/sdk/logging/api/v1beta1/fluentbit_types.go @@ -76,6 +76,11 @@ type FluentbitSpec struct { Flush int32 `json:"flush,omitempty" plugin:"default:1"` // Set
the grace time in seconds as Integer value. The engine loop uses a Grace timeout to define wait time on exit. Grace int32 `json:"grace,omitempty" plugin:"default:5"` + // HotReload pauses all inputs and waits until they finish. In certain situations this is unacceptable, for example if an output is down for an extended period. + // An undocumented option called "Hot_Reload.Ensure_Thread_Safety Off" can be set in the [SERVICE] section of the config to force the hot reload after the grace period. + // Please note that it might result in a SIGSEGV, but in the worst case the kubelet will restart the container. + // See https://github.com/fluent/fluent-bit/pull/7509 + ForceHotReloadAfterGrace bool `json:"forceHotReloadAfterGrace,omitempty"` // Set the logging verbosity level. Allowed values are: error, warn, info, debug and trace. Values are accumulative, e.g: if 'debug' is set, it will include error, warning, info and debug. Note that trace mode is only available if Fluent Bit was built with the WITH_TRACE option enabled. LogLevel string `json:"logLevel,omitempty" plugin:"default:info"` // Set the coroutines stack size in bytes. The value must be greater than the page size of the running system. Don't set too small value (say 4096), or coroutine threads can overrun the stack buffer.