From bdd86f750023ff985c3c197dccfbc857d64d134e Mon Sep 17 00:00:00 2001 From: Peter Wilcsinszky Date: Tue, 17 Dec 2024 14:29:19 +0100 Subject: [PATCH] feat: force HotReload after grace period instead of blocking indefinitely Signed-off-by: Peter Wilcsinszky --- ...ogging.banzaicloud.io_fluentbitagents.yaml | 2 + .../logging.banzaicloud.io_loggings.yaml | 2 + ...ogging.banzaicloud.io_fluentbitagents.yaml | 2 + .../crds/logging.banzaicloud.io_loggings.yaml | 2 + ...ogging.banzaicloud.io_fluentbitagents.yaml | 2 + .../logging.banzaicloud.io_loggings.yaml | 2 + .../logging/tenant-infra-logging.yaml | 15 ++++- .../crds/v1beta1/fluentbit_types.md | 5 ++ pkg/resources/fluentbit/config.go | 3 + pkg/resources/fluentbit/configsecret.go | 56 ++++++++++--------- .../logging/api/v1beta1/fluentbit_types.go | 5 ++ 11 files changed, 68 insertions(+), 28 deletions(-) diff --git a/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_fluentbitagents.yaml b/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_fluentbitagents.yaml index 2b13fc0e2..2e23e6653 100644 --- a/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_fluentbitagents.yaml +++ b/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_fluentbitagents.yaml @@ -1480,6 +1480,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_loggings.yaml b/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_loggings.yaml index 66caf6f99..d49824251 100644 --- a/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_loggings.yaml +++ b/charts/logging-operator/charts/logging-operator-crds/templates/logging.banzaicloud.io_loggings.yaml @@ -2325,6 +2325,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml b/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml index 7d0262f59..da8d22be9 100644 --- a/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml +++ b/charts/logging-operator/crds/logging.banzaicloud.io_fluentbitagents.yaml @@ -1477,6 +1477,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml b/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml index f36867851..3da7d651d 100644 --- a/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml +++ b/charts/logging-operator/crds/logging.banzaicloud.io_loggings.yaml @@ -2322,6 +2322,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml b/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml index 7d0262f59..da8d22be9 100644 --- a/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml +++ b/config/crd/bases/logging.banzaicloud.io_fluentbitagents.yaml @@ -1477,6 +1477,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/config/crd/bases/logging.banzaicloud.io_loggings.yaml b/config/crd/bases/logging.banzaicloud.io_loggings.yaml index f36867851..3da7d651d 100644 --- a/config/crd/bases/logging.banzaicloud.io_loggings.yaml +++ b/config/crd/bases/logging.banzaicloud.io_loggings.yaml @@ -2322,6 +2322,8 @@ spec: flush: format: int32 type: integer + forceHotReloadAfterGrace: + type: boolean forwardOptions: properties: Require_ack_response: diff --git a/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml b/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml index b58e6667b..bdbd6ddbd 100644 --- a/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml +++ b/config/samples/multitenant-routing/logging/tenant-infra-logging.yaml @@ -11,7 +11,8 @@ metadata: tenant: infra spec: loggingRef: infra - fluentd: {} + fluentd: + metrics: {} controlNamespace: infra --- apiVersion: logging.banzaicloud.io/v1beta1 @@ -49,8 +50,12 @@ metadata: name: infra spec: loggingRef: infra + # this is required to reload even if there are pending tasks in one of the queues + # requires grace to be set, which is 5 by default + forceHotReloadAfterGrace: true inputTail: storage.type: filesystem + storage.pause_on_chunks_overlimit: "off" positiondb: hostPath: path: "" @@ -59,7 +64,15 @@ spec: path: "" network: connectTimeout: 2 + keepaliveMaxRecycle: 20 metrics: {} + bufferStorage: + storage.max_chunks_up: 10 + forwardOptions: + storage.total_limit_size: 50MB + image: + tag: 3.1.10-debug + configHotReload: {} --- apiVersion: logging.banzaicloud.io/v1beta1 kind: LoggingRoute diff --git a/docs/configuration/crds/v1beta1/fluentbit_types.md b/docs/configuration/crds/v1beta1/fluentbit_types.md index 623322e8b..12913ac11 100644 --- a/docs/configuration/crds/v1beta1/fluentbit_types.md +++ b/docs/configuration/crds/v1beta1/fluentbit_types.md @@ -128,6 +128,11 @@ Set the flush time in seconds.nanoseconds. The engine loop uses a Flush timeout Default: 1 +### forceHotReloadAfterGrace (bool, optional) {#fluentbitspec-forcehotreloadaftergrace} + +HotReload pauses all inputs and waits until they finish. In certain situations this is unacceptable, for example if an output is down for a longer time. An undocumented option called "Hot_Reload.Ensure_Thread_Safety Off" can be used at the [SERVICE] config to force hotreload after the grace period. Please note that it might result in a SIGSEGV, but worst case kubelet will restart the container. See https://github.com/fluent/fluent-bit/pull/7509 + + ### forwardOptions (*ForwardOptions, optional) {#fluentbitspec-forwardoptions} diff --git a/pkg/resources/fluentbit/config.go b/pkg/resources/fluentbit/config.go index 8514141e7..15d02c5e2 100644 --- a/pkg/resources/fluentbit/config.go +++ b/pkg/resources/fluentbit/config.go @@ -26,6 +26,9 @@ var fluentBitConfigTemplate = ` [SERVICE] Flush {{ .Flush }} Grace {{ .Grace }} + {{- if .ForceHotReloadAfterGrace }} + Hot_Reload.Ensure_Thread_Safety off + {{- end }} Daemon Off Log_Level {{ .LogLevel }} Parsers_File {{ .DefaultParsers }} diff --git a/pkg/resources/fluentbit/configsecret.go b/pkg/resources/fluentbit/configsecret.go index 729a06fb8..ad5e418d8 100644 --- a/pkg/resources/fluentbit/configsecret.go +++ b/pkg/resources/fluentbit/configsecret.go @@ -64,24 +64,25 @@ type fluentBitConfig struct { Port int32 Path string } - Flush int32 - Grace int32 - LogLevel string - EnabledIPv6 bool - CoroStackSize int32 - Output map[string]string - Input fluentbitInputConfig - Inputs []fluentbitInputConfigWithTenant - DisableKubernetesFilter bool - KubernetesFilter map[string]string - AwsFilter map[string]string - BufferStorage map[string]string - FilterModify []v1beta1.FilterModify - FluentForwardOutput *fluentForwardOutputConfig - SyslogNGOutput *syslogNGOutputConfig - DefaultParsers string - CustomParsers string - HealthCheck *v1beta1.HealthCheck + Flush int32 + Grace int32 + LogLevel string + EnabledIPv6 bool + CoroStackSize int32 + Output map[string]string + ForceHotReloadAfterGrace bool + Input fluentbitInputConfig + Inputs []fluentbitInputConfigWithTenant + DisableKubernetesFilter bool + KubernetesFilter map[string]string + AwsFilter map[string]string + BufferStorage map[string]string + FilterModify []v1beta1.FilterModify + FluentForwardOutput *fluentForwardOutputConfig + SyslogNGOutput *syslogNGOutputConfig + DefaultParsers string + CustomParsers string + HealthCheck *v1beta1.HealthCheck } type fluentForwardOutputConfig struct { @@ -214,15 +215,16 @@ func (r *Reconciler) configSecret() (runtime.Object, reconciler.DesiredState, er } input := fluentBitConfig{ - Flush: r.fluentbitSpec.Flush, - Grace: r.fluentbitSpec.Grace, - LogLevel: r.fluentbitSpec.LogLevel, - EnabledIPv6: r.fluentbitSpec.EnabledIPv6, - CoroStackSize: r.fluentbitSpec.CoroStackSize, - Namespace: r.Logging.Spec.ControlNamespace, - DisableKubernetesFilter: disableKubernetesFilter, - FilterModify: r.fluentbitSpec.FilterModify, - HealthCheck: r.fluentbitSpec.HealthCheck, + Flush: r.fluentbitSpec.Flush, + Grace: r.fluentbitSpec.Grace, + ForceHotReloadAfterGrace: r.fluentbitSpec.ForceHotReloadAfterGrace, + LogLevel: r.fluentbitSpec.LogLevel, + EnabledIPv6: r.fluentbitSpec.EnabledIPv6, + CoroStackSize: r.fluentbitSpec.CoroStackSize, + Namespace: r.Logging.Spec.ControlNamespace, + DisableKubernetesFilter: disableKubernetesFilter, + FilterModify: r.fluentbitSpec.FilterModify, + HealthCheck: r.fluentbitSpec.HealthCheck, } input.DefaultParsers = fmt.Sprintf("%s/%s", StockConfigPath, "parsers.conf") diff --git a/pkg/sdk/logging/api/v1beta1/fluentbit_types.go b/pkg/sdk/logging/api/v1beta1/fluentbit_types.go index b672e180b..d931c4d40 100644 --- a/pkg/sdk/logging/api/v1beta1/fluentbit_types.go +++ b/pkg/sdk/logging/api/v1beta1/fluentbit_types.go @@ -77,6 +77,11 @@ type FluentbitSpec struct { Flush int32 `json:"flush,omitempty" plugin:"default:1"` // Set the grace time in seconds as Integer value. The engine loop uses a Grace timeout to define wait time on exit. Grace int32 `json:"grace,omitempty" plugin:"default:5"` + // HotReload pauses all inputs and waits until they finish. In certain situations this is unacceptable, for example if an output is down for a longer time. + // An undocumented option called "Hot_Reload.Ensure_Thread_Safety Off" can be used at the [SERVICE] config to force hotreload after the grace period. + // Please note that it might result in a SIGSEGV, but worst case kubelet will restart the container. + // See https://github.com/fluent/fluent-bit/pull/7509 + ForceHotReloadAfterGrace bool `json:"forceHotReloadAfterGrace,omitempty"` // Set the logging verbosity level. Allowed values are: error, warn, info, debug and trace. Values are accumulative, e.g: if 'debug' is set, it will include error, warning, info and debug. Note that trace mode is only available if Fluent Bit was built with the WITH_TRACE option enabled. LogLevel string `json:"logLevel,omitempty" plugin:"default:info"` // Set the coroutines stack size in bytes. The value must be greater than the page size of the running system. Don't set too small value (say 4096), or coroutine threads can overrun the stack buffer.