From c16a2c9607b01a8488c65427bfba50b3d1c789e0 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Fri, 9 Feb 2024 12:31:38 +0100 Subject: [PATCH 1/2] [KNI] log: add the logtracr logger Introduce the logtracr logger. The logtracr logger builds upon and augments the existing klog (+logr) infrastructure to provide per-object (vs per-component) log. This logger implement a simplistic, in-process lookalike of a tracing logger, whose concepts (spans and traces) kinda remind of opentelemetry's concepts. In a nutshell, the logger aims to make it possible/easier to correlate all the logs pertaining to a container during a scheduling cycle For full design details please check the upcoming `pkg-kni/logtracr/DESIGN.md`. Signed-off-by: Francesco Romani --- cmd/noderesourcetopology-plugin/main.go | 9 +- go.mod | 2 +- pkg-kni/logtracr/dump.go | 90 ++++++++++ pkg-kni/logtracr/dump_test.go | 55 ++++++ pkg-kni/logtracr/logcache.go | 101 +++++++++++ pkg-kni/logtracr/logcache_test.go | 122 ++++++++++++++ pkg-kni/logtracr/logid.go | 86 ++++++++++ pkg-kni/logtracr/logid_test.go | 213 ++++++++++++++++++++++++ pkg-kni/logtracr/logtracr.go | 111 ++++++++++++ pkg-kni/logtracr/setup.go | 112 +++++++++++++ 10 files changed, 898 insertions(+), 3 deletions(-) create mode 100644 pkg-kni/logtracr/dump.go create mode 100644 pkg-kni/logtracr/dump_test.go create mode 100644 pkg-kni/logtracr/logcache.go create mode 100644 pkg-kni/logtracr/logcache_test.go create mode 100644 pkg-kni/logtracr/logid.go create mode 100644 pkg-kni/logtracr/logid_test.go create mode 100644 pkg-kni/logtracr/logtracr.go create mode 100644 pkg-kni/logtracr/setup.go diff --git a/cmd/noderesourcetopology-plugin/main.go b/cmd/noderesourcetopology-plugin/main.go index efaa9ba74..f751a594d 100644 --- a/cmd/noderesourcetopology-plugin/main.go +++ b/cmd/noderesourcetopology-plugin/main.go @@ -17,6 +17,7 @@ limitations under the License. 
package main import ( + "context" "math/rand" "os" "time" @@ -33,6 +34,7 @@ import ( _ "sigs.k8s.io/scheduler-plugins/apis/config/scheme" knifeatures "sigs.k8s.io/scheduler-plugins/pkg-kni/features" + knilogger "sigs.k8s.io/scheduler-plugins/pkg-kni/logtracr" knistatus "sigs.k8s.io/scheduler-plugins/pkg-kni/pfpstatus" ) @@ -41,8 +43,11 @@ func main() { rand.Seed(time.Now().UnixNano()) - logh := klogr.NewWithOptions(klogr.WithFormat(klogr.FormatKlog)) - + logh, ok := knilogger.Setup(context.Background()) + if !ok { + // fully disabled, restore classic flow + logh = klogr.NewWithOptions(klogr.WithFormat(klogr.FormatKlog)) + } knistatus.Setup(logh) // Register custom plugins to the scheduler framework. diff --git a/go.mod b/go.mod index 5cfd13508..c67d63489 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/diktyo-io/networktopology-api v1.0.1-alpha github.com/dustin/go-humanize v1.0.1 github.com/go-logr/logr v1.2.4 + github.com/go-logr/stdr v1.2.2 github.com/google/go-cmp v0.5.9 github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.1 github.com/k8stopologyawareschedwg/podfingerprint v0.2.2 @@ -51,7 +52,6 @@ require ( github.com/felixge/httpsnoop v1.0.3 // indirect github.com/francoispqt/gojay v1.2.13 // indirect github.com/fsnotify/fsnotify v1.6.0 // indirect - github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.19.6 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.22.3 // indirect diff --git a/pkg-kni/logtracr/dump.go b/pkg-kni/logtracr/dump.go new file mode 100644 index 000000000..4f6b63261 --- /dev/null +++ b/pkg-kni/logtracr/dump.go @@ -0,0 +1,90 @@ +/* + * Copyright 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package logtracr + +import ( + "bytes" + "context" + "os" + "path/filepath" + "strings" + "time" + + "github.com/go-logr/logr" +) + +func RunForever(ctx context.Context, logger logr.Logger, interval time.Duration, baseDirectory string, lc *LogCache) { + // let's try to keep the amount of code we do in init() at minimum. + // This may happen if the container didn't have the directory mounted + discard := !existsBaseDirectory(baseDirectory) + if discard { + logger.Info("base directory not found, will discard everything", "baseDirectory", baseDirectory) + } + + delta := interval - 10*time.Millisecond // TODO + logger.Info("dump loop info", "interval", interval, "delta", delta) + + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case now := <-ticker.C: + // keep the size at bay by popping old data even if we just discard it + expireds := lc.PopExpired(now, delta) + for _, expired := range expireds { + if discard { + continue + } + // intentionally swallow error. 
+ // - if we hit issues and we do log (V=2, like), we will clog the regular log + // - if we hit issues and we do NOT log (v=5, like) we will not see it anyway + DumpLogNode(baseDirectory, expired.logID, expired.data) + } + if len(expireds) > 0 { + logger.V(4).Info("processed logs", "entries", len(expireds), "stored", !discard) + } + } + } +} + +func DumpLogNode(statusDir string, logName string, data bytes.Buffer) error { + logName = fixLogName(logName) + dst, err := os.OpenFile(filepath.Join(statusDir, logName+".log"), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + return err + } + if _, err := dst.Write(data.Bytes()); err != nil { + dst.Close() // swallow error because we want to bubble up the write error + return err + } + return dst.Close() +} + +func existsBaseDirectory(baseDir string) bool { + info, err := os.Stat(baseDir) + if err != nil { + return false + } + return info.IsDir() +} + +func fixLogName(name string) string { + return strings.ReplaceAll(name, "/", "__") +} diff --git a/pkg-kni/logtracr/dump_test.go b/pkg-kni/logtracr/dump_test.go new file mode 100644 index 000000000..3cbd5c97c --- /dev/null +++ b/pkg-kni/logtracr/dump_test.go @@ -0,0 +1,55 @@ +/* + * Copyright 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package logtracr + +import ( + "bytes" + "os" + "path/filepath" + "testing" +) + +func TestDumpLogNodeAppendsData(t *testing.T) { + dir, err := os.MkdirTemp("", "logtracr-dump-data") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(dir) // clean up + + buf1 := bytes.NewBufferString("fizzbuzz\n") + err = DumpLogNode(dir, "foo-test", *buf1) + if err != nil { + t.Fatal(err) + } + + buf2 := bytes.NewBufferString("foobar\n") + err = DumpLogNode(dir, "foo-test", *buf2) + if err != nil { + t.Fatal(err) + } + + data, err := os.ReadFile(filepath.Join(dir, "foo-test.log")) + if err != nil { + t.Fatal(err) + } + + got := string(data) + expected := "fizzbuzz\nfoobar\n" + if got != expected { + t.Errorf("read error\ngot=[%s]\nexp=[%s]", got, expected) + } +} diff --git a/pkg-kni/logtracr/logcache.go b/pkg-kni/logtracr/logcache.go new file mode 100644 index 000000000..66bd05cff --- /dev/null +++ b/pkg-kni/logtracr/logcache.go @@ -0,0 +1,101 @@ +/* + * Copyright 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package logtracr + +import ( + "bytes" + "sync" + "time" +) + +type SpanBlob struct { + logID string + data bytes.Buffer + lastUpdate time.Time +} + +func (ln *SpanBlob) IsExpired(now time.Time, delta time.Duration) bool { + return now.Sub(ln.lastUpdate) >= delta +} + +type TimeFunc func() time.Time + +type LogCache struct { + mutex sync.Mutex + // map logID -> data + spanBlobs map[string]*SpanBlob + timeFunc TimeFunc +} + +func NewLogCache(timeFunc TimeFunc) *LogCache { + return &LogCache{ + spanBlobs: make(map[string]*SpanBlob), + timeFunc: timeFunc, + } +} + +func (lc *LogCache) Put(logID, data string) error { + lc.mutex.Lock() + defer lc.mutex.Unlock() + buf := lc.bufferFor(logID) + _, err := buf.WriteString(data) + return err +} + +func (lc *LogCache) PopExpired(now time.Time, delta time.Duration) []*SpanBlob { + ret := []*SpanBlob{} + lc.mutex.Lock() + defer lc.mutex.Unlock() + for logID, SpanBlob := range lc.spanBlobs { + if !SpanBlob.IsExpired(now, delta) { + continue + } + ret = append(ret, SpanBlob) + delete(lc.spanBlobs, logID) + } + return ret +} + +// Get is (mostly) meant for testing purposes +func (lc *LogCache) Get(logID string) (string, bool) { + lc.mutex.Lock() + defer lc.mutex.Unlock() + node, ok := lc.spanBlobs[logID] + if !ok { + return "", false + } + return node.data.String(), true +} + +// Len is (mostly) meant for testing purposes +func (lc *LogCache) Len() int { + lc.mutex.Lock() + defer lc.mutex.Unlock() + return len(lc.spanBlobs) +} + +func (lc *LogCache) bufferFor(logID string) *bytes.Buffer { + node, ok := lc.spanBlobs[logID] + if !ok { + node = &SpanBlob{ + logID: logID, + } + lc.spanBlobs[logID] = node + } + node.lastUpdate = lc.timeFunc() + return &node.data +} diff --git a/pkg-kni/logtracr/logcache_test.go b/pkg-kni/logtracr/logcache_test.go new file mode 100644 index 000000000..ad447a99e --- /dev/null +++ b/pkg-kni/logtracr/logcache_test.go @@ -0,0 +1,122 @@ +/* + * Copyright 2023 Red Hat, Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package logtracr + +import ( + "testing" + "time" +) + +func TestEmpty(t *testing.T) { + ft := FakeTime{} + lc := NewLogCache(ft.Now) + sz := lc.Len() + if sz > 0 { + t.Errorf("unexpected len > 0: %v", sz) + } +} + +func TestPutGet(t *testing.T) { + ft := FakeTime{} + lc := NewLogCache(ft.Now) + err := lc.Put("foo", "fizzbuzz") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + got, ok := lc.Get("foo") + if !ok || got != "fizzbuzz" { + t.Fatalf("unexpected value: %v (ok=%v)", got, ok) + } +} + +func TestMultiKeyPutGet(t *testing.T) { + ft := FakeTime{} + lc := NewLogCache(ft.Now) + keys := []string{"foo", "bar", "baz", "buz", "abc"} + for _, key := range keys { + err := lc.Put(key, "fizzbuzz") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + for _, key := range keys { + got, ok := lc.Get(key) + if !ok || got != "fizzbuzz" { + t.Fatalf("unexpected value: %v (ok=%v)", got, ok) + } + } + sz := lc.Len() + if sz != len(keys) { + t.Errorf("unexpected len: %d expected: %d", sz, len(keys)) + } +} + +func TestAppendGet(t *testing.T) { + ft := FakeTime{} + lc := NewLogCache(ft.Now) + for _, data := range []string{"fizz", "buzz", "fizz", "buzz", "fizz"} { + err := lc.Put("foo", data) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + got, ok := lc.Get("foo") + if !ok || got != "fizzbuzzfizzbuzzfizz" { + t.Fatalf("unexpected value: %v (ok=%v)", got, ok) + } + + sz 
:= lc.Len() + if sz != 1 { + t.Errorf("unexpected len: %v", sz) + } +} + +func TestMultiKeyAppendGet(t *testing.T) { + ft := FakeTime{} + lc := NewLogCache(ft.Now) + keys := []string{"foo", "bar", "baz", "buz", "abc"} + expected := make(map[string]string) + + for _, data := range []string{"fizz", "buzz", "fizz", "buzz", "fizz"} { + for _, key := range keys { + err := lc.Put(key, data+key) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + expected[key] += data + key + } + } + for _, key := range keys { + got, ok := lc.Get(key) + if !ok || got != expected[key] { + t.Fatalf("unexpected value for key %q: %v (ok=%v)", key, got, ok) + } + } + + sz := lc.Len() + if sz != len(keys) { + t.Errorf("unexpected len: %d expected: %d", sz, len(keys)) + } +} + +type FakeTime struct { + TS time.Time +} + +func (ft FakeTime) Now() time.Time { + return ft.TS +} diff --git a/pkg-kni/logtracr/logid.go b/pkg-kni/logtracr/logid.go new file mode 100644 index 000000000..9b1933a37 --- /dev/null +++ b/pkg-kni/logtracr/logid.go @@ -0,0 +1,86 @@ +/* + * Copyright 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package logtracr + +const ( + logIDKey = "logID" +) + +func StartsWithLogID(kvList ...interface{}) (string, bool) { + if len(kvList) < 2 { + return "", false + } + return isLogIDPair(kvList[0], kvList[1]) +} + +func FindLogID(values []interface{}) (string, bool) { + if len(values) < 2 || len(values)%2 != 0 { + return "", false // should never happen + } + for i := 0; i < len(values); i += 2 { + if vs, ok := isLogIDPair(values[i], values[i+1]); ok { + return vs, ok + } + } + return "", false +} + +func GetLogID(values []interface{}, kvList ...interface{}) (string, bool) { + // quick check because we look at most at 2 values + if len(kvList) >= 2 { + if vs, ok := isLogIDPair(kvList[0], kvList[1]); ok { + return vs, ok + } + } + if len(values) < 2 || len(values)%2 != 0 { + return "", false // should never happen + } + for i := 0; i < len(values); i += 2 { + if vs, ok := isLogIDPair(values[i], values[i+1]); ok { + return vs, ok + } + } + return "", false +} + +func HasLogIDKey(vals []interface{}) bool { + if len(vals) < 2 || len(vals)%2 != 0 { + return false + } + for i := 0; i < len(vals); i += 2 { + if isLogIDKey(vals[i]) { + return true + } + } + return false +} + +func isLogIDPair(key, val interface{}) (string, bool) { + if !isLogIDKey(key) { + return "", false + } + vs, ok := val.(string) + return vs, ok +} + +func isLogIDKey(val interface{}) bool { + v, ok := val.(string) + if !ok { + return false + } + return v == logIDKey +} diff --git a/pkg-kni/logtracr/logid_test.go b/pkg-kni/logtracr/logid_test.go new file mode 100644 index 000000000..fdc6569cb --- /dev/null +++ b/pkg-kni/logtracr/logid_test.go @@ -0,0 +1,213 @@ +/* + * Copyright 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package logtracr + +import ( + "testing" +) + +func TestHasLogIDKey(t *testing.T) { + tests := []struct { + name string + values []interface{} + expected bool + }{ + { + name: "nil", + values: nil, + expected: false, + }, + { + name: "empty", + values: []interface{}{}, + expected: false, + }, + { + name: "missing val", + values: []interface{}{"logID"}, + expected: false, + }, + { + name: "missing key", + values: []interface{}{"foobar"}, + expected: false, + }, + { + name: "minimal", + values: []interface{}{"logID", "FOO"}, + expected: true, + }, + { + name: "uneven", + values: []interface{}{"logID", "FOO", "BAR"}, + expected: false, + }, + { + name: "multikey", + values: []interface{}{"logID", "AAA", "fizz", "buzz"}, + expected: true, + }, + { + name: "multikey-mispell", + values: []interface{}{"logid", "AAA", "fizz", "buzz"}, + expected: false, + }, + { + name: "multikey-mistype", + values: []interface{}{"logID", 12345, "fizz", "buzz"}, + expected: true, + }, + { + name: "ok-not-first", + values: []interface{}{"foo", "bar", "logID", "AAA", "fizz", "buzz"}, + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := HasLogIDKey(tt.values) + if got != tt.expected { + t.Errorf("values=%v got=[%v] expected=[%v]", tt.values, got, tt.expected) + } + }) + } +} + +func TestGetLogID(t *testing.T) { + tests := []struct { + name string + values []interface{} // always even (ensured by other API contracts) + kvList []interface{} + expectedID string + expectedOK bool + }{ + { + name: "nil", + values: nil, + 
kvList: []interface{}{}, + expectedID: "", + expectedOK: false, + }, + { + name: "empty", + values: []interface{}{}, + kvList: []interface{}{}, + expectedID: "", + expectedOK: false, + }, + { + name: "missing val", + values: []interface{}{}, + kvList: []interface{}{"logID"}, + expectedID: "", + expectedOK: false, + }, + { + name: "missing key", + values: []interface{}{}, + kvList: []interface{}{"foobar"}, + expectedID: "", + expectedOK: false, + }, + { + name: "minimal", + values: []interface{}{}, + kvList: []interface{}{"logID", "FOO"}, + expectedID: "FOO", + expectedOK: true, + }, + { + name: "uneven", + values: []interface{}{}, + kvList: []interface{}{"logID", "FOO", "BAR"}, + // from the limited perspective of getting logID, this is OK + expectedID: "FOO", + expectedOK: true, + }, + { + name: "multikey", + values: []interface{}{}, + kvList: []interface{}{"logID", "AAA", "fizz", "buzz"}, + expectedID: "AAA", + expectedOK: true, + }, + { + name: "ok-not-first", + values: []interface{}{}, + // not the first to save search time, can be changed in the future + kvList: []interface{}{"foo", "bar", "logID", "BBB", "fizz", "buzz"}, + expectedID: "", + expectedOK: false, + }, + { + name: "missing-both", + values: []interface{}{"alpha", "1", "beta", "2"}, + // not the first to save search time, can be changed in the future + kvList: []interface{}{"foo", "bar", "fizz", "buzz"}, + expectedID: "", + expectedOK: false, + }, + { + name: "values-ok-not-first", + values: []interface{}{"alpha", "1", "logID", "BBB", "beta", "2"}, + kvList: []interface{}{"foo", "bar", "fizz", "buzz"}, + expectedID: "BBB", + expectedOK: true, + }, + { + name: "kvList-prevails", + values: []interface{}{"logID", "values", "nodeName", "localhost"}, + kvList: []interface{}{"logID", "kvList", "foo", "bar", "fizz", "buzz"}, + expectedID: "kvList", + expectedOK: true, + }, + { + name: "kvList-mislpaleced", + values: []interface{}{"first", "taken", "logID", "values", "nodeName", "localhost"}, + kvList: 
[]interface{}{"useless", "value", "logID", "kvList", "foo", "bar", "fizz", "buzz"}, + expectedID: "values", + expectedOK: true, + }, + { + name: "kvList-mistype", + values: []interface{}{"alpha", "1", "beta", "2"}, + kvList: []interface{}{"logID", 12345, "fizz", "buzz"}, + expectedID: "", + expectedOK: false, + }, + { + name: "kvList-mistype-full", + values: []interface{}{"alpha", "1", "beta", "2"}, + kvList: []interface{}{123, 45, "fizz", "buzz"}, + expectedID: "", + expectedOK: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, ok := GetLogID(tt.values, tt.kvList...) + if ok != tt.expectedOK { + t.Fatalf("values=%v kvList=%v OK got=[%v] expected=[%v]", tt.values, tt.kvList, ok, tt.expectedOK) + } + if got != tt.expectedID { + t.Errorf("values=%v kvList=%v ID got=[%v] expected=[%v]", tt.values, tt.kvList, got, tt.expectedID) + } + }) + } +} diff --git a/pkg-kni/logtracr/logtracr.go b/pkg-kni/logtracr/logtracr.go new file mode 100644 index 000000000..be9c8a96b --- /dev/null +++ b/pkg-kni/logtracr/logtracr.go @@ -0,0 +1,111 @@ +/* + * Copyright 2019 The logr Authors. + * Copyright 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Derived from https://github.com/go-logr/stdr/blob/v1.2.2/stdr.go + */ + +package logtracr + +import ( + "github.com/go-logr/logr" + "github.com/go-logr/logr/funcr" + "github.com/go-logr/stdr" +) + +type logtracr struct { + funcr.Formatter + std stdr.StdLogger + traces *LogCache + logID string + hasLogID bool + verbose int +} + +func New(std stdr.StdLogger, lc *LogCache, verbose int, opts stdr.Options) logr.Logger { + sl := &logtracr{ + Formatter: funcr.NewFormatter(funcr.Options{ + LogCaller: funcr.MessageClass(opts.LogCaller), + }), + std: std, + traces: lc, + verbose: verbose, + } + + // For skipping our own logger.Info/Error. + sl.Formatter.AddCallDepth(1 + opts.Depth) + + return logr.New(sl) +} + +func (l logtracr) Enabled(level int) bool { + return l.verbose >= level || l.logID != "" +} + +func (l logtracr) Info(level int, msg string, kvList ...interface{}) { + prefix, args := l.FormatInfo(level, msg, kvList) + if prefix != "" { + args = prefix + ": " + args + } + l.store(args, kvList) + // we can end up here because either we have enough verbosiness + // OR because stored logID. So we must redo this check. 
+ if l.verbose < level { + return + } + _ = l.std.Output(l.Formatter.GetDepth()+1, args) +} + +func (l logtracr) Error(err error, msg string, kvList ...interface{}) { + prefix, args := l.FormatError(err, msg, kvList) + if prefix != "" { + args = prefix + ": " + args + } + l.store(args, kvList) + _ = l.std.Output(l.Formatter.GetDepth()+1, args) +} + +func (l logtracr) WithName(name string) logr.LogSink { + l.Formatter.AddName(name) + return &l +} + +func (l logtracr) WithValues(kvList ...interface{}) logr.LogSink { + l.logID, l.hasLogID = FindLogID(kvList) + l.Formatter.AddValues(kvList) + return &l +} + +func (l logtracr) WithCallDepth(depth int) logr.LogSink { + l.Formatter.AddCallDepth(depth) + return &l +} + +func (l logtracr) store(args string, kvList ...interface{}) { + if l.traces == nil { + return + } + logID, ok := l.logID, l.hasLogID + if !ok { + logID, ok = StartsWithLogID(kvList...) + } + if !ok { + return + } + l.traces.Put(logID, args) // ignore error intentionally +} + +var _ logr.LogSink = &logtracr{} +var _ logr.CallDepthLogSink = &logtracr{} diff --git a/pkg-kni/logtracr/setup.go b/pkg-kni/logtracr/setup.go new file mode 100644 index 000000000..f86d6d110 --- /dev/null +++ b/pkg-kni/logtracr/setup.go @@ -0,0 +1,112 @@ +/* + * Copyright 2023 Red Hat, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package logtracr + +import ( + "context" + "encoding/json" + "log" + "os" + "strconv" + "time" + + "github.com/go-logr/logr" + "github.com/go-logr/stdr" + "k8s.io/klog/v2" +) + +const ( + LogTracrDirEnvVar string = "LOGTRACR_DUMP_DIR" + LogTracrIntervalEnvVar string = "LOGTRACR_DUMP_INTERVAL" + LogTracrVerboseEnvVar string = "LOGTRACR_VERBOSE" +) + +type Config struct { + Verbose int `json:"verbose"` + DumpInterval time.Duration `json:"dumpInterval"` + DumpDirectory string `json:"dumpDirectory"` +} + +type Params struct { + Conf Config + Timestamper TimeFunc +} + +func Setup(ctx context.Context) (logr.Logger, bool) { + backend := log.New(os.Stderr, "", log.Ldate|log.Ltime|log.Lmicroseconds|log.Lshortfile) + logh := stdr.New(backend) + + dumpDir, ok := os.LookupEnv(LogTracrDirEnvVar) + if !ok || dumpDir == "" { + logh.Info("disabled", "reason", "directory", "variableFound", ok, "valueGiven", dumpDir != "") + return logh, false + } + + val, ok := os.LookupEnv(LogTracrIntervalEnvVar) + if !ok { + logh.Info("disabled", "reason", "interval", "variableFound", ok) + return logh, false + } + interval, err := time.ParseDuration(val) + if err != nil { + logh.Info("cannot parse", "interval", val, "error", err) + return logh, false + } + if interval == 0 { + return logh, false + } + + verb, ok := os.LookupEnv(LogTracrVerboseEnvVar) + if !ok { + logh.Info("disabled", "reason", "verbose", "variableFound", ok) + return logh, false + } + verbose, err := strconv.Atoi(verb) + if err != nil { + logh.Info("cannot parse", "verbose", verb, "error", err) + return logh, false + } + + return SetupWithParams(ctx, backend, Params{ + Conf: Config{ + Verbose: verbose, + DumpInterval: interval, + DumpDirectory: dumpDir, + }, + Timestamper: time.Now, + }), true +} + +func SetupWithParams(ctx context.Context, backend *log.Logger, params Params) logr.Logger { + sink := stdr.New(backend) + sink.Info("starting", "configuration", toJSON(params.Conf)) + + traces := 
NewLogCache(params.Timestamper) + + klog.SetLogger(New(backend, traces, params.Conf.Verbose, stdr.Options{})) + go RunForever(ctx, sink, params.Conf.DumpInterval, params.Conf.DumpDirectory, traces) + + return sink +} + +func toJSON(obj interface{}) string { + data, err := json.Marshal(obj) + if err != nil { + return "" + } + return string(data) +} From 10e6573c90e1d0441754f35e344fd8d1e5a906f7 Mon Sep 17 00:00:00 2001 From: Francesco Romani Date: Wed, 6 Mar 2024 13:26:16 +0100 Subject: [PATCH 2/2] [KNI] doc: logtracr: add design document add design/arch document how logtracr works and the implementation choices. Signed-off-by: Francesco Romani --- pkg-kni/logtracr/DESIGN.md | 196 +++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 pkg-kni/logtracr/DESIGN.md diff --git a/pkg-kni/logtracr/DESIGN.md b/pkg-kni/logtracr/DESIGN.md new file mode 100644 index 000000000..09a697846 --- /dev/null +++ b/pkg-kni/logtracr/DESIGN.md @@ -0,0 +1,196 @@ +# Taming the scheduler logging +Owner: fromani@redhat.com + +## Summary +Logging is not a solved problem in a complex system, especially in a complex distributed system. +Focusing on the kubernetes ecosystem, the most common experienced pain points are excessive or +insufficient verbosiness, which in turn creates the need to change the verbosiness level during +the component lifetime. +This is because keeping the verbosiness high will create a large amount of logs, while keeping +it low will make it way harder to troubleshoot an issue without increase the verbosiness before, +restarting the affected components and re-create the issue, which can take time and effort. + +The scheduler logs are affected by all these issues. Keeping the log level high is, as it stands +today (March 2024), still discouraged and impractical. 
The matter is further complicated by the +fact that the NUMA-aware scheduler is a new component which takes novel approaches, out of necessity, +and whose behavior is still under scrutiny. So it is especially important to have enough +data to troubleshoot issues, which once again calls for high verbosiness. + +We would like to improve the current flow, which is basically to keep verbosiness=2, and in case +of incidents (but note: always after the fact), bump the verbosiness to 4 or more, +reproduce again, send logs. + +## Motivation +We want to improve the supportability of the NUMA-aware scheduler. Having detailed logs is key +to troubleshooting this component, because it is new and takes a novel approach (in the k8s ecosystem) +due to the characteristics of the problem space. Having detailed logs is thus a key enabler to +reduce the support cycle, or to make support possible at all. + +The work described here explicitly targets the NUMA-aware scheduler plugin, which is a very small +subset of the code running in a (secondary) scheduler process. +We have to trust the k8s ecosystem to get insights about all the frameworks used in the +NUMA-aware scheduler process. + +We believe this is a fair trade off because the k8s framework is very battle tested and has a +huge ecosystem backing it. Out of practicality, we cannot land nontrivial changes in that codebase. +Furthermore, most of the novel code is contained in the NUMA-aware scheduler plugin portion, +so focusing on this area for extra logging seems the sweet spot. + + +## Goals +- Make it possible/easier to correlate all the logs pertaining to a container during a scheduling cycle +- Improve the signal-to-noise ratio of the logs, filtering out all the content not relevant for + the NUMA-aware scheduling +- Avoid excessive storage consumption +- Minimize the slowdown caused by logging + +## Non-Goals +- Change the logging system (e.g. migrate away from klog) +- Introduce a replacement logger (e.g.
module with klog-like API)
+- Break source code compatibility (e.g. no changes to the scheduler plugins source code)
+- Move to traces (independent effort not mutually exclusive)
+- Make the verbosiness tunable at runtime (no consensus about how to do this securely and safely,
+  will require a new logging package)
+
+## Proposal
+- Introduce and use extensively a logID key/value pair to enable correlation of all the log entries
+  pertaining to a scheduling cycle, or to a scheduling activity in general
+- Introduce a new logging backend plugging into the logr framework, which klog >= 2.100 fully supports
+- Let the new logging backend handle the logging demultiplexing
+  - Aggregate the logs per-object
+
+## Risks and Mitigations
+
+- injecting a new logging mechanism which overrides the main setting can introduce bugs which can lead
+  to lost logs or process crashing. The mitigation is to make sure that all the new code is completely isolated
+  and can be completely and easily switched off and disabled.
+- overriding the verbosity setting and storing extra logs on disk can cause storage exhaustion.
+  The mitigation is to set strong upper bounds on the extra storage and to implement a builtin log rotation mechanism.
+- injecting a new logging mechanism means doing all the logging twice, which has a performance cost.
+  The mitigation is optimizing the code path, keeping the extra log storage in volatile memory for speed reasons
+  (tmpfs) but ultimately the proposed approach WILL still have a performance impact.
+
+## Design details
+
+### mapping concepts to OpenTelemetry
+
+Where feasible, we try to mimic as closely as possible, or plainly to adopt, the same concepts and terminology
+as [OpenTelemetry](https://opentelemetry.io/docs/concepts/signals/traces/) (otel).
+
+### custom logging backend
+
+First and foremost, we need to filter the logs using different criteria. Not just the global verbosity anymore,
+but using a combination of global verbosity and flow matching. 
In other words we want to introduce two log sinks:
+1. standard log sink: all the logs from all the flows are filtered by verbosity
+2. NUMA-specific log sink: all the logs from all the flows are filtered by verbosity and some flowID, and grouped by the same flowID.
+A `flowID` would be roughly equivalent to an otel `trace_id`.
+
+A "flow ID" is any token which is unique per-flow and per-object. For example, every pod during a scheduling cycle should
+have its own unique flowID. The value of flowID is not relevant.
+
+The log package accepts a Logger implementation that will be used as backing implementation of the traditional klog log calls
+using the `stdr` interface. See `klog.SetLogger` for details. Thus, we can keep the usual klog interface without changes,
+but inject our custom logic cleanly.
+
+Note however that klog is doing the verbosity check, so we need first and foremost to bump the default klog verbosity
+so klog will pass all the messages to our backend, and then do the real filtering in our backend.
+Unfortunately, client-go has some klog calls baked in, so the klog verbosity is better kept lower than 6.
+In order to convey all the settings to our logging backend, a separate config option set is needed.
+We initially use environment variables, but configuration files or a new command line flag are also options.
+Environment variables or configuration files are the most backward compatible ways, because the rest of the process
+can just safely ignore them. 

+
+```
++------------------+
+| Application code |
++------------------+
+  |
+  | calls
+  V
++-----------------+
+| klog (frontend) |<-- -v=MAX (overridden) gather all logs, passing through the verbosiness check
++-----------------+
+  |
+  | calls
+  V
++-----------------+
+|logtracr(backend)|<-- verbosiness value copied from -v - real verbosiness filter
++-----------------+
+  |      +------------------+
+  +-------------->| groups by flowID |<- sends to different storage
+  V      +------------------+
+  /---------------/ | |
+ / main log / | |
+/---------------/ V V
+ /--------------/ /-------------/
+ / flowID=A log / / flowID=X log /
+ /--------------/ /--------------/
+```
+
+### grouping log entries by flowID
+
+Once all the logs reach our backend, we can filter them using the above criteria (real user-provided verbosiness,
+and flowID). Note the configuration agent (numaresources-operator for example) is in charge of:
+1. forcing the klog verbosiness (-`v`) to the maximum allowed (typically 5)
+2. moving the real user-specified verbosiness setting into the backend-specific config setting.
+
+In order to determine `flowID` the logging backend needs some cooperation from the application layer.
+The application layer is already using a `logID`/`xxx` key/value pair consistently across all the logs
+for the scheduler plugin. This was an independent decision predating the introduction of the custom logging.
+Having a consistent key makes it easy to grep the general log to fetch all the messages pertaining to a flow,
+so it's a good addition (and completely transparent to the application logic) anyway.
+
+The custom logging backend can leverage `logID` as `flowID`. We hook into the `InfoS` and `ErrorS` implementations
+(which a `stdr`-conformant logging backend must implement anyway) and if one of the key/value pairs is `logID`,
+the backend marks the log entry as belonging to the flow matching the `logID` value. 

+For example, the line
+```
+I0312 10:36:40.113054 1 filter.go:155] "final verdict" logID="numalign-pod-sdmxg/numalign" node="node01.lab.eng.k8s.lan" suitable=true
+```
+would belong to the flowID `numalign-pod-sdmxg/numalign`.
+Entries extracted this way and sent to further processing would be the rough equivalent of otel's `span`s.
+
+### multiple storage sinks
+
+Once we can bind log entries to opaque `flowID` we can group them and send to multiple sinks.
+The logging backend bifurcates: log entries are sent to the general logging according to the user-specified verbosiness rules,
+and they are _also_ sent to the per-flowID storage.
+
+The per-flowID storage is a collection of memory buffers to which log-lines are appended.
+The memory buffers are monitored for both their number and individual sizes to prevent excessive memory usage.
+A last-updated timestamp is recorded for each memory buffer. If a buffer grows too much, if there are too many active
+buffers or if a buffer is not updated for longer than a configurable threshold, the buffers are sent to durable storage and the
+memory is freed.
+
+These entries (otel `span`s) are aggregated into the rough equivalent of an otel `trace`.
+
+### per-flow logging storage
+
+In order to minimize the slowdown caused by the extra work and the extra writes the custom backend is performing,
+the recommended setup for the durable storage is actually not durable, but rather a tmpfs.
+Another key benefit of the tmpfs as storage backend is that we can trivially set a hard cap for the storage size,
+preventing storage space exhaustion. This is especially important to minimize the side effects and contain the
+blast radius of the logging backend failures.
+
+Since the per-flow storage space is bounded, the logging backend must tolerate (and ignore) write failures,
+because this is an expected, albeit unlikely, occurrence. 

+It is worth stressing that the per-flow storage augments, not replaces, the main log, so augmentation failures
+are not critical.
+
+### future extensions
+
+#### builtin log rotation
+We can add a simple log rotation mechanism in the logging backend. Log entries older than a configurable threshold
+can be deleted to make room for the new entries. It's often preferable to have newer entries rather than old ones.
+
+## Discarded alternatives
+
+### use an off-the-shelf 3rd party logging package
+
+At the moment of writing, we are not aware of such a package. If a comparable alternative is found, we will run an evaluation
+and a case study about replacing the in-house solution as soon as possible.
+
+### replacing the core logging package
+
+This would require the logging package to be replaced in kubernetes, which is practically impossible.