From c36b8efbd3f9752c1779f739e0f3c1c374b379f7 Mon Sep 17 00:00:00 2001 From: Ryan Turner Date: Fri, 9 Feb 2024 15:50:04 -0800 Subject: [PATCH] Log partial selectors from workload attestation on context cancellation (#4846) * Log partial selectors from workload attestation on context cancellation In the case when a client hangs up a connection to SPIRE Agent during workload attestation (e.g. timeout, process dies, etc.) while the agent is still discovering selectors from workload attestation plugins, it can be helpful to log whatever selectors the agent could find for debugging purposes. Today, only the pid is captured in logs in this case, which is often not enough information to trace back the caller since: - The caller process may have already died (e.g. crashlooping on startup) - pid alone is only useful if you can query the state of running processes on the host around the time of workload attestation (often not practical) Logging the partial set of selectors fetched by the agent before the caller hung up can help diagnose affected workloads in case a workload attestation plugin is misbehaving or a downstream dependency required by the plugin is down/not responding with accurate information. Signed-off-by: Ryan Turner --- pkg/agent/attestor/workload/workload.go | 19 +++- pkg/agent/attestor/workload/workload_test.go | 89 ++++++++++++++++--- pkg/agent/endpoints/peertracker.go | 5 +- pkg/agent/endpoints/peertracker_test.go | 6 +- pkg/common/telemetry/names.go | 3 + .../fakeworkloadattestor/timeout_attestor.go | 32 +++++++ 6 files changed, 133 insertions(+), 21 deletions(-) create mode 100644 test/fakes/fakeworkloadattestor/timeout_attestor.go diff --git a/pkg/agent/attestor/workload/workload.go b/pkg/agent/attestor/workload/workload.go index 96e9edda2d..3525c47f50 100644 --- a/pkg/agent/attestor/workload/workload.go +++ b/pkg/agent/attestor/workload/workload.go @@ -18,7 +18,7 @@ type attestor struct { } type Attestor interface { - Attest(ctx context.Context, pid int) []*common.Selector + Attest(ctx context.Context, pid int) ([]*common.Selector, error) } func New(config *Config) Attestor { @@ -26,6 +26,10 @@ func New(config *Config) Attestor { } func newAttestor(config *Config) *attestor { + if config.selectorHook == nil { + config.selectorHook = func([]*common.Selector) {} + } + return &attestor{c: config} } @@ -33,11 +37,14 @@ type Config struct { Catalog catalog.Catalog Log logrus.FieldLogger Metrics telemetry.Metrics + + // Test hook called when selectors are obtained from a workload attestor plugin + selectorHook func([]*common.Selector) } // Attest invokes all workload attestor plugins against the provided PID. If an error // is encountered, it is logged and selectors from the failing plugin are discarded. -func (wla *attestor) Attest(ctx context.Context, pid int) []*common.Selector { +func (wla *attestor) Attest(ctx context.Context, pid int) ([]*common.Selector, error) { counter := telemetry_workload.StartAttestationCall(wla.c.Metrics) defer counter.Done(nil) @@ -63,8 +70,14 @@ func (wla *attestor) Attest(ctx context.Context, pid int) []*common.Selector { select { case s := <-sChan: selectors = append(selectors, s...) + wla.c.selectorHook(selectors) case err := <-errChan: log.WithError(err).Error("Failed to collect all selectors for PID") + case <-ctx.Done(): + // If the client times out before all workload attestation plugins have reported selectors or an error, + // it can be helpful to see the partial set of selectors discovered for debugging purposes. + log.WithField(telemetry.PartialSelectors, selectors).Error("Timed out collecting selectors for PID") + return nil, ctx.Err() } } @@ -76,7 +89,7 @@ func (wla *attestor) Attest(ctx context.Context, pid int) []*common.Selector { if pid != os.Getpid() { log.WithField(telemetry.Selectors, selectors).Debug("PID attested to have selectors") } - return selectors + return selectors, nil } // invokeAttestor invokes attestation against the supplied plugin. Should be called from a goroutine. diff --git a/pkg/agent/attestor/workload/workload_test.go b/pkg/agent/attestor/workload/workload_test.go index 1339cf6a75..554e9879d8 100644 --- a/pkg/agent/attestor/workload/workload_test.go +++ b/pkg/agent/attestor/workload/workload_test.go @@ -3,8 +3,10 @@ package attestor import ( "context" "errors" + "fmt" "testing" + "github.com/sirupsen/logrus" "github.com/sirupsen/logrus/hooks/test" "github.com/spiffe/spire/pkg/common/telemetry" telemetry_workload "github.com/spiffe/spire/pkg/common/telemetry/agent/workloadapi" @@ -25,15 +27,15 @@ var ( attestor1Pids = map[int32][]string{ 1: nil, - 2: []string{"bar"}, + 2: {"bar"}, // 3: attestor1 cannot attest process 3 - 4: []string{"bar"}, + 4: {"bar"}, } attestor2Pids = map[int32][]string{ 1: nil, 2: nil, - 3: []string{"baz"}, - 4: []string{"baz"}, + 3: {"baz"}, + 4: {"baz"}, } ) @@ -44,12 +46,14 @@ func TestWorkloadAttestor(t *testing.T) { type WorkloadAttestorTestSuite struct { suite.Suite - attestor *attestor - catalog *fakeagentcatalog.Catalog + attestor *attestor + catalog *fakeagentcatalog.Catalog + loggerHook *test.Hook } func (s *WorkloadAttestorTestSuite) SetupTest() { - log, _ := test.NewNullLogger() + log, hook := test.NewNullLogger() + s.loggerHook = hook s.catalog = fakeagentcatalog.New() s.attestor = newAttestor(&Config{ @@ -66,19 +70,23 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkload() { ) // both attestors succeed but with no selectors - selectors := s.attestor.Attest(ctx, 1) + selectors, err := s.attestor.Attest(ctx, 1) + s.Assert().Nil(err) s.Empty(selectors) // attestor1 has selectors, but not attestor2 - selectors = s.attestor.Attest(ctx, 2) + selectors, err = s.attestor.Attest(ctx, 2) + s.Assert().Nil(err) spiretest.AssertProtoListEqual(s.T(), selectors1, selectors) // attestor2 has selectors, attestor1 fails - selectors = s.attestor.Attest(ctx, 3) + selectors, err = s.attestor.Attest(ctx, 3) + s.Assert().Nil(err) spiretest.AssertProtoListEqual(s.T(), selectors2, selectors) // both have selectors - selectors = s.attestor.Attest(ctx, 4) + selectors, err = s.attestor.Attest(ctx, 4) + s.Assert().Nil(err) util.SortSelectors(selectors) combined := make([]*common.Selector, 0, len(selectors1)+len(selectors2)) combined = append(combined, selectors1...) @@ -97,7 +105,8 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkloadMetrics() { metrics := fakemetrics.New() s.attestor.c.Metrics = metrics - selectors := s.attestor.Attest(ctx, 2) + selectors, err := s.attestor.Attest(ctx, 2) + s.Assert().Nil(err) // Create expected metrics expected := fakemetrics.New() @@ -114,12 +123,13 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkloadMetrics() { s.attestor.c.Metrics = metrics // No selectors expected - selectors = s.attestor.Attest(ctx, 3) + selectors, err = s.attestor.Attest(ctx, 3) + s.Assert().Nil(err) s.Empty(selectors) // Create expected metrics with error key expected = fakemetrics.New() - err := errors.New("some error") + err = errors.New("some error") attestorCounter = telemetry_workload.StartAttestorCall(expected, "fake1") attestorCounter.Done(&err) telemetry_workload.AddDiscoveredSelectorsSample(expected, float32(0)) @@ -128,3 +138,54 @@ func (s *WorkloadAttestorTestSuite) TestAttestWorkloadMetrics() { s.Require().Equal(expected.AllMetrics(), metrics.AllMetrics()) } + +func (s *WorkloadAttestorTestSuite) TestAttestLogsPartialSelectorsOnContextCancellation() { + pid := 4 + selectorC := make(chan []*common.Selector, 1) + s.attestor.c.selectorHook = func(selectors []*common.Selector) { + selectorC <- selectors + } + + pluginC := make(chan struct{}, 1) + // Add one attestor that provides selectors and another that doesn't return before the test context is cancelled + s.catalog.SetWorkloadAttestors( + fakeworkloadattestor.New(s.T(), "fake1", attestor1Pids), + fakeworkloadattestor.NewTimeoutAttestor(s.T(), "faketimeoutattestor", pluginC), + ) + + defer func() { + // Unblock attestor that is blocking on channel + pluginC <- struct{}{} + }() + + attestCh := make(chan struct{}, 1) + ctx, cancel := context.WithCancel(context.Background()) + var selectors []*common.Selector + var attestErr error + go func(innerCtx context.Context, pid int) { + selectors, attestErr = s.attestor.Attest(innerCtx, pid) + attestCh <- struct{}{} + }(ctx, pid) + + // Wait for one of the plugins to return selectors + partialSelectors := <-selectorC + + // Cancel context to simulate caller hanging up in the middle of workload attestation + cancel() + + // Wait for attestation goroutine to complete execution + <-attestCh + + s.Assert().Nil(selectors) + s.Assert().Error(attestErr) + spiretest.AssertLogs(s.T(), s.loggerHook.AllEntries(), []spiretest.LogEntry{ + { + Level: logrus.ErrorLevel, + Message: "Timed out collecting selectors for PID", + Data: logrus.Fields{ + telemetry.PartialSelectors: fmt.Sprint(partialSelectors), + telemetry.PID: fmt.Sprint(pid), + }, + }, + }) +} diff --git a/pkg/agent/endpoints/peertracker.go b/pkg/agent/endpoints/peertracker.go index 0124953730..c4aed5ab10 100644 --- a/pkg/agent/endpoints/peertracker.go +++ b/pkg/agent/endpoints/peertracker.go @@ -20,7 +20,10 @@ func (a PeerTrackerAttestor) Attest(ctx context.Context) ([]*common.Selector, er return nil, status.Error(codes.Internal, "peer tracker watcher missing from context") } - selectors := a.Attestor.Attest(ctx, int(watcher.PID())) + selectors, err := a.Attestor.Attest(ctx, int(watcher.PID())) + if err != nil { + return nil, err + } // Ensure that the original caller is still alive so that we know we didn't // attest some other process that happened to be assigned the original PID diff --git a/pkg/agent/endpoints/peertracker_test.go b/pkg/agent/endpoints/peertracker_test.go index 89d4777d80..3f559c2ccc 100644 --- a/pkg/agent/endpoints/peertracker_test.go +++ b/pkg/agent/endpoints/peertracker_test.go @@ -37,11 +37,11 @@ func TestPeerTrackerAttestor(t *testing.T) { type FakeAttestor struct{} -func (a FakeAttestor) Attest(_ context.Context, pid int) []*common.Selector { +func (a FakeAttestor) Attest(_ context.Context, pid int) ([]*common.Selector, error) { if pid == os.Getpid() { - return []*common.Selector{{Type: "Type", Value: "Value"}} + return []*common.Selector{{Type: "Type", Value: "Value"}}, nil } - return nil + return nil, nil } func WithFakeWatcher(alive bool) context.Context { diff --git a/pkg/common/telemetry/names.go b/pkg/common/telemetry/names.go index 70f1a9ad8d..88e13c83e6 100644 --- a/pkg/common/telemetry/names.go +++ b/pkg/common/telemetry/names.go @@ -381,6 +381,9 @@ const ( // ParentID tags parent ID for an entry ParentID = "parent_id" + // PartialSelectors is a partial set of selectors for a workload + PartialSelectors = "partial_selectors" + // Path declares some logic path, likely on the file system Path = "path" diff --git a/test/fakes/fakeworkloadattestor/timeout_attestor.go b/test/fakes/fakeworkloadattestor/timeout_attestor.go new file mode 100644 index 0000000000..e3b4bb243b --- /dev/null +++ b/test/fakes/fakeworkloadattestor/timeout_attestor.go @@ -0,0 +1,32 @@ +package fakeworkloadattestor + +import ( + "context" + "testing" + + workloadattestorv1 "github.com/spiffe/spire-plugin-sdk/proto/spire/plugin/agent/workloadattestor/v1" + "github.com/spiffe/spire/pkg/agent/plugin/workloadattestor" + "github.com/spiffe/spire/pkg/common/catalog" + "github.com/spiffe/spire/test/plugintest" +) + +func NewTimeoutAttestor(t *testing.T, name string, c chan struct{}) workloadattestor.WorkloadAttestor { + server := workloadattestorv1.WorkloadAttestorPluginServer(&timeoutWorkloadAttestor{ + c: c, + }) + wa := new(workloadattestor.V1) + plugintest.Load(t, catalog.MakeBuiltIn(name, server), wa) + return wa +} + +type timeoutWorkloadAttestor struct { + workloadattestorv1.UnimplementedWorkloadAttestorServer + + c chan struct{} +} + +func (twa *timeoutWorkloadAttestor) Attest(_ context.Context, _ *workloadattestorv1.AttestRequest) (*workloadattestorv1.AttestResponse, error) { + // Block on channel until test sends signal + <-twa.c + return &workloadattestorv1.AttestResponse{}, nil +}