From e5e98221d950bbcc265840792a5bec7408922489 Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Fri, 19 Jul 2024 15:19:03 -0700 Subject: [PATCH 1/7] Misc fixes for key sync --- enclave.go | 111 +++++++++++++++++++++++++++++--------------- enclave_test.go | 25 ++++++++++ handlers.go | 29 ------------ handlers_test.go | 54 --------------------- main.go | 7 ++- proxy.go | 7 +-- sync_shared.go | 4 +- sync_worker.go | 47 ++++++++++++++----- sync_worker_test.go | 20 +++++++- util.go | 109 ++++++++++++++++++++++++++----------------- util_test.go | 54 ++++++++++++++++++++- 11 files changed, 283 insertions(+), 184 deletions(-) diff --git a/enclave.go b/enclave.go index c6a6e38..85e5ddb 100644 --- a/enclave.go +++ b/enclave.go @@ -16,6 +16,7 @@ import ( "net/http/httputil" _ "net/http/pprof" "net/url" + "strconv" "sync" "time" @@ -65,20 +66,22 @@ var ( // Enclave represents a service running inside an AWS Nitro Enclave. type Enclave struct { attester - sync.Mutex // Guard syncState. - cfg *Config - syncState int - extPubSrv, extPrivSrv *http.Server - intSrv *http.Server - promSrv *http.Server - revProxy *httputil.ReverseProxy - hashes *AttestationHashes - promRegistry *prometheus.Registry - metrics *metrics - workers *workerManager - keys *enclaveKeys - httpsCert *certRetriever - ready, stop chan struct{} + sync.Mutex // Guard syncState. + cfg *Config + syncState int + extPubSrv, extPrivSrv *http.Server + intSrv *http.Server + promSrv *http.Server + revProxy *httputil.ReverseProxy + hashes *AttestationHashes + promRegistry *prometheus.Registry + metrics *metrics + workers *workerManager + keys *enclaveKeys + httpsCert *certRetriever + appReady, networkReady, stop chan struct{} + heartbeatActive bool + myHostname string } // Config represents the configuration of our enclave service. @@ -88,7 +91,7 @@ type Config struct { // is required. 
FQDN string - // FQDNLeader contains the fully qualified domain name of the leader + // FQDNLeader contains the fully qualified domain name and port of the leader // enclave, which coordinates enclave synchronization. Only set this field // if horizontal scaling is required. FQDNLeader string @@ -133,6 +136,10 @@ type Config struct { // metrics. Consider setting this to your application's name. PrometheusNamespace string + // Port of the host IP provider, provided by vsock-relay. + // Only required if key synchronization is enabled. + HostIpProviderPort uint32 + // UseProfiling enables profiling via pprof. Profiling information will be // available at /enclave/debug. Note that profiling data is privacy // sensitive and therefore must not be enabled in production. @@ -245,7 +252,8 @@ func NewEnclave(cfg *Config) (*Enclave, error) { hashes: new(AttestationHashes), workers: newWorkerManager(time.Minute), stop: make(chan struct{}), - ready: make(chan struct{}), + appReady: make(chan struct{}), + networkReady: make(chan struct{}), } // Increase the maximum number of idle connections per host. This is @@ -291,9 +299,8 @@ func NewEnclave(cfg *Config) (*Enclave, error) { // Register enclave-internal HTTP API. 
m = e.intSrv.Handler.(*chi.Mux) if cfg.WaitForApp { - m.Get(pathReady, readyHandler(e.ready)) + m.Get(pathReady, readyHandler(e.appReady)) } - m.Get(pathState, getStateHandler(e.getSyncState, e.keys)) m.Put(pathState, putStateHandler(e.attester, e.getSyncState, e.keys, e.workers)) m.Post(pathHash, hashHandler(e)) @@ -303,7 +310,11 @@ func NewEnclave(cfg *Config) (*Enclave, error) { e.revProxy = httputil.NewSingleHostReverseProxy(cfg.AppWebSrv) e.revProxy.BufferPool = newBufPool() e.revProxy.Transport = customTransport - e.extPubSrv.Handler.(*chi.Mux).Handle(pathProxy, e.revProxy) + extm := e.extPubSrv.Handler.(*chi.Mux) + extm.Handle(pathState, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "Forbidden", http.StatusForbidden) + })) + extm.Handle(pathProxy, e.revProxy) // If we expose Prometheus metrics, we keep track of the HTTP backend's // responses. if cfg.PrometheusPort > 0 { @@ -336,7 +347,7 @@ func (e *Enclave) Start() error { // Set up our networking environment which creates a TAP device that // forwards traffic (via the VSOCK interface) to the EC2 host. - go runNetworking(e.cfg, e.stop) + go runNetworking(e.cfg, e.stop, e.networkReady) // Get an HTTPS certificate. if e.cfg.UseACME { @@ -356,14 +367,24 @@ func (e *Enclave) Start() error { return nil } + elog.Println("Waiting for networking setup...") + <-e.networkReady // Check if we are the leader. if !e.weAreLeader() { + // Get the worker's hostname/IP, so we can give the leader + // enclave contact details for future sync attempts. elog.Println("Obtaining worker's hostname.") - worker := getSyncURL(getHostnameOrDie(), e.cfg.ExtPrivPort) + e.myHostname = getHostnameOrDie(e.cfg.HostIpProviderPort) + worker := getSyncURL(e.myHostname, e.cfg.ExtPrivPort) err = asWorker(e.setupWorkerPostSync, e.attester).registerWith(leader, worker) if err != nil { elog.Fatalf("Error syncing with leader: %v", err) } + } else { + // Get leader app key to share with worker enclaves. 
+ if err = requestAndStoreKeyFromApp(e.cfg.AppWebSrv, e.keys); err != nil { + elog.Fatalf("Failed to retrieve key material from app as leader: %v", err) + } } return nil @@ -424,7 +445,6 @@ func (e *Enclave) weAreLeader() (result bool) { return case <-errChan: elog.Println("Not yet able to talk to leader designation endpoint.") - time.Sleep(time.Second) continue case result = <-areWeLeader: return @@ -442,15 +462,23 @@ func (e *Enclave) weAreLeader() (result bool) { // installing the given enclave keys and starting the heartbeat loop. func (e *Enclave) setupWorkerPostSync(keys *enclaveKeys) error { e.keys.set(keys) + + if err := sendKeyToApp(e.cfg.AppWebSrv, e.keys); err != nil { + return err + } + cert, err := tls.X509KeyPair(keys.NitridingCert, keys.NitridingKey) if err != nil { return err } e.httpsCert.set(&cert) - // Start our heartbeat. - worker := getSyncURL(getHostnameOrDie(), e.cfg.ExtPrivPort) - go e.workerHeartbeat(worker) + if !e.heartbeatActive { + worker := getSyncURL(e.myHostname, e.cfg.ExtPrivPort) + + go e.workerHeartbeat(worker) + e.heartbeatActive = true + } return nil } @@ -516,6 +544,7 @@ func (e *Enclave) workerHeartbeat(worker *url.URL) { // Stop stops the enclave. func (e *Enclave) Stop() error { close(e.stop) + e.heartbeatActive = false if err := e.intSrv.Shutdown(context.Background()); err != nil { return err } @@ -533,11 +562,11 @@ func (e *Enclave) Stop() error { // getExtListener returns a listener for the HTTPS service // via AF_INET or AF_VSOCK. 
-func (e *Enclave) getExtListener() (net.Listener, error) { +func (e *Enclave) getExtListener(port uint16) (net.Listener, error) { if e.cfg.UseVsockForExtPort { - return vsock.Listen(uint32(e.cfg.ExtPubPort), nil) + return vsock.Listen(uint32(port), nil) } else { - return net.Listen("tcp", fmt.Sprintf(":%d", e.cfg.ExtPubPort)) + return net.Listen("tcp", fmt.Sprintf(":%d", port)) } } @@ -562,8 +591,12 @@ func (e *Enclave) startWebServers() error { } }() go func() { + listener, err := e.getExtListener(e.cfg.ExtPrivPort) + if err != nil { + elog.Fatalf("Failed to listen on external port: %v", err) + } elog.Printf("Starting external private Web server at %s.", e.extPrivSrv.Addr) - err := e.extPrivSrv.ListenAndServeTLS("", "") + err = e.extPrivSrv.ServeTLS(listener, "", "") if err != nil && !errors.Is(err, http.ErrServerClosed) { elog.Fatalf("External private Web server error: %v", err) } @@ -572,11 +605,11 @@ func (e *Enclave) startWebServers() error { // If desired, don't launch our Internet-facing Web server until the // application signalled that it's ready. if e.cfg.WaitForApp { - <-e.ready + <-e.appReady elog.Println("Application signalled that it's ready. Starting public Web server.") } - listener, err := e.getExtListener() + listener, err := e.getExtListener(e.cfg.ExtPubPort) if err != nil { elog.Fatalf("Failed to listen on external port: %v", err) } @@ -704,7 +737,7 @@ func (e *Enclave) setCertFingerprint(rawData []byte) error { func (e *Enclave) getLeader(path string) *url.URL { return &url.URL{ Scheme: "https", - Host: fmt.Sprintf("%s:%d", e.cfg.FQDNLeader, e.cfg.ExtPrivPort), + Host: e.cfg.FQDNLeader, Path: path, } } @@ -712,13 +745,15 @@ func (e *Enclave) getLeader(path string) *url.URL { // getWorker takes as input the worker's heartbeat request payload and returns // the worker's URL. 
func (e *Enclave) getWorker(hb *heartbeatRequest) (*url.URL, error) { - var ( - host string - err error - ) - host, _, err = net.SplitHostPort(hb.WorkerHostname) + host, port, err := net.SplitHostPort(hb.WorkerHostname) if err != nil { return nil, err } - return getSyncURL(host, e.cfg.ExtPrivPort), nil + portUint, err := strconv.ParseUint(port, 10, 16) + if err != nil { + return nil, fmt.Errorf("invalid port number: %v", err) + } + portUint16 := uint16(portUint) + + return getSyncURL(host, portUint16), nil } diff --git a/enclave_test.go b/enclave_test.go index 624411e..d5cd64e 100644 --- a/enclave_test.go +++ b/enclave_test.go @@ -1,6 +1,9 @@ package main import ( + "io" + "net/http" + "net/http/httptest" "testing" ) @@ -17,6 +20,12 @@ var defaultCfg = Config{ WaitForApp: true, } +type mockAppRequestInfo struct { + method string + path string + body []byte +} + func assertEqual(t *testing.T, is, should interface{}) { t.Helper() if should != is { @@ -24,6 +33,22 @@ func assertEqual(t *testing.T, is, should interface{}) { } } +func createMockServer(responseBody []byte, mockAppRequests *[]mockAppRequestInfo) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ := io.ReadAll(r.Body) + + *mockAppRequests = append(*mockAppRequests, mockAppRequestInfo{ + method: r.Method, + path: r.URL.Path, + body: body, + }) + w.WriteHeader(http.StatusOK) + if responseBody != nil { + _, _ = w.Write(responseBody) + } + })) +} + func createEnclave(cfg *Config) *Enclave { e, err := NewEnclave(cfg) if err != nil { diff --git a/handlers.go b/handlers.go index 3018a21..f9b46af 100644 --- a/handlers.go +++ b/handlers.go @@ -56,35 +56,6 @@ func rootHandler(cfg *Config) http.HandlerFunc { } } -// getStateHandler returns a handler that lets the enclave application retrieve -// previously-set state. -// -// This is an enclave-internal endpoint that can only be accessed by the -// trusted enclave application. 
-func getStateHandler(getSyncState func() int, keys *enclaveKeys) http.HandlerFunc { - return func(w http.ResponseWriter, r *http.Request) { - switch getSyncState() { - case noSync: - http.Error(w, errKeySyncDisabled.Error(), http.StatusForbidden) - case isLeader: - http.Error(w, errEndpointGone.Error(), http.StatusGone) - case inProgress: - http.Error(w, errDesignationInProgress.Error(), http.StatusServiceUnavailable) - case isWorker: - w.Header().Set("Content-Type", "application/octet-stream") - appKeys := keys.getAppKeys() - n, err := w.Write(appKeys) - if err != nil { - elog.Fatalf("Error writing state to client: %v", err) - } - expected := len(appKeys) - if n != expected { - elog.Fatalf("Only wrote %d out of %d-byte state to client.", n, expected) - } - } - } -} - // putStateHandler returns a handler that lets the enclave application set // state that's synchronized with another enclave in case of horizontal // scaling. The state can be arbitrary bytes. diff --git a/handlers_test.go b/handlers_test.go index 01e0776..10ad2f7 100644 --- a/handlers_test.go +++ b/handlers_test.go @@ -124,34 +124,6 @@ func signalReady(t *testing.T, e *Enclave) { time.Sleep(100 * time.Millisecond) } -func TestGetStateHandler(t *testing.T) { - var keys = newTestKeys(t) - - makeReq := makeReqToHandler(getStateHandler(retState(noSync), keys)) - assertResponse(t, - makeReq(http.MethodGet, pathState, nil), - newResp(http.StatusForbidden, errKeySyncDisabled.Error()), - ) - - makeReq = makeReqToHandler(getStateHandler(retState(isLeader), keys)) - assertResponse(t, - makeReq(http.MethodGet, pathState, nil), - newResp(http.StatusGone, errEndpointGone.Error()), - ) - - makeReq = makeReqToHandler(getStateHandler(retState(isWorker), keys)) - assertResponse(t, - makeReq(http.MethodGet, pathState, nil), - newResp(http.StatusOK, string(keys.getAppKeys())), - ) - - makeReq = makeReqToHandler(getStateHandler(retState(inProgress), keys)) - assertResponse(t, - makeReq(http.MethodGet, pathState, nil), 
- newResp(http.StatusServiceUnavailable, errDesignationInProgress.Error()), - ) -} - func TestPutStateHandler(t *testing.T) { var ( tooLargeKey = make([]byte, maxKeyMaterialLen+1) @@ -193,32 +165,6 @@ func TestPutStateHandler(t *testing.T) { ) } -func TestGetPutStateHandlers(t *testing.T) { - var ( - a = &dummyAttester{} - keys = newTestKeys(t) - appKeys = "application keys" - stop = make(chan struct{}) - workers = newWorkerManager(time.Second) - ) - go workers.start(stop) - defer close(stop) - - // Set application state. - makeReq := makeReqToHandler(putStateHandler(a, retState(isLeader), keys, workers)) - assertResponse(t, - makeReq(http.MethodPut, pathState, strings.NewReader(appKeys)), - newResp(http.StatusOK, ""), - ) - - // Retrieve previously-set application state. - makeReq = makeReqToHandler(getStateHandler(retState(isWorker), keys)) - assertResponse(t, - makeReq(http.MethodGet, pathState, nil), - newResp(http.StatusOK, appKeys), - ) -} - func TestProxyHandler(t *testing.T) { appPage := "foobar" diff --git a/main.go b/main.go index 23f314b..16c47ce 100644 --- a/main.go +++ b/main.go @@ -34,14 +34,14 @@ func init() { func main() { var fqdn, fqdnLeader, appURL, appWebSrv, appCmd, prometheusNamespace, mockCertFp string - var extPubPort, extPrivPort, intPort, hostProxyPort, prometheusPort uint + var extPubPort, extPrivPort, intPort, hostProxyPort, prometheusPort, hostIpProviderPort uint var useACME, waitForApp, useProfiling, useVsockForExtPort, disableKeepAlives, debug bool var err error flag.StringVar(&fqdn, "fqdn", "", "FQDN of the enclave application (e.g., \"example.com\").") flag.StringVar(&fqdnLeader, "fqdn-leader", "", - "FQDN of the leader enclave (e.g., \"leader.example.com\"). Setting this enables key synchronization.") + "FQDN and port of the leader enclave (e.g., \"leader.example.com\"). 
Setting this enables key synchronization.") flag.StringVar(&appURL, "appurl", "", "Code repository of the enclave application (e.g., \"github.com/foo/bar\").") flag.StringVar(&appWebSrv, "appwebsrv", "", @@ -64,6 +64,8 @@ func main() { "Port of proxy application running on EC2 host.") flag.UintVar(&prometheusPort, "prometheus-port", 0, "Port to expose Prometheus metrics at.") + flag.UintVar(&hostIpProviderPort, "host-ip-provider-port", 6161, + "Port of the host IP provider, provided by vsock-relay.") flag.BoolVar(&useProfiling, "profile", false, "Enable pprof profiling. Only useful for debugging and must not be used in production.") flag.BoolVar(&useACME, "acme", false, @@ -108,6 +110,7 @@ func main() { DisableKeepAlives: disableKeepAlives, PrometheusPort: uint16(prometheusPort), PrometheusNamespace: prometheusNamespace, + HostIpProviderPort: uint32(hostIpProviderPort), HostProxyPort: uint32(hostProxyPort), UseACME: useACME, WaitForApp: waitForApp, diff --git a/proxy.go b/proxy.go index 4149e4f..8853af5 100644 --- a/proxy.go +++ b/proxy.go @@ -23,10 +23,10 @@ var ( // runNetworking calls the function that sets up our networking environment. // If anything fails, we try again after a brief wait period. -func runNetworking(c *Config, stop chan struct{}) { +func runNetworking(c *Config, stop chan struct{}, networkReady chan struct{}) { var err error for { - if err = setupNetworking(c, stop); err == nil { + if err = setupNetworking(c, stop, networkReady); err == nil { return } time.Sleep(time.Second) @@ -41,7 +41,7 @@ func runNetworking(c *Config, stop chan struct{}) { // 3. Establish a connection with the proxy running on the host. // 4. Spawn goroutines to forward traffic between the TAP device and the proxy // running on the host. -func setupNetworking(c *Config, stop chan struct{}) error { +func setupNetworking(c *Config, stop chan struct{}, networkReady chan struct{}) error { // Establish connection with the proxy running on the EC2 host. 
endpoint := fmt.Sprintf("vsock://%d:%d/connect", parentCID, c.HostProxyPort) conn, path, err := transport.Dial(endpoint) @@ -90,6 +90,7 @@ func setupNetworking(c *Config, stop chan struct{}) error { go tx(conn, tap, errCh) go rx(conn, tap, errCh) elog.Println("Started goroutines to forward traffic.") + close(networkReady) select { case err := <-errCh: return err diff --git a/sync_shared.go b/sync_shared.go index 719903b..e0ef80a 100644 --- a/sync_shared.go +++ b/sync_shared.go @@ -8,8 +8,8 @@ import ( ) const ( - maxAttstnBodyLen = 1 << 14 // Upper limit for attestation body length. - boxKeyLen = 32 // NaCl box's private and public key length. + maxAttstnBodyLen = 256 * 1024 // Upper limit for attestation body length. + boxKeyLen = 32 // NaCl box's private and public key length. ) var ( diff --git a/sync_worker.go b/sync_worker.go index 5a895f5..371c100 100644 --- a/sync_worker.go +++ b/sync_worker.go @@ -10,6 +10,7 @@ import ( "io" "net/http" "net/url" + "sync" "time" "golang.org/x/crypto/nacl/box" @@ -19,16 +20,24 @@ var ( errInProgress = errors.New("key sync already in progress") errFailedToDecrypt = errors.New("error decrypting enclave keys") errHashNotInAttstn = errors.New("hash of encrypted keys not in attestation document") + + maxInterimStateAgeSeconds = 10.0 ) +type interimSyncState struct { + ephemeralKey *boxKey + nonce nonce + startTime time.Time +} + // workerSync holds the state and code that we need for a one-off sync with a // leader enclave. workerSync implements the http.Handler interface because the // sync protocol requires two endpoints on the worker. type workerSync struct { attester - setupWorker func(*enclaveKeys) error - ephemeralKeys chan *boxKey - nonce chan nonce + setupWorker func(*enclaveKeys) error + interimStateMutex sync.Mutex + interimState *interimSyncState } // asWorker returns a new workerSync object. 
@@ -37,10 +46,8 @@ func asWorker( a attester, ) *workerSync { return &workerSync{ - attester: a, - setupWorker: setupWorker, - nonce: make(chan nonce, 1), - ephemeralKeys: make(chan *boxKey, 1), + attester: a, + setupWorker: setupWorker, } } @@ -104,10 +111,13 @@ func (s *workerSync) ServeHTTP(w http.ResponseWriter, r *http.Request) { func (s *workerSync) initSync(w http.ResponseWriter, r *http.Request) { elog.Println("Received leader's request to initiate key sync.") + s.interimStateMutex.Lock() + defer s.interimStateMutex.Unlock() // There must not be more than one key synchronization attempt at any given // time. Abort if we get another request while key synchronization is still // in progress. - if len(s.ephemeralKeys) > 0 { + if s.interimState != nil && + time.Since(s.interimState.startTime).Seconds() < maxInterimStateAgeSeconds { http.Error(w, errInProgress.Error(), http.StatusTooManyRequests) return } @@ -117,6 +127,7 @@ func (s *workerSync) initSync(w http.ResponseWriter, r *http.Request) { leadersNonce, err := getNonceFromReq(r) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) + return } // Create the worker's nonce and store it in our channel, so we can later @@ -126,7 +137,6 @@ func (s *workerSync) initSync(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusInternalServerError) return } - s.nonce <- workersNonce // Create an ephemeral key that the leader is going to use to encrypt // its enclave keys. @@ -135,7 +145,11 @@ func (s *workerSync) initSync(w http.ResponseWriter, r *http.Request) { http.Error(w, err.Error(), http.StatusInternalServerError) return } - s.ephemeralKeys <- boxKey + s.interimState = &interimSyncState{ + ephemeralKey: boxKey, + nonce: workersNonce, + startTime: time.Now(), + } // Create and return the worker's Base64-encoded attestation document. 
attstnDoc, err := s.createAttstn(&workerAuxInfo{ @@ -183,8 +197,16 @@ func (s *workerSync) finishSync(w http.ResponseWriter, r *http.Request) { return } + s.interimStateMutex.Lock() + defer s.interimStateMutex.Unlock() + + if s.interimState == nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + // Verify attestation document and obtain its auxiliary information. - aux, err := s.verifyAttstn(attstnDoc, <-s.nonce) + aux, err := s.verifyAttstn(attstnDoc, s.interimState.nonce) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return @@ -204,7 +226,8 @@ func (s *workerSync) finishSync(w http.ResponseWriter, r *http.Request) { return } - ephemeralKey := <-s.ephemeralKeys + ephemeralKey := s.interimState.ephemeralKey + s.interimState = nil // Decrypt the leader's enclave keys, which are encrypted with the // public key that we provided earlier. decrypted, ok := box.OpenAnonymous( diff --git a/sync_worker_test.go b/sync_worker_test.go index 8e2d575..9e3ddd7 100644 --- a/sync_worker_test.go +++ b/sync_worker_test.go @@ -33,6 +33,7 @@ func TestSuccessfulRegisterWith(t *testing.T) { w.WriteHeader(http.StatusOK) }), ) + defer srv.Close() leader, err := url.Parse(srv.URL) if err != nil { t.Fatalf("Error creating test server URL: %v", err) @@ -55,11 +56,23 @@ func TestSuccessfulSync(t *testing.T) { // the leader keys. initLeaderKeysCert(t) + var mockAppRequests []mockAppRequestInfo + mockAppServer := createMockServer(nil, &mockAppRequests) + defer mockAppServer.Close() + // Set up the worker. 
- worker := createEnclave(&defaultCfg) + cfg := defaultCfg + appURL, err := url.Parse(mockAppServer.URL) + if err != nil { + t.Fatalf("Error creating mock app test server URL: %v", err) + } + cfg.AppWebSrv = appURL + + worker := createEnclave(&cfg) srv := httptest.NewTLSServer( asWorker(worker.setupWorkerPostSync, &dummyAttester{}), ) + defer srv.Close() workerURL, err := url.Parse(srv.URL) if err != nil { t.Fatalf("Error creating test server URL: %v", err) @@ -69,6 +82,11 @@ func TestSuccessfulSync(t *testing.T) { t.Fatalf("Error syncing with leader: %v", err) } + assertEqual(t, len(mockAppRequests), 1) + assertEqual(t, mockAppRequests[0].method, http.MethodPut) + assertEqual(t, mockAppRequests[0].path, "/enclave/state") + assertEqual(t, string(mockAppRequests[0].body), string(leaderKeys.AppKeys)) + // Make sure that the keys were synced correctly. if !worker.keys.equal(leaderKeys) { t.Fatalf("Keys differ between worker and leader:\n%v (worker)\n%v (leader)", diff --git a/util.go b/util.go index 8478e3c..5485e67 100644 --- a/util.go +++ b/util.go @@ -1,6 +1,7 @@ package main import ( + "bytes" "crypto/ecdsa" "crypto/elliptic" "crypto/rand" @@ -18,14 +19,14 @@ import ( "net/url" "strings" "time" + + "github.com/mdlayher/vsock" ) const ( - // The endpoint of AWS's Instance Metadata Service, which allows an enclave - // to learn its internal hostname: - // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html - metadataSvcToken = "http://169.254.169.254/latest/api/token" - metadataSvcInfo = "http://169.254.169.254/latest/meta-data/local-hostname" + maxIPResponseSize = 32 + maxKeyMaterialSize = 256 * 1024 + maxHostnameReqSeconds = 5 ) var ( @@ -131,10 +132,10 @@ func sliceToNonce(s []byte) (nonce, error) { } // getHostnameOrDie returns the "enclave"'s hostname (or IP address) or dies -// trying. If inside an enclave, we query AWS's Instance Metadata Service. 
If -// outside an enclave, we pick whatever IP address the operating system would +// trying. If inside an enclave, we query the host IP provider, provided by vsock-relay. +// If outside an enclave, we pick whatever IP address the operating system would // choose when talking to a public IP address. -func getHostnameOrDie() (hostname string) { +func getHostnameOrDie(hostIpProviderPort uint32) (hostname string) { defer func() { elog.Printf("Determined our hostname: %s", hostname) }() @@ -147,10 +148,10 @@ func getHostnameOrDie() (hostname string) { // We cannot easily tell when all components are in place to receive // incoming connections. We therefore make five attempts to get our - // hostname from IMDS while waiting for one second in between attempts. + // hostname from the host IP provider while waiting for one second in between attempts. const retries = 5 for i := 0; i < retries; i++ { - hostname, err = getLocalEC2Hostname() + hostname, err = getLocalEC2Hostname(hostIpProviderPort) if err == nil { return } @@ -177,44 +178,27 @@ func getLocalAddr() string { return host } -func getLocalEC2Hostname() (string, error) { - const ( - maxTokenLen = 100 - maxHostnameLen = 255 - ) - // IMDSv2, which we are using, is session-oriented (God knows why), so we - // first obtain a session token from the service. 
- req, err := http.NewRequest(http.MethodPut, metadataSvcToken, nil) - if err != nil { - return "", err - } - req.Header.Set("X-aws-ec2-metadata-token-ttl-seconds", "10") - resp, err := http.DefaultClient.Do(req) +func getLocalEC2Hostname(hostIpProviderPort uint32) (string, error) { + conn, err := vsock.Dial(parentCID, hostIpProviderPort, nil) if err != nil { - return "", err + return "", fmt.Errorf("failed to connect to host ip provider: %w", err) } - body, err := io.ReadAll(newLimitReader(resp.Body, maxTokenLen)) - if err != nil { - return "", err - } - token := string(body) + defer conn.Close() - // Having obtained the session token, we can now make the actual metadata - // request. - req, err = http.NewRequest(http.MethodGet, metadataSvcInfo, nil) - if err != nil { - return "", err - } - req.Header.Set("X-aws-ec2-metadata-token", token) - resp, err = http.DefaultClient.Do(req) + _ = conn.SetDeadline(time.Now().Add(maxHostnameReqSeconds * time.Second)) + + data, err := io.ReadAll(newLimitReader(conn, maxIPResponseSize)) if err != nil { - return "", err + return "", fmt.Errorf("failed to read from host ip provider: %w", err) } - body, err = io.ReadAll(newLimitReader(resp.Body, maxHostnameLen)) - if err != nil { - return "", err + + hostname := strings.TrimSpace(string(data)) + + if hostname == "" { + return "", fmt.Errorf("received empty ip") } - return string(body), nil + + return hostname, nil } func getNonceFromReq(r *http.Request) (nonce, error) { @@ -257,3 +241,44 @@ func makeLeaderRequest(leader *url.URL, ourNonce nonce, areWeLeader chan bool, e } errChan <- fmt.Errorf("leader designation endpoint returned %d", resp.StatusCode) } + +func _getAppStateURL(appWebSrv *url.URL) string { + url := *appWebSrv + url.Path = pathState + return url.String() +} + +func requestAndStoreKeyFromApp(appWebSrv *url.URL, keys *enclaveKeys) error { + resp, err := newUnauthenticatedHTTPClient().Get(_getAppStateURL(appWebSrv)) + if err != nil { + return fmt.Errorf("failed to make 
get state request: %v", err) + } + if resp.StatusCode < 200 && resp.StatusCode >= 300 { + return fmt.Errorf("get state request returned %v", resp.StatusCode) + } + keyMaterial, err := io.ReadAll(newLimitReader(resp.Body, maxKeyMaterialSize)) + if err != nil { + return fmt.Errorf("failed to read state body: %v", err) + } + keys.setAppKeys(keyMaterial) + return nil +} + +func sendKeyToApp(appWebSrv *url.URL, keys *enclaveKeys) error { + keyMaterial := bytes.NewBuffer(keys.getAppKeys()) + req, err := http.NewRequest(http.MethodPut, _getAppStateURL(appWebSrv), keyMaterial) + if err != nil { + return fmt.Errorf("failed to generate request to send key to app: %v", err) + } + + req.Header.Set("Content-Type", "application/octet-stream") + + resp, err := newUnauthenticatedHTTPClient().Do(req) + if err != nil { + return fmt.Errorf("failed to send key to app: %v", err) + } + if resp.StatusCode < 200 && resp.StatusCode >= 300 { + return fmt.Errorf("send key to app request returned %v", resp.StatusCode) + } + return nil +} diff --git a/util_test.go b/util_test.go index 0c834e3..4f18521 100644 --- a/util_test.go +++ b/util_test.go @@ -1,6 +1,12 @@ package main -import "testing" +import ( + "net/http" + "net/url" + "testing" +) + +var mockKey = []byte("mock key material") func TestSliceToNonce(t *testing.T) { var err error @@ -11,3 +17,49 @@ func TestSliceToNonce(t *testing.T) { _, err = sliceToNonce(make([]byte, nonceLen)) assertEqual(t, err, nil) } + +func TestRequestAndStoreKeyFromApp(t *testing.T) { + var mockAppRequests []mockAppRequestInfo + mockServer := createMockServer(mockKey, &mockAppRequests) + defer mockServer.Close() + + appURL, err := url.Parse(mockServer.URL) + if err != nil { + t.Fatalf("Failed to get mock server URL: %v", err) + } + keys := enclaveKeys{} + + err = requestAndStoreKeyFromApp(appURL, &keys) + if err != nil { + t.Fatalf("Request and store request failed: %v", err) + } + + assertEqual(t, len(mockAppRequests), 1) + assertEqual(t, 
mockAppRequests[0].method, http.MethodGet) + assertEqual(t, mockAppRequests[0].path, "/enclave/state") + + assertEqual(t, string(keys.getAppKeys()), string(mockKey)) +} + +func TestSendKeyToApp(t *testing.T) { + var mockAppRequests []mockAppRequestInfo + mockServer := createMockServer(nil, &mockAppRequests) + defer mockServer.Close() + + appURL, err := url.Parse(mockServer.URL) + if err != nil { + t.Fatalf("Failed to get mock server URL: %v", err) + } + keys := &enclaveKeys{} + keys.setAppKeys(mockKey) + + err = sendKeyToApp(appURL, keys) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + assertEqual(t, len(mockAppRequests), 1) + assertEqual(t, mockAppRequests[0].method, http.MethodPut) + assertEqual(t, mockAppRequests[0].path, "/enclave/state") + assertEqual(t, string(mockAppRequests[0].body), string(mockKey)) +} From 971fa775736cc2188999ba6694386ce6c9e80f89 Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Fri, 19 Jul 2024 15:55:32 -0700 Subject: [PATCH 2/7] Update key sync sequence diagram --- doc/key-synchronization.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/key-synchronization.md b/doc/key-synchronization.md index 2a6f137..652e137 100644 --- a/doc/key-synchronization.md +++ b/doc/key-synchronization.md @@ -95,19 +95,20 @@ sequenceDiagram end leader->>leader: Generate HTTPS certificate -leaderApp->>leaderApp: Generate key material Note over leader,worker: Enclaves designate the leader worker->>+leader: GET /enclave/leader (nonce_w) leader-->>-worker: OK + worker->>worker: Did not call itself: worker leader->>leader: GET /enclave/leader (nonce_l) leader->>leader: Did call itself: leader -Note over leaderApp,leader: Application sets its key material -leaderApp->>+leader: PUT /enclave/state (key material) +Note over leaderApp,leader: Enclave prompts key generation + +leader->>+leaderApp: GET /enclave/state +leaderApp-->>-leader: OK leader->>leader: Save key material -leader-->>-leaderApp: OK Note 
over leader,worker: Worker announces itself to leader worker->>+leader: POST /enclave/heartbeat @@ -127,10 +128,9 @@ worker-->>-leader: OK worker->>worker: Install HTTPS certificate -Note over worker,workerApp: Application retrieves key material -workerApp->>+worker: GET /enclave/state -worker->>worker: Retrieve key material -worker-->>-workerApp: OK (key material) +Note over worker,workerApp: Enclave sends key material to app +worker->>+workerApp: PUT /enclave/state +workerApp-->>-worker: OK workerApp->>workerApp: Install key material Note over leader, worker: Worker starts heartbeat loop @@ -141,9 +141,10 @@ loop Heartbeat end Note over leaderApp: Application updates its key material + leaderApp->>+leader: PUT /enclave/state (key material) leader->>leader: Save key material leader-->>-leaderApp: OK note over leader,worker: Leader initiates key re-synchronization as above -``` \ No newline at end of file +``` From 48d6cbb2837e060ba7f01cee2ab314431d88bdb3 Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Mon, 22 Jul 2024 17:32:03 -0700 Subject: [PATCH 3/7] Increase leader check timeout --- enclave.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/enclave.go b/enclave.go index 85e5ddb..d35885d 100644 --- a/enclave.go +++ b/enclave.go @@ -437,14 +437,14 @@ func (e *Enclave) weAreLeader() (result bool) { }, ) - timeout := time.NewTicker(10 * time.Second) + timeout := time.NewTicker(120 * time.Second) for { go makeLeaderRequest(leader, ourNonce, areWeLeader, errChan) select { case <-e.stop: return - case <-errChan: - elog.Println("Not yet able to talk to leader designation endpoint.") + case err = <-errChan: + elog.Printf("Not yet able to talk to leader designation endpoint: %v", err) continue case result = <-areWeLeader: return From ebb3a346c53497b914c9abf5132dd80bf66380e4 Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Thu, 25 Jul 2024 19:46:15 -0700 Subject: [PATCH 4/7] Prometheus fixes: make namespace optional, listen on vsock ---
enclave.go | 6 +++++- main.go | 3 --- metrics.go | 44 ++++++++++++++++++++++++++++++-------------- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/enclave.go b/enclave.go index d35885d..a216a21 100644 --- a/enclave.go +++ b/enclave.go @@ -576,7 +576,11 @@ func (e *Enclave) startWebServers() error { if e.cfg.PrometheusPort > 0 { elog.Printf("Starting Prometheus Web server (%s).", e.promSrv.Addr) go func() { - err := e.promSrv.ListenAndServe() + listener, err := e.getExtListener(e.cfg.PrometheusPort) + if err != nil { + elog.Fatalf("Failed to listen on Prometheus port: %v", err) + } + err = e.promSrv.Serve(listener) if err != nil && !errors.Is(err, http.ErrServerClosed) { elog.Fatalf("Prometheus Web server error: %v", err) } diff --git a/main.go b/main.go index 16c47ce..270fca3 100644 --- a/main.go +++ b/main.go @@ -96,9 +96,6 @@ func main() { if prometheusPort > math.MaxUint16 { elog.Fatalf("-prometheus-port must be in interval [1, %d]", math.MaxUint16) } - if prometheusPort != 0 && prometheusNamespace == "" { - elog.Fatalf("-prometheus-namespace must be set when Prometheus is used.") - } c := &Config{ FQDN: fqdn, diff --git a/metrics.go b/metrics.go index 4c76dc6..cd4c32e 100644 --- a/metrics.go +++ b/metrics.go @@ -3,6 +3,7 @@ package main import ( "fmt" "net/http" + "time" "github.com/go-chi/chi/v5/middleware" "github.com/prometheus/client_golang/prometheus" @@ -10,10 +11,10 @@ import ( ) const ( - reqPath = "http_req_path" - reqMethod = "http_req_method" - respStatus = "http_resp_status" - respErr = "http_resp_error" + reqPath = "path" + reqMethod = "method" + respStatus = "status" + respErr = "error" notAvailable = "n/a" ) @@ -31,14 +32,15 @@ var ( // metrics contains our Prometheus metrics. 
type metrics struct { - reqs *prometheus.CounterVec - proxiedReqs *prometheus.CounterVec - heartbeats *prometheus.CounterVec + reqs *prometheus.CounterVec + proxiedReqs *prometheus.CounterVec + heartbeats *prometheus.CounterVec + reqDurations *prometheus.HistogramVec } // newMetrics initializes our Prometheus metrics. func newMetrics(reg prometheus.Registerer, namespace string) *metrics { - elog.Printf("Initializing Prometheus metrics for %q.", namespace) + elog.Printf("Initializing Prometheus metrics. namespace = %q", namespace) m := &metrics{ reqs: prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -48,6 +50,14 @@ func newMetrics(reg prometheus.Registerer, namespace string) *metrics { }, []string{reqPath, reqMethod, respStatus, respErr}, ), + reqDurations: prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: namespace, + Name: "request_duration_seconds", + Help: "Duration of proxied HTTP requests", + }, + []string{reqPath, reqMethod, respStatus, respErr}, + ), proxiedReqs: prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, @@ -68,6 +78,7 @@ func newMetrics(reg prometheus.Registerer, namespace string) *metrics { reg.MustRegister(m.proxiedReqs) reg.MustRegister(m.reqs) reg.MustRegister(m.heartbeats) + reg.MustRegister(m.reqDurations) reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{ Namespace: namespace, @@ -109,14 +120,19 @@ func (m *metrics) checkRevProxyErr(w http.ResponseWriter, r *http.Request, err e // our Prometheus metrics. 
func (m *metrics) middleware(h http.Handler) http.Handler { f := func(w http.ResponseWriter, r *http.Request) { + startTime := time.Now() ww := middleware.NewWrapResponseWriter(w, r.ProtoMajor) h.ServeHTTP(ww, r) - m.reqs.With(prometheus.Labels{ - reqPath: r.URL.Path, - reqMethod: r.Method, - respStatus: fmt.Sprint(ww.Status()), - respErr: notAvailable, - }).Inc() + if ww.Status() != http.StatusNotFound { + labels := prometheus.Labels{ + reqPath: r.URL.Path, + reqMethod: r.Method, + respStatus: fmt.Sprint(ww.Status()), + respErr: notAvailable, + } + m.reqs.With(labels).Inc() + m.reqDurations.With(labels).Observe(time.Since(startTime).Seconds()) + } } return http.HandlerFunc(f) } From 42dfbcf0ee88c3466e1ec84ca668123d4f7e2f0a Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Mon, 29 Jul 2024 16:30:55 -0700 Subject: [PATCH 5/7] Set cert fingerprint in attestation document after worker sync --- enclave.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/enclave.go b/enclave.go index a216a21..0e3f983 100644 --- a/enclave.go +++ b/enclave.go @@ -472,6 +472,9 @@ func (e *Enclave) setupWorkerPostSync(keys *enclaveKeys) error { return err } e.httpsCert.set(&cert) + if err = e.setCertFingerprint(keys.NitridingCert); err != nil { + return err + } if !e.heartbeatActive { worker := getSyncURL(e.myHostname, e.cfg.ExtPrivPort) From 12da224934f60eacf0a18f40c1d2f4c82120d132 Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Mon, 16 Sep 2024 19:51:40 -0700 Subject: [PATCH 6/7] Ignore PCR3 during key sync attestation --- attestation.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/attestation.go b/attestation.go index ee632b1..317faad 100644 --- a/attestation.go +++ b/attestation.go @@ -70,10 +70,10 @@ func arePCRsIdentical(ourPCRs, theirPCRs map[uint][]byte) bool { } for pcr, ourValue := range ourPCRs { - // PCR4 contains a hash over the parent's instance ID. 
Our enclaves run - // on different parent instances; PCR4 will therefore always differ: + // PCR3 and PCR4 are hashes of the parent's IAM role and instance ID, respectively. + // Our enclaves run on different parent instances; PCR3 and PCR4 will therefore always differ: // https://docs.aws.amazon.com/enclaves/latest/user/set-up-attestation.html - if pcr == 4 { + if pcr == 3 || pcr == 4 { continue } theirValue, exists := theirPCRs[pcr] From 255fa70056b35b86ea493a0157d7a0b49a82579b Mon Sep 17 00:00:00 2001 From: Darnell Andries Date: Thu, 17 Oct 2024 21:14:59 -0700 Subject: [PATCH 7/7] Add nix flake --- flake.lock | 26 ++++++++++++++++++++++++++ flake.nix | 31 +++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..2dba23f --- /dev/null +++ b/flake.lock @@ -0,0 +1,26 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1728500571, + "narHash": "sha256-dOymOQ3AfNI4Z337yEwHGohrVQb4yPODCW9MDUyAc4w=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "d51c28603def282a24fa034bcb007e2bcb5b5dd0", + "type": "github" + }, + "original": { + "id": "nixpkgs", + "ref": "nixos-24.05", + "type": "indirect" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..b5bc1db --- /dev/null +++ b/flake.nix @@ -0,0 +1,31 @@ +{ + description = "Nitriding daemon"; + + inputs = { + nixpkgs.url = "nixpkgs/nixos-24.05"; + }; + + outputs = { self, nixpkgs }: + let + system = "x86_64-linux"; + + pkgs = import nixpkgs { inherit system; }; + + in { + packages.x86_64-linux.default = pkgs.buildGoModule { + pname = "nitriding-daemon"; + version = "1.4.2"; + src = builtins.filterSource + (path: type: + let relPath = pkgs.lib.removePrefix (toString ./. 
+ "/") path; + in pkgs.lib.hasSuffix ".go" relPath || + pkgs.lib.hasSuffix ".mod" relPath || + pkgs.lib.hasSuffix ".sum" relPath) + ./.; + vendorHash = "sha256-KKgDI8W2Xbpfr3lRuSYH4fdOjPFfQZdapg7m09pXm80="; + CGO_ENABLED = 0; + ldflags = ["-s" "-w"]; + checkFlags = ["-skip"]; + }; + }; +}