Skip to content

Commit

Permalink
Exponential backoff after a failed sync
Browse files Browse the repository at this point in the history
  • Loading branch information
robert lestak committed May 8, 2024
1 parent f214a87 commit fdfad9f
Show file tree
Hide file tree
Showing 6 changed files with 145 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM golang:1.21 as builder
FROM golang:1.22.0 as builder

WORKDIR /app

Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Enable Kubernetes `cert-manager` to sync TLS certificates to AWS ACM, GCP, Hashi
- [Heroku](#heroku)
- [Incapsula](#incapsula)
- [ThreatX](#threatx)
- [Exponential backoff after a failed sync](#exponential-backoff-after-a-failed-sync)
- [Configuration](#configuration)
- [Deployment](#deployment)

Expand Down Expand Up @@ -227,6 +228,16 @@ Annotations:
cert-manager-sync.lestak.sh/threatx-secret-name: "example-threatx-api-secret" # secret in same namespace which contains the threatx api key. If provided in format "namespace/secret-name", will look in that namespace for the secret
```

## Exponential backoff after a failed sync

Previously, a failed sync was retried every `60s`, which — especially in larger cert-manager installations — could cause rate limits to be hit and overwhelm external services. Failed attempts are now retried with a binary exponential backoff starting with `60s`, then `120s`, `240s`, and so on, up to a maximum of `32h`. As part of the new backoff behavior, a new `cert-manager-sync.lestak.sh/failed-sync-attempts` annotation is written to the synced Secret to track the current number of failed sync attempts.

By default, the operator will continue to retry indefinitely until the sync is successful, or the sync annotation is removed. If you would like to limit the number of retries, you can set the `cert-manager-sync.lestak.sh/max-sync-attempts` annotation to the number of retries you would like to allow.

```yaml
cert-manager-sync.lestak.sh/max-sync-attempts: "5" # limit the number of retries to 5, after which you will need to manually resolve the underlying issue and reset/remove the failed-sync-attempts annotation
```

## Configuration

The operator uses Kubernetes annotations to define the sync locations and configurations.
Expand Down Expand Up @@ -278,6 +289,8 @@ metadata:
cert-manager-sync.lestak.sh/vault-role: "role-name" # HashiCorp Vault role name
cert-manager-sync.lestak.sh/vault-auth-method: "auth-method" # HashiCorp Vault auth method name
cert-manager-sync.lestak.sh/vault-path: "kv-name/path/to/secret" # HashiCorp Vault path to store cert
cert-manager-sync.lestak.sh/max-sync-attempts: "5" # limit the number of retries to 5, after which you will need to manually resolve the underlying issue and reset/remove the failed-sync-attempts annotation
cert-manager-sync.lestak.sh/failed-sync-attempts: "0" # number of failed sync attempts, will be auto-filled by operator
data:
tls.crt: ""
tls.key: ""
Expand Down
24 changes: 23 additions & 1 deletion cmd/cert-manager-sync/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ func main() {
)
l.Info("starting cert-manager-sync")
// main loop
origLoopDelay := time.Duration(time.Second * 60)
loopDelay := origLoopDelay
maxLoopDelay := time.Duration(time.Hour * 32)
for {
l.Debug("main loop")
// if namespace is not specified all namespaces will be searched
Expand All @@ -70,12 +73,31 @@ func main() {
jobs <- s
}
close(jobs)
errCount := 0
for a := 1; a <= len(secrets); a++ {
err := <-results
if err != nil {
l.Error(err)
errCount++
}
}
time.Sleep(time.Second * 20)
if errCount > 0 {
// double the loop delay if there are errors
loopDelay = loopDelay * 2
if loopDelay > maxLoopDelay {
l.Warnf("max loop delay reached")
loopDelay = maxLoopDelay
}
l.WithFields(log.Fields{
"errCount": errCount,
"loopDelay": loopDelay,
"nextRun": time.Now().Add(loopDelay).Format(time.RFC3339),
}).Error("sync errors occurred")
} else {
// reset the loop delay if there are no errors
loopDelay = origLoopDelay
l.Debug("sync successful")
}
time.Sleep(loopDelay)
}
}
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/robertlestak/cert-manager-sync

go 1.18
go 1.22.0

require (
cloud.google.com/go/certificatemanager v1.6.0
Expand All @@ -12,6 +12,7 @@ require (
github.com/heroku/heroku-go/v5 v5.5.0
github.com/sirupsen/logrus v1.8.1
golang.org/x/oauth2 v0.5.0
google.golang.org/api v0.110.0
google.golang.org/genproto v0.0.0-20230209215440-0dfe4f8abfcc
k8s.io/api v0.24.0
k8s.io/apimachinery v0.24.0
Expand Down Expand Up @@ -93,7 +94,6 @@ require (
golang.org/x/term v0.15.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.5.0 // indirect
google.golang.org/api v0.110.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/grpc v1.53.0 // indirect
google.golang.org/protobuf v1.28.1 // indirect
Expand Down
5 changes: 5 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORN
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
Expand Down Expand Up @@ -426,6 +427,7 @@ github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+Gx
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.8.1 h1:geMPLpDpQOgVyCg5z5GoRwLHepNdb71NXb67XFkP+Eg=
github.com/rogpeppe/go-internal v1.8.1/go.mod h1:JeRgkft04UBgHMgCIwADu4Pn6Mtm5d4nPKWu0nJ5d+o=
github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
github.com/ryanuber/go-glob v1.0.0 h1:iQh3xXAumdQ+4Ufa5b25cRpC5TYKlno6hsv6Cb3pkBk=
github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc=
Expand All @@ -452,6 +454,7 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
Expand Down Expand Up @@ -719,6 +722,7 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
Expand Down Expand Up @@ -838,6 +842,7 @@ gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
Expand Down
105 changes: 101 additions & 4 deletions pkg/certmanagersync/certmanagersync.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package certmanagersync

import (
"context"
"errors"
"fmt"
"strconv"

"github.com/robertlestak/cert-manager-sync/pkg/state"
"github.com/robertlestak/cert-manager-sync/stores/acm"
Expand All @@ -16,6 +18,7 @@ import (
"github.com/robertlestak/cert-manager-sync/stores/vault"
log "github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type StoreType string
Expand All @@ -42,7 +45,6 @@ func NewStore(storeType StoreType) (RemoteStore, error) {
})
l.Debugf("NewStore %s", storeType)
var store RemoteStore
var err error
switch storeType {
case ACMStoreType:
store = &acm.ACMStore{}
Expand All @@ -65,11 +67,90 @@ func NewStore(storeType StoreType) (RemoteStore, error) {
default:
return nil, errors.New("invalid store type")
}
return store, nil
}

// maxRetries returns the maximum number of sync attempts allowed for a secret,
// as configured by its max-sync-attempts annotation.
//
// It returns -1, meaning unlimited retries, when the annotation is absent or
// empty, when its value cannot be parsed as an int, or when the value is
// negative.
func maxRetries(s *corev1.Secret) int {
	l := log.WithFields(log.Fields{
		"action": "maxRetries",
	})
	l.Debugf("maxRetries %s", s.Name)
	raw := s.Annotations[state.OperatorName+"/max-sync-attempts"]
	if raw == "" {
		return -1
	}
	// bitSize 0 parses into the range of int, so an oversized value is reported
	// as an error instead of being silently truncated by the conversion below.
	iv, err := strconv.ParseInt(raw, 10, 0)
	if err != nil {
		l.WithError(err).Errorf("ParseInt error")
		return -1
	}
	if iv < 0 {
		// -1 is the only "unlimited" sentinel; any other negative limit would
		// compare as already exhausted against a non-negative attempt count and
		// block syncing forever, so normalise every negative value to -1.
		return -1
	}
	return int(iv)
}

// consumedRetries reports how many failed sync attempts are recorded in the
// secret's failed-sync-attempts annotation.
// A missing, empty, or unparsable annotation counts as 0 attempts.
func consumedRetries(s *corev1.Secret) int {
	logger := log.WithFields(log.Fields{"action": "consumedRetries"})
	logger.Debugf("consumedRetries %s", s.Name)

	recorded := s.Annotations[state.OperatorName+"/failed-sync-attempts"]
	if recorded == "" {
		// nothing recorded yet
		return 0
	}
	attempts, parseErr := strconv.ParseInt(recorded, 10, 64)
	if parseErr != nil {
		logger.WithError(parseErr).Errorf("ParseInt error")
		return 0
	}
	return int(attempts)
}

// incrementRetries re-fetches the secret secretNamespace/secretName from the
// Kubernetes API, sets its failed-sync-attempts annotation to the current
// consumedRetries value plus one, and writes the secret back with Update.
// It returns the error from the Get or Update call, or nil on success.
//
// NOTE(review): this listing looks like a diff rendered without +/- markers.
// The `vault.NewStore error`, `return nil, err` and `return store, nil` lines
// do not fit this function's single error result and are presumably removed
// lines from the previous NewStore body — confirm against the repository.
func incrementRetries(secretNamespace, secretName string) error {
l := log.WithFields(log.Fields{
"action": "incrementRetries",
})
l.Debugf("incrementRetries %s/%s", secretNamespace, secretName)
// get the secret from k8s, since we don't know if data has been changed by a store
gopt := metav1.GetOptions{}
secret, err := state.KubeClient.CoreV1().Secrets(secretNamespace).Get(context.Background(), secretName, gopt)
if err != nil {
l.WithError(err).Errorf("vault.NewStore error")
return nil, err
l.WithError(err).Errorf("Get error")
return err
}
return store, nil
// increment the failed-sync-attempts annotation
iv := consumedRetries(secret) + 1
// NOTE(review): assigning into secret.Annotations panics if the map is nil;
// presumably a secret reaching this point always carries annotations — verify.
secret.Annotations[state.OperatorName+"/failed-sync-attempts"] = strconv.Itoa(iv)
_, err = state.KubeClient.CoreV1().Secrets(secretNamespace).Update(context.Background(), secret, metav1.UpdateOptions{})
if err != nil {
l.WithError(err).Errorf("Update error")
return err
}
return nil
}

// resetRetries clears the failed-sync-attempts annotation from the secret
// secretNamespace/secretName.
//
// The secret is re-fetched from the Kubernetes API first, since a store may
// have changed it during the sync. If the annotation is not present, the
// function returns without issuing an Update, so a secret with nothing to
// reset does not cause a write to the API server.
// It returns the error from the Get or Update call, or nil on success.
func resetRetries(secretNamespace, secretName string) error {
	l := log.WithFields(log.Fields{
		"action": "resetRetries",
	})
	l.Debugf("resetRetries %s/%s", secretNamespace, secretName)
	// get the secret from k8s, since we don't know if data has been changed by a store
	secrets := state.KubeClient.CoreV1().Secrets(secretNamespace)
	secret, err := secrets.Get(context.Background(), secretName, metav1.GetOptions{})
	if err != nil {
		l.WithError(err).Errorf("Get error")
		return err
	}
	key := state.OperatorName + "/failed-sync-attempts"
	if _, ok := secret.Annotations[key]; !ok {
		// nothing to reset; skip the no-op Update round trip
		return nil
	}
	// remove the failed-sync-attempts annotation
	delete(secret.Annotations, key)
	if _, err := secrets.Update(context.Background(), secret, metav1.UpdateOptions{}); err != nil {
		l.WithError(err).Errorf("Update error")
		return err
	}
	return nil
}

func EnabledStores(s *corev1.Secret) []StoreType {
Expand Down Expand Up @@ -152,6 +233,13 @@ func HandleSecret(s *corev1.Secret) error {
"action": "HandleSecret",
})
l.Debugf("HandleSecret %s", s.Name)
// ensure we haven't exceeded the allotted retries
maxR := maxRetries(s)
consumedR := consumedRetries(s)
if maxR != -1 && consumedR >= maxR {
l.Errorf("max retries reached")
return nil
}
// get the list of stores enabled for this secret
stores := EnabledStores(s)
if len(stores) == 0 {
Expand Down Expand Up @@ -182,7 +270,16 @@ func HandleSecret(s *corev1.Secret) error {
}
}
if len(errs) > 0 {
// increment the failed-sync-attempts annotation
if err := incrementRetries(s.Namespace, s.Name); err != nil {
l.WithError(err).Errorf("incrementRetries error")
}
return fmt.Errorf("errors syncing secret %s/%s to stores: %v", s.Namespace, s.Name, errs)
} else {
// reset the failed-sync-attempts annotation
if err := resetRetries(s.Namespace, s.Name); err != nil {
l.WithError(err).Errorf("resetRetries error")
}
}
// if the sync was a success, add the secret to the cache
state.AddToCache(s)
Expand Down

0 comments on commit fdfad9f

Please sign in to comment.