Skip to content

Commit

Permalink
Small refactor to pod credentials reconciler to also support auto res…
Browse files Browse the repository at this point in the history
…tart for pods
  • Loading branch information
evyatarmeged committed Jun 23, 2024
1 parent 4d046c6 commit 953130a
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 22 deletions.
3 changes: 3 additions & 0 deletions src/operator/controllers/metadata/annotations.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ const (
// UserAndPasswordSecretNameAnnotation is the name of the secret in which the user and password are stored
UserAndPasswordSecretNameAnnotation = "credentials-operator.otterize.com/user-password-secret-name"

// RestartOnSecretRotation signals the
RestartOnSecretRotation = "credentials-operator.otterize.com/restart-on-secret-rotation"

// TLSSecretNameAnnotation is the name of the K8s secret in which the certificate data is stored
TLSSecretNameAnnotation = "credentials-operator.otterize.com/tls-secret-name"
TLSSecretNameAnnotationDeprecated = "spire-integration.otterize.com/tls-secret-name"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/samber/lo"
"github.com/sirupsen/logrus"
"github.com/spf13/viper"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -23,16 +24,19 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"strings"
"time"
)

const (
ReasonEnsuredPodUserAndPassword = "EnsuredPodUserAndPassword"
ReasonGeneratePodDatabaseUserFailed = "GeneratePodDatabaseUserFailed"
ReasonEnsuringPodUserAndPasswordFailed = "EnsuringPodUserAndPasswordFailed"
ReasonEnsuringDatabasePasswordFailed = "EnsuringDatabasePasswordFailed"
ReasonRotatingSecretFailed = "RotatingSecretFailed"
ReasonEnsuredPodUserAndPassword = "EnsuredPodUserAndPassword"
ReasonGeneratePodDatabaseUserFailed = "GeneratePodDatabaseUserFailed"
ReasonEnsuringPodUserAndPasswordFailed = "EnsuringPodUserAndPasswordFailed"
ReasonEnsuringDatabasePasswordFailed = "EnsuringDatabasePasswordFailed"
ReasonRotatingSecretFailed = "RotatingSecretFailed"
ReasonRestartingPodAfterSecretRotationFailed = "RestartingPodAfterSecretRotationFailed"
)

const (
Expand All @@ -58,6 +62,7 @@ func NewReconciler(client client.Client, scheme *runtime.Scheme, eventRecorder r
func (e *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
WithOptions(controller.Options{RecoverPanic: lo.ToPtr(true)}).
Watches(&v1.Secret{}, handler.EnqueueRequestsFromMapFunc(e.watchSecretDeletion)).
For(&v1.Pod{}).
Complete(e)
}
Expand All @@ -66,6 +71,11 @@ func (e *Reconciler) shouldHandleCredentialsForPod(pod v1.Pod) bool {
return pod.Annotations != nil && hasUserAndPasswordSecretAnnotation(pod)
}

func hasRestartAnnotation(pod v1.Pod) bool {
_, ok := pod.Annotations[metadata.RestartOnSecretRotation]
return ok
}

func hasUserAndPasswordSecretAnnotation(pod v1.Pod) bool {
_, ok := pod.Annotations[metadata.UserAndPasswordSecretNameAnnotation]
return ok
Expand Down Expand Up @@ -109,21 +119,45 @@ func (e *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
return ctrl.Result{}, errors.Wrap(err)
}

secretName := pod.Annotations[metadata.UserAndPasswordSecretNameAnnotation]
logrus.Debug("Ensuring user-password credentials secrets for pod")
password, err := e.ensurePodUserAndPasswordSecret(ctx, &pod, pod.Annotations[metadata.UserAndPasswordSecretNameAnnotation], username)
result, createdSecret, err := e.ensurePodUserAndPasswordSecret(ctx, &pod, secretName, username)
if err != nil {
e.recorder.Eventf(&pod, v1.EventTypeWarning, ReasonEnsuringPodUserAndPasswordFailed, "Failed to ensure user-password credentials secret: %s", err.Error())
return ctrl.Result{}, errors.Wrap(err)
}

if hasDatabaseAccessAnnotation(pod) {
logrus.Debug("Validating password in all databases")
err = e.ensurePasswordInDatabases(ctx, pod, username, password)
if err != nil {
if result.Requeue {
return result, nil
}

if createdSecret {
secret := v1.Secret{}
if err := e.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: secretName}, &secret); err != nil {
return ctrl.Result{}, errors.Wrap(err)
}
password := string(secret.Data["password"])

if hasDatabaseAccessAnnotation(pod) {
logrus.Debug("Validating password in all databases")
err = e.ensurePasswordInDatabases(ctx, pod, username, password)
if err != nil {
return ctrl.Result{}, errors.Wrap(err)
}
}

if hasRestartAnnotation(pod) {
logrus.Info("Triggering pod restart for newly created secret")
err := e.TriggerPodRestart(ctx, &pod)
if err != nil {
e.recorder.Eventf(&pod, v1.EventTypeWarning,
ReasonRestartingPodAfterSecretRotationFailed, "Failed restarting pod after secret creation: %s", err.Error())
}
}

e.recorder.Event(&pod, v1.EventTypeNormal, ReasonEnsuredPodUserAndPassword, "Ensured user-password credentials in specified secret")
}
e.recorder.Event(&pod, v1.EventTypeNormal, ReasonEnsuredPodUserAndPassword, "Ensured user-password credentials in specified secret")

return ctrl.Result{}, nil
}

Expand All @@ -142,30 +176,38 @@ func (e *Reconciler) generateServiceDatabaseUsername(ctx context.Context, pod *v
return clusterutils.KubernetesToPostgresName(username), nil
}

func (e *Reconciler) ensurePodUserAndPasswordSecret(ctx context.Context, pod *v1.Pod, secretName string, username string) (string, error) {
func (e *Reconciler) ensurePodUserAndPasswordSecret(ctx context.Context, pod *v1.Pod, secretName string, username string) (ctrl.Result, bool, error) {
log := logrus.WithFields(logrus.Fields{"pod": pod.Name, "namespace": pod.Namespace})
secret := v1.Secret{}
err := e.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: secretName}, &secret)

if apierrors.IsNotFound(err) {
log.Debug("Creating user-password credentials secret for pod")
password, err := databaseconfigurator.GenerateRandomPassword()
if err != nil {
return "", errors.Wrap(err)
return ctrl.Result{}, false, errors.Wrap(err)
}

secret := buildUserAndPasswordCredentialsSecret(secretName, pod.Namespace, username, password)
log.WithField("secret", secretName).Debug("Creating new secret with user-password credentials")
if err := e.client.Create(ctx, secret); err != nil {
return "", errors.Wrap(err)
return ctrl.Result{}, false, errors.Wrap(err)
}
return password, nil
return ctrl.Result{}, true, nil
}

if err != nil {
return "", errors.Wrap(err)
return ctrl.Result{}, false, errors.Wrap(err)
}

// If secret exists but is being deleted we need to requeue until it is deleted
if !secret.DeletionTimestamp.IsZero() {
logrus.Info("Secret is being deleted, retriggering reconcile")
return ctrl.Result{Requeue: true}, false, nil
}

log.Debug("Secret exists, nothing to do")
return string(secret.Data["password"]), nil
return ctrl.Result{}, false, nil
}

func (e *Reconciler) ensurePasswordInDatabases(ctx context.Context, pod v1.Pod, username string, password string) error {
Expand Down Expand Up @@ -233,8 +275,11 @@ func (e *Reconciler) RotateSecretsAndAlterPasswords(ctx context.Context) error {
logrus.Infof("Rotated secret: %s.%s", secret.Name, secret.Namespace)
}

err := e.runAlterPasswordForSecrets(ctx, rotatedSecrets)
if err != nil {
if err := e.runAlterPasswordForSecrets(ctx, rotatedSecrets); err != nil {
return errors.Wrap(err)
}

if err := e.handlePodRestartsForRotatedSecrets(ctx, rotatedSecrets); err != nil {
return errors.Wrap(err)
}

Expand Down Expand Up @@ -350,6 +395,7 @@ func (e *Reconciler) ensurePasswordInDatabaseInstance(

return true, nil
}

func closeAllConnections(ctx context.Context, allConfigurators []databaseconfigurator.DatabaseConfigurator) {
for _, dbConfigurator := range allConfigurators {
dbConfigurator.CloseConnection(ctx)
Expand All @@ -366,6 +412,7 @@ func (e *Reconciler) GetAllDBConfigurators(ctx context.Context, mysqlServerConfi
}
configurators = append(configurators, dbconfigurator)
}

for _, pgServerConfig := range pgServerConfigs {
dbconfigurator, err := postgres.NewPostgresConfigurator(ctx, pgServerConfig.Spec)
if err != nil {
Expand All @@ -377,6 +424,137 @@ func (e *Reconciler) GetAllDBConfigurators(ctx context.Context, mysqlServerConfi
return configurators
}

func (e *Reconciler) TriggerPodRestart(ctx context.Context, pod *v1.Pod) error {
owner, err := e.serviceIdResolver.GetOwnerObject(ctx, pod)
if err != nil {
return errors.Wrap(err)
}
kind := owner.GetObjectKind().GroupVersionKind().Kind
logrus.Infof("Trying to trigger restart for workload '%s' of kind %s", owner.GetName(), kind)
switch kind {
case "Deployment":
deployment := appsv1.Deployment{}
if err := e.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: owner.GetName()}, &deployment); err != nil {
return errors.Wrap(err)
}
deployment.Spec.Template.Annotations[metadata.TLSRestartTimeAfterRenewal] = time.Now().Format(time.RFC3339)
if err := e.client.Update(ctx, &deployment); err != nil {
return errors.Wrap(err)
}
case "ReplicaSet":
replicaSet := appsv1.ReplicaSet{}
if err := e.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: owner.GetName()}, &replicaSet); err != nil {
return errors.Wrap(err)
}
replicaSet.Spec.Template.Annotations[metadata.TLSRestartTimeAfterRenewal] = time.Now().Format(time.RFC3339)
if err := e.client.Update(ctx, &replicaSet); err != nil {
return errors.Wrap(err)
}
case "StatefulSet":
statefulSet := appsv1.StatefulSet{}
if err := e.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: owner.GetName()}, &statefulSet); err != nil {
return errors.Wrap(err)
}
statefulSet.Spec.Template.Annotations[metadata.TLSRestartTimeAfterRenewal] = time.Now().Format(time.RFC3339)
if err := e.client.Update(ctx, &statefulSet); err != nil {
return errors.Wrap(err)
}
case "DaemonSet":
daemonSet := appsv1.DaemonSet{}
if err := e.client.Get(ctx, types.NamespacedName{Namespace: pod.Namespace, Name: owner.GetName()}, &daemonSet); err != nil {
return errors.Wrap(err)
}
daemonSet.Spec.Template.Annotations[metadata.TLSRestartTimeAfterRenewal] = time.Now().Format(time.RFC3339)
if err := e.client.Update(ctx, &daemonSet); err != nil {
return errors.Wrap(err)
}
default:
// Rougher way - trigger delete on all pods in the namespace with the same labels - will restart just the owner's pods
err := e.client.DeleteAllOf(ctx, &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace},
}, client.MatchingLabels(pod.Labels))
if err != nil {
return errors.Wrap(err)
}
}

return nil
}

func (e *Reconciler) handlePodRestartsForRotatedSecrets(ctx context.Context, rotatedSecrets []v1.Secret) error {
for _, secret := range rotatedSecrets {
podList := &v1.PodList{}
if err := e.client.List(ctx, podList, &client.ListOptions{Namespace: secret.Namespace}); err != nil {
return errors.Wrap(err)
}
for _, pod := range podList.Items {
secretName, ok := pod.Annotations[metadata.UserAndPasswordSecretNameAnnotation]
if ok && secretName == secret.Name {
if hasRestartAnnotation(pod) {
logrus.Info("Triggering pod restart after secret rotation")
err := e.TriggerPodRestart(ctx, &pod)
if err != nil {
e.recorder.Eventf(&pod, v1.EventTypeWarning,
ReasonRestartingPodAfterSecretRotationFailed, "Failed restarting pod after secret rotation: %s", err.Error())
}
// A single restart trigger should handle all pod replicas, we can break
// If we failed, we continue to try and restart pods for other rotated secrets
break
}
}
}
}

return nil
}

func (e *Reconciler) watchSecretDeletion(ctx context.Context, object client.Object) []reconcile.Request {
namespace := v1.Namespace{}
err := e.client.Get(ctx, types.NamespacedName{Name: object.GetNamespace()}, &namespace)
if err != nil {
logrus.WithError(err).Error("Failed to get namespace")
return nil
}
if !namespace.DeletionTimestamp.IsZero() {
// Skip deleting namespaces
return nil
}

secret := v1.Secret{}
err = e.client.Get(ctx, types.NamespacedName{Namespace: object.GetNamespace(), Name: object.GetName()}, &secret)
if err != nil && !apierrors.IsNotFound(err) {
logrus.WithError(err).Error("Failed to get secret")
return nil
}

// Only call reconcile if the secret is being deleted or is not found anymore
if !secret.DeletionTimestamp.IsZero() || apierrors.IsNotFound(err) {
podList := v1.PodList{}
if err := e.client.List(ctx, &podList, &client.ListOptions{Namespace: object.GetNamespace()}); err != nil {
logrus.WithError(err).Error("Failed to list pods")
return nil
}

if len(podList.Items) == 0 {
return nil
}

result := lo.Filter(podList.Items, func(pod v1.Pod, _ int) bool {
secretName, ok := pod.Annotations[metadata.UserAndPasswordSecretNameAnnotation]
return ok && secretName == object.GetName()
})

if len(result) > 0 {
// Even if more than 1 pods match the criteria, we just need to enqueue reconciliation for one
pod := result[0]
logrus.Infof("Enqueueing pod reconciliation: %s for deleted secret: %s", pod.Name, object.GetName())
return []reconcile.Request{{NamespacedName: types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}}}
}
}

return nil
}

func shouldRotateSecret(secret v1.Secret) bool {
if secret.Annotations == nil {
return true
Expand Down
2 changes: 1 addition & 1 deletion src/operator/go.mod

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions src/operator/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 953130a

Please sign in to comment.