diff --git a/example/tk/tempo-microservices/main.jsonnet b/example/tk/tempo-microservices/main.jsonnet index 5f2f6820145..7bed639f037 100644 --- a/example/tk/tempo-microservices/main.jsonnet +++ b/example/tk/tempo-microservices/main.jsonnet @@ -16,6 +16,7 @@ minio + metrics + load + tempo { _config+:: { cluster: 'k3d', namespace: 'default', + block_builder_concurrent_rollout_enabled: true, compactor+: { }, querier+: { @@ -41,7 +42,7 @@ minio + metrics + load + tempo { pvc_storage_class: 'local-path', }, block_builder+:{ - replicas: 1, + replicas: 2, }, memcached+: { replicas: 1, diff --git a/operations/jsonnet/microservices/block-builder.libsonnet b/operations/jsonnet/microservices/block-builder.libsonnet index 926180fd2ab..2ce74a329cf 100644 --- a/operations/jsonnet/microservices/block-builder.libsonnet +++ b/operations/jsonnet/microservices/block-builder.libsonnet @@ -36,7 +36,7 @@ $.util.readinessProbe + (if $._config.variables_expansion then container.withArgsMixin(['-config.expand-env=true']) else {}), - tempo_block_builder_statefulset: + newBlockBuilderStatefulSet(concurrent_rollout_enabled=false, max_unavailable=1):: statefulset.new(target_name, $._config.block_builder.replicas, $.tempo_block_builder_container, [], { app: target_name }) + statefulset.mixin.spec.withServiceName(target_name) + statefulset.spec.template.spec.securityContext.withFsGroup(10001) + // 10001 is the UID of the tempo user @@ -47,7 +47,18 @@ volume.fromConfigMap(tempo_config_volume, $.tempo_block_builder_configmap.metadata.name), volume.fromConfigMap(tempo_overrides_config_volume, $._config.overrides_configmap_name), ]) + - statefulset.mixin.spec.withPodManagementPolicy('Parallel'), + statefulset.mixin.spec.withPodManagementPolicy('Parallel') + + ( + if !concurrent_rollout_enabled then {} else + statefulset.mixin.spec.selector.withMatchLabels({ name: 'block-builder', 'rollout-group': 'block-builder' }) + + statefulset.mixin.spec.updateStrategy.withType('OnDelete') + + 
statefulset.mixin.metadata.withLabelsMixin({ 'rollout-group': 'block-builder' }) + + statefulset.mixin.metadata.withAnnotationsMixin({ 'rollout-max-unavailable': std.toString(max_unavailable) }) + + statefulset.mixin.spec.template.metadata.withLabelsMixin({ 'rollout-group': 'block-builder' }) + ), + + tempo_block_builder_statefulset: + $.newBlockBuilderStatefulSet($._config.block_builder_concurrent_rollout_enabled, $._config.block_builder_max_unavailable), // Configmap diff --git a/operations/jsonnet/microservices/config.libsonnet b/operations/jsonnet/microservices/config.libsonnet index 7d0a018ce73..ce8ccc27cab 100644 --- a/operations/jsonnet/microservices/config.libsonnet +++ b/operations/jsonnet/microservices/config.libsonnet @@ -3,7 +3,7 @@ tempo: 'grafana/tempo:latest', tempo_query: 'grafana/tempo-query:latest', tempo_vulture: 'grafana/tempo-vulture:latest', - rollout_operator: 'grafana/rollout-operator:v0.1.1', + rollout_operator: 'grafana/rollout-operator:v0.23.0', memcached: 'memcached:1.6.32-alpine', memcachedExporter: 'prom/memcached-exporter:v0.14.3', }, @@ -18,6 +18,13 @@ node_selector: null, ingester_allow_multiple_replicas_on_same_node: false, + // Enable concurrent rollout of block-builder through the usage of the rollout operator. + // This feature changes the block-builder StatefulSet's label selector, which Kubernetes does not allow to be updated in place, so an existing StatefulSet has to be deleted and re-applied for the feature to take effect. + block_builder_concurrent_rollout_enabled: false, + // Maximum number of unavailable replicas during a block-builder rollout when using block_builder_concurrent_rollout_enabled feature. + // Computed from block-builder replicas by default, but can also be specified as percentage, for example "25%". 
+ block_builder_max_unavailable: $.tempo_block_builder_statefulset.spec.replicas, + // disable tempo-query by default tempo_query: { enabled: false, diff --git a/operations/jsonnet/microservices/multi-zone.libsonnet b/operations/jsonnet/microservices/multi-zone.libsonnet index 5270d0d872c..e926cdf03a4 100644 --- a/operations/jsonnet/microservices/multi-zone.libsonnet +++ b/operations/jsonnet/microservices/multi-zone.libsonnet @@ -133,70 +133,9 @@ // Keep it if multi-zone is disabled. if !$._config.multi_zone_ingester_enabled then super.ingester_pdb - // We don’t want Kubernetes to terminate any "ingester" StatefulSet's pod while migration is in progress. + // We don't want Kubernetes to terminate any "ingester" StatefulSet's pod while migration is in progress. else if $._config.multi_zone_ingester_migration_enabled then super.ingester_pdb + podDisruptionBudget.mixin.spec.withMaxUnavailable(0) // Remove it if multi-zone is enabled and no migration is in progress. else null, - - // - // Rollout operator. 
- // - - local rollout_operator_enabled = $._config.multi_zone_ingester_enabled, - - rollout_operator_args:: { - 'kubernetes.namespace': $._config.namespace, - }, - - rollout_operator_container:: - container.new('rollout-operator', $._images.rollout_operator) + - container.withArgsMixin($.util.mapToFlags($.rollout_operator_args)) + - container.withPorts([ - $.core.v1.containerPort.new('http-metrics', 8001), - ]) + - $.util.resourcesRequests('100m', '100Mi') + - $.util.resourcesLimits('1', '200Mi') + - container.mixin.readinessProbe.httpGet.withPath('/ready') + - container.mixin.readinessProbe.httpGet.withPort(8001) + - container.mixin.readinessProbe.withInitialDelaySeconds(5) + - container.mixin.readinessProbe.withTimeoutSeconds(1), - - rollout_operator_deployment: if !rollout_operator_enabled then null else - deployment.new('rollout-operator', 1, [$.rollout_operator_container]) + - deployment.mixin.metadata.withName('rollout-operator') + - deployment.mixin.spec.template.spec.withServiceAccountName('rollout-operator') + - // Ensure Kubernetes doesn't run 2 operators at the same time. 
- deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), - - rollout_operator_role: if !rollout_operator_enabled then null else - role.new('rollout-operator-role') + - role.mixin.metadata.withNamespace($._config.namespace) + - role.withRulesMixin([ - policyRule.withApiGroups('') + - policyRule.withResources(['pods']) + - policyRule.withVerbs(['list', 'get', 'watch', 'delete']), - policyRule.withApiGroups('apps') + - policyRule.withResources(['statefulsets']) + - policyRule.withVerbs(['list', 'get', 'watch']), - policyRule.withApiGroups('apps') + - policyRule.withResources(['statefulsets/status']) + - policyRule.withVerbs(['update']), - ]), - - rollout_operator_rolebinding: if !rollout_operator_enabled then null else - roleBinding.new('rollout-operator-rolebinding') + - roleBinding.mixin.metadata.withNamespace($._config.namespace) + - roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + - roleBinding.mixin.roleRef.withKind('Role') + - roleBinding.mixin.roleRef.withName('rollout-operator-role') + - roleBinding.withSubjectsMixin({ - kind: 'ServiceAccount', - name: 'rollout-operator', - namespace: $._config.namespace, - }), - - rollout_operator_service_account: if !rollout_operator_enabled then null else - serviceAccount.new('rollout-operator'), } diff --git a/operations/jsonnet/microservices/replica-template.libsonnet b/operations/jsonnet/microservices/replica-template.libsonnet new file mode 100644 index 00000000000..f7c285e1d7e --- /dev/null +++ b/operations/jsonnet/microservices/replica-template.libsonnet @@ -0,0 +1,32 @@ +{ + _config+: { + replica_template_custom_resource_definition_enabled: false, + }, + + replica_template:: std.parseYaml(importstr 'replica-templates.yaml'), + replica_template_custom_resource: if !$._config.replica_template_custom_resource_definition_enabled then null else $.replica_template, + + // replicaTemplate creates new ReplicaTemplate resource. 
+ // If replicas is > 0, spec.replicas field is specified in the resource, if replicas <= 0, spec.replicas field is hidden. + // Syntactically valid label selector is required, and may be used by HorizontalPodAutoscaler controller when ReplicaTemplate + // is used as scaled resource depending on metric target type. + // (When using targetType=AverageValue, label selector is not used for scaling computation). + replicaTemplate(name, replicas, label_selector):: { + apiVersion: 'rollout-operator.grafana.com/v1', + kind: 'ReplicaTemplate', + metadata: { + name: name, + namespace: $._config.namespace, + }, + spec: { + // HPA requires that label selector exists and is valid, but it will not be used for target type of AverageValue. + labelSelector: label_selector, + } + ( + if replicas <= 0 then { + replicas:: null, // Hide replicas field. + } else { + replicas: replicas, + } + ), + }, +} diff --git a/operations/jsonnet/microservices/replica-templates.yaml b/operations/jsonnet/microservices/replica-templates.yaml new file mode 100644 index 00000000000..0e0ceccbe1d --- /dev/null +++ b/operations/jsonnet/microservices/replica-templates.yaml @@ -0,0 +1,52 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + # name must be in the form PLURAL.GROUP, i.e. spec.names.plural followed by "." and spec.group 
+ name: replicatemplates.rollout-operator.grafana.com +spec: + group: rollout-operator.grafana.com + versions: + - name: v1 + served: true + storage: true + additionalPrinterColumns: + - description: Status replicas + jsonPath: .status.replicas + name: StatusReplicas + type: string + - description: Spec replicas + jsonPath: .spec.replicas + name: SpecReplicas + type: string + schema: + openAPIV3Schema: + type: object + properties: + spec: + type: object + properties: + replicas: + type: integer + default: 1 + minimum: 0 + labelSelector: + type: string + status: + type: object + properties: + replicas: + type: integer + subresources: + status: { } + scale: + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + labelSelectorPath: .spec.labelSelector + scope: Namespaced + names: + plural: replicatemplates + singular: replicatemplate + kind: ReplicaTemplate + categories: + # Include in "kubectl get all" output + - all diff --git a/operations/jsonnet/microservices/rollout-operator.libsonnet b/operations/jsonnet/microservices/rollout-operator.libsonnet new file mode 100644 index 00000000000..370886767c3 --- /dev/null +++ b/operations/jsonnet/microservices/rollout-operator.libsonnet @@ -0,0 +1,255 @@ +{ + local k = import 'ksonnet-util/kausal.libsonnet', + + local clusterRole = k.rbac.v1.clusterRole, + local clusterRoleBinding = k.rbac.v1.clusterRoleBinding, + local container = k.core.v1.container, + local deployment = k.apps.v1.deployment, + local mutatingWebhook = k.admissionregistration.v1.mutatingWebhook, + local mutatingWebhookConfiguration = k.admissionregistration.v1.mutatingWebhookConfiguration, + local policyRule = k.rbac.v1.policyRule, + local role = k.rbac.v1.role, + local roleBinding = k.rbac.v1.roleBinding, + local service = k.core.v1.service, + local serviceAccount = k.core.v1.serviceAccount, + local servicePort = k.core.v1.servicePort, + local validatingWebhook = k.admissionregistration.v1.validatingWebhook, + local 
validatingWebhookConfiguration = k.admissionregistration.v1.validatingWebhookConfiguration, + local podDisruptionBudget = k.policy.v1.podDisruptionBudget, + + local replica_template = (import 'replica-template.libsonnet').replica_template, + + _config+:: { + rollout_operator_enabled: $._config.multi_zone_ingester_enabled || $._config.block_builder_concurrent_rollout_enabled, + // Configure the rollout operator to accept webhook requests made as part of scaling + // statefulsets up or down. This allows the rollout operator to ensure that stateful + // components (ingesters, block-builders) are scaled up or down safely. + enable_rollout_operator_webhook: $._config.rollout_operator_enabled, + + // ignore_rollout_operator_*_webhook_failures switch the failurePolicy of the corresponding + // webhook from 'Fail' to 'Ignore'. Useful during a rollout to a new cell, where rollout-operator service + // is still not created, as the webhook might be created before the service, and that could + // block other operations that would block the service creation. + ignore_rollout_operator_no_downscale_webhook_failures: false, + ignore_rollout_operator_prepare_downscale_webhook_failures: false, + + // Ignore these labels used for controlling webhook behavior when creating services. 
+ service_ignored_labels+:: ['grafana.com/no-downscale', 'grafana.com/prepare-downscale'], + + rollout_operator_replica_template_access_enabled: true, + + // Automatically add groups based on enabled features + rollout_operator_enabled_groups+:: + (if $._config.block_builder_concurrent_rollout_enabled then ['block-builder'] else []) + + (if $._config.multi_zone_ingester_enabled then ['ingester'] else []), + + assert !$._config.rollout_operator_enabled || std.length($._config.rollout_operator_enabled_groups) > 0 : 'rollout_operator_enabled_groups must be set if rollout_operator_enabled is true', + }, + + rollout_operator_args:: { + 'kubernetes.namespace': $._config.namespace, + 'use-zone-tracker': true, + 'zone-tracker.config-map-name': 'rollout-operator-zone-tracker', + } + if $._config.enable_rollout_operator_webhook then { + '-server-tls.enabled': 'true', + } else {}, + + rollout_operator_container:: + container.new('rollout-operator', $._images.rollout_operator) + + container.withArgsMixin($.util.mapToFlags($.rollout_operator_args)) + + container.withPorts( + [$.core.v1.containerPort.new('http-metrics', 8001)] + + if $._config.enable_rollout_operator_webhook then + [$.core.v1.containerPort.new('https', 8443)] + else [] + ) + + $.util.resourcesRequests('100m', '100Mi') + + $.util.resourcesLimits(null, '200Mi') + + container.mixin.readinessProbe.httpGet.withPath('/ready') + + container.mixin.readinessProbe.httpGet.withPort(8001) + + container.mixin.readinessProbe.withInitialDelaySeconds(5) + + container.mixin.readinessProbe.withTimeoutSeconds(1), + + rollout_operator_deployment: if !$._config.rollout_operator_enabled then null else + deployment.new('rollout-operator', 1, [$.rollout_operator_container]) + + deployment.mixin.metadata.withName('rollout-operator') + + deployment.mixin.spec.template.spec.withServiceAccountName('rollout-operator') + + // Ensure Kubernetes doesn't run 2 operators at the same time + 
deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), + + rollout_operator_service: if !$._config.rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + service.new( + 'rollout-operator', + { name: 'rollout-operator' }, + servicePort.newNamed('https', 443, 8443) + + servicePort.withProtocol('TCP'), + ), + + rollout_operator_role: if !$._config.rollout_operator_enabled then null else + role.new('rollout-operator-role') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRulesMixin( + [ + policyRule.withApiGroups('') + + policyRule.withResources(['pods']) + + policyRule.withVerbs(['list', 'get', 'watch', 'delete']), + policyRule.withApiGroups('apps') + + policyRule.withResources(['statefulsets']) + + policyRule.withVerbs(['list', 'get', 'watch', 'patch']), + policyRule.withApiGroups('apps') + + policyRule.withResources(['statefulsets/status']) + + policyRule.withVerbs(['update']), + policyRule.withApiGroups('') + + policyRule.withResources(['configmaps']) + + policyRule.withVerbs(['get', 'update', 'create']), + ] + ( + if $._config.rollout_operator_replica_template_access_enabled then [ + policyRule.withApiGroups(replica_template.spec.group) + + policyRule.withResources(['%s/scale' % replica_template.spec.names.plural, '%s/status' % replica_template.spec.names.plural]) + + policyRule.withVerbs(['get', 'patch']), + ] else [] + ) + ), + + rollout_operator_rolebinding: if !$._config.rollout_operator_enabled then null else + roleBinding.new('rollout-operator-rolebinding') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withKind('Role') + + roleBinding.mixin.roleRef.withName('rollout-operator-role') + + roleBinding.withSubjectsMixin({ + kind: 'ServiceAccount', + name: 'rollout-operator', + namespace: $._config.namespace, + 
}), + + rollout_operator_webhook_cert_secret_role: if !$._config.rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + role.new('rollout-operator-webhook-cert-secret-role') + + role.mixin.metadata.withNamespace($._config.namespace) + + role.withRulesMixin([ + policyRule.withApiGroups('') + + policyRule.withResources(['secrets']) + + policyRule.withVerbs(['create']), + policyRule.withApiGroups('') + + policyRule.withResources(['secrets']) + + policyRule.withVerbs(['update', 'get']) + + policyRule.withResourceNames(['rollout-operator-self-signed-certificate']), + ]), + + rollout_operator_webhook_cert_secret_rolebinding: if !$._config.rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + roleBinding.new('rollout-operator-webhook-cert-secret-rolebinding') + + roleBinding.mixin.metadata.withNamespace($._config.namespace) + + roleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + roleBinding.mixin.roleRef.withKind('Role') + + roleBinding.mixin.roleRef.withName('rollout-operator-webhook-cert-secret-role') + + roleBinding.withSubjectsMixin({ + kind: 'ServiceAccount', + name: 'rollout-operator', + namespace: $._config.namespace, + }), + + rollout_operator_webhook_cert_update_clusterrole: if !$._config.rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + clusterRole.new('rollout-operator-%s-webhook-cert-update-role' % $._config.namespace) + + clusterRole.withRulesMixin([ + policyRule.withApiGroups('admissionregistration.k8s.io') + + policyRule.withResources(['validatingwebhookconfigurations', 'mutatingwebhookconfigurations']) + + policyRule.withVerbs(['list', 'patch']), + ]), + + rollout_operator_webhook_cert_update_clusterrolebinding: if !$._config.rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + clusterRoleBinding.new('rollout-operator-%s-webhook-cert-secret-rolebinding' % $._config.namespace) + + 
clusterRoleBinding.mixin.roleRef.withApiGroup('rbac.authorization.k8s.io') + + clusterRoleBinding.mixin.roleRef.withKind('ClusterRole') + + clusterRoleBinding.mixin.roleRef.withName('rollout-operator-%s-webhook-cert-update-role' % $._config.namespace) + + clusterRoleBinding.withSubjectsMixin({ + kind: 'ServiceAccount', + name: 'rollout-operator', + namespace: $._config.namespace, + }), + + no_downscale_webhook: if !$._config.rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + validatingWebhookConfiguration.new('no-downscale-%s' % $._config.namespace) + + validatingWebhookConfiguration.mixin.metadata.withLabels({ + 'grafana.com/namespace': $._config.namespace, + 'grafana.com/inject-rollout-operator-ca': 'true', + }) + + validatingWebhookConfiguration.withWebhooksMixin([ + validatingWebhook.withName('no-downscale-%s.grafana.com' % $._config.namespace) + + validatingWebhook.withAdmissionReviewVersions(['v1']) + + validatingWebhook.withFailurePolicy(if $._config.ignore_rollout_operator_no_downscale_webhook_failures then 'Ignore' else 'Fail') + + validatingWebhook.withMatchPolicy('Equivalent') + + validatingWebhook.withSideEffects('None') + + validatingWebhook.withTimeoutSeconds(10) + + validatingWebhook.withRulesMixin([ + { + apiGroups: ['apps'], + apiVersions: ['v1'], + operations: ['UPDATE'], + resources: ['statefulsets', 'statefulsets/scale'], + scope: 'Namespaced', + }, + ]) + + validatingWebhook.objectSelector.withMatchExpressions([ + { + key: 'rollout-group', + operator: 'In', + values: $._config.rollout_operator_enabled_groups, + }, + ]) + + validatingWebhook.namespaceSelector.withMatchLabels({ + 'kubernetes.io/metadata.name': $._config.namespace, + }) + + validatingWebhook.clientConfig.service.withName('rollout-operator') + + validatingWebhook.clientConfig.service.withNamespace($._config.namespace) + + validatingWebhook.clientConfig.service.withPath('/admission/no-downscale') + + 
validatingWebhook.clientConfig.service.withPort(443), + ]), + + prepare_downscale_webhook: if !$._config.rollout_operator_enabled || !$._config.enable_rollout_operator_webhook then null else + mutatingWebhookConfiguration.new('prepare-downscale-%s' % $._config.namespace) + + mutatingWebhookConfiguration.mixin.metadata.withLabels({ + 'grafana.com/namespace': $._config.namespace, + 'grafana.com/inject-rollout-operator-ca': 'true', + }) + + mutatingWebhookConfiguration.withWebhooksMixin([ + mutatingWebhook.withName('prepare-downscale-%s.grafana.com' % $._config.namespace) + + mutatingWebhook.withAdmissionReviewVersions(['v1']) + + mutatingWebhook.withFailurePolicy(if $._config.ignore_rollout_operator_prepare_downscale_webhook_failures then 'Ignore' else 'Fail') + + mutatingWebhook.withMatchPolicy('Equivalent') + + mutatingWebhook.withSideEffects('NoneOnDryRun') + + mutatingWebhook.withTimeoutSeconds(10) + + mutatingWebhook.withRulesMixin([ + { + apiGroups: ['apps'], + apiVersions: ['v1'], + operations: ['UPDATE'], + resources: ['statefulsets', 'statefulsets/scale'], + scope: 'Namespaced', + }, + ]) + + mutatingWebhook.objectSelector.withMatchExpressions([ + { + key: 'rollout-group', + operator: 'In', + values: $._config.rollout_operator_enabled_groups, + }, + ]) + + mutatingWebhook.namespaceSelector.withMatchLabels({ + 'kubernetes.io/metadata.name': $._config.namespace, + }) + + mutatingWebhook.clientConfig.service.withName('rollout-operator') + + mutatingWebhook.clientConfig.service.withNamespace($._config.namespace) + + mutatingWebhook.clientConfig.service.withPath('/admission/prepare-downscale') + + mutatingWebhook.clientConfig.service.withPort(443), + ]), + + rollout_operator_service_account: if !$._config.rollout_operator_enabled then null else + serviceAccount.new('rollout-operator'), + + rollout_operator_pdb: if !$._config.rollout_operator_enabled then null else + podDisruptionBudget.new('rollout-operator') + + podDisruptionBudget.mixin.metadata.withLabels({ 
name: 'rollout-operator' }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'rollout-operator' }) + + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), +} diff --git a/operations/jsonnet/microservices/tempo.libsonnet b/operations/jsonnet/microservices/tempo.libsonnet index aa7c0782a61..b61aee2b712 100644 --- a/operations/jsonnet/microservices/tempo.libsonnet +++ b/operations/jsonnet/microservices/tempo.libsonnet @@ -12,6 +12,7 @@ (import 'memcached.libsonnet') + (import 'multi-zone.libsonnet') + (import 'memberlist.libsonnet') + +(import 'rollout-operator.libsonnet') + { local k = import 'ksonnet-util/kausal.libsonnet',