Skip to content

Commit

Permalink
Address PR comments
Browse files Browse the repository at this point in the history
Have PRS remove hosts taking backups from consideration; and ERS only
consider them if there are no other valid candidates that are not taking
backups.

Signed-off-by: Eduardo J. Ortega U <5791035+ejortegau@users.noreply.github.com>
  • Loading branch information
ejortegau committed Nov 8, 2024
1 parent 86f0182 commit 61ad1b6
Show file tree
Hide file tree
Showing 8 changed files with 977 additions and 1,002 deletions.
13 changes: 9 additions & 4 deletions changelog/22.0/22.0.0/summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@ These are the RPC changes made in this release -

### <a id="reparents-prefer-not-backing-up"/>Prefer not promoting a replica that is currently taking a backup

Both planned and emergency failovers now prefer promoting replicas that are not
currently taking backups. Note that if there's only one suitable replica to promote, and it is taking a backup, it
will still be promoted. This does not apply to `builtin` backups. A replica that is currently taking a `builtin` backup
will never be promoted.
Emergency reparents now prefer not promoting replicas that are currently taking backups with a backup engine other than
`builtin`. Note that if there's only one suitable replica to promote, and it is taking a backup, it will still be
promoted.

For planned reparents, hosts taking backups with a backup engine other than `builtin` are filtered out of the list of
valid candidates. This means they will never get promoted - not even if there's no other candidates.

Note that behavior for `builtin` backups remains unchanged: a replica that is currently taking a `builtin` backup will
never be promoted, neither by planned nor by emergency reparents.
1,610 changes: 815 additions & 795 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata.pb.go

Large diffs are not rendered by default.

32 changes: 22 additions & 10 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
// Here we also check for split brain scenarios and check that the selected replica must be more advanced than all the other valid candidates.
// We fail in case there is a split brain detected.
// The validCandidateTablets list is sorted by the replication positions with ties broken by promotion rules.
intermediateSource, validCandidateTablets, err = erp.findMostAdvanced(validCandidates, tabletMap, stoppedReplicationSnapshot.tabletsBackupState, opts)
intermediateSource, validCandidateTablets, err = erp.findMostAdvanced(validCandidates, tabletMap, opts)
if err != nil {
return err
}
Expand All @@ -258,7 +258,7 @@ func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *eve
// 2. Remove the tablets with the Must_not promote rule
// 3. Remove cross-cell tablets if PreventCrossCellPromotion is specified
// Our final primary candidate MUST belong to this list of valid candidates
validCandidateTablets, err = erp.filterValidCandidates(validCandidateTablets, stoppedReplicationSnapshot.reachableTablets, prevPrimary, opts)
validCandidateTablets, err = erp.filterValidCandidates(validCandidateTablets, stoppedReplicationSnapshot.reachableTablets, stoppedReplicationSnapshot.tabletsBackupState, prevPrimary, opts)
if err != nil {
return err
}
Expand Down Expand Up @@ -397,7 +397,6 @@ func (erp *EmergencyReparenter) waitForAllRelayLogsToApply(
func (erp *EmergencyReparenter) findMostAdvanced(
validCandidates map[string]replication.Position,
tabletMap map[string]*topo.TabletInfo,
tabletsBackupState map[string]bool,
opts EmergencyReparentOptions,
) (*topodatapb.Tablet, []*topodatapb.Tablet, error) {
erp.logger.Infof("started finding the intermediate source")
Expand All @@ -408,7 +407,7 @@ func (erp *EmergencyReparenter) findMostAdvanced(
}

// sort the tablets for finding the best intermediate source in ERS
err = sortTabletsForReparent(validTablets, tabletPositions, nil, tabletsBackupState, opts.durability)
err = sortTabletsForReparent(validTablets, tabletPositions, nil, opts.durability)
if err != nil {
return nil, nil, err
}
Expand All @@ -425,8 +424,7 @@ func (erp *EmergencyReparenter) findMostAdvanced(
// have a larger GTID set but we are still not choosing it. This can lead to split brain, so
// we should cancel the ERS
for i, position := range tabletPositions {
runningBackUp, ok := tabletsBackupState[topoproto.TabletAliasString(validTablets[i].Alias)]
if !winningPosition.AtLeast(position) && ok && !runningBackUp {
if !winningPosition.AtLeast(position) {
return nil, nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "split brain detected between servers - %v and %v", winningPrimaryTablet.Alias, validTablets[i].Alias)
}
}
Expand Down Expand Up @@ -741,9 +739,12 @@ func (erp *EmergencyReparenter) identifyPrimaryCandidate(
return nil, vterrors.Errorf(vtrpc.Code_INTERNAL, "unreachable - did not find a valid primary candidate even though the valid candidate list was non-empty")
}

// filterValidCandidates filters valid tablets, keeping only the ones which can successfully be promoted without any constraint failures and can make forward progress on being promoted
func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb.Tablet, tabletsReachable []*topodatapb.Tablet, prevPrimary *topodatapb.Tablet, opts EmergencyReparentOptions) ([]*topodatapb.Tablet, error) {
// filterValidCandidates filters valid tablets, keeping only the ones which can successfully be promoted without any
// constraint failures and can make forward progress on being promoted. It will filter out candidates taking backups
// if possible.
func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb.Tablet, tabletsReachable []*topodatapb.Tablet, tabletsBackupState map[string]bool, prevPrimary *topodatapb.Tablet, opts EmergencyReparentOptions) ([]*topodatapb.Tablet, error) {
var restrictedValidTablets []*topodatapb.Tablet
var notPreferredValidTablets []*topodatapb.Tablet
for _, tablet := range validTablets {
tabletAliasStr := topoproto.TabletAliasString(tablet.Alias)
// Remove tablets which have MustNot promote rule since they must never be promoted
Expand All @@ -770,9 +771,20 @@ func (erp *EmergencyReparenter) filterValidCandidates(validTablets []*topodatapb
}
continue
}
restrictedValidTablets = append(restrictedValidTablets, tablet)
// Put candidates that are running a backup in a separate list
backingUp, ok := tabletsBackupState[tabletAliasStr]
if ok && backingUp {
erp.logger.Infof("Setting %s in list of valid candidates taking a backup", tabletAliasStr)
notPreferredValidTablets = append(notPreferredValidTablets, tablet)
} else {
restrictedValidTablets = append(restrictedValidTablets, tablet)
}
}
if len(restrictedValidTablets) > 0 {
return restrictedValidTablets, nil
}
return restrictedValidTablets, nil

return notPreferredValidTablets, nil
}

// findErrantGTIDs tries to find errant GTIDs for the valid candidates and returns the updated list of valid candidates.
Expand Down
Loading

0 comments on commit 61ad1b6

Please sign in to comment.