Address PR comments
Signed-off-by: Eduardo J. Ortega U <5791035+ejortegau@users.noreply.github.com>
ejortegau committed Nov 5, 2024
1 parent ca7078e commit 6840803
Showing 13 changed files with 77 additions and 75 deletions.
11 changes: 6 additions & 5 deletions changelog/22.0/22.0.0/summary.md
@@ -4,14 +4,15 @@

- **[Known Issues](#known-issues)**
- **[Major Changes](#major-changes)**
- - **[Support for specifying expected primary in reparents](#reparents-prefer-not-backing-up)**
+ - **[Prefer not promoting a replica that is currently taking a backup](#reparents-prefer-not-backing-up)**

## <a id="known-issues"/>Known Issues</a>

## <a id="major-changes"/>Major Changes</a>

- ### <a id="reparents-prefer-not-backing-up"/>Support specifying expected primary in reparents
+ ### <a id="reparents-prefer-not-backing-up"/>Prefer not promoting a replica that is currently taking a backup

- The execution of both Planned Shard Reparents and Emergency Reparents now prefer promoting replicas that are not
- currently taking backups. Notice that if there's only one suitable replica to promote, and it is taking a backup, it
- will still be promoted.
+ Both planned and emergency failovers now prefer promoting replicas that are not
+ currently taking backups. Note that if there's only one suitable replica to promote, and it is taking a backup, it
+ will still be promoted. This does not apply to `builtin` backups. A replica that is currently taking a `builtin` backup
+ will never be promoted.
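The behavior described in this changelog entry boils down to a simple preference order. Here is a minimal, self-contained Go sketch of it — `candidate` and `pickPrimary` are hypothetical names, not Vitess code, and the `builtin` special case is omitted:

```go
package main

import "fmt"

type candidate struct {
	alias         string
	backupRunning bool
}

// pickPrimary prefers a candidate that is not taking a backup, but still
// returns a backing-up candidate when it is the only option.
func pickPrimary(candidates []candidate) (candidate, error) {
	if len(candidates) == 0 {
		return candidate{}, fmt.Errorf("no promotable replicas")
	}
	for _, c := range candidates {
		if !c.backupRunning {
			return c, nil
		}
	}
	// Every candidate is taking a backup; promote one anyway rather than
	// failing the reparent.
	return candidates[0], nil
}

func main() {
	cs := []candidate{{"zone1-101", true}, {"zone1-102", false}}
	winner, _ := pickPrimary(cs)
	fmt.Println(winner.alias) // zone1-102
}
```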
5 changes: 3 additions & 2 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
@@ -422,10 +422,11 @@ func (erp *EmergencyReparenter) findMostAdvanced(

// We have already removed the tablets with errant GTIDs before calling this function. At this point our winning position must be a
// superset of all the other valid positions - unless the tablet with that position is taking a backup, in which case it might
- have a larger GTID set but we are still not choosing it. If that is not the case, then we have a split brain scenario, and
+ have a larger GTID set but we are still not choosing it. This can lead to split brain, so
// we should cancel the ERS
for i, position := range tabletPositions {
- if !winningPosition.AtLeast(position) && !backingUpTablets[topoproto.TabletAliasString(validTablets[i].Alias)] {
+ runningBackUp, ok := backingUpTablets[topoproto.TabletAliasString(validTablets[i].Alias)]
+ if !winningPosition.AtLeast(position) && ok && !runningBackUp {
return nil, nil, vterrors.Errorf(vtrpc.Code_FAILED_PRECONDITION, "split brain detected between servers - %v and %v", winningPrimaryTablet.Alias, validTablets[i].Alias)
}
}
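The rewritten check above changes semantics, not just names: with the comma-ok lookup, the split-brain error can only fire for a tablet whose backup status is present in the map and false, whereas the old plain index treated a missing alias as "not backing up". A stand-alone illustration of the difference (map contents hypothetical):

```go
package main

import "fmt"

func main() {
	backingUp := map[string]bool{"zone1-101": true}

	// Plain lookup: a missing alias yields the zero value (false).
	fmt.Println(!backingUp["zone1-102"]) // true — would satisfy the old check

	// Comma-ok lookup: the new check is skipped for unknown aliases.
	v, ok := backingUp["zone1-102"]
	fmt.Println(ok && !v) // false — the new check does not fire
}
```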
2 changes: 1 addition & 1 deletion go/vt/vtctl/reparentutil/emergency_reparenter_test.go
@@ -2868,7 +2868,7 @@ func TestEmergencyReparenter_findMostAdvanced(t *testing.T) {
},
},
}, {
- name: "choose most advanced not backing up",
+ name: "choose most advanced not running backup",
validCandidates: map[string]replication.Position{
"zone1-0000000100": positionMostAdvanced,
"zone1-0000000101": positionIntermediate1,
10 changes: 5 additions & 5 deletions go/vt/vtctl/reparentutil/reparent_sorter.go
@@ -33,7 +33,7 @@ type reparentSorter struct {
tablets []*topodatapb.Tablet
positions []replication.Position
innodbBufferPool []int
- backingUp        map[string]bool
+ backupRunning    map[string]bool
durability Durabler
}

@@ -44,7 +44,7 @@ func newReparentSorter(tablets []*topodatapb.Tablet, positions []replication.Pos
positions: positions,
durability: durability,
innodbBufferPool: innodbBufferPool,
- backingUp:        backingUp,
+ backupRunning:    backingUp,
}
}

@@ -74,11 +74,11 @@ func (rs *reparentSorter) Less(i, j int) bool {
return true
}

- // We want tablets backing up always at the end of the list, so that's the first thing we check
- if !rs.backingUp[topoproto.TabletAliasString(rs.tablets[i].Alias)] && rs.backingUp[topoproto.TabletAliasString(rs.tablets[j].Alias)] {
+ // We want tablets which are currently running a backup to always be at the end of the list, so that's the first thing we check
+ if !rs.backupRunning[topoproto.TabletAliasString(rs.tablets[i].Alias)] && rs.backupRunning[topoproto.TabletAliasString(rs.tablets[j].Alias)] {
return true
}
- if rs.backingUp[topoproto.TabletAliasString(rs.tablets[i].Alias)] && !rs.backingUp[topoproto.TabletAliasString(rs.tablets[j].Alias)] {
+ if rs.backupRunning[topoproto.TabletAliasString(rs.tablets[i].Alias)] && !rs.backupRunning[topoproto.TabletAliasString(rs.tablets[j].Alias)] {
return false
}

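To see the effect of putting the backup check first, here is a sketch of the ordering such a comparator yields with Go's sort package. The `tablet` struct and `gtidCount` field are simplified stand-ins for the real `reparentSorter` and its position comparison:

```go
package main

import (
	"fmt"
	"sort"
)

type tablet struct {
	alias         string
	backupRunning bool
	gtidCount     int // stand-in for a replication position comparison
}

func main() {
	tablets := []tablet{
		{"zone1-100", true, 10}, // most advanced, but running a backup
		{"zone1-101", false, 8},
		{"zone1-102", false, 9},
	}
	sort.Slice(tablets, func(i, j int) bool {
		// Backup status is checked first, mirroring Less above.
		if !tablets[i].backupRunning && tablets[j].backupRunning {
			return true
		}
		if tablets[i].backupRunning && !tablets[j].backupRunning {
			return false
		}
		// Otherwise fall back to the position comparison.
		return tablets[i].gtidCount > tablets[j].gtidCount
	})
	fmt.Println(tablets) // zone1-102, zone1-101, then zone1-100 last
}
```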
90 changes: 45 additions & 45 deletions go/vt/vtctl/reparentutil/reparent_sorter_test.go
@@ -90,71 +90,71 @@ func TestReparentSorter(t *testing.T) {
positionIntermediate2.GTIDSet = positionIntermediate2.GTIDSet.AddGTID(mysqlGTID2)

testcases := []struct {
- name             string
- tablets          []*topodatapb.Tablet
- innodbBufferPool []int
- positions        []replication.Position
- backingUpTablets map[string]bool
- containsErr      string
- sortedTablets    []*topodatapb.Tablet
+ name               string
+ tablets            []*topodatapb.Tablet
+ innodbBufferPool   []int
+ positions          []replication.Position
+ tabletBackupStatus map[string]bool
+ containsErr        string
+ sortedTablets      []*topodatapb.Tablet
}{
{
- name:             "all advanced, sort via promotion rules",
- tablets:          []*topodatapb.Tablet{nil, tabletReplica1_100, tabletRdonly1_102},
- backingUpTablets: map[string]bool{topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
- positions:        []replication.Position{positionMostAdvanced, positionMostAdvanced, positionMostAdvanced},
- sortedTablets:    []*topodatapb.Tablet{tabletReplica1_100, tabletRdonly1_102, nil},
+ name:               "all advanced, sort via promotion rules",
+ tablets:            []*topodatapb.Tablet{nil, tabletReplica1_100, tabletRdonly1_102},
+ tabletBackupStatus: map[string]bool{topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
+ positions:          []replication.Position{positionMostAdvanced, positionMostAdvanced, positionMostAdvanced},
+ sortedTablets:      []*topodatapb.Tablet{tabletReplica1_100, tabletRdonly1_102, nil},
}, {
- name:             "all advanced, sort via innodb buffer pool",
- tablets:          []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100},
- backingUpTablets: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false},
- positions:        []replication.Position{positionMostAdvanced, positionMostAdvanced, positionMostAdvanced},
- innodbBufferPool: []int{10, 40, 25},
- sortedTablets:    []*topodatapb.Tablet{tabletReplica2_100, tabletReplica1_100, tabletReplica1_101},
+ name:               "all advanced, sort via innodb buffer pool",
+ tablets:            []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100},
+ tabletBackupStatus: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false},
+ positions:          []replication.Position{positionMostAdvanced, positionMostAdvanced, positionMostAdvanced},
+ innodbBufferPool:   []int{10, 40, 25},
+ sortedTablets:      []*topodatapb.Tablet{tabletReplica2_100, tabletReplica1_100, tabletReplica1_101},
}, {
- name:             "ordering by position",
- tablets:          []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100, tabletRdonly1_102},
- backingUpTablets: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
- positions:        []replication.Position{positionEmpty, positionIntermediate1, positionIntermediate2, positionMostAdvanced},
- sortedTablets:    []*topodatapb.Tablet{tabletRdonly1_102, tabletReplica1_100, tabletReplica2_100, tabletReplica1_101},
+ name:               "ordering by position",
+ tablets:            []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100, tabletRdonly1_102},
+ tabletBackupStatus: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
+ positions:          []replication.Position{positionEmpty, positionIntermediate1, positionIntermediate2, positionMostAdvanced},
+ sortedTablets:      []*topodatapb.Tablet{tabletRdonly1_102, tabletReplica1_100, tabletReplica2_100, tabletReplica1_101},
}, {
name: "tablets and positions count error",
tablets: []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100},
positions: []replication.Position{positionEmpty, positionIntermediate1, positionMostAdvanced},
containsErr: "unequal number of tablets and positions",
}, {
- name:             "promotion rule check",
- tablets:          []*topodatapb.Tablet{tabletReplica1_101, tabletRdonly1_102},
- backingUpTablets: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
- positions:        []replication.Position{positionMostAdvanced, positionMostAdvanced},
- sortedTablets:    []*topodatapb.Tablet{tabletReplica1_101, tabletRdonly1_102},
+ name:               "promotion rule check",
+ tablets:            []*topodatapb.Tablet{tabletReplica1_101, tabletRdonly1_102},
+ tabletBackupStatus: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
+ positions:          []replication.Position{positionMostAdvanced, positionMostAdvanced},
+ sortedTablets:      []*topodatapb.Tablet{tabletReplica1_101, tabletRdonly1_102},
}, {
- name:             "mixed",
- tablets:          []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100, tabletRdonly1_102},
- backingUpTablets: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
- positions:        []replication.Position{positionEmpty, positionIntermediate1, positionMostAdvanced, positionIntermediate1},
- sortedTablets:    []*topodatapb.Tablet{tabletReplica1_100, tabletReplica2_100, tabletRdonly1_102, tabletReplica1_101},
+ name:               "mixed",
+ tablets:            []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100, tabletRdonly1_102},
+ tabletBackupStatus: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
+ positions:          []replication.Position{positionEmpty, positionIntermediate1, positionMostAdvanced, positionIntermediate1},
+ sortedTablets:      []*topodatapb.Tablet{tabletReplica1_100, tabletReplica2_100, tabletRdonly1_102, tabletReplica1_101},
}, {
- name:             "mixed - another",
- tablets:          []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100, tabletRdonly1_102},
- backingUpTablets: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
- positions:        []replication.Position{positionIntermediate1, positionIntermediate1, positionMostAdvanced, positionIntermediate1},
- innodbBufferPool: []int{100, 200, 0, 200},
- sortedTablets:    []*topodatapb.Tablet{tabletReplica1_100, tabletReplica2_100, tabletReplica1_101, tabletRdonly1_102},
+ name:               "mixed - another",
+ tablets:            []*topodatapb.Tablet{tabletReplica1_101, tabletReplica2_100, tabletReplica1_100, tabletRdonly1_102},
+ tabletBackupStatus: map[string]bool{topoproto.TabletAliasString(tabletReplica1_101.Alias): false, topoproto.TabletAliasString(tabletReplica2_100.Alias): false, topoproto.TabletAliasString(tabletReplica1_100.Alias): false, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
+ positions:          []replication.Position{positionIntermediate1, positionIntermediate1, positionMostAdvanced, positionIntermediate1},
+ innodbBufferPool:   []int{100, 200, 0, 200},
+ sortedTablets:      []*topodatapb.Tablet{tabletReplica1_100, tabletReplica2_100, tabletReplica1_101, tabletRdonly1_102},
}, {
- name:             "all advanced, sort via backup flag",
- tablets:          []*topodatapb.Tablet{nil, tabletReplica1_100, tabletRdonly1_102},
- backingUpTablets: map[string]bool{topoproto.TabletAliasString(tabletReplica1_100.Alias): true, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
- positions:        []replication.Position{positionMostAdvanced, positionMostAdvanced, positionMostAdvanced},
- sortedTablets:    []*topodatapb.Tablet{tabletRdonly1_102, tabletReplica1_100, nil},
+ name:               "all advanced, sort via backup flag",
+ tablets:            []*topodatapb.Tablet{nil, tabletReplica1_100, tabletRdonly1_102},
+ tabletBackupStatus: map[string]bool{topoproto.TabletAliasString(tabletReplica1_100.Alias): true, topoproto.TabletAliasString(tabletRdonly1_102.Alias): false},
+ positions:          []replication.Position{positionMostAdvanced, positionMostAdvanced, positionMostAdvanced},
+ sortedTablets:      []*topodatapb.Tablet{tabletRdonly1_102, tabletReplica1_100, nil},
},
}

durability, err := GetDurabilityPolicy("none")
require.NoError(t, err)
for _, testcase := range testcases {
t.Run(testcase.name, func(t *testing.T) {
- err := sortTabletsForReparent(testcase.tablets, testcase.positions, testcase.innodbBufferPool, testcase.backingUpTablets, durability)
+ err := sortTabletsForReparent(testcase.tablets, testcase.positions, testcase.innodbBufferPool, testcase.tabletBackupStatus, durability)
if testcase.containsErr != "" {
require.EqualError(t, err, testcase.containsErr)
} else {
6 changes: 3 additions & 3 deletions go/vt/vtctl/reparentutil/util.go
@@ -127,7 +127,7 @@ func ElectNewPrimary(
tb := tablet
errorGroup.Go(func() error {
// find and store the positions for the tablet
- pos, replLag, backingUp, err := findPositionLagBackingUpForTablet(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout)
+ pos, replLag, backingUp, err := findTabletPositionLagBackupStatus(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout)
mu.Lock()
defer mu.Unlock()
if err == nil && (opts.TolerableReplLag == 0 || opts.TolerableReplLag >= replLag) {
@@ -161,9 +161,9 @@ func ElectNewPrimary(
return validTablets[0].Alias, nil
}

- // findPositionLagBackingUpForTablet processes the replication position and lag for a single tablet and
+ // findTabletPositionLagBackupStatus processes the replication position and lag for a single tablet and
// returns it. It is safe to call from multiple goroutines.
- func findPositionLagBackingUpForTablet(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, error) {
+ func findTabletPositionLagBackupStatus(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, error) {
logger.Infof("getting replication position from %v", topoproto.TabletAliasString(tablet.Alias))

ctx, cancel := context.WithTimeout(ctx, waitTimeout)
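`ElectNewPrimary` invokes the renamed helper once per tablet from an errgroup, merging results under a mutex. A compact, runnable sketch of that fan-out pattern — `fetchStatus` is a hypothetical stand-in for `findTabletPositionLagBackupStatus`:

```go
package main

import (
	"context"
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

// fetchStatus returns fake data in place of a real tablet RPC.
func fetchStatus(ctx context.Context, alias string) (pos string, backupRunning bool, err error) {
	return "MySQL56/...:1-5", alias == "zone1-102", nil
}

func main() {
	aliases := []string{"zone1-101", "zone1-102"}
	var mu sync.Mutex
	backupStatus := make(map[string]bool)

	g, ctx := errgroup.WithContext(context.Background())
	for _, alias := range aliases {
		alias := alias // capture a per-iteration copy (pre-Go 1.22 idiom)
		g.Go(func() error {
			_, running, err := fetchStatus(ctx, alias)
			if err != nil {
				return err
			}
			mu.Lock()
			defer mu.Unlock()
			backupStatus[alias] = running
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(backupStatus)
}
```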
10 changes: 5 additions & 5 deletions go/vt/vtctl/reparentutil/util_test.go
@@ -140,7 +140,7 @@ func TestElectNewPrimary(t *testing.T) {
errContains: nil,
},
{
- name: "Two good replicas, but one of them backing up so we pick the other one",
+ name: "Two good replicas, but one of them is taking a backup so we pick the other one",
tmc: &chooseNewPrimaryTestTMClient{
// both zone1-101 and zone1-102 are equivalent from a replication PoV, but zone1-102 is taking a backup
replicationStatuses: map[string]*replicationdatapb.Status{
@@ -201,7 +201,7 @@
errContains: nil,
},
{
- name: "Only one replica, but it's backing up. We still use it.",
+ name: "Only one replica, but it's taking a backup. We still use it.",
tmc: &chooseNewPrimaryTestTMClient{
// both zone1-101 and zone1-102 are equivalent from a replication PoV, but zone1-102 is taking a backup
replicationStatuses: map[string]*replicationdatapb.Status{
@@ -524,7 +524,7 @@
errContains: nil,
},
{
- name: "Two replicas, first one with too much lag, another one backing up - elect the one taking backup",
+ name: "Two replicas, first one with too much lag, another one taking a backup - elect the one taking backup",
tmc: &chooseNewPrimaryTestTMClient{
// zone1-101 is behind zone1-102
replicationStatuses: map[string]*replicationdatapb.Status{
@@ -1083,7 +1083,7 @@ func TestFindPositionForTablet(t *testing.T) {
expectedLag: 201 * time.Second,
expectedPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
}, {
- name: "Host is backing up",
+ name: "Host is taking a backup",
tmc: &testutil.TabletManagerClient{
ReplicationStatusResults: map[string]struct {
Position *replicationdatapb.Status
@@ -1177,7 +1177,7 @@

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
- pos, lag, backingUp, err := findPositionLagBackingUpForTablet(ctx, test.tablet, logger, test.tmc, 10*time.Second)
+ pos, lag, backingUp, err := findTabletPositionLagBackupStatus(ctx, test.tablet, logger, test.tmc, 10*time.Second)
if test.expectedErr != "" {
require.EqualError(t, err, test.expectedErr)
return
4 changes: 2 additions & 2 deletions go/vt/vttablet/grpctmserver/server.go
@@ -342,7 +342,7 @@ func (s *server) MysqlHostMetrics(ctx context.Context, request *tabletmanagerdat
func (s *server) ReplicationStatus(ctx context.Context, request *tabletmanagerdatapb.ReplicationStatusRequest) (response *tabletmanagerdatapb.ReplicationStatusResponse, err error) {
defer s.tm.HandleRPCPanic(ctx, "ReplicationStatus", request, response, false /*verbose*/, &err)
ctx = callinfo.GRPCCallInfo(ctx)
- response = &tabletmanagerdatapb.ReplicationStatusResponse{BackingUp: s.tm.IsBackingUp()}
+ response = &tabletmanagerdatapb.ReplicationStatusResponse{BackingUp: s.tm.IsBackupRunning()}
status, err := s.tm.ReplicationStatus(ctx)
if err == nil {
response.Status = status
@@ -626,7 +626,7 @@ func (s *server) StopReplicationAndGetStatus(ctx context.Context, request *table
response.Status = statusResponse.Status
}

- response.BackingUp = s.tm.IsBackingUp()
+ response.BackingUp = s.tm.IsBackupRunning()

return response, err
}
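Note the ordering in both handlers: `BackingUp` is filled from `IsBackupRunning()` before the status lookup, so a caller sees the backup state even when fetching replication status fails. A distilled sketch with simplified stand-in types (not the generated protobuf messages):

```go
package main

import (
	"errors"
	"fmt"
)

type statusResponse struct {
	BackingUp bool
	Status    string
}

type tabletManager struct{ backupRunning bool }

func (tm *tabletManager) IsBackupRunning() bool { return tm.backupRunning }

func (tm *tabletManager) replicationStatus() (string, error) {
	return "", errors.New("replication not configured") // simulated failure
}

// replicationStatusHandler populates BackingUp first, then attaches the
// status only when the lookup succeeds, mirroring the handler above.
func replicationStatusHandler(tm *tabletManager) (*statusResponse, error) {
	resp := &statusResponse{BackingUp: tm.IsBackupRunning()}
	status, err := tm.replicationStatus()
	if err == nil {
		resp.Status = status
	}
	return resp, err
}

func main() {
	resp, err := replicationStatusHandler(&tabletManager{backupRunning: true})
	fmt.Println(resp.BackingUp, err) // true, with the lookup error preserved
}
```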
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletmanager/rpc_agent.go
@@ -164,7 +164,7 @@ type RPCTM interface {

RestoreFromBackup(ctx context.Context, logger logutil.Logger, request *tabletmanagerdatapb.RestoreFromBackupRequest) error

- IsBackingUp() bool
+ IsBackupRunning() bool

// HandleRPCPanic is to be called in a defer statement in each
// RPC input point.
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletmanager/rpc_backup.go
@@ -205,7 +205,7 @@ func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.L
return err
}

- func (tm *TabletManager) IsBackingUp() bool {
+ func (tm *TabletManager) IsBackupRunning() bool {
return tm._isBackupRunning
}

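`IsBackupRunning` just reads `tm._isBackupRunning`; the code that sets the flag is not part of this diff. A plausible lifecycle — the set/clear placement and the atomic shown here are assumptions, not the actual implementation — is to flip the flag around the backup itself:

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

type tabletManager struct{ backupRunning atomic.Bool }

func (tm *tabletManager) IsBackupRunning() bool { return tm.backupRunning.Load() }

// Backup sets the flag for its duration; an atomic is used because the
// flag is read concurrently from RPC goroutines.
func (tm *tabletManager) Backup() {
	tm.backupRunning.Store(true)
	defer tm.backupRunning.Store(false)
	time.Sleep(50 * time.Millisecond) // stand-in for the actual backup work
}

func main() {
	tm := &tabletManager{}
	go tm.Backup()
	time.Sleep(10 * time.Millisecond)
	fmt.Println(tm.IsBackupRunning()) // true while the backup is in flight
}
```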
2 changes: 1 addition & 1 deletion go/vt/vttablet/tmrpctest/test_tm_rpc.go
@@ -1411,7 +1411,7 @@ func (fra *fakeRPCTM) Backup(ctx context.Context, logger logutil.Logger, request
return nil
}

- func (fra *fakeRPCTM) IsBackingUp() bool {
+ func (fra *fakeRPCTM) IsBackupRunning() bool {
return false
}

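The fake has to track the rename or it stops satisfying `RPCTM` and the build breaks. A common guard against that kind of drift — shown here with simplified stand-in types, not the real interface — is a compile-time assertion:

```go
package main

import "fmt"

type rpcTM interface{ IsBackupRunning() bool }

type fakeRPCTM struct{}

func (f *fakeRPCTM) IsBackupRunning() bool { return false }

// Compile-time check: the build fails if fakeRPCTM drifts from rpcTM.
var _ rpcTM = (*fakeRPCTM)(nil)

func main() {
	var tm rpcTM = &fakeRPCTM{}
	fmt.Println(tm.IsBackupRunning())
}
```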
