Merge pull request #19353 from fuweid/fix-19179-robustness
test: update robustness doc and new case to reproduce 19179
ahrtr authored Feb 6, 2025
2 parents 299bca3 + 8866fce commit 8c263d2
Showing 5 changed files with 80 additions and 25 deletions.
29 changes: 16 additions & 13 deletions tests/robustness/README.md
@@ -8,19 +8,20 @@ The purpose of these tests is to rigorously validate that etcd maintains its [KV

## Robustness track record

| Correctness / Consistency issue | Report | Introduced in | Discovered by | Reproducible by robustness test | Command |
|-----------------------------------------------------------------|----------|-----------------|---------------|-------------------------------------------------|-----------------------------------|
| Inconsistent revision caused by crash during high load [#13766] | Mar 2022 | v3.5 | User | Yes, report preceded robustness tests | `make test-robustness-issue13766` |
| Single node cluster can lose a write on crash [#14370] | Aug 2022 | v3.4 or earlier | User | Yes, report preceded robustness tests | `make test-robustness-issue14370` |
| Enabling auth can lead to inconsistency [#14571] | Oct 2022 | v3.4 or earlier | User | No, authorization is not covered. | |
| Inconsistent revision caused by crash during defrag [#14685] | Nov 2022 | v3.5 | Robustness | Yes, after covering defragmentation. | `make test-robustness-issue14685` |
| Watch progress notification not synced with stream [#15220] | Jan 2023 | v3.4 or earlier | User | Yes, after covering watch progress notification | |
| Watch traveling back in time after network partition [#15271] | Feb 2023 | v3.4 or earlier | Robustness | Yes, after covering network partitions | `make test-robustness-issue15271` |
| Duplicated watch event due to bug in TXN caching [#17247] | Jan 2024 | main branch | Robustness | Yes, prevented regression in v3.6 | |
| Watch events lost during stream starvation [#17529] | Mar 2024 | v3.4 or earlier | User | Yes, after covering slow watch | `make test-robustness-issue17529` |
| Revision decreasing caused by crash during compaction [#17780] | Apr 2024 | v3.4 or earlier | Robustness | Yes, after covering compaction | |
| Watch dropping an event when compacting on delete [#18089] | May 2024 | v3.4 or earlier | Robustness | Yes, after covering compaction | `make test-robustness-issue18089` |
| Inconsistency when reading compacted revision in TXN [#18667] | Oct 2024 | v3.4 or earlier | User | | |
| Correctness / Consistency issue | Report | Introduced in | Discovered by | Reproducible by robustness test | Command |
| ----------------------------------------------------------------- | ---------- | ----------------- | --------------- | ------------------------------------------------- | ----------------------------------- |
| Inconsistent revision caused by crash during high load [#13766] | Mar 2022 | v3.5 | User | Yes, report preceded robustness tests | `make test-robustness-issue13766` |
| Single node cluster can lose a write on crash [#14370] | Aug 2022 | v3.4 or earlier | User | Yes, report preceded robustness tests | `make test-robustness-issue14370` |
| Enabling auth can lead to inconsistency [#14571] | Oct 2022 | v3.4 or earlier | User | No, authorization is not covered. | |
| Inconsistent revision caused by crash during defrag [#14685] | Nov 2022 | v3.5 | Robustness | Yes, after covering defragmentation. | `make test-robustness-issue14685` |
| Watch progress notification not synced with stream [#15220] | Jan 2023 | v3.4 or earlier | User | Yes, after covering watch progress notification | |
| Watch traveling back in time after network partition [#15271] | Feb 2023 | v3.4 or earlier | Robustness | Yes, after covering network partitions | `make test-robustness-issue15271` |
| Duplicated watch event due to bug in TXN caching [#17247] | Jan 2024 | main branch | Robustness | Yes, prevented regression in v3.6 | |
| Watch events lost during stream starvation [#17529] | Mar 2024 | v3.4 or earlier | User | Yes, after covering slow watch | `make test-robustness-issue17529` |
| Revision decreasing caused by crash during compaction [#17780] | Apr 2024 | v3.4 or earlier | Robustness | Yes, after covering compaction | |
| Watch dropping an event when compacting on delete [#18089] | May 2024 | v3.4 or earlier | Robustness | Yes, after covering compaction | `make test-robustness-issue18089` |
| Inconsistency when reading compacted revision in TXN [#18667] | Oct 2024 | v3.4 or earlier | User | | |
| Missing delete event on watch opened at the same revision as compaction [#19179] | Jan 2025 | v3.4 or earlier | Robustness | Yes, after covering compaction | `make test-robustness-issue19179` |

[#13766]: https://github.com/etcd-io/etcd/issues/13766
[#14370]: https://github.com/etcd-io/etcd/issues/14370
@@ -33,6 +34,8 @@ The purpose of these tests is to rigorously validate that etcd maintains its [KV
[#17780]: https://github.com/etcd-io/etcd/issues/17780
[#18089]: https://github.com/etcd-io/etcd/issues/18089
[#18667]: https://github.com/etcd-io/etcd/issues/18667
[#19179]: https://github.com/etcd-io/etcd/issues/19179


## How Robustness Tests Work

5 changes: 5 additions & 0 deletions tests/robustness/makefile.mk
@@ -54,6 +54,11 @@ test-robustness-issue18089: /tmp/etcd-v3.5.12-beforeSendWatchResponse/bin
	GO_TEST_FLAGS='-v -run=TestRobustnessRegression/Issue18089 -count 100 -failfast --bin-dir=/tmp/etcd-v3.5.12-beforeSendWatchResponse/bin' make test-robustness && \
	echo "Failed to reproduce" || echo "Successful reproduction"

# Runs the Issue19179 regression scenario up to 200 times against the v3.5.17
# failpoint binaries; `make test-robustness` fails when the issue reproduces,
# hence the inverted echo messages.
.PHONY: test-robustness-issue19179
test-robustness-issue19179: /tmp/etcd-v3.5.17-failpoints/bin
	GO_TEST_FLAGS='-v -run=TestRobustnessRegression/Issue19179 -count 200 -failfast --bin-dir=/tmp/etcd-v3.5.17-failpoints/bin' make test-robustness && \
	echo "Failed to reproduce" || echo "Successful reproduction"

# Failpoints

GOPATH = $(shell go env GOPATH)
31 changes: 31 additions & 0 deletions tests/robustness/scenarios/scenarios.go
@@ -224,6 +224,37 @@ func Regression(t *testing.T) []TestScenario {
			e2e.WithGoFailEnabled(true),
		),
	})

	// NOTE:
	//
	// 1. All keys have only two revisions: creation and tombstone. With
	// a small compaction batch limit, it's easy to separate a key's two
	// revisions into different batch runs. If the compaction revision is a
	// tombstone and the creation revision was deleted in a previous
	// compaction run, we may encounter issue 19179.
	//
	// 2. The issue is easier to reproduce with a lower QPS and a lower
	// burst value. A higher QPS can generate more new keys than
	// expected, making it difficult to determine an optimal compaction
	// batch limit within a larger key space.
	scenarios = append(scenarios, TestScenario{
		Name: "Issue19179",
		Profile: traffic.Profile{
			MinimalQPS: 50,
			MaximalQPS: 100,
			BurstableQPS: 100,
			ClientCount: 8,
			MaxNonUniqueRequestConcurrency: 3,
		}.WithoutCompaction(),
		Failpoint: failpoint.BatchCompactBeforeSetFinishedCompactPanic,
		Traffic: traffic.KubernetesCreateDelete,
		Cluster: *e2e.NewConfig(
			e2e.WithClusterSize(1),
			e2e.WithExperimentalCompactionBatchLimit(50),
			e2e.WithSnapshotCount(1000),
			e2e.WithGoFailEnabled(true),
		),
	})
	scenarios = append(scenarios, TestScenario{
		Name: "Issue18089",
		Profile: traffic.LowTraffic.WithCompactionPeriod(100 * time.Millisecond), // Use frequent compaction for a high reproduction rate
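The NOTE above describes the state the new scenario drives etcd into. As a rough, hypothetical sketch only (assuming a local single-node etcd at `localhost:2379`; the robustness framework reaches this state through the failpoint and generated traffic rather than direct client calls), the invariant behind issue 19179 is: a watch opened at the compaction revision must still deliver the delete event recorded at that revision.

```go
// Hypothetical illustration of the issue 19179 invariant; assumes a local
// single-node etcd at localhost:2379. Not the robustness framework's code.
package main

import (
	"context"
	"fmt"
	"time"

	"go.etcd.io/etcd/api/v3/mvccpb"
	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"localhost:2379"},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer cli.Close()
	ctx := context.Background()

	// A key with exactly two revisions: creation and tombstone.
	if _, err := cli.Put(ctx, "/registry/pods/default/p1", "v1"); err != nil {
		panic(err)
	}
	delResp, err := cli.Delete(ctx, "/registry/pods/default/p1")
	if err != nil {
		panic(err)
	}
	delRev := delResp.Header.Revision

	// Compact exactly at the tombstone revision.
	if _, err := cli.Compact(ctx, delRev); err != nil {
		panic(err)
	}

	// A watch opened at the compaction revision must still replay the
	// DELETE event recorded at that revision.
	wctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	for resp := range cli.Watch(wctx, "/registry/pods/default/p1", clientv3.WithRev(delRev)) {
		for _, ev := range resp.Events {
			if ev.Type == mvccpb.DELETE && ev.Kv.ModRevision == delRev {
				fmt.Println("received the expected delete event")
				return
			}
		}
	}
	fmt.Println("delete event was not delivered (issue 19179 symptom)")
}
```

When the bug triggers, the DELETE event is never delivered and the watch loop falls through to the timeout branch.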
35 changes: 24 additions & 11 deletions tests/robustness/traffic/kubernetes.go
@@ -35,17 +35,30 @@ import (
	"go.etcd.io/etcd/tests/v3/robustness/random"
)

var Kubernetes Traffic = kubernetesTraffic{
	averageKeyCount: 10,
	resource: "pods",
	namespace: "default",
	// Please keep the sum of weights equal to 100.
	writeChoices: []random.ChoiceWeight[KubernetesRequestType]{
		{Choice: KubernetesUpdate, Weight: 90},
		{Choice: KubernetesDelete, Weight: 5},
		{Choice: KubernetesCreate, Weight: 5},
	},
}
var (
	Kubernetes Traffic = kubernetesTraffic{
		averageKeyCount: 10,
		resource: "pods",
		namespace: "default",
		// Please keep the sum of weights equal to 100.
		writeChoices: []random.ChoiceWeight[KubernetesRequestType]{
			{Choice: KubernetesUpdate, Weight: 90},
			{Choice: KubernetesDelete, Weight: 5},
			{Choice: KubernetesCreate, Weight: 5},
		},
	}

	KubernetesCreateDelete Traffic = kubernetesTraffic{
		averageKeyCount: 10,
		resource: "pods",
		namespace: "default",
		// Please keep the sum of weights equal to 100.
		writeChoices: []random.ChoiceWeight[KubernetesRequestType]{
			{Choice: KubernetesDelete, Weight: 40},
			{Choice: KubernetesCreate, Weight: 60},
		},
	}
)

type kubernetesTraffic struct {
	averageKeyCount int
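For context on the `writeChoices` weights above: `random.ChoiceWeight` pairs a request type with a weight, and the traffic generator picks among them proportionally. The helper below is a minimal, hypothetical sketch of such a weighted pick (it is not the framework's actual `random` package); with the 40/60 split above it yields roughly 40% deletes and 60% creates and no updates, so most keys end up with exactly the two revisions the Issue19179 scenario relies on.

```go
// Hypothetical sketch of proportional selection over weighted choices.
package main

import (
	"fmt"
	"math/rand"
)

// choiceWeight mirrors the shape of a weighted choice for illustration only.
type choiceWeight struct {
	choice string
	weight int
}

// pickWeighted returns a choice with probability proportional to its weight.
func pickWeighted(r *rand.Rand, choices []choiceWeight) string {
	total := 0
	for _, c := range choices {
		total += c.weight
	}
	n := r.Intn(total) // uniform in [0, total)
	for _, c := range choices {
		n -= c.weight
		if n < 0 {
			return c.choice
		}
	}
	return choices[len(choices)-1].choice // not reached for positive weights
}

func main() {
	r := rand.New(rand.NewSource(1))
	choices := []choiceWeight{
		{choice: "delete", weight: 40},
		{choice: "create", weight: 60},
	}
	counts := map[string]int{}
	for i := 0; i < 10000; i++ {
		counts[pickWeighted(r, choices)]++
	}
	fmt.Println(counts) // roughly 4000 deletes and 6000 creates
}
```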
5 changes: 4 additions & 1 deletion tests/robustness/traffic/traffic.go
@@ -41,12 +41,14 @@ var (
	LowTraffic = Profile{
		MinimalQPS: 100,
		MaximalQPS: 200,
		BurstableQPS: 1000,
		ClientCount: 8,
		MaxNonUniqueRequestConcurrency: 3,
	}
	HighTrafficProfile = Profile{
		MinimalQPS: 100,
		MaximalQPS: 1000,
		BurstableQPS: 1000,
		ClientCount: 8,
		MaxNonUniqueRequestConcurrency: 3,
	}
@@ -59,7 +61,7 @@ func SimulateTraffic(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2
	lm := identity.NewLeaseIDStorage()
	reports := []report.ClientReport{}
	// Use the highest MaximalQPS of all traffic profiles as burst, otherwise actual traffic may be accidentally limited
	limiter := rate.NewLimiter(rate.Limit(profile.MaximalQPS), 1000)
	limiter := rate.NewLimiter(rate.Limit(profile.MaximalQPS), profile.BurstableQPS)

	cc, err := client.NewRecordingClient(endpoints, ids, baseTime)
	require.NoError(t, err)
@@ -178,6 +180,7 @@ func (ts *trafficStats) QPS() float64 {
type Profile struct {
	MinimalQPS float64
	MaximalQPS float64
	BurstableQPS int
	MaxNonUniqueRequestConcurrency int
	ClientCount int
	ForbidCompaction bool
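For the limiter change above: `rate.NewLimiter(limit, burst)` from `golang.org/x/time/rate` caps the sustained rate with `limit` and the instantaneous spike with `burst`. A small, self-contained sketch of that behavior (illustrative numbers only, not robustness-test code):

```go
package main

import (
	"context"
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

func main() {
	// Sustained rate of 100 requests/second, bursts of at most 100 tokens.
	limiter := rate.NewLimiter(rate.Limit(100), 100)

	// The bucket starts full, so roughly `burst` requests are admitted at once.
	admitted := 0
	for limiter.Allow() {
		admitted++
	}
	fmt.Println("admitted in initial burst:", admitted) // ~100

	// After the burst is spent, requests are paced at the sustained rate
	// (one token roughly every 10ms at 100 QPS).
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	if err := limiter.Wait(ctx); err != nil {
		fmt.Println("wait failed:", err)
		return
	}
	fmt.Println("next request admitted after waiting for a token")
}
```

Previously the burst was hardcoded to 1000 regardless of profile; exposing it as `Profile.BurstableQPS` lets a scenario such as Issue19179 (QPS 50–100, `BurstableQPS: 100`) keep spikes small, which the scenario's NOTE identifies as important for reproduction.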
