diff --git a/go.mod b/go.mod
index ebc70910f..20fa8cc71 100644
--- a/go.mod
+++ b/go.mod
@@ -5,9 +5,9 @@ require (
 	github.com/golang/snappy v0.0.4
 	github.com/pkg/errors v0.9.1
 	github.com/urfave/cli v1.22.15
-	github.com/xtaci/kcp-go/v5 v5.6.17
+	github.com/xtaci/kcp-go/v5 v5.6.18
 	github.com/xtaci/qpp v1.1.17
-	github.com/xtaci/smux v1.5.30
+	github.com/xtaci/smux v1.5.31
 	github.com/xtaci/tcpraw v1.2.31
 	golang.org/x/crypto v0.27.0
 )
@@ -17,14 +17,14 @@ require (
 	github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
 	github.com/google/gopacket v1.1.19 // indirect
 	github.com/klauspost/cpuid/v2 v2.2.8 // indirect
-	github.com/klauspost/reedsolomon v1.12.3 // indirect
+	github.com/klauspost/reedsolomon v1.12.4 // indirect
 	github.com/mattn/go-colorable v0.1.13 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
 	github.com/russross/blackfriday/v2 v2.1.0 // indirect
 	github.com/templexxx/cpu v0.1.1 // indirect
 	github.com/templexxx/xorsimd v0.4.3 // indirect
 	github.com/tjfoc/gmsm v1.4.1 // indirect
-	golang.org/x/net v0.28.0 // indirect
+	golang.org/x/net v0.29.0 // indirect
 	golang.org/x/sys v0.25.0 // indirect
 )
 
diff --git a/go.sum b/go.sum
index 15165c380..39edcaa70 100644
--- a/go.sum
+++ b/go.sum
@@ -37,8 +37,8 @@ github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF
 github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo=
 github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
 github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
-github.com/klauspost/reedsolomon v1.12.3 h1:tzUznbfc3OFwJaTebv/QdhnFf2Xvb7gZ24XaHLBPmdc=
-github.com/klauspost/reedsolomon v1.12.3/go.mod h1:3K5rXwABAvzGeR01r6pWZieUALXO/Tq7bFKGIb4m4WI=
+github.com/klauspost/reedsolomon v1.12.4 h1:5aDr3ZGoJbgu/8+j45KtUJxzYm8k08JGtB9Wx1VQ4OA=
+github.com/klauspost/reedsolomon v1.12.4/go.mod h1:d3CzOMOt0JXGIFZm1StgkyF14EYr3xneR2rNWo7NcMU=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
@@ -68,18 +68,14 @@ github.com/tjfoc/gmsm v1.4.1 h1:aMe1GlZb+0bLjn+cKTPEvvn9oUEBlJitaZiiBwsbgho=
 github.com/tjfoc/gmsm v1.4.1/go.mod h1:j4INPkHWMrhJb38G+J6W4Tw0AbuN8Thu3PbdVYhVcTE=
 github.com/urfave/cli v1.22.15 h1:nuqt+pdC/KqswQKhETJjo7pvn/k4xMUxgW6liI7XpnM=
 github.com/urfave/cli v1.22.15/go.mod h1:wSan1hmo5zeyLGBjRJbzRTNk8gwoYa2B9n4q9dmRIc0=
-github.com/xtaci/kcp-go/v5 v5.6.17 h1:en/aK0IDKX5Zb5NlfLRzbuhw1nnj8vhf6AgbAJI2WX4=
-github.com/xtaci/kcp-go/v5 v5.6.17/go.mod h1:75S1AKYYzNUSXIv30h+jPKJYZUwqpfvLshu63nCNSOM=
+github.com/xtaci/kcp-go/v5 v5.6.18 h1:7oV4mc272pcnn39/13BB11Bx7hJM4ogMIEokJYVWn4g=
+github.com/xtaci/kcp-go/v5 v5.6.18/go.mod h1:75S1AKYYzNUSXIv30h+jPKJYZUwqpfvLshu63nCNSOM=
 github.com/xtaci/lossyconn v0.0.0-20190602105132-8df528c0c9ae h1:J0GxkO96kL4WF+AIT3M4mfUVinOCPgf2uUWYFUzN0sM=
 github.com/xtaci/lossyconn v0.0.0-20190602105132-8df528c0c9ae/go.mod h1:gXtu8J62kEgmN++bm9BVICuT/e8yiLI2KFobd/TRFsE=
 github.com/xtaci/qpp v1.1.17 h1:w35NYqF3wOBoAMs+2qA2XFjkNQ12mugw51CUJ7OcTzo=
 github.com/xtaci/qpp v1.1.17/go.mod h1:dJS3usaXNMbWxZSWCAdxz01UgJcz9wXDkd4BccDY/V0=
-github.com/xtaci/smux v1.5.29 h1:xGpY4B0ngArN1yNXvKerMiW4QABDXxELUJhR3EAh5bQ=
-github.com/xtaci/smux v1.5.29/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY=
-github.com/xtaci/smux v1.5.30-0.20240915135522-784d53ae558d h1:OgHYTbVkmCTV9M4l5GSidOjmXBubG6RqBTt7Q1DoTPU=
-github.com/xtaci/smux v1.5.30-0.20240915135522-784d53ae558d/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY=
-github.com/xtaci/smux v1.5.30 h1:LFxB7WSr0mbQhbdJzfbxnfCKVQKYzcyB+/8mXf2dTdQ=
-github.com/xtaci/smux v1.5.30/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY=
+github.com/xtaci/smux v1.5.31 h1:3ha7sHtH46h85Iv7MfQogxasuRt1KPRhoFB3S4rmHgU=
+github.com/xtaci/smux v1.5.31/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY=
 github.com/xtaci/tcpraw v1.2.31 h1:i9mXzejnGJdGi0DpVKUn19Hq202/sHOJt0kObEwuE/U=
 github.com/xtaci/tcpraw v1.2.31/go.mod h1:T1blYD2EDkLneb+HtxddnzX38SoC9BG537EhkXeaT2k=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
@@ -101,8 +97,8 @@ golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
-golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
-golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
+golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
+golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
diff --git a/vendor/github.com/klauspost/reedsolomon/galois.go b/vendor/github.com/klauspost/reedsolomon/galois.go
index 9b3639502..bbc521f4e 100644
--- a/vendor/github.com/klauspost/reedsolomon/galois.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois.go
@@ -910,14 +910,14 @@ func galExp(a byte, n int) byte {
 	return expTable[uint8(logResult)]
 }
 
-func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
+func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs, vectorLength int, dst []byte) []byte {
 	if !codeGen {
 		panic("codegen not enabled")
 	}
 	total := inputs * outputs
 
 	// Duplicated in+out
-	wantBytes := total * 32 * 2
+	wantBytes := total * vectorLength * 2
 	if cap(dst) < wantBytes {
 		dst = AllocAligned(1, wantBytes)[0]
 	} else {
@@ -925,15 +925,16 @@ func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byt
 	}
 	for i, row := range matrixRows[:outputs] {
 		for j, idx := range row[inIdx : inIdx+inputs] {
-			dstIdx := (j*outputs + i) * 64
+			dstIdx := (j*outputs + i) * vectorLength * 2
 			dstPart := dst[dstIdx:]
-			dstPart = dstPart[:64]
+			dstPart = dstPart[:vectorLength*2]
 			lo := mulTableLow[idx][:]
 			hi := mulTableHigh[idx][:]
-			copy(dstPart[:16], lo)
-			copy(dstPart[16:32], lo)
-			copy(dstPart[32:48], hi)
-			copy(dstPart[48:64], hi)
+
+			for k := 0; k < vectorLength; k += 16 {
+				copy(dstPart[k:k+16], lo)
+				copy(dstPart[vectorLength*2-(k+16):vectorLength*2-k], hi)
+			}
 		}
 	}
 	return dst
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
index 8099f1664..8025560f2 100644
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@@ -53,20 +53,32 @@ func galMulSlice(c byte, in, out []byte, o *options) {
 	}
 	if o.useAVX2 {
 		if len(in) >= bigSwitchover {
-			galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			done := (len(in) >> 6) << 6
+			if raceEnabled {
+				raceReadSlice(in[:done])
+				raceWriteSlice(out[:done])
+			}
+			galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			in = in[done:]
 			out = out[done:]
 		}
 		if len(in) > 32 {
-			galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			done := (len(in) >> 5) << 5
+			if raceEnabled {
+				raceReadSlice(in[:done])
+				raceWriteSlice(out[:done])
+			}
+			galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			in = in[done:]
 			out = out[done:]
 		}
 	} else if o.useSSSE3 {
-		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 		done := (len(in) >> 4) << 4
+		if raceEnabled {
+			raceReadSlice(in[:done])
+			raceWriteSlice(out[:done])
+		}
+		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 		in = in[done:]
 		out = out[done:]
 	}
@@ -85,20 +97,32 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 
 	if o.useAVX2 {
 		if len(in) >= bigSwitchover {
-			galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			done := (len(in) >> 6) << 6
+			if raceEnabled {
+				raceReadSlice(in[:done])
+				raceWriteSlice(out[:done])
+			}
+			galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			in = in[done:]
 			out = out[done:]
 		}
 		if len(in) >= 32 {
-			galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			done := (len(in) >> 5) << 5
+			if raceEnabled {
+				raceReadSlice(in[:done])
+				raceWriteSlice(out[:done])
+			}
+			galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 			in = in[done:]
 			out = out[done:]
 		}
 	} else if o.useSSSE3 {
-		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 		done := (len(in) >> 4) << 4
+		if raceEnabled {
+			raceReadSlice(in[:done])
+			raceWriteSlice(out[:done])
+		}
+		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 		in = in[done:]
 		out = out[done:]
 	}
@@ -117,20 +141,32 @@ func sliceXor(in, out []byte, o *options) {
 	if o.useSSE2 {
 		if len(in) >= bigSwitchover {
 			if o.useAVX2 {
-				avx2XorSlice_64(in, out)
 				done := (len(in) >> 6) << 6
+				if raceEnabled {
+					raceReadSlice(in[:done])
+					raceWriteSlice(out[:done])
+				}
+				avx2XorSlice_64(in, out)
 				in = in[done:]
 				out = out[done:]
 			} else {
-				sSE2XorSlice_64(in, out)
 				done := (len(in) >> 6) << 6
+				if raceEnabled {
+					raceReadSlice(in[:done])
+					raceWriteSlice(out[:done])
+				}
+				sSE2XorSlice_64(in, out)
 				in = in[done:]
 				out = out[done:]
 			}
 		}
 		if len(in) >= 16 {
-			sSE2XorSlice(in, out)
 			done := (len(in) >> 4) << 4
+			if raceEnabled {
+				raceReadSlice(in[:done])
+				raceWriteSlice(out[:done])
+			}
+			sSE2XorSlice(in, out)
 			in = in[done:]
 			out = out[done:]
 		}
@@ -462,9 +498,17 @@ func fftDIT2(x, y []byte, log_m ffe, o *options) {
 	}
 	if o.useAVX2 {
 		tmp := &multiply256LUT[log_m]
+		if raceEnabled {
+			raceReadSlice(y)
+			raceWriteSlice(x)
+		}
 		fftDIT2_avx2(x, y, tmp)
 	} else if o.useSSSE3 {
 		tmp := &multiply256LUT[log_m]
+		if raceEnabled {
+			raceReadSlice(y)
+			raceWriteSlice(x)
+		}
 		fftDIT2_ssse3(x, y, tmp)
 	} else {
 		// Reference version:
@@ -480,11 +524,15 @@ func fftDIT28(x, y []byte, log_m ffe8, o *options) {
 	}
 
 	if o.useAVX2 {
+		done := (len(y) >> 6) << 6
+		if raceEnabled {
+			raceReadSlice(y[:done])
+			raceWriteSlice(x[:done])
+		}
 		fftDIT28_avx2(x, y, &multiply256LUT8[log_m])
 		if len(x)&63 == 0 {
 			return
 		}
-		done := (len(y) >> 6) << 6
 		y = y[done:]
 		x = x[done:]
 	}
@@ -499,11 +547,15 @@ func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
 	}
 
 	if o.useAVX2 {
+		done := (len(y) >> 6) << 6
+		if raceEnabled {
+			raceReadSlice(y[:done])
+			raceWriteSlice(x[:done])
+		}
 		ifftDIT28_avx2(x, y, &multiply256LUT8[log_m])
 		if len(x)&63 == 0 {
 			return
 		}
-		done := (len(y) >> 6) << 6
 		y = y[done:]
 		x = x[done:]
 	}
@@ -514,14 +566,22 @@ func ifftDIT28(x, y []byte, log_m ffe8, o *options) {
 func mulAdd8(x, y []byte, log_m ffe8, o *options) {
 	if o.useAVX2 {
 		t := &multiply256LUT8[log_m]
-		galMulAVX2Xor_64(t[:16], t[16:32], y, x)
 		done := (len(y) >> 6) << 6
+		if raceEnabled {
+			raceReadSlice(y[:done])
+			raceWriteSlice(x[:done])
+		}
+		galMulAVX2Xor_64(t[:16], t[16:32], y, x)
 		y = y[done:]
 		x = x[done:]
 	} else if o.useSSSE3 {
 		t := &multiply256LUT8[log_m]
-		galMulSSSE3Xor(t[:16], t[16:32], y, x)
 		done := (len(y) >> 4) << 4
+		if raceEnabled {
+			raceReadSlice(y[:done])
+			raceWriteSlice(x[:done])
+		}
+		galMulSSSE3Xor(t[:16], t[16:32], y, x)
 		y = y[done:]
 		x = x[done:]
 	}
@@ -535,9 +595,19 @@ func ifftDIT2(x, y []byte, log_m ffe, o *options) {
 	}
 	if o.useAVX2 {
 		tmp := &multiply256LUT[log_m]
+		if raceEnabled {
+			raceReadSlice(y)
+			raceWriteSlice(x)
+		}
+
 		ifftDIT2_avx2(x, y, tmp)
 	} else if o.useSSSE3 {
 		tmp := &multiply256LUT[log_m]
+		if raceEnabled {
+			raceReadSlice(y)
+			raceWriteSlice(x)
+		}
+
 		ifftDIT2_ssse3(x, y, tmp)
 	} else {
 		// Reference version:
@@ -552,9 +622,17 @@ func mulgf16(x, y []byte, log_m ffe, o *options) {
 	}
 	if o.useAVX2 {
 		tmp := &multiply256LUT[log_m]
+		if raceEnabled {
+			raceReadSlice(y)
+			raceWriteSlice(x)
+		}
 		mulgf16_avx2(x, y, tmp)
 	} else if o.useSSSE3 {
 		tmp := &multiply256LUT[log_m]
+		if raceEnabled {
+			raceReadSlice(y)
+			raceWriteSlice(x)
+		}
 		mulgf16_ssse3(x, y, tmp)
 	} else {
 		refMul(x, y, log_m)
@@ -564,14 +642,23 @@ func mulgf16(x, y []byte, log_m ffe, o *options) {
 func mulgf8(out, in []byte, log_m ffe8, o *options) {
 	if o.useAVX2 {
 		t := &multiply256LUT8[log_m]
-		galMulAVX2_64(t[:16], t[16:32], in, out)
 		done := (len(in) >> 6) << 6
+		if raceEnabled {
+			raceReadSlice(in[:done])
+			raceWriteSlice(out[:done])
+		}
+
+		galMulAVX2_64(t[:16], t[16:32], in, out)
 		in = in[done:]
 		out = out[done:]
 	} else if o.useSSSE3 {
 		t := &multiply256LUT8[log_m]
-		galMulSSSE3(t[:16], t[16:32], in, out)
 		done := (len(in) >> 4) << 4
+		if raceEnabled {
+			raceReadSlice(in[:done])
+			raceWriteSlice(out[:done])
+		}
+		galMulSSSE3(t[:16], t[16:32], in, out)
 		in = in[done:]
 		out = out[done:]
 	}
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
index 08f1ae8d9..d860525c9 100644
--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
@@ -17,8 +17,12 @@ func getVectorLength() (vl, pl uint64)
 
 func init() {
 	if defaultOptions.useSVE {
-		if vl, _ := getVectorLength(); vl != 256 {
-			defaultOptions.useSVE = false // Temp fix: disable SVE for non-256 vector widths (ie Graviton4)
+		if vl, _ := getVectorLength(); vl <= 256 {
+			// set vector length in bytes
+			defaultOptions.vectorLength = int(vl) >> 3
+		} else {
+			// disable SVE for hardware implementations over 256 bits (only known to be Fujitsu A64FX atm)
+			defaultOptions.useSVE = false
 		}
 	}
 }
@@ -29,8 +33,12 @@ func galMulSlice(c byte, in, out []byte, o *options) {
 		return
 	}
 	var done int
-	galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 	done = (len(in) >> 5) << 5
+	if raceEnabled {
+		raceReadSlice(in[:done])
+		raceWriteSlice(out[:done])
+	}
+	galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 
 	remain := len(in) - done
 	if remain > 0 {
@@ -46,9 +54,12 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 		sliceXor(in, out, o)
 		return
 	}
-	var done int
+	done := (len(in) >> 5) << 5
+	if raceEnabled {
+		raceReadSlice(in[:done])
+		raceWriteSlice(out[:done])
+	}
 	galMulXorNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-	done = (len(in) >> 5) << 5
 
 	remain := len(in) - done
 	if remain > 0 {
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s
index 335b94c36..dd974c115 100644
--- a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s
@@ -13,6 +13,9 @@ TEXT ·mulSve_10x1_64(SB), $0-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd346fc00 // lsr x0, x0, #6                              
+    WORD $0xd37ae400 // lsl x0, x0, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x1_64_end
     MOVD in_base+24(FP), R3
@@ -55,7 +58,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 0 to 1 outputs
     WORD $0x85804026 // ldr z6, [x1]                                
     WORD $0x85804425 // ldr z5, [x1, #1, MUL VL]                    
-    WORD $0x91010021 // add x1, x1, #64                             
+    WORD $0x04215041 // addvl x1, x1, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -77,7 +80,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 1 to 1 outputs
     WORD $0x85804086 // ldr z6, [x4]                                
     WORD $0x85804485 // ldr z5, [x4, #1, MUL VL]                    
-    WORD $0x91010084 // add x4, x4, #64                             
+    WORD $0x04245044 // addvl x4, x4, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -101,7 +104,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 2 to 1 outputs
     WORD $0x858040a6 // ldr z6, [x5]                                
     WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL]                    
-    WORD $0x910100a5 // add x5, x5, #64                             
+    WORD $0x04255045 // addvl x5, x5, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -125,7 +128,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 3 to 1 outputs
     WORD $0x85804106 // ldr z6, [x8]                                
     WORD $0x85804505 // ldr z5, [x8, #1, MUL VL]                    
-    WORD $0x91010108 // add x8, x8, #64                             
+    WORD $0x04285048 // addvl x8, x8, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -149,7 +152,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 4 to 1 outputs
     WORD $0x85804126 // ldr z6, [x9]                                
     WORD $0x85804525 // ldr z5, [x9, #1, MUL VL]                    
-    WORD $0x91010129 // add x9, x9, #64                             
+    WORD $0x04295049 // addvl x9, x9, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -173,7 +176,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 5 to 1 outputs
     WORD $0x85804146 // ldr z6, [x10]                               
     WORD $0x85804545 // ldr z5, [x10, #1, MUL VL]                   
-    WORD $0x9101014a // add x10, x10, #64                           
+    WORD $0x042a504a // addvl x10, x10, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -197,7 +200,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 6 to 1 outputs
     WORD $0x85804166 // ldr z6, [x11]                               
     WORD $0x85804565 // ldr z5, [x11, #1, MUL VL]                   
-    WORD $0x9101016b // add x11, x11, #64                           
+    WORD $0x042b504b // addvl x11, x11, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -221,7 +224,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 7 to 1 outputs
     WORD $0x85804186 // ldr z6, [x12]                               
     WORD $0x85804585 // ldr z5, [x12, #1, MUL VL]                   
-    WORD $0x9101018c // add x12, x12, #64                           
+    WORD $0x042c504c // addvl x12, x12, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -245,7 +248,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 8 to 1 outputs
     WORD $0x858041a6 // ldr z6, [x13]                               
     WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL]                   
-    WORD $0x910101ad // add x13, x13, #64                           
+    WORD $0x042d504d // addvl x13, x13, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -269,7 +272,7 @@ mulSve_10x1_64_loop:
     // Load and process 64 bytes from input 9 to 1 outputs
     WORD $0x85804066 // ldr z6, [x3]                                
     WORD $0x85804465 // ldr z5, [x3, #1, MUL VL]                    
-    WORD $0x91010063 // add x3, x3, #64                             
+    WORD $0x04235043 // addvl x3, x3, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -291,7 +294,7 @@ mulSve_10x1_64_store:
     // Store 1 outputs
     WORD $0xe58041c0 // str z0, [x14]                               
     WORD $0xe58045c1 // str z1, [x14, #1, MUL VL]                   
-    WORD $0x910101ce // add x14, x14, #64                           
+    WORD $0x042e504e // addvl x14, x14, #2
 
     // Prepare for next loop
     WORD $0xf1000400 // subs x0, x0, #1                             
@@ -309,6 +312,9 @@ TEXT ·mulSve_10x1_64Xor(SB), $0-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd346fc00 // lsr x0, x0, #6                              
+    WORD $0xd37ae400 // lsl x0, x0, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x1_64Xor_end
     MOVD in_base+24(FP), R3
@@ -355,7 +361,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 0 to 1 outputs
     WORD $0x85804026 // ldr z6, [x1]                                
     WORD $0x85804425 // ldr z5, [x1, #1, MUL VL]                    
-    WORD $0x91010021 // add x1, x1, #64                             
+    WORD $0x04215041 // addvl x1, x1, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -379,7 +385,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 1 to 1 outputs
     WORD $0x85804086 // ldr z6, [x4]                                
     WORD $0x85804485 // ldr z5, [x4, #1, MUL VL]                    
-    WORD $0x91010084 // add x4, x4, #64                             
+    WORD $0x04245044 // addvl x4, x4, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -403,7 +409,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 2 to 1 outputs
     WORD $0x858040a6 // ldr z6, [x5]                                
     WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL]                    
-    WORD $0x910100a5 // add x5, x5, #64                             
+    WORD $0x04255045 // addvl x5, x5, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -427,7 +433,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 3 to 1 outputs
     WORD $0x85804106 // ldr z6, [x8]                                
     WORD $0x85804505 // ldr z5, [x8, #1, MUL VL]                    
-    WORD $0x91010108 // add x8, x8, #64                             
+    WORD $0x04285048 // addvl x8, x8, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -451,7 +457,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 4 to 1 outputs
     WORD $0x85804126 // ldr z6, [x9]                                
     WORD $0x85804525 // ldr z5, [x9, #1, MUL VL]                    
-    WORD $0x91010129 // add x9, x9, #64                             
+    WORD $0x04295049 // addvl x9, x9, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -475,7 +481,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 5 to 1 outputs
     WORD $0x85804146 // ldr z6, [x10]                               
     WORD $0x85804545 // ldr z5, [x10, #1, MUL VL]                   
-    WORD $0x9101014a // add x10, x10, #64                           
+    WORD $0x042a504a // addvl x10, x10, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -499,7 +505,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 6 to 1 outputs
     WORD $0x85804166 // ldr z6, [x11]                               
     WORD $0x85804565 // ldr z5, [x11, #1, MUL VL]                   
-    WORD $0x9101016b // add x11, x11, #64                           
+    WORD $0x042b504b // addvl x11, x11, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -523,7 +529,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 7 to 1 outputs
     WORD $0x85804186 // ldr z6, [x12]                               
     WORD $0x85804585 // ldr z5, [x12, #1, MUL VL]                   
-    WORD $0x9101018c // add x12, x12, #64                           
+    WORD $0x042c504c // addvl x12, x12, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -547,7 +553,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 8 to 1 outputs
     WORD $0x858041a6 // ldr z6, [x13]                               
     WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL]                   
-    WORD $0x910101ad // add x13, x13, #64                           
+    WORD $0x042d504d // addvl x13, x13, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -571,7 +577,7 @@ mulSve_10x1_64Xor_loop:
     // Load and process 64 bytes from input 9 to 1 outputs
     WORD $0x85804066 // ldr z6, [x3]                                
     WORD $0x85804465 // ldr z5, [x3, #1, MUL VL]                    
-    WORD $0x91010063 // add x3, x3, #64                             
+    WORD $0x04235043 // addvl x3, x3, #2
     WORD $0x04fc94c7 // lsr z7.d, z6.d, #4                          
     WORD $0x04fc94a8 // lsr z8.d, z5.d, #4                          
     WORD $0x042230c6 // and z6.d, z6.d, z2.d                        
@@ -593,7 +599,7 @@ mulSve_10x1_64Xor_store:
     // Store 1 outputs
     WORD $0xe58041c0 // str z0, [x14]                               
     WORD $0xe58045c1 // str z1, [x14, #1, MUL VL]                   
-    WORD $0x910101ce // add x14, x14, #64                           
+    WORD $0x042e504e // addvl x14, x14, #2
 
     // Prepare for next loop
     WORD $0xf1000400 // subs x0, x0, #1                             
@@ -611,6 +617,9 @@ TEXT ·mulSve_10x2_64(SB), $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd346fc00 // lsr x0, x0, #6                              
+    WORD $0xd37ae400 // lsl x0, x0, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x2_64_end
     MOVD in_base+24(FP), R3
@@ -655,7 +664,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 0 to 2 outputs
     WORD $0x85804029 // ldr z9, [x1]                                
     WORD $0x8580442b // ldr z11, [x1, #1, MUL VL]                   
-    WORD $0x91010021 // add x1, x1, #64                             
+    WORD $0x04215041 // addvl x1, x1, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -685,7 +694,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 1 to 2 outputs
     WORD $0x85804089 // ldr z9, [x4]                                
     WORD $0x8580448b // ldr z11, [x4, #1, MUL VL]                   
-    WORD $0x91010084 // add x4, x4, #64                             
+    WORD $0x04245044 // addvl x4, x4, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -719,7 +728,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 2 to 2 outputs
     WORD $0x858040a9 // ldr z9, [x5]                                
     WORD $0x858044ab // ldr z11, [x5, #1, MUL VL]                   
-    WORD $0x910100a5 // add x5, x5, #64                             
+    WORD $0x04255045 // addvl x5, x5, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -753,7 +762,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 3 to 2 outputs
     WORD $0x85804109 // ldr z9, [x8]                                
     WORD $0x8580450b // ldr z11, [x8, #1, MUL VL]                   
-    WORD $0x91010108 // add x8, x8, #64                             
+    WORD $0x04285048 // addvl x8, x8, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -787,7 +796,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 4 to 2 outputs
     WORD $0x85804129 // ldr z9, [x9]                                
     WORD $0x8580452b // ldr z11, [x9, #1, MUL VL]                   
-    WORD $0x91010129 // add x9, x9, #64                             
+    WORD $0x04295049 // addvl x9, x9, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -821,7 +830,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 5 to 2 outputs
     WORD $0x85804149 // ldr z9, [x10]                               
     WORD $0x8580454b // ldr z11, [x10, #1, MUL VL]                  
-    WORD $0x9101014a // add x10, x10, #64                           
+    WORD $0x042a504a // addvl x10, x10, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -855,7 +864,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 6 to 2 outputs
     WORD $0x85804169 // ldr z9, [x11]                               
     WORD $0x8580456b // ldr z11, [x11, #1, MUL VL]                  
-    WORD $0x9101016b // add x11, x11, #64                           
+    WORD $0x042b504b // addvl x11, x11, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -889,7 +898,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 7 to 2 outputs
     WORD $0x85804189 // ldr z9, [x12]                               
     WORD $0x8580458b // ldr z11, [x12, #1, MUL VL]                  
-    WORD $0x9101018c // add x12, x12, #64                           
+    WORD $0x042c504c // addvl x12, x12, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -923,7 +932,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 8 to 2 outputs
     WORD $0x858041a9 // ldr z9, [x13]                               
     WORD $0x858045ab // ldr z11, [x13, #1, MUL VL]                  
-    WORD $0x910101ad // add x13, x13, #64                           
+    WORD $0x042d504d // addvl x13, x13, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -957,7 +966,7 @@ mulSve_10x2_64_loop:
     // Load and process 64 bytes from input 9 to 2 outputs
     WORD $0x85804069 // ldr z9, [x3]                                
     WORD $0x8580446b // ldr z11, [x3, #1, MUL VL]                   
-    WORD $0x91010063 // add x3, x3, #64                             
+    WORD $0x04235043 // addvl x3, x3, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -989,10 +998,10 @@ mulSve_10x2_64_store:
     // Store 2 outputs
     WORD $0xe58041e0 // str z0, [x15]                               
     WORD $0xe58045e1 // str z1, [x15, #1, MUL VL]                   
-    WORD $0x910101ef // add x15, x15, #64                           
+    WORD $0x042f504f // addvl x15, x15, #2
     WORD $0xe58041c2 // str z2, [x14]                               
     WORD $0xe58045c3 // str z3, [x14, #1, MUL VL]                   
-    WORD $0x910101ce // add x14, x14, #64                           
+    WORD $0x042e504e // addvl x14, x14, #2
 
     // Prepare for next loop
     WORD $0xf1000400 // subs x0, x0, #1                             
@@ -1010,6 +1019,9 @@ TEXT ·mulSve_10x2_64Xor(SB), $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd346fc00 // lsr x0, x0, #6                              
+    WORD $0xd37ae400 // lsl x0, x0, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x2_64Xor_end
     MOVD in_base+24(FP), R3
@@ -1060,7 +1072,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 0 to 2 outputs
     WORD $0x85804029 // ldr z9, [x1]                                
     WORD $0x8580442b // ldr z11, [x1, #1, MUL VL]                   
-    WORD $0x91010021 // add x1, x1, #64                             
+    WORD $0x04215041 // addvl x1, x1, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1094,7 +1106,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 1 to 2 outputs
     WORD $0x85804089 // ldr z9, [x4]                                
     WORD $0x8580448b // ldr z11, [x4, #1, MUL VL]                   
-    WORD $0x91010084 // add x4, x4, #64                             
+    WORD $0x04245044 // addvl x4, x4, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1128,7 +1140,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 2 to 2 outputs
     WORD $0x858040a9 // ldr z9, [x5]                                
     WORD $0x858044ab // ldr z11, [x5, #1, MUL VL]                   
-    WORD $0x910100a5 // add x5, x5, #64                             
+    WORD $0x04255045 // addvl x5, x5, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1162,7 +1174,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 3 to 2 outputs
     WORD $0x85804109 // ldr z9, [x8]                                
     WORD $0x8580450b // ldr z11, [x8, #1, MUL VL]                   
-    WORD $0x91010108 // add x8, x8, #64                             
+    WORD $0x04285048 // addvl x8, x8, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1196,7 +1208,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 4 to 2 outputs
     WORD $0x85804129 // ldr z9, [x9]                                
     WORD $0x8580452b // ldr z11, [x9, #1, MUL VL]                   
-    WORD $0x91010129 // add x9, x9, #64                             
+    WORD $0x04295049 // addvl x9, x9, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1230,7 +1242,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 5 to 2 outputs
     WORD $0x85804149 // ldr z9, [x10]                               
     WORD $0x8580454b // ldr z11, [x10, #1, MUL VL]                  
-    WORD $0x9101014a // add x10, x10, #64                           
+    WORD $0x042a504a // addvl x10, x10, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1264,7 +1276,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 6 to 2 outputs
     WORD $0x85804169 // ldr z9, [x11]                               
     WORD $0x8580456b // ldr z11, [x11, #1, MUL VL]                  
-    WORD $0x9101016b // add x11, x11, #64                           
+    WORD $0x042b504b // addvl x11, x11, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1298,7 +1310,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 7 to 2 outputs
     WORD $0x85804189 // ldr z9, [x12]                               
     WORD $0x8580458b // ldr z11, [x12, #1, MUL VL]                  
-    WORD $0x9101018c // add x12, x12, #64                           
+    WORD $0x042c504c // addvl x12, x12, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1332,7 +1344,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 8 to 2 outputs
     WORD $0x858041a9 // ldr z9, [x13]                               
     WORD $0x858045ab // ldr z11, [x13, #1, MUL VL]                  
-    WORD $0x910101ad // add x13, x13, #64                           
+    WORD $0x042d504d // addvl x13, x13, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1366,7 +1378,7 @@ mulSve_10x2_64Xor_loop:
     // Load and process 64 bytes from input 9 to 2 outputs
     WORD $0x85804069 // ldr z9, [x3]                                
     WORD $0x8580446b // ldr z11, [x3, #1, MUL VL]                   
-    WORD $0x91010063 // add x3, x3, #64                             
+    WORD $0x04235043 // addvl x3, x3, #2
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04243129 // and z9.d, z9.d, z4.d                        
@@ -1398,10 +1410,10 @@ mulSve_10x2_64Xor_store:
     // Store 2 outputs
     WORD $0xe58041e0 // str z0, [x15]                               
     WORD $0xe58045e1 // str z1, [x15, #1, MUL VL]                   
-    WORD $0x910101ef // add x15, x15, #64                           
+    WORD $0x042f504f // addvl x15, x15, #2
     WORD $0xe58041c2 // str z2, [x14]                               
     WORD $0xe58045c3 // str z3, [x14, #1, MUL VL]                   
-    WORD $0x910101ce // add x14, x14, #64                           
+    WORD $0x042e504e // addvl x14, x14, #2
 
     // Prepare for next loop
     WORD $0xf1000400 // subs x0, x0, #1                             
@@ -1419,6 +1431,9 @@ TEXT ·mulSve_10x3_64(SB), $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd346fc00 // lsr x0, x0, #6                              
+    WORD $0xd37ae400 // lsl x0, x0, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x3_64_end
     MOVD in_base+24(FP), R0
@@ -1461,6 +1476,9 @@ TEXT ·mulSve_10x3_64(SB), $8-88
     // Reload length to save a register
     MOVD n+80(FP), R6
     WORD $0xd346fcc6 // lsr x6, x6, #6                              
+    WORD $0xd37ae4c6 // lsl x6, x6, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad008c6 // udiv x6, x6, x16
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
@@ -1469,7 +1487,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 0 to 3 outputs
     WORD $0x8580406b // ldr z11, [x3]                               
     WORD $0x8580446d // ldr z13, [x3, #1, MUL VL]                   
-    WORD $0x91010063 // add x3, x3, #64                             
+    WORD $0x04235043 // addvl x3, x3, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1507,7 +1525,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 1 to 3 outputs
     WORD $0x8580402b // ldr z11, [x1]                               
     WORD $0x8580442d // ldr z13, [x1, #1, MUL VL]                   
-    WORD $0x91010021 // add x1, x1, #64                             
+    WORD $0x04215041 // addvl x1, x1, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1551,7 +1569,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 2 to 3 outputs
     WORD $0x8580408b // ldr z11, [x4]                               
     WORD $0x8580448d // ldr z13, [x4, #1, MUL VL]                   
-    WORD $0x91010084 // add x4, x4, #64                             
+    WORD $0x04245044 // addvl x4, x4, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1595,7 +1613,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 3 to 3 outputs
     WORD $0x858040ab // ldr z11, [x5]                               
     WORD $0x858044ad // ldr z13, [x5, #1, MUL VL]                   
-    WORD $0x910100a5 // add x5, x5, #64                             
+    WORD $0x04255045 // addvl x5, x5, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1639,7 +1657,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 4 to 3 outputs
     WORD $0x8580410b // ldr z11, [x8]                               
     WORD $0x8580450d // ldr z13, [x8, #1, MUL VL]                   
-    WORD $0x91010108 // add x8, x8, #64                             
+    WORD $0x04285048 // addvl x8, x8, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1683,7 +1701,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 5 to 3 outputs
     WORD $0x8580412b // ldr z11, [x9]                               
     WORD $0x8580452d // ldr z13, [x9, #1, MUL VL]                   
-    WORD $0x91010129 // add x9, x9, #64                             
+    WORD $0x04295049 // addvl x9, x9, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1727,7 +1745,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 6 to 3 outputs
     WORD $0x8580414b // ldr z11, [x10]                              
     WORD $0x8580454d // ldr z13, [x10, #1, MUL VL]                  
-    WORD $0x9101014a // add x10, x10, #64                           
+    WORD $0x042a504a // addvl x10, x10, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1771,7 +1789,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 7 to 3 outputs
     WORD $0x8580416b // ldr z11, [x11]                              
     WORD $0x8580456d // ldr z13, [x11, #1, MUL VL]                  
-    WORD $0x9101016b // add x11, x11, #64                           
+    WORD $0x042b504b // addvl x11, x11, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1815,7 +1833,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 8 to 3 outputs
     WORD $0x8580418b // ldr z11, [x12]                              
     WORD $0x8580458d // ldr z13, [x12, #1, MUL VL]                  
-    WORD $0x9101018c // add x12, x12, #64                           
+    WORD $0x042c504c // addvl x12, x12, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1859,7 +1877,7 @@ mulSve_10x3_64_loop:
     // Load and process 64 bytes from input 9 to 3 outputs
     WORD $0x8580400b // ldr z11, [x0]                               
     WORD $0x8580440d // ldr z13, [x0, #1, MUL VL]                   
-    WORD $0x91010000 // add x0, x0, #64                             
+    WORD $0x04205040 // addvl x0, x0, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -1901,13 +1919,13 @@ mulSve_10x3_64_store:
     // Store 3 outputs
     WORD $0xe58041c0 // str z0, [x14]                               
     WORD $0xe58045c1 // str z1, [x14, #1, MUL VL]                   
-    WORD $0x910101ce // add x14, x14, #64                           
+    WORD $0x042e504e // addvl x14, x14, #2
     WORD $0xe58041e2 // str z2, [x15]                               
     WORD $0xe58045e3 // str z3, [x15, #1, MUL VL]                   
-    WORD $0x910101ef // add x15, x15, #64                           
+    WORD $0x042f504f // addvl x15, x15, #2
     WORD $0xe58041a4 // str z4, [x13]                               
     WORD $0xe58045a5 // str z5, [x13, #1, MUL VL]                   
-    WORD $0x910101ad // add x13, x13, #64                           
+    WORD $0x042d504d // addvl x13, x13, #2
 
     // Prepare for next loop
     WORD $0xf10004c6 // subs x6, x6, #1                             
@@ -1925,6 +1943,9 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd346fc00 // lsr x0, x0, #6                              
+    WORD $0xd37ae400 // lsl x0, x0, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x3_64Xor_end
     MOVD in_base+24(FP), R0
@@ -1967,6 +1988,9 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88
     // Reload length to save a register
     MOVD n+80(FP), R6
     WORD $0xd346fcc6 // lsr x6, x6, #6                              
+    WORD $0xd37ae4c6 // lsl x6, x6, #6
+    WORD $0x04bf5050 // rdvl x16, #2
+    WORD $0x9ad008c6 // udiv x6, x6, x16
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
@@ -1983,7 +2007,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 0 to 3 outputs
     WORD $0x8580406b // ldr z11, [x3]                               
     WORD $0x8580446d // ldr z13, [x3, #1, MUL VL]                   
-    WORD $0x91010063 // add x3, x3, #64                             
+    WORD $0x04235043 // addvl x3, x3, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2027,7 +2051,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 1 to 3 outputs
     WORD $0x8580402b // ldr z11, [x1]                               
     WORD $0x8580442d // ldr z13, [x1, #1, MUL VL]                   
-    WORD $0x91010021 // add x1, x1, #64                             
+    WORD $0x04215041 // addvl x1, x1, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2071,7 +2095,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 2 to 3 outputs
     WORD $0x8580408b // ldr z11, [x4]                               
     WORD $0x8580448d // ldr z13, [x4, #1, MUL VL]                   
-    WORD $0x91010084 // add x4, x4, #64                             
+    WORD $0x04245044 // addvl x4, x4, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2115,7 +2139,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 3 to 3 outputs
     WORD $0x858040ab // ldr z11, [x5]                               
     WORD $0x858044ad // ldr z13, [x5, #1, MUL VL]                   
-    WORD $0x910100a5 // add x5, x5, #64                             
+    WORD $0x04255045 // addvl x5, x5, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2159,7 +2183,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 4 to 3 outputs
     WORD $0x8580410b // ldr z11, [x8]                               
     WORD $0x8580450d // ldr z13, [x8, #1, MUL VL]                   
-    WORD $0x91010108 // add x8, x8, #64                             
+    WORD $0x04285048 // addvl x8, x8, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2203,7 +2227,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 5 to 3 outputs
     WORD $0x8580412b // ldr z11, [x9]                               
     WORD $0x8580452d // ldr z13, [x9, #1, MUL VL]                   
-    WORD $0x91010129 // add x9, x9, #64                             
+    WORD $0x04295049 // addvl x9, x9, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2247,7 +2271,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 6 to 3 outputs
     WORD $0x8580414b // ldr z11, [x10]                              
     WORD $0x8580454d // ldr z13, [x10, #1, MUL VL]                  
-    WORD $0x9101014a // add x10, x10, #64                           
+    WORD $0x042a504a // addvl x10, x10, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2291,7 +2315,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 7 to 3 outputs
     WORD $0x8580416b // ldr z11, [x11]                              
     WORD $0x8580456d // ldr z13, [x11, #1, MUL VL]                  
-    WORD $0x9101016b // add x11, x11, #64                           
+    WORD $0x042b504b // addvl x11, x11, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2335,7 +2359,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 8 to 3 outputs
     WORD $0x8580418b // ldr z11, [x12]                              
     WORD $0x8580458d // ldr z13, [x12, #1, MUL VL]                  
-    WORD $0x9101018c // add x12, x12, #64                           
+    WORD $0x042c504c // addvl x12, x12, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2379,7 +2403,7 @@ mulSve_10x3_64Xor_loop:
     // Load and process 64 bytes from input 9 to 3 outputs
     WORD $0x8580400b // ldr z11, [x0]                               
     WORD $0x8580440d // ldr z13, [x0, #1, MUL VL]                   
-    WORD $0x91010000 // add x0, x0, #64                             
+    WORD $0x04205040 // addvl x0, x0, #2
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x0426316b // and z11.d, z11.d, z6.d                      
@@ -2421,13 +2445,13 @@ mulSve_10x3_64Xor_store:
     // Store 3 outputs
     WORD $0xe58041c0 // str z0, [x14]                               
     WORD $0xe58045c1 // str z1, [x14, #1, MUL VL]                   
-    WORD $0x910101ce // add x14, x14, #64                           
+    WORD $0x042e504e // addvl x14, x14, #2
     WORD $0xe58041e2 // str z2, [x15]                               
     WORD $0xe58045e3 // str z3, [x15, #1, MUL VL]                   
-    WORD $0x910101ef // add x15, x15, #64                           
+    WORD $0x042f504f // addvl x15, x15, #2
     WORD $0xe58041a4 // str z4, [x13]                               
     WORD $0xe58045a5 // str z5, [x13, #1, MUL VL]                   
-    WORD $0x910101ad // add x13, x13, #64                           
+    WORD $0x042d504d // addvl x13, x13, #2
 
     // Prepare for next loop
     WORD $0xf10004c6 // subs x6, x6, #1                             
@@ -2446,6 +2470,9 @@ TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x4_end
     MOVD in_base+24(FP), R3
@@ -2480,11 +2507,13 @@ TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x4_loop:
     // Load and process 32 bytes from input 0 to 4 outputs
     WORD $0x85804027 // ldr z7, [x1]                                
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2514,7 +2543,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 1 to 4 outputs
     WORD $0x85804087 // ldr z7, [x4]                                
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2548,7 +2577,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 2 to 4 outputs
     WORD $0x858040a7 // ldr z7, [x5]                                
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2582,7 +2611,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 3 to 4 outputs
     WORD $0x85804107 // ldr z7, [x8]                                
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2616,7 +2645,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 4 to 4 outputs
     WORD $0x85804127 // ldr z7, [x9]                                
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2650,7 +2679,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 5 to 4 outputs
     WORD $0x85804147 // ldr z7, [x10]                               
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2684,7 +2713,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 6 to 4 outputs
     WORD $0x85804167 // ldr z7, [x11]                               
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2718,7 +2747,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 7 to 4 outputs
     WORD $0x85804187 // ldr z7, [x12]                               
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2752,7 +2781,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 8 to 4 outputs
     WORD $0x858041a7 // ldr z7, [x13]                               
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2786,7 +2815,7 @@ mulSve_10x4_loop:
 
     // Load and process 32 bytes from input 9 to 4 outputs
     WORD $0x85804067 // ldr z7, [x3]                                
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2827,7 +2856,7 @@ mulSve_10x4_store:
     WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x4_loop
 
@@ -2844,6 +2873,9 @@ TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x4Xor_end
     MOVD in_base+24(FP), R3
@@ -2878,11 +2910,13 @@ TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x4Xor_loop:
     // Load and process 32 bytes from input 0 to 4 outputs
     WORD $0x85804027 // ldr z7, [x1]                                
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2924,7 +2958,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 1 to 4 outputs
     WORD $0x85804087 // ldr z7, [x4]                                
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2958,7 +2992,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 2 to 4 outputs
     WORD $0x858040a7 // ldr z7, [x5]                                
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -2992,7 +3026,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 3 to 4 outputs
     WORD $0x85804107 // ldr z7, [x8]                                
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -3026,7 +3060,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 4 to 4 outputs
     WORD $0x85804127 // ldr z7, [x9]                                
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -3060,7 +3094,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 5 to 4 outputs
     WORD $0x85804147 // ldr z7, [x10]                               
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -3094,7 +3128,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 6 to 4 outputs
     WORD $0x85804167 // ldr z7, [x11]                               
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -3128,7 +3162,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 7 to 4 outputs
     WORD $0x85804187 // ldr z7, [x12]                               
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -3162,7 +3196,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 8 to 4 outputs
     WORD $0x858041a7 // ldr z7, [x13]                               
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -3196,7 +3230,7 @@ mulSve_10x4Xor_loop:
 
     // Load and process 32 bytes from input 9 to 4 outputs
     WORD $0x85804067 // ldr z7, [x3]                                
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc94e8 // lsr z8.d, z7.d, #4                          
     WORD $0x042430e7 // and z7.d, z7.d, z4.d                        
     WORD $0x04243108 // and z8.d, z8.d, z4.d                        
@@ -3237,7 +3271,7 @@ mulSve_10x4Xor_store:
     WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x4Xor_loop
 
@@ -3254,6 +3288,9 @@ TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x5_end
     MOVD in_base+24(FP), R3
@@ -3288,11 +3325,13 @@ TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x5_loop:
     // Load and process 32 bytes from input 0 to 5 outputs
     WORD $0x85804028 // ldr z8, [x1]                                
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3327,7 +3366,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 1 to 5 outputs
     WORD $0x85804088 // ldr z8, [x4]                                
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3367,7 +3406,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 2 to 5 outputs
     WORD $0x858040a8 // ldr z8, [x5]                                
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3407,7 +3446,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 3 to 5 outputs
     WORD $0x85804108 // ldr z8, [x8]                                
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3447,7 +3486,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 4 to 5 outputs
     WORD $0x85804128 // ldr z8, [x9]                                
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3487,7 +3526,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 5 to 5 outputs
     WORD $0x85804148 // ldr z8, [x10]                               
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3527,7 +3566,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 6 to 5 outputs
     WORD $0x85804168 // ldr z8, [x11]                               
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3567,7 +3606,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 7 to 5 outputs
     WORD $0x85804188 // ldr z8, [x12]                               
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3607,7 +3646,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 8 to 5 outputs
     WORD $0x858041a8 // ldr z8, [x13]                               
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3647,7 +3686,7 @@ mulSve_10x5_loop:
 
     // Load and process 32 bytes from input 9 to 5 outputs
     WORD $0x85804068 // ldr z8, [x3]                                
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3696,7 +3735,7 @@ mulSve_10x5_store:
     WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x5_loop
 
@@ -3713,6 +3752,9 @@ TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x5Xor_end
     MOVD in_base+24(FP), R3
@@ -3747,11 +3789,13 @@ TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x5Xor_loop:
     // Load and process 32 bytes from input 0 to 5 outputs
     WORD $0x85804028 // ldr z8, [x1]                                
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3801,7 +3845,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 1 to 5 outputs
     WORD $0x85804088 // ldr z8, [x4]                                
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3841,7 +3885,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 2 to 5 outputs
     WORD $0x858040a8 // ldr z8, [x5]                                
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3881,7 +3925,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 3 to 5 outputs
     WORD $0x85804108 // ldr z8, [x8]                                
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3921,7 +3965,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 4 to 5 outputs
     WORD $0x85804128 // ldr z8, [x9]                                
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -3961,7 +4005,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 5 to 5 outputs
     WORD $0x85804148 // ldr z8, [x10]                               
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -4001,7 +4045,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 6 to 5 outputs
     WORD $0x85804168 // ldr z8, [x11]                               
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -4041,7 +4085,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 7 to 5 outputs
     WORD $0x85804188 // ldr z8, [x12]                               
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -4081,7 +4125,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 8 to 5 outputs
     WORD $0x858041a8 // ldr z8, [x13]                               
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -4121,7 +4165,7 @@ mulSve_10x5Xor_loop:
 
     // Load and process 32 bytes from input 9 to 5 outputs
     WORD $0x85804068 // ldr z8, [x3]                                
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc9509 // lsr z9.d, z8.d, #4                          
     WORD $0x04253108 // and z8.d, z8.d, z5.d                        
     WORD $0x04253129 // and z9.d, z9.d, z5.d                        
@@ -4170,7 +4214,7 @@ mulSve_10x5Xor_store:
     WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x5Xor_loop
 
@@ -4187,6 +4231,9 @@ TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x6_end
     MOVD in_base+24(FP), R3
@@ -4221,11 +4268,13 @@ TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x6_loop:
     // Load and process 32 bytes from input 0 to 6 outputs
     WORD $0x85804029 // ldr z9, [x1]                                
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4265,7 +4314,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 1 to 6 outputs
     WORD $0x85804089 // ldr z9, [x4]                                
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4311,7 +4360,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 2 to 6 outputs
     WORD $0x858040a9 // ldr z9, [x5]                                
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4357,7 +4406,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 3 to 6 outputs
     WORD $0x85804109 // ldr z9, [x8]                                
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4403,7 +4452,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 4 to 6 outputs
     WORD $0x85804129 // ldr z9, [x9]                                
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4449,7 +4498,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 5 to 6 outputs
     WORD $0x85804149 // ldr z9, [x10]                               
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4495,7 +4544,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 6 to 6 outputs
     WORD $0x85804169 // ldr z9, [x11]                               
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4541,7 +4590,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 7 to 6 outputs
     WORD $0x85804189 // ldr z9, [x12]                               
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4587,7 +4636,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 8 to 6 outputs
     WORD $0x858041a9 // ldr z9, [x13]                               
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4633,7 +4682,7 @@ mulSve_10x6_loop:
 
     // Load and process 32 bytes from input 9 to 6 outputs
     WORD $0x85804069 // ldr z9, [x3]                                
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4690,7 +4739,7 @@ mulSve_10x6_store:
     WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x6_loop
 
@@ -4707,6 +4756,9 @@ TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x6Xor_end
     MOVD in_base+24(FP), R3
@@ -4741,11 +4793,13 @@ TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x6Xor_loop:
     // Load and process 32 bytes from input 0 to 6 outputs
     WORD $0x85804029 // ldr z9, [x1]                                
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4803,7 +4857,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 1 to 6 outputs
     WORD $0x85804089 // ldr z9, [x4]                                
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4849,7 +4903,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 2 to 6 outputs
     WORD $0x858040a9 // ldr z9, [x5]                                
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4895,7 +4949,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 3 to 6 outputs
     WORD $0x85804109 // ldr z9, [x8]                                
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4941,7 +4995,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 4 to 6 outputs
     WORD $0x85804129 // ldr z9, [x9]                                
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -4987,7 +5041,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 5 to 6 outputs
     WORD $0x85804149 // ldr z9, [x10]                               
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -5033,7 +5087,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 6 to 6 outputs
     WORD $0x85804169 // ldr z9, [x11]                               
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -5079,7 +5133,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 7 to 6 outputs
     WORD $0x85804189 // ldr z9, [x12]                               
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -5125,7 +5179,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 8 to 6 outputs
     WORD $0x858041a9 // ldr z9, [x13]                               
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -5171,7 +5225,7 @@ mulSve_10x6Xor_loop:
 
     // Load and process 32 bytes from input 9 to 6 outputs
     WORD $0x85804069 // ldr z9, [x3]                                
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc952a // lsr z10.d, z9.d, #4                         
     WORD $0x04263129 // and z9.d, z9.d, z6.d                        
     WORD $0x0426314a // and z10.d, z10.d, z6.d                      
@@ -5228,7 +5282,7 @@ mulSve_10x6Xor_store:
     WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x6Xor_loop
 
@@ -5245,6 +5299,9 @@ TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x7_end
     MOVD in_base+24(FP), R3
@@ -5279,11 +5336,13 @@ TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x7_loop:
     // Load and process 32 bytes from input 0 to 7 outputs
     WORD $0x8580402a // ldr z10, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5328,7 +5387,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 1 to 7 outputs
     WORD $0x8580408a // ldr z10, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5380,7 +5439,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 2 to 7 outputs
     WORD $0x858040aa // ldr z10, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5432,7 +5491,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 3 to 7 outputs
     WORD $0x8580410a // ldr z10, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5484,7 +5543,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 4 to 7 outputs
     WORD $0x8580412a // ldr z10, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5536,7 +5595,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 5 to 7 outputs
     WORD $0x8580414a // ldr z10, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5588,7 +5647,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 6 to 7 outputs
     WORD $0x8580416a // ldr z10, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5640,7 +5699,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 7 to 7 outputs
     WORD $0x8580418a // ldr z10, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5692,7 +5751,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 8 to 7 outputs
     WORD $0x858041aa // ldr z10, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5744,7 +5803,7 @@ mulSve_10x7_loop:
 
     // Load and process 32 bytes from input 9 to 7 outputs
     WORD $0x8580406a // ldr z10, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5809,7 +5868,7 @@ mulSve_10x7_store:
     WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x7_loop
 
@@ -5826,6 +5885,9 @@ TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x7Xor_end
     MOVD in_base+24(FP), R3
@@ -5860,11 +5922,13 @@ TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x7Xor_loop:
     // Load and process 32 bytes from input 0 to 7 outputs
     WORD $0x8580402a // ldr z10, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5930,7 +5994,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 1 to 7 outputs
     WORD $0x8580408a // ldr z10, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -5982,7 +6046,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 2 to 7 outputs
     WORD $0x858040aa // ldr z10, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6034,7 +6098,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 3 to 7 outputs
     WORD $0x8580410a // ldr z10, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6086,7 +6150,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 4 to 7 outputs
     WORD $0x8580412a // ldr z10, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6138,7 +6202,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 5 to 7 outputs
     WORD $0x8580414a // ldr z10, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6190,7 +6254,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 6 to 7 outputs
     WORD $0x8580416a // ldr z10, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6242,7 +6306,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 7 to 7 outputs
     WORD $0x8580418a // ldr z10, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6294,7 +6358,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 8 to 7 outputs
     WORD $0x858041aa // ldr z10, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6346,7 +6410,7 @@ mulSve_10x7Xor_loop:
 
     // Load and process 32 bytes from input 9 to 7 outputs
     WORD $0x8580406a // ldr z10, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc954b // lsr z11.d, z10.d, #4                        
     WORD $0x0427314a // and z10.d, z10.d, z7.d                      
     WORD $0x0427316b // and z11.d, z11.d, z7.d                      
@@ -6411,7 +6475,7 @@ mulSve_10x7Xor_store:
     WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x7Xor_loop
 
@@ -6428,6 +6492,9 @@ TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x8_end
     MOVD in_base+24(FP), R3
@@ -6462,11 +6529,13 @@ TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x8_loop:
     // Load and process 32 bytes from input 0 to 8 outputs
     WORD $0x8580402b // ldr z11, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6516,7 +6585,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 1 to 8 outputs
     WORD $0x8580408b // ldr z11, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6574,7 +6643,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 2 to 8 outputs
     WORD $0x858040ab // ldr z11, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6632,7 +6701,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 3 to 8 outputs
     WORD $0x8580410b // ldr z11, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6690,7 +6759,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 4 to 8 outputs
     WORD $0x8580412b // ldr z11, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6748,7 +6817,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 5 to 8 outputs
     WORD $0x8580414b // ldr z11, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6806,7 +6875,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 6 to 8 outputs
     WORD $0x8580416b // ldr z11, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6864,7 +6933,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 7 to 8 outputs
     WORD $0x8580418b // ldr z11, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6922,7 +6991,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 8 to 8 outputs
     WORD $0x858041ab // ldr z11, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -6980,7 +7049,7 @@ mulSve_10x8_loop:
 
     // Load and process 32 bytes from input 9 to 8 outputs
     WORD $0x8580406b // ldr z11, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7053,7 +7122,7 @@ mulSve_10x8_store:
     WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x8_loop
 
@@ -7070,6 +7139,9 @@ TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x8Xor_end
     MOVD in_base+24(FP), R3
@@ -7104,11 +7176,13 @@ TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x8Xor_loop:
     // Load and process 32 bytes from input 0 to 8 outputs
     WORD $0x8580402b // ldr z11, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7182,7 +7256,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 1 to 8 outputs
     WORD $0x8580408b // ldr z11, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7240,7 +7314,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 2 to 8 outputs
     WORD $0x858040ab // ldr z11, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7298,7 +7372,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 3 to 8 outputs
     WORD $0x8580410b // ldr z11, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7356,7 +7430,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 4 to 8 outputs
     WORD $0x8580412b // ldr z11, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7414,7 +7488,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 5 to 8 outputs
     WORD $0x8580414b // ldr z11, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7472,7 +7546,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 6 to 8 outputs
     WORD $0x8580416b // ldr z11, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7530,7 +7604,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 7 to 8 outputs
     WORD $0x8580418b // ldr z11, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7588,7 +7662,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 8 to 8 outputs
     WORD $0x858041ab // ldr z11, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7646,7 +7720,7 @@ mulSve_10x8Xor_loop:
 
     // Load and process 32 bytes from input 9 to 8 outputs
     WORD $0x8580406b // ldr z11, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc956c // lsr z12.d, z11.d, #4                        
     WORD $0x0428316b // and z11.d, z11.d, z8.d                      
     WORD $0x0428318c // and z12.d, z12.d, z8.d                      
@@ -7719,7 +7793,7 @@ mulSve_10x8Xor_store:
     WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x8Xor_loop
 
@@ -7736,6 +7810,9 @@ TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x9_end
     MOVD in_base+24(FP), R3
@@ -7770,11 +7847,13 @@ TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x9_loop:
     // Load and process 32 bytes from input 0 to 9 outputs
     WORD $0x8580402c // ldr z12, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -7829,7 +7908,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 1 to 9 outputs
     WORD $0x8580408c // ldr z12, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -7893,7 +7972,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 2 to 9 outputs
     WORD $0x858040ac // ldr z12, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -7957,7 +8036,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 3 to 9 outputs
     WORD $0x8580410c // ldr z12, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8021,7 +8100,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 4 to 9 outputs
     WORD $0x8580412c // ldr z12, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8085,7 +8164,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 5 to 9 outputs
     WORD $0x8580414c // ldr z12, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8149,7 +8228,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 6 to 9 outputs
     WORD $0x8580416c // ldr z12, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8213,7 +8292,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 7 to 9 outputs
     WORD $0x8580418c // ldr z12, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8277,7 +8356,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 8 to 9 outputs
     WORD $0x858041ac // ldr z12, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8341,7 +8420,7 @@ mulSve_10x9_loop:
 
     // Load and process 32 bytes from input 9 to 9 outputs
     WORD $0x8580406c // ldr z12, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8422,7 +8501,7 @@ mulSve_10x9_store:
     WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x9_loop
 
@@ -8439,6 +8518,9 @@ TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x9Xor_end
     MOVD in_base+24(FP), R3
@@ -8473,11 +8555,13 @@ TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x9Xor_loop:
     // Load and process 32 bytes from input 0 to 9 outputs
     WORD $0x8580402c // ldr z12, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8559,7 +8643,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 1 to 9 outputs
     WORD $0x8580408c // ldr z12, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8623,7 +8707,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 2 to 9 outputs
     WORD $0x858040ac // ldr z12, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8687,7 +8771,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 3 to 9 outputs
     WORD $0x8580410c // ldr z12, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8751,7 +8835,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 4 to 9 outputs
     WORD $0x8580412c // ldr z12, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8815,7 +8899,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 5 to 9 outputs
     WORD $0x8580414c // ldr z12, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8879,7 +8963,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 6 to 9 outputs
     WORD $0x8580416c // ldr z12, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -8943,7 +9027,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 7 to 9 outputs
     WORD $0x8580418c // ldr z12, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -9007,7 +9091,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 8 to 9 outputs
     WORD $0x858041ac // ldr z12, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -9071,7 +9155,7 @@ mulSve_10x9Xor_loop:
 
     // Load and process 32 bytes from input 9 to 9 outputs
     WORD $0x8580406c // ldr z12, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc958d // lsr z13.d, z12.d, #4                        
     WORD $0x0429318c // and z12.d, z12.d, z9.d                      
     WORD $0x042931ad // and z13.d, z13.d, z9.d                      
@@ -9152,7 +9236,7 @@ mulSve_10x9Xor_store:
     WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x9Xor_loop
 
@@ -9169,6 +9253,9 @@ TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x10_end
     MOVD in_base+24(FP), R3
@@ -9203,11 +9290,13 @@ TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x10_loop:
     // Load and process 32 bytes from input 0 to 10 outputs
     WORD $0x8580402d // ldr z13, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9267,7 +9356,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 1 to 10 outputs
     WORD $0x8580408d // ldr z13, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9337,7 +9426,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 2 to 10 outputs
     WORD $0x858040ad // ldr z13, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9407,7 +9496,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 3 to 10 outputs
     WORD $0x8580410d // ldr z13, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9477,7 +9566,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 4 to 10 outputs
     WORD $0x8580412d // ldr z13, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9547,7 +9636,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 5 to 10 outputs
     WORD $0x8580414d // ldr z13, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9617,7 +9706,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 6 to 10 outputs
     WORD $0x8580416d // ldr z13, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9687,7 +9776,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 7 to 10 outputs
     WORD $0x8580418d // ldr z13, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9757,7 +9846,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 8 to 10 outputs
     WORD $0x858041ad // ldr z13, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9827,7 +9916,7 @@ mulSve_10x10_loop:
 
     // Load and process 32 bytes from input 9 to 10 outputs
     WORD $0x8580406d // ldr z13, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -9916,7 +10005,7 @@ mulSve_10x10_store:
     WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x10_loop
 
@@ -9933,6 +10022,9 @@ TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88
     MOVD n+80(FP), R0
     MOVD matrix_base+0(FP), R2
     WORD $0xd345fc00 // lsr x0, x0, #5                              
+    WORD $0xd37be800 // lsl x0, x0, #5
+    WORD $0x04bf5030 // rdvl x16, #1
+    WORD $0x9ad00800 // udiv x0, x0, x16
     WORD $0xea00001f // tst x0, x0                                  
     BEQ    mulSve_10x10Xor_end
     MOVD in_base+24(FP), R3
@@ -9967,11 +10059,13 @@ TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88
 
     // Load number of input shards
     MOVD   in_len+32(FP), R16
+    WORD $0x04bf5031 // rdvl x17, #1
+    WORD $0xd343fe31 // lsr  x17, x17, #3
 
 mulSve_10x10Xor_loop:
     // Load and process 32 bytes from input 0 to 10 outputs
     WORD $0x8580402d // ldr z13, [x1]                               
-    WORD $0x91008021 // add x1, x1, #32                             
+    WORD $0x04215021 // addvl x1, x1, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10061,7 +10155,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 1 to 10 outputs
     WORD $0x8580408d // ldr z13, [x4]                               
-    WORD $0x91008084 // add x4, x4, #32                             
+    WORD $0x04245024 // addvl x4, x4, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10131,7 +10225,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 2 to 10 outputs
     WORD $0x858040ad // ldr z13, [x5]                               
-    WORD $0x910080a5 // add x5, x5, #32                             
+    WORD $0x04255025 // addvl x5, x5, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10201,7 +10295,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 3 to 10 outputs
     WORD $0x8580410d // ldr z13, [x8]                               
-    WORD $0x91008108 // add x8, x8, #32                             
+    WORD $0x04285028 // addvl x8, x8, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10271,7 +10365,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 4 to 10 outputs
     WORD $0x8580412d // ldr z13, [x9]                               
-    WORD $0x91008129 // add x9, x9, #32                             
+    WORD $0x04295029 // addvl x9, x9, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10341,7 +10435,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 5 to 10 outputs
     WORD $0x8580414d // ldr z13, [x10]                              
-    WORD $0x9100814a // add x10, x10, #32                           
+    WORD $0x042a502a // addvl x10, x10, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10411,7 +10505,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 6 to 10 outputs
     WORD $0x8580416d // ldr z13, [x11]                              
-    WORD $0x9100816b // add x11, x11, #32                           
+    WORD $0x042b502b // addvl x11, x11, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10481,7 +10575,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 7 to 10 outputs
     WORD $0x8580418d // ldr z13, [x12]                              
-    WORD $0x9100818c // add x12, x12, #32                           
+    WORD $0x042c502c // addvl x12, x12, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10551,7 +10645,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 8 to 10 outputs
     WORD $0x858041ad // ldr z13, [x13]                              
-    WORD $0x910081ad // add x13, x13, #32                           
+    WORD $0x042d502d // addvl x13, x13, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10621,7 +10715,7 @@ mulSve_10x10Xor_loop:
 
     // Load and process 32 bytes from input 9 to 10 outputs
     WORD $0x8580406d // ldr z13, [x3]                               
-    WORD $0x91008063 // add x3, x3, #32                             
+    WORD $0x04235023 // addvl x3, x3, #1
     WORD $0x04fc95ae // lsr z14.d, z13.d, #4                        
     WORD $0x042a31ad // and z13.d, z13.d, z10.d                     
     WORD $0x042a31ce // and z14.d, z14.d, z10.d                     
@@ -10710,7 +10804,7 @@ mulSve_10x10Xor_store:
     WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3]        
 
     // Prepare for next loop
-    WORD $0x910011ef // add x15, x15, #4                            
+    WORD $0x8b1101ef // add x15, x15, x17
     WORD $0xf1000400 // subs x0, x0, #1                             
     BNE  mulSve_10x10Xor_loop
 
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go
index d4f46ea2d..f9c36e296 100644
--- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go
@@ -43,8 +43,14 @@ func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(ma
 		inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs
 }
 
-func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := stop - start
+func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) (n int) {
+	n = stop - start
+	if raceEnabled {
+		defer func() {
+			raceReadSlices(in, start, n)
+			raceWriteSlices(out, start, n)
+		}()
+	}
 
 	switch len(in) {
 	case 1:
@@ -381,8 +387,14 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
 	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
 }
 
-func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := (stop - start)
+func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) (n int) {
+	n = stop - start
+	if raceEnabled {
+		defer func() {
+			raceReadSlices(in, start, n)
+			raceWriteSlices(out, start, n)
+		}()
+	}
 
 	switch len(in) {
 	case 1:
@@ -722,6 +734,11 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
 func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (64 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
@@ -1060,6 +1077,11 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
 func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (64 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
@@ -1398,6 +1420,11 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int
 func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (32 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
@@ -1736,6 +1763,11 @@ func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int
 func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (32 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go
index ff2541b8e..656e06213 100644
--- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go
@@ -38,9 +38,15 @@ func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(ma
 }
 
 // galMulSlicesSve
-func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := stop - start
+func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) (n int) {
+	n = stop - start
 
+	if raceEnabled {
+		defer func() {
+			raceReadSlices(in, start, n)
+			raceWriteSlices(out, start, n)
+		}()
+	}
 	// fmt.Println(len(in), len(out))
 	switch len(out) {
 	case 1:
@@ -78,8 +84,15 @@ func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int {
 }
 
 // galMulSlicesSveXor
-func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := (stop - start)
+func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) (n int) {
+	n = (stop - start)
+
+	if raceEnabled {
+		defer func() {
+			raceReadSlices(in, start, n)
+			raceWriteSlices(out, start, n)
+		}()
+	}
 
 	switch len(out) {
 	case 1:
@@ -117,8 +130,14 @@ func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int {
 }
 
 // galMulSlicesNeon
-func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := stop - start
+func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) (n int) {
+	n = stop - start
+	if raceEnabled {
+		defer func() {
+			raceReadSlices(in, start, n)
+			raceWriteSlices(out, start, n)
+		}()
+	}
 
 	switch len(out) {
 	case 1:
@@ -156,9 +175,14 @@ func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int {
 }
 
 // galMulSlicesNeonXor
-func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int {
-	n := (stop - start)
-
+func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) (n int) {
+	n = (stop - start)
+	if raceEnabled {
+		defer func() {
+			raceReadSlices(in, start, n)
+			raceWriteSlices(out, start, n)
+		}()
+	}
 	switch len(out) {
 	case 1:
 		mulNeon_10x1_64Xor(matrix, in, out, start, n)
diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go
index 66bab8a0b..3ac349d3f 100644
--- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go
@@ -45,6 +45,11 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
 func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (64 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
@@ -383,6 +388,11 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
 func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (64 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
@@ -721,6 +731,11 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int
 func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (32 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
@@ -1059,6 +1074,11 @@ func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int
 func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int {
 	n := (stop - start) & (maxInt - (32 - 1))
 
+	if raceEnabled {
+		raceReadSlices(in, start, n)
+		raceWriteSlices(out, start, n)
+	}
+
 	switch len(in) {
 	case 1:
 		switch len(out) {
diff --git a/vendor/github.com/klauspost/reedsolomon/options.go b/vendor/github.com/klauspost/reedsolomon/options.go
index 377137ef5..cde255564 100644
--- a/vendor/github.com/klauspost/reedsolomon/options.go
+++ b/vendor/github.com/klauspost/reedsolomon/options.go
@@ -24,6 +24,7 @@ type options struct {
 	useSSE2,
 	useNEON,
 	useSVE bool
+	vectorLength int
 
 	useJerasureMatrix    bool
 	usePAR1Matrix        bool
@@ -55,6 +56,7 @@ var defaultOptions = options{
 	useAvxGNFI:    cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI),
 	useNEON:       cpuid.CPU.Supports(cpuid.ASIMD),
 	useSVE:        cpuid.CPU.Supports(cpuid.SVE),
+	vectorLength:  32, // default vector length is 32 bytes (256 bits) for AVX2 code gen
 }
 
 // leopardMode controls the use of leopard GF in encoding and decoding.
diff --git a/vendor/github.com/klauspost/reedsolomon/race.go b/vendor/github.com/klauspost/reedsolomon/race.go
new file mode 100644
index 000000000..4f2c0b693
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/race.go
@@ -0,0 +1,61 @@
+// Copyright (c) 2024+ Klaus Post. See LICENSE for license
+
+//go:build race
+
+package reedsolomon
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+const raceEnabled = true
+
+func raceReadSlice[T any](s []T) { // raceReadSlice reports a read of every element of s to the race detector.
+	if len(s) == 0 { // nothing to report, and &s[0] below would panic on an empty slice
+		return
+	}
+	runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) // range length is in bytes: element count times element size
+}
+
+func raceWriteSlice[T any](s []T) { // raceWriteSlice reports a write to every element of s to the race detector.
+	if len(s) == 0 { // nothing to report, and &s[0] below would panic on an empty slice
+		return
+	}
+	runtime.RaceWriteRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) // range length is in bytes: element count times element size
+}
+
+func raceReadSlices[T any](s [][]T, start, n int) { // raceReadSlices reports reads of n elements, beginning at index start, of every non-empty slice in s.
+	if len(s) == 0 {
+		return
+	}
+	runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) // the outer slice's headers are themselves read to reach the inner slices
+	for _, v := range s {
+		if len(v) == 0 { // skip empty inner slices; &v[start] below needs at least one element
+			continue
+		}
+		n := n // per-iteration copy so the adjustment below does not carry over to the next slice
+		if n < 0 { // a negative n means "from start to the end of this slice"
+			n = len(v) - start
+		}
+		runtime.RaceReadRange(unsafe.Pointer(&v[start]), n*int(unsafe.Sizeof(v[0]))) // NOTE(review): assumes start < len(v); &v[start] panics otherwise — confirm callers guarantee this
+	}
+}
+
+func raceWriteSlices[T any](s [][]T, start, n int) { // raceWriteSlices reports writes to n elements, beginning at index start, of every non-empty slice in s.
+	if len(s) == 0 {
+		return
+	}
+	runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) // intentionally a read: only the outer slice's headers are inspected, the written data lives in the inner slices
+
+	for _, v := range s {
+		if len(v) == 0 { // skip empty inner slices; &v[start] below needs at least one element
+			continue
+		}
+		n := n // per-iteration copy so the adjustment below does not carry over to the next slice
+		if n < 0 { // a negative n means "from start to the end of this slice"
+			n = len(v) - start
+		}
+		runtime.RaceWriteRange(unsafe.Pointer(&v[start]), n*int(unsafe.Sizeof(v[0]))) // NOTE(review): assumes start < len(v); &v[start] panics otherwise — confirm callers guarantee this
+	}
+}
diff --git a/vendor/github.com/klauspost/reedsolomon/race_none.go b/vendor/github.com/klauspost/reedsolomon/race_none.go
new file mode 100644
index 000000000..c7d05f287
--- /dev/null
+++ b/vendor/github.com/klauspost/reedsolomon/race_none.go
@@ -0,0 +1,17 @@
+// Copyright (c) 2024+ Klaus Post. See LICENSE for license
+
+//go:build !race
+
+package reedsolomon
+
+const raceEnabled = false
+
+func raceReadSlice[T any](s []T) { // no-op stub used when the package is built without the race detector (!race)
+}
+
+func raceWriteSlice[T any](s []T) { // no-op stub used when the package is built without the race detector (!race)
+}
+
+func raceReadSlices[T any](s [][]T, start, n int) {} // no-op stub; start and n are accepted only to match the race-build signature
+
+func raceWriteSlices[T any](s [][]T, start, n int) {} // no-op stub; start and n are accepted only to match the race-build signature
diff --git a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
index 3b6f5b785..443543f5d 100644
--- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@@ -833,7 +833,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 		start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount)
 		end = len(inputs[0])
 	} else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok {
-		m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+		m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice())
 		start += (*galMulGen)(m, inputs, outputs, 0, byteCount)
 		r.putTmpSlice(m)
 		end = len(inputs[0])
@@ -864,7 +864,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC
 						start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount)
 					}
 				} else {
-					m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
+					m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, m)
 					if inIdx == 0 {
 						start = (*galMulGen)(m, inPer, outPer, 0, byteCount)
 					} else {
@@ -914,7 +914,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte
 		var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64
 		gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:])
 	} else if useCodeGen {
-		genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice())
+		genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice())
 		defer r.putTmpSlice(genMatrix)
 	} else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI &&
 		byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards {
@@ -1025,7 +1025,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 					outPer = outPer[:codeGenMaxOutputs]
 				}
 				// Generate local matrix
-				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp)
 				tmp = tmp[len(m):]
 				plan = append(plan, state{
 					input:  inPer,
@@ -1056,7 +1056,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b
 					inPer = inPer[:codeGenMaxInputs]
 				}
 				// Generate local matrix
-				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
+				m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp)
 				tmp = tmp[len(m):]
 				//fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound)
 				plan = append(plan, state{
diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.go b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go
index 6f0522f88..ffda8884c 100644
--- a/vendor/github.com/klauspost/reedsolomon/xor_arm64.go
+++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go
@@ -7,8 +7,12 @@ func xorSliceNEON(in, out []byte)
 
 // simple slice xor
 func sliceXor(in, out []byte, o *options) {
-	xorSliceNEON(in, out)
 	done := (len(in) >> 5) << 5
+	if raceEnabled {
+		raceWriteSlice(out[:done])
+		raceReadSlice(in[:done])
+	}
+	xorSliceNEON(in, out)
 
 	remain := len(in) - done
 	if remain > 0 {
diff --git a/vendor/github.com/xtaci/kcp-go/v5/fec.go b/vendor/github.com/xtaci/kcp-go/v5/fec.go
index 523bda2c1..5bd9c51f8 100644
--- a/vendor/github.com/xtaci/kcp-go/v5/fec.go
+++ b/vendor/github.com/xtaci/kcp-go/v5/fec.go
@@ -187,10 +187,13 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) {
 	}
 
 	// shard range for current packet
+	// NOTE: shard sequence numbers start at 0, so seqid modulo shardSize gives
+	// the packet's offset within its FEC group; subtracting it yields the group's
+	// first seqid, which is therefore always a multiple of shardSize.
 	shardBegin := pkt.seqid() - pkt.seqid()%uint32(dec.shardSize)
 	shardEnd := shardBegin + uint32(dec.shardSize) - 1
 
-	// max search range in ordered queue for current shard
+	// Define max search range in ordered queue for current shard
 	searchBegin := insertIdx - int(pkt.seqid()%uint32(dec.shardSize))
 	if searchBegin < 0 {
 		searchBegin = 0
@@ -200,11 +203,12 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) {
 		searchEnd = len(dec.rx) - 1
 	}
 
-	// re-construct datashards
+	// Check whether the search range holds enough shards to recover. If so, recover the data
+	// and free the shards; if not, keep the shards in memory for a later recovery attempt.
 	if searchEnd-searchBegin+1 >= dec.dataShards {
 		var numshard, numDataShard, first, maxlen int
 
-		// zero caches
+		// zero working set for decoding
 		shards := dec.decodeCache
 		shardsflag := dec.flagCache
 		for k := range dec.decodeCache {
@@ -212,9 +216,10 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) {
 			shardsflag[k] = false
 		}
 
-		// shard assembly
+		// collect the shards found in range [searchBegin, searchEnd] into the working set
 		for i := searchBegin; i <= searchEnd; i++ {
 			seqid := dec.rx[i].seqid()
+			// the shard seqid must be in [shardBegin, shardEnd], i.e. the current FEC group
 			if _itimediff(seqid, shardEnd) > 0 {
 				break
 			} else if _itimediff(seqid, shardBegin) >= 0 {
@@ -233,20 +238,23 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) {
 			}
 		}
 
+		// case 1: if there's no loss on data shards
 		if numDataShard == dec.dataShards {
-			// case 1: no loss on data shards
 			dec.rx = dec.freeRange(first, numshard, dec.rx)
-		} else if numshard >= dec.dataShards {
-			// case 2: loss on data shards, but it's recoverable from parity shards
+		} else if numshard >= dec.dataShards { // case 2: loss on data shards, but it's recoverable from parity shards
+			// make the bytes length of each shard equal
 			for k := range shards {
 				if shards[k] != nil {
 					dlen := len(shards[k])
 					shards[k] = shards[k][:maxlen]
 					clear(shards[k][dlen:])
 				} else if k < dec.dataShards {
+					// prepare memory for the data recovery
 					shards[k] = xmitBuf.Get().([]byte)[:0]
 				}
 			}
+
+			// Reed-Solomon recovery
 			if err := dec.codec.ReconstructData(shards); err == nil {
 				for k := range shards[:dec.dataShards] {
 					if !shardsflag[k] {
@@ -255,19 +263,22 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) {
 					}
 				}
 			}
+
+			// Free the shards in FIFO immediately
 			dec.rx = dec.freeRange(first, numshard, dec.rx)
 		}
 	}
 
-	// keep rxlimit
+	// keep rxlimit in FIFO order
 	if len(dec.rx) > dec.rxlimit {
-		if dec.rx[0].flag() == typeData { // track the unrecoverable data
+		if dec.rx[0].flag() == typeData {
+			// track the effectiveness of FEC
 			atomic.AddUint64(&DefaultSnmp.FECShortShards, 1)
 		}
 		dec.rx = dec.freeRange(0, 1, dec.rx)
 	}
 
-	// timeout policy
+	// FIFO timeout policy
 	current := currentMs()
 	numExpired := 0
 	for k := range dec.rx {
@@ -289,9 +300,12 @@ func (dec *fecDecoder) freeRange(first, n int, q []fecElement) []fecElement {
 		xmitBuf.Put([]byte(q[i].fecPacket))
 	}
 
+	// when freeing a small prefix of the queue, re-slice instead of copying
 	if first == 0 && n < cap(q)/2 {
 		return q[n:]
 	}
+
+	// otherwise, shift the tail down over the freed range
 	copy(q[first:], q[first+n:])
 	return q[:len(q)-n]
 }
@@ -375,7 +389,7 @@ func (enc *fecEncoder) encode(b []byte, rto uint32) (ps [][]byte) {
 		enc.maxSize = sz
 	}
 
-	//  Generation of Reed-Solomon Erasure Code
+	// Generation of Reed-Solomon Erasure Code
 	now := time.Now().UnixMilli()
 	if enc.shardCount == enc.dataShards {
 		// generate the rs-code only if the data is continuous.
@@ -400,6 +414,10 @@ func (enc *fecEncoder) encode(b []byte, rto uint32) (ps [][]byte) {
 					enc.markParity(ps[k][enc.headerOffset:])
 					ps[k] = ps[k][:enc.maxSize]
 				}
+			} else {
+				// record the error, and still keep the seqid monotonically increasing
+				atomic.AddUint64(&DefaultSnmp.FECErrs, 1)
+				enc.next = (enc.next + uint32(enc.parityShards)) % enc.paws
 			}
 		} else {
 			// through we do not send non-continuous parity shard, we still increase the next value
@@ -417,6 +435,7 @@ func (enc *fecEncoder) encode(b []byte, rto uint32) (ps [][]byte) {
 	return
 }
 
+// markData stamps the FEC packet header with the next seqid and the data type flag
 func (enc *fecEncoder) markData(data []byte) {
 	binary.LittleEndian.PutUint32(data, enc.next)
 	binary.LittleEndian.PutUint16(data[4:], typeData)
diff --git a/vendor/github.com/xtaci/kcp-go/v5/sess.go b/vendor/github.com/xtaci/kcp-go/v5/sess.go
index 9e055b8f6..05122cab7 100644
--- a/vendor/github.com/xtaci/kcp-go/v5/sess.go
+++ b/vendor/github.com/xtaci/kcp-go/v5/sess.go
@@ -498,7 +498,7 @@ func (s *UDPSession) SetMtu(mtu int) bool {
 	return true
 }
 
-// SetStreamMode toggles the stream mode on/off
+// Deprecated: toggles the stream mode on/off
 func (s *UDPSession) SetStreamMode(enable bool) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -1035,7 +1035,10 @@ func (l *Listener) Accept() (net.Conn, error) {
 func (l *Listener) AcceptKCP() (*UDPSession, error) {
 	var timeout <-chan time.Time
 	if tdeadline, ok := l.rd.Load().(time.Time); ok && !tdeadline.IsZero() {
-		timeout = time.After(time.Until(tdeadline))
+		timer := time.NewTimer(time.Until(tdeadline))
+		defer timer.Stop()
+
+		timeout = timer.C
 	}
 
 	select {
diff --git a/vendor/github.com/xtaci/kcp-go/v5/timedsched.go b/vendor/github.com/xtaci/kcp-go/v5/timedsched.go
index 187ba7d4f..2e7dad665 100644
--- a/vendor/github.com/xtaci/kcp-go/v5/timedsched.go
+++ b/vendor/github.com/xtaci/kcp-go/v5/timedsched.go
@@ -83,8 +83,10 @@ func NewTimedSched(parallel int) *TimedSched {
 
 // sched is a goroutine to schedule and execute timed tasks.
 func (ts *TimedSched) sched() {
-	var tasks timedFuncHeap
 	timer := time.NewTimer(0)
+	defer timer.Stop()
+
+	var tasks timedFuncHeap
 	drained := false
 	for {
 		select {
diff --git a/vendor/github.com/xtaci/smux/session.go b/vendor/github.com/xtaci/smux/session.go
index 59d13783e..d5b4c5ad6 100644
--- a/vendor/github.com/xtaci/smux/session.go
+++ b/vendor/github.com/xtaci/smux/session.go
@@ -576,7 +576,10 @@ func (s *Session) sendLoop() {
 // writeControlFrame writes the control frame to the underlying connection
 // and returns the number of bytes written if successful
 func (s *Session) writeControlFrame(f Frame) (n int, err error) {
-	return s.writeFrameInternal(f, time.After(openCloseTimeout), CLSCTRL)
+	timer := time.NewTimer(openCloseTimeout)
+	defer timer.Stop()
+
+	return s.writeFrameInternal(f, timer.C, CLSCTRL)
 }
 
 // internal writeFrame version to support deadline used in keepalive
diff --git a/vendor/github.com/xtaci/smux/stream.go b/vendor/github.com/xtaci/smux/stream.go
index ed177419c..653a27513 100644
--- a/vendor/github.com/xtaci/smux/stream.go
+++ b/vendor/github.com/xtaci/smux/stream.go
@@ -494,7 +494,11 @@ func (s *stream) Close() error {
 	if once {
 		// send FIN in order
 		f := newFrame(byte(s.sess.config.Version), cmdFIN, s.id)
-		_, err = s.sess.writeFrameInternal(f, time.After(openCloseTimeout), CLSDATA)
+
+		timer := time.NewTimer(openCloseTimeout)
+		defer timer.Stop()
+
+		_, err = s.sess.writeFrameInternal(f, timer.C, CLSDATA)
 		s.sess.streamClosed(s.id)
 		return err
 	} else {
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 98f06871c..5de51ef31 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -17,8 +17,8 @@ github.com/google/gopacket/layers
 # github.com/klauspost/cpuid/v2 v2.2.8
 ## explicit; go 1.15
 github.com/klauspost/cpuid/v2
-# github.com/klauspost/reedsolomon v1.12.3
-## explicit; go 1.18
+# github.com/klauspost/reedsolomon v1.12.4
+## explicit; go 1.21
 github.com/klauspost/reedsolomon
 # github.com/mattn/go-colorable v0.1.13
 ## explicit; go 1.15
@@ -44,13 +44,13 @@ github.com/tjfoc/gmsm/sm4
 # github.com/urfave/cli v1.22.15
 ## explicit; go 1.11
 github.com/urfave/cli
-# github.com/xtaci/kcp-go/v5 v5.6.17
+# github.com/xtaci/kcp-go/v5 v5.6.18
 ## explicit; go 1.21
 github.com/xtaci/kcp-go/v5
 # github.com/xtaci/qpp v1.1.17
 ## explicit; go 1.22.3
 github.com/xtaci/qpp
-# github.com/xtaci/smux v1.5.30
+# github.com/xtaci/smux v1.5.31
 ## explicit; go 1.13
 github.com/xtaci/smux
 # github.com/xtaci/tcpraw v1.2.31
@@ -67,7 +67,7 @@ golang.org/x/crypto/salsa20/salsa
 golang.org/x/crypto/tea
 golang.org/x/crypto/twofish
 golang.org/x/crypto/xtea
-# golang.org/x/net v0.28.0
+# golang.org/x/net v0.29.0
 ## explicit; go 1.18
 golang.org/x/net/bpf
 golang.org/x/net/internal/iana