diff --git a/go.mod b/go.mod index ebc70910f..20fa8cc71 100644 --- a/go.mod +++ b/go.mod @@ -5,9 +5,9 @@ require ( github.com/golang/snappy v0.0.4 github.com/pkg/errors v0.9.1 github.com/urfave/cli v1.22.15 - github.com/xtaci/kcp-go/v5 v5.6.17 + github.com/xtaci/kcp-go/v5 v5.6.18 github.com/xtaci/qpp v1.1.17 - github.com/xtaci/smux v1.5.30 + github.com/xtaci/smux v1.5.31 github.com/xtaci/tcpraw v1.2.31 golang.org/x/crypto v0.27.0 ) @@ -17,14 +17,14 @@ require ( github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect github.com/google/gopacket v1.1.19 // indirect github.com/klauspost/cpuid/v2 v2.2.8 // indirect - github.com/klauspost/reedsolomon v1.12.3 // indirect + github.com/klauspost/reedsolomon v1.12.4 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/templexxx/cpu v0.1.1 // indirect github.com/templexxx/xorsimd v0.4.3 // indirect github.com/tjfoc/gmsm v1.4.1 // indirect - golang.org/x/net v0.28.0 // indirect + golang.org/x/net v0.29.0 // indirect golang.org/x/sys v0.25.0 // indirect ) diff --git a/go.sum b/go.sum index 15165c380..39edcaa70 100644 --- a/go.sum +++ b/go.sum @@ -37,8 +37,8 @@ github.com/google/gopacket v1.1.19 h1:ves8RnFZPGiFnTS0uPQStjwru6uO6h+nlr9j6fL7kF github.com/google/gopacket v1.1.19/go.mod h1:iJ8V8n6KS+z2U1A8pUwu8bW5SyEMkXJB8Yo/Vo+TKTo= github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM= github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws= -github.com/klauspost/reedsolomon v1.12.3 h1:tzUznbfc3OFwJaTebv/QdhnFf2Xvb7gZ24XaHLBPmdc= -github.com/klauspost/reedsolomon v1.12.3/go.mod h1:3K5rXwABAvzGeR01r6pWZieUALXO/Tq7bFKGIb4m4WI= +github.com/klauspost/reedsolomon v1.12.4 h1:5aDr3ZGoJbgu/8+j45KtUJxzYm8k08JGtB9Wx1VQ4OA= +github.com/klauspost/reedsolomon v1.12.4/go.mod h1:d3CzOMOt0JXGIFZm1StgkyF14EYr3xneR2rNWo7NcMU= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= @@ -68,18 +68,14 @@ github.com/tjfoc/gmsm v1.4.1 h1:aMe1GlZb+0bLjn+cKTPEvvn9oUEBlJitaZiiBwsbgho= github.com/tjfoc/gmsm v1.4.1/go.mod h1:j4INPkHWMrhJb38G+J6W4Tw0AbuN8Thu3PbdVYhVcTE= github.com/urfave/cli v1.22.15 h1:nuqt+pdC/KqswQKhETJjo7pvn/k4xMUxgW6liI7XpnM= github.com/urfave/cli v1.22.15/go.mod h1:wSan1hmo5zeyLGBjRJbzRTNk8gwoYa2B9n4q9dmRIc0= -github.com/xtaci/kcp-go/v5 v5.6.17 h1:en/aK0IDKX5Zb5NlfLRzbuhw1nnj8vhf6AgbAJI2WX4= -github.com/xtaci/kcp-go/v5 v5.6.17/go.mod h1:75S1AKYYzNUSXIv30h+jPKJYZUwqpfvLshu63nCNSOM= +github.com/xtaci/kcp-go/v5 v5.6.18 h1:7oV4mc272pcnn39/13BB11Bx7hJM4ogMIEokJYVWn4g= +github.com/xtaci/kcp-go/v5 v5.6.18/go.mod h1:75S1AKYYzNUSXIv30h+jPKJYZUwqpfvLshu63nCNSOM= github.com/xtaci/lossyconn v0.0.0-20190602105132-8df528c0c9ae h1:J0GxkO96kL4WF+AIT3M4mfUVinOCPgf2uUWYFUzN0sM= github.com/xtaci/lossyconn v0.0.0-20190602105132-8df528c0c9ae/go.mod h1:gXtu8J62kEgmN++bm9BVICuT/e8yiLI2KFobd/TRFsE= github.com/xtaci/qpp v1.1.17 h1:w35NYqF3wOBoAMs+2qA2XFjkNQ12mugw51CUJ7OcTzo= github.com/xtaci/qpp v1.1.17/go.mod h1:dJS3usaXNMbWxZSWCAdxz01UgJcz9wXDkd4BccDY/V0= -github.com/xtaci/smux v1.5.29 h1:xGpY4B0ngArN1yNXvKerMiW4QABDXxELUJhR3EAh5bQ= -github.com/xtaci/smux v1.5.29/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY= -github.com/xtaci/smux v1.5.30-0.20240915135522-784d53ae558d h1:OgHYTbVkmCTV9M4l5GSidOjmXBubG6RqBTt7Q1DoTPU= -github.com/xtaci/smux v1.5.30-0.20240915135522-784d53ae558d/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY= -github.com/xtaci/smux v1.5.30 h1:LFxB7WSr0mbQhbdJzfbxnfCKVQKYzcyB+/8mXf2dTdQ= -github.com/xtaci/smux v1.5.30/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY= +github.com/xtaci/smux v1.5.31 h1:3ha7sHtH46h85Iv7MfQogxasuRt1KPRhoFB3S4rmHgU= +github.com/xtaci/smux v1.5.31/go.mod h1:OMlQbT5vcgl2gb49mFkYo6SMf+zP3rcjcwQz7ZU7IGY= github.com/xtaci/tcpraw v1.2.31 h1:i9mXzejnGJdGi0DpVKUn19Hq202/sHOJt0kObEwuE/U= github.com/xtaci/tcpraw v1.2.31/go.mod h1:T1blYD2EDkLneb+HtxddnzX38SoC9BG537EhkXeaT2k= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -101,8 +97,8 @@ golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= -golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/vendor/github.com/klauspost/reedsolomon/galois.go b/vendor/github.com/klauspost/reedsolomon/galois.go index 9b3639502..bbc521f4e 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois.go +++ b/vendor/github.com/klauspost/reedsolomon/galois.go @@ -910,14 +910,14 @@ func galExp(a byte, n int) byte { return expTable[uint8(logResult)] } -func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { +func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs, vectorLength int, dst []byte) []byte { if !codeGen { panic("codegen not enabled") } total := inputs * outputs // Duplicated in+out - wantBytes := total * 32 * 2 + wantBytes := total * vectorLength * 2 if cap(dst) < wantBytes { dst = AllocAligned(1, wantBytes)[0] } else { @@ -925,15 +925,16 @@ func genCodeGenMatrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byt } for i, row := range matrixRows[:outputs] { for j, idx := range row[inIdx : inIdx+inputs] { - dstIdx := (j*outputs + i) * 64 + dstIdx := (j*outputs + i) * vectorLength * 2 dstPart := dst[dstIdx:] - dstPart = dstPart[:64] + dstPart = dstPart[:vectorLength*2] lo := mulTableLow[idx][:] hi := mulTableHigh[idx][:] - copy(dstPart[:16], lo) - copy(dstPart[16:32], lo) - copy(dstPart[32:48], hi) - copy(dstPart[48:64], hi) + + for k := 0; k < vectorLength; k += 16 { + copy(dstPart[k:k+16], lo) + copy(dstPart[vectorLength*2-(k+16):vectorLength*2-k], hi) + } } } return dst diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go index 8099f1664..8025560f2 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go @@ -53,20 +53,32 @@ func galMulSlice(c byte, in, out []byte, o *options) { } if o.useAVX2 { if len(in) >= bigSwitchover { - galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } if len(in) > 32 { - galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } } else if o.useSSSE3 { - galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } @@ -85,20 +97,32 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { if o.useAVX2 { if len(in) >= bigSwitchover { - galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } if len(in) >= 32 { - galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } } else if o.useSSSE3 { - galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) in = in[done:] out = out[done:] } @@ -117,20 +141,32 @@ func sliceXor(in, out []byte, o *options) { if o.useSSE2 { if len(in) >= bigSwitchover { if o.useAVX2 { - avx2XorSlice_64(in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + avx2XorSlice_64(in, out) in = in[done:] out = out[done:] } else { - sSE2XorSlice_64(in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + sSE2XorSlice_64(in, out) in = in[done:] out = out[done:] } } if len(in) >= 16 { - sSE2XorSlice(in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + sSE2XorSlice(in, out) in = in[done:] out = out[done:] } @@ -462,9 +498,17 @@ func fftDIT2(x, y []byte, log_m ffe, o *options) { } if o.useAVX2 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } fftDIT2_avx2(x, y, tmp) } else if o.useSSSE3 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } fftDIT2_ssse3(x, y, tmp) } else { // Reference version: @@ -480,11 +524,15 @@ func fftDIT28(x, y []byte, log_m ffe8, o *options) { } if o.useAVX2 { + done := (len(y) >> 6) << 6 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } fftDIT28_avx2(x, y, &multiply256LUT8[log_m]) if len(x)&63 == 0 { return } - done := (len(y) >> 6) << 6 y = y[done:] x = x[done:] } @@ -499,11 +547,15 @@ func ifftDIT28(x, y []byte, log_m ffe8, o *options) { } if o.useAVX2 { + done := (len(y) >> 6) << 6 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } ifftDIT28_avx2(x, y, &multiply256LUT8[log_m]) if len(x)&63 == 0 { return } - done := (len(y) >> 6) << 6 y = y[done:] x = x[done:] } @@ -514,14 +566,22 @@ func ifftDIT28(x, y []byte, log_m ffe8, o *options) { func mulAdd8(x, y []byte, log_m ffe8, o *options) { if o.useAVX2 { t := &multiply256LUT8[log_m] - galMulAVX2Xor_64(t[:16], t[16:32], y, x) done := (len(y) >> 6) << 6 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } + galMulAVX2Xor_64(t[:16], t[16:32], y, x) y = y[done:] x = x[done:] } else if o.useSSSE3 { t := &multiply256LUT8[log_m] - galMulSSSE3Xor(t[:16], t[16:32], y, x) done := (len(y) >> 4) << 4 + if raceEnabled { + raceReadSlice(y[:done]) + raceWriteSlice(x[:done]) + } + galMulSSSE3Xor(t[:16], t[16:32], y, x) y = y[done:] x = x[done:] } @@ -535,9 +595,19 @@ func ifftDIT2(x, y []byte, log_m ffe, o *options) { } if o.useAVX2 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } + ifftDIT2_avx2(x, y, tmp) } else if o.useSSSE3 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } + ifftDIT2_ssse3(x, y, tmp) } else { // Reference version: @@ -552,9 +622,17 @@ func mulgf16(x, y []byte, log_m ffe, o *options) { } if o.useAVX2 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } mulgf16_avx2(x, y, tmp) } else if o.useSSSE3 { tmp := &multiply256LUT[log_m] + if raceEnabled { + raceReadSlice(y) + raceWriteSlice(x) + } mulgf16_ssse3(x, y, tmp) } else { refMul(x, y, log_m) @@ -564,14 +642,23 @@ func mulgf16(x, y []byte, log_m ffe, o *options) { func mulgf8(out, in []byte, log_m ffe8, o *options) { if o.useAVX2 { t := &multiply256LUT8[log_m] - galMulAVX2_64(t[:16], t[16:32], in, out) done := (len(in) >> 6) << 6 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + + galMulAVX2_64(t[:16], t[16:32], in, out) in = in[done:] out = out[done:] } else if o.useSSSE3 { t := &multiply256LUT8[log_m] - galMulSSSE3(t[:16], t[16:32], in, out) done := (len(in) >> 4) << 4 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulSSSE3(t[:16], t[16:32], in, out) in = in[done:] out = out[done:] } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go index 08f1ae8d9..d860525c9 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go @@ -17,8 +17,12 @@ func getVectorLength() (vl, pl uint64) func init() { if defaultOptions.useSVE { - if vl, _ := getVectorLength(); vl != 256 { - defaultOptions.useSVE = false // Temp fix: disable SVE for non-256 vector widths (ie Graviton4) + if vl, _ := getVectorLength(); vl <= 256 { + // set vector length in bytes + defaultOptions.vectorLength = int(vl) >> 3 + } else { + // disable SVE for hardware implementatons over 256 bits (only know to be Fujitsu A64FX atm) + defaultOptions.useSVE = false } } } @@ -29,8 +33,12 @@ func galMulSlice(c byte, in, out []byte, o *options) { return } var done int - galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) done = (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } + galMulNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) remain := len(in) - done if remain > 0 { @@ -46,9 +54,12 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { sliceXor(in, out, o) return } - var done int + done := (len(in) >> 5) << 5 + if raceEnabled { + raceReadSlice(in[:done]) + raceWriteSlice(out[:done]) + } galMulXorNEON(mulTableLow[c][:], mulTableHigh[c][:], in, out) - done = (len(in) >> 5) << 5 remain := len(in) - done if remain > 0 { diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s index 335b94c36..dd974c115 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_arm64.s @@ -13,6 +13,9 @@ TEXT ·mulSve_10x1_64(SB), $0-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x1_64_end MOVD in_base+24(FP), R3 @@ -55,7 +58,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 0 to 1 outputs WORD $0x85804026 // ldr z6, [x1] WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -77,7 +80,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 1 to 1 outputs WORD $0x85804086 // ldr z6, [x4] WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -101,7 +104,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 2 to 1 outputs WORD $0x858040a6 // ldr z6, [x5] WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -125,7 +128,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 3 to 1 outputs WORD $0x85804106 // ldr z6, [x8] WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -149,7 +152,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 4 to 1 outputs WORD $0x85804126 // ldr z6, [x9] WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -173,7 +176,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 5 to 1 outputs WORD $0x85804146 // ldr z6, [x10] WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -197,7 +200,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 6 to 1 outputs WORD $0x85804166 // ldr z6, [x11] WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -221,7 +224,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 7 to 1 outputs WORD $0x85804186 // ldr z6, [x12] WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -245,7 +248,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 8 to 1 outputs WORD $0x858041a6 // ldr z6, [x13] WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -269,7 +272,7 @@ mulSve_10x1_64_loop: // Load and process 64 bytes from input 9 to 1 outputs WORD $0x85804066 // ldr z6, [x3] WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -291,7 +294,7 @@ mulSve_10x1_64_store: // Store 1 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD $0xf1000400 // subs x0, x0, #1 @@ -309,6 +312,9 @@ TEXT ·mulSve_10x1_64Xor(SB), $0-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x1_64Xor_end MOVD in_base+24(FP), R3 @@ -355,7 +361,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 0 to 1 outputs WORD $0x85804026 // ldr z6, [x1] WORD $0x85804425 // ldr z5, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -379,7 +385,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 1 to 1 outputs WORD $0x85804086 // ldr z6, [x4] WORD $0x85804485 // ldr z5, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -403,7 +409,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 2 to 1 outputs WORD $0x858040a6 // ldr z6, [x5] WORD $0x858044a5 // ldr z5, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -427,7 +433,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 3 to 1 outputs WORD $0x85804106 // ldr z6, [x8] WORD $0x85804505 // ldr z5, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -451,7 +457,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 4 to 1 outputs WORD $0x85804126 // ldr z6, [x9] WORD $0x85804525 // ldr z5, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -475,7 +481,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 5 to 1 outputs WORD $0x85804146 // ldr z6, [x10] WORD $0x85804545 // ldr z5, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -499,7 +505,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 6 to 1 outputs WORD $0x85804166 // ldr z6, [x11] WORD $0x85804565 // ldr z5, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -523,7 +529,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 7 to 1 outputs WORD $0x85804186 // ldr z6, [x12] WORD $0x85804585 // ldr z5, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -547,7 +553,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 8 to 1 outputs WORD $0x858041a6 // ldr z6, [x13] WORD $0x858045a5 // ldr z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -571,7 +577,7 @@ mulSve_10x1_64Xor_loop: // Load and process 64 bytes from input 9 to 1 outputs WORD $0x85804066 // ldr z6, [x3] WORD $0x85804465 // ldr z5, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc94c7 // lsr z7.d, z6.d, #4 WORD $0x04fc94a8 // lsr z8.d, z5.d, #4 WORD $0x042230c6 // and z6.d, z6.d, z2.d @@ -593,7 +599,7 @@ mulSve_10x1_64Xor_store: // Store 1 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD $0xf1000400 // subs x0, x0, #1 @@ -611,6 +617,9 @@ TEXT ·mulSve_10x2_64(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x2_64_end MOVD in_base+24(FP), R3 @@ -655,7 +664,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 0 to 2 outputs WORD $0x85804029 // ldr z9, [x1] WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -685,7 +694,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 1 to 2 outputs WORD $0x85804089 // ldr z9, [x4] WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -719,7 +728,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 2 to 2 outputs WORD $0x858040a9 // ldr z9, [x5] WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -753,7 +762,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 3 to 2 outputs WORD $0x85804109 // ldr z9, [x8] WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -787,7 +796,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 4 to 2 outputs WORD $0x85804129 // ldr z9, [x9] WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -821,7 +830,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 5 to 2 outputs WORD $0x85804149 // ldr z9, [x10] WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -855,7 +864,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 6 to 2 outputs WORD $0x85804169 // ldr z9, [x11] WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -889,7 +898,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 7 to 2 outputs WORD $0x85804189 // ldr z9, [x12] WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -923,7 +932,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 8 to 2 outputs WORD $0x858041a9 // ldr z9, [x13] WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -957,7 +966,7 @@ mulSve_10x2_64_loop: // Load and process 64 bytes from input 9 to 2 outputs WORD $0x85804069 // ldr z9, [x3] WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -989,10 +998,10 @@ mulSve_10x2_64_store: // Store 2 outputs WORD $0xe58041e0 // str z0, [x15] WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041c2 // str z2, [x14] WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD $0xf1000400 // subs x0, x0, #1 @@ -1010,6 +1019,9 @@ TEXT ·mulSve_10x2_64Xor(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x2_64Xor_end MOVD in_base+24(FP), R3 @@ -1060,7 +1072,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 0 to 2 outputs WORD $0x85804029 // ldr z9, [x1] WORD $0x8580442b // ldr z11, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1094,7 +1106,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 1 to 2 outputs WORD $0x85804089 // ldr z9, [x4] WORD $0x8580448b // ldr z11, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1128,7 +1140,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 2 to 2 outputs WORD $0x858040a9 // ldr z9, [x5] WORD $0x858044ab // ldr z11, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1162,7 +1174,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 3 to 2 outputs WORD $0x85804109 // ldr z9, [x8] WORD $0x8580450b // ldr z11, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1196,7 +1208,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 4 to 2 outputs WORD $0x85804129 // ldr z9, [x9] WORD $0x8580452b // ldr z11, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1230,7 +1242,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 5 to 2 outputs WORD $0x85804149 // ldr z9, [x10] WORD $0x8580454b // ldr z11, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1264,7 +1276,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 6 to 2 outputs WORD $0x85804169 // ldr z9, [x11] WORD $0x8580456b // ldr z11, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1298,7 +1310,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 7 to 2 outputs WORD $0x85804189 // ldr z9, [x12] WORD $0x8580458b // ldr z11, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1332,7 +1344,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 8 to 2 outputs WORD $0x858041a9 // ldr z9, [x13] WORD $0x858045ab // ldr z11, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1366,7 +1378,7 @@ mulSve_10x2_64Xor_loop: // Load and process 64 bytes from input 9 to 2 outputs WORD $0x85804069 // ldr z9, [x3] WORD $0x8580446b // ldr z11, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04243129 // and z9.d, z9.d, z4.d @@ -1398,10 +1410,10 @@ mulSve_10x2_64Xor_store: // Store 2 outputs WORD $0xe58041e0 // str z0, [x15] WORD $0xe58045e1 // str z1, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041c2 // str z2, [x14] WORD $0xe58045c3 // str z3, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 // Prepare for next loop WORD $0xf1000400 // subs x0, x0, #1 @@ -1419,6 +1431,9 @@ TEXT ·mulSve_10x3_64(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x3_64_end MOVD in_base+24(FP), R0 @@ -1461,6 +1476,9 @@ TEXT ·mulSve_10x3_64(SB), $8-88 // Reload length to save a register MOVD n+80(FP), R6 WORD $0xd346fcc6 // lsr x6, x6, #6 + WORD $0xd37ae4c6 // lsl x6, x6, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad008c6 // udiv x6, x6, x16 // Load number of input shards MOVD in_len+32(FP), R16 @@ -1469,7 +1487,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 0 to 3 outputs WORD $0x8580406b // ldr z11, [x3] WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1507,7 +1525,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 1 to 3 outputs WORD $0x8580402b // ldr z11, [x1] WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1551,7 +1569,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 2 to 3 outputs WORD $0x8580408b // ldr z11, [x4] WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1595,7 +1613,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 3 to 3 outputs WORD $0x858040ab // ldr z11, [x5] WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1639,7 +1657,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 4 to 3 outputs WORD $0x8580410b // ldr z11, [x8] WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1683,7 +1701,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 5 to 3 outputs WORD $0x8580412b // ldr z11, [x9] WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1727,7 +1745,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 6 to 3 outputs WORD $0x8580414b // ldr z11, [x10] WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1771,7 +1789,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 7 to 3 outputs WORD $0x8580416b // ldr z11, [x11] WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1815,7 +1833,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 8 to 3 outputs WORD $0x8580418b // ldr z11, [x12] WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1859,7 +1877,7 @@ mulSve_10x3_64_loop: // Load and process 64 bytes from input 9 to 3 outputs WORD $0x8580400b // ldr z11, [x0] WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] - WORD $0x91010000 // add x0, x0, #64 + WORD $0x04205040 // addvl x0, x0, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -1901,13 +1919,13 @@ mulSve_10x3_64_store: // Store 3 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 WORD $0xe58041e2 // str z2, [x15] WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041a4 // str z4, [x13] WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 // Prepare for next loop WORD $0xf10004c6 // subs x6, x6, #1 @@ -1925,6 +1943,9 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd346fc00 // lsr x0, x0, #6 + WORD $0xd37ae400 // lsl x0, x0, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x3_64Xor_end MOVD in_base+24(FP), R0 @@ -1967,6 +1988,9 @@ TEXT ·mulSve_10x3_64Xor(SB), $8-88 // Reload length to save a register MOVD n+80(FP), R6 WORD $0xd346fcc6 // lsr x6, x6, #6 + WORD $0xd37ae4c6 // lsl x6, x6, #6 + WORD $0x04bf5050 // rdvl x16, #2 + WORD $0x9ad008c6 // udiv x6, x6, x16 // Load number of input shards MOVD in_len+32(FP), R16 @@ -1983,7 +2007,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 0 to 3 outputs WORD $0x8580406b // ldr z11, [x3] WORD $0x8580446d // ldr z13, [x3, #1, MUL VL] - WORD $0x91010063 // add x3, x3, #64 + WORD $0x04235043 // addvl x3, x3, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2027,7 +2051,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 1 to 3 outputs WORD $0x8580402b // ldr z11, [x1] WORD $0x8580442d // ldr z13, [x1, #1, MUL VL] - WORD $0x91010021 // add x1, x1, #64 + WORD $0x04215041 // addvl x1, x1, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2071,7 +2095,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 2 to 3 outputs WORD $0x8580408b // ldr z11, [x4] WORD $0x8580448d // ldr z13, [x4, #1, MUL VL] - WORD $0x91010084 // add x4, x4, #64 + WORD $0x04245044 // addvl x4, x4, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2115,7 +2139,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 3 to 3 outputs WORD $0x858040ab // ldr z11, [x5] WORD $0x858044ad // ldr z13, [x5, #1, MUL VL] - WORD $0x910100a5 // add x5, x5, #64 + WORD $0x04255045 // addvl x5, x5, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2159,7 +2183,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 4 to 3 outputs WORD $0x8580410b // ldr z11, [x8] WORD $0x8580450d // ldr z13, [x8, #1, MUL VL] - WORD $0x91010108 // add x8, x8, #64 + WORD $0x04285048 // addvl x8, x8, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2203,7 +2227,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 5 to 3 outputs WORD $0x8580412b // ldr z11, [x9] WORD $0x8580452d // ldr z13, [x9, #1, MUL VL] - WORD $0x91010129 // add x9, x9, #64 + WORD $0x04295049 // addvl x9, x9, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2247,7 +2271,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 6 to 3 outputs WORD $0x8580414b // ldr z11, [x10] WORD $0x8580454d // ldr z13, [x10, #1, MUL VL] - WORD $0x9101014a // add x10, x10, #64 + WORD $0x042a504a // addvl x10, x10, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2291,7 +2315,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 7 to 3 outputs WORD $0x8580416b // ldr z11, [x11] WORD $0x8580456d // ldr z13, [x11, #1, MUL VL] - WORD $0x9101016b // add x11, x11, #64 + WORD $0x042b504b // addvl x11, x11, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2335,7 +2359,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 8 to 3 outputs WORD $0x8580418b // ldr z11, [x12] WORD $0x8580458d // ldr z13, [x12, #1, MUL VL] - WORD $0x9101018c // add x12, x12, #64 + WORD $0x042c504c // addvl x12, x12, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2379,7 +2403,7 @@ mulSve_10x3_64Xor_loop: // Load and process 64 bytes from input 9 to 3 outputs WORD $0x8580400b // ldr z11, [x0] WORD $0x8580440d // ldr z13, [x0, #1, MUL VL] - WORD $0x91010000 // add x0, x0, #64 + WORD $0x04205040 // addvl x0, x0, #2 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x0426316b // and z11.d, z11.d, z6.d @@ -2421,13 +2445,13 @@ mulSve_10x3_64Xor_store: // Store 3 outputs WORD $0xe58041c0 // str z0, [x14] WORD $0xe58045c1 // str z1, [x14, #1, MUL VL] - WORD $0x910101ce // add x14, x14, #64 + WORD $0x042e504e // addvl x14, x14, #2 WORD $0xe58041e2 // str z2, [x15] WORD $0xe58045e3 // str z3, [x15, #1, MUL VL] - WORD $0x910101ef // add x15, x15, #64 + WORD $0x042f504f // addvl x15, x15, #2 WORD $0xe58041a4 // str z4, [x13] WORD $0xe58045a5 // str z5, [x13, #1, MUL VL] - WORD $0x910101ad // add x13, x13, #64 + WORD $0x042d504d // addvl x13, x13, #2 // Prepare for next loop WORD $0xf10004c6 // subs x6, x6, #1 @@ -2446,6 +2470,9 @@ TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x4_end MOVD in_base+24(FP), R3 @@ -2480,11 +2507,13 @@ TEXT ·mulSve_10x4(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x4_loop: // Load and process 32 bytes from input 0 to 4 outputs WORD $0x85804027 // ldr z7, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2514,7 +2543,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 1 to 4 outputs WORD $0x85804087 // ldr z7, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2548,7 +2577,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 2 to 4 outputs WORD $0x858040a7 // ldr z7, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2582,7 +2611,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 3 to 4 outputs WORD $0x85804107 // ldr z7, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2616,7 +2645,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 4 to 4 outputs WORD $0x85804127 // ldr z7, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2650,7 +2679,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 5 to 4 outputs WORD $0x85804147 // ldr z7, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2684,7 +2713,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 6 to 4 outputs WORD $0x85804167 // ldr z7, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2718,7 +2747,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 7 to 4 outputs WORD $0x85804187 // ldr z7, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2752,7 +2781,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 8 to 4 outputs WORD $0x858041a7 // ldr z7, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2786,7 +2815,7 @@ mulSve_10x4_loop: // Load and process 32 bytes from input 9 to 4 outputs WORD $0x85804067 // ldr z7, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2827,7 +2856,7 @@ mulSve_10x4_store: WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x4_loop @@ -2844,6 +2873,9 @@ TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x4Xor_end MOVD in_base+24(FP), R3 @@ -2878,11 +2910,13 @@ TEXT ·mulSve_10x4Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs WORD $0x85804027 // ldr z7, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2924,7 +2958,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 1 to 4 outputs WORD $0x85804087 // ldr z7, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2958,7 +2992,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 2 to 4 outputs WORD $0x858040a7 // ldr z7, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -2992,7 +3026,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 3 to 4 outputs WORD $0x85804107 // ldr z7, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3026,7 +3060,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 4 to 4 outputs WORD $0x85804127 // ldr z7, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3060,7 +3094,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 5 to 4 outputs WORD $0x85804147 // ldr z7, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3094,7 +3128,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 6 to 4 outputs WORD $0x85804167 // ldr z7, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3128,7 +3162,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 7 to 4 outputs WORD $0x85804187 // ldr z7, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3162,7 +3196,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 8 to 4 outputs WORD $0x858041a7 // ldr z7, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3196,7 +3230,7 @@ mulSve_10x4Xor_loop: // Load and process 32 bytes from input 9 to 4 outputs WORD $0x85804067 // ldr z7, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc94e8 // lsr z8.d, z7.d, #4 WORD $0x042430e7 // and z7.d, z7.d, z4.d WORD $0x04243108 // and z8.d, z8.d, z4.d @@ -3237,7 +3271,7 @@ mulSve_10x4Xor_store: WORD $0xe5ef40c3 // st1d { z3.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x4Xor_loop @@ -3254,6 +3288,9 @@ TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x5_end MOVD in_base+24(FP), R3 @@ -3288,11 +3325,13 @@ TEXT ·mulSve_10x5(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x5_loop: // Load and process 32 bytes from input 0 to 5 outputs WORD $0x85804028 // ldr z8, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3327,7 +3366,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 1 to 5 outputs WORD $0x85804088 // ldr z8, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3367,7 +3406,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 2 to 5 outputs WORD $0x858040a8 // ldr z8, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3407,7 +3446,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 3 to 5 outputs WORD $0x85804108 // ldr z8, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3447,7 +3486,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 4 to 5 outputs WORD $0x85804128 // ldr z8, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3487,7 +3526,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 5 to 5 outputs WORD $0x85804148 // ldr z8, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3527,7 +3566,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 6 to 5 outputs WORD $0x85804168 // ldr z8, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3567,7 +3606,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 7 to 5 outputs WORD $0x85804188 // ldr z8, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3607,7 +3646,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 8 to 5 outputs WORD $0x858041a8 // ldr z8, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3647,7 +3686,7 @@ mulSve_10x5_loop: // Load and process 32 bytes from input 9 to 5 outputs WORD $0x85804068 // ldr z8, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3696,7 +3735,7 @@ mulSve_10x5_store: WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x5_loop @@ -3713,6 +3752,9 @@ TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x5Xor_end MOVD in_base+24(FP), R3 @@ -3747,11 +3789,13 @@ TEXT ·mulSve_10x5Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs WORD $0x85804028 // ldr z8, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3801,7 +3845,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 1 to 5 outputs WORD $0x85804088 // ldr z8, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3841,7 +3885,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 2 to 5 outputs WORD $0x858040a8 // ldr z8, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3881,7 +3925,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 3 to 5 outputs WORD $0x85804108 // ldr z8, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3921,7 +3965,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 4 to 5 outputs WORD $0x85804128 // ldr z8, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -3961,7 +4005,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 5 to 5 outputs WORD $0x85804148 // ldr z8, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4001,7 +4045,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 6 to 5 outputs WORD $0x85804168 // ldr z8, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4041,7 +4085,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 7 to 5 outputs WORD $0x85804188 // ldr z8, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4081,7 +4125,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 8 to 5 outputs WORD $0x858041a8 // ldr z8, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4121,7 +4165,7 @@ mulSve_10x5Xor_loop: // Load and process 32 bytes from input 9 to 5 outputs WORD $0x85804068 // ldr z8, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc9509 // lsr z9.d, z8.d, #4 WORD $0x04253108 // and z8.d, z8.d, z5.d WORD $0x04253129 // and z9.d, z9.d, z5.d @@ -4170,7 +4214,7 @@ mulSve_10x5Xor_store: WORD $0xe5ef40c4 // st1d { z4.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x5Xor_loop @@ -4187,6 +4231,9 @@ TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x6_end MOVD in_base+24(FP), R3 @@ -4221,11 +4268,13 @@ TEXT ·mulSve_10x6(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x6_loop: // Load and process 32 bytes from input 0 to 6 outputs WORD $0x85804029 // ldr z9, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4265,7 +4314,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 1 to 6 outputs WORD $0x85804089 // ldr z9, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4311,7 +4360,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 2 to 6 outputs WORD $0x858040a9 // ldr z9, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4357,7 +4406,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 3 to 6 outputs WORD $0x85804109 // ldr z9, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4403,7 +4452,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 4 to 6 outputs WORD $0x85804129 // ldr z9, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4449,7 +4498,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 5 to 6 outputs WORD $0x85804149 // ldr z9, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4495,7 +4544,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 6 to 6 outputs WORD $0x85804169 // ldr z9, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4541,7 +4590,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 7 to 6 outputs WORD $0x85804189 // ldr z9, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4587,7 +4636,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 8 to 6 outputs WORD $0x858041a9 // ldr z9, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4633,7 +4682,7 @@ mulSve_10x6_loop: // Load and process 32 bytes from input 9 to 6 outputs WORD $0x85804069 // ldr z9, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4690,7 +4739,7 @@ mulSve_10x6_store: WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x6_loop @@ -4707,6 +4756,9 @@ TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x6Xor_end MOVD in_base+24(FP), R3 @@ -4741,11 +4793,13 @@ TEXT ·mulSve_10x6Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs WORD $0x85804029 // ldr z9, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4803,7 +4857,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 1 to 6 outputs WORD $0x85804089 // ldr z9, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4849,7 +4903,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 2 to 6 outputs WORD $0x858040a9 // ldr z9, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4895,7 +4949,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 3 to 6 outputs WORD $0x85804109 // ldr z9, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4941,7 +4995,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 4 to 6 outputs WORD $0x85804129 // ldr z9, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -4987,7 +5041,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 5 to 6 outputs WORD $0x85804149 // ldr z9, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5033,7 +5087,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 6 to 6 outputs WORD $0x85804169 // ldr z9, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5079,7 +5133,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 7 to 6 outputs WORD $0x85804189 // ldr z9, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5125,7 +5179,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 8 to 6 outputs WORD $0x858041a9 // ldr z9, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5171,7 +5225,7 @@ mulSve_10x6Xor_loop: // Load and process 32 bytes from input 9 to 6 outputs WORD $0x85804069 // ldr z9, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc952a // lsr z10.d, z9.d, #4 WORD $0x04263129 // and z9.d, z9.d, z6.d WORD $0x0426314a // and z10.d, z10.d, z6.d @@ -5228,7 +5282,7 @@ mulSve_10x6Xor_store: WORD $0xe5ef40c5 // st1d { z5.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x6Xor_loop @@ -5245,6 +5299,9 @@ TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x7_end MOVD in_base+24(FP), R3 @@ -5279,11 +5336,13 @@ TEXT ·mulSve_10x7(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x7_loop: // Load and process 32 bytes from input 0 to 7 outputs WORD $0x8580402a // ldr z10, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5328,7 +5387,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 1 to 7 outputs WORD $0x8580408a // ldr z10, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5380,7 +5439,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 2 to 7 outputs WORD $0x858040aa // ldr z10, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5432,7 +5491,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 3 to 7 outputs WORD $0x8580410a // ldr z10, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5484,7 +5543,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 4 to 7 outputs WORD $0x8580412a // ldr z10, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5536,7 +5595,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 5 to 7 outputs WORD $0x8580414a // ldr z10, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5588,7 +5647,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 6 to 7 outputs WORD $0x8580416a // ldr z10, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5640,7 +5699,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 7 to 7 outputs WORD $0x8580418a // ldr z10, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5692,7 +5751,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 8 to 7 outputs WORD $0x858041aa // ldr z10, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5744,7 +5803,7 @@ mulSve_10x7_loop: // Load and process 32 bytes from input 9 to 7 outputs WORD $0x8580406a // ldr z10, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5809,7 +5868,7 @@ mulSve_10x7_store: WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x7_loop @@ -5826,6 +5885,9 @@ TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x7Xor_end MOVD in_base+24(FP), R3 @@ -5860,11 +5922,13 @@ TEXT ·mulSve_10x7Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs WORD $0x8580402a // ldr z10, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5930,7 +5994,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 1 to 7 outputs WORD $0x8580408a // ldr z10, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -5982,7 +6046,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 2 to 7 outputs WORD $0x858040aa // ldr z10, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6034,7 +6098,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 3 to 7 outputs WORD $0x8580410a // ldr z10, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6086,7 +6150,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 4 to 7 outputs WORD $0x8580412a // ldr z10, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6138,7 +6202,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 5 to 7 outputs WORD $0x8580414a // ldr z10, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6190,7 +6254,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 6 to 7 outputs WORD $0x8580416a // ldr z10, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6242,7 +6306,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 7 to 7 outputs WORD $0x8580418a // ldr z10, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6294,7 +6358,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 8 to 7 outputs WORD $0x858041aa // ldr z10, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6346,7 +6410,7 @@ mulSve_10x7Xor_loop: // Load and process 32 bytes from input 9 to 7 outputs WORD $0x8580406a // ldr z10, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc954b // lsr z11.d, z10.d, #4 WORD $0x0427314a // and z10.d, z10.d, z7.d WORD $0x0427316b // and z11.d, z11.d, z7.d @@ -6411,7 +6475,7 @@ mulSve_10x7Xor_store: WORD $0xe5ef40c6 // st1d { z6.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x7Xor_loop @@ -6428,6 +6492,9 @@ TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x8_end MOVD in_base+24(FP), R3 @@ -6462,11 +6529,13 @@ TEXT ·mulSve_10x8(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x8_loop: // Load and process 32 bytes from input 0 to 8 outputs WORD $0x8580402b // ldr z11, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6516,7 +6585,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 1 to 8 outputs WORD $0x8580408b // ldr z11, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6574,7 +6643,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 2 to 8 outputs WORD $0x858040ab // ldr z11, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6632,7 +6701,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 3 to 8 outputs WORD $0x8580410b // ldr z11, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6690,7 +6759,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 4 to 8 outputs WORD $0x8580412b // ldr z11, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6748,7 +6817,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 5 to 8 outputs WORD $0x8580414b // ldr z11, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6806,7 +6875,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 6 to 8 outputs WORD $0x8580416b // ldr z11, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6864,7 +6933,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 7 to 8 outputs WORD $0x8580418b // ldr z11, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6922,7 +6991,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 8 to 8 outputs WORD $0x858041ab // ldr z11, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -6980,7 +7049,7 @@ mulSve_10x8_loop: // Load and process 32 bytes from input 9 to 8 outputs WORD $0x8580406b // ldr z11, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7053,7 +7122,7 @@ mulSve_10x8_store: WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x8_loop @@ -7070,6 +7139,9 @@ TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x8Xor_end MOVD in_base+24(FP), R3 @@ -7104,11 +7176,13 @@ TEXT ·mulSve_10x8Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs WORD $0x8580402b // ldr z11, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7182,7 +7256,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 1 to 8 outputs WORD $0x8580408b // ldr z11, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7240,7 +7314,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 2 to 8 outputs WORD $0x858040ab // ldr z11, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7298,7 +7372,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 3 to 8 outputs WORD $0x8580410b // ldr z11, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7356,7 +7430,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 4 to 8 outputs WORD $0x8580412b // ldr z11, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7414,7 +7488,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 5 to 8 outputs WORD $0x8580414b // ldr z11, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7472,7 +7546,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 6 to 8 outputs WORD $0x8580416b // ldr z11, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7530,7 +7604,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 7 to 8 outputs WORD $0x8580418b // ldr z11, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7588,7 +7662,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 8 to 8 outputs WORD $0x858041ab // ldr z11, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7646,7 +7720,7 @@ mulSve_10x8Xor_loop: // Load and process 32 bytes from input 9 to 8 outputs WORD $0x8580406b // ldr z11, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc956c // lsr z12.d, z11.d, #4 WORD $0x0428316b // and z11.d, z11.d, z8.d WORD $0x0428318c // and z12.d, z12.d, z8.d @@ -7719,7 +7793,7 @@ mulSve_10x8Xor_store: WORD $0xe5ef40c7 // st1d { z7.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x8Xor_loop @@ -7736,6 +7810,9 @@ TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x9_end MOVD in_base+24(FP), R3 @@ -7770,11 +7847,13 @@ TEXT ·mulSve_10x9(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x9_loop: // Load and process 32 bytes from input 0 to 9 outputs WORD $0x8580402c // ldr z12, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -7829,7 +7908,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 1 to 9 outputs WORD $0x8580408c // ldr z12, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -7893,7 +7972,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 2 to 9 outputs WORD $0x858040ac // ldr z12, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -7957,7 +8036,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 3 to 9 outputs WORD $0x8580410c // ldr z12, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8021,7 +8100,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 4 to 9 outputs WORD $0x8580412c // ldr z12, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8085,7 +8164,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 5 to 9 outputs WORD $0x8580414c // ldr z12, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8149,7 +8228,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 6 to 9 outputs WORD $0x8580416c // ldr z12, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8213,7 +8292,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 7 to 9 outputs WORD $0x8580418c // ldr z12, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8277,7 +8356,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 8 to 9 outputs WORD $0x858041ac // ldr z12, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8341,7 +8420,7 @@ mulSve_10x9_loop: // Load and process 32 bytes from input 9 to 9 outputs WORD $0x8580406c // ldr z12, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8422,7 +8501,7 @@ mulSve_10x9_store: WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x9_loop @@ -8439,6 +8518,9 @@ TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x9Xor_end MOVD in_base+24(FP), R3 @@ -8473,11 +8555,13 @@ TEXT ·mulSve_10x9Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs WORD $0x8580402c // ldr z12, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8559,7 +8643,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 1 to 9 outputs WORD $0x8580408c // ldr z12, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8623,7 +8707,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 2 to 9 outputs WORD $0x858040ac // ldr z12, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8687,7 +8771,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 3 to 9 outputs WORD $0x8580410c // ldr z12, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8751,7 +8835,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 4 to 9 outputs WORD $0x8580412c // ldr z12, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8815,7 +8899,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 5 to 9 outputs WORD $0x8580414c // ldr z12, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8879,7 +8963,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 6 to 9 outputs WORD $0x8580416c // ldr z12, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -8943,7 +9027,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 7 to 9 outputs WORD $0x8580418c // ldr z12, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -9007,7 +9091,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 8 to 9 outputs WORD $0x858041ac // ldr z12, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -9071,7 +9155,7 @@ mulSve_10x9Xor_loop: // Load and process 32 bytes from input 9 to 9 outputs WORD $0x8580406c // ldr z12, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc958d // lsr z13.d, z12.d, #4 WORD $0x0429318c // and z12.d, z12.d, z9.d WORD $0x042931ad // and z13.d, z13.d, z9.d @@ -9152,7 +9236,7 @@ mulSve_10x9Xor_store: WORD $0xe5ef40c8 // st1d { z8.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x9Xor_loop @@ -9169,6 +9253,9 @@ TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x10_end MOVD in_base+24(FP), R3 @@ -9203,11 +9290,13 @@ TEXT ·mulSve_10x10(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x10_loop: // Load and process 32 bytes from input 0 to 10 outputs WORD $0x8580402d // ldr z13, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9267,7 +9356,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 1 to 10 outputs WORD $0x8580408d // ldr z13, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9337,7 +9426,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 2 to 10 outputs WORD $0x858040ad // ldr z13, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9407,7 +9496,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 3 to 10 outputs WORD $0x8580410d // ldr z13, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9477,7 +9566,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 4 to 10 outputs WORD $0x8580412d // ldr z13, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9547,7 +9636,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 5 to 10 outputs WORD $0x8580414d // ldr z13, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9617,7 +9706,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 6 to 10 outputs WORD $0x8580416d // ldr z13, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9687,7 +9776,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 7 to 10 outputs WORD $0x8580418d // ldr z13, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9757,7 +9846,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 8 to 10 outputs WORD $0x858041ad // ldr z13, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9827,7 +9916,7 @@ mulSve_10x10_loop: // Load and process 32 bytes from input 9 to 10 outputs WORD $0x8580406d // ldr z13, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -9916,7 +10005,7 @@ mulSve_10x10_store: WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x10_loop @@ -9933,6 +10022,9 @@ TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 MOVD n+80(FP), R0 MOVD matrix_base+0(FP), R2 WORD $0xd345fc00 // lsr x0, x0, #5 + WORD $0xd37be800 // lsl x0, x0, #5 + WORD $0x04bf5030 // rdvl x16, #1 + WORD $0x9ad00800 // udiv x0, x0, x16 WORD $0xea00001f // tst x0, x0 BEQ mulSve_10x10Xor_end MOVD in_base+24(FP), R3 @@ -9967,11 +10059,13 @@ TEXT ·mulSve_10x10Xor(SB), NOSPLIT, $8-88 // Load number of input shards MOVD in_len+32(FP), R16 + WORD $0x04bf5031 // rdvl x17, #1 + WORD $0xd343fe31 // lsr x17, x17, #3 mulSve_10x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs WORD $0x8580402d // ldr z13, [x1] - WORD $0x91008021 // add x1, x1, #32 + WORD $0x04215021 // addvl x1, x1, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10061,7 +10155,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 1 to 10 outputs WORD $0x8580408d // ldr z13, [x4] - WORD $0x91008084 // add x4, x4, #32 + WORD $0x04245024 // addvl x4, x4, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10131,7 +10225,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 2 to 10 outputs WORD $0x858040ad // ldr z13, [x5] - WORD $0x910080a5 // add x5, x5, #32 + WORD $0x04255025 // addvl x5, x5, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10201,7 +10295,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 3 to 10 outputs WORD $0x8580410d // ldr z13, [x8] - WORD $0x91008108 // add x8, x8, #32 + WORD $0x04285028 // addvl x8, x8, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10271,7 +10365,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 4 to 10 outputs WORD $0x8580412d // ldr z13, [x9] - WORD $0x91008129 // add x9, x9, #32 + WORD $0x04295029 // addvl x9, x9, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10341,7 +10435,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 5 to 10 outputs WORD $0x8580414d // ldr z13, [x10] - WORD $0x9100814a // add x10, x10, #32 + WORD $0x042a502a // addvl x10, x10, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10411,7 +10505,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 6 to 10 outputs WORD $0x8580416d // ldr z13, [x11] - WORD $0x9100816b // add x11, x11, #32 + WORD $0x042b502b // addvl x11, x11, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10481,7 +10575,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 7 to 10 outputs WORD $0x8580418d // ldr z13, [x12] - WORD $0x9100818c // add x12, x12, #32 + WORD $0x042c502c // addvl x12, x12, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10551,7 +10645,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 8 to 10 outputs WORD $0x858041ad // ldr z13, [x13] - WORD $0x910081ad // add x13, x13, #32 + WORD $0x042d502d // addvl x13, x13, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10621,7 +10715,7 @@ mulSve_10x10Xor_loop: // Load and process 32 bytes from input 9 to 10 outputs WORD $0x8580406d // ldr z13, [x3] - WORD $0x91008063 // add x3, x3, #32 + WORD $0x04235023 // addvl x3, x3, #1 WORD $0x04fc95ae // lsr z14.d, z13.d, #4 WORD $0x042a31ad // and z13.d, z13.d, z10.d WORD $0x042a31ce // and z14.d, z14.d, z10.d @@ -10710,7 +10804,7 @@ mulSve_10x10Xor_store: WORD $0xe5ef40c9 // st1d { z9.d }, p0, [x6, x15, lsl #3] // Prepare for next loop - WORD $0x910011ef // add x15, x15, #4 + WORD $0x8b1101ef // add x15, x15, x17 WORD $0xf1000400 // subs x0, x0, #1 BNE mulSve_10x10Xor_loop diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go index d4f46ea2d..f9c36e296 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_amd64.go @@ -43,8 +43,14 @@ func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(ma inputs <= codeGenMaxInputs && outputs <= codeGenMaxOutputs } -func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := stop - start +func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } switch len(in) { case 1: @@ -381,8 +387,14 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } -func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } switch len(in) { case 1: @@ -722,6 +734,11 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (64 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { @@ -1060,6 +1077,11 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (64 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { @@ -1398,6 +1420,11 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (32 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { @@ -1736,6 +1763,11 @@ func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (32 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go index ff2541b8e..656e06213 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_arm64.go @@ -38,9 +38,15 @@ func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) (_, _ *func(ma } // galMulSlicesSve -func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int { - n := stop - start +func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } // fmt.Println(len(in), len(out)) switch len(out) { case 1: @@ -78,8 +84,15 @@ func galMulSlicesSve(matrix []byte, in, out [][]byte, start, stop int) int { } // galMulSlicesSveXor -func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) +func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = (stop - start) + + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } switch len(out) { case 1: @@ -117,8 +130,14 @@ func galMulSlicesSveXor(matrix []byte, in, out [][]byte, start, stop int) int { } // galMulSlicesNeon -func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int { - n := stop - start +func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = stop - start + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } switch len(out) { case 1: @@ -156,9 +175,14 @@ func galMulSlicesNeon(matrix []byte, in, out [][]byte, start, stop int) int { } // galMulSlicesNeonXor -func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) - +func galMulSlicesNeonXor(matrix []byte, in, out [][]byte, start, stop int) (n int) { + n = (stop - start) + if raceEnabled { + defer func() { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + }() + } switch len(out) { case 1: mulNeon_10x1_64Xor(matrix, in, out, start, n) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go index 66bab8a0b..3ac349d3f 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_switch_nopshufb_amd64.go @@ -45,6 +45,11 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (64 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { @@ -383,6 +388,11 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (64 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { @@ -721,6 +731,11 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (32 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { @@ -1059,6 +1074,11 @@ func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { n := (stop - start) & (maxInt - (32 - 1)) + if raceEnabled { + raceReadSlices(in, start, n) + raceWriteSlices(out, start, n) + } + switch len(in) { case 1: switch len(out) { diff --git a/vendor/github.com/klauspost/reedsolomon/options.go b/vendor/github.com/klauspost/reedsolomon/options.go index 377137ef5..cde255564 100644 --- a/vendor/github.com/klauspost/reedsolomon/options.go +++ b/vendor/github.com/klauspost/reedsolomon/options.go @@ -24,6 +24,7 @@ type options struct { useSSE2, useNEON, useSVE bool + vectorLength int useJerasureMatrix bool usePAR1Matrix bool @@ -55,6 +56,7 @@ var defaultOptions = options{ useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI), useNEON: cpuid.CPU.Supports(cpuid.ASIMD), useSVE: cpuid.CPU.Supports(cpuid.SVE), + vectorLength: 32, // default vector length is 32 bytes (256 bits) for AVX2 code gen } // leopardMode controls the use of leopard GF in encoding and decoding. diff --git a/vendor/github.com/klauspost/reedsolomon/race.go b/vendor/github.com/klauspost/reedsolomon/race.go new file mode 100644 index 000000000..4f2c0b693 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/race.go @@ -0,0 +1,61 @@ +// Copyright (c) 2024+ Klaus Post. See LICENSE for license + +//go:build race + +package reedsolomon + +import ( + "runtime" + "unsafe" +) + +const raceEnabled = true + +func raceReadSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} + +func raceWriteSlice[T any](s []T) { + if len(s) == 0 { + return + } + runtime.RaceWriteRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) +} + +func raceReadSlices[T any](s [][]T, start, n int) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) + for _, v := range s { + if len(v) == 0 { + continue + } + n := n + if n < 0 { + n = len(v) - start + } + runtime.RaceReadRange(unsafe.Pointer(&v[start]), n*int(unsafe.Sizeof(v[0]))) + } +} + +func raceWriteSlices[T any](s [][]T, start, n int) { + if len(s) == 0 { + return + } + runtime.RaceReadRange(unsafe.Pointer(&s[0]), len(s)*int(unsafe.Sizeof(s[0]))) + + for _, v := range s { + if len(v) == 0 { + continue + } + n := n + if n < 0 { + n = len(v) - start + } + runtime.RaceWriteRange(unsafe.Pointer(&v[start]), n*int(unsafe.Sizeof(v[0]))) + } +} diff --git a/vendor/github.com/klauspost/reedsolomon/race_none.go b/vendor/github.com/klauspost/reedsolomon/race_none.go new file mode 100644 index 000000000..c7d05f287 --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/race_none.go @@ -0,0 +1,17 @@ +// Copyright (c) 2024+ Klaus Post. See LICENSE for license + +//go:build !race + +package reedsolomon + +const raceEnabled = false + +func raceReadSlice[T any](s []T) { +} + +func raceWriteSlice[T any](s []T) { +} + +func raceReadSlices[T any](s [][]T, start, n int) {} + +func raceWriteSlices[T any](s [][]T, start, n int) {} diff --git a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go index 3b6f5b785..443543f5d 100644 --- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go +++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go @@ -833,7 +833,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC start += (*galMulGFNI)(m, inputs, outputs, 0, byteCount) end = len(inputs[0]) } else if galMulGen, _, ok := r.hasCodeGen(byteCount, len(inputs), len(outputs)); ok { - m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + m := genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice()) start += (*galMulGen)(m, inputs, outputs, 0, byteCount) r.putTmpSlice(m) end = len(inputs[0]) @@ -864,7 +864,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC start = (*galMulGFNIXor)(m, inPer, outPer, 0, byteCount) } } else { - m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) + m = genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, m) if inIdx == 0 { start = (*galMulGen)(m, inPer, outPer, 0, byteCount) } else { @@ -914,7 +914,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte var tmp [codeGenMaxInputs * codeGenMaxOutputs]uint64 gfniMatrix = genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), tmp[:]) } else if useCodeGen { - genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) + genMatrix = genCodeGenMatrix(matrixRows, len(inputs), 0, len(outputs), r.o.vectorLength, r.getTmpSlice()) defer r.putTmpSlice(genMatrix) } else if galMulGFNI, galMulGFNIXor, useGFNI := r.canGFNI(byteCount/4, codeGenMaxInputs, codeGenMaxOutputs); useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > codeGenMinShards { @@ -1025,7 +1025,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b outPer = outPer[:codeGenMaxOutputs] } // Generate local matrix - m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp) tmp = tmp[len(m):] plan = append(plan, state{ input: inPer, @@ -1056,7 +1056,7 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b inPer = inPer[:codeGenMaxInputs] } // Generate local matrix - m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + m := genCodeGenMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), r.o.vectorLength, tmp) tmp = tmp[len(m):] //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) plan = append(plan, state{ diff --git a/vendor/github.com/klauspost/reedsolomon/xor_arm64.go b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go index 6f0522f88..ffda8884c 100644 --- a/vendor/github.com/klauspost/reedsolomon/xor_arm64.go +++ b/vendor/github.com/klauspost/reedsolomon/xor_arm64.go @@ -7,8 +7,12 @@ func xorSliceNEON(in, out []byte) // simple slice xor func sliceXor(in, out []byte, o *options) { - xorSliceNEON(in, out) done := (len(in) >> 5) << 5 + if raceEnabled { + raceWriteSlice(out[:done]) + raceReadSlice(in[:done]) + } + xorSliceNEON(in, out) remain := len(in) - done if remain > 0 { diff --git a/vendor/github.com/xtaci/kcp-go/v5/fec.go b/vendor/github.com/xtaci/kcp-go/v5/fec.go index 523bda2c1..5bd9c51f8 100644 --- a/vendor/github.com/xtaci/kcp-go/v5/fec.go +++ b/vendor/github.com/xtaci/kcp-go/v5/fec.go @@ -187,10 +187,13 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) { } // shard range for current packet + // NOTE: the shard sequence number starts at 0, so we can use mod operation + // to find the beginning of the current shard. + // ALWAYS ALIGNED TO 0 shardBegin := pkt.seqid() - pkt.seqid()%uint32(dec.shardSize) shardEnd := shardBegin + uint32(dec.shardSize) - 1 - // max search range in ordered queue for current shard + // Define max search range in ordered queue for current shard searchBegin := insertIdx - int(pkt.seqid()%uint32(dec.shardSize)) if searchBegin < 0 { searchBegin = 0 @@ -200,11 +203,12 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) { searchEnd = len(dec.rx) - 1 } - // re-construct datashards + // check if we have enough shards to recover, if so, we can recover the data and free the shards + // if not, we can keep the shards in memory for future recovery. if searchEnd-searchBegin+1 >= dec.dataShards { var numshard, numDataShard, first, maxlen int - // zero caches + // zero working set for decoding shards := dec.decodeCache shardsflag := dec.flagCache for k := range dec.decodeCache { @@ -212,9 +216,10 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) { shardsflag[k] = false } - // shard assembly + // lookup shards in range [searchBegin, searchEnd] to the working set for i := searchBegin; i <= searchEnd; i++ { seqid := dec.rx[i].seqid() + // the shard seqid must be in [shardBegin, shardEnd], i.e. the current FEC group if _itimediff(seqid, shardEnd) > 0 { break } else if _itimediff(seqid, shardBegin) >= 0 { @@ -233,20 +238,23 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) { } } + // case 1: if there's no loss on data shards if numDataShard == dec.dataShards { - // case 1: no loss on data shards dec.rx = dec.freeRange(first, numshard, dec.rx) - } else if numshard >= dec.dataShards { - // case 2: loss on data shards, but it's recoverable from parity shards + } else if numshard >= dec.dataShards { // case 2: loss on data shards, but it's recoverable from parity shards + // make the bytes length of each shard equal for k := range shards { if shards[k] != nil { dlen := len(shards[k]) shards[k] = shards[k][:maxlen] clear(shards[k][dlen:]) } else if k < dec.dataShards { + // prepare memory for the data recovery shards[k] = xmitBuf.Get().([]byte)[:0] } } + + // Reed-Solomon recovery if err := dec.codec.ReconstructData(shards); err == nil { for k := range shards[:dec.dataShards] { if !shardsflag[k] { @@ -255,19 +263,22 @@ func (dec *fecDecoder) decode(in fecPacket) (recovered [][]byte) { } } } + + // Free the shards in FIFO immediately dec.rx = dec.freeRange(first, numshard, dec.rx) } } - // keep rxlimit + // keep rxlimit in FIFO order if len(dec.rx) > dec.rxlimit { - if dec.rx[0].flag() == typeData { // track the unrecoverable data + if dec.rx[0].flag() == typeData { + // track the effectiveness of FEC atomic.AddUint64(&DefaultSnmp.FECShortShards, 1) } dec.rx = dec.freeRange(0, 1, dec.rx) } - // timeout policy + // FIFO timeout policy current := currentMs() numExpired := 0 for k := range dec.rx { @@ -289,9 +300,12 @@ func (dec *fecDecoder) freeRange(first, n int, q []fecElement) []fecElement { xmitBuf.Put([]byte(q[i].fecPacket)) } + // if n is small, we can avoid the copy if first == 0 && n < cap(q)/2 { return q[n:] } + + // on the other hand, we shift the tail copy(q[first:], q[first+n:]) return q[:len(q)-n] } @@ -375,7 +389,7 @@ func (enc *fecEncoder) encode(b []byte, rto uint32) (ps [][]byte) { enc.maxSize = sz } - // Generation of Reed-Solomon Erasure Code + // Generation of Reed-Solomon Erasure Code now := time.Now().UnixMilli() if enc.shardCount == enc.dataShards { // generate the rs-code only if the data is continuous. @@ -400,6 +414,10 @@ func (enc *fecEncoder) encode(b []byte, rto uint32) (ps [][]byte) { enc.markParity(ps[k][enc.headerOffset:]) ps[k] = ps[k][:enc.maxSize] } + } else { + // record the error, and still keep the seqid monotonic increasing + atomic.AddUint64(&DefaultSnmp.FECErrs, 1) + enc.next = (enc.next + uint32(enc.parityShards)) % enc.paws } } else { // through we do not send non-continuous parity shard, we still increase the next value @@ -417,6 +435,7 @@ func (enc *fecEncoder) encode(b []byte, rto uint32) (ps [][]byte) { return } +// put a stamp on the FEC packet header with seqid and type func (enc *fecEncoder) markData(data []byte) { binary.LittleEndian.PutUint32(data, enc.next) binary.LittleEndian.PutUint16(data[4:], typeData) diff --git a/vendor/github.com/xtaci/kcp-go/v5/sess.go b/vendor/github.com/xtaci/kcp-go/v5/sess.go index 9e055b8f6..05122cab7 100644 --- a/vendor/github.com/xtaci/kcp-go/v5/sess.go +++ b/vendor/github.com/xtaci/kcp-go/v5/sess.go @@ -498,7 +498,7 @@ func (s *UDPSession) SetMtu(mtu int) bool { return true } -// SetStreamMode toggles the stream mode on/off +// Deprecated: toggles the stream mode on/off func (s *UDPSession) SetStreamMode(enable bool) { s.mu.Lock() defer s.mu.Unlock() @@ -1035,7 +1035,10 @@ func (l *Listener) Accept() (net.Conn, error) { func (l *Listener) AcceptKCP() (*UDPSession, error) { var timeout <-chan time.Time if tdeadline, ok := l.rd.Load().(time.Time); ok && !tdeadline.IsZero() { - timeout = time.After(time.Until(tdeadline)) + timer := time.NewTimer(time.Until(tdeadline)) + defer timer.Stop() + + timeout = timer.C } select { diff --git a/vendor/github.com/xtaci/kcp-go/v5/timedsched.go b/vendor/github.com/xtaci/kcp-go/v5/timedsched.go index 187ba7d4f..2e7dad665 100644 --- a/vendor/github.com/xtaci/kcp-go/v5/timedsched.go +++ b/vendor/github.com/xtaci/kcp-go/v5/timedsched.go @@ -83,8 +83,10 @@ func NewTimedSched(parallel int) *TimedSched { // sched is a goroutine to schedule and execute timed tasks. func (ts *TimedSched) sched() { - var tasks timedFuncHeap timer := time.NewTimer(0) + defer timer.Stop() + + var tasks timedFuncHeap drained := false for { select { diff --git a/vendor/github.com/xtaci/smux/session.go b/vendor/github.com/xtaci/smux/session.go index 59d13783e..d5b4c5ad6 100644 --- a/vendor/github.com/xtaci/smux/session.go +++ b/vendor/github.com/xtaci/smux/session.go @@ -576,7 +576,10 @@ func (s *Session) sendLoop() { // writeControlFrame writes the control frame to the underlying connection // and returns the number of bytes written if successful func (s *Session) writeControlFrame(f Frame) (n int, err error) { - return s.writeFrameInternal(f, time.After(openCloseTimeout), CLSCTRL) + timer := time.NewTimer(openCloseTimeout) + defer timer.Stop() + + return s.writeFrameInternal(f, timer.C, CLSCTRL) } // internal writeFrame version to support deadline used in keepalive diff --git a/vendor/github.com/xtaci/smux/stream.go b/vendor/github.com/xtaci/smux/stream.go index ed177419c..653a27513 100644 --- a/vendor/github.com/xtaci/smux/stream.go +++ b/vendor/github.com/xtaci/smux/stream.go @@ -494,7 +494,11 @@ func (s *stream) Close() error { if once { // send FIN in order f := newFrame(byte(s.sess.config.Version), cmdFIN, s.id) - _, err = s.sess.writeFrameInternal(f, time.After(openCloseTimeout), CLSDATA) + + timer := time.NewTimer(openCloseTimeout) + defer timer.Stop() + + _, err = s.sess.writeFrameInternal(f, timer.C, CLSDATA) s.sess.streamClosed(s.id) return err } else { diff --git a/vendor/modules.txt b/vendor/modules.txt index 98f06871c..5de51ef31 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -17,8 +17,8 @@ github.com/google/gopacket/layers # github.com/klauspost/cpuid/v2 v2.2.8 ## explicit; go 1.15 github.com/klauspost/cpuid/v2 -# github.com/klauspost/reedsolomon v1.12.3 -## explicit; go 1.18 +# github.com/klauspost/reedsolomon v1.12.4 +## explicit; go 1.21 github.com/klauspost/reedsolomon # github.com/mattn/go-colorable v0.1.13 ## explicit; go 1.15 @@ -44,13 +44,13 @@ github.com/tjfoc/gmsm/sm4 # github.com/urfave/cli v1.22.15 ## explicit; go 1.11 github.com/urfave/cli -# github.com/xtaci/kcp-go/v5 v5.6.17 +# github.com/xtaci/kcp-go/v5 v5.6.18 ## explicit; go 1.21 github.com/xtaci/kcp-go/v5 # github.com/xtaci/qpp v1.1.17 ## explicit; go 1.22.3 github.com/xtaci/qpp -# github.com/xtaci/smux v1.5.30 +# github.com/xtaci/smux v1.5.31 ## explicit; go 1.13 github.com/xtaci/smux # github.com/xtaci/tcpraw v1.2.31 @@ -67,7 +67,7 @@ golang.org/x/crypto/salsa20/salsa golang.org/x/crypto/tea golang.org/x/crypto/twofish golang.org/x/crypto/xtea -# golang.org/x/net v0.28.0 +# golang.org/x/net v0.29.0 ## explicit; go 1.18 golang.org/x/net/bpf golang.org/x/net/internal/iana