Skip to content

Commit 53c5047

Browse files
authored
perf: adds avx512 poseidon2 for small fields (#665)
1 parent fbb0872 commit 53c5047

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed: +6098 -1830 lines changed

ecc/bls12-381/internal/fptower/e2_amd64.s

+24 -240
Original file line number | Diff line number | Diff line change
@@ -557,73 +557,19 @@ TEXT ·squareAdxE2(SB), $48-16
557557
// t[3] -> R11
558558
// t[4] -> R12
559559
// t[5] -> R13
560-
#define MACC_0(in0, in1, in2) \
561-
ADCXQ in0, in1 \
562-
MULXQ in2, AX, in0 \
563-
ADOXQ AX, in1 \
564-
565-
#define DIV_SHIFT_0() \
566-
PUSHQ BP \
567-
MOVQ $const_qInvNeg, DX \
568-
IMULQ R8, DX \
569-
XORQ AX, AX \
570-
MULXQ ·qElement+0(SB), AX, BP \
571-
ADCXQ R8, AX \
572-
MOVQ BP, R8 \
573-
POPQ BP \
574-
MACC_0(R9, R8, ·qElement+8(SB)) \
575-
MACC_0(R10, R9, ·qElement+16(SB)) \
576-
MACC_0(R11, R10, ·qElement+24(SB)) \
577-
MACC_0(R12, R11, ·qElement+32(SB)) \
578-
MACC_0(R13, R12, ·qElement+40(SB)) \
579-
MOVQ $0, AX \
580-
ADCXQ AX, R13 \
581-
ADOXQ BP, R13 \
582-
583-
#define MUL_WORD_0_0() \
584-
XORQ AX, AX \
585-
MULXQ R14, R8, R9 \
586-
MULXQ R15, AX, R10 \
587-
ADOXQ AX, R9 \
588-
MULXQ CX, AX, R11 \
589-
ADOXQ AX, R10 \
590-
MULXQ BX, AX, R12 \
591-
ADOXQ AX, R11 \
592-
MULXQ SI, AX, R13 \
593-
ADOXQ AX, R12 \
594-
MULXQ DI, AX, BP \
595-
ADOXQ AX, R13 \
596-
MOVQ $0, AX \
597-
ADOXQ AX, BP \
598-
DIV_SHIFT_0() \
599-
600-
#define MUL_WORD_N_0() \
601-
XORQ AX, AX \
602-
MULXQ R14, AX, BP \
603-
ADOXQ AX, R8 \
604-
MACC_0(BP, R9, R15) \
605-
MACC_0(BP, R10, CX) \
606-
MACC_0(BP, R11, BX) \
607-
MACC_0(BP, R12, SI) \
608-
MACC_0(BP, R13, DI) \
609-
MOVQ $0, AX \
610-
ADCXQ AX, BP \
611-
ADOXQ AX, BP \
612-
DIV_SHIFT_0() \
613-
614560
// mul body
615561
MOVQ s0-8(SP), DX
616-
MUL_WORD_0_0()
562+
MUL_WORD_0()
617563
MOVQ s1-16(SP), DX
618-
MUL_WORD_N_0()
564+
MUL_WORD_N()
619565
MOVQ s2-24(SP), DX
620-
MUL_WORD_N_0()
566+
MUL_WORD_N()
621567
MOVQ s3-32(SP), DX
622-
MUL_WORD_N_0()
568+
MUL_WORD_N()
623569
MOVQ s4-40(SP), DX
624-
MUL_WORD_N_0()
570+
MUL_WORD_N()
625571
MOVQ s5-48(SP), DX
626-
MUL_WORD_N_0()
572+
MUL_WORD_N()
627573

628574
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
629575
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)
@@ -674,79 +620,25 @@ TEXT ·mulAdxE2(SB), $96-24
674620
// t[3] -> R11
675621
// t[4] -> R12
676622
// t[5] -> R13
677-
#define MACC_1(in0, in1, in2) \
678-
ADCXQ in0, in1 \
679-
MULXQ in2, AX, in0 \
680-
ADOXQ AX, in1 \
681-
682-
#define DIV_SHIFT_1() \
683-
PUSHQ BP \
684-
MOVQ $const_qInvNeg, DX \
685-
IMULQ R8, DX \
686-
XORQ AX, AX \
687-
MULXQ ·qElement+0(SB), AX, BP \
688-
ADCXQ R8, AX \
689-
MOVQ BP, R8 \
690-
POPQ BP \
691-
MACC_1(R9, R8, ·qElement+8(SB)) \
692-
MACC_1(R10, R9, ·qElement+16(SB)) \
693-
MACC_1(R11, R10, ·qElement+24(SB)) \
694-
MACC_1(R12, R11, ·qElement+32(SB)) \
695-
MACC_1(R13, R12, ·qElement+40(SB)) \
696-
MOVQ $0, AX \
697-
ADCXQ AX, R13 \
698-
ADOXQ BP, R13 \
699-
700-
#define MUL_WORD_0_1() \
701-
XORQ AX, AX \
702-
MULXQ R14, R8, R9 \
703-
MULXQ R15, AX, R10 \
704-
ADOXQ AX, R9 \
705-
MULXQ CX, AX, R11 \
706-
ADOXQ AX, R10 \
707-
MULXQ BX, AX, R12 \
708-
ADOXQ AX, R11 \
709-
MULXQ SI, AX, R13 \
710-
ADOXQ AX, R12 \
711-
MULXQ DI, AX, BP \
712-
ADOXQ AX, R13 \
713-
MOVQ $0, AX \
714-
ADOXQ AX, BP \
715-
DIV_SHIFT_1() \
716-
717-
#define MUL_WORD_N_1() \
718-
XORQ AX, AX \
719-
MULXQ R14, AX, BP \
720-
ADOXQ AX, R8 \
721-
MACC_1(BP, R9, R15) \
722-
MACC_1(BP, R10, CX) \
723-
MACC_1(BP, R11, BX) \
724-
MACC_1(BP, R12, SI) \
725-
MACC_1(BP, R13, DI) \
726-
MOVQ $0, AX \
727-
ADCXQ AX, BP \
728-
ADOXQ AX, BP \
729-
DIV_SHIFT_1() \
730-
731623
// mul body
732624
MOVQ y+16(FP), DX
733625
MOVQ 48(DX), DX
734-
MUL_WORD_0_1()
626+
MUL_WORD_0()
735627
MOVQ y+16(FP), DX
736628
MOVQ 56(DX), DX
737-
MUL_WORD_N_1()
629+
MUL_WORD_N()
738630
MOVQ y+16(FP), DX
739631
MOVQ 64(DX), DX
740-
MUL_WORD_N_1()
632+
MUL_WORD_N()
741633
MOVQ y+16(FP), DX
742634
MOVQ 72(DX), DX
743-
MUL_WORD_N_1()
635+
MUL_WORD_N()
744636
MOVQ y+16(FP), DX
745637
MOVQ 80(DX), DX
746-
MUL_WORD_N_1()
638+
MUL_WORD_N()
747639
MOVQ y+16(FP), DX
748640
MOVQ 88(DX), DX
749-
MUL_WORD_N_1()
641+
MUL_WORD_N()
750642

751643
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
752644
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)
@@ -797,73 +689,19 @@ TEXT ·mulAdxE2(SB), $96-24
797689
// t[3] -> R11
798690
// t[4] -> R12
799691
// t[5] -> R13
800-
#define MACC_2(in0, in1, in2) \
801-
ADCXQ in0, in1 \
802-
MULXQ in2, AX, in0 \
803-
ADOXQ AX, in1 \
804-
805-
#define DIV_SHIFT_2() \
806-
PUSHQ BP \
807-
MOVQ $const_qInvNeg, DX \
808-
IMULQ R8, DX \
809-
XORQ AX, AX \
810-
MULXQ ·qElement+0(SB), AX, BP \
811-
ADCXQ R8, AX \
812-
MOVQ BP, R8 \
813-
POPQ BP \
814-
MACC_2(R9, R8, ·qElement+8(SB)) \
815-
MACC_2(R10, R9, ·qElement+16(SB)) \
816-
MACC_2(R11, R10, ·qElement+24(SB)) \
817-
MACC_2(R12, R11, ·qElement+32(SB)) \
818-
MACC_2(R13, R12, ·qElement+40(SB)) \
819-
MOVQ $0, AX \
820-
ADCXQ AX, R13 \
821-
ADOXQ BP, R13 \
822-
823-
#define MUL_WORD_0_2() \
824-
XORQ AX, AX \
825-
MULXQ R14, R8, R9 \
826-
MULXQ R15, AX, R10 \
827-
ADOXQ AX, R9 \
828-
MULXQ CX, AX, R11 \
829-
ADOXQ AX, R10 \
830-
MULXQ BX, AX, R12 \
831-
ADOXQ AX, R11 \
832-
MULXQ SI, AX, R13 \
833-
ADOXQ AX, R12 \
834-
MULXQ DI, AX, BP \
835-
ADOXQ AX, R13 \
836-
MOVQ $0, AX \
837-
ADOXQ AX, BP \
838-
DIV_SHIFT_2() \
839-
840-
#define MUL_WORD_N_2() \
841-
XORQ AX, AX \
842-
MULXQ R14, AX, BP \
843-
ADOXQ AX, R8 \
844-
MACC_2(BP, R9, R15) \
845-
MACC_2(BP, R10, CX) \
846-
MACC_2(BP, R11, BX) \
847-
MACC_2(BP, R12, SI) \
848-
MACC_2(BP, R13, DI) \
849-
MOVQ $0, AX \
850-
ADCXQ AX, BP \
851-
ADOXQ AX, BP \
852-
DIV_SHIFT_2() \
853-
854692
// mul body
855693
MOVQ s0-8(SP), DX
856-
MUL_WORD_0_2()
694+
MUL_WORD_0()
857695
MOVQ s1-16(SP), DX
858-
MUL_WORD_N_2()
696+
MUL_WORD_N()
859697
MOVQ s2-24(SP), DX
860-
MUL_WORD_N_2()
698+
MUL_WORD_N()
861699
MOVQ s3-32(SP), DX
862-
MUL_WORD_N_2()
700+
MUL_WORD_N()
863701
MOVQ s4-40(SP), DX
864-
MUL_WORD_N_2()
702+
MUL_WORD_N()
865703
MOVQ s5-48(SP), DX
866-
MUL_WORD_N_2()
704+
MUL_WORD_N()
867705

868706
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
869707
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)
@@ -889,79 +727,25 @@ TEXT ·mulAdxE2(SB), $96-24
889727
// t[3] -> R11
890728
// t[4] -> R12
891729
// t[5] -> R13
892-
#define MACC_3(in0, in1, in2) \
893-
ADCXQ in0, in1 \
894-
MULXQ in2, AX, in0 \
895-
ADOXQ AX, in1 \
896-
897-
#define DIV_SHIFT_3() \
898-
PUSHQ BP \
899-
MOVQ $const_qInvNeg, DX \
900-
IMULQ R8, DX \
901-
XORQ AX, AX \
902-
MULXQ ·qElement+0(SB), AX, BP \
903-
ADCXQ R8, AX \
904-
MOVQ BP, R8 \
905-
POPQ BP \
906-
MACC_3(R9, R8, ·qElement+8(SB)) \
907-
MACC_3(R10, R9, ·qElement+16(SB)) \
908-
MACC_3(R11, R10, ·qElement+24(SB)) \
909-
MACC_3(R12, R11, ·qElement+32(SB)) \
910-
MACC_3(R13, R12, ·qElement+40(SB)) \
911-
MOVQ $0, AX \
912-
ADCXQ AX, R13 \
913-
ADOXQ BP, R13 \
914-
915-
#define MUL_WORD_0_3() \
916-
XORQ AX, AX \
917-
MULXQ R14, R8, R9 \
918-
MULXQ R15, AX, R10 \
919-
ADOXQ AX, R9 \
920-
MULXQ CX, AX, R11 \
921-
ADOXQ AX, R10 \
922-
MULXQ BX, AX, R12 \
923-
ADOXQ AX, R11 \
924-
MULXQ SI, AX, R13 \
925-
ADOXQ AX, R12 \
926-
MULXQ DI, AX, BP \
927-
ADOXQ AX, R13 \
928-
MOVQ $0, AX \
929-
ADOXQ AX, BP \
930-
DIV_SHIFT_3() \
931-
932-
#define MUL_WORD_N_3() \
933-
XORQ AX, AX \
934-
MULXQ R14, AX, BP \
935-
ADOXQ AX, R8 \
936-
MACC_3(BP, R9, R15) \
937-
MACC_3(BP, R10, CX) \
938-
MACC_3(BP, R11, BX) \
939-
MACC_3(BP, R12, SI) \
940-
MACC_3(BP, R13, DI) \
941-
MOVQ $0, AX \
942-
ADCXQ AX, BP \
943-
ADOXQ AX, BP \
944-
DIV_SHIFT_3() \
945-
946730
// mul body
947731
MOVQ y+16(FP), DX
948732
MOVQ 0(DX), DX
949-
MUL_WORD_0_3()
733+
MUL_WORD_0()
950734
MOVQ y+16(FP), DX
951735
MOVQ 8(DX), DX
952-
MUL_WORD_N_3()
736+
MUL_WORD_N()
953737
MOVQ y+16(FP), DX
954738
MOVQ 16(DX), DX
955-
MUL_WORD_N_3()
739+
MUL_WORD_N()
956740
MOVQ y+16(FP), DX
957741
MOVQ 24(DX), DX
958-
MUL_WORD_N_3()
742+
MUL_WORD_N()
959743
MOVQ y+16(FP), DX
960744
MOVQ 32(DX), DX
961-
MUL_WORD_N_3()
745+
MUL_WORD_N()
962746
MOVQ y+16(FP), DX
963747
MOVQ 40(DX), DX
964-
MUL_WORD_N_3()
748+
MUL_WORD_N()
965749

966750
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
967751
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)

0 commit comments

Comments
 (0)