@@ -557,73 +557,19 @@ TEXT ·squareAdxE2(SB), $48-16
557
557
// t[3] -> R11
558
558
// t[4] -> R12
559
559
// t[5] -> R13
560
- #define MACC_0(in0, in1, in2) \
561
- ADCXQ in0, in1 \
562
- MULXQ in2, AX, in0 \
563
- ADOXQ AX, in1 \
564
-
565
- #define DIV_SHIFT_0() \
566
- PUSHQ BP \
567
- MOVQ $const_qInvNeg, DX \
568
- IMULQ R8, DX \
569
- XORQ AX, AX \
570
- MULXQ ·qElement+0 (SB), AX, BP \
571
- ADCXQ R8, AX \
572
- MOVQ BP, R8 \
573
- POPQ BP \
574
- MACC_0(R9, R8, ·qElement+8 (SB)) \
575
- MACC_0(R10, R9, ·qElement+16 (SB)) \
576
- MACC_0(R11, R10, ·qElement+24 (SB)) \
577
- MACC_0(R12, R11, ·qElement+32 (SB)) \
578
- MACC_0(R13, R12, ·qElement+40 (SB)) \
579
- MOVQ $0 , AX \
580
- ADCXQ AX, R13 \
581
- ADOXQ BP, R13 \
582
-
583
- #define MUL_WORD_0_0() \
584
- XORQ AX, AX \
585
- MULXQ R14, R8, R9 \
586
- MULXQ R15, AX, R10 \
587
- ADOXQ AX, R9 \
588
- MULXQ CX, AX, R11 \
589
- ADOXQ AX, R10 \
590
- MULXQ BX, AX, R12 \
591
- ADOXQ AX, R11 \
592
- MULXQ SI, AX, R13 \
593
- ADOXQ AX, R12 \
594
- MULXQ DI, AX, BP \
595
- ADOXQ AX, R13 \
596
- MOVQ $0 , AX \
597
- ADOXQ AX, BP \
598
- DIV_SHIFT_0() \
599
-
600
- #define MUL_WORD_N_0() \
601
- XORQ AX, AX \
602
- MULXQ R14, AX, BP \
603
- ADOXQ AX, R8 \
604
- MACC_0(BP, R9, R15) \
605
- MACC_0(BP, R10, CX) \
606
- MACC_0(BP, R11, BX) \
607
- MACC_0(BP, R12, SI) \
608
- MACC_0(BP, R13, DI) \
609
- MOVQ $0 , AX \
610
- ADCXQ AX, BP \
611
- ADOXQ AX, BP \
612
- DIV_SHIFT_0() \
613
-
614
560
// mul body
615
561
MOVQ s0-8 (SP), DX
616
- MUL_WORD_0_0 ()
562
+ MUL_WORD_0 ()
617
563
MOVQ s1-16 (SP), DX
618
- MUL_WORD_N_0 ()
564
+ MUL_WORD_N ()
619
565
MOVQ s2-24 (SP), DX
620
- MUL_WORD_N_0 ()
566
+ MUL_WORD_N ()
621
567
MOVQ s3-32 (SP), DX
622
- MUL_WORD_N_0 ()
568
+ MUL_WORD_N ()
623
569
MOVQ s4-40 (SP), DX
624
- MUL_WORD_N_0 ()
570
+ MUL_WORD_N ()
625
571
MOVQ s5-48 (SP), DX
626
- MUL_WORD_N_0 ()
572
+ MUL_WORD_N ()
627
573
628
574
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
629
575
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)
@@ -674,79 +620,25 @@ TEXT ·mulAdxE2(SB), $96-24
674
620
// t[3] -> R11
675
621
// t[4] -> R12
676
622
// t[5] -> R13
677
- #define MACC_1(in0, in1, in2) \
678
- ADCXQ in0, in1 \
679
- MULXQ in2, AX, in0 \
680
- ADOXQ AX, in1 \
681
-
682
- #define DIV_SHIFT_1() \
683
- PUSHQ BP \
684
- MOVQ $const_qInvNeg, DX \
685
- IMULQ R8, DX \
686
- XORQ AX, AX \
687
- MULXQ ·qElement+0 (SB), AX, BP \
688
- ADCXQ R8, AX \
689
- MOVQ BP, R8 \
690
- POPQ BP \
691
- MACC_1(R9, R8, ·qElement+8 (SB)) \
692
- MACC_1(R10, R9, ·qElement+16 (SB)) \
693
- MACC_1(R11, R10, ·qElement+24 (SB)) \
694
- MACC_1(R12, R11, ·qElement+32 (SB)) \
695
- MACC_1(R13, R12, ·qElement+40 (SB)) \
696
- MOVQ $0 , AX \
697
- ADCXQ AX, R13 \
698
- ADOXQ BP, R13 \
699
-
700
- #define MUL_WORD_0_1() \
701
- XORQ AX, AX \
702
- MULXQ R14, R8, R9 \
703
- MULXQ R15, AX, R10 \
704
- ADOXQ AX, R9 \
705
- MULXQ CX, AX, R11 \
706
- ADOXQ AX, R10 \
707
- MULXQ BX, AX, R12 \
708
- ADOXQ AX, R11 \
709
- MULXQ SI, AX, R13 \
710
- ADOXQ AX, R12 \
711
- MULXQ DI, AX, BP \
712
- ADOXQ AX, R13 \
713
- MOVQ $0 , AX \
714
- ADOXQ AX, BP \
715
- DIV_SHIFT_1() \
716
-
717
- #define MUL_WORD_N_1() \
718
- XORQ AX, AX \
719
- MULXQ R14, AX, BP \
720
- ADOXQ AX, R8 \
721
- MACC_1(BP, R9, R15) \
722
- MACC_1(BP, R10, CX) \
723
- MACC_1(BP, R11, BX) \
724
- MACC_1(BP, R12, SI) \
725
- MACC_1(BP, R13, DI) \
726
- MOVQ $0 , AX \
727
- ADCXQ AX, BP \
728
- ADOXQ AX, BP \
729
- DIV_SHIFT_1() \
730
-
731
623
// mul body
732
624
MOVQ y+16 (FP), DX
733
625
MOVQ 48 (DX), DX
734
- MUL_WORD_0_1 ()
626
+ MUL_WORD_0 ()
735
627
MOVQ y+16 (FP), DX
736
628
MOVQ 56 (DX), DX
737
- MUL_WORD_N_1 ()
629
+ MUL_WORD_N ()
738
630
MOVQ y+16 (FP), DX
739
631
MOVQ 64 (DX), DX
740
- MUL_WORD_N_1 ()
632
+ MUL_WORD_N ()
741
633
MOVQ y+16 (FP), DX
742
634
MOVQ 72 (DX), DX
743
- MUL_WORD_N_1 ()
635
+ MUL_WORD_N ()
744
636
MOVQ y+16 (FP), DX
745
637
MOVQ 80 (DX), DX
746
- MUL_WORD_N_1 ()
638
+ MUL_WORD_N ()
747
639
MOVQ y+16 (FP), DX
748
640
MOVQ 88 (DX), DX
749
- MUL_WORD_N_1 ()
641
+ MUL_WORD_N ()
750
642
751
643
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
752
644
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)
@@ -797,73 +689,19 @@ TEXT ·mulAdxE2(SB), $96-24
797
689
// t[3] -> R11
798
690
// t[4] -> R12
799
691
// t[5] -> R13
800
- #define MACC_2(in0, in1, in2) \
801
- ADCXQ in0, in1 \
802
- MULXQ in2, AX, in0 \
803
- ADOXQ AX, in1 \
804
-
805
- #define DIV_SHIFT_2() \
806
- PUSHQ BP \
807
- MOVQ $const_qInvNeg, DX \
808
- IMULQ R8, DX \
809
- XORQ AX, AX \
810
- MULXQ ·qElement+0 (SB), AX, BP \
811
- ADCXQ R8, AX \
812
- MOVQ BP, R8 \
813
- POPQ BP \
814
- MACC_2(R9, R8, ·qElement+8 (SB)) \
815
- MACC_2(R10, R9, ·qElement+16 (SB)) \
816
- MACC_2(R11, R10, ·qElement+24 (SB)) \
817
- MACC_2(R12, R11, ·qElement+32 (SB)) \
818
- MACC_2(R13, R12, ·qElement+40 (SB)) \
819
- MOVQ $0 , AX \
820
- ADCXQ AX, R13 \
821
- ADOXQ BP, R13 \
822
-
823
- #define MUL_WORD_0_2() \
824
- XORQ AX, AX \
825
- MULXQ R14, R8, R9 \
826
- MULXQ R15, AX, R10 \
827
- ADOXQ AX, R9 \
828
- MULXQ CX, AX, R11 \
829
- ADOXQ AX, R10 \
830
- MULXQ BX, AX, R12 \
831
- ADOXQ AX, R11 \
832
- MULXQ SI, AX, R13 \
833
- ADOXQ AX, R12 \
834
- MULXQ DI, AX, BP \
835
- ADOXQ AX, R13 \
836
- MOVQ $0 , AX \
837
- ADOXQ AX, BP \
838
- DIV_SHIFT_2() \
839
-
840
- #define MUL_WORD_N_2() \
841
- XORQ AX, AX \
842
- MULXQ R14, AX, BP \
843
- ADOXQ AX, R8 \
844
- MACC_2(BP, R9, R15) \
845
- MACC_2(BP, R10, CX) \
846
- MACC_2(BP, R11, BX) \
847
- MACC_2(BP, R12, SI) \
848
- MACC_2(BP, R13, DI) \
849
- MOVQ $0 , AX \
850
- ADCXQ AX, BP \
851
- ADOXQ AX, BP \
852
- DIV_SHIFT_2() \
853
-
854
692
// mul body
855
693
MOVQ s0-8 (SP), DX
856
- MUL_WORD_0_2 ()
694
+ MUL_WORD_0 ()
857
695
MOVQ s1-16 (SP), DX
858
- MUL_WORD_N_2 ()
696
+ MUL_WORD_N ()
859
697
MOVQ s2-24 (SP), DX
860
- MUL_WORD_N_2 ()
698
+ MUL_WORD_N ()
861
699
MOVQ s3-32 (SP), DX
862
- MUL_WORD_N_2 ()
700
+ MUL_WORD_N ()
863
701
MOVQ s4-40 (SP), DX
864
- MUL_WORD_N_2 ()
702
+ MUL_WORD_N ()
865
703
MOVQ s5-48 (SP), DX
866
- MUL_WORD_N_2 ()
704
+ MUL_WORD_N ()
867
705
868
706
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
869
707
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)
@@ -889,79 +727,25 @@ TEXT ·mulAdxE2(SB), $96-24
889
727
// t[3] -> R11
890
728
// t[4] -> R12
891
729
// t[5] -> R13
892
- #define MACC_3(in0, in1, in2) \
893
- ADCXQ in0, in1 \
894
- MULXQ in2, AX, in0 \
895
- ADOXQ AX, in1 \
896
-
897
- #define DIV_SHIFT_3() \
898
- PUSHQ BP \
899
- MOVQ $const_qInvNeg, DX \
900
- IMULQ R8, DX \
901
- XORQ AX, AX \
902
- MULXQ ·qElement+0 (SB), AX, BP \
903
- ADCXQ R8, AX \
904
- MOVQ BP, R8 \
905
- POPQ BP \
906
- MACC_3(R9, R8, ·qElement+8 (SB)) \
907
- MACC_3(R10, R9, ·qElement+16 (SB)) \
908
- MACC_3(R11, R10, ·qElement+24 (SB)) \
909
- MACC_3(R12, R11, ·qElement+32 (SB)) \
910
- MACC_3(R13, R12, ·qElement+40 (SB)) \
911
- MOVQ $0 , AX \
912
- ADCXQ AX, R13 \
913
- ADOXQ BP, R13 \
914
-
915
- #define MUL_WORD_0_3() \
916
- XORQ AX, AX \
917
- MULXQ R14, R8, R9 \
918
- MULXQ R15, AX, R10 \
919
- ADOXQ AX, R9 \
920
- MULXQ CX, AX, R11 \
921
- ADOXQ AX, R10 \
922
- MULXQ BX, AX, R12 \
923
- ADOXQ AX, R11 \
924
- MULXQ SI, AX, R13 \
925
- ADOXQ AX, R12 \
926
- MULXQ DI, AX, BP \
927
- ADOXQ AX, R13 \
928
- MOVQ $0 , AX \
929
- ADOXQ AX, BP \
930
- DIV_SHIFT_3() \
931
-
932
- #define MUL_WORD_N_3() \
933
- XORQ AX, AX \
934
- MULXQ R14, AX, BP \
935
- ADOXQ AX, R8 \
936
- MACC_3(BP, R9, R15) \
937
- MACC_3(BP, R10, CX) \
938
- MACC_3(BP, R11, BX) \
939
- MACC_3(BP, R12, SI) \
940
- MACC_3(BP, R13, DI) \
941
- MOVQ $0 , AX \
942
- ADCXQ AX, BP \
943
- ADOXQ AX, BP \
944
- DIV_SHIFT_3() \
945
-
946
730
// mul body
947
731
MOVQ y+16 (FP), DX
948
732
MOVQ 0 (DX), DX
949
- MUL_WORD_0_3 ()
733
+ MUL_WORD_0 ()
950
734
MOVQ y+16 (FP), DX
951
735
MOVQ 8 (DX), DX
952
- MUL_WORD_N_3 ()
736
+ MUL_WORD_N ()
953
737
MOVQ y+16 (FP), DX
954
738
MOVQ 16 (DX), DX
955
- MUL_WORD_N_3 ()
739
+ MUL_WORD_N ()
956
740
MOVQ y+16 (FP), DX
957
741
MOVQ 24 (DX), DX
958
- MUL_WORD_N_3 ()
742
+ MUL_WORD_N ()
959
743
MOVQ y+16 (FP), DX
960
744
MOVQ 32 (DX), DX
961
- MUL_WORD_N_3 ()
745
+ MUL_WORD_N ()
962
746
MOVQ y+16 (FP), DX
963
747
MOVQ 40 (DX), DX
964
- MUL_WORD_N_3 ()
748
+ MUL_WORD_N ()
965
749
966
750
// reduce element(R8,R9,R10,R11,R12,R13) using temp registers (R14,R15,CX,BX,SI,DI)
967
751
REDUCE(R8,R9,R10,R11,R12,R13,R14,R15,CX,BX,SI,DI)
0 commit comments