From 79920db3ac9f1e1915d836c82bc401ba0cdc322f Mon Sep 17 00:00:00 2001
From: Stephan Hilb <stephan@ecshi.net>
Date: Sun, 25 Apr 2021 16:19:32 +0200
Subject: [PATCH] implement faster floating-point `isless` (#39090)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* implement faster floating-point `isless`

Previously `isless` relied on the C intrinsic `fpislt` in
`src/runtime_intrinsics.c`, while the new implementation in Julia
arguably generates better code, namely:

 1. The NaN-check compiles to a single instruction + branch amenable
    for branch prediction in arguably most usecases (i.e. comparing
    non-NaN floats), thus speeding up execution.
 2. The compiler now often manages to remove NaN-computation if the
    embedding code has already proven the arguments to be non-NaN.
 3. The actual operation compares both arguments as sign-magnitude
    integers instead of case analysis based on the sign of one
    argument. This symmetric treatment may generate vectorized
    instructions for the sign-magnitude conversion depending on how the
    arguments are layed out.

The actual behaviour of `isless` did not change and apart from the
Julia-specific NaN-handling (which may be up for debate) the resulting
total order corresponds to the IEEE-754 specified `totalOrder`.

While the new implementation no longer generates fully branchless code I
did not manage to construct a usecase where this was detrimental: the
saved work seems to outweight the potential cost of a branch
misprediction in all of my tests with various NaN-polluted data. Also
auto-vectorization was not effective on the previous `fpislt` either.

Quick benchmarks (AMD A10-7860K) on `sort`, avoiding the specialized
algorithm:

```julia
a = rand(1000);
@btime sort($a, lt=(a,b)->isless(a,b));
    # before: 56.030 μs (1 allocation: 7.94 KiB)
    #  after: 40.853 μs (1 allocation: 7.94 KiB)
a = rand(1000000);
@btime sort($a, lt=(a,b)->isless(a,b));
    # before: 159.499 ms (2 allocations: 7.63 MiB)
    #  after: 120.536 ms (2 allocations: 7.63 MiB)
a = [rand((rand(), NaN)) for _ in 1:1000000];
@btime sort($a, lt=(a,b)->isless(a,b));
    # before: 111.925 ms (2 allocations: 7.63 MiB)
    #  after:  77.669 ms (2 allocations: 7.63 MiB)
```


* Remove old intrinsic fpslt code

Co-authored-by: Mustafa Mohamad <mus-m@outlook.com>
---
 .clang-format            |  1 -
 base/compiler/tfuncs.jl  |  1 -
 base/float.jl            | 16 +++++++++++++---
 src/intrinsics.cpp       | 21 ---------------------
 src/intrinsics.h         |  1 -
 src/julia_internal.h     |  1 -
 src/runtime_intrinsics.c | 22 ----------------------
 7 files changed, 13 insertions(+), 50 deletions(-)

diff --git a/.clang-format b/.clang-format
index 0322d0f6749a9..39b5767a50291 100644
--- a/.clang-format
+++ b/.clang-format
@@ -109,7 +109,6 @@ StatementMacros:
   - checked_intrinsic_ctype
   - cvt_iintrinsic
   - fpiseq_n
-  - fpislt_n
   - ter_fintrinsic
   - ter_intrinsic_ctype
   - un_fintrinsic
diff --git a/base/compiler/tfuncs.jl b/base/compiler/tfuncs.jl
index 921fbdfe32d03..214087781a44f 100644
--- a/base/compiler/tfuncs.jl
+++ b/base/compiler/tfuncs.jl
@@ -193,7 +193,6 @@ add_tfunc(ne_float, 2, 2, cmp_tfunc, 2)
 add_tfunc(lt_float, 2, 2, cmp_tfunc, 2)
 add_tfunc(le_float, 2, 2, cmp_tfunc, 2)
 add_tfunc(fpiseq, 2, 2, cmp_tfunc, 1)
-add_tfunc(fpislt, 2, 2, cmp_tfunc, 1)
 add_tfunc(eq_float_fast, 2, 2, cmp_tfunc, 1)
 add_tfunc(ne_float_fast, 2, 2, cmp_tfunc, 1)
 add_tfunc(lt_float_fast, 2, 2, cmp_tfunc, 1)
diff --git a/base/float.jl b/base/float.jl
index fd5283fc82ab4..560016d9400f5 100644
--- a/base/float.jl
+++ b/base/float.jl
@@ -439,9 +439,19 @@ end
 isequal(x::Float16, y::Float16) = fpiseq(x, y)
 isequal(x::Float32, y::Float32) = fpiseq(x, y)
 isequal(x::Float64, y::Float64) = fpiseq(x, y)
-isless( x::Float16, y::Float16) = fpislt(x, y)
-isless( x::Float32, y::Float32) = fpislt(x, y)
-isless( x::Float64, y::Float64) = fpislt(x, y)
+
+# interpret as sign-magnitude integer
+@inline function _fpint(x)
+    IntT = inttype(typeof(x))
+    ix = reinterpret(IntT, x)
+    return ifelse(ix < zero(IntT), ix ⊻ typemax(IntT), ix)
+end
+
+@inline function isless(a::T, b::T) where T<:IEEEFloat
+    (isnan(a) || isnan(b)) && return !isnan(a)
+
+    return _fpint(a) < _fpint(b)
+end
 
 # Exact Float (Tf) vs Integer (Ti) comparisons
 # Assumes:
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index c0ee6ffad0ab8..d21e84593b5a2 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -58,7 +58,6 @@ static void jl_init_intrinsic_functions_codegen(void)
     float_func[lt_float_fast] = true;
     float_func[le_float_fast] = true;
     float_func[fpiseq] = true;
-    float_func[fpislt] = true;
     float_func[abs_float] = true;
     float_func[copysign_float] = true;
     float_func[ceil_llvm] = true;
@@ -1165,26 +1164,6 @@ static Value *emit_untyped_intrinsic(jl_codectx_t &ctx, intrinsic f, Value **arg
                                 ctx.builder.CreateICmpEQ(xi, yi));
     }
 
-    case fpislt: {
-        *newtyp = jl_bool_type;
-        Type *it = INTT(t);
-        Value *xi = ctx.builder.CreateBitCast(x, it);
-        Value *yi = ctx.builder.CreateBitCast(y, it);
-        return ctx.builder.CreateOr(
-            ctx.builder.CreateAnd(
-                ctx.builder.CreateFCmpORD(x, x),
-                ctx.builder.CreateFCmpUNO(y, y)),
-            ctx.builder.CreateAnd(
-                ctx.builder.CreateFCmpORD(x, y),
-                ctx.builder.CreateOr(
-                    ctx.builder.CreateAnd(
-                        ctx.builder.CreateICmpSGE(xi, ConstantInt::get(it, 0)),
-                        ctx.builder.CreateICmpSLT(xi, yi)),
-                    ctx.builder.CreateAnd(
-                        ctx.builder.CreateICmpSLT(xi, ConstantInt::get(it, 0)),
-                        ctx.builder.CreateICmpUGT(xi, yi)))));
-    }
-
     case and_int: return ctx.builder.CreateAnd(x, y);
     case or_int:  return ctx.builder.CreateOr(x, y);
     case xor_int: return ctx.builder.CreateXor(x, y);
diff --git a/src/intrinsics.h b/src/intrinsics.h
index 1558769eb3643..a639e06a6316e 100644
--- a/src/intrinsics.h
+++ b/src/intrinsics.h
@@ -45,7 +45,6 @@
     ALIAS(lt_float_fast, lt_float) \
     ALIAS(le_float_fast, le_float) \
     ADD_I(fpiseq, 2) \
-    ADD_I(fpislt, 2) \
     /*  bitwise operators */ \
     ADD_I(and_int, 2) \
     ADD_I(or_int, 2) \
diff --git a/src/julia_internal.h b/src/julia_internal.h
index dc7236c6b6f40..43617b17dbefe 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -1114,7 +1114,6 @@ JL_DLLEXPORT jl_value_t *jl_ne_float(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_lt_float(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_le_float(jl_value_t *a, jl_value_t *b);
 JL_DLLEXPORT jl_value_t *jl_fpiseq(jl_value_t *a, jl_value_t *b);
-JL_DLLEXPORT jl_value_t *jl_fpislt(jl_value_t *a, jl_value_t *b);
 
 JL_DLLEXPORT jl_value_t *jl_not_int(jl_value_t *a);
 JL_DLLEXPORT jl_value_t *jl_and_int(jl_value_t *a, jl_value_t *b);
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index 2337abe7d5704..22c3a8988b580 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -834,33 +834,11 @@ fpiseq_n(double, 64)
 #define fpiseq(a,b) \
     sizeof(a) == sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)
 
-#define fpislt_n(c_type, nbits)                                         \
-    static inline int fpislt##nbits(c_type a, c_type b) JL_NOTSAFEPOINT \
-    {                                                                   \
-        bits##nbits ua, ub;                                             \
-        ua.f = a;                                                       \
-        ub.f = b;                                                       \
-        if (!isnan(a) && isnan(b))                                      \
-            return 1;                                                   \
-        if (isnan(a) || isnan(b))                                       \
-            return 0;                                                   \
-        if (ua.d >= 0 && ua.d < ub.d)                                   \
-            return 1;                                                   \
-        if (ua.d < 0 && ua.ud > ub.ud)                                  \
-            return 1;                                                   \
-        return 0;                                                       \
-    }
-fpislt_n(float, 32)
-fpislt_n(double, 64)
-#define fpislt(a, b) \
-    sizeof(a) == sizeof(float) ? fpislt32(a, b) : fpislt64(a, b)
-
 bool_fintrinsic(eq,eq_float)
 bool_fintrinsic(ne,ne_float)
 bool_fintrinsic(lt,lt_float)
 bool_fintrinsic(le,le_float)
 bool_fintrinsic(fpiseq,fpiseq)
-bool_fintrinsic(fpislt,fpislt)
 
 // bitwise operators
 #define and_op(a,b) a & b