/*
    kernels/reduce_2.cuh -- CUDA parallel reduction kernels

    Copyright (c) 2021 Wenzel Jakob <wenzel.jakob@epfl.ch>

    All rights reserved. Use of this source code is governed by a BSD-style
    license that can be found in the LICENSE file.
*/

#include "common.h"
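
/*
   Binary reduction kernel: each block walks a grid-strided range of the two
   input arrays, folds element pairs (in_1[i], in_2[i]) into a per-thread
   accumulator via the three-argument form of 'Reduce', then reduces the
   per-thread values through shared memory and warp shuffles. One partial
   result per block is written to out[blockIdx.x]; a further reduction over
   'out' yields the final value.
*/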
template <typename Value, typename Reduce, uint32_t BlockSize>
__device__ void reduce_2(const Value *in_1, const Value *in_2, uint32_t size,
                         Value *out) {
    Value *shared = SharedMemory<Value>::get();

    uint32_t tid    = threadIdx.x,
             bid    = blockIdx.x,
             nb     = gridDim.x,
             offset = BlockSize * 2 * bid + tid,
             stride = BlockSize * 2 * nb;

    Reduce red;
    Value value = red.init();

    // Grid-stride loop to reduce elements
    for (uint32_t i = offset; i < size; i += stride) {
        value = red(value, in_1[i], in_2[i]);

        uint32_t ib = i + BlockSize;
        if (ib < size)
            value = red(value, in_1[ib], in_2[ib]);
    }

    // Write to shared memory and wait for all threads to reach this point
    shared[tid] = value;
    __syncthreads();

    // Block-level reduction from nb*BlockSize -> nb*32 values
    if (BlockSize >= 1024 && tid < 512)
        shared[tid] = value = red(value, shared[tid + 512]);
    __syncthreads();

    if (BlockSize >= 512 && tid < 256)
        shared[tid] = value = red(value, shared[tid + 256]);
    __syncthreads();

    if (BlockSize >= 256 && tid < 128)
        shared[tid] = value = red(value, shared[tid + 128]);
    __syncthreads();

    if (BlockSize >= 128 && tid < 64)
        shared[tid] = value = red(value, shared[tid + 64]);
    __syncthreads();

    if (tid < 32) {
        if (BlockSize >= 64)
            value = red(value, shared[tid + 32]);

        // Warp-level reduction from nb*32 -> nb values
        for (uint32_t i = 1; i < 32; i *= 2)
            value = red(value, __shfl_xor_sync(WarpMask, value, i));

        if (tid == 0)
            out[bid] = value;
    }
}

template <typename Value> struct reduction_dot {
    __device__ Value init() { return (Value) 0; }

    __device__ Value operator()(Value accum, Value value) const {
        return add_(accum, value);
    }

    __device__ Value operator()(Value accum, Value in_1, Value in_2) const {
        return fma_(in_1, in_2, accum);
    }
};
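
/*
   Hypothetical sketch (not part of the original kernel set): reduce_2 only
   requires a functor with the same three members as reduction_dot above --
   init() for the identity element, a two-argument operator() used when
   merging partial results in shared memory and across the warp, and a
   three-argument operator() used when consuming one element from each input
   array. Assuming add_ from common.h behaves like ordinary addition, a plain
   element-wise sum over both arrays could look as follows.
*/

template <typename Value> struct reduction_sum_2 {
    __device__ Value init() { return (Value) 0; }

    // Merge two partial results (shared-memory / warp reduction steps)
    __device__ Value operator()(Value accum, Value value) const {
        return add_(accum, value);
    }

    // Fold one element from each input array into the accumulator
    __device__ Value operator()(Value accum, Value in_1, Value in_2) const {
        return add_(accum, add_(in_1, in_2));
    }
};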

#define HORIZ_OP(Name, Reduction, Type, Suffix)                             \
    KERNEL void Name##_##Suffix(const Type *in_1, const Type *in_2,         \
                                uint32_t size, Type *out) {                 \
        reduce_2<Type, Reduction<Type>, 1024>(in_1, in_2, size, out);       \
    }

#define HORIZ_OP_ALL(Name, Reduction)                                       \
    HORIZ_OP(Name, Reduction, half,   f16)                                  \
    HORIZ_OP(Name, Reduction, float,  f32)                                  \
    HORIZ_OP(Name, Reduction, double, f64)

HORIZ_OP_ALL(reduce_dot, reduction_dot)

#undef HORIZ_OP
#undef HORIZ_OP_ALL
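
/*
   Hypothetical host-side sketch, not part of this file: launching the
   generated reduce_dot_f32 kernel. It assumes that KERNEL (defined in
   common.h, not shown here) turns reduce_dot_f32 into a __global__ entry
   point, and that SharedMemory<float>::get() returns dynamically allocated
   shared memory, so the launch passes block_size * sizeof(float) bytes.
   Each block writes one partial result; a final pass over d_block_results
   is still needed to obtain the scalar dot product.
*/

static void launch_reduce_dot_f32(const float *d_in_1, const float *d_in_2,
                                  uint32_t size, float *d_block_results,
                                  uint32_t num_blocks, cudaStream_t stream) {
    const uint32_t block_size   = 1024; // matches the template argument above
    const size_t   shared_bytes = block_size * sizeof(float);

    // One partial result per block; d_block_results must hold at least
    // num_blocks elements.
    reduce_dot_f32<<<num_blocks, block_size, shared_bytes, stream>>>(
        d_in_1, d_in_2, size, d_block_results);
}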