forked from inducer/pycuda
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo_cdpSimplePrint.py
121 lines (107 loc) · 4.66 KB
/
demo_cdpSimplePrint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
'''
* demo_cdpSimplePrint.py
*
* Adapted from NVIDIA's "cdpSimplePrint - Simple Print (CUDA Dynamic Parallelism)" sample
* http://docs.nvidia.com/cuda/cuda-samples/index.html#simple-print--cuda-dynamic-parallelism-
* http://ecee.colorado.edu/~siewerts/extra/code/example_code_archive/a490dmis_code/CUDA/cuda_work/samples/0_Simple/cdpSimplePrint/cdpSimplePrint.cu
*
* From cdpSimplePrint.cu (not sure if this is Ok with NVIDIA's 38-page EULA though...):
* ---------------------------------------------------------------------------
* Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
* ---------------------------------------------------------------------------
'''
import sys, os
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import DynamicSourceModule
cdpSimplePrint_cu = '''
#include <cstdio>
////////////////////////////////////////////////////////////////////////////////
// Variable on the GPU used to generate unique identifiers of blocks.
////////////////////////////////////////////////////////////////////////////////
__device__ int g_uids = 0;
////////////////////////////////////////////////////////////////////////////////
// Print a simple message to signal the block which is currently executing.
////////////////////////////////////////////////////////////////////////////////
__device__ void print_info( int depth, int thread, int uid, int parent_uid )
{
if( threadIdx.x == 0 )
{
if( depth == 0 )
printf( "BLOCK %d launched by the host\\n", uid );
else
{
char buffer[32];
for( int i = 0 ; i < depth ; ++i )
{
buffer[3*i+0] = '|';
buffer[3*i+1] = ' ';
buffer[3*i+2] = ' ';
}
buffer[3*depth] = '\\0';
printf( "%sBLOCK %d launched by thread %d of block %d\\n", buffer, uid, thread, parent_uid );
}
}
__syncthreads();
}
////////////////////////////////////////////////////////////////////////////////
// The kernel using CUDA dynamic parallelism.
//
// It generates a unique identifier for each block. Prints the information
// about that block. Finally, if the 'max_depth' has not been reached, the
// block launches new blocks directly from the GPU.
////////////////////////////////////////////////////////////////////////////////
__global__ void cdp_kernel( int max_depth, int depth, int thread, int parent_uid )
{
// We create a unique ID per block. Thread 0 does that and shares the value with the other threads.
__shared__ int s_uid;
if( threadIdx.x == 0 )
{
s_uid = atomicAdd( &g_uids, 1 );
}
__syncthreads();
// We print the ID of the block and information about its parent.
print_info( depth, thread, s_uid, parent_uid );
// We launch new blocks if we haven't reached the max_depth yet.
if( ++depth >= max_depth )
{
return;
}
cdp_kernel<<<gridDim.x, blockDim.x>>>( max_depth, depth, threadIdx.x, s_uid );
}
'''
def main(argv):
max_depth = 2
if len(argv) > 1:
if len(argv) == 2 and argv[1].isdigit() and int(argv[1]) >= 1 and int(argv[1]) <= 8:
max_depth = int(argv[1])
else:
print("Usage: %s <max_depth>\t(where max_depth is a value between 1 and 8)." % argv[0])
sys.exit(0)
print("starting Simple Print (CUDA Dynamic Parallelism)")
mod = DynamicSourceModule(cdpSimplePrint_cu)
cdp_kernel = mod.get_function('cdp_kernel').prepare('iiii').prepared_call
print("***************************************************************************")
print("The CPU launches 2 blocks of 2 threads each. On the device each thread will")
print("launch 2 blocks of 2 threads each. The GPU we will do that recursively")
print("until it reaches max_depth=%d\n" % max_depth)
print("In total 2")
num_blocks, sum = 2, 2
for i in range(1, max_depth):
num_blocks *= 4
print("+%d" % num_blocks)
sum += num_blocks
print("=%d blocks are launched!!! (%d from the GPU)" % (sum, sum-2))
print("***************************************************************************\n")
pycuda.autoinit.context.set_limit(cuda.limit.PRINTF_FIFO_SIZE, 11 << 20) # make this work at depth=8
pycuda.autoinit.context.set_limit(cuda.limit.DEV_RUNTIME_SYNC_DEPTH, max_depth)
print("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n")
cdp_kernel((2,1), (2,1,1), max_depth, 0, 0, -1)
if __name__ == "__main__":
main(sys.argv)