-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cu
98 lines (74 loc) · 3.8 KB
/
main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/**************************************************************************************
* CUDA Convolution Operation Example
* - naive implementation of Conv
* - Im2Col implementation of Conv
***************************************************************************************/
#include <vector>
#include <iostream>
#include <cstdlib>
#include <algorithm>
#include <cstdio>
#include <helper.cuh>
#include <conv_cpu.cuh>
#include <conv_gpu_naive.cuh>
#include <conv_gpu_matmul.cuh>
/***************************************************************************
* Problem configuration
*   All dimensions are compile-time constants; the output extent is derived
*   from the input extent, filter extent, padding and stride.
***************************************************************************/
// Element type used for every host buffer and passed to the conv templates.
#define DTYPE float

// Input tensor dimensions.
constexpr int BATCH_NUM = 12;
constexpr int INPUT_H   = 128;
constexpr int INPUT_W   = 128;
constexpr int INPUT_C   = 32;

// Filter (kernel window) dimensions.
constexpr int FILTER_H = 7;
constexpr int FILTER_W = 7;

// Padding and stride along each spatial axis.
constexpr int PAD_H    = 1;
constexpr int PAD_W    = 1;
constexpr int STRIDE_H = 1;
constexpr int STRIDE_W = 1;

// Output dimensions: (in + 2*pad - filter) / stride + 1, using integer division.
constexpr int OUTPUT_H = (INPUT_H + 2 * PAD_H - FILTER_H) / STRIDE_H + 1;
constexpr int OUTPUT_W = (INPUT_W + 2 * PAD_W - FILTER_W) / STRIDE_W + 1;
constexpr int OUTPUT_C = 32;
/**
 * Program entry point.
 *
 * Prints the problem configuration, fills the input and filter buffers with
 * pseudo-random values (fixed seed), computes a reference result with
 * conv_cpu, and then runs conv_gpu_naive and conv_gpu_matmul on the same
 * data. When DEBUG_ON is defined, each GPU result is passed to check_result
 * together with the CPU reference.
 *
 * @return 0 on completion.
 */
int main(void) {
    // Fixed seed so every run produces the same input/filter values.
    srand(1);

    printf("===================================================================\n");
    printf("CUDA Convolution Operation Example\n");
    // NOTE(review): the filter size is printed as [INPUT_C, OUTPUT_C, H, W] while the
    // filter buffer below is commented as OUT_C*IN_C*H*W; both are 32 today, so the
    // printed text is the same either way. Confirm the intended order.
    printf(" - Input size: [%d,%d,%d,%d], filter size: [%d,%d,%d,%d], pad: [%d,%d], stride: [%d,%d] -> output size: [%d,%d,%d,%d]\n",
           BATCH_NUM,INPUT_C,INPUT_H,INPUT_W, INPUT_C,OUTPUT_C,FILTER_H,FILTER_W, PAD_H,PAD_W, STRIDE_H,STRIDE_W, BATCH_NUM,OUTPUT_C,OUTPUT_H,OUTPUT_W);
    // Sizes are reported in units of 1e9 bytes. The third value is the size of a
    // B * (IN_C*FH*FW) * (OH*OW) buffer.
    printf(" - Size of input[%.3fGB], output[%.3fGB], col[%.3fGB]: \n",
           1.0f*sizeof(DTYPE)*BATCH_NUM*INPUT_C*INPUT_W*INPUT_H*1e-9, 1.0f*sizeof(DTYPE)*BATCH_NUM*OUTPUT_C*OUTPUT_W*OUTPUT_H*1e-9,
           sizeof(DTYPE)*BATCH_NUM*INPUT_C*FILTER_H*FILTER_W*OUTPUT_H*OUTPUT_W*1e-9
    );
    printf(" - Target algorithm: \n");
    printf(" - Naive implementation of Conv\n");
    printf(" - Im2Col implementation of Conv\n");
    printf("===================================================================\n");
    printf("\n");

    /***************************************************************************
    * Data initialization
    ***************************************************************************/
    // Element counts are computed in size_t so that larger configurations do
    // not overflow int arithmetic.
    const std::size_t input_elems  = static_cast<std::size_t>(BATCH_NUM) * INPUT_C * INPUT_W * INPUT_H;
    const std::size_t filter_elems = static_cast<std::size_t>(INPUT_C) * OUTPUT_C * FILTER_W * FILTER_H;
    const std::size_t output_elems = static_cast<std::size_t>(BATCH_NUM) * OUTPUT_C * OUTPUT_W * OUTPUT_H;

    // Define data
    std::vector<DTYPE> input(input_elems);      // B*C*H*W
    std::vector<DTYPE> filter(filter_elems);    // OUT_C*IN_C*H*W
    std::vector<DTYPE> output(output_elems, 0); // B*C*H*W
    std::vector<DTYPE> gt(output_elems, 0);     // B*C*H*W (CPU reference)

    // Initial with random value.
    // The division is an integer division, so every element is a whole number
    // in [-5, 5]. The input is generated before the filter; keep that order to
    // preserve the rand() sequence.
    std::generate(input.begin(), input.end(), [](){return (std::rand()%101-50)/10;});
    std::generate(filter.begin(), filter.end(), [](){return (std::rand()%101-50)/10;});

    /***************************************************************************
    * Get ground truth via CPU
    ***************************************************************************/
    conv_cpu<DTYPE>(input, filter, gt, BATCH_NUM, INPUT_C,INPUT_H,INPUT_W, FILTER_H,FILTER_W, PAD_H,PAD_W, STRIDE_H,STRIDE_W, OUTPUT_C,OUTPUT_H,OUTPUT_W);

    /***************************************************************************
    * Launch GPU naive implementation
    ***************************************************************************/
    // The second template argument (16) is presumably a block/tile size --
    // TODO confirm against conv_gpu_naive.cuh.
    conv_gpu_naive<DTYPE, 16>(input, filter, output, BATCH_NUM, INPUT_C,INPUT_H,INPUT_W, FILTER_H,FILTER_W, PAD_H,PAD_W, STRIDE_H,STRIDE_W, OUTPUT_C,OUTPUT_H,OUTPUT_W);
#ifdef DEBUG_ON
    check_result(output, gt);
#endif

    /***************************************************************************
    * Launch GPU Im2Col (matmul) implementation
    ***************************************************************************/
    // Clear the result of the previous run. Without this, `output` still holds
    // the naive kernel's values, and a matmul path that fails to write its
    // result would still compare equal to the reference.
    std::fill(output.begin(), output.end(), static_cast<DTYPE>(0));
    conv_gpu_matmul<DTYPE, 16>(input, filter, output, BATCH_NUM, INPUT_C,INPUT_H,INPUT_W, FILTER_H,FILTER_W, PAD_H,PAD_W, STRIDE_H,STRIDE_W, OUTPUT_C,OUTPUT_H,OUTPUT_W);
#ifdef DEBUG_ON
    check_result(output, gt);
#endif

    return 0;
}