CUDA Basics: Example Demonstrations


[TOC]


CUDA SDK Installation

CUDA 10.1 installation + VS2015 development environment setup

Creating the Project

  • Open Visual Studio 2010 Ultimate
  • Create the test project (vector addition)
  • The vector-addition code is generated automatically
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size); // wrapper function: c is the result, a and b are the input vectors, size is their length

__global__ void addKernel(int *c, const int *a, const int *b) // kernel declaration
{
    int i = threadIdx.x; // 1D thread index within the block; this is what maps threads onto data
    c[i] = a[i] + b[i];
}

int main()
{
    // a, b and c live on the host, so they must be copied to the device
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };      // initialize the input arrays
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };                        // result array

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize); // cudaError_t is an enumerated status type
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // a, b and c live on the host; these device pointers will receive the matching GPU allocations
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice); // (destination, source, byte count, enum selecting host-to-device)
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // c is the output and does not need to be copied to the device

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b); // <<<1, size>>>: 1 block of size threads

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize(); // block until the kernel has completed
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); // D->H
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus; // report whether everything succeeded
}
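
Every CUDA call above is followed by the same if (cudaStatus != cudaSuccess) check. As a side note, this pattern can be factored out into a small macro; the sketch below is our own addition, not part of the CUDA API, and it assumes a local cudaStatus variable and an Error: cleanup label exactly as in addWithCuda:

#define CUDA_CHECK(call)                                          \
    do {                                                          \
        cudaStatus = (call);                                      \
        if (cudaStatus != cudaSuccess) {                          \
            fprintf(stderr, "%s failed: %s\n", #call,             \
                    cudaGetErrorString(cudaStatus));              \
            goto Error;                                           \
        }                                                         \
    } while (0)

// Usage inside addWithCuda:
//   CUDA_CHECK(cudaSetDevice(0));
//   CUDA_CHECK(cudaMalloc((void**)&dev_c, size * sizeof(int)));
//   CUDA_CHECK(cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice));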

Execution result

{1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}

How to Debug CUDA

Nsight

In the addWithCuda function, add:

int blockPerGrid = 2;
int threadsPerBlock = 8;

and change the addKernel call to:

addKernel<<<blockPerGrid, threadsPerBlock>>>(dev_c, dev_a, dev_b);

Note the thread-indexing problem inside addKernel: threadIdx.x only ever numbers the threads within a single block,

int i = threadIdx.x;
c[i] = a[i] + b[i];

so to tie the blocks together, compute a global index:

int i = blockIdx.x * blockDim.x + threadIdx.x;	// block index * threads per block + thread index within the block

Ten thousand elements would require ten thousand threads, but the hardware does not allow a single block to hold ten thousand threads.

An SM holds at most 2048 resident threads and at most 8 resident blocks, so a block can never exceed 2048 threads; its size should preferably be a power of two, and 256 threads per block fills an SM exactly (8 × 256 = 2048).
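
A minimal sketch of the resulting grid-size calculation, with illustrative values (addKernel, dev_a, dev_b and dev_c as defined earlier):

// Ceiling division: launch enough 256-thread blocks to cover every element.
int numElements = 10000;
int threadsPerBlock = 256; // a power of two, well within the hardware limits
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // = 40 blocks, i.e. 10240 threads
addKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, dev_b);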

Thread Planning

If the size is not a round number, say arraySize = 17 with 8 threads per block, 3 blocks are needed (2 × 8 + 1), so 7 of the 24 launched threads are wasted. The kernel must keep those surplus threads from working:

__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < 17) // idle the surplus threads
        c[i] = a[i] + b[i];
}
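
Hard-coding the bound 17 is fine for this demo; a slightly more general variant (our own modification, not the generated template) passes the element count in as a kernel parameter:

__global__ void addKernel(int *c, const int *a, const int *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) // surplus threads in the last block simply do nothing
        c[i] = a[i] + b[i];
}

// Launch: ceiling division gives (17 + 8 - 1) / 8 = 3 blocks of 8 threads.
addKernel<<<3, 8>>>(dev_c, dev_a, dev_b, 17);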

Matrix Addition

1. Allocate the data on the host
2. Initialize it
3. Allocate GPU memory
4. Configure the threads (see the sketch below)
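
Before the full listing, here is a minimal sketch of step 4: the 2D thread configuration and the row-major index mapping it relies on (WIDTH and HEIGHT as defined in the listing):

dim3 threadsPerBlock(16, 16);           // 256 threads per block, arranged 16 x 16
dim3 blocksPerGrid((WIDTH + 15) / 16,   // the grid's x direction covers the columns
                   (HEIGHT + 15) / 16); // its y direction covers the rows

// Each thread then adds one matrix element:
//   int Col = blockDim.x * blockIdx.x + threadIdx.x;
//   int Row = blockDim.y * blockIdx.y + threadIdx.y;
//   C[Row * WIDTH + Col] = A[Row * WIDTH + Col] + B[Row * WIDTH + Col];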

/*
 * Matrix addition : C = A + B (adapted from the CUDA vector-addition sample)
 */
#include <stdio.h>
#include <stdlib.h> // for malloc(), exit() and system()
#include <time.h>
#include <math.h>   // for fabs() in the result verification
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define HEIGHT 8
#define WIDTH 9

// Swapping the row and column counts does not require any change to the algorithm structure.

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int Col = blockDim.x * blockIdx.x + threadIdx.x;
    int Row = blockDim.y * blockIdx.y + threadIdx.y;
    if (Col < WIDTH && Row < HEIGHT)
    {
        C[Row * WIDTH + Col] = A[Row * WIDTH + Col] + B[Row * WIDTH + Col];
    }
}

int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    // Print the vector length to be used, and compute its size
    int numElements = HEIGHT * WIDTH;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host input vector B
    float *h_B = (float *)malloc(size);

    // Allocate the host output vector C
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    printf("Matrix A:\n");
    for (int i = 0; i < HEIGHT; ++i)
    {
        for (int j = 0; j < WIDTH; ++j)
        {
            h_A[i * WIDTH + j] = 1 + i;
            printf("%7.2f", h_A[i * WIDTH + j]);
        }
        printf("\n");
    }

    printf("Matrix B:\n");
    for (int i = 0; i < HEIGHT; ++i)
    {
        for (int j = 0; j < WIDTH; ++j)
        {
            h_B[i * WIDTH + j] = 2 + i;
            printf("%7.2f", h_B[i * WIDTH + j]);
        }
        printf("\n");
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device input vector B
    float *d_B = NULL;
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output vector C
    float *d_C = NULL;
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input vectors A and B in host memory to the device input vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");

    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }


    // Launch the vector-add CUDA kernel.
    // For a 1D launch this would be:
    //   int threadsPerBlock = 8;
    //   int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;

    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((WIDTH - 1) / 16 + 1, (HEIGHT - 1) / 16 + 1); // width maps to the grid's x axis, height to its y axis
    //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    // Check for any errors launching the kernel
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector in host memory
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

printf("Matrix C:\n");
for(int i = 0; i < HEIGHT; ++i)
{
for(int j = 0; j < WIDTH; ++j)
{
printf("%7.2f", h_C[i * WIDTH + j]);
}
printf("\n");
}

// Verify that the result vector is correct
for(int i = 0; i < HEIGHT; ++i)
{ for(int j = 0; j < WIDTH; ++j)
{
if (fabs(h_A[i * WIDTH + j] + h_B[i * WIDTH +j] - h_C[i* WIDTH + j]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
}
printf("Test PASSED\n");

    // Free device global memory
    err = cudaFree(d_A);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device and exit
    err = cudaDeviceReset();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! (error code %s)\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    system("pause");
    return 0;
}

Execution result

[Vector addition of 72 elements]
Matrix A:
1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00
3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00
4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00
5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00
6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00
7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00
8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00
Matrix B:
2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00 2.00
3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00
4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00 4.00
5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00
6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00 6.00
7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00
8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00 8.00
9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00
Copy input data from the host memory to the CUDA device
Copy output data from the CUDA device to the host memory
Matrix C:
3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00 3.00
5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00 5.00
7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00 7.00
9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00 9.00
11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00 11.00
13.00 13.00 13.00 13.00 13.00 13.00 13.00 13.00 13.00
15.00 15.00 15.00 15.00 15.00 15.00 15.00 15.00 15.00
17.00 17.00 17.00 17.00 17.00 17.00 17.00 17.00 17.00
Test PASSED
Done