CUDA基础-实例演示

CUDA-SDK安装

CUDA10.1安装 +VS2015开发环境搭建

创建项目

打开Visual Studio 2010 旗舰版

创建测试项目（矢量相加）

自动生成矢量相加代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);	//封装函数，c结果，a第一个矢量，b第二个矢量，size矢量长度。

__global__ void addKernel(int *c, const int *a, const int *b) // Kernel声明
{
    int i = threadIdx.x;	//线程，一维情况下线程的编号，数据映射关系通过这种方式建立起来。
    c[i] = a[i] + b[i];
}

int main()
{
	//abc在host端口，所以要拷贝到device端
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };	//数组初始化
    const int b[arraySize] = { 10, 20, 30, 40, 50 };	//数组初始化
    int c[arraySize] = { 0 };	//结果数组

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);	//枚举类型
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
	//abc在host端口，所以要拷贝到device端。有指针没有在GPU分配空间
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

	//设置哪一个GPU
    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

	//GPU分配内存空间
    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

	//CPU端口的数据传入
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);//（传到哪里，从哪传，传多少，枚举类型代表host-device）
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    
    //c不需要传输

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);	//<<<1,size>>>1个block，size个线程

	//错误控制
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();	//告诉它已经做完了
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

	//返回数据
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);	//D->H
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    
    return cudaStatus;	//返回状态是否成功
}

执行结果

1	{1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}

CUDA调试怎办

Nsight

addWithCuda 函数中添加

1 2	int blockPerGrid = 2; int threadsPerBlock = 8;

addKernel调用时候

1	addKernel<<<blockPerGrid, threadsPerBlock>>>(dev c, dev a, dev b);

对于addKernel中线程编号问题，永远只对应一个block内的内容

1 2	int i = threadIdx.x; c[i] = a[i] + b[i];

如何联通两个block

1	int i = blockIdx.x * blockDim.x + threadIdx.x; //block编号*block内的线程个数 + 线程号

一万个向量需要一万个线程，但是硬件设置并没有一个block有一万个线程。

一个SM最多2048线程，最多8个block，所以线程最多不能超过2048，最好是2的次方倍数。（256）

线程规划

如果是不规则数字，比如arraySize=17，需要3个block，2*8+1，浪费了7个。

__global__ void addKernel(int  *c, const int *a, const int *b)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < 17)//让多余的线程不工作 
    	c[i] = a[i] + b[i];
}

矩阵相加

1.host分配数据
2.初始化
3.GPU分配内存
4.配置线程

/*
* Veotor addition : C = A + B
*/
#include <stdio.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define HEIGHT 8
#define WIDTH 9

//更换行列不用改变算法结构

__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
	int Col = blockDim.x * blockIdx.x + threadIdx.x;
	int Row = blockDim.y * blockIdx.y + threadIdx.y;
	if (Col < WIDTH && Row < HEIGHT)
	{
		C[Row * WIDTH + Col] = A[Row * WIDTH + Col] + B[Row * WIDTH + Col];
	}
}

int main(void)
{
	// Error code to check return values for CUDA calls
	cudaError_t err = cudaSuccess;

	// Print the vector length to be used,and compute its size
	int numElements = HEIGHT*WIDTH;
	size_t size = numElements * sizeof(float);
	printf("[Bector addition of %d elements]\n", numElements);

	// Alocate the host input vector A
	float *h_A = (float *)malloc(size);

	// Alocate the host input vector B
	float *h_B = (float *)malloc(size);

	// Alocate the host input vector C
	float *h_C = (float *)malloc(size);

	// Verify that allocations succeeded
	if (h_A == NULL || h_B == NULL || h_C == NULL)
	{
		fprintf(stderr, "Failed to allocate host vectors:\n");
		exit(EXIT_FAILURE);
	}

	// initalize the host input vectors
	printf("Matrix A:\n");
	for(int i = 0; i < HEIGHT; ++i)
	{
		for(int j = 0; j < WIDTH; ++j)
		{
			h_A[i * WIDTH +j] = 1 + i;
			printf("%7.2f", h_A[i * WIDTH + j]);
		}
		printf("\n");
	}

	printf("Matrix B:\n");
	for(int i = 0; i < HEIGHT; ++i)
	{
		for(int j = 0; j < WIDTH; ++j)
		{
			h_B[i * WIDTH +j] = 2 + i;
			printf("%7.2f", h_A[i * WIDTH + j]);
		}
		printf("\n");
	}

	// Allocate the device input vector A
	float *d_A = NULL;
	err = cudaMalloc((void **)&d_A, size);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector B
	float *d_B = NULL;
	err = cudaMalloc((void **)&d_B, size);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Allocate the device input vector C
	float *d_C = NULL;
	err = cudaMalloc((void **)&d_C, size);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Copy the host input vectors A and B in host memory to the device input vectors in device memory
	printf("Copy input data from the host memory to the CUDA device\n");

	err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}


	// Launch the Vector Add CUDA Kernel
	// int threadsPerBlock = 8
	// int blocksperGrid = (numElements + threadsPerBlock -1) / threadsPerBlock

	dim3 threadsPerblock(16, 16);
	dim3 blocksPerGrid((WIDTH - 1) / 16 + 1,(HEIGHT - 1) / 16 + 1);	//因为宽度对应grid中表示的是x坐标系，而高度对应表示的是y坐标系
	//printf("CUDA kernel launch with %d blocks of %d threads\n", blockPerGrid, threadsPerBlock);

	vectorAdd<<<blocksPerGrid, threadsPerblock>>>(d_A, d_B, d_C, numElements);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to launch vectorAdd kernel(error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	//Copy the device result vector in device memory in the host result vector in host memory
	printf("Copy output data from the CUDA device in the host memory\n");
	err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	printf("Matrix C:\n");
	for(int i = 0; i < HEIGHT; ++i)
	{
		for(int j = 0; j < WIDTH; ++j)
		{
			printf("%7.2f", h_C[i * WIDTH + j]);
		}
		printf("\n");
	}

	// Verify that the result vector is correct
	for(int i = 0; i < HEIGHT; ++i)
	{	for(int j = 0; j < WIDTH; ++j)
	{
		if (fabs(h_A[i * WIDTH + j] + h_B[i * WIDTH +j] - h_C[i* WIDTH + j]) > 1e-5)
		{
			fprintf(stderr, "Result verification failed at element %d!\n", i);
			exit(EXIT_FAILURE);
		}		
	}
	}
	printf("Test PASSED\n");

	// Free device global memory
	err = cudaFree(d_A);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaFree(d_B);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	err = cudaFree(d_C);

	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	// Free host memory
	free(h_A);
	free(h_B);
	free(h_C);

	// Reset the device and exit
	err = cudaDeviceReset();
	if (err != cudaSuccess)
	{
		fprintf(stderr, "Failed to deinitialize the device!(error code %s)!\n", cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}

	printf("Done\n");
	system("pause");
	return 0;
}

执行结果

[Bector addition of 72 elements]
Matrix A:
   1.00   1.00   1.00   1.00   1.00   1.00   1.00   1.00   1.00
   2.00   2.00   2.00   2.00   2.00   2.00   2.00   2.00   2.00
   3.00   3.00   3.00   3.00   3.00   3.00   3.00   3.00   3.00
   4.00   4.00   4.00   4.00   4.00   4.00   4.00   4.00   4.00
   5.00   5.00   5.00   5.00   5.00   5.00   5.00   5.00   5.00
   6.00   6.00   6.00   6.00   6.00   6.00   6.00   6.00   6.00
   7.00   7.00   7.00   7.00   7.00   7.00   7.00   7.00   7.00
   8.00   8.00   8.00   8.00   8.00   8.00   8.00   8.00   8.00
Matrix B:
   1.00   1.00   1.00   1.00   1.00   1.00   1.00   1.00   1.00
   2.00   2.00   2.00   2.00   2.00   2.00   2.00   2.00   2.00
   3.00   3.00   3.00   3.00   3.00   3.00   3.00   3.00   3.00
   4.00   4.00   4.00   4.00   4.00   4.00   4.00   4.00   4.00
   5.00   5.00   5.00   5.00   5.00   5.00   5.00   5.00   5.00
   6.00   6.00   6.00   6.00   6.00   6.00   6.00   6.00   6.00
   7.00   7.00   7.00   7.00   7.00   7.00   7.00   7.00   7.00
   8.00   8.00   8.00   8.00   8.00   8.00   8.00   8.00   8.00
Copy input data from the host memory to the CUDA device
Copy output data from the CUDA device in the host memory
Matrix C:
   3.00   3.00   3.00   3.00   3.00   3.00   3.00   3.00   3.00
   5.00   5.00   5.00   5.00   5.00   5.00   5.00   5.00   5.00
   7.00   7.00   7.00   7.00   7.00   7.00   7.00   7.00   7.00
   9.00   9.00   9.00   9.00   9.00   9.00   9.00   9.00   9.00
  11.00  11.00  11.00  11.00  11.00  11.00  11.00  11.00  11.00
  13.00  13.00  13.00  13.00  13.00  13.00  13.00  13.00  13.00
  15.00  15.00  15.00  15.00  15.00  15.00  15.00  15.00  15.00
  17.00  17.00  17.00  17.00  17.00  17.00  17.00  17.00  17.00
Test PASSED
Done