多个非默认CUDA流并发重叠核函数执行和数据传输
GPU编程 CUDA C++ 使用多个非默认CUDA流并发重叠核函数执行和数据传输
缩写解释:
H:Host主机
D:Device设备(GPU)
K:Kernel核函数
H2D:主机复制数据到GPU
KER:核函数执行
D2H:GPU复制数据到主机
理论加速比:
使用一个默认CUDA流:(时间是:1)
Stream 0: H2D ---> KER ---> D2H
使用四个非默认CUDA流并发重叠:(时间是:6/12)
Stream 1: H2D ---> KER ---> D2H
Stream 2: H2D ---> KER ---> D2H
Stream 3: H2D ---> KER ---> D2H
Stream 4: H2D ---> KER ---> D2H
继续增加并发CUDA流数量n后的极限时间是:(n+2)/(3n) 约为 1/3
代码:
kernel-transfer.cu
#include "error.cuh"
#include <math.h>
#include <stdio.h>

#ifdef USE_DP
typedef double real;
#else
typedef float real;
#endif

const int NUM_REPEATS = 10;           // timing repetitions (first one is discarded as warm-up)
const int N = 1 << 22;                // total number of elements: 2^22
const int M = sizeof(real) * N;       // total size of each array in bytes
const int MAX_NUM_STREAMS = 128;      // largest number of CUDA streams tested
cudaStream_t streams[MAX_NUM_STREAMS];

// Runs one timed experiment that splits the H2D -> kernel -> D2H pipeline
// across `num` concurrent non-default streams and prints the average time.
void timing(const real *h_x, const real *h_y, real *h_z,
            real *d_x, real *d_y, real *d_z, const int num);

int main(void)
{
    // Pinned (page-locked) host memory is required for cudaMemcpyAsync to be
    // truly asynchronous and overlap with kernel execution.
    real *h_x, *h_y, *h_z;
    CHECK(cudaMallocHost(&h_x, M));
    CHECK(cudaMallocHost(&h_y, M));
    CHECK(cudaMallocHost(&h_z, M));
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 1.23;
        h_y[n] = 2.34;
    }

    real *d_x, *d_y, *d_z;
    CHECK(cudaMalloc(&d_x, M));
    CHECK(cudaMalloc(&d_y, M));
    CHECK(cudaMalloc(&d_z, M));

    for (int i = 0; i < MAX_NUM_STREAMS; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));  // create the i-th CUDA stream
    }

    // Test with 1, 2, 4, 8, 16, 32, 64, 128 concurrent streams in turn.
    for (int num = 1; num <= MAX_NUM_STREAMS; num *= 2)
    {
        timing(h_x, h_y, h_z, d_x, d_y, d_z, num);
    }

    for (int i = 0; i < MAX_NUM_STREAMS; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));  // destroy the i-th CUDA stream
    }

    // Pinned memory must be released with cudaFreeHost(), never with free().
    CHECK(cudaFreeHost(h_x));
    CHECK(cudaFreeHost(h_y));
    CHECK(cudaFreeHost(h_z));
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));
    return 0;
}

// Element-wise z[n] = x[n] + y[n], guarded against the grid tail (n >= N).
void __global__ add(const real *x, const real *y, real *z, int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    {
        // Deliberately repeat the addition 400 times so that kernel execution
        // time is comparable to the transfer time, making the overlap visible.
        for (int i = 0; i < 400; ++i)
        {
            z[n] = x[n] + y[n];
        }
    }
}

void timing(const real *h_x, const real *h_y, real *h_z,
            real *d_x, real *d_y, real *d_z, const int num)
{
    const int N1 = N / num;  // elements per stream (num is a power of two dividing N)
    const int M1 = M / num;  // bytes per stream
    float t_sum = 0;
    float t2_sum = 0;

    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        cudaEvent_t start, stop;
        CHECK(cudaEventCreate(&start));
        CHECK(cudaEventCreate(&stop));
        CHECK(cudaEventRecord(start));
        cudaEventQuery(start);  // deliberately unchecked: may return cudaErrorNotReady

        for (int i = 0; i < num; i++)
        {
            const int offset = i * N1;
            // H2D: asynchronous host-to-device copies issued into stream i.
            CHECK(cudaMemcpyAsync(d_x + offset, h_x + offset, M1,
                                  cudaMemcpyHostToDevice, streams[i]));
            CHECK(cudaMemcpyAsync(d_y + offset, h_y + offset, M1,
                                  cudaMemcpyHostToDevice, streams[i]));
            // KER: launch the kernel in the same stream.
            const int block_size = 128;
            const int grid_size = (N1 - 1) / block_size + 1;
            add<<<grid_size, block_size, 0, streams[i]>>>(
                d_x + offset, d_y + offset, d_z + offset, N1);
            CHECK(cudaGetLastError());  // catch kernel launch-configuration errors
            // D2H: asynchronous device-to-host copy of the result in stream i.
            CHECK(cudaMemcpyAsync(h_z + offset, d_z + offset, M1,
                                  cudaMemcpyDeviceToHost, streams[i]));
        }

        CHECK(cudaEventRecord(stop));
        CHECK(cudaEventSynchronize(stop));
        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        if (repeat > 0)  // discard the first (warm-up) measurement
        {
            t_sum += elapsed_time;
            t2_sum += elapsed_time * elapsed_time;
        }
        CHECK(cudaEventDestroy(start));
        CHECK(cudaEventDestroy(stop));
    }

    const float t_ave = t_sum / NUM_REPEATS;
    // Standard error of the mean; computed but not printed (kept for reference).
    const float t_err = sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave);
    printf("%d %g\n", num, t_ave);
}
头文件error.cuh为错误检查宏CHECK函数:
#pragma once
#include <stdio.h>
#include <stdlib.h>  // for exit(); do not rely on <stdio.h> pulling it in

// Error-checking macro: wrap every CUDA runtime API call with CHECK(...).
// On failure it prints the file, line, numeric code, and human-readable
// message, then terminates the program. The do { } while (0) wrapper makes
// the macro expand to a single statement, so it is safe inside if/else.
#define CHECK(call)                                        \
do                                                         \
{                                                          \
    const cudaError_t error_code = call;                   \
    if (error_code != cudaSuccess)                         \
    {                                                      \
        printf("CUDA Error:\n");                           \
        printf(" File: %s\n", __FILE__);                   \
        printf(" Line: %d\n", __LINE__);                   \
        printf(" Error code: %d\n", error_code);           \
        printf(" Error text: %s\n",                        \
            cudaGetErrorString(error_code));               \
        exit(1);                                           \
    }                                                      \
} while (0)
编译并运行:
$ nvcc kernel-transfer.cu -o kernel-transfer
$ ./kernel-transfer > k-t.dat
对 k-t.dat 作图:
使用约20个非默认CUDA流并发重叠核函数执行和数据传输,有着最大的性能,得到了约2倍的加速比。
不可能达到理论上3倍的加速比的原因:
1. 从主机到GPU的数据传输量(d_x,d_y)是从GPU到主机的数据传输量(d_z)的两倍。
2. 当CUDA流的数量增加,会带来额外的开销。
更多推荐
GPU编程 CUDA C++ 使用多个非默认CUDA流并发重叠核函数执行和数据传输
发布评论