一个不错的关于CPU和GPU(CUDA)的性能比较讨论话题
这里是今天论坛上的一个帖子,大家可以讨论一下:)
1.哪些程序适合用CPU来做,哪些适合用GPU来做
2.如果用GPU来做,需要注意哪些东西
3.如果需要优化,需要哪些思路:)
在lz的代码的基础上做了一些变化,大家可以自己测试一下,就知道哪些工作适合用CPU做,哪些适合用GPU来做。
这里面的LOOP_ADD_TIME 从1->10->100->1000->10000....
大家可以做一个测试,看看最后的效果是怎么样的,可以画一个曲线图出来:)
过一段时间,还可以把这个代码再修改一下,添加更多的内容进去,再看看两者的效果怎么样:)
- C/C++ code
-
#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <cutil.h>
// Simple utility function to check for CUDA runtime errors
//void checkCUDAError(const char* msg)
// Number of additions applied to every array element, used by the kernel and
// by both host-side loops (each adds 1, 2, ..., LOOP_ADD_TIME to the element).
// Vary it (1, 10, 100, 1000, ...) to see how the GPU/CPU timing changes as the
// arithmetic work per element grows.
// The accumulated sum LOOP_ADD_TIME*(LOOP_ADD_TIME+1)/2 is added to an int that
// starts at up to 512*21056-1, so values above roughly 65000 overflow int.
#define LOOP_ADD_TIME 100
// Kernel: each thread adds 1 + 2 + ... + LOOP_ADD_TIME to one element of d_a.
// (Despite the name, nothing is reversed.)
//
// Launch layout: 1-D grid of 1-D blocks. A thread updates the element at
// blockDim.x * blockIdx.x + threadIdx.x. There is no bounds check, so
// gridDim.x * blockDim.x must not exceed the number of ints behind d_a.
__global__ void reverseArrayBlock( int* d_a)
{
    const int elem = blockDim.x * blockIdx.x + threadIdx.x;

    // Same sequence of read-modify-write updates as a counted for-loop.
    int step = 1;
    while (step <= LOOP_ADD_TIME)
    {
        d_a[elem] += step;
        ++step;
    }
}
// GPU benchmark.
//
// Fills an int array with 0..dimA-1, then times (with clock()) the sequence
// host->device copy, kernel launch, device->host copy. Afterwards the same
// computation is repeated on the host and the two results are compared.
//
// Output: the block count, the elapsed time in milliseconds and, if the
// comparison fails, one summary line with the number of differing elements.
//
// Returns 0 on success; 1 if a host allocation fails, the kernel launch is
// rejected, or the GPU result differs from the host reference.
int gpu_test()
{
    clock_t start, finish;
    double duration;

    // 512 * 21056 = 10,780,672 ints (about 41 MB)
    const int dimA = 512*21056;

    // define grid and block size
    const int numThreadsPerBlock = 512;

    // The kernel has no bounds check, so the array must be covered by an
    // exact number of full blocks.
    assert(dimA % numThreadsPerBlock == 0);
    const int numBlocks = dimA / numThreadsPerBlock;
    printf("blocks: %d\n", numBlocks);

    // allocate host buffers: h_a receives the GPU result, h_a2 the reference
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a  = (int *) malloc(memSize);
    int *h_a2 = (int *) malloc(memSize);
    if (h_a == NULL || h_a2 == NULL)
    {
        printf("host allocation of %lu bytes failed\n", (unsigned long)memSize);
        free(h_a2);
        free(h_a);
        return 1;
    }

    // allocate device memory
    int *d_a = NULL;
    CUDA_SAFE_CALL(cudaMalloc( (void **) &d_a, memSize ));

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // ---- timed region: upload, compute, download ----
    start = clock();

    // Copy host array to device array
    CUDA_SAFE_CALL(cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice ));

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock <<< dimGrid, dimBlock >>>( d_a );

    // A launch does not return an error code itself; an invalid
    // configuration only shows up through cudaGetLastError().
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
    {
        printf("kernel launch failed: %s\n", cudaGetErrorString(launchErr));
        cudaFree(d_a);
        free(h_a2);
        free(h_a);
        return 1;
    }

    // device to host copy (blocking, so the kernel has finished once it returns)
    CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost ));

    finish = clock();
    // ---- end of timed region ----

    duration = (double)(finish - start)*1000 / CLOCKS_PER_SEC;
    printf( "gpu time is %f ms\n", duration );

    // Build the reference result on the host with the same per-element loop.
    for (int i = 0; i < dimA; ++i)
    {
        h_a2[i] = i;
    }
    for( int j=0; j < dimA ; ++j )
    {
        for(int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a2[j] += k;
        }
    }

    // Count mismatches instead of printing one line per bad element, which
    // could flood the console with millions of lines.
    int mismatches = 0;
    for( int j=0; j < dimA ; ++j )
    {
        if (h_a[j] != h_a2[j])
        {
            ++mismatches;
        }
    }
    if (mismatches != 0)
    {
        printf("error! %d of %d elements differ from the CPU reference\n",
               mismatches, dimA);
    }

    // free host memory
    free(h_a2);
    free(h_a);
    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_a));

    return (mismatches == 0) ? 0 : 1;
}
////
// CPU benchmark
////
// CPU benchmark.
//
// Fills an int array with 0..dimA-1 and times (with clock()) a host loop that
// adds 1, 2, ..., LOOP_ADD_TIME to every element. Prints a blank line followed
// by the elapsed time in milliseconds.
//
// Returns 0 on success, 1 if the host allocation fails.
int cpu_test()
{
    clock_t start, finish;
    double duration;

    // 512 * 21056 = 10,780,672 ints (about 41 MB)
    const int dimA = 512*21056;

    // allocate host memory
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        printf("host allocation of %lu bytes failed\n", (unsigned long)memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // NOTE(review): h_a is never read after this loop, so an optimising
    // compiler may simplify or drop the work being timed - check the build
    // flags before comparing against the GPU number.
    start = clock();
    for( int j=0; j < dimA ; ++j )
    {
        for(int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a[j] += k;
        }
    }
    finish = clock();

    duration = (double)(finish - start)*1000 / CLOCKS_PER_SEC;
    printf( "cpu time is %f ms\n", duration );

    // free host memory
    free(h_a);
    return 0;
}
////
// Program main
////
// Entry point: runs the GPU benchmark, then the CPU benchmark.
// The return values of both benchmarks are ignored.
int main( int argc, char** argv)
{
    // cutil helper macro; presumably selects and initialises a CUDA device -
    // confirm against cutil.h.
    CUT_DEVICE_INIT(argc, argv);

    (void) gpu_test();
    (void) cpu_test();

    // cutil helper macro for program shutdown - confirm exact behaviour in cutil.h.
    CUT_EXIT(argc, argv);
    return 0;
}
更多推荐
一个不错的关于CPU和GPU(CUDA)的性能比较讨论话题
发布评论