一个不错的关于CPU和GPU（CUDA）的性能比较讨论话题

编程入门行业动态更新时间:2024-10-14 10:39:13

一个不错的关于CPU和GPU（CUDA）的<a href="https://www.elefans.com/category/jswz/34/1771266.html">性能</a>比较讨论话题

一个不错的关于CPU和GPU（CUDA）的性能比较讨论话题

.html
这是今天论坛上的一个帖子，大家可以讨论一下：）
1. 哪些程序适合用 CPU 来做，哪些适合用 GPU 来做？
2. 如果用 GPU 来做，需要注意哪些东西？
3. 如果需要优化，有哪些思路？：）

在楼主（lz）代码的基础上做了一些改动，大家可以自己测试一下，就知道哪些工作适合用 CPU 做，哪些适合用 GPU 来做。
可以把代码里的 LOOP_ADD_TIME 依次改为 1 -> 10 -> 100 -> 1000 -> 10000 ……
每个取值各测一次，看看最后的效果怎么样，还可以把结果画成一条曲线图：）

过一段时间，还可以把这个代码再修改一下，添加更多的内容进去，再看看两者的效果怎么样：）

C/C++ code

   
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cutil.h>

// Number of additions applied to every array element: the GPU kernel and the
// CPU loops in this file each add 1, 2, ..., LOOP_ADD_TIME to every element.
// Raise it (1, 10, 100, 1000, ...) to increase the arithmetic done per element.
// NOTE(review): the elements are plain int; for very large values (e.g. 100000,
// whose 1..N sum is about 5e9) the per-element total would not fit a 32-bit
// int -- confirm the intended range before going that high.
#define LOOP_ADD_TIME 100

// GPU kernel: each thread adds 1, 2, ..., LOOP_ADD_TIME to exactly one element
// of d_a. Despite its name, nothing is reversed.
//
// The element handled by a thread is its global index in a 1D launch
// (blockDim.x * blockIdx.x + threadIdx.x). There is no bounds check, so the
// launch must not create more threads than d_a has elements.
__global__ void reverseArrayBlock( int*d_a)
{
    const int elem = threadIdx.x + blockIdx.x*blockDim.x;
    int *slot = d_a + elem;

    // Accumulate directly into the array element, one addend per iteration.
    int addend = 1;
    while (addend <= LOOP_ADD_TIME)
    {
        *slot += addend;
        ++addend;
    }
}

/*
 * Runs the LOOP_ADD_TIME accumulation on the GPU, prints the elapsed time and
 * verifies the result against values computed on the host.
 *
 * The timed region (clock()) spans the host-to-device copy, the kernel launch
 * and the device-to-host copy, so the reported time includes both transfers,
 * not just the kernel.
 *
 * Returns 0 when every element matches the host reference, and 1 on host
 * allocation failure, kernel launch failure, or a result mismatch.
 */
int gpu_test()
{
    clock_t start, finish;
    double duration;

    // number of int elements to process: 512 * 21056 = 10,780,672
    int dimA = 512*21056;

    // host and device buffers
    int *h_a;
    int *d_a;

    // define grid and block size
    int numThreadsPerBlock = 512;

    // Round up so the grid still covers every element if dimA is ever changed
    // to a value that is not a multiple of the block size.
    int numBlocks = (dimA + numThreadsPerBlock - 1) / numThreadsPerBlock;
    printf("blocks: %d\n", numBlocks);

    // Both buffers are sized for the whole grid (>= dimA elements): the kernel
    // lets every launched thread update its own element without a bounds
    // check, so each thread needs a valid slot.
    const int totalElems = numBlocks * numThreadsPerBlock;
    size_t memSize = (size_t)totalElems * sizeof(int);

    h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        printf("gpu_test: failed to allocate %lu bytes of host memory\n",
               (unsigned long)memSize);
        return 1;
    }
    CUDA_SAFE_CALL(cudaMalloc( (void **) &d_a, memSize ));

    // Initialize the input array on the host (including any padding at the
    // tail, so no uninitialized bytes are copied to the device).
    for (int i = 0; i < totalElems; ++i)
    {
        h_a[i] = i;
    }

    start = clock();

    // Copy host array to device array
    CUDA_SAFE_CALL(cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice ));

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock <<< dimGrid, dimBlock >>>( d_a );

    // A kernel launch returns no status itself; an invalid launch
    // configuration is only reported through cudaGetLastError().
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        printf("gpu_test: kernel launch failed: %s\n",
               cudaGetErrorString(launchStatus));
        cudaFree(d_a);
        free(h_a);
        return 1;
    }

    // Device to host copy. This cudaMemcpy blocks until the kernel has
    // finished, so the kernel's run time is inside the timed region.
    CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost ));

    finish = clock();
    duration = (double)(finish - start)*1000 / CLOCKS_PER_SEC;
    printf( "gpu time is %f ms\n", duration );

    // Host reference: every element starts at its own index and receives the
    // same 1 + 2 + ... + LOOP_ADD_TIME, so that sum is computed once instead
    // of per element, and no second reference buffer is needed.
    int expectedOffset = 0;
    for (int k = 1; k <= LOOP_ADD_TIME; k++)
    {
        expectedOffset += k;
    }

    // Count mismatches and report once rather than printing one line per
    // bad element (there are more than ten million elements).
    int mismatches = 0;
    for (int j = 0; j < dimA; ++j)
    {
        if (h_a[j] != j + expectedOffset)
        {
            ++mismatches;
        }
    }
    if (mismatches != 0)
    {
        printf("error! %d of %d elements differ from the host reference\n",
               mismatches, dimA);
    }

    // free host memory
    free(h_a);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_a));

    return (mismatches != 0) ? 1 : 0;
}

////
// CPU baseline
////
/*
 * Runs the same LOOP_ADD_TIME accumulation as the GPU test on the host and
 * prints the elapsed time measured with clock().
 *
 * Only the accumulation loop is timed; allocation and initialization are not.
 * NOTE(review): the results are never read after the timed loop, so an
 * optimizing compiler may be able to shorten or drop it -- check the build
 * flags before trusting the number.
 *
 * Returns 0 on success, 1 if the host buffer cannot be allocated.
 */
int cpu_test()
{
    clock_t start, finish;
    double duration;

    // number of int elements to process: 512 * 21056 = 10,780,672
    int dimA = 512*21056;

    // allocate host memory, sized from dimA so the two cannot drift apart
    size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        printf("cpu_test: failed to allocate %lu bytes of host memory\n",
               (unsigned long)memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    start = clock();
    for( int j=0; j < dimA ; ++j )
    {
        // add 1, 2, ..., LOOP_ADD_TIME to this element
        for(int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a[j] += k;
        }
    }

    finish = clock();
    duration = (double)(finish - start)*1000 / CLOCKS_PER_SEC;
    printf( "cpu time is %f ms\n", duration );

    // free host memory
    free(h_a);

    return 0;
}
////
// Program entry point: runs the GPU benchmark, then the CPU benchmark.
////
int main( int argc, char** argv)
{
    // cutil macro (defined in cutil.h, not visible here); presumably selects
    // and initializes the CUDA device -- TODO confirm.
    CUT_DEVICE_INIT(argc, argv);

    // The status codes of both benchmarks are intentionally ignored.
    (void) gpu_test();
    (void) cpu_test();

    // cutil macro (defined in cutil.h, not visible here).
    CUT_EXIT(argc, argv);

    return 0;
}