为什么Opencv GPU代码比CPU慢？(Why Opencv GPU code is slower than CPU?)

我在笔记本电脑上使用 opencv242 + VS2010。我尝试对OpenCV中的GPU模块做一些简单的测试，但结果显示GPU代码比CPU代码慢100倍。在这段代码中，我只是使用cvtColor函数将彩色图像转换为灰度图像。

下面是我的代码：PART1是CPU代码（测试CPU RGB2GRAY），PART2是将图像上传到GPU，PART3是GPU RGB2GRAY，PART4是再次执行CPU RGB2GRAY。有三件事让我很疑惑：

1. 在我的代码中，PART1耗时0.3ms，而PART4（与PART1完全相同）却耗时40ms！ 2. 将图像上传到GPU的PART2耗时6000ms！ 3. PART3（GPU代码）耗时11ms，对于这样简单的图像来说太慢了！

#include "StdAfx.h"
#include <iostream>
#include "opencv2/opencv.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/gpu/gpumat.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <ctime>
#include <windows.h>

using namespace std;
using namespace cv;
using namespace cv::gpu;

// Converts the interval between two performance-counter readings into
// milliseconds. `ticksPerSecond` is the value reported by
// QueryPerformanceFrequency.
static double ticksToMs(const LARGE_INTEGER& start, const LARGE_INTEGER& stop, double ticksPerSecond)
{
    const double tickDelta = static_cast<double>(stop.QuadPart - start.QuadPart);
    return 1000 * tickDelta / ticksPerSecond;
}

// Benchmark driver: times a BGR->gray conversion on the CPU, two uploads of
// the image to the GPU, the same conversion on the GPU, and then the CPU
// conversion once more. Each measurement is printed in milliseconds.
int main()
{
    // Read the counter frequency once; every measurement below divides by it.
    LARGE_INTEGER counterFreq;
    QueryPerformanceFrequency(&counterFreq);
    const double ticksPerSecond = static_cast<double>(counterFreq.QuadPart);

    LARGE_INTEGER tickStart;
    LARGE_INTEGER tickStop;

    cout << getCudaEnabledDeviceCount() << endl;
    Mat colorImage = imread("d:\\CUDA\\train.png", 1);

    // PART1: CPU code. From color image to grayscale image.
    // The destination Mat is declared inside the timed region on purpose,
    // so its construction is part of the measurement.
    QueryPerformanceCounter(&tickStart);
    Mat grayCpuFirst;
    cvtColor(colorImage, grayCpuFirst, CV_BGR2GRAY);
    QueryPerformanceCounter(&tickStop);
    printf("CPU RGB2GRAY running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // PART2: GPU upload image.
    // NOTE(review): this is the first GPU call that is timed, so the figure
    // may include one-time CUDA initialization cost -- confirm.
    GpuMat gpuColor;
    QueryPerformanceCounter(&tickStart);
    gpuColor.upload(colorImage);
    QueryPerformanceCounter(&tickStop);
    printf("Read image running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // Second upload: this measurement also covers re-reading the file from disk.
    GpuMat gpuColorReloaded;
    QueryPerformanceCounter(&tickStart);
    gpuColorReloaded.upload(imread("d:\\CUDA\\train.png", 1));
    QueryPerformanceCounter(&tickStop);
    printf("Read image running time 2 is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // PART3: GPU code. GPU image from color to grayscale.
    QueryPerformanceCounter(&tickStart);
    GpuMat gpuGray;
    gpu::cvtColor(gpuColor, gpuGray, CV_BGR2GRAY);
    QueryPerformanceCounter(&tickStop);
    printf("GPU RGB2GRAY running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // PART4: CPU code (again). Same conversion as PART1; the return value of
    // the closing counter query is printed before the elapsed time.
    QueryPerformanceCounter(&tickStart);
    Mat grayCpuSecond;
    cvtColor(colorImage, grayCpuSecond, CV_BGR2GRAY);
    BOOL counterOk = QueryPerformanceCounter(&tickStop);
    printf("%d \n", counterOk);
    printf("CPU RGB2GRAY running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // Keep the console open until the user presses a key.
    cvWaitKey();
    getchar();
    return 0;
}

I'm using opencv242 + VS2010 on a notebook. I tried to do some simple tests of the GPU module in OpenCV, but they showed that the GPU code is 100 times slower than the CPU code. In this code, I just convert a color image to a grayscale image using the cvtColor function.

Here is my code: PART1 is the CPU code (testing CPU RGB2GRAY), PART2 uploads the image to the GPU, PART3 is GPU RGB2GRAY, and PART4 is CPU RGB2GRAY again. There are three things that puzzle me:

1. In my code, PART1 takes 0.3 ms, while PART4 (which is exactly the same as PART1) takes 40 ms! 2. PART2, which uploads the image to the GPU, takes 6000 ms! 3. PART3 (the GPU code) takes 11 ms, which is very slow for such a simple image!

#include "StdAfx.h"
#include <iostream>
#include "opencv2/opencv.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/gpu/gpumat.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <ctime>
#include <windows.h>

using namespace std;
using namespace cv;
using namespace cv::gpu;

// Converts the interval between two performance-counter readings into
// milliseconds. `ticksPerSecond` is the value reported by
// QueryPerformanceFrequency.
static double ticksToMs(const LARGE_INTEGER& start, const LARGE_INTEGER& stop, double ticksPerSecond)
{
    const double tickDelta = static_cast<double>(stop.QuadPart - start.QuadPart);
    return 1000 * tickDelta / ticksPerSecond;
}

// Benchmark driver: times a BGR->gray conversion on the CPU, two uploads of
// the image to the GPU, the same conversion on the GPU, and then the CPU
// conversion once more. Each measurement is printed in milliseconds.
int main()
{
    // Read the counter frequency once; every measurement below divides by it.
    LARGE_INTEGER counterFreq;
    QueryPerformanceFrequency(&counterFreq);
    const double ticksPerSecond = static_cast<double>(counterFreq.QuadPart);

    LARGE_INTEGER tickStart;
    LARGE_INTEGER tickStop;

    cout << getCudaEnabledDeviceCount() << endl;
    Mat colorImage = imread("d:\\CUDA\\train.png", 1);

    // PART1: CPU code. From color image to grayscale image.
    // The destination Mat is declared inside the timed region on purpose,
    // so its construction is part of the measurement.
    QueryPerformanceCounter(&tickStart);
    Mat grayCpuFirst;
    cvtColor(colorImage, grayCpuFirst, CV_BGR2GRAY);
    QueryPerformanceCounter(&tickStop);
    printf("CPU RGB2GRAY running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // PART2: GPU upload image.
    // NOTE(review): this is the first GPU call that is timed, so the figure
    // may include one-time CUDA initialization cost -- confirm.
    GpuMat gpuColor;
    QueryPerformanceCounter(&tickStart);
    gpuColor.upload(colorImage);
    QueryPerformanceCounter(&tickStop);
    printf("Read image running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // Second upload: this measurement also covers re-reading the file from disk.
    GpuMat gpuColorReloaded;
    QueryPerformanceCounter(&tickStart);
    gpuColorReloaded.upload(imread("d:\\CUDA\\train.png", 1));
    QueryPerformanceCounter(&tickStop);
    printf("Read image running time 2 is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // PART3: GPU code. GPU image from color to grayscale.
    QueryPerformanceCounter(&tickStart);
    GpuMat gpuGray;
    gpu::cvtColor(gpuColor, gpuGray, CV_BGR2GRAY);
    QueryPerformanceCounter(&tickStop);
    printf("GPU RGB2GRAY running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // PART4: CPU code (again). Same conversion as PART1; the return value of
    // the closing counter query is printed before the elapsed time.
    QueryPerformanceCounter(&tickStart);
    Mat grayCpuSecond;
    cvtColor(colorImage, grayCpuSecond, CV_BGR2GRAY);
    BOOL counterOk = QueryPerformanceCounter(&tickStop);
    printf("%d \n", counterOk);
    printf("CPU RGB2GRAY running time is %.2f ms\n\n",
           ticksToMs(tickStart, tickStop, ticksPerSecond));

    // Keep the console open until the user presses a key.
    cvWaitKey();
    getchar();
    return 0;
}

最满意答案

cvtColor并没有做太多的工作：要得到灰度值，只需要对三个数取平均。

CPU上的cvtColor代码使用SSE2指令一次处理多达8个像素；如果您有TBB，它还会使用所有的内核/超线程；CPU的时钟速度是GPU的10倍；最后，您也不必把数据复制到GPU再复制回来。

cvtColor isn't doing very much work: to make grey, all you have to do is average three numbers.

The cvtColor code on the CPU uses SSE2 instructions to process up to 8 pixels at once, and if you have TBB it uses all the cores/hyperthreads. The CPU is also running at 10x the clock speed of the GPU, and finally, you don't have to copy data onto the GPU and back.

更多推荐

为什么Opencv GPU代码比CPU慢？(Why Opencv GPU code is slower than CPU?)

最满意答案

发布评论取消回复

最近发表

热门文章

标签列表