深度学习完全攻略！（连载五：GPU加速技术指南）

编程入门行业动态更新时间:2024-10-07 19:26:50

深度学习完全攻略！（连载五：GPU加速技术指南）

本文已同步至公众号，欢迎订阅。

第四章 cm编译器

这一章，我们就用一个例子来说明cm的client和server是如何联系到一起，并最终运行的。这里以高斯模糊为例，此例也是intel提供的一个案例，但是没有说明怎么用。哈哈哈。

第一节建一个client的程序

假设文件名字为gauss_client.cpp

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

#include "cm_rt.h"
#include "common/bitmap_helpers.h"
#include "common/cm_rt_helpers.h"
#include "common/isa_helpers.h"

using cm::util::bitmap::BitMap;
// Defines the number of columns per thread.
#define NUM_COLS_PER_THREAD 8
// Defines the number of rows per thread.
#define NUM_ROWS_PER_THREAD 8// Declares coefficients for gaussian filter.
float a0 = 0, a1 = 0, a2 = 0, a3 = 0, b1 = 0, b2 = 0, coefp = 0, coefn = 0;// This function is used to computes coefficients for gaussian filter.
void CalculateCoefficients(float sigma, int order) {const float nsigma = sigma < 0.1f ? 0.1f : sigma;const float alpha = 1.695f / nsigma;const float ema = (float)exp(-alpha);const float ema2 = (float)exp(-2 * alpha);b1 = -2 * ema;b2 = ema2;switch (order) {case 0: {const float k = (1 - ema) * (1 - ema) / (1 + 2 * alpha * ema - ema2);a0 = k;a1 = k * (alpha - 1) * ema;a2 = k * (alpha + 1) * ema;a3 = -k * ema2;} break;case 1: {const float k = (1 - ema) * (1 - ema) / ema;a0 = k * ema;a1 = a3 = 0;a2 = -a0;} break;case 2: {const floatea = (float)exp(-alpha),k = -(ema2 - 1) / (2 * alpha * ema),kn = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) /(3 * ea + 1 + 3 * ea * ea + ea * ea * ea));a0 = kn;a1 = -kn * (1 + k * alpha) * ema;a2 = kn * (1 - k * alpha) * ema;a3 = -kn * ema2;} break;default:fprintf(stderr, "gaussianFilter: invalid order parameter!\n");return;}coefp = (a0 + a1) / (1 + b1 + b2);coefn = (a2 + a3) / (1 + b1 + b2);printf("Coefficients are: \n");printf(" a0 = %f, a1 = %f, a2 = %f, a3 = %f, b1 = %f, b2 = %f\n", a0, a1, a2, a3, b1, b2);
}int main(int argc, char *argv[]) {// Loads an input image named "lena.bmp".auto input_image = BitMap::load("lena.bmp");// Gets the width and height of the input image.unsigned int width = input_image.getWidth();unsigned int height = input_image.getHeight();printf("image width = %d, height = %d\n", width, height);// Checks the value of width, height and bpp(bits per pixel) of the image.// Only images in 8-bit RGB format are supported.// Only images with width and height a multiple of 8 are supported.if (width & 7 || height & 7 || input_image.getBPP() != 24) {std::cerr << "Error: Only images in 8-bit RGB format with width and "<< "height a multiple of 8 are supported.\n";std::exit(1);}// Copies input image to output except for the data.auto output_image = input_image;// Sets image size in bytes. There are a total of width*height pixels and// each pixel occupies (out.getBPP()/8) bytes.unsigned int img_size = width * height * output_image.getBPP() / 8;// Sets output to blank image.output_image.setData(new unsigned char[img_size]);// Allocates system memory for rgb_to_rgba to convert image format from// RGB to RGBA.// Allocates system memory for rgba_to_rgb to convert image format from// RGBA to RGB.unsigned int num_pixels = width * height;unsigned char *rgb_to_rgba = new unsigned char[num_pixels * 4];unsigned char *rgba_to_rgb = new unsigned char[num_pixels * 4];// Converts image format from RGB to RGBA.// Copies the RGB values from the image, set the 4th byte with zero.for (int i = 0; i < num_pixels; i++) {rgb_to_rgba[i * 4] = input_image.getData()[i * 3];rgb_to_rgba[i * 4 + 1] = input_image.getData()[i * 3 + 1];rgb_to_rgba[i * 4 + 2] = input_image.getData()[i * 3 + 2];rgb_to_rgba[i * 4 + 3] = 0;}// Computes coefficients for gaussian filter.float sigma = 10.0f;int order = 0;CalculateCoefficients(sigma, order);// Creates a CmDevice from scratch.// Param device: pointer to the CmDevice object.// Param version: CM API version supported by the runtime library.CmDevice 
*device = nullptr;unsigned int version = 0;cm_result_check(::CreateCmDevice(device, version));// The file gaussian_blur_test_genx.isa is generated when the kernels in the// file gaussian_blur_test_genx.cpp are compiled by the CM compiler.// Reads in the virtual ISA from "gaussian_blur_test_genx.isa" to the code// buffer.std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");if (isa_code.size() == 0) {std::cerr << "Error: empty ISA binary.\n";std::exit(1);}// Creates a CmProgram object consisting of the kernels loaded from the code// buffer.// Param isa_code.data(): Pointer to the code buffer containing the virtual// ISA.// Param isa_code.size(): Size in bytes of the code buffer containing the// virtual ISA.CmProgram *program = nullptr;cm_result_check(device->LoadProgram(const_cast<char *>(isa_code.data()),isa_code.size(),program));// For vertical direction.// Creates the kernel.// Param program: CM Program from which the kernel is created.// Param "gaussianVertical": The kernel name which should be no more than 256// bytes including the null terminator.CmKernel *kernel_vertical = nullptr;cm_result_check(device->CreateKernel(program,"gaussianVertical",kernel_vertical));// Creates input surface with given width and height in pixels and format.CmSurface2D *input_surface = nullptr;cm_result_check(device->CreateSurface2D(4 * width,height,CM_SURFACE_FORMAT_A8,input_surface));// Copies system memory content to the input surface using the CPU. The// system memory content is the data of the input image in RGBA format.// The size of data copied is the size of data in the rgb_to_rgba.cm_result_check(input_surface->WriteSurface(rgb_to_rgba, nullptr));// Creates the temp surface. 
The width, height and format is the same as// the input surface.// The temp surface contains the output of kernel_vertical.CmSurface2D *temp_surface = nullptr;cm_result_check(device->CreateSurface2D(4 * width,height,CM_SURFACE_FORMAT_A8,temp_surface));// When a surface is created by the CmDevice a SurfaceIndex object is// created. This object contains a unique index value that is mapped to the// surface.// Gets the input surface index.SurfaceIndex *input_surface_idx = nullptr;cm_result_check(input_surface->GetIndex(input_surface_idx));// Gets the temp surface index.SurfaceIndex *temp_surface_idx = nullptr;cm_result_check(temp_surface->GetIndex(temp_surface_idx));// Sets a per kernel argument.// Sets input surface index as the first argument of kernel_vertical.// Sets temp surface index as the second argument of kernel_vertical.cm_result_check(kernel_vertical->SetKernelArg(0,sizeof(SurfaceIndex),input_surface_idx));cm_result_check(kernel_vertical->SetKernelArg(1,sizeof(SurfaceIndex),temp_surface_idx));// Sets the image width and height as the third and the fourth argument// of kernel_vertical.cm_result_check(kernel_vertical->SetKernelArg(2, 4, &width));cm_result_check(kernel_vertical->SetKernelArg(3, 4, &height));// Sets filter coefficients as the rest arguments of kernel_vertical.cm_result_check(kernel_vertical->SetKernelArg(4, 4, &a0));cm_result_check(kernel_vertical->SetKernelArg(5, 4, &a1));cm_result_check(kernel_vertical->SetKernelArg(6, 4, &a2));cm_result_check(kernel_vertical->SetKernelArg(7, 4, &a3));cm_result_check(kernel_vertical->SetKernelArg(8, 4, &b1));cm_result_check(kernel_vertical->SetKernelArg(9, 4, &b2));cm_result_check(kernel_vertical->SetKernelArg(10, 4, &coefp));cm_result_check(kernel_vertical->SetKernelArg(11, 4, &coefn));// Each CmKernel can be executed by multiple concurrent threads.// Here, for "kernel_vertical" kernel, each thread works on// NUM_COLS_PER_THREAD columns in vertical direction.int thread_width = width / NUM_COLS_PER_THREAD;// 
Creates a CmThreadSpace object.// There are two usage models for the thread space. One is to define the// dependency between threads to run in the GPU. The other is to define a// thread space where each thread can get a pair of coordinates during// kernel execution. For this example, we use the latter usage model.CmThreadSpace *thread_space = nullptr;cm_result_check(device->CreateThreadSpace(thread_width,1,thread_space));// Creates a task queue.// The CmQueue is an in-order queue. Tasks get executed according to the// order they are enqueued. The next task does not start execution until the// current task finishes.CmQueue *cmd_queue = nullptr;cm_result_check(device->CreateQueue(cmd_queue));// Creates a CmTask object.// The CmTask object is a container for CmKernel pointers. It is used to// enqueue the kernels for execution.CmTask *task = nullptr;cm_result_check(device->CreateTask(task));// Adds a CmKernel pointer to CmTask.// This task has one kernel.cm_result_check(task->AddKernel(kernel_vertical));// Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the// function returns immediately without waiting for the GPU to start or// finish execution of the task. The runtime will query the HW status. 
If// the hardware is not busy, the runtime will submit the task to the// driver/HW; otherwise, the runtime will submit the task to the driver/HW// at another time.// An event, "sync_event", is created to track the status of the task.CmEvent *sync_event = nullptr;cm_result_check(cmd_queue->Enqueue(task,sync_event,thread_space));// Destroys a CmTask object.// CmTask will be destroyed when CmDevice is destroyed.// Here, the application destroys the CmTask object by itself.cm_result_check(device->DestroyTask(task));// For horizontal direction.// Creates the kernel.// Param program: CM Program from which the kernel is created.// Param "gaussianHorizontal": The kernel name which should be no more than// 256 bytes including the null terminator.CmKernel *kernel_horizontal = nullptr;cm_result_check(device->CreateKernel(program,"gaussianHorizontal",kernel_horizontal));// Creates the output surface. The width, height and format is the same as// the input surface.// The output surface contains the output of kernel_horizontal.CmSurface2D *output_surface = nullptr;cm_result_check(device->CreateSurface2D(4 * width,height,CM_SURFACE_FORMAT_A8,output_surface));// Gets the output surface index.SurfaceIndex *output_surface_idx = nullptr;cm_result_check(output_surface->GetIndex(output_surface_idx));// Sets a per kernel argument.// Sets the output of kernel_vertical as the input of kernel_horizontal.// Sets temp surface index as the first argument of kernel_horizontal.// Sets output surface index as the second argument of kernel_horizontal.cm_result_check(kernel_horizontal->SetKernelArg(0,sizeof(SurfaceIndex),temp_surface_idx));cm_result_check(kernel_horizontal->SetKernelArg(1,sizeof(SurfaceIndex),output_surface_idx));// Sets the image width and height as the third and the fourth argument// of kernel_horizontal.cm_result_check(kernel_horizontal->SetKernelArg(2, 4, &width));cm_result_check(kernel_horizontal->SetKernelArg(3, 4, &height));// Sets filter coefficients as the rest arguments 
of kernel_horizontal.cm_result_check(kernel_horizontal->SetKernelArg(4, 4, &a0));cm_result_check(kernel_horizontal->SetKernelArg(5, 4, &a1));cm_result_check(kernel_horizontal->SetKernelArg(6, 4, &a2));cm_result_check(kernel_horizontal->SetKernelArg(7, 4, &a3));cm_result_check(kernel_horizontal->SetKernelArg(8, 4, &b1));cm_result_check(kernel_horizontal->SetKernelArg(9, 4, &b2));cm_result_check(kernel_horizontal->SetKernelArg(10, 4, &coefp));cm_result_check(kernel_horizontal->SetKernelArg(11, 4, &coefn));// Each CmKernel can be executed by multiple concurrent threads.// Here, for "kernel_horizontal" kernel, each thread works on// NUM_ROWS_PER_THREAD rows in horizontal direction.int thread_height = height / NUM_ROWS_PER_THREAD;// Creates a CmThreadSpace object.// There are two usage models for the thread space. One is to define the// dependency between threads to run in the GPU. The other is to define a// thread space where each thread can get a pair of coordinates during// kernel execution. For this example, we use the latter usage model.cm_result_check(device->CreateThreadSpace(thread_height,1,thread_space));// Creates a CmTask object.// The CmTask object is a container for CmKernel pointers. It is used to// enqueue the kernels for execution.cm_result_check(device->CreateTask(task));// Adds a CmKernel pointer to CmTask.// This task has one kernels.cm_result_check(task->AddKernel(kernel_horizontal));// Launches the task on the GPU. Enqueue is a non-blocking call, i.e. the// function returns immediately without waiting for the GPU to start or// finish execution of the task. The runtime will query the HW status. 
If// the hardware is not busy, the runtime will submit the task to the// driver/HW; otherwise, the runtime will submit the task to the driver/HW// at another time.// An event, "sync_event", is created to track the status of the task.cm_result_check(cmd_queue->Enqueue(task,sync_event,thread_space));// Destroys a CmTask object.// CmTask will be destroyed when CmDevice is destroyed.// Here, the application destroys the CmTask object by itself.cm_result_check(device->DestroyTask(task));// Reads the output surface content to the system memory using the CPU.// The size of data copied is the size of data in Surface.// It is a blocking call. The function will not return until the copy// operation is completed.// The dependent event "sync_event" ensures that the reading of the surface// will not happen until its state becomes CM_STATUS_FINISHED.cm_result_check(output_surface->ReadSurface(rgba_to_rgb,sync_event));// Destroys the CmDevice.// Also destroys surfaces, kernels, tasks, thread spaces, and queues that// were created using this device instance that have not explicitly been// destroyed by calling the respective destroy functions.cm_result_check(::DestroyCmDevice(device));// Converts image format from RGBA to RGB.unsigned char *tmp = new unsigned char[num_pixels * 3];for (int i = 0; i < num_pixels; i++) {tmp[i * 3] = rgba_to_rgb[i * 4];tmp[i * 3 + 1] = rgba_to_rgb[i * 4 + 1];tmp[i * 3 + 2] = rgba_to_rgb[i * 4 + 2];}output_image.setData(tmp);// Saves the output image data into the file "blur_out.bmp".output_image.save("blur_out.bmp");// Frees memory.delete[] rgb_to_rgba;delete[] rgba_to_rgb;// Checks result.if (BitMap::checkResult("blur_out.bmp","blur_gold.bmp",5)) {std::cout << "PASSED" << std::endl;return 0;} else {std::cout << "FAILED" << std::endl;return -1;}
}

第二节建一个server的程序

假设文件名字为gauss_genx.cpp

#include <cm/cm.h>
#define NUM_COMPONENTS 4
// number of rows we read in at once
#define NUM_ROWS_PER_ITER 8
// number of columns per thread
#define NUM_COLS_PER_THREAD 8
#define SIMD_SIZE (NUM_COLS_PER_THREAD * NUM_COMPONENTS)#define CLAMP_TO_EDGE 1// for horizontal direction
// number of rows per thread
#define NUM_ROWS_PER_THREAD 8
// number of columns we read in at once
#define NUM_COLS_PER_ITER 8// Each thread processes 32 columns independently
// For now assume height is divisible by 8
extern "C" _GENX_MAIN_ void
gaussianVertical( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn )
{matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> image;matrix<uchar, NUM_ROWS_PER_ITER, SIMD_SIZE> outImage;vector<float, SIMD_SIZE> in;vector<float, SIMD_SIZE> out;vector<float, SIMD_SIZE> inMinusOne;vector<float, SIMD_SIZE> outMinusOne;vector<float, SIMD_SIZE> outMinusTwo;uint id = get_thread_origin_x();#if CLAMP_TO_EDGEmatrix<uchar, 1, SIMD_SIZE> firstRow;read( INBUF, id * SIMD_SIZE, 0, firstRow);inMinusOne = firstRow;inMinusOne *= 1/255.0f;outMinusTwo = coefp * inMinusOne;outMinusOne = outMinusTwo;
#elseinMinusOne = 0;outMinusOne = 0;outMinusTwo = 0;
#endif//read in 8 rows at a timefor( int i = 0; i < height; i += NUM_ROWS_PER_ITER ) {read( INBUF, id * SIMD_SIZE, i, image );#pragma unrollfor( unsigned j = 0; j < NUM_ROWS_PER_ITER; j++ ) {in = image.row(j);in *= 1/255.0f;//out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);inMinusOne = in;outMinusTwo = outMinusOne;outMinusOne = out;//clamp the value to [0,1]out = cm_add<float>(out, 0.0f, SAT);outImage.row(j) = out * 255.0f;}//write back to surfacewrite( OUTBUF, id*SIMD_SIZE, i, outImage );}vector<float, SIMD_SIZE> inPlusOne;vector<float, SIMD_SIZE> inPlusTwo;vector<float, SIMD_SIZE> outPlusOne;vector<float, SIMD_SIZE> outPlusTwo;vector<float, SIMD_SIZE> temp;#if CLAMP_TO_EDGEmatrix<uchar, 1, SIMD_SIZE> lastRow;read( INBUF, id * SIMD_SIZE, height - 1, lastRow );inPlusOne = lastRow;inPlusOne *= 1/255.0f;inPlusTwo = inPlusOne;outPlusOne = coefn * inPlusOne;outPlusTwo = outPlusOne;
#elseinPlusOne = 0;inPlusTwo = 0;outPlusOne = 0;outPlusTwo = 0;
#endif//read 8 rows at a time, in reverse directionfor( int i = height - NUM_ROWS_PER_ITER; i >= 0; i -= NUM_ROWS_PER_ITER ) {read( INBUF, id * SIMD_SIZE, i, image );read( MODIFIED(OUTBUF), id * SIMD_SIZE, i, outImage );#pragma unrollfor( int j = NUM_ROWS_PER_ITER - 1; j >= 0; j-- ) {in = image.row(j);in *= 1 / 255.0f;//temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);inPlusTwo = inPlusOne;inPlusOne = in;outPlusTwo = outPlusOne;outPlusOne = temp;out = outImage.row(j);out = cm_add<float>( out * (1/255.0f), temp, SAT );outImage.row(j) = out * 255;}//write back to surfacewrite( OUTBUF, id*SIMD_SIZE, i, outImage );}
}extern "C" _GENX_MAIN_ void
transpose( SurfaceIndex INBUF, SurfaceIndex OUTBUF, unsigned id, int width, int height ) {matrix<uint, 8, 8> in;matrix<uint, 8, 8> out;for( int i = 0; i < height; i += 8 ) {read( INBUF, id * 32, i, in );out.row(0) = in.column(0);out.row(1) = in.column(1);out.row(2) = in.column(2);out.row(3) = in.column(3);out.row(4) = in.column(4);out.row(5) = in.column(5);out.row(6) = in.column(6);out.row(7) = in.column(7);write( OUTBUF, i * 4, id * 8, out );}
}// Like gaussianVertical, except we process 8 independent rows at once
extern "C" _GENX_MAIN_ void
gaussianHorizontal( SurfaceIndex INBUF, SurfaceIndex OUTBUF, int width, int height, float a0, float a1, float a2, float a3, float b1, float b2, float coefp, float coefn )
{matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> image;matrix<uchar, NUM_ROWS_PER_THREAD, NUM_COLS_PER_ITER * NUM_COMPONENTS> outImage;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> in;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> out;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inMinusOne;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusOne;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outMinusTwo;uint id = get_thread_origin_x();#if CLAMP_TO_EDGEmatrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> firstColumn;read( MODIFIED(INBUF), 0, id * NUM_ROWS_PER_THREAD, firstColumn );inMinusOne = firstColumn;inMinusOne *= 1/255.0f;outMinusTwo = coefp * inMinusOne;outMinusOne = outMinusTwo;
#elseinMinusOne = 0;outMinusOne = 0;outMinusTwo = 0;
#endif//read 8 rows at a timefor( int i = 0; i < width; i += NUM_COLS_PER_ITER ) {read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );#pragma unrollfor( unsigned j = 0; j < NUM_COLS_PER_ITER; j++ ) {in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j * NUM_COMPONENTS);in *= 1/255.0f;//out = a0 * in + a1 * inMinusOne - b1 * outMinusOne - b2 * outMinusTwo;out = a0 * in + a1 * inMinusOne - (b1 * outMinusOne + b2 * outMinusTwo);inMinusOne = in;outMinusTwo = outMinusOne;outMinusOne = out;//clamp the value to [0,1]out = cm_add<float>( out, 0.0f, SAT ) * 255.0f;outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS) = out;}//write back to surfacewrite( OUTBUF, i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );}//reverse directionmatrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusOne;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> inPlusTwo;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusOne;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> outPlusTwo;matrix<float, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> temp;#if CLAMP_TO_EDGEmatrix<uchar, NUM_ROWS_PER_THREAD, NUM_COMPONENTS> lastColumn;read( MODIFIED(INBUF), width - NUM_COMPONENTS, id * 8, lastColumn );inPlusOne = lastColumn;inPlusOne *= 1/255.0f;inPlusTwo = inPlusOne;outPlusOne = coefn * inPlusOne;outPlusTwo = outPlusOne;
#elseinPlusOne = 0;inPlusTwo = 0;outPlusOne = 0;outPlusTwo = 0;
#endiffor( int i = width - NUM_COLS_PER_ITER; i >= 0; i -= NUM_COLS_PER_ITER ) {read( MODIFIED(INBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, image );read( MODIFIED(OUTBUF), i * NUM_COMPONENTS, id * NUM_ROWS_PER_THREAD, outImage );#pragma unrollfor( int j = NUM_COLS_PER_ITER - 1; j >= 0; j-- ) {in = image.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS);in *= 1/255.0f;//temp = a2 * inPlusOne + a3 * inPlusTwo - b1 * outPlusOne - b2 * outPlusTwo;temp = a2 * inPlusOne + a3 * inPlusTwo - (b1 * outPlusOne + b2 * outPlusTwo);inPlusTwo = inPlusOne;inPlusOne = in;outPlusTwo = outPlusOne;outPlusOne = temp;//The mul * 1 forces out to not be coalesced with outImage, so we can use SIMD16//operations instead of SIMD4out = outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS) * 1.0f;//out = outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS);out = cm_add<float>( out * (1/255.0f), temp, SAT );outImage.select<NUM_ROWS_PER_THREAD, 1, NUM_COMPONENTS, 1>(0, j*NUM_COMPONENTS) = out * 255.0f;}write( OUTBUF, i * NUM_COMPONENTS, id * 8, outImage );}
}

第三节最重要的编译

在第一节中，有一行代码是核心：

std::string isa_code = cm::util::isa::loadFile("gauss_genx.isa");

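那么这个gauss_genx.isa是哪来的呢？注意，这里的ISA指的是指令集架构（Instruction Set Architecture），gauss_genx.isa是cm编译器编译kernel之后生成的虚拟ISA二进制文件，和"ISA总线"这个名词不是一回事，用NVIDIA显卡做过深度学习的同学不要混淆，本文不展开这些。主要是告诉您，怎么由gauss_genx.cpp变为gauss_genx.isa。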

首先，你得找到cm的编译器。

（1）从

（2）安装：VS2015（或以上版本），安装python2.7，安装cygwin,安装cmake,安装unzip,安装curl

（3）打开cygwin,进入cm-compiler里面执行下面的代码：

cmake path/to/llvm/source/root

cmake --build .

（4）执行support/scripts/build.bash -s vs2015 -d -m --64

（5）找到.exe，应该是在build.64.vs2015文件夹里面。