cudaMalloc或cudaMemcpy上的分段错误(Segmentation fault on cudaMalloc or cudaMemcpy)

我是CUDA编程的新手，对于为什么以下代码会出现段错误（segfault）感到非常困惑：

#include <cuda.h> #include <stdio.h> #include <stdint.h> #include <fstream> #include <iostream> #include <sstream> #include <string> using namespace std; typedef struct password_t{ char word[56]; size_t length; } password; typedef struct libEntry_t{ uint8_t digest[16]; password pwd; } libEntry; // Generates a library of passwords and their corresponding MD5 hashes // // Params: // numPwds - the number of passwords for which to generate hashes // pwds - the list of passwords to hash // library - the array in which to store the unhashed/hashed password library __global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library) { // __device__ void cuda_md5(const password *pwd, uint8_t *digest) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; uint8_t hashed[16]; if (index < numPwds) { cuda_md5(&pwds[index], hashed); for (int j = 0; j < 16; j++) { library[index].digest[j] = hashed[j]; } library[index].pwd = pwds[index]; } } int crack_password (uint8_t* classified) { int count = 10; unsigned int mem_size = sizeof(password) * count; password *h_pwds = (password*) malloc(mem_size); ifstream inFile("passwords.txt"); if (!inFile) { cerr << "File passwords.txt not found." 
<< endl; return -1; } string line; int i; while (getline(inFile, line)) { if (line.empty()) continue; memcpy(h_pwds[i].word,line.c_str(),line.size()); h_pwds[i].length = line.size(); cout << "Password: " << h_pwds[i].word << "\n"; cout << "Length: " << h_pwds[i].length << "\n"; i++; } inFile.close(); /***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/ password* d_pwds; cudaMalloc( (void**) &d_pwds, mem_size); cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice); libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count); libEntry* d_library; cudaMalloc( (void**) &d_library, sizeof(libEntry) * count); int h_numPwds = i; cout << "INT NUMPWDS: " << h_numPwds << "\n"; int* d_numPwds; cudaMalloc( (void**) &d_numPwds, sizeof(int)); cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice); /*unsigned int threads_per_block = 1024; dim3 grid(1024, 1, 1); dim3 threads(threads_per_block, 1, 1); // generateLibraryKernel(int numPwds, password* pwds, libEntry* library) generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library); cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/ return 0; } int main(int argc, char *argv[]) { if (argc != 2) { fprintf(stderr, "usage: ./prog password\n"); return 1; } crack_password((uint8_t*) argv[1]); cout << "Hack Password: " << argv[1] << "\n"; return 0; }

我逐行检查了代码，我认为问题发生在以下几行：

int* d_numPwds; cudaMalloc( (void**) &d_numPwds, sizeof(int)); cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);

当我注释掉上面的cudaMemcpy时，我至少能在终端上看到cout的输出。请注意，我还没有进入内核执行部分，目前只关注内存分配，之后才会实际执行和调试内核。任何帮助都将不胜感激！

我是这样检查返回状态的：

#define CUDA_SAFE_CALL(call) do { \ CUDA_SAFE_CALL_NO_SYNC(call); \ cudaError err = cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ exit(EXIT_FAILURE); \ } } while (0)

编辑：在我处理了int的memcpy和malloc之后，错误仍然发生；显然我根本不需要为它分配内存或复制它，直接把它传过去就可以了。所以，错误是由以下几行引起的，但我不确定是哪一行，也不确定原因是什么？

password* d_pwds; cudaMalloc( (void**) &d_pwds, mem_size); cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice); libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count); libEntry* d_library; cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);

编辑2：我清理了所有代码，但仍然无法搞清楚。当我在下面这一行上使用CUDA_SAFE_CALL时：CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); 即使其他所有内存分配命令都被注释掉，我也会遇到分段错误。

New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:

#include <cuda.h> #include <stdio.h> #include <stdint.h> #include <fstream> #include <iostream> #include <sstream> #include <string> using namespace std; typedef struct password_t{ char word[56]; size_t length; } password; typedef struct libEntry_t{ uint8_t digest[16]; password pwd; } libEntry; // Generates a library of passwords and their corresponding MD5 hashes // // Params: // numPwds - the number of passwords for which to generate hashes // pwds - the list of passwords to hash // library - the array in which to store the unhashed/hashed password library __global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library) { // __device__ void cuda_md5(const password *pwd, uint8_t *digest) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; uint8_t hashed[16]; if (index < numPwds) { cuda_md5(&pwds[index], hashed); for (int j = 0; j < 16; j++) { library[index].digest[j] = hashed[j]; } library[index].pwd = pwds[index]; } } int crack_password (uint8_t* classified) { int count = 10; unsigned int mem_size = sizeof(password) * count; password *h_pwds = (password*) malloc(mem_size); ifstream inFile("passwords.txt"); if (!inFile) { cerr << "File passwords.txt not found." 
<< endl; return -1; } string line; int i; while (getline(inFile, line)) { if (line.empty()) continue; memcpy(h_pwds[i].word,line.c_str(),line.size()); h_pwds[i].length = line.size(); cout << "Password: " << h_pwds[i].word << "\n"; cout << "Length: " << h_pwds[i].length << "\n"; i++; } inFile.close(); /***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/ password* d_pwds; cudaMalloc( (void**) &d_pwds, mem_size); cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice); libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count); libEntry* d_library; cudaMalloc( (void**) &d_library, sizeof(libEntry) * count); int h_numPwds = i; cout << "INT NUMPWDS: " << h_numPwds << "\n"; int* d_numPwds; cudaMalloc( (void**) &d_numPwds, sizeof(int)); cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice); /*unsigned int threads_per_block = 1024; dim3 grid(1024, 1, 1); dim3 threads(threads_per_block, 1, 1); // generateLibraryKernel(int numPwds, password* pwds, libEntry* library) generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library); cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/ return 0; } int main(int argc, char *argv[]) { if (argc != 2) { fprintf(stderr, "usage: ./prog password\n"); return 1; } crack_password((uint8_t*) argv[1]); cout << "Hack Password: " << argv[1] << "\n"; return 0; }

I have gone through it line by line and I believe it happens on the following lines:

int* d_numPwds; cudaMalloc( (void**) &d_numPwds, sizeof(int)); cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);

When I comment cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet, I am just focusing on the memory allocation before I can actually execute and debug the kernel. Any help will be appreciated!

How I have been checking for return status:

#define CUDA_SAFE_CALL(call) do { \ CUDA_SAFE_CALL_NO_SYNC(call); \ cudaError err = cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ exit(EXIT_FAILURE); \ } } while (0)

EDIT: The error still occurs after I took care of the int memcpy and malloc, apparently I didn't have to alloc or cpy it. Could've just passed it over. So, the error is due to the following lines, and I am not sure which one or why?

password* d_pwds; cudaMalloc( (void**) &d_pwds, mem_size); cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice); libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count); libEntry* d_library; cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);

EDIT2: I cleaned up everything and still can't figure it out. With CUDA_SAFE_CALL wrapped around the following line, CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get a segmentation fault even when every other memory allocation command is commented out.

最满意答案

对于想知道出了什么问题的人：我已经解决了它。我不确定究竟是什么问题，但我在某些地方的内存分配不正确，而在另一些地方我甚至根本不需要使用cudaMalloc或cudaMemcpy。此外，改用《使用CUDA运行时API检查错误的规范方法是什么？》这个问题中的方法来检查错误，而不是我自己的实现，也起了作用。我现在的代码：

/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/ /***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/ unsigned int threads_per_block = 1024; dim3 grid(1024, 1, 1); dim3 threads(threads_per_block, 1, 1); password* d_pwds; ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size)); ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice)); libEntry* d_library; ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count)); // generateLibraryKernel(int numPwds, password* pwds, libEntry* library) generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library); ERROR_CHECK( cudaPeekAtLastError() ); ERROR_CHECK( cudaDeviceSynchronize() );

其中ERROR_CHECK的定义来自上面链接的问题。

#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } }

我仍然不完全理解CUDA（设备和主机分配）中的内存管理，但我的代码现在正常工作！谢谢你们。

For anyone wondering what went wrong: I was able to fix it. I am not sure exactly what was wrong, but I had improper memory allocations in some places, and in other cases I didn't even need to use cudaMalloc or cudaMemcpy. Also, using the approach from "What is the canonical way to check for errors using the CUDA runtime API?" for checking errors instead of my own implementation worked. What I have now:

/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/ /***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/ unsigned int threads_per_block = 1024; dim3 grid(1024, 1, 1); dim3 threads(threads_per_block, 1, 1); password* d_pwds; ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size)); ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice)); libEntry* d_library; ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count)); // generateLibraryKernel(int numPwds, password* pwds, libEntry* library) generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library); ERROR_CHECK( cudaPeekAtLastError() ); ERROR_CHECK( cudaDeviceSynchronize() );

Where ERROR_CHECK is defined as in the question linked above.

#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } }

I still don't fully understand memory management in CUDA (device and host allocations) but my code works now! Thank you all.

更多推荐