Почему cudaFree не освобождает память?
Я пытаюсь выделить память устройства, скопировать в нее данные, выполнить вычисления на GPU, скопировать результаты обратно и затем освободить выделенную память устройства. Я хотел убедиться, что не превышаю предел доступной памяти, а также посмотреть, хватит ли мне разделяемой памяти (shared memory), чтобы разместить в ней несколько массивов.
Когда я выделяю память устройства, ошибки не возвращаются. Когда я использую cudaMemGetInfo, чтобы проверить объем выделенной памяти, похоже, что один из вызовов cudaMalloc не выделил никакой памяти. Также, когда я пытаюсь освободить память, похоже, что освобождается только один указатель.
Я использую интерфейс MEX-функций MATLAB (mexFunction) для настройки памяти GPU и запуска ядра. На данный момент я даже не вызываю ядро, а просто возвращаю единичную матрицу в качестве результата.
<code>cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;

/*
 * Host-side fragment: allocates six device buffers, copies four input arrays
 * to the device, copies two result arrays back and frees everything, printing
 * the figures returned by cudaMemGetInfo() after every step.
 *
 * pulseDelay, tarDistance, scattDistance, scatterers, receivedReal and
 * receivedImag are host arrays declared outside this fragment.
 *
 * NOTE(review): cudaMemGetInfo() reports the free memory of the whole device.
 * In the sample output that accompanies this code the figure moves in 1 MiB
 * steps and does not change after every cudaMalloc()/cudaFree(), so the
 * per-call deltas printed below are not a reliable measure of a single
 * allocation.
 *
 * size_t values are cast to unsigned long long and printed with %llu:
 * "%lu" does not match size_t on platforms where long is 32 bits wide
 * (e.g. 64-bit Windows).
 */

/* Buffer sizes in bytes, defined once so allocations and copies agree. */
const size_t pulseDelayBytes    = sizeof(double) * 512;
const size_t tarDistanceBytes   = sizeof(double) * 512;
const size_t scattDistanceBytes = sizeof(double) * 999 * 512;
const size_t scatterersBytes    = sizeof(double) * 999;
const size_t receivedBytes      = sizeof(double) * 999 * 512;
/* NOTE(review): the read-back below copies 512*512 doubles although the
 * device result buffers hold 999*512 -- confirm which size is intended and
 * how large the host result arrays are. */
const size_t readbackBytes      = sizeof(double) * 512 * 512;

cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Memory avaliable: Free: %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);

/* Pointers for the device memory. Zero-initialised so that a failed
 * cudaMalloc() leaves a null pointer instead of an indeterminate value. */
double *devicePulseDelay = 0;
double *deviceTarDistance = 0;
double *deviceScattDistance = 0;
double *deviceScatterers = 0;
double *deviceReceivedReal = 0;
double *deviceReceivedImag = 0;

/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");

cudaErr = cudaMalloc((void **) &devicePulseDelay, pulseDelayBytes);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to devicePulseDelay\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMalloc((void **) &deviceTarDistance, tarDistanceBytes);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceTarDistance\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMalloc((void **) &deviceScattDistance, scattDistanceBytes);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScattDistance\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMalloc((void **) &deviceScatterers, scatterersBytes);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScatterers\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMalloc((void **) &deviceReceivedReal, receivedBytes);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedReal\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMalloc((void **) &deviceReceivedImag, receivedBytes);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedImag\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

/* Copy the input arrays across to the device. */
mexPrintf("\nCopying memory.\n");

cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, pulseDelayBytes, cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to devicePulseDelay\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, tarDistanceBytes, cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceTarDistance\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, scattDistanceBytes, cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceScattDistance\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMemcpy(deviceScatterers, scatterers, scatterersBytes, cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to deviceScatterers\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

/* Call the kernel (currently disabled). */
// launchKernel<<<1,512>>>(........);

/* Retrieve the output. */
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, readbackBytes, cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to receivedReal\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedReal: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, readbackBytes, cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not copy to receivedImag\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedImag: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (freeMem - allocMem));

/* Free the memory. freeMem is re-sampled here so that the "Free'd" figures
 * below are relative to the state just before the first cudaFree(). */
mexPrintf("\nFree'ing memory.\n");
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Before freeing: Free %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);

cudaErr = cudaFree(devicePulseDelay);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free devicePulseDelay\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (allocMem - freeMem));

cudaErr = cudaFree(deviceTarDistance);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceTarDistance\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (allocMem - freeMem));

cudaErr = cudaFree(deviceScattDistance);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceScattDistance\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (allocMem - freeMem));

cudaErr = cudaFree(deviceScatterers);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceScatterers\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (allocMem - freeMem));

cudaErr = cudaFree(deviceReceivedReal);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceReceivedReal\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (allocMem - freeMem));

cudaErr = cudaFree(deviceReceivedImag);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not free deviceReceivedImag\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
          (unsigned long long) allocMem, (unsigned long long) totalMem,
          (unsigned long long) (allocMem - freeMem));
</code>
Вот вывод этой программы:
Memory avaliable: Free: 2523959296, Total: 2818572288 Allocating memory. devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Copying memory. devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Free'ing memory. Before freeing: Free 2513473536, Total: 2818572288 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
Я чувствую, что я чего-то не понимаю. Может кто-нибудь помочь объяснить, что происходит?
EDIT: платформа — Windows 7 с графической картой Tesla C2050.