Por que o cudaFree parece não liberar memória?
Eu estou tentando alocar memória do dispositivo, copiar para ele, executar os cálculos na GPU, copiar os resultados de volta e, em seguida, liberar a memória do dispositivo que eu aloquei. Eu queria ter certeza de que não ultrapassaria o limite e queria ver se teria memória suficiente no espaço de memória compartilhada para despejar alguns arrays.
Quando aloco a memória do dispositivo, nenhum erro é retornado. Quando eu uso cudaMemGetInfo para verificar a quantidade de memória alocada, parece que um dos cudaMalloc não alocou memória alguma. Além disso, quando tento liberar a memória, parece que apenas um ponteiro é liberado.
Estou usando a interface mexFunction do MATLAB para configurar a memória da GPU e iniciar o kernel. Neste ponto, eu nem estou chamando o kernel; apenas retorno uma matriz identidade como resultado.
<code>/*
 * GPU memory diagnostic (fragment of a MATLAB MEX function body).
 *
 * Allocates six device buffers, copies four host input arrays to the device,
 * copies the two result buffers back to the host and finally frees every
 * buffer.  After each step the free/total device memory reported by
 * cudaMemGetInfo is printed together with the difference to a baseline.
 *
 * The host arrays pulseDelay, tarDistance, scattDistance, scatterers,
 * receivedReal and receivedImag are defined outside this fragment.
 *
 * Failures are reported through mexPrintf and execution continues; nothing
 * here aborts.
 *
 * NOTE(review): the per-call deltas need not equal the requested sizes --
 * the allocator may reserve device memory in larger blocks and serve small
 * requests from a block it already holds.  Confirm against the CUDA
 * documentation before relying on these numbers.
 */
cudaError_t cudaErr;
size_t freeMem = 0;   /* baseline: free bytes at the start of the current phase */
size_t totalMem = 0;  /* total device bytes as last reported */
size_t allocMem = 0;  /* free bytes as reported by the most recent query */

/* Run one CUDA runtime call; on failure print failMsg plus the CUDA error text. */
#define REPORT_CUDA(call, failMsg)                                        \
    do {                                                                  \
        cudaErr = (call);                                                 \
        if (cudaErr != cudaSuccess) {                                     \
            mexPrintf("%s", (failMsg));                                   \
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));        \
        }                                                                 \
    } while (0)

/*
 * Refresh allocMem/totalMem and print them with a labelled delta.
 * deltaExpr is evaluated after the refresh.  The values are size_t; they are
 * cast to unsigned long long and printed with %llu because %lu does not match
 * size_t where unsigned long is narrower than size_t (64-bit Windows).
 */
#define PRINT_MEM(label, deltaLabel, deltaExpr)                           \
    do {                                                                  \
        cudaErr = cudaMemGetInfo(&allocMem, &totalMem);                   \
        if (cudaErr != cudaSuccess) {                                     \
            mexPrintf("%s: could not query device memory\n", (label));    \
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));        \
        } else {                                                          \
            mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, %s: %llu\n", \
                      (label),                                            \
                      (unsigned long long) allocMem,                      \
                      (unsigned long long) totalMem,                      \
                      (deltaLabel),                                       \
                      (unsigned long long) (deltaExpr));                  \
        }                                                                 \
    } while (0)

REPORT_CUDA(cudaMemGetInfo(&freeMem, &totalMem), "could not query device memory\n");
mexPrintf("Memory avaliable: Free: %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);

/*
 * Pointers for the device memory.  They start as NULL so that a failed
 * cudaMalloc leaves a well-defined value for the later cudaMemcpy/cudaFree
 * calls instead of an uninitialised pointer.
 */
double *devicePulseDelay = NULL, *deviceTarDistance = NULL;
double *deviceScattDistance = NULL, *deviceScatterers = NULL;
double *deviceReceivedReal = NULL, *deviceReceivedImag = NULL;

/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");

REPORT_CUDA(cudaMalloc((void **) &devicePulseDelay, sizeof(double)*512),
            "could not allocate memory to devicePulseDelay\n");
PRINT_MEM("devicePulseDelay", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMalloc((void **) &deviceTarDistance, sizeof(double)*512),
            "could not allocate memory to deviceTarDistance\n");
PRINT_MEM("deviceTarDistance", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMalloc((void **) &deviceScattDistance, sizeof(double)*999*512),
            "could not allocate memory to deviceScattDistance\n");
PRINT_MEM("deviceScattDistance", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMalloc((void **) &deviceScatterers, sizeof(double)*999),
            "could not allocate memory to deviceScatterers\n");
PRINT_MEM("deviceScatterers", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMalloc((void **) &deviceReceivedReal, sizeof(double)*999*512),
            "could not allocate memory to deviceReceivedReal\n");
PRINT_MEM("deviceReceivedReal", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMalloc((void **) &deviceReceivedImag, sizeof(double)*999*512),
            "could not allocate memory to deviceReceivedImag\n");
PRINT_MEM("deviceReceivedImag", "Consumed", freeMem - allocMem);

/* Copy the input arrays across to the device. */
mexPrintf("\nCopying memory.\n");

REPORT_CUDA(cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512, cudaMemcpyHostToDevice),
            "could not copy to devicePulseDelay\n");
PRINT_MEM("devicePulseDelay", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512, cudaMemcpyHostToDevice),
            "could not copy to deviceTarDistance\n");
PRINT_MEM("deviceTarDistance", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512, cudaMemcpyHostToDevice),
            "could not copy to deviceScattDistance\n");
PRINT_MEM("deviceScattDistance", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999, cudaMemcpyHostToDevice),
            "could not copy to deviceScatterers\n");
PRINT_MEM("deviceScatterers", "Consumed", freeMem - allocMem);

/* Call the kernel (currently disabled). */
// launchKernel<<<1,512>>>(........);

/*
 * Retrieve the output.
 * NOTE(review): only 512*512 doubles are copied back although the result
 * buffers were allocated with 999*512 -- confirm which size is intended and
 * that receivedReal/receivedImag are large enough.
 */
REPORT_CUDA(cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512, cudaMemcpyDeviceToHost),
            "could not copy to receivedReal\n");
PRINT_MEM("receivedReal", "Consumed", freeMem - allocMem);

REPORT_CUDA(cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512, cudaMemcpyDeviceToHost),
            "could not copy to receivedImag\n");
PRINT_MEM("receivedImag", "Consumed", freeMem - allocMem);

/* Free the memory; the baseline is re-read so the deltas show bytes given back. */
mexPrintf("\nFree'ing memory.\n");
REPORT_CUDA(cudaMemGetInfo(&freeMem, &totalMem), "could not query device memory\n");
mexPrintf("Before freeing: Free %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);

REPORT_CUDA(cudaFree(devicePulseDelay), "could not free devicePulseDelay\n");
PRINT_MEM("devicePulseDelay", "Free'd", allocMem - freeMem);

REPORT_CUDA(cudaFree(deviceTarDistance), "could not free deviceTarDistance\n");
PRINT_MEM("deviceTarDistance", "Free'd", allocMem - freeMem);

REPORT_CUDA(cudaFree(deviceScattDistance), "could not free deviceScattDistance\n");
PRINT_MEM("deviceScattDistance", "Free'd", allocMem - freeMem);

REPORT_CUDA(cudaFree(deviceScatterers), "could not free deviceScatterers\n");
PRINT_MEM("deviceScatterers", "Free'd", allocMem - freeMem);

REPORT_CUDA(cudaFree(deviceReceivedReal), "could not free deviceReceivedReal\n");
PRINT_MEM("deviceReceivedReal", "Free'd", allocMem - freeMem);

REPORT_CUDA(cudaFree(deviceReceivedImag), "could not free deviceReceivedImag\n");
PRINT_MEM("deviceReceivedImag", "Free'd", allocMem - freeMem);

/* The helper macros are local to this fragment. */
#undef PRINT_MEM
#undef REPORT_CUDA
</code>
Aqui está a saída disso:
Memory avaliable: Free: 2523959296, Total: 2818572288
Allocating memory.
devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880
deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456
deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608
deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Copying memory.
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Free'ing memory.
Before freeing: Free 2513473536, Total: 2818572288
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
Sinto que há algo óbvio que estou deixando passar. Alguém pode ajudar a explicar o que está acontecendo?
EDIT: a plataforma é o Windows 7 com uma GPU Tesla C2050.