¿Por qué cudaFree no parece liberar la memoria?

Estoy tratando de asignar memoria en el dispositivo, copiar los datos a ella, realizar los cálculos en la GPU, copiar los resultados de vuelta y luego liberar la memoria del dispositivo que asigné. Quería asegurarme de que no estaba superando el límite y quería ver si tendría suficiente memoria en el espacio de memoria compartida para volcar algunas matrices.

Cuando asigno memoria del dispositivo, no se devuelven errores. Cuando uso cudaMemGetInfo para comprobar la cantidad de memoria asignada, parece que uno de los cudaMalloc no ha asignado ninguna memoria. Además, cuando intento liberar la memoria, parece que solo se libera un puntero.

Estoy usando la interfaz mexFunction de MATLAB para configurar la memoria de la GPU y lanzar el kernel. En este punto, ni siquiera estoy invocando el kernel y simplemente estoy devolviendo una matriz de unidades para los resultados.

<code>cudaError_t cudaErr;
/* Baseline device-memory figures, captured before anything is allocated.
   freeMem/totalMem hold the baseline; allocMem is the "free bytes right now"
   reading that the later steps compare against freeMem. */
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaErr = cudaMemGetInfo(&freeMem, &totalMem);
if (cudaErr != cudaSuccess) {
    /* Without a valid baseline the later "Consumed" figures mean nothing, so report it. */
    mexPrintf("could not query device memory\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
/* size_t is not unsigned long on every platform (64-bit Windows is LLP64), so
   widen explicitly and print with %llu rather than %lu. */
mexPrintf("Memory avaliable: Free: %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);

/* Pointers for the device memory. They start as NULL so that none of them ever
   holds an indeterminate value if its allocation below fails. */
double *devicePulseDelay = NULL, *deviceTarDistance = NULL, *deviceScattDistance = NULL, *deviceScatterers = NULL;
double *deviceReceivedReal = NULL, *deviceReceivedImag = NULL;

/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
{
    /* One row per buffer: the label used in the log, where to store the device
       pointer, and how many bytes to request. Driving the allocations from a
       table keeps the check-and-report logic in exactly one place. */
    struct DeviceAlloc { const char *name; double **slot; size_t bytes; };
    const struct DeviceAlloc allocs[] = {
        { "devicePulseDelay",    &devicePulseDelay,    sizeof(double) * 512 },
        { "deviceTarDistance",   &deviceTarDistance,   sizeof(double) * 512 },
        { "deviceScattDistance", &deviceScattDistance, sizeof(double) * 999 * 512 },
        { "deviceScatterers",    &deviceScatterers,    sizeof(double) * 999 },
        { "deviceReceivedReal",  &deviceReceivedReal,  sizeof(double) * 999 * 512 },
        { "deviceReceivedImag",  &deviceReceivedImag,  sizeof(double) * 999 * 512 },
    };
    const size_t allocCount = sizeof(allocs) / sizeof(allocs[0]);
    size_t i;

    for (i = 0; i < allocCount; ++i) {
        cudaError_t infoErr;

        cudaErr = cudaMalloc((void **) allocs[i].slot, allocs[i].bytes);
        if (cudaErr != cudaSuccess) {
            /* Both lines belong to the failure path; the braces keep the
               "Error:" line from being printed after a successful call. */
            mexPrintf("could not allocate memory to %s\n", allocs[i].name);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }

        /* Re-read the free figure and report how far it has dropped from the baseline.
           NOTE(review): the sample output further down this page moves only in
           1048576-byte steps, so small requests presumably share a driver-level
           granule and need not change the reported number - confirm against the
           CUDA memory-management documentation. */
        infoErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (infoErr != cudaSuccess) {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(infoErr));
        }
        /* size_t is widened to unsigned long long because %lu is only 32 bits
           wide on 64-bit Windows. */
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                  allocs[i].name,
                  (unsigned long long) allocMem, (unsigned long long) totalMem,
                  (unsigned long long) (freeMem - allocMem));
    }
}

/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
{
    /* One row per transfer: log label, device destination, host source, byte count. */
    struct HostToDeviceCopy { const char *name; void *dst; const void *src; size_t bytes; };
    const struct HostToDeviceCopy copies[] = {
        { "devicePulseDelay",    devicePulseDelay,    pulseDelay,    sizeof(double) * 512 },
        { "deviceTarDistance",   deviceTarDistance,   tarDistance,   sizeof(double) * 512 },
        { "deviceScattDistance", deviceScattDistance, scattDistance, sizeof(double) * 999 * 512 },
        { "deviceScatterers",    deviceScatterers,    scatterers,    sizeof(double) * 999 },
    };
    const size_t copyCount = sizeof(copies) / sizeof(copies[0]);
    size_t i;

    for (i = 0; i < copyCount; ++i) {
        cudaError_t infoErr;

        cudaErr = cudaMemcpy(copies[i].dst, copies[i].src, copies[i].bytes, cudaMemcpyHostToDevice);
        if (cudaErr != cudaSuccess) {
            /* Braces keep the "Error:" line on the failure path only. */
            mexPrintf("could not copy to %s\n", copies[i].name);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }

        /* Report the free figure after each copy, relative to the baseline in freeMem. */
        infoErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (infoErr != cudaSuccess) {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(infoErr));
        }
        /* size_t is widened to unsigned long long because %lu is only 32 bits
           wide on 64-bit Windows. */
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                  copies[i].name,
                  (unsigned long long) allocMem, (unsigned long long) totalMem,
                  (unsigned long long) (freeMem - allocMem));
    }
}

/* call the kernel */
// launchKernel<<<1,512>>>(........);

/* retrieve the output */
/* NOTE(review): the launch above is commented out, so nothing visible here has
   written deviceReceivedReal/deviceReceivedImag before they are copied back.
   NOTE(review): these copies move 512*512 doubles while the device buffers were
   requested as 999*512 doubles - confirm which extent matches the host arrays
   receivedReal/receivedImag. */
{
    /* One row per transfer: log label, host destination, device source, byte count. */
    struct DeviceToHostCopy { const char *name; void *dst; const void *src; size_t bytes; };
    const struct DeviceToHostCopy results[] = {
        { "receivedReal", receivedReal, deviceReceivedReal, sizeof(double) * 512 * 512 },
        { "receivedImag", receivedImag, deviceReceivedImag, sizeof(double) * 512 * 512 },
    };
    const size_t resultCount = sizeof(results) / sizeof(results[0]);
    size_t i;

    for (i = 0; i < resultCount; ++i) {
        cudaError_t infoErr;

        cudaErr = cudaMemcpy(results[i].dst, results[i].src, results[i].bytes, cudaMemcpyDeviceToHost);
        if (cudaErr != cudaSuccess) {
            /* Braces keep the "Error:" line on the failure path only. */
            mexPrintf("could not copy to %s\n", results[i].name);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }

        /* Report the free figure after each copy, relative to the baseline in freeMem. */
        infoErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (infoErr != cudaSuccess) {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(infoErr));
        }
        /* size_t is widened to unsigned long long because %lu is only 32 bits
           wide on 64-bit Windows. */
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Consumed: %llu\n",
                  results[i].name,
                  (unsigned long long) allocMem, (unsigned long long) totalMem,
                  (unsigned long long) (freeMem - allocMem));
    }
}

/* free the memory. */
mexPrintf("\nFree'ing memory.\n");
/* Take a fresh baseline: from here on freeMem is the free figure just before the
   first cudaFree, and each "Free'd" value is measured against it. */
cudaErr = cudaMemGetInfo(&freeMem, &totalMem);
if (cudaErr != cudaSuccess) {
    mexPrintf("could not query device memory\n");
    mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
}
/* size_t is widened to unsigned long long because %lu is only 32 bits wide on
   64-bit Windows. */
mexPrintf("Before freeing: Free %llu, Total: %llu\n",
          (unsigned long long) freeMem, (unsigned long long) totalMem);
{
    /* One row per buffer: log label and the device pointer to release. */
    struct DeviceBuffer { const char *name; void *ptr; };
    const struct DeviceBuffer buffers[] = {
        { "devicePulseDelay",    devicePulseDelay },
        { "deviceTarDistance",   deviceTarDistance },
        { "deviceScattDistance", deviceScattDistance },
        { "deviceScatterers",    deviceScatterers },
        { "deviceReceivedReal",  deviceReceivedReal },
        { "deviceReceivedImag",  deviceReceivedImag },
    };
    const size_t bufferCount = sizeof(buffers) / sizeof(buffers[0]);
    size_t i;

    for (i = 0; i < bufferCount; ++i) {
        cudaError_t infoErr;

        cudaErr = cudaFree(buffers[i].ptr);
        if (cudaErr != cudaSuccess) {
            /* The message previously read "could free ..."; it reports a failure,
               so it now says "could not free". Braces keep the "Error:" line on
               the failure path only. */
            mexPrintf("could not free %s\n", buffers[i].name);
            mexPrintf("Error: %s\n", cudaGetErrorString(cudaErr));
        }

        /* NOTE(review): in the sample output on this page the free figure stays
           flat across several frees and then rises by 1048576 bytes, so released
           memory is presumably handed back in larger granules rather than
           per-pointer - confirm against the CUDA memory-management documentation. */
        infoErr = cudaMemGetInfo(&allocMem, &totalMem);
        if (infoErr != cudaSuccess) {
            mexPrintf("could not query device memory\n");
            mexPrintf("Error: %s\n", cudaGetErrorString(infoErr));
        }
        mexPrintf("%s: Memory avaliable: Free: %llu, Total: %llu, Free'd: %llu\n",
                  buffers[i].name,
                  (unsigned long long) allocMem, (unsigned long long) totalMem,
                  (unsigned long long) (allocMem - freeMem));
    }
}

Aquí está la salida de esto:

Memory avaliable: Free: 2523959296, Total: 2818572288
 Allocating memory.
 devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880
 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456
 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608
 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Copying memory.
 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Free'ing memory.
 Before freeing: Free 2513473536, Total: 2818572288
 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576

Siento que hay algo obvio que me falta. ¿Alguien puede ayudar a explicar lo que está pasando?

EDITAR: la plataforma es Windows 7 con una GPU Tesla C2050.

Respuestas a la pregunta(1)

Su respuesta a la pregunta