¿Por qué cudaFree no parece liberar la memoria?
Estoy tratando de asignar memoria del dispositivo, copiar datos a ella, realizar los cálculos en la GPU, copiar los resultados de vuelta y luego liberar la memoria del dispositivo que asigné. Quería asegurarme de que no estaba superando el límite y quería ver si tendría suficiente memoria en el espacio de memoria compartida para volcar algunos arreglos.
Cuando asigno memoria de dispositivo, no se devuelven errores. Cuando uso cudaMemGetInfo para comprobar la cantidad de memoria asignada, parece que uno de los cudaMalloc no ha asignado ninguna memoria. Además, cuando intento liberar la memoria, parece que solo se libera un puntero.
Estoy usando la interfaz mexFunction de MATLAB para configurar la memoria de la GPU y lanzar el kernel. En este punto, ni siquiera estoy invocando el kernel y simplemente estoy devolviendo una matriz unidad para los resultados.
<code>cudaError_t cudaErr; size_t freeMem = 0; size_t totalMem = 0; size_t allocMem = 0; cudaMemGetInfo(&freeMem, &totalMem); mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem); /* Pointers for the device memory */ double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers; double *deviceReceivedReal, *deviceReceivedImag; /* Allocate memory on the device for the arrays. */ mexPrintf("Allocating memory.\n"); cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512); if (cudaErr != cudaSuccess) { mexPrintf("could not allocate memory to devicePulseDelay\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512); if (cudaErr != cudaSuccess) { mexPrintf("could not allocate memory to deviceTarDistance\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512); if (cudaErr != cudaSuccess) { mexPrintf("could not allocate memory to deviceScattDistance\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999); if (cudaErr != cudaSuccess) { mexPrintf("could not allocate memory to deviceScatterers\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, 
totalMem,(freeMem - allocMem)); cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512); if (cudaErr != cudaSuccess) { mexPrintf("could not allocate memory to deviceReceivedReal\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512); if (cudaErr != cudaSuccess) { mexPrintf("could not allocate memory to deviceReceivedImag\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem)); /* copy the input arrays across to the device */ mexPrintf("\nCopying memory.\n"); cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice); if (cudaErr != cudaSuccess) { mexPrintf("could not copy to devicePulseDelay\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice); if (cudaErr != cudaSuccess) { mexPrintf("could not copy to deviceTarDistance\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice); if (cudaErr != cudaSuccess) { mexPrintf("could not copy to deviceScattDistance\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice); if (cudaErr != cudaSuccess) { mexPrintf("could not copy to deviceScatterers\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); /* call the kernel */ // launchKernel<<<1,512>>>(........); /* retireve the output */ cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost); if (cudaErr != cudaSuccess) { mexPrintf("could not copy to receivedReal\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost); if (cudaErr != cudaSuccess) { mexPrintf("could not copy to receivedImag\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem)); /* free the memory. 
*/ mexPrintf("\nFree'ing memory.\n"); cudaMemGetInfo(&freeMem, &totalMem); mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem); cudaErr = cudaFree(devicePulseDelay); if (cudaErr != cudaSuccess) { mexPrintf("could free devicePulseDelay\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); cudaErr = cudaFree(deviceTarDistance); if (cudaErr != cudaSuccess) { mexPrintf("could free deviceTarDistance\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); cudaErr = cudaFree(deviceScattDistance); if (cudaErr != cudaSuccess) { mexPrintf("could free deviceScattDistance\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); cudaErr = cudaFree(deviceScatterers); if (cudaErr != cudaSuccess) { mexPrintf("could free deviceScatterers\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); cudaErr = cudaFree(deviceReceivedReal); if (cudaErr != cudaSuccess) { mexPrintf("could free deviceReceivedReal\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); } cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); cudaErr = cudaFree(deviceReceivedImag); if (cudaErr != cudaSuccess) { mexPrintf("could free deviceReceivedImag\n"); mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr)); 
} cudaMemGetInfo(&allocMem, &totalMem); mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem)); </code>
Aquí está la salida de esto:
Memory avaliable: Free: 2523959296, Total: 2818572288
Allocating memory.
devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576
deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880
deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456
deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608
deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Copying memory.
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760
receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760

Free'ing memory.
Before freeing: Free 2513473536, Total: 2818572288
devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0
deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
Siento que se me escapa algo obvio. ¿Alguien puede ayudarme a entender qué está pasando?
EDITAR: la plataforma es Windows 7 con una GPU Tesla C2050.