CUDA struct alignment is slowing down my code (compilable example)
I have a simulation that computes the 3D vectors of charged particles moving in an electric and magnetic field. I tried to speed this up in CUDA using the __align__ specifier, thinking that perhaps the limiting factor was reading and writing global memory, but using __align__ ended up slowing things down (possibly because it increased the total memory footprint). I also tried float3 and float4, but their performance was similar.
I have made a simplified version of this code and pasted it below to show my problem. The code below should be compilable, and by changing the definition of CASE in the fourth line to 0, 1, or 2, the different options described above can be tried. Two functions, ParticleMoverCPU and ParticleMoverGPU, are included to compare CPU and GPU performance.
Thank you!
CPU - Intel Xeon E5620 @ 2.40GHz
GPU - NVIDIA Tesla C2070
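As a quick check of the memory-footprint hypothesis above, a minimal standalone sketch like the following (separate from the benchmark listing below, and assuming it is built with nvcc so that __align__ is available in host code) should print 12 bytes for the plain 3-float struct and 16 bytes for the aligned one, i.e. roughly 33% more data per particle array:
// Standalone size check, not part of the benchmark below; the type names are illustrative only.
#include "cuda_runtime.h"
#include <stdio.h>
typedef struct { float x; float y; float z; } Vec3Plain;                 // CASE 0 layout
typedef struct __align__(16) { float x; float y; float z; } Vec3Aligned; // CASE 1 layout
int main(void){
    // Cast to int to keep the printf format portable on older MSVC runtimes
    printf("plain: %d bytes, aligned: %d bytes\n", (int)sizeof(Vec3Plain), (int)sizeof(Vec3Aligned));
    return 0;
}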
// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE 0 // define to either 0, 1 or 2 as described above
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>
#define CEX 10 // x-value of electric field (dimensionless and arbitrary)
#define CEY 0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ 0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY 0.1 // y-value of magnetic field (dimensionless and arbitrary)
#define CBZ 10 // z-value of magnetic field (dimensionless and arbitrary)
#define FACTOR 15 // I played around with these numbers until I got the best speedup
#define THREADS 256 // I played around with these numbers until I got the best speedup
typedef struct{
float x;
float y;
float z;
} VecCPU; //Struct for vectors for CPU calculation
// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
float x;
float y;
float z;
} VecGPU;
#endif
#if CASE==1
// This method seems to be less fast. It is an attempt to align for memory coalescence
typedef struct __align__(16){
float x;
float y;
float z;
} VecGPU;
#endif
// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif
VecCPU *pos_c, *vel_c; // global position and velocity vectors for CPU calculation
__constant__ VecGPU *pos_d, *vel_d; // pointers in constant memory which we will point to data in global memory
void ParticleMoverCPU(int np, int ts, float dt){
int n = 0;
while (n < np){
VecCPU vminus, tvec, vprime, vplus;
float tvec_fact;
int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_c[n].x + CEX*0.5*dt;
vminus.y = vel_c[n].y + CEY*0.5*dt;
vminus.z = vel_c[n].z + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_c[n].x = vplus.x + CEX*0.5*dt;
vel_c[n].y = vplus.y + CEY*0.5*dt;
vel_c[n].z = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_c[n].x += vel_c[n].x*dt;
pos_c[n].y += vel_c[n].y*dt;
pos_c[n].z += vel_c[n].z*dt;
it++;
}
n++;
}
}
__global__ void ParticleMoverGPU(register int np,register int ts, register float dt){
register int n = threadIdx.x + blockDim.x * blockIdx.x;
while (n < np){
register VecGPU vminus, tvec, vprime, vplus;// , vtemp;
register float tvec_fact;
register int it = 0;
while (it < ts){
// ----- Update velocities by the Boris method ------ //
vminus.x = vel_d[n].x + CEX*0.5*dt;
vminus.y = vel_d[n].y + CEY*0.5*dt;
vminus.z = vel_d[n].z + CEZ*0.5*dt;
tvec.x = CBX*0.5*dt;
tvec.y = CBY*0.5*dt;
tvec.z = CBZ*0.5*dt;
tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
vel_d[n].x = vplus.x + CEX*0.5*dt;
vel_d[n].y = vplus.y + CEY*0.5*dt;
vel_d[n].z = vplus.z + CEZ*0.5*dt;
// ------ Update Particle positions -------------- //
pos_d[n].x += vel_d[n].x*dt;
pos_d[n].y += vel_d[n].y*dt;
pos_d[n].z += vel_d[n].z*dt;
it++;
}
n += blockDim.x*gridDim.x;
}
}
int main(void){
int np = 50000; // Number of Particles
const int ts = 1000; // Number of Time-steps
const float dt = 1E-3; // Time-step value
// ----------- CPU ----------- //
pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for position
vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for velocity
for (int n = 0; n < np; n++){
pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0; // zero out position for CPU variables
vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0; // zero out velocity for CPU variables
}
printf("Starting CPU kernel\n");
clock_t startCPU;
float CPUtime;
startCPU = clock();
ParticleMoverCPU(np, ts, dt); // Launch CPU kernel
CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
printf("CPU kernel finished\n");
// Output final CPU computation time
printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
// ------------ GPU ----------- //
cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1); //Set memory preference to L1 (doesn't have much effect)
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
int blocks = deviceProp.multiProcessorCount;
VecGPU *pos_g, *vel_g, *pos_l, *vel_l;
pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np); // allocate memory for positions on the CPU
vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np); // allocate memory for velocities on the CPU
cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np); // allocate memory for positions on the GPU
cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np); // allocate memory for velocities on the GPU
cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*)); // copy memory address of position to the constant memory pointer pos_d
cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*)); // copy memory address of velocity to the constant memory pointer vel_d
for (int n = 0; n < np; n++){
pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
}
cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
printf("Starting GPU kernel\n");
// start cuda timer
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt); // Launch GPU kernel
//stop cuda timer
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("GPU kernel finished\n");
cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost); // Copy positions from GPU memory back to CPU
cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost); // Copy velocities from GPU memory back to CPU
// Output GPU computation time
printf("GPUtime = %6.1f ms\n", elapsedTime);
// Output speedup factor
printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);
// free allocated memory
cudaFree(pos_l);
cudaFree(vel_l);
free(pos_g);
free(vel_g);
free(pos_c);
free(vel_c);
}
For CASE 0 (regular vector struct) I get:
CPUtime = 1302.0 ms
GPUtime = 21.8 ms
Speedup = 59.79
For CASE 1 (the __align__(16) vector struct) I get:
CPUtime = 1298.0 ms
GPUtime = 24.5 ms
Speedup = 53.08
For CASE 2 (using float3) I get:
CPUtime = 1305.0 ms
GPUtime = 21.8 ms
Speedup = 59.80
If I use float4 instead of float3, I get something similar to the __align__(16) method.
Thank you!
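For reference, the float4 test is not shown in the listing above; it amounts to adding one more case along these lines (the CASE 3 label is only for illustration and is not in my code):
// Hypothetical extra case: the built-in float4 is 16 bytes and 16-byte aligned,
// so it behaves like the __align__(16) struct from CASE 1; .w is unused padding here.
#if CASE==3
typedef float4 VecGPU;
#endif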