Выравнивание структуры CUDA замедляет мой код (скомпилированный пример)

Question

Aug 26, 2015, 07:58 PM

Выравнивание структуры CUDA замедляет мой код (скомпилированный пример)

У меня есть симуляция, которая рассчитывает трехмерные векторы заряженных частиц, движущихся в электрическом и магнитном поле.Я попытался ускорить это в CUDA, используя__align__ спецификатордумая, что, возможно, ограничивающим фактором было чтение и запись глобальной памяти,но используя__align__ в конечном итоге замедлить его (возможно, потому что это увеличило общую потребность в памяти). Я также пытался использоватьfloat3 а такжеfloat4 но их производительность была похожа

Я создал упрощенную версию этого кода и вставил ее ниже, чтобы показать мою проблему.Код ниже должен быть компилируемым и путем изменения определенияCASE на четвертой строке0, 1, или же2можно попробовать различные варианты, которые я описал выше. Две функции,ParticleMoverCPU а такжеParticleMoverGPU определены для сравнения производительности процессора и графического процессора.

Есть ли причина, по которой мои попытки объединения памяти замедляются, а не ускоряют мой код?Есть ли что-нибудь еще очевидное, что я не делаю то, что я мог бы сделать, чтобы получить ускорение в 60 раз для «смущающе параллельного» кода, такого как этот?

Спасибо!

Процессор - Intel Xeon E5620 @ 2,40 ГГц

Графический процессор - NVIDIA Tesla C2070

// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE        0   // define to either 0, 1 or 2 as described above

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>

#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>

#define CEX         10  // x-value of electric field (dimensionless and arbitrary)
#define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY         0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBZ         10  // x-value of magnetic field (dimensionless and arbitrary)

#define FACTOR      15  // I played around with these numbers until I got the best speedup
#define THREADS     256 // I played around with these numbers until I got the best speedup

typedef struct{
    float x;
    float y;
    float z;
} VecCPU;           //Struct for vectors for CPU calculation

// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
    float x;
    float y;
    float z;
} VecGPU;
#endif

#if CASE==1
// This method seems to be less fast.  It is an attempt to align for memory coalescence
typedef struct __align__(16){
    float x;
    float y;
    float z;
} VecGPU;
#endif

// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif

VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation
__constant__ VecGPU *pos_d, *vel_d;     // pointers in constant memory which we will point to data in global memory

void ParticleMoverCPU(int np, int ts, float dt){

    int n = 0;
    while (n < np){

        VecCPU vminus, tvec, vprime, vplus;
        float tvec_fact;
        int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_c[n].x + CEX*0.5*dt;
            vminus.y = vel_c[n].y + CEY*0.5*dt;
            vminus.z = vel_c[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (v,prime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_c[n].x = vplus.x + CEX*0.5*dt;
            vel_c[n].y = vplus.y + CEY*0.5*dt;
            vel_c[n].z = vplus.z + CEZ*0.5*dt;

            // ------ Update Particle positions -------------- //
            pos_c[n].x += vel_c[n].x*dt;
            pos_c[n].y += vel_c[n].y*dt;
            pos_c[n].z += vel_c[n].z*dt;
            it++;
        }
        n++;
    }
}

__global__ void ParticleMoverGPU(register int np,register int ts, register float dt){

    register int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){

        register VecGPU vminus, tvec, vprime, vplus;// , vtemp;
        register float tvec_fact;
        register int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_d[n].x + CEX*0.5*dt;
            vminus.y = vel_d[n].y + CEY*0.5*dt;
            vminus.z = vel_d[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_d[n].x = vplus.x + CEX*0.5*dt;
            vel_d[n].y = vplus.y + CEY*0.5*dt;
            vel_d[n].z = vplus.z + CEZ*0.5*dt;
            // ------ Update Particle positions -------------- //
            pos_d[n].x += vel_d[n].x*dt;
            pos_d[n].y += vel_d[n].y*dt;
            pos_d[n].z += vel_d[n].z*dt;
            it++;
        }
        n += blockDim.x*gridDim.x;
    }
}

int main(void){

    int np = 50000;                                         // Number of Particles
    const int ts = 1000;                                    // Number of Time-steps
    const float dt = 1E-3;                                  // Time-step value


    // ----------- CPU ----------- //

    pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
    vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity

    for (int n = 0; n < np; n++){
        pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
        vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
    }

    printf("Starting CPU kernel\n");
    clock_t startCPU;
    float CPUtime;
    startCPU = clock();
    ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
    CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
    printf("CPU kernel finished\n");
    // Ouput final CPU computation time
    printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);

    // ------------ GPU ----------- //

    cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int blocks = deviceProp.multiProcessorCount;

    VecGPU *pos_g, *vel_g, *pos_l, *vel_l;

    pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for positions on the CPU
    vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for velocities on the CPU

    cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np);      // allocate memory for velocities on the GPU

    cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*));   // copy memory address of position to the constant memory pointer pos_d
    cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*));   // copy memory address of velocity to the constant memory pointer vel_d

    for (int n = 0; n < np; n++){
        pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
        vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
    }

    cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory

    printf("Starting GPU kernel\n");
    // start cuda timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt);             // Launch GPU kernel

    //stop cuda timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("GPU kernel finished\n");

    cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy positions from GPU memory back to CPU
    cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy velocities from GPU memory back to CPU

    // Ouput GPU computation time
    printf("GPUtime = %6.1f ms\n", elapsedTime);

    // Output speedup factor
    printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);

    // free allocated memory
    cudaFree(pos_l);
    cudaFree(vel_l);
    free(pos_g);
    free(vel_g);
    free(pos_c);
    free(vel_c);
}

ЗаCASE 0 (обычная векторная структура) Я получаю:

CPUtime = 1302.0 ms
GPUtime =   21.8 ms
Speedup = 59.79

ЗаCASE 1 (__align__(16) векторная структура) я получаю:

CPUtime = 1298.0 ms
GPUtime =   24.5 ms
Speedup = 53.08

ЗаCASE 2 (с помощьюfloat3) Я получил:

CPUtime = 1305.0 ms
GPUtime =   21.8 ms
Speedup = 59.80

Если я используюfloat4 вместоfloat3 Я получаю что-то похожее на__align__(16) метод.

Спасибо!!

Выравнивание структуры CUDA замедляет мой код (скомпилированный пример)

Ответы на вопрос(1)

Ваш ответ на вопрос

Популярные вопросы

Вы очень активны! Это здорово!

Выравнивание структуры CUDA замедляет мой код (скомпилированный пример)

Ответы на вопрос(1)

Ваш ответ на вопрос

Популярные вопросы