Multiplicação matriz-vetor em CUDA: benchmarking & performance

Question

Oct 17, 2014, 05:36 AM

Multiplicação matriz-vetor em CUDA: benchmarking & performance

Estou atualizando minha pergunta com alguns novos resultados de benchmarking (também reformulei a pergunta para ser mais específica e atualizei o código) ...

Eu implementei um kernel para multiplicação de vetores matriciais no CUDA C, seguindo oGuia de programação CUDA C usando memória compartilhada. Deixe-me apresentar primeiro alguns resultados de benchmarking que fiz em um Jetson TK1 (GPU: Tegra K1, capacidade de computação 3.2) e uma comparação com cuBLAS:

Aqui, acho que o cuBLAS faz alguma mágica, pois parece que sua execução não é afetada pelo número de colunas deA, o que, por sua vez, implica que existe algum tipo de paralelização ao longo das colunas deA.

Agora, aqui está o código fonte do meu kernel e uma função host para chamá-lo (arquivo: mv.cuh):

#include <cuda_runtime.h>

#define     BLOCK_SIZE      16

/* Set to __restric__ */
#define     RESTRICT

/**
 * Performs matrix-vector multiplication on the device.
 *
 * @param   dA              Address of matrix `A` on the device
 * @param   dx              Address of vector `x` on the device
 * @param   dev_ptr_y       Address of result y = A*x
 * @param   nRows           Number of rows of `A`
 * @param   nx              Size of `x` (number of columns of `A`)
 *
 * @tparam  T               Data type
 *
 */
template<typename T>
__global__ void matvec_kernel(
        const T * RESTRICT dA,
        const T * RESTRICT dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx);


/**
 * Host-side wrapper for #matvec_kernel.
 *
 * @param   dA              Address of matrix `A` on the device
 * @param   dx              Address of vector `x` on the device
 * @param   dev_ptr_y       Address of result y = A*x
 * @param   nRows           Number of rows of `A`
 * @param   nx              Size of `x` (number of columns of `A`)
 * @param   elapsed_time    Time for the kernel to complete the execution in `ms`.
 *                          If NULL is passed to this argument, the elapsed time
 *                          will not be computed.
 *
 * @tparam  T               Data type for `A` and `x`
 */
template<typename T>
__host__ void matvec(
        const T * RESTRICT dA,
        const T * RESTRICT dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx);



/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */

template<typename T>
__global__ void matvec_kernel(const T * RESTRICT  dA, const T * RESTRICT  dx,
        T * RESTRICT dy,
        const unsigned int nRows, const unsigned int nx)
{

        unsigned int bid = blockIdx.x;
        unsigned int row = threadIdx.x;
        const unsigned int block_size = blockDim.x;
        const unsigned int num_hor_blocks = ((nx + block_size - 1)/ block_size);
        unsigned int n_star;
        unsigned int idx_x;
        unsigned int idx_Asub;
        unsigned int idx_y;
        const T * Asub;
        const T * xsub;

        /* Only `x` is copied to shared memory */
        __shared__ T x_shared[BLOCK_SIZE];

        idx_y = bid * block_size;

        T * y_sub = dy + idx_y;

        T y_val = 0.0;

        #pragma unroll
        for (unsigned int m = 0; m < num_hor_blocks; ++m)
        {
            idx_Asub = block_size * (bid + m * nRows);
            idx_x = m * block_size;

            Asub = dA + idx_Asub;
            xsub = dx + idx_x;

            if (idx_x + row <  nx) {
                x_shared[row] = xsub[row];
            }

            __syncthreads();


        /* If the tiling is exact */
        if ( (nRows % block_size == 0 && nx % block_size == 0 ) ||
                (m != block_size - 1 || bid != gridDim.x - 1)) {
            y_val += Asub[row] * x_shared[0];
            y_val += Asub[row + nRows] * x_shared[1];
            y_val += Asub[row + 2 * nRows] * x_shared[2];
            y_val += Asub[row + 3 * nRows] * x_shared[3];
            y_val += Asub[row + 4 * nRows] * x_shared[4];
            y_val += Asub[row + 5 * nRows] * x_shared[5];
            y_val += Asub[row + 6 * nRows] * x_shared[6];
            y_val += Asub[row + 7 * nRows] * x_shared[7];
            y_val += Asub[row + 8 * nRows] * x_shared[8];
            y_val += Asub[row + 9 * nRows] * x_shared[9];
            y_val += Asub[row + 10 * nRows] * x_shared[10];
            y_val += Asub[row + 11 * nRows] * x_shared[11];
            y_val += Asub[row + 12 * nRows] * x_shared[12];
            y_val += Asub[row + 13 * nRows] * x_shared[13];
            y_val += Asub[row + 14 * nRows] * x_shared[14];
            y_val += Asub[row + 15 * nRows] * x_shared[15];
        } else {
            n_star = min(BLOCK_SIZE, nx - idx_x);
            #pragma unroll
            for (unsigned int e = 0; e < n_star; ++e) {
                y_val += Asub[row + e * nRows] * x_shared[e];
            }
        }
        __syncthreads();
        }

        if (row + idx_y < nRows)
            y_sub[row] = y_val;

}



template<typename T>
__host__ void matvec(
        const T * RESTRICT  dA,
        const T * RESTRICT  dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx)
{
    dim3 dim_grid( (nRows + BLOCK_SIZE -1)/ BLOCK_SIZE );
    dim3 dim_block(BLOCK_SIZE);
    matvec_kernel<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}

Estou usando isso para cronometrar minha execução (arquivo: cuda_timer.cuh):

#include <cuda_runtime.h>
#include "error_handles.cuh"

static cudaEvent_t start;
static cudaEvent_t stop;
static short timer_running = 0;
static short tic_called = 0;

/**
 * Sets up the timer.
 *
 * Must be called before any invocation to
 * tic() or toc(), preferrably at the beginning of your
 * application.
 */
void start_tictoc();

/**
 * Starts the timer.
 *
 * Use `toc()` to get the elapsed time; `tic()` must
 * be called before a `toc()`.
 */
void tic();

/**
 * Returns the elapsed time between its invocation
 * and a previous invocation of `toc()`. Returns `-1`
 * and prints a warning message if `toc()` was not
 * previously called. Returns `-2` and prints and error
 * message if `start_tictoc()` has not been called.
 *
 * @return Elapsed time between `tic()` and `toc()` in milliseconds
 * with a resolution of `0.5` microseconds.
 */
float toc();

/**
 * This function should be called when the
 * time will not be being used any more. It destroys
 * the events used to time CUDA kernels. If the timer
 * is not running, this function does nothing and
 * prints a warning message.
 */
void stop_tictoc();


void start_tictoc() {
    _CUDA(cudaEventCreate(&start));
    _CUDA(cudaEventCreate(&stop));
    timer_running = 1;
}

void tic() {
    if (timer_running) {
        _CUDA(cudaEventRecord(start, 0));
        tic_called = 1;
    } else {
        printf("WARNING: tic() called without a timer running!\n");
    }
}

float toc() {
    float elapsed_time;
    if (tic_called == 0) {
        printf("WARNING: toc() called without a previous tic()!\n");
        return -1;
    }
    if (timer_running == 1) {
        // _CUDA(cudaDeviceSynchronize()); // Removed! (See discussion below)
        _CUDA(cudaEventRecord(stop, 0));
        _CUDA(cudaEventSynchronize(stop));
        _CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
        tic_called = 0;
        return elapsed_time;
    } else {
        printf("WARNING: toc() called without a timer running!\n");
        return -2;
    }

}

void stop_tictoc()
{
    if (timer_running == 1){
        _CUDA(cudaEventDestroy(start));
        _CUDA(cudaEventDestroy(stop));
        timer_running = 0;
    } else{
        printf("WARNING: stop_tictoc() called without a timer running!\n");
    }
}

e meu arquivo principal (main.cu) é o seguinte:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "cublas_v2.h"
#include <math.h>
#include <curand.h>
#include <stdbool.h>

#include "mv.cuh"
#include "cuda_timer.cuh"
#include "error_handles.cuh"

typedef float real_t;

#define _CUDA(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define _CUBLAS(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define _CURAND(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define TEST_COLUMNS        1
#define TEST_ROWS           0

/**
 * If `TEST_WRT_` is set to `TEST_COLUMNS`, then a benchmark
 * will be performed with respect to columns (with a fixed
 * number of rows). If it is set to `TEST_ROWS`, then a benchmark will
 * run with respect to rows (fixed number of columns).
 */
#define TEST_WRT_ TEST_ROWS

#define CONSTANT_COLS 300
#define CONSTANT_ROWS 256

/**
 * In order to estimate the execution time, every
 * kernel is run `RUNS` times and the average is taken.
 */
#define RUNS 50

void compare_results(real_t *dev_y_cublas, real_t * dev_y,unsigned int nrows)
{
    real_t * hst_y_cublas;
    real_t * hst_y;
    const size_t s = nrows * sizeof(real_t);

    hst_y_cublas = (real_t*) malloc(s);
    hst_y = (real_t*) malloc(s);
    _CUDA(cudaMemcpy(hst_y, dev_y, s, cudaMemcpyDeviceToHost));
    _CUDA(cudaMemcpy(hst_y_cublas, dev_y_cublas, s, cudaMemcpyDeviceToHost));
    for (unsigned int i = 0; i < nrows; ++i) {
        if (fabsf(hst_y_cublas[i] - hst_y[i]) > 0.001) {
            printf("ERROR ------ %f\n", fabsf(hst_y_cublas[i] - hst_y[i]));
            exit(EXIT_FAILURE);
        }
    }
    if (hst_y_cublas) free(hst_y_cublas);
    if (hst_y) free(hst_y);
}


void do_benchmark() {
    curandGenerator_t gen;
    real_t *dev_rand_data = NULL; // Random data will be allocated here!
    real_t *dev_y = NULL;
    real_t *dev_y_cublas = NULL;
    real_t t;
    real_t t_cublas;
    const size_t n_rows_max = 1500;
    const size_t n_cols_max = 300;
    const size_t ntot = n_cols_max * (1 + n_rows_max);
    const size_t size_tot = sizeof(real_t) * ntot;

    float alpha = 1.0, beta = 0.0; // beta was initially set to 1.0 by mistake
    cublasHandle_t handle;
    _CUBLAS(cublasCreate(&handle));

    start_tictoc();

    _CUDA(cudaMalloc((void** )&dev_rand_data, size_tot));
    _CUDA(cudaMalloc((void** )&dev_y, n_rows_max * sizeof(real_t)));
    _CUDA(cudaMalloc((void** )&dev_y_cublas, n_rows_max * sizeof(real_t)));

    _CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    _CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
    tic();
    _CURAND(curandGenerateUniform(gen, dev_rand_data, ntot));
    t = toc();
    printf("RNG in %f ms\n", t);

    _CURAND(curandDestroyGenerator(gen));

    size_t ncols = CONSTANT_COLS;
    size_t nrows = CONSTANT_ROWS;
    size_t runs = RUNS;

    cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t));
    matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows, ncols);
    _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
                                nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
    /* Compare results */
    compare_results(dev_y_cublas,dev_y, nrows);


    FILE * pFile;
    char filename[50];
#if (TEST_WRT_ == TEST_COLUMNS)
    sprintf(filename, "times_rows%lu_cols.txt", nrows);
#else
    sprintf(filename, "times_cols%lu_rows.txt", ncols);
#endif

    printf("Logging to : '%s'\n", filename);
    pFile = fopen(filename, "w");
    if (pFile == NULL) {
        perror("Error opening file.");
        exit(79);
    }


#if (TEST_WRT_ == TEST_COLUMNS)
    fprintf(pFile, "0, %lu, 0, 0\n", nrows);
    for (ncols = 32; ncols < n_cols_max; ncols += 32) {
#else
    fprintf(pFile, "1, %lu, 0, 0\n", ncols);
    for (nrows = 32; nrows < n_rows_max; nrows += 32) {
#endif
        tic();
        for (short i = 0; i < runs; i++) {
            matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows,
                    ncols);
        }
        t = toc() / runs;
        tic();
        for (short i = 0; i < runs; i++) {
            _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
                            nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
        }
        t_cublas = toc() / runs;
#if (TEST_WRT_ == TEST_COLUMNS)
        fprintf(pFile, "%lu, %f, %f\n", ncols, t, t_cublas);
#else
        fprintf(pFile, "%lu, %f, %f\n", nrows, t, t_cublas);
#endif
    }
    _CUBLAS(cublasDestroy(handle));

    fclose(pFile);

    if (dev_rand_data != NULL)
        _CUDA(cudaFree(dev_rand_data));

    stop_tictoc();
}

int main(void)
{
    do_benchmark();

    return EXIT_SUCCESS;
}

Finalmente, este é um script do MATLAB que estou usando para plotar os tempos de execução:

fetch_this = 'times_cols512_rows.txt';

username = 'ubuntu';
target_hostname = 'jetson';
% Do not modify below this line
eval_this=['! scp ' username '@' target_hostname ':~/mv/Debug/' fetch_this ' .'];
eval(eval_this)

set(0, 'DefaultAxesFontSize', 14);
r = csvread(fetch_this);
r_header = r(1,:);

plot(r(2:end,1), r(2:end,2)*1000, '-'); 
hold on
plot(r(2:end,1), r(2:end,3)*1000, '-r'); 
grid on;

fig_title = 'Matvec on Tegra K1 - %d %s';
if (r_header(1)==1),
    xlabel('Number of rows');
    title(sprintf(fig_title, r_header(2),'columns'));
else
    xlabel('Number of columns');
    title(sprintf(fig_title, r_header(2),'rows'));
end
ylabel('Computation time [us]');


legend('Kernel', 'cuBLAS');
axis tight

Estou preocupado com o desempenho e a escalabilidade do meu kernel, então primeiro gostaria de saber como melhorar a escalabilidade com relação ao número de linhas da matrizA. Segundo, sei que não é uma prática muito boa ter divergência de ramificação (e meu código possui), mas sinto que quero algumas dicas para melhorá-la.

ATUALIZAÇÃO: Graças a todos os seus comentários e sugestões, cheguei à conclusão de quecudaDeviceSynchronized() causou, em primeiro lugar, algumas peculiaridades do meu tempo, portanto minhas medições iniciais eram imprecisas. A ordenação de linhas principais leva a piores resultados. O tamanho dos blocos é um importante parâmetro de ajuste e muda de16 para32 ou64 melhora o tempo de execução. Benchmarking adicional é necessário para escolher o tamanho do bloco. Para esse fim, pode-se usar a seguinte API para o kernel:

template<typename T, const uint_t blk>
__global__ void matvec_kernel(const T * RESTRICT  dA, const T * RESTRICT  dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx);

e chame assim do host:

template<typename T>
__host__ void matvec(const T * RESTRICT dA, const T * RESTRICT dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx) {
    uint_t blk_size_opt = 64;

    /* Add code to decide the value of `blk_size_opt` */

    if (blk_size_opt == 32) {
        matvec_engine<T, 32>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 64) {
        matvec_engine<T, 64>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 128) {
        matvec_engine<T, 128>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 256) {
        matvec_engine<T, 256>(dA, dx, dy, nRows, nx);
    }

}

Deixe-me fornecer alguns resultados de benchmarking. Primeiro, uma comparação com cublasSgemv:

e o efeito do tamanho do bloco no tempo de execução: