Matrix-Vektor-Multiplikation in CUDA: Benchmarking & Performance

Question

Oct 17, 2014, 05:36 AM

Matrix-Vektor-Multiplikation in CUDA: Benchmarking & Performance

Ich aktualisiere meine Frage mit einigen neuen Benchmarking-Ergebnissen (ich habe die Frage auch genauer umformuliert und den Code aktualisiert) ...

Ich habe einen Kernel für die Matrix-Vektor-Multiplikation in CUDA C nach dem @ implementierCUDA C Programmierhandbuch Shared Memory verwenden. Lassen Sie mich zunächst einige Benchmarking-Ergebnisse vorstellen, die ich mit einem Jetson TK1 (GPU: Tegra K1, Rechenkapazität 3.2) und einem Vergleich mit cuBLAS gemacht habe:

Hier macht cuBLAS wohl etwas Magie, da es den Anschein hat, dass die Ausführung nicht durch die Anzahl der Spalten von @ beeinflusst wirA, was wiederum impliziert, dass es eine Art Parallelisierung entlang der Spalten von @ giA.

Nun, hier ist der Quellcode meines Kernels und eine Host-Funktion, um ihn aufzurufen (Datei: mv.cuh):

#include <cuda_runtime.h>

#define     BLOCK_SIZE      16

/* Set to __restric__ */
#define     RESTRICT

/**
 * Performs matrix-vector multiplication on the device.
 *
 * @param   dA              Address of matrix `A` on the device
 * @param   dx              Address of vector `x` on the device
 * @param   dev_ptr_y       Address of result y = A*x
 * @param   nRows           Number of rows of `A`
 * @param   nx              Size of `x` (number of columns of `A`)
 *
 * @tparam  T               Data type
 *
 */
template<typename T>
__global__ void matvec_kernel(
        const T * RESTRICT dA,
        const T * RESTRICT dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx);


/**
 * Host-side wrapper for #matvec_kernel.
 *
 * @param   dA              Address of matrix `A` on the device
 * @param   dx              Address of vector `x` on the device
 * @param   dev_ptr_y       Address of result y = A*x
 * @param   nRows           Number of rows of `A`
 * @param   nx              Size of `x` (number of columns of `A`)
 * @param   elapsed_time    Time for the kernel to complete the execution in `ms`.
 *                          If NULL is passed to this argument, the elapsed time
 *                          will not be computed.
 *
 * @tparam  T               Data type for `A` and `x`
 */
template<typename T>
__host__ void matvec(
        const T * RESTRICT dA,
        const T * RESTRICT dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx);



/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */
/* -------------------------------------------------------------------------------------------- */

template<typename T>
__global__ void matvec_kernel(const T * RESTRICT  dA, const T * RESTRICT  dx,
        T * RESTRICT dy,
        const unsigned int nRows, const unsigned int nx)
{

        unsigned int bid = blockIdx.x;
        unsigned int row = threadIdx.x;
        const unsigned int block_size = blockDim.x;
        const unsigned int num_hor_blocks = ((nx + block_size - 1)/ block_size);
        unsigned int n_star;
        unsigned int idx_x;
        unsigned int idx_Asub;
        unsigned int idx_y;
        const T * Asub;
        const T * xsub;

        /* Only `x` is copied to shared memory */
        __shared__ T x_shared[BLOCK_SIZE];

        idx_y = bid * block_size;

        T * y_sub = dy + idx_y;

        T y_val = 0.0;

        #pragma unroll
        for (unsigned int m = 0; m < num_hor_blocks; ++m)
        {
            idx_Asub = block_size * (bid + m * nRows);
            idx_x = m * block_size;

            Asub = dA + idx_Asub;
            xsub = dx + idx_x;

            if (idx_x + row <  nx) {
                x_shared[row] = xsub[row];
            }

            __syncthreads();


        /* If the tiling is exact */
        if ( (nRows % block_size == 0 && nx % block_size == 0 ) ||
                (m != block_size - 1 || bid != gridDim.x - 1)) {
            y_val += Asub[row] * x_shared[0];
            y_val += Asub[row + nRows] * x_shared[1];
            y_val += Asub[row + 2 * nRows] * x_shared[2];
            y_val += Asub[row + 3 * nRows] * x_shared[3];
            y_val += Asub[row + 4 * nRows] * x_shared[4];
            y_val += Asub[row + 5 * nRows] * x_shared[5];
            y_val += Asub[row + 6 * nRows] * x_shared[6];
            y_val += Asub[row + 7 * nRows] * x_shared[7];
            y_val += Asub[row + 8 * nRows] * x_shared[8];
            y_val += Asub[row + 9 * nRows] * x_shared[9];
            y_val += Asub[row + 10 * nRows] * x_shared[10];
            y_val += Asub[row + 11 * nRows] * x_shared[11];
            y_val += Asub[row + 12 * nRows] * x_shared[12];
            y_val += Asub[row + 13 * nRows] * x_shared[13];
            y_val += Asub[row + 14 * nRows] * x_shared[14];
            y_val += Asub[row + 15 * nRows] * x_shared[15];
        } else {
            n_star = min(BLOCK_SIZE, nx - idx_x);
            #pragma unroll
            for (unsigned int e = 0; e < n_star; ++e) {
                y_val += Asub[row + e * nRows] * x_shared[e];
            }
        }
        __syncthreads();
        }

        if (row + idx_y < nRows)
            y_sub[row] = y_val;

}



template<typename T>
__host__ void matvec(
        const T * RESTRICT  dA,
        const T * RESTRICT  dx,
        T * RESTRICT dy,
        const unsigned int nRows,
        const unsigned int nx)
{
    dim3 dim_grid( (nRows + BLOCK_SIZE -1)/ BLOCK_SIZE );
    dim3 dim_block(BLOCK_SIZE);
    matvec_kernel<T> <<<dim_grid, dim_block>>>(dA, dx, dy, nRows, nx);
}

Ich benutze dies, um meine Ausführung zeitlich festzulegen (Datei: cuda_timer.cuh):

#include <cuda_runtime.h>
#include "error_handles.cuh"

static cudaEvent_t start;
static cudaEvent_t stop;
static short timer_running = 0;
static short tic_called = 0;

/**
 * Sets up the timer.
 *
 * Must be called before any invocation to
 * tic() or toc(), preferrably at the beginning of your
 * application.
 */
void start_tictoc();

/**
 * Starts the timer.
 *
 * Use `toc()` to get the elapsed time; `tic()` must
 * be called before a `toc()`.
 */
void tic();

/**
 * Returns the elapsed time between its invocation
 * and a previous invocation of `toc()`. Returns `-1`
 * and prints a warning message if `toc()` was not
 * previously called. Returns `-2` and prints and error
 * message if `start_tictoc()` has not been called.
 *
 * @return Elapsed time between `tic()` and `toc()` in milliseconds
 * with a resolution of `0.5` microseconds.
 */
float toc();

/**
 * This function should be called when the
 * time will not be being used any more. It destroys
 * the events used to time CUDA kernels. If the timer
 * is not running, this function does nothing and
 * prints a warning message.
 */
void stop_tictoc();


void start_tictoc() {
    _CUDA(cudaEventCreate(&start));
    _CUDA(cudaEventCreate(&stop));
    timer_running = 1;
}

void tic() {
    if (timer_running) {
        _CUDA(cudaEventRecord(start, 0));
        tic_called = 1;
    } else {
        printf("WARNING: tic() called without a timer running!\n");
    }
}

float toc() {
    float elapsed_time;
    if (tic_called == 0) {
        printf("WARNING: toc() called without a previous tic()!\n");
        return -1;
    }
    if (timer_running == 1) {
        // _CUDA(cudaDeviceSynchronize()); // Removed! (See discussion below)
        _CUDA(cudaEventRecord(stop, 0));
        _CUDA(cudaEventSynchronize(stop));
        _CUDA(cudaEventElapsedTime(&elapsed_time, start, stop));
        tic_called = 0;
        return elapsed_time;
    } else {
        printf("WARNING: toc() called without a timer running!\n");
        return -2;
    }

}

void stop_tictoc()
{
    if (timer_running == 1){
        _CUDA(cudaEventDestroy(start));
        _CUDA(cudaEventDestroy(stop));
        timer_running = 0;
    } else{
        printf("WARNING: stop_tictoc() called without a timer running!\n");
    }
}

und meine Hauptdatei (main.cu) ist die folgende:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "cublas_v2.h"
#include <math.h>
#include <curand.h>
#include <stdbool.h>

#include "mv.cuh"
#include "cuda_timer.cuh"
#include "error_handles.cuh"

typedef float real_t;

#define _CUDA(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define _CUBLAS(x) do { if((x) != CUBLAS_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define _CURAND(x) do { if((x) != CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    exit(EXIT_FAILURE);}} while(0)

#define TEST_COLUMNS        1
#define TEST_ROWS           0

/**
 * If `TEST_WRT_` is set to `TEST_COLUMNS`, then a benchmark
 * will be performed with respect to columns (with a fixed
 * number of rows). If it is set to `TEST_ROWS`, then a benchmark will
 * run with respect to rows (fixed number of columns).
 */
#define TEST_WRT_ TEST_ROWS

#define CONSTANT_COLS 300
#define CONSTANT_ROWS 256

/**
 * In order to estimate the execution time, every
 * kernel is run `RUNS` times and the average is taken.
 */
#define RUNS 50

void compare_results(real_t *dev_y_cublas, real_t * dev_y,unsigned int nrows)
{
    real_t * hst_y_cublas;
    real_t * hst_y;
    const size_t s = nrows * sizeof(real_t);

    hst_y_cublas = (real_t*) malloc(s);
    hst_y = (real_t*) malloc(s);
    _CUDA(cudaMemcpy(hst_y, dev_y, s, cudaMemcpyDeviceToHost));
    _CUDA(cudaMemcpy(hst_y_cublas, dev_y_cublas, s, cudaMemcpyDeviceToHost));
    for (unsigned int i = 0; i < nrows; ++i) {
        if (fabsf(hst_y_cublas[i] - hst_y[i]) > 0.001) {
            printf("ERROR ------ %f\n", fabsf(hst_y_cublas[i] - hst_y[i]));
            exit(EXIT_FAILURE);
        }
    }
    if (hst_y_cublas) free(hst_y_cublas);
    if (hst_y) free(hst_y);
}


void do_benchmark() {
    curandGenerator_t gen;
    real_t *dev_rand_data = NULL; // Random data will be allocated here!
    real_t *dev_y = NULL;
    real_t *dev_y_cublas = NULL;
    real_t t;
    real_t t_cublas;
    const size_t n_rows_max = 1500;
    const size_t n_cols_max = 300;
    const size_t ntot = n_cols_max * (1 + n_rows_max);
    const size_t size_tot = sizeof(real_t) * ntot;

    float alpha = 1.0, beta = 0.0; // beta was initially set to 1.0 by mistake
    cublasHandle_t handle;
    _CUBLAS(cublasCreate(&handle));

    start_tictoc();

    _CUDA(cudaMalloc((void** )&dev_rand_data, size_tot));
    _CUDA(cudaMalloc((void** )&dev_y, n_rows_max * sizeof(real_t)));
    _CUDA(cudaMalloc((void** )&dev_y_cublas, n_rows_max * sizeof(real_t)));

    _CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
    _CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
    tic();
    _CURAND(curandGenerateUniform(gen, dev_rand_data, ntot));
    t = toc();
    printf("RNG in %f ms\n", t);

    _CURAND(curandDestroyGenerator(gen));

    size_t ncols = CONSTANT_COLS;
    size_t nrows = CONSTANT_ROWS;
    size_t runs = RUNS;

    cudaMemset(dev_y_cublas, 0, n_rows_max * sizeof(real_t));
    matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows, ncols);
    _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
                                nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
    /* Compare results */
    compare_results(dev_y_cublas,dev_y, nrows);


    FILE * pFile;
    char filename[50];
#if (TEST_WRT_ == TEST_COLUMNS)
    sprintf(filename, "times_rows%lu_cols.txt", nrows);
#else
    sprintf(filename, "times_cols%lu_rows.txt", ncols);
#endif

    printf("Logging to : '%s'\n", filename);
    pFile = fopen(filename, "w");
    if (pFile == NULL) {
        perror("Error opening file.");
        exit(79);
    }


#if (TEST_WRT_ == TEST_COLUMNS)
    fprintf(pFile, "0, %lu, 0, 0\n", nrows);
    for (ncols = 32; ncols < n_cols_max; ncols += 32) {
#else
    fprintf(pFile, "1, %lu, 0, 0\n", ncols);
    for (nrows = 32; nrows < n_rows_max; nrows += 32) {
#endif
        tic();
        for (short i = 0; i < runs; i++) {
            matvec<real_t>(dev_rand_data + ncols, dev_rand_data, dev_y, nrows,
                    ncols);
        }
        t = toc() / runs;
        tic();
        for (short i = 0; i < runs; i++) {
            _CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, nrows, ncols, &alpha, dev_rand_data + ncols,
                            nrows, dev_rand_data, 1, &beta, dev_y_cublas, 1));
        }
        t_cublas = toc() / runs;
#if (TEST_WRT_ == TEST_COLUMNS)
        fprintf(pFile, "%lu, %f, %f\n", ncols, t, t_cublas);
#else
        fprintf(pFile, "%lu, %f, %f\n", nrows, t, t_cublas);
#endif
    }
    _CUBLAS(cublasDestroy(handle));

    fclose(pFile);

    if (dev_rand_data != NULL)
        _CUDA(cudaFree(dev_rand_data));

    stop_tictoc();
}

int main(void)
{
    do_benchmark();

    return EXIT_SUCCESS;
}

Abschließend ist dies ein MATLAB-Skript, mit dem ich die Ausführungszeiten zeichne:

fetch_this = 'times_cols512_rows.txt';

username = 'ubuntu';
target_hostname = 'jetson';
% Do not modify below this line
eval_this=['! scp ' username '@' target_hostname ':~/mv/Debug/' fetch_this ' .'];
eval(eval_this)

set(0, 'DefaultAxesFontSize', 14);
r = csvread(fetch_this);
r_header = r(1,:);

plot(r(2:end,1), r(2:end,2)*1000, '-'); 
hold on
plot(r(2:end,1), r(2:end,3)*1000, '-r'); 
grid on;

fig_title = 'Matvec on Tegra K1 - %d %s';
if (r_header(1)==1),
    xlabel('Number of rows');
    title(sprintf(fig_title, r_header(2),'columns'));
else
    xlabel('Number of columns');
    title(sprintf(fig_title, r_header(2),'rows'));
end
ylabel('Computation time [us]');


legend('Kernel', 'cuBLAS');
axis tight

ch bin besorgt über die Leistung und die Skalierbarkeit meines Kernels, daher möchte ich zunächst wissen, wie ich die Skalierbarkeit in Bezug auf die Anzahl der Zeilen der Matrix verbessern kanA. Zweitens weiß ich, dass es nicht sehr empfehlenswert ist, Verzweigungsdivergenzen zu haben (und mein Code hat), aber ich habe das Gefühl, ich möchte einige Hinweise, um sie zu verbessern.

UPDATE: Dank all Ihrer Kommentare und Vorschläge bin ich zu dem Schluss gekommen, dasscudaDeviceSynchronized() verursachte in erster Linie einige Besonderheiten mit meinem Timing, so dass meine anfänglichen Messungen ungenau waren. Reihen-Hauptreihenfolge führt zu schlechteren Ergebnissen. Die Größe der Blöcke ist ein wichtiger Abstimmungsparameter und ändert sich von16 zu32 oder64 verbessert die Ausführungszeit. Für die Auswahl der Blockgröße ist ein weiteres Benchmarking erforderlich. Zu diesem Zweck kann man die folgende API für den Kernel verwenden:

template<typename T, const uint_t blk>
__global__ void matvec_kernel(const T * RESTRICT  dA, const T * RESTRICT  dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx);

und nennen es so vom Host:

template<typename T>
__host__ void matvec(const T * RESTRICT dA, const T * RESTRICT dx,
        T * RESTRICT dy, const uint_t nRows, const uint_t nx) {
    uint_t blk_size_opt = 64;

    /* Add code to decide the value of `blk_size_opt` */

    if (blk_size_opt == 32) {
        matvec_engine<T, 32>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 64) {
        matvec_engine<T, 64>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 128) {
        matvec_engine<T, 128>(dA, dx, dy, nRows, nx);
    } else if (blk_size_opt == 256) {
        matvec_engine<T, 256>(dA, dx, dy, nRows, nx);
    }

}

Lassen Sie mich einige Benchmarking-Ergebnisse liefern. Zuerst ein Vergleich mit cublasSgemv:

und die Auswirkung der Blockgröße auf die Ausführungszeit: