Por que meu código de varredura inclusiva (inclusive scan) é 2x mais rápido na CPU do que em uma GPU?

Eu escrevi um pequeno programa CUDA que usa a biblioteca CUB, altamente otimizada, para demonstrar que um único núcleo de um antigo processador Intel Q6600 de quatro núcleos (os quatro juntos são supostamente capazes de ~30 GFLOPS) consegue fazer uma varredura inclusiva (ou soma cumulativa / soma de prefixos, se preferir) em 100.000 elementos mais rapidamente do que uma Nvidia 750 Ti (supostamente capaz de 1306 GFLOPS em precisão simples). Por que isso acontece?

O código fonte é:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cub/cub.cuh>

#include <stdio.h>
#include <time.h>
#include <algorithm>


// Wraps a CUDA call that yields a cudaError_t and reports any failure together
// with the call site (file and line). The do { } while (0) wrapper makes the
// macro expand to exactly one statement, so `gpuErrchk(x);` is safe inside an
// unbraced if/else body.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Checks a CUDA status code.
//   code  - status returned by the wrapped call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default), terminate the process on failure
// On any status other than cudaSuccess, prints the error string and call site
// to stderr; if `abort` is set, exits with `code` as the process exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Fills the first inputN entries of inputArray with pseudo-random floats
// obtained as rand() / RAND_MAX. Uses the C library generator, so the sequence
// depends on the current srand() state. Does nothing when inputN <= 0.
void fillArrayWithRandom(float* inputArray, int inputN)
{
    const float randMaxAsFloat = float(RAND_MAX);
    int idx = 0;
    while (idx < inputN)
    {
        const float sample = (float)rand();
        inputArray[idx] = sample / randMaxAsFloat;
        ++idx;
    }
}

// Sequential inclusive prefix sum on the host:
//   inputSummedArray[k] = inputArray[0] + ... + inputArray[k]  for 0 <= k < inputN.
// Each output is formed from the previously stored output plus the current
// input element. Nothing is read or written when inputN <= 0.
void inclusiveSum_CPU(float *inputArray, float *inputSummedArray, int inputN)
{
    if (inputN <= 0)
    {
        return;
    }

    // The first output is just the first input; every later one extends it.
    inputSummedArray[0] = inputArray[0];
    for (int pos = 1; pos < inputN; ++pos)
    {
        inputSummedArray[pos] = inputSummedArray[pos - 1] + inputArray[pos];
    }
}

// Benchmarks cub::DeviceScan::InclusiveSum on the GPU against the
// single-threaded inclusiveSum_CPU reference on the same N-element float array,
// then prints the average wall time of one run for each and the GPU/CPU ratio.
//
// NOTE: the timings are only meaningful for an optimized build. Device code
// compiled with nvcc -G (device debug) and host code compiled without
// optimization run far slower than a release build.
int main()
{
    const int N = 100000; //1 hundred thousand elements
    const int numSimulations = 10000; // integral trip count for both timing loops

    //Make Host Arrays
    float* testArray_CPU = (float *)malloc(sizeof(float)*N);
    float* testArrayOutput_CPU = (float *)malloc(sizeof(float)*N);
    if (testArray_CPU == NULL || testArrayOutput_CPU == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(testArray_CPU);
        free(testArrayOutput_CPU);
        return 1;
    }
    fillArrayWithRandom(testArray_CPU, N);

    //Make GPU Arrays
    float* testArray_GPU;
    gpuErrchk(cudaMalloc(&testArray_GPU, N*sizeof(float)));
    gpuErrchk(cudaMemcpy(testArray_GPU, testArray_CPU, N*sizeof(float), cudaMemcpyHostToDevice));
    float* testArrayOutput_GPU;
    gpuErrchk(cudaMalloc(&testArrayOutput_GPU, N*sizeof(float)));

    //Initiate the benchmark variables
    clock_t begin_CPU, end_CPU;
    float time_spent_GPU, time_spent_CPU;

    //GPU prep
    // First call with a NULL workspace only reports the required size in
    // temp_storage_bytes; no scan is performed.
    void     *d_temp_storage = NULL;
    size_t   temp_storage_bytes = 0;
    gpuErrchk(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, testArray_GPU, testArrayOutput_GPU, N));
    gpuErrchk(cudaMalloc(&d_temp_storage, temp_storage_bytes));

    //GPU Timing
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));
    gpuErrchk(cudaEventRecord(start, 0));
    for (int i = 0; i < numSimulations; i++)
    {
        // CUB reports launch/configuration failures through its return value.
        gpuErrchk(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, testArray_GPU, testArrayOutput_GPU, N));
    }
    // Record the stop event right behind the last scan and wait on it, so the
    // measured interval covers the queued GPU work only.
    gpuErrchk(cudaEventRecord(stop, 0));
    gpuErrchk(cudaEventSynchronize(stop));
    gpuErrchk(cudaEventElapsedTime(&time_spent_GPU, start, stop));

    // Surface any error left pending by the asynchronous work above.
    gpuErrchk(cudaGetLastError());

    // cudaEventElapsedTime reports milliseconds; convert to seconds.
    time_spent_GPU = (float)(time_spent_GPU / 1000);
    float avg_GPU = time_spent_GPU / numSimulations;
    // NOTE: despite the "[sim/sec]" label, the printed value is seconds per simulation.
    printf("Avg. GPU Simulation Time: %.17g [sim/sec]\n", avg_GPU);

    //CPU Timing
    begin_CPU = clock();
    for (int i = 0; i < numSimulations; i++)
    {
        inclusiveSum_CPU(testArray_CPU, testArrayOutput_CPU, N);
    }
    end_CPU = clock();
    time_spent_CPU = (float)(end_CPU - begin_CPU) / CLOCKS_PER_SEC;
    float avg_CPU = time_spent_CPU / numSimulations;
    printf("Avg. CPU Simulation Time: %.17g [sim/sec]\n", avg_CPU);

    printf("GPU/CPU Timing:%.17gx \n", avg_GPU / avg_CPU);

    // Release everything acquired above.
    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));
    gpuErrchk(cudaFree(d_temp_storage));
    gpuErrchk(cudaFree(testArrayOutput_GPU));
    gpuErrchk(cudaFree(testArray_GPU));
    free(testArrayOutput_CPU);
    free(testArray_CPU);

    return 0;
}

E a saída quando a executo na minha máquina é:

Avg. GPU Simulation Time: 0.0011999999405816197 [sim/sec]

Avg. CPU Simulation Time: 0.00059999997029080987 [sim/sec]

GPU/CPU Timing:2x

Além disso, aqui estão meus sinalizadores de compilação e saída:

1>------ Build started: Project: speedTest, Configuration: Debug Win32 ------
1>  Compiling CUDA source file kernel.cu...
1>  
1>  C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" -gencode=arch=compute_50,code=\"sm_50,compute_50\" --use-local-env --cl-version 2013 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin" -rdc=true -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\include"  -G   --keep-dir Debug -maxrregcount=0  --machine 32 --compile -cudart static  -g   -DWIN32 -D_DEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd  " -o Debug\kernel.cu.obj "C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest\kernel.cu" 
1>  kernel.cu
1>  
1>  C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\speedTest>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\nvcc.exe" -dlink -o Debug\speedTest.device-link.obj -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd  " -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32" cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib  -gencode=arch=compute_50,code=sm_50 -G --machine 32 Debug\kernel.cu.obj 
1>  cudart.lib
1>  kernel32.lib
1>  user32.lib
1>  gdi32.lib
1>  winspool.lib
1>  comdlg32.lib
1>  advapi32.lib
1>  shell32.lib
1>  ole32.lib
1>  oleaut32.lib
1>  uuid.lib
1>  odbc32.lib
1>  odbccp32.lib
1>  kernel.cu.obj
1>  speedTest.vcxproj -> C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\Debug\speedTest.exe
1>  copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart*.dll" "C:\Users\Owner\Documents\Visual Studio 2013\Projects\speedTest\Debug\"
1>  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart32_65.dll
1>  C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\bin\cudart64_65.dll
1>          2 file(s) copied.
========== Build: 1 succeeded, 0 failed, 0 up-to-date, 0 skipped ==========

questionAnswers(1)

yourAnswerToTheQuestion