ядра CUDA не выполняются одновременно

Question

Jan 02, 2013, 03:34 PM

ядра CUDA не выполняются одновременно

Я пытаюсь исследовать свойство выполнения параллельных ядер моего Nvidia Quadro 4000, который имеет возможность 2.0.

Я использую 2 разных потока, которые работают так же, как показано ниже:

Скопируйте H2D два разных куска закрепленной памятиЗапустить ядроСкопируйте D2H двумя разными частями в закрепленную память.

Ядра обоих потоков абсолютно одинаковы и имеют время выполнения 190 мс каждый.

В Visual profiler (версия 5.0) я ожидал, что оба ядра начнут выполнение одновременно, однако они перекрываются только на 20 мс. Вот пример кода:

enter code here

//initiate the streams
        cudaStream_t stream0,stream1;
        CHK_ERR(cudaStreamCreate(&stream0));
        CHK_ERR(cudaStreamCreate(&stream1));
        //allocate the memory on the GPU for stream0
        CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        //allocate the memory on the GPU for stream1
        CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));

        //allocate page-locked memory for stream0
        CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));

        //allocate page-locked memory for stream1
        CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));


        memcpy(host01,in1,width*height*sizeof(char));
        memcpy(host02,in2,width*height*sizeof(char));

        memcpy(host11,in1,width*height*sizeof(char));
        memcpy(host12,in2,width*height*sizeof(char));



        cudaEvent_t start, stop;
        float time;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
        dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);

        cudaEventRecord(start,0);
        // --------------------
        // Copy images to device
        // --------------------
        //enqueue copies of def stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
        //enqueue copies of ref stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));

        CHK_ERR(cudaStreamSynchronize(stream0));
        CHK_ERR(cudaStreamSynchronize(stream1));

        //CALLING KERNEL
        //enqueue kernel in stream0 and stream1
        TIME_KERNEL((exhaustiveSearchKernel< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
        TIME_KERNEL((exhaustiveSearchKernel

ядра CUDA не выполняются одновременно

Ответы на вопрос(1)

Ваш ответ на вопрос

Популярные вопросы

Вы очень активны! Это здорово!

ядра CUDA не выполняются одновременно

Ответы на вопрос(1)

Ваш ответ на вопрос

Популярные вопросы