Um paralelo para usar std :: thread?

Eu sou novo com std :: thread e eu tento codificar umparallel_for. Eu codifiquei o seguinte:

// parallel_for.cpp
// compilation: g++ -O3 -std=c++0x parallel_for.cpp -o parallel_for -lpthread
// execution: time ./parallel_for 100 50000000 
// (100: number of threads, 50000000: vector size)
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <utility>

// Parallel for
template<typename Iterator, class Function>
void parallel_for(const Iterator& first, const Iterator& last, Function&& f, const int nthreads = 1, const int threshold = 1000)
{
    const unsigned int group = std::max(std::max(1, std::abs(threshold)), (last-first)/std::abs(nthreads));
    std::vector<std::thread> threads;
    for (Iterator it = first; it < last; it += group) {
        threads.push_back(std::thread([=](){std::for_each(it, std::min(it+group, last), f);}));
    }
    std::for_each(threads.begin(), threads.end(), [=](std::thread& x){x.join();});
}

// Function to apply
template<typename Type>
void f1(Type& x)
{
    x = std::sin(x)+std::exp(std::cos(x))/std::exp(std::sin(x)); 
}

// Main
int main(int argc, char* argv[]) {

    const unsigned int nthreads = (argc > 1) ? std::atol(argv[1]) : (1);
    const unsigned int n = (argc > 2) ? std::atol(argv[2]) : (100000000);
    double x = 0;
    std::vector<double> v(n);
    std::iota(v.begin(), v.end(), 0);

    parallel_for(v.begin(), v.end(), f1<double>, nthreads);

    for (unsigned int i = 0; i < n; ++i) x += v[i];
    std::cout<<std::setprecision(15)<<x<<std::endl;
    return 0;
}

Mas isso não está funcionando: (códigos de erro do g ++ 4.6)

parallel_for.cpp: In instantiation of ‘parallel_for(const Iterator&, const Iterator&, Function&&, int, int) [with Iterator = __gnu_cxx::__normal_iterator<double*, std::vector<double> >, Function = void (&)(double&)]::<lambda()>’:
parallel_for.cpp:22:9:   instantiated from ‘void parallel_for(const Iterator&, const Iterator&, Function&&, int, int) [with Iterator = __gnu_cxx::__normal_iterator<double*, std::vector<double> >, Function = void (&)(double&)]’
parallel_for.cpp:43:58:   instantiated from here
parallel_for.cpp:22:89: erreur: field ‘parallel_for(const Iterator&, const Iterator&, Function&&, int, int) [with Iterator = __gnu_cxx::__normal_iterator<double*, std::vector<double> >, Function = void (&)(double&)]::<lambda()>::__f’ invalidly declared function type

Como resolver este problema ?

EDIT: esta nova versão compila, mas não dá o bom resultado:

// parallel_for.cpp
// compilation: g++ -O3 -std=c++0x parallel_for.cpp -o parallel_for -lpthread
// execution: time ./parallel_for 100 50000000 
// (100: number of threads, 50000000: vector size)
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <utility>

// Parallel for
template<typename Iterator, class Function>
void parallel_for(const Iterator& first, const Iterator& last, Function&& f, const int nthreads = 1, const int threshold = 1000)
{
    const unsigned int group = std::max(std::max(1, std::abs(threshold)), (last-first)/std::abs(nthreads));
    std::vector<std::thread> threads;
    for (Iterator it = first; it < last; it += group) {
        threads.push_back(std::thread([=, &f](){std::for_each(it, std::min(it+group, last), f);}));
    }
    std::for_each(threads.begin(), threads.end(), [](std::thread& x){x.join();});
}

// Function to apply
template<typename Type>
void f(Type& x)
{
    x = std::sin(x)+std::exp(std::cos(x))/std::exp(std::sin(x)); 
}

// Main
int main(int argc, char* argv[]) {

    const unsigned int nthreads = (argc > 1) ? std::atol(argv[1]) : (1);
    const unsigned int n = (argc > 2) ? std::atol(argv[2]) : (100000000);
    double x = 0;
    double y = 0;
    std::vector<double> v(n);

    std::iota(v.begin(), v.end(), 0);
    std::for_each(v.begin(), v.end(), f<double>);
    for (unsigned int i = 0; i < n; ++i) x += v[i];

    std::iota(v.begin(), v.end(), 0);
    parallel_for(v.begin(), v.end(), f<double>, nthreads);
    for (unsigned int i = 0; i < n; ++i) y += v[i];

    std::cout<<std::setprecision(15)<<x<<" "<<y<<std::endl;
    return 0;
}

O resultado é :

./parallel_for 1 100
155.524339894552 4950

A versão paralela retorna 4950 enquanto a versão sequencial retorna 155 ..... Onde está o problema?

questionAnswers(6)

yourAnswerToTheQuestion