cuda matrix inversa gaussian jordania

No encontré ninguna pregunta similar a la mía. Estoy tratando de escribir el algoritmo de matriz inversa gaussian-jordan. La idea del algoritmo es simple :)

Quiero invertir solo una matriz triangular inferior. Obtuve una respuesta casi correcta. ¿Dónde hice algo mal? ¿Alguien me puede ayudar? Te lo agradeceré

d_ A matriz triangular inferior (nxn)dI matriz de identidad (nxn)

n tamaño de una matriz en una dirección (n% 16 = 0)

dim3 threadsPerBlock (n / 16, n / 16);

dim3 numBlocks (16,16);

Sé que es una implementación simple, pero al principio necesito que funcione correctamente :) ¿Alguien puede ayudarme o darme alguna pista? Te lo agradeceré ¡Muchas gracias!

hay todo el código de la CPU:

  #include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

using namespace std;

 __global__ void gaussjordan(float *A,  float *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i)
            if(y>=i){
                P=A[x*n+i]/A[i*n+i];
                I[x*n+y]-= I[i*n+y]*P;
                A[x*n+y]-= A[i*n+y]*P;
            }
            __syncthreads();
 }


 __global__ void dev(float *d_A,  float *dI, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x<h && y<h)
        if(d_A[x*h+x]!=0){
            dI[x*h+y]  /= d_A[x*h+x];
            d_A[x*h+y] /= d_A[x*h+x];
        }
    __syncthreads();

}

void savetofile(float *A, string s, int n, int h)
{
    std::ofstream plik;
    plik.open(s);

    for(int j=0;j<h;j++){
        for(int i=0;i<h;i++){
            plik<<A[j*n+i]<<"\t";}
            plik<<endl;}
    plik.close();
}

int main()
{
    int n = 16;
// creating input
    float *iL = new float [n*n];
    float *L = new float [n*n];
    for(int i=0;i<n;i++)
        for(int j=0;j<n;j++)
            if(i==j || i>j) L[i*n+j] = (i*n+j+1)*(i*n+j+1)*0.007 + (i*n+j+1)*0.01 -(i*n+j+1)*(i*n+j+1)*(i*n+j+1)*0.0005;
            else L[i*n+j] = 0;

    savetofile(L,"L.txt",n,n);

    cout<<"inv\n";
    float *d_A, *d_L,*I, *dI;
    float time;
    cudaError_t err;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(float);

    dim3 threadsPerBlock(n/16,n/16);
    dim3 numBlocks(16,16);
// memory allocation    
    err= cudaMalloc( (void**)  &d_A, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err= cudaMalloc( (void**)   &dI, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    I = new float[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=1.0;
                else I[i*n+j]=0.0;}}
 //copy data from GPU to CPU
    err =cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err =cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice);  if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
    }
    dev<<<numBlocks,    threadsPerBlock>>>(d_A, dI, n); 

    err = cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 
    err = cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
    savetofile(iL,"inv.txt",n,n);
    savetofile(L,"I.txt",n,n);
    cudaFree(d_A);
    cudaFree(dI);
    delete []I;
    delete []L;
    delete []iL;
    system("Pause");
 return 0;
}

ahí está mi salida:

60.6061 0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
-34.1233    -2.13675    -0  -0  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-48.5115    1.91603 -0.0799201  -0  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-49.4891    1.8697  0.0748167   -0.0196634  -0  -0  -0  -0  0   0   0   0   0   0   0   0   
-49.8332    1.84732 0.0725876   0.018747    -0.00767828 -0  -0  -0  0   0   0   0   0   0   0   0   
-50.0073    1.83403 0.071321    0.0182352   0.00739595  -0.00376795 -0  -0  0   0   0   0   0   0   0   0   
-50.112 1.82521 0.0705011   0.0179073   0.0072164   0.00365346  -0.00212282 -0  0   0   0   0   0   0   0   0   
-50.1818    1.81893 0.0699261   0.0176789   0.00709196  0.00357445  0.00206784  -0.00131234 0   0   0   0   0   0   0   0   
-50.2316    1.81423 0.0695003   0.0175105   0.00700059  0.00351662  0.0020277   0.00128271  -0.00086736 -0  -0  -0  -0  -0  -0  -0  
-50.2689    1.81057 0.0691722   0.0173813   0.00693062  0.00347244  0.00199711  0.00126017  0.000850006 -0.000602925    -0  -0  -0  -0  -0  -0  
-50.2979    1.80765 0.0689115   0.0172789   0.0068753   0.00343758  0.00197301  0.00124245  0.000836382 0.000592093 -0.000435975    -0  -0  -0  -0  -0  
-50.321 1.80527 0.0686993   0.0171957   0.00683047  0.00340937  0.00195354  0.00122815  0.000825401 0.000583374 0.000428868 -0.00032541 -0  -0  -0  -0  
-50.34  1.80328 0.0685233   0.0171269   0.0067934   0.00338607  0.00193748  0.00121637  0.000816362 0.000576204 0.000423029 0.000320554 -0.000249293    -0  -0  -0  
-50.3557    1.80159 0.0683749   0.0170689   0.00676223  0.0033665   0.001924    0.00120649  0.000808792 0.000570204 0.000418147 0.000316498 0.000245864 -0.000195186    -0  -0  
-50.369 1.80015 0.0682481   0.0170195   0.00673566  0.00334983  0.00191253  0.00119809  0.000802358 0.000565109 0.000414005 0.000313058 0.000242958 0.000192695 -0.000155673    -0  
-50.3805    1.7989  0.0681385   0.0169768   0.00671274  0.00333547  0.00190265  0.00119086  0.000796824 0.000560729 0.000410446 0.000310105 0.000240465 0.000190559 0.00015382  -0.000126146

y debería ser:

60,6060606060606    4,44089209850063e-16    4,85722573273506e-17    -3,12250225675825e-17   0   1,73472347597681e-18    -1,08420217248550e-18   -7,58941520739853e-19   4,33680868994202e-19    -5,42101086242752e-19   0   -6,93889390390723e-18   0   -1,38777878078145e-17   0   1,18720137887163e-17
-34,1232841232841   -2,13675213675214   0   8,67361737988404e-18    3,03576608295941e-18    8,67361737988404e-19    -1,73472347597681e-18   1,35525271560688e-18    -8,67361737988404e-19   1,00288700954909e-18    0   0   6,93889390390723e-18    6,93889390390723e-18    -1,38777878078145e-17   3,02221355580334e-18
-17,9130271437964   1,91603268526345    -0,0799200799200800 1,30104260698261e-18    1,95156391047391e-18    -9,75781955236954e-19   1,95156391047391e-18    2,16840434497101e-19    -3,52365706057789e-19   -1,62630325872826e-19   1,38777878078145e-17    -3,46944695195361e-18   0   0   0   -2,72405795836983e-18
-2,86140643299924   0,0760191125748172  0,0748166415934231  -0,0196633632216454 -2,41234983378025e-18   7,99599102208060e-19    3,25260651745651e-19    -4,74338450462408e-19   2,67662411332359e-19    2,91379333855479e-19    -2,16840434497101e-18   -4,33680868994202e-19   1,30104260698261e-18    0   0   6,86096687275983e-20
-1,33482739506506   0,0346053236774996  0,00125734163772674 0,0187469132242915  -0,00767825058738617    5,35324822664718e-19    -2,23616698075135e-19   5,08219768352580e-20    5,92923063078010e-20    1,74488787134386e-19    -4,33680868994202e-19   4,33680868994202e-19    -2,16840434497101e-19   2,16840434497101e-19    0   -1,19008129089229e-19
-0,793561224702690  0,0203250367373064  0,000727127971238783    0,000177630032830862    0,00739591005669882 -0,00376795430225022    4,98055372985529e-19    -3,84552958053452e-19   3,20178454062126e-19    -1,35525271560688e-19   6,50521303491303e-19    -1,08420217248550e-19   1,08420217248550e-19    -2,16840434497101e-19   0   -7,15742840429884e-20
-0,532255026297144  0,0135340901236068  0,000479383336751935    0,000115847127348313    4,51920594555328e-05    0,00365346070706817 -0,00212282675610843    1,37219337455197e-19    -5,14996031930615e-19   3,30342849429177e-19    0   -2,71050543121376e-19   1,08420217248550e-19    0   0   5,08219768352580e-20
-0,384130052448431  0,00972113086608457 0,000342250536212794    8,21235560483452e-05    3,18129608485860e-05    1,56232096436654e-05    0,00206784220009096 -0,00131233595800525    6,39509875176997e-20    -3,37542629480839e-19   -1,08420217248550e-19   2,16840434497101e-19    0   0   0   -8,47032947254300e-22
-0,291692030052418  0,00735419164507677 0,000257375648850429    6,15185225200113e-05    2,37495210052671e-05    1,16038017329438e-05    6,53368676878396e-06    0,00128271813402154 -0,000867362869930264   1,77876918923403e-19    1,62630325872826e-19    -1,89735380184963e-19   1,62630325872826e-19    0   0   -9,07384044746169e-20
-0,229596895430646  0,00578230937666655 0,000201707743336976    4,79768824589291e-05    1,84020572663637e-05    8,96002707181433e-06    5,05525466995835e-06    3,12009781742606e-06    0,000850011219708818    -0,000602925394011745   0   2,71050543121376e-20    -8,13151629364128e-20   5,42101086242752e-20    -5,42101086242752e-20   7,73976355553617e-20
-0,185720949479909  0,00466765632076680 0,000162419592307734    3,85318721641536e-05    1,47407053519860e-05    7,17308297585328e-06    4,02178178072207e-06    2,48428717850195e-06    1,64547815065802e-06    0,000592092919336558    -0,000435974905284452   0   0   8,13151629364128e-20    -1,08420217248550e-19   2,64697796016969e-20
-0,153867987373140  0,00385473267086607 0,000133863548213241    3,17506489004575e-05    1,20962229586152e-05    5,86799087221288e-06    3,28276799988068e-06    2,02338706451671e-06    1,33735029942045e-06    9,34275734555363e-07    0,000428867197061432    -0,000325409609345764   0   2,71050543121376e-20    0   -1,09055491958991e-20
-0,129703518509601  0,00324211947468978 0,000112403568308126    2,65969300905272e-05    1,01402805713936e-05    4,89779294849866e-06    2,73496124917826e-06    1,68586638861081e-06    1,11012300345236e-06    7,73556738632873e-07    5,60933254708493e-07    0,000320553621268105    -0,000249293253625970   5,42101086242752e-20    0   -1,01114558078482e-20
-0,110691345431593  0,00276839969825208 9,59884298624889e-05    2,25961759289096e-05    8,63052307521336e-06    4,15554692230644e-06    2,31688356971108e-06    1,42511604039733e-06    9,39229137057347e-07    6,51934526276135e-07    4,72019315851685e-07    3,53897320062806e-07    0,000245863313382516    -0,000195185934120844   0   -1,24407964127975e-20
-0,0958269169656213 0,00239699666599593 8,28626202960276e-05    1,95227026042985e-05    7,41637441475814e-06    3,57424367962823e-06    1,99334817579930e-06    1,21993241781196e-06    8,05577604288488e-07    5,57554928001086e-07    4,03155267486669e-07    3,01723475812485e-07    2,31838854154289e-07    0,000192695260333710    -0,000155673036807333   -2,34522247271034e-20
-0,0838002301027703 0,00209415237243389 7,23249901251223e-05    1,70229067498473e-05    6,46008752692950e-06    3,11455737751181e-06    1,73159030599080e-06    1,06073213436631e-06    6,96842172109705e-07    4,82764206408816e-07    3,49217230232344e-07    2,60145440758586e-07    2,00286821017368e-07    1,56906945950947e-07    0,000153820426928509    -0,000126146355001072

Respuestas a la pregunta(1)

Su respuesta a la pregunta