I am trying to parallelize the following function (an iterative solver), which has a nested for loop inside a while loop. The code looks like this:
static const int nx = 128;
static const int ny = 128;
static const int nz = 128;
//Initialize
Eigen::Tensor<std::complex<double>, 3> xTerm(nx,ny,nz);
Eigen::Tensor<std::complex<double>, 3> yTerm(nx,ny,nz);
Eigen::Tensor<std::complex<double>, 3> zTerm(nx,ny,nz);
Eigen::Tensor<std::complex<double>, 3> RHS(nx,ny,nz);
double UpdatedSol_max, OldSol_max;
double it_error;
// Begin counter for the number of iterations it takes to calculate phi
int count = 0;
// Begin while loop
do{
    // Calculate something
    #pragma omp parallel for collapse(3)
    for(int i = 0; i < nx; i++){
        for(int j = 0; j < ny; j++){
            for(int k = 0; k < nz; k++){
                RHS(k,i,j) = Source(k,i,j) - xTerm(k,i,j) - yTerm(k,i,j) - zTerm(k,i,j);
            }
        }
    }
    // Some calculations
    // Increase iteration count by 1
    count = count + 1;
    // Calculate error
    it_error = fabs((UpdatedSol_max - OldSol_max)/UpdatedSol_max);
}while( it_error > err_max && count <= max_iter && UpdatedSol_max > err_max);
return count;
A simple #pragma omp parallel for collapse(3) actually makes the code slower, so I tried using a reduction, since there is some addition in that nested for loop. This is my attempt:
#pragma omp parallel for reduction(+: Terms)
for(int i = 0; i < nx; i++){
    for(int j = 0; j < ny; j++){
        for(int k = 0; k < nz; k++){
            Terms(k,i,j) += xTerm(k,i,j) + yTerm(k,i,j) + zTerm(k,i,j);
            RHS(k,i,j) = Source(k,i,j) - Terms(k,i,j);
        }
    }
}
This, however, returns the error:
error: user defined reduction not found for ‘Terms’
which I tried to fix by specifying how many elements should be reduced: #pragma omp parallel for reduction(+: Terms[:nx])
but I am still getting the error:
error: ‘Terms’ does not have pointer or array type
I am somewhat confused: can I not use an Eigen tensor structure here? How can I rewrite this better? Thanks.
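For context, the only reduction patterns I have gotten to compile are on plain scalars or raw C arrays, something like the sketch below (purely illustrative, not part of the solver; total and rowSum are made-up accumulators):

// Scalar reduction: each thread gets a private copy of "total" that is combined at the end.
double total = 0.0;
#pragma omp parallel for collapse(3) reduction(+: total)
for(int i = 0; i < nx; i++)
    for(int j = 0; j < ny; j++)
        for(int k = 0; k < nz; k++)
            total += std::abs(Source(k,i,j));

// Array-section reduction: this is the form the second error message seems to expect,
// and it only accepts a raw array or pointer, not a class type like Eigen::Tensor.
double rowSum[nx] = {0.0};
#pragma omp parallel for collapse(2) reduction(+: rowSum[:nx])
for(int j = 0; j < ny; j++)
    for(int k = 0; k < nz; k++)
        for(int i = 0; i < nx; i++)
            rowSum[i] += std::abs(Source(k,i,j));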
Edit: I am including a small reproducible example for people to compile and test themselves. Since this is an iterative solver, some things are initialized to just zeros to make it easier to compile:
#include <cmath>
#include <complex>
#include <iostream>
#include <omp.h>
#include <unsupported/Eigen/CXX11/Tensor>

static const int nx = 128;
static const int ny = 128;
static const int nz = 128;

using namespace Eigen;
using namespace std;

// Forward declaration of the solver defined below
int Solver3D(Eigen::Tensor<std::complex<double>, 3>&, Eigen::Tensor<std::complex<double>, 3>&,
             Eigen::Tensor<std::complex<double>, 3>&, Eigen::Tensor<std::complex<double>, 3>&,
             Eigen::Tensor<std::complex<double>, 3>&, Eigen::Tensor<std::complex<double>, 3>&,
             Eigen::Tensor<double, 3>&, Eigen::Tensor<double, 3>&, Eigen::Tensor<double, 3>&,
             Eigen::Tensor<double, 3>&, double, int);
int main(){
    double err_max = 1.e-8;
    int phi_iter_max = 500;
    int Soliter = 0;
    Eigen::initParallel();
    Eigen::setNbThreads(omp_get_max_threads());
    Eigen::Tensor<std::complex<double>, 3> invnk(nx,ny,nz);
    invnk.setZero();
    Eigen::Tensor<std::complex<double>, 3> dndxk(nx,ny,nz);
    dndxk.setZero();
    Eigen::Tensor<std::complex<double>, 3> dndyk(nx,ny,nz);
    dndyk.setZero();
    Eigen::Tensor<std::complex<double>, 3> dndzk(nx,ny,nz);
    dndzk.setZero();
    Eigen::Tensor<std::complex<double>, 3> Sol(nx,ny,nz);
    Sol.setConstant(1.0);
    Eigen::Tensor<std::complex<double>, 3> Source(nx,ny,nz);
    Source.setZero();
    Eigen::Tensor<double, 3> KX(nx,ny,nz);
    KX.setZero();
    Eigen::Tensor<double, 3> KY(nx,ny,nz);
    KY.setZero();
    Eigen::Tensor<double, 3> KZ(nx,ny,nz);
    KZ.setZero();
    Eigen::Tensor<double, 3> factor(nx,ny,nz);
    factor.setZero();
    double start_time1, run_time1;
    start_time1 = omp_get_wtime();
    //clock_t start_time1 = clock();
    Soliter = Solver3D(invnk, dndxk, dndyk, dndzk, Sol, Source, KX, KY, KZ, factor, err_max, phi_iter_max);
    //clock_t end1 = clock();
    run_time1 = omp_get_wtime() - start_time1;
    cout << "Parallel Time in SEC: " << run_time1 << "s\n";
    //cout << "Serial Time in SEC: " << (double)(end1-start_time1) / CLOCKS_PER_SEC << "s\n";
}
The iterative solver function looks like:
int Solver3D(Eigen::Tensor<std::complex<double>, 3>& invnk, Eigen::Tensor<std::complex<double>, 3>& dndxk,
             Eigen::Tensor<std::complex<double>, 3>& dndyk, Eigen::Tensor<std::complex<double>, 3>& dndzk,
             Eigen::Tensor<std::complex<double>, 3>& Sol, Eigen::Tensor<std::complex<double>, 3>& Source,
             Eigen::Tensor<double, 3>& kx, Eigen::Tensor<double, 3>& ky, Eigen::Tensor<double, 3>& kz,
             Eigen::Tensor<double, 3>& factor, double err_max, int max_iter){
    Eigen::Tensor<std::complex<double>, 3> xTerm(nx,ny,nz);
    xTerm.setZero();
    Eigen::Tensor<std::complex<double>, 3> yTerm(nx,ny,nz);
    yTerm.setZero();
    Eigen::Tensor<std::complex<double>, 3> zTerm(nx,ny,nz);
    zTerm.setZero();
    Eigen::Tensor<std::complex<double>, 3> RHS(nx,ny,nz);
    RHS.setZero();
    double UpdatedSol_max, OldSol_max;
    double it_error;
    // Begin counter for the number of iterations it takes to calculate phi
    int count = 0;
    // Begin while loop
    do{
        #pragma omp parallel for collapse(3)
        for(int i = 0; i < nx; i++){
            for(int j = 0; j < ny; j++){
                for(int k = 0; k < nz; k++){
                    RHS(k,i,j) = Source(k,i,j) - xTerm(k,i,j) - yTerm(k,i,j) - zTerm(k,i,j);
                }
            }
        }
        // Calculate maximum of absolute value of previous solution
        Eigen::Tensor<double, 0> AbsMaxAsTensor = Sol.abs().maximum();
        OldSol_max = AbsMaxAsTensor(0);
        Sol = RHS * factor;
        // Calculate maximum of absolute value of updated solution
        Eigen::Tensor<double, 0> AbsMaxAsTensor1 = Sol.abs().maximum();
        UpdatedSol_max = AbsMaxAsTensor1(0);
        // Increase iteration count by 1
        count = count + 1;
        // Calculate error
        it_error = fabs((UpdatedSol_max - OldSol_max)/UpdatedSol_max);
    }while( it_error > err_max && count <= max_iter && UpdatedSol_max > err_max);
    return count;
}
I run the following command to build and time the "parallel" version:
g++ ParallelTest.cpp -lfftw3 -lfftw3_threads -fopenmp -lm -O3 -Ofast -o /test.out
And the following for the "serial" version, timed with clock_t:
g++ ParallelTest.cpp -lfftw3 -lfftw3_threads -lm -O3 -Ofast -o /test.out
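Note that omp_get_wtime() is only available when compiling with -fopenmp, and clock() measures CPU time rather than wall time, so for the serial build I could instead time the call with std::chrono (a minimal sketch of what I have in mind; Solver3D and its arguments are the same as above):

#include <chrono>

auto t0 = std::chrono::steady_clock::now();
Soliter = Solver3D(invnk, dndxk, dndyk, dndzk, Sol, Source, KX, KY, KZ, factor, err_max, phi_iter_max);
auto t1 = std::chrono::steady_clock::now();
// duration<double> gives seconds as a double, comparable to differences of omp_get_wtime()
cout << "Serial Time in SEC: " << std::chrono::duration<double>(t1 - t0).count() << "s\n";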
Edit 1: Since it seems difficult to reproduce my example, I am including some of the runtimes for different numbers of threads, measured in the main code:
for (int nThreads = 1; nThreads <= 16; nThreads++){
    double start_time1, run_time1;
    start_time1 = omp_get_wtime();
    omp_set_num_threads(nThreads);
    Soliter = Solver3D(invnk, dndxk, dndyk, dndzk, Sol, Source, KX, KY, KZ, factor, err_max, phi_iter_max);
    run_time1 = omp_get_wtime() - start_time1;
    cout << "Threads: " << nThreads << " Parallel Time in SEC: " << run_time1 << "s\n";
}
This returns:
Threads: 1 Parallel Time in SEC: 0.444323s
Threads: 2 Parallel Time in SEC: 0.292054s
Threads: 3 Parallel Time in SEC: 0.261521s
Threads: 4 Parallel Time in SEC: 0.260612s
Threads: 5 Parallel Time in SEC: 0.25353s
Threads: 6 Parallel Time in SEC: 0.258204s
Threads: 7 Parallel Time in SEC: 0.275174s
Threads: 8 Parallel Time in SEC: 0.280042s
Threads: 9 Parallel Time in SEC: 0.274214s
Threads: 10 Parallel Time in SEC: 0.271887s
Threads: 11 Parallel Time in SEC: 0.274815s
Threads: 12 Parallel Time in SEC: 0.273562s
Threads: 13 Parallel Time in SEC: 0.283543s
Threads: 14 Parallel Time in SEC: 0.302721s
Threads: 15 Parallel Time in SEC: 0.317697s
Threads: 16 Parallel Time in SEC: 0.290503s
I have 6 physical cores (12 logical processors), so using up to 16 threads is really just a test.
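For completeness, this is how I confirm how many threads OpenMP is actually using inside a parallel region (a small check only, not part of the solver):

#pragma omp parallel
{
    // Only one thread prints, after the team has been created
    #pragma omp single
    cout << "Team size: " << omp_get_num_threads() << " threads\n";
}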