Verified Commit 7c39b12b authored by Sébastien Villemot

Always compile MEX files with OpenMP (when they can take advantage of it)

As a consequence, remove the --enable-openmp option of the configure script.
parent 82cef48e
Pipeline #1453 passed with stages in 103 minutes and 9 seconds
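The whole change boils down to one pattern, sketched below (illustrative code, not part of the diff): every affected MEX target now gets `-fopenmp` unconditionally, the `USE_OMP` guards disappear, and the pragmas select their thread count at run time via `num_threads()`.

```cpp
// Minimal sketch of the pattern this commit adopts (not repository code).
// The target is always built with -fopenmp, so the pragma is always
// honoured; the degree of parallelism is a run-time choice.
#include <vector>

void scale(std::vector<double> &v, double c, int number_of_threads)
{
#pragma omp parallel for num_threads(number_of_threads)
  for (int i = 0; i < static_cast<int>(v.size()); i++)
    v[i] *= c;
}
```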
......@@ -127,11 +127,8 @@ To remove debugging information for MATLAB MEX functions, the analogous call would be:
```
./configure MATLAB_MEX_CFLAGS="-O3" MATLAB_MEX_CXXFLAGS="-O3"
```
If you want to try the parallelized versions of some MEX files (`A_times_B_kronecker_C` and `sparse_hessian_times_B_kronecker_C`, used to compute the reduced form of the second-order approximation of the model), you can add the `--enable-openmp` flag, for instance:
```
./configure --with-matlab=/usr/local/MATLAB/R2019a MATLAB_VERSION=9.6 --enable-openmp
```
If the configuration goes well, the script will tell you which components are correctly configured and will be built.
If the configuration goes well, the script will tell you which components are
correctly configured and will be built.
Note that it is possible that some MEX files cannot be compiled, due to missing
build dependencies. If you find no way of installing the missing dependencies,
......
function set_dynare_threads(mexname,n)
% This function sets the number of threads used by some MEX files when compiled
% with OpenMP support (i.e. when --enable-openmp is given to configure) or any
% other parallel library.
% This function sets the number of threads used by some MEX files using
% OpenMP features or any other parallel library.
%
% INPUTS
% o mexname [string] Name of the mex file.
......@@ -10,7 +9,7 @@ function set_dynare_threads(mexname,n)
% OUTPUTS
% none.
% Copyright (C) 2009-2012 Dynare Team
% Copyright (C) 2009-2019 Dynare Team
%
% This file is part of Dynare.
%
......
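For context, here is a hypothetical sketch of the consuming side: a MEX gateway that receives the thread count chosen via `set_dynare_threads` as a trailing argument and feeds it to `num_threads()`. The argument convention is assumed for illustration, not taken from the Dynare sources.

```cpp
// Hypothetical MEX gateway (illustrative only, not actual Dynare code).
#include <dynmex.h>

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
  (void) nlhs; (void) plhs; // unused in this sketch

  // Assume the last right-hand-side argument carries the thread count.
  int number_of_threads = 1;
  if (nrhs > 0 && mxIsNumeric(prhs[nrhs-1]))
    number_of_threads = static_cast<int>(mxGetScalar(prhs[nrhs-1]));

#pragma omp parallel for num_threads(number_of_threads)
  for (int i = 0; i < 100; i++)
    { /* parallel work on the inputs would go here */ }
}
```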
......@@ -3,6 +3,7 @@ mex_PROGRAMS = block_kalman_filter
TOPDIR = $(top_srcdir)/../../sources/block_kalman_filter
block_kalman_filter_CPPFLAGS = $(AM_CPPFLAGS) -I$(TOPDIR)
block_kalman_filter_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp
nodist_block_kalman_filter_SOURCES = block_kalman_filter.cc
......
......@@ -3,6 +3,8 @@ mex_PROGRAMS = sparse_hessian_times_B_kronecker_C A_times_B_kronecker_C
nodist_sparse_hessian_times_B_kronecker_C_SOURCES = sparse_hessian_times_B_kronecker_C.cc
nodist_A_times_B_kronecker_C_SOURCES = A_times_B_kronecker_C.cc
sparse_hessian_times_B_kronecker_C_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp
BUILT_SOURCES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES) $(nodist_A_times_B_kronecker_C_SOURCES)
CLEANFILES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES) $(nodist_A_times_B_kronecker_C_SOURCES)
......
......@@ -2,6 +2,8 @@ mex_PROGRAMS = local_state_space_iteration_2
nodist_local_state_space_iteration_2_SOURCES = local_state_space_iteration_2.cc
local_state_space_iteration_2_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp
BUILT_SOURCES = $(nodist_local_state_space_iteration_2_SOURCES)
CLEANFILES = $(nodist_local_state_space_iteration_2_SOURCES)
......
......@@ -129,14 +129,6 @@ else
BUILD_MS_SBVAR_MEX_MATLAB="no (missing GSL)"
fi
AC_ARG_ENABLE([openmp], AS_HELP_STRING([--enable-openmp], [use OpenMP for parallelization of some MEX files]), [
if test "$enable_openmp" = yes; then
CPPFLAGS="$CPPFLAGS -DUSE_OMP"
CFLAGS="$CFLAGS -fopenmp"
CXXFLAGS="$CXXFLAGS -fopenmp"
fi
])
AC_ARG_WITH([m2html], AS_HELP_STRING([--with-m2html=DIR], [specify installation directory of M2HTML]), [
M2HTML=$withval
BUILD_M2HTML=yes
......
......@@ -103,14 +103,6 @@ else
BUILD_MS_SBVAR_MEX_OCTAVE="no (missing GSL or MatIO library)"
fi
AC_ARG_ENABLE([openmp], AS_HELP_STRING([--enable-openmp], [use OpenMP for parallelization of some MEX files]), [
if test "x$enable_openmp" = "xyes"; then
CPPFLAGS="$CPPFLAGS -DUSE_OMP"
CFLAGS="$CFLAGS -fopenmp"
CXXFLAGS="$CXXFLAGS -fopenmp"
fi
])
AC_MSG_NOTICE([
Dynare is now configured for building the following components...
......
......@@ -3,6 +3,7 @@ mex_PROGRAMS = qmc_sequence
TOPDIR = $(top_srcdir)/../../sources/sobol
qmc_sequence_CPPFLAGS = $(AM_CPPFLAGS) -I$(TOPDIR)
qmc_sequence_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp
nodist_qmc_sequence_SOURCES = qmc_sequence.cc
......
......@@ -19,9 +19,8 @@
#include <cmath>
#include <algorithm>
#ifdef USE_OMP
# include <omp.h>
#endif
#include <omp.h>
#include "block_kalman_filter.hh"
#define BLAS
......@@ -372,7 +371,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
//v = Y(:,t) - a(mf)
int i_i = 0;
//#pragma omp parallel for shared(v, i_i, d_index) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
//#pragma omp parallel for shared(v, i_i, d_index)
for (auto i = d_index.begin(); i != d_index.end(); i++)
{
//mexPrintf("i_i=%d, omp_get_max_threads()=%d\n",i_i,omp_get_max_threads());
......@@ -383,7 +382,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
//F = P(mf,mf) + H;
i_i = 0;
if (H_size == 1)
//#pragma omp parallel for shared(iF, F, i_i) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
//#pragma omp parallel for shared(iF, F, i_i)
for (auto i = d_index.begin(); i != d_index.end(); i++, i_i++)
{
int j_j = 0;
......@@ -391,7 +390,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
iF[i_i + j_j * size_d_index] = F[i_i + j_j * size_d_index] = P[mf[*i] + mf[*j] * n] + H[0];
}
else
//#pragma omp parallel for shared(iF, F, P, H, mf, i_i) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
//#pragma omp parallel for shared(iF, F, P, H, mf, i_i)
for (auto i = d_index.begin(); i != d_index.end(); i++, i_i++)
{
int j_j = 0;
......@@ -502,9 +501,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
mexPrintf("dgetri failure with error %d\n", static_cast<int>(info));
//lik(t) = log(dF)+transpose(v)*iF*v;
#ifdef USE_OMP
# pragma omp parallel for shared(v_pp) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
#endif
#pragma omp parallel for shared(v_pp)
for (int i = 0; i < size_d_index; i++)
{
double res = 0.0;
......@@ -522,9 +519,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
if (missing_observations)
//K = P(:,mf)*iF;
#ifdef USE_OMP
# pragma omp parallel for shared(P_mf) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
#endif
#pragma omp parallel for shared(P_mf)
for (int i = 0; i < n; i++)
{
int j_j = 0;
......@@ -538,9 +533,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
for (int j = 0; j < pp; j++)
P_mf[i + j * n] = P[i + mf[j] * n];
#ifdef USE_OMP
# pragma omp parallel for shared(K) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
#endif
#pragma omp parallel for shared(K)
for (int i = 0; i < n; i++)
for (int j = 0; j < size_d_index; j++)
{
......@@ -552,9 +545,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
}
//a = T*(a+K*v);
#ifdef USE_OMP
# pragma omp parallel for shared(v_n) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
#endif
#pragma omp parallel for shared(v_n)
for (int i = pure_obs; i < n; i++)
{
double res = 0.0;
......@@ -563,9 +554,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
v_n[i] = res + a[i];
}
#ifdef USE_OMP
# pragma omp parallel for shared(a) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
#endif
#pragma omp parallel for shared(a)
for (int i = 0; i < n; i++)
{
double res = 0.0;
......@@ -578,24 +567,20 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
{
//P = T*(P-K*P(mf,:))*transpose(T)+QQ;
int i_i = 0;
//#pragma omp parallel for shared(P_mf) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
//#pragma omp parallel for shared(P_mf)
for (auto i = d_index.begin(); i != d_index.end(); i++, i_i++)
for (int j = pure_obs; j < n; j++)
P_mf[i_i + j * size_d_index] = P[mf[*i] + j * n];
}
else
//P = T*(P-K*P(mf,:))*transpose(T)+QQ;
#ifdef USE_OMP
# pragma omp parallel for shared(P_mf) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
#endif
#pragma omp parallel for shared(P_mf)
for (int i = 0; i < pp; i++)
for (int j = pure_obs; j < n; j++)
P_mf[i + j * pp] = P[mf[i] + j * n];
#ifdef BLAS
# ifdef USE_OMP
# pragma omp parallel for shared(K_P) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
# endif
# pragma omp parallel for shared(K_P)
for (int i = 0; i < n; i++)
for (int j = i; j < n; j++)
{
......@@ -733,9 +718,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
}
# else
# ifdef USE_OMP
# pragma omp parallel for shared(K_P) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
# endif
# pragma omp parallel for shared(K_P)
for (int i = pure_obs; i < n; i++)
{
unsigned int i1 = i - pure_obs;
......@@ -750,9 +733,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
}
}
# ifdef USE_OMP
# pragma omp parallel for shared(P_t_t1) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
# endif
# pragma omp parallel for shared(P_t_t1)
for (int i = pure_obs; i < n; i++)
{
unsigned int i1 = i - pure_obs;
......@@ -766,9 +747,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
fill_n(tmp, n * n_state, 0.0); // zero-initialise before accumulation
# ifdef USE_OMP
# pragma omp parallel for shared(tmp) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
# endif
# pragma omp parallel for shared(tmp)
for (int i = 0; i < n; i++)
{
int max_k = i_nz_state_var[i];
......@@ -785,9 +764,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
fill_n(P, n * n, 0.0); // zero-initialise before accumulation
int n_n_obs = -n * pure_obs;
# ifdef USE_OMP
# pragma omp parallel for shared(P) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
# endif
# pragma omp parallel for shared(P)
for (int i = 0; i < n; i++)
{
for (int j = i; j < n; j++)
......@@ -802,9 +779,7 @@ BlockKalmanFilter::block_kalman_filter(int nlhs, mxArray *plhs[])
}
}
# ifdef USE_OMP
# pragma omp parallel for shared(P) num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
# endif
# pragma omp parallel for shared(P)
for (int i = 0; i < n; i++)
{
for (int j = i; j < n; j++)
......
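A note on the `shared(...)` clauses that survive in the rewritten pragmas above: variables visible when a parallel region starts are shared by default in OpenMP C++, so these clauses mostly document intent, while the iteration variable of a `parallel for` is always private. A minimal sketch (not repository code):

```cpp
// In OpenMP, 'y' below would be shared even without the explicit clause;
// the loop index 'i' is automatically private to each thread.
void axpy(int n, double a, const double *x, double *y)
{
#pragma omp parallel for shared(y)
  for (int i = 0; i < n; i++)
    y[i] += a * x[i];
}
```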
......@@ -1923,7 +1923,7 @@ dynSparseMatrix::Init_GE(int periods, int y_kmin, int y_kmax, int Size, map<pair
NbNZCol[i] = 0;
}
int nnz = 0;
//pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) ordered private(it4, ti_y_kmin, ti_y_kmax, eq, var, lag) schedule(dynamic)
//pragma omp parallel for ordered private(it4, ti_y_kmin, ti_y_kmax, eq, var, lag) schedule(dynamic)
for (t = 0; t < periods; t++)
{
ti_y_kmin = -min(t, y_kmin);
......@@ -2710,7 +2710,7 @@ dynSparseMatrix::Sparse_mult_SAT_B(mxArray *A_m, mxArray *B_m)
//unsigned int nze_A = 0;
unsigned int C_col = 0;
C_j[C_col] = 0;
//#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
//#pragma omp parallel for
for (unsigned int j = 0; j < n_B; j++)
{
for (unsigned int i = 0; i < n_A; i++)
......@@ -5371,7 +5371,7 @@ dynSparseMatrix::Solve_ByteCode_Sparse_GaussianElimination(int Size, int blck, i
bc[nb_eq_todo++] = first;
first = first->NZE_C_N;
}
//pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS")))
//pragma omp parallel for
for (int j = 0; j < nb_eq_todo; j++)
{
first = bc[j];
......@@ -5704,7 +5704,7 @@ dynSparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bo
bc[nb_eq_todo++] = first;
first = first->NZE_C_N;
}
//#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
//#pragma omp parallel for shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
for (int j = 0; j < nb_eq_todo; j++)
{
t_save_op_s *save_op_s_l;
......@@ -5873,7 +5873,7 @@ dynSparseMatrix::Solve_ByteCode_Symbolic_Sparse_GaussianElimination(int Size, bo
bc[nb_eq_todo++] = first;
first = first->NZE_C_N;
}
//#pragma omp parallel for num_threads(atoi(getenv("DYNARE_NUM_THREADS"))) shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
//#pragma omp parallel for shared(nb_var_piva, first_piva, nopa, save_op) reduction(+:nop)
for (int j = 0; j < nb_eq_todo; j++)
{
NonZeroElem *first = bc[j];
......
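The commented-out pragmas above carry a `reduction(+:nop)` clause; if they were ever re-enabled, that clause is what would keep the accumulation race-free. A minimal sketch of the idiom (illustrative, with a hypothetical counter):

```cpp
// Each thread sums into a private copy of 'nop'; the copies are combined
// with '+' when the parallel loop ends, avoiding a data race.
long count_ops(int n)
{
  long nop = 0;
#pragma omp parallel for reduction(+:nop)
  for (int i = 0; i < n; i++)
    nop += 1;
  return nop;
}
```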
......@@ -27,9 +27,7 @@
#include <dynmex.h>
#ifdef USE_OMP
# include <omp.h>
#endif
#include <omp.h>
#define DEBUG_OMP 0
......@@ -42,9 +40,7 @@ sparse_hessian_times_B_kronecker_B(const mwIndex *isparseA, const mwIndex *jspar
** This loop is split into two nested loops because we use the
** symmetric pattern of the Hessian matrix.
*/
#if USE_OMP
# pragma omp parallel for num_threads(number_of_threads)
#endif
#pragma omp parallel for num_threads(number_of_threads)
for (mwIndex j1B = 0; j1B < static_cast<mwIndex>(nB); j1B++)
{
#if DEBUG_OMP
......@@ -95,9 +91,7 @@ sparse_hessian_times_B_kronecker_C(const mwIndex *isparseA, const mwIndex *jspar
/*
** Loop over the columns of B⊗B (or of the result matrix D).
*/
#if USE_OMP
# pragma omp parallel for num_threads(number_of_threads)
#endif
#pragma omp parallel for num_threads(number_of_threads)
for (mwIndex jj = 0; jj < static_cast<mwIndex>(nB*nC); jj++) // column of B⊗C index.
{
// Uncomment the following line to check if all processors are used.
......
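The comment in this file explains that the loop is split to exploit the symmetric pattern of the Hessian. A minimal standalone sketch of that idea (assumed shapes, not the actual kernel): iterate only over `j >= i` and add the mirrored contribution, roughly halving the work.

```cpp
#include <vector>

// Symmetric matrix-vector product using only the upper triangle of A
// (row-major n-by-n storage). Assumes y is zero-initialised on entry.
void symv(const std::vector<double> &A, const std::vector<double> &x,
          std::vector<double> &y, int n)
{
  for (int i = 0; i < n; i++)
    for (int j = i; j < n; j++) // upper triangle only
      {
        y[i] += A[i*n + j] * x[j];
        if (j != i)
          y[j] += A[i*n + j] * x[i]; // mirrored contribution
      }
}
```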
......@@ -28,9 +28,7 @@
#include <dynmex.h>
#include <dynblas.h>
#ifdef USE_OMP
# include <omp.h>
#endif
#include <omp.h>
#define FIRST_ORDER_LOOP 1 // Comment out this line to use mkl-blas instead of loops when computing ghx*yhat and ghu*epsilon
......@@ -67,9 +65,7 @@ ss2Iteration_pruning(double *y2, double *y1, const double *yhat2, const double *
std::vector<int> jj1, jj2, jj3;// vector indices for ghuu
set_vector_of_indices(n, m, ii1, ii2, ii3);
set_vector_of_indices(q, m, jj1, jj2, jj3);
#ifdef USE_OMP
# pragma omp parallel for num_threads(number_of_threads)
#endif
#pragma omp parallel for num_threads(number_of_threads)
for (int particle = 0; particle < s; particle++)
{
int particle_ = particle*m;
......@@ -155,9 +151,7 @@ ss2Iteration(double *y, const double *yhat, const double *epsilon,
std::vector<int> jj1, jj2, jj3;// vector indices for ghuu
set_vector_of_indices(n, m, ii1, ii2, ii3);
set_vector_of_indices(q, m, jj1, jj2, jj3);
#ifdef USE_OMP
# pragma omp parallel for num_threads(number_of_threads)
#endif
#pragma omp parallel for num_threads(number_of_threads)
for (int particle = 0; particle < s; particle++)
{
int particle_ = particle*m;
......
......@@ -2,7 +2,7 @@
**
** Pseudo code of the algorithm is given at http://home.online.no/~pjacklam/notes/invnorm
**
** Copyright © 2010-2017 Dynare Team
** Copyright © 2010-2019 Dynare Team
**
** This file is part of Dynare.
**
......@@ -26,6 +26,8 @@
#include <limits>
#include <algorithm>
#include <omp.h>
#include <dynblas.h>
using namespace std;
......@@ -33,11 +35,6 @@ using namespace std;
constexpr double lb = .02425;
constexpr double ub = .97575;
#ifdef USE_OMP
# include <omp.h>
#endif
#define DEBUG_OMP 0
template<typename T>
T
icdf(const T uniform)
......@@ -125,9 +122,7 @@ template<typename T>
void
icdfm(int n, T *U)
{
#if USE_OMP
# pragma omp parallel for num_threads(omp_get_num_threads())
#endif
#pragma omp parallel for
for (int i = 0; i < n; i++)
U[i] = icdf(U[i]);
return;
......@@ -152,9 +147,7 @@ void
usphere(int d, int n, T *U)
{
icdfm(n*d, U);
#if USE_OMP
# pragma omp parallel for num_threads(omp_get_num_threads())
#endif
#pragma omp parallel for
for (int j = 0; j < n; j++)// sequence index.
{
int k = j*d;
......@@ -173,9 +166,7 @@ void
usphereRadius(int d, int n, double radius, T *U)
{
icdfm(n*d, U);
#if USE_OMP
# pragma omp parallel for num_threads(omp_get_num_threads())
#endif
#pragma omp parallel for
for (int j = 0; j < n; j++)// sequence index.
{
int k = j*d;
......
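One subtlety behind the deletions in this file: `omp_get_num_threads()` returns 1 when called outside a parallel region, so the removed `num_threads(omp_get_num_threads())` clause effectively pinned these loops to a single thread. Dropping the clause lets the runtime default (for example `OMP_NUM_THREADS`) set the team size. A small demonstration (illustrative only):

```cpp
#include <omp.h>
#include <cstdio>

int main()
{
  // Outside any parallel region this always prints 1.
  std::printf("outside: %d\n", omp_get_num_threads());

#pragma omp parallel
  {
#pragma omp single
    // Inside, it reports the actual team size chosen by the runtime.
    std::printf("inside: %d\n", omp_get_num_threads());
  }
  return 0;
}
```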