diff --git a/mex/build/kronecker.am b/mex/build/kronecker.am index ac07bb602029bd2331d225c5c58a68e7f71f6b7d..236d18625b78cb2f7111e088d213a0d8433a5f55 100644 --- a/mex/build/kronecker.am +++ b/mex/build/kronecker.am @@ -1,17 +1,19 @@ mex_PROGRAMS = sparse_hessian_times_B_kronecker_C A_times_B_kronecker_C -nodist_sparse_hessian_times_B_kronecker_C_SOURCES = sparse_hessian_times_B_kronecker_C.cc +nodist_sparse_hessian_times_B_kronecker_C_SOURCES = sparse_hessian_times_B_kronecker_C.f08 matlab_mex.F08 nodist_A_times_B_kronecker_C_SOURCES = A_times_B_kronecker_C.f08 matlab_mex.F08 blas_lapack.F08 -sparse_hessian_times_B_kronecker_C_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp -sparse_hessian_times_B_kronecker_C_LDFLAGS = $(AM_LDFLAGS) $(OPENMP_LDFLAGS) +# Technically, only the sparse version needs OpenMP compilation flags. +# However, using per-object compilation flags triggers object renaming in Automake, +# with many problems (notably with Fortran modules, whose .o will be renamed, but +# not their .mod). +AM_FCFLAGS += -fopenmp +AM_LDFLAGS += $(OPENMP_LDFLAGS) BUILT_SOURCES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES) $(nodist_A_times_B_kronecker_C_SOURCES) CLEANFILES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES) $(nodist_A_times_B_kronecker_C_SOURCES) -%.cc: $(top_srcdir)/../../sources/kronecker/%.cc - $(LN_S) -f $< $@ - +sparse_hessian_times_B_kronecker_C.o : matlab_mex.mod A_times_B_kronecker_C.o : matlab_mex.mod lapack.mod %.f08: $(top_srcdir)/../../sources/kronecker/%.f08 diff --git a/mex/sources/kronecker/sparse_hessian_times_B_kronecker_C.cc b/mex/sources/kronecker/sparse_hessian_times_B_kronecker_C.cc deleted file mode 100644 index 8dd74b4791257462efc1fb5f766ef1799a6c030f..0000000000000000000000000000000000000000 --- a/mex/sources/kronecker/sparse_hessian_times_B_kronecker_C.cc +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright © 2007-2021 Dynare Team - * - * This file is part of Dynare. - * - * Dynare is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Dynare is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Dynare. If not, see <https://www.gnu.org/licenses/>. - */ - -/* - * This mex file computes A·(B⊗C) or A·(B⊗B) without explicitly building B⊗C or B⊗B, so that - * one can consider large matrices A, B and/or C, and assuming that A is a the hessian of a DSGE model - * (dynare format). This mex file should not be used outside dyn_second_order_solver.m. - */ - -#include <algorithm> - -#include <dynmex.h> - -#include <omp.h> - -#define DEBUG_OMP 0 - -void -sparse_hessian_times_B_kronecker_B(const mwIndex *isparseA, const mwIndex *jsparseA, const double *vsparseA, - const double *B, double *D, size_t mA, size_t nA, size_t mB, size_t nB, int number_of_threads) -{ - /* - ** Loop over the columns of B⊗B (or of the result matrix D). - ** This loop is splitted into two nested loops because we use the - ** symmetric pattern of the hessian matrix. - */ -#pragma omp parallel for num_threads(number_of_threads) - for (mwIndex j1B = 0; j1B < static_cast<mwIndex>(nB); j1B++) - { -#if DEBUG_OMP - mexPrintf("%d thread number is %d (%d).\n", j1B, omp_get_thread_num(), omp_get_num_threads()); -#endif - for (mwIndex j2B = j1B; j2B < static_cast<mwIndex>(nB); j2B++) - { - mwIndex jj = j1B*nB+j2B; // column of B⊗B index. - mwIndex iv = 0; - int nz_in_column_ii_of_A = 0; - mwIndex k1 = 0; - mwIndex k2 = 0; - /* - ** Loop over the rows of B⊗B (column jj). - */ - for (mwIndex ii = 0; ii < static_cast<mwIndex>(nA); ii++) - { - k1 = jsparseA[ii]; - k2 = jsparseA[ii+1]; - if (k1 < k2) // otherwise column ii of A does not have non zero elements (and there is nothing to compute). - { - ++nz_in_column_ii_of_A; - mwIndex i1B = ii / mB; - mwIndex i2B = ii % mB; - double bb = B[j1B*mB+i1B]*B[j2B*mB+i2B]; - /* - ** Loop over the non zero entries of A(:,ii). - */ - for (mwIndex k = k1; k < k2; k++) - { - mwIndex kk = isparseA[k]; - D[jj*mA+kk] = D[jj*mA+kk] + bb*vsparseA[iv]; - iv++; - } - } - } - if (nz_in_column_ii_of_A > 0) - std::copy_n(&D[jj*mA], mA, &D[(j2B*nB+j1B)*mA]); - } - } -} - -void -sparse_hessian_times_B_kronecker_C(const mwIndex *isparseA, const mwIndex *jsparseA, const double *vsparseA, - const double *B, const double *C, double *D, - size_t mA, size_t nA, size_t mB, size_t nB, size_t mC, size_t nC, int number_of_threads) -{ - /* - ** Loop over the columns of B⊗B (or of the result matrix D). - */ -#pragma omp parallel for num_threads(number_of_threads) - for (mwIndex jj = 0; jj < static_cast<mwIndex>(nB*nC); jj++) // column of B⊗C index. - { - // Uncomment the following line to check if all processors are used. -#if DEBUG_OMP - mexPrintf("%d thread number is %d (%d).\n", jj, omp_get_thread_num(), omp_get_num_threads()); -#endif - mwIndex jB = jj/nC; - mwIndex jC = jj%nC; - mwIndex k1 = 0; - mwIndex k2 = 0; - mwIndex iv = 0; - int nz_in_column_ii_of_A = 0; - /* - ** Loop over the rows of B⊗C (column jj). - */ - for (mwIndex ii = 0; ii < static_cast<mwIndex>(nA); ii++) - { - k1 = jsparseA[ii]; - k2 = jsparseA[ii+1]; - if (k1 < k2) // otherwise column ii of A does not have non zero elements (and there is nothing to compute). - { - ++nz_in_column_ii_of_A; - mwIndex iC = ii % mC; - mwIndex iB = ii / mC; - double cb = C[jC*mC+iC]*B[jB*mB+iB]; - /* - ** Loop over the non zero entries of A(:,ii). - */ - for (mwIndex k = k1; k < k2; k++) - { - mwIndex kk = isparseA[k]; - D[jj*mA+kk] = D[jj*mA+kk] + cb*vsparseA[iv]; - iv++; - } - } - } - } -} - -void -mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) -{ - // Check input and output: - if (nrhs > 4 || nrhs < 3 || nlhs != 1) - { - mexErrMsgTxt("sparse_hessian_times_B_kronecker_C takes 3 or 4 input arguments and provides 1 output argument."); - return; // Needed to shut up some GCC warnings - } - - if (!mxIsSparse(prhs[0])) - mexErrMsgTxt("sparse_hessian_times_B_kronecker_C: First input must be a sparse (dynare) hessian matrix."); - - // Get & Check dimensions (columns and rows): - size_t mA = mxGetM(prhs[0]); - size_t nA = mxGetN(prhs[0]); - size_t mB = mxGetM(prhs[1]); - size_t nB = mxGetN(prhs[1]); - size_t mC, nC; - if (nrhs == 4) // A·(B⊗C) is to be computed. - { - mC = mxGetM(prhs[2]); - nC = mxGetN(prhs[2]); - if (mB*mC != nA) - mexErrMsgTxt("Input dimension error!"); - } - else // A·(B⊗B) is to be computed. - { - if (mB*mB != nA) - mexErrMsgTxt("Input dimension error!"); - } - // Get input matrices: - int numthreads; - const double *B = mxGetPr(prhs[1]); - const double *C; - numthreads = static_cast<int>(mxGetScalar(prhs[2])); - if (nrhs == 4) - { - C = mxGetPr(prhs[2]); - numthreads = static_cast<int>(mxGetScalar(prhs[3])); - } - // Sparse (dynare) hessian matrix. - const mwIndex *isparseA = mxGetIr(prhs[0]); - const mwIndex *jsparseA = mxGetJc(prhs[0]); - const double *vsparseA = mxGetPr(prhs[0]); - // Initialization of the ouput: - if (nrhs == 4) - plhs[0] = mxCreateDoubleMatrix(mA, nB*nC, mxREAL); - else - plhs[0] = mxCreateDoubleMatrix(mA, nB*nB, mxREAL); - double *D = mxGetPr(plhs[0]); - - // Computational part: - if (nrhs == 3) - sparse_hessian_times_B_kronecker_B(isparseA, jsparseA, vsparseA, B, D, mA, nA, mB, nB, numthreads); - else - sparse_hessian_times_B_kronecker_C(isparseA, jsparseA, vsparseA, B, C, D, mA, nA, mB, nB, mC, nC, numthreads); -} diff --git a/mex/sources/kronecker/sparse_hessian_times_B_kronecker_C.f08 b/mex/sources/kronecker/sparse_hessian_times_B_kronecker_C.f08 new file mode 100644 index 0000000000000000000000000000000000000000..acd7532bee774b4ed445132f24d7eac0a1309a7e --- /dev/null +++ b/mex/sources/kronecker/sparse_hessian_times_B_kronecker_C.f08 @@ -0,0 +1,164 @@ +! This mex file computes A·(B⊗C) or A·(B⊗B) without explicitly building B⊗C or +! B⊗B, so that one can consider large matrices A, B and/or C, and assuming that +! A is a the hessian of a DSGE model (Dynare format). This MEX file should not +! be used outside of dyn_second_order_solver.m. + +! Copyright © 2007-2021 Dynare Team +! +! This file is part of Dynare. +! +! Dynare is free software: you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. +! +! Dynare is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License +! along with Dynare. If not, see <https://www.gnu.org/licenses/>. + +subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction') + use iso_fortran_env + use iso_c_binding + use matlab_mex + implicit none + + type(c_ptr), dimension(*), intent(in), target :: prhs + type(c_ptr), dimension(*), intent(out) :: plhs + integer(c_int), intent(in), value :: nlhs, nrhs + + integer(c_size_t) :: mA, nA, mB, nB, mC, nC + real(real64), dimension(:), pointer, contiguous :: A_v + integer(mwSize), dimension(:), pointer, contiguous :: A_ir, A_jc + real(real64), dimension(:, :), pointer, contiguous :: B, C, D + integer :: numthreads + + if (nrhs > 4 .or. nrhs < 3 .or. nlhs /= 1) then + call mexErrMsgTxt("sparse_hessian_times_B_kronecker_C takes 3 or 4 input arguments and provides 1 output argument") + end if + + if (.not. mxIsDouble(prhs(1)) .or. mxIsComplex(prhs(1)) .or. .not. mxIsSparse(prhs(1))) then + call mexErrMsgTxt("sparse_hessian_times_B_kronecker_C: first input must be a sparse (Dynare) hessian matrix") + end if + mA = mxGetM(prhs(1)) + nA = mxGetN(prhs(1)) + A_ir => mxGetIr(prhs(1)) + A_jc => mxGetJc(prhs(1)) + A_v => mxGetPr(prhs(1)) + + if (.not. mxIsDouble(prhs(2)) .or. mxIsComplex(prhs(2)) .or. mxIsSparse(prhs(2))) then + call mexErrMsgTxt("sparse_hessian_times_B_kronecker_C: second argument must be a (dense) real matrix") + end if + + mB = mxGetM(prhs(2)) + nB = mxGetN(prhs(2)) + B(1:mB,1:nB) => mxGetPr(prhs(2)) + + if (nrhs == 4) then + ! A·(B⊗C) is to be computed. + if (.not. mxIsDouble(prhs(3)) .or. mxIsComplex(prhs(3)) .or. mxIsSparse(prhs(3))) then + call mexErrMsgTxt("sparse_hessian_times_B_kronecker_C: third argument must be a (dense) real matrix") + end if + mC = mxGetM(prhs(3)) + nC = mxGetN(prhs(3)) + if (mB*mC /= nA) then + call mexErrMsgTxt("Input dimension error!") + end if + C(1:mC,1:nC) => mxGetPr(prhs(3)) + + if (.not. mxIsScalar(prhs(4))) then + call mexErrMsgTxt("sparse_hessian_times_B_kronecker_C: fourth argument must be a scalar") + end if + numthreads = int(mxGetScalar(prhs(4))) + + plhs(1) = mxCreateDoubleMatrix(mA, nB*nC, mxREAL) + D(1:mA,1:nB*nC) => mxGetPr(plhs(1)) + + call sparse_hessian_times_B_kronecker_C + else + ! A·(B⊗B) is to be computed. + if (mB*mB /= nA) then + call mexErrMsgTxt("Input dimension error!") + end if + + if (.not. mxIsScalar(prhs(3))) then + call mexErrMsgTxt("sparse_hessian_times_B_kronecker_C: third argument must be a scalar") + end if + numthreads = int(mxGetScalar(prhs(3))) + + plhs(1) = mxCreateDoubleMatrix(mA, nB*nB, mxREAL) + D(1:mA,1:nB*nB) => mxGetPr(plhs(1)) + + call sparse_hessian_times_B_kronecker_B + end if +contains + subroutine sparse_hessian_times_B_kronecker_C + integer(c_size_t) :: ii,jj,jB,jC,iB,iC + integer(mwIndex) :: k1,k2,k,kk + real(real64) :: bc + + ! Loop over the columns of B⊗C (or of the result matrix D) + !$omp parallel do num_threads(numthreads) default(none) shared(nA,nB,nC,mC,A_jc,A_ir,A_v,B,C,D) & + !$omp private(ii,jj,jB,jC,iB,iC,k1,k2,k,kk,bc) + do jj = 1,nB*nC ! column of B⊗C index + jB = (jj-1)/nC+1 + jC = mod(jj-1, nC)+1 + ! Loop over the rows of B⊗C (column jj) + do ii=1,nA + k1 = A_jc(ii) + k2 = A_jc(ii+1) + if (k1 < k2) then ! Otherwise column ii of A does not have non zero elements (and there is nothing to compute) + iC = mod(ii-1, mC)+1 + iB = (ii-1)/mC+1 + bc = B(iB,jB)*C(iC,jC) + ! Loop over the non zero entries of A(:,ii) + do k=k1+1,k2 + kk = A_ir(k)+1 + D(kk,jj) = D(kk,jj) + A_v(k)*bc + end do + end if + end do + end do + end subroutine sparse_hessian_times_B_kronecker_C + + subroutine sparse_hessian_times_B_kronecker_B + integer(c_size_t) :: ii,jj,j1B,j2B,i1B,i2B,nz_in_column_ii_of_A + integer(mwIndex) :: k1,k2,k,kk + real(real64) :: bb + + ! Loop over the columns of B⊗B (or of the result matrix D) + ! This loop is splitted into two nested loops because we use the + ! symmetric pattern of the hessian matrix. + + !$omp parallel do num_threads(numthreads) default(none) shared(nA,nB,mB,A_jc,A_ir,A_v,B,D) & + !$omp private(ii,jj,j1B,j2B,i1B,i2B,nz_in_column_ii_of_A,k1,k2,k,kk,bb) + do j1B = 1,nB + do j2B = j1B,nB + jj = (j1B-1)*nB+j2B ! Column of B⊗B index + nz_in_column_ii_of_A = 0 + ! Loop over the rows of B⊗C (column jj) + do ii=1,nA + k1 = A_jc(ii) + k2 = A_jc(ii+1) + if (k1 < k2) then ! Otherwise column ii of A does not have non zero elements (and there is nothing to compute) + nz_in_column_ii_of_A = nz_in_column_ii_of_A + 1 + i1B = (ii-1)/mB+1 + i2B = mod(ii-1, mB)+1 + bb = B(i1B,j1B)*B(i2B,j2B) + ! Loop over the non zero entries of A(:,ii) + do k=k1+1,k2 + kk = A_ir(k)+1 + D(kk,jj) = D(kk,jj) + A_v(k)*bb + end do + end if + end do + if (nz_in_column_ii_of_A > 0) then + D(:,(j2B-1)*nB+j1B) = D(:,jj) + end if + end do + end do + end subroutine sparse_hessian_times_B_kronecker_B +end subroutine mexFunction diff --git a/mex/sources/matlab_mex.F08 b/mex/sources/matlab_mex.F08 index c8e90c973291609be45abf826324637bbe783260..ff84678acb0baff87771384bcfd6eb706feab661 100644 --- a/mex/sources/matlab_mex.F08 +++ b/mex/sources/matlab_mex.F08 @@ -177,15 +177,26 @@ module matlab_mat integer(mxComplexity), intent(in), value :: ComplexFlag end function mxCreateSparse - type(c_ptr) function mxGetIr(pm) bind(c, name="mxGetIr"//API_VER2) + logical(c_bool) function mxIsSparse(pm) bind(c, name="mxIsSparse"//API_VER) use iso_c_binding type(c_ptr), intent(in), value :: pm - end function mxGetIr + end function mxIsSparse - type(c_ptr) function mxGetJc(pm) bind(c, name="mxGetJc"//API_VER2) + integer(mwSize) function mxGetNzmax(pm) bind(c, name="mxGetNzmax"//API_VER2) use iso_c_binding + import :: mwSize type(c_ptr), intent(in), value :: pm - end function mxGetJc + end function mxGetNzmax + + type(c_ptr) function mxGetIr_internal(pm) bind(c, name="mxGetIr"//API_VER2) + use iso_c_binding + type(c_ptr), intent(in), value :: pm + end function mxGetIr_internal + + type(c_ptr) function mxGetJc_internal(pm) bind(c, name="mxGetJc"//API_VER2) + use iso_c_binding + type(c_ptr), intent(in), value :: pm + end function mxGetJc_internal ! Nonnumeric types type(c_ptr) function mxGetData(pm) bind(c, name="mxGetData"//API_VER) @@ -318,6 +329,18 @@ contains end function mxGetPi #endif + function mxGetIr(pm) + type(c_ptr), intent(in) :: pm + integer(mwIndex), dimension(:), pointer, contiguous :: mxGetIr + call c_f_pointer(mxGetIr_internal(pm), mxGetIr, [ mxGetNzmax(pm) ]) + end function mxGetIr + + function mxGetJc(pm) + type(c_ptr), intent(in) :: pm + integer(mwIndex), dimension(:), pointer, contiguous :: mxGetJc + call c_f_pointer(mxGetJc_internal(pm), mxGetJc, [ mxGetN(pm)+1 ]) + end function mxGetJc + function mxGetLogicals(array_ptr) type(c_ptr), intent(in) :: array_ptr logical(mxLogical), dimension(:), pointer, contiguous :: mxGetLogicals