From c6d5c48ff7e41dbfc031d404a80a2214bd02deb0 Mon Sep 17 00:00:00 2001 From: NormannR <normann@dynare.org> Date: Tue, 30 Aug 2022 14:06:19 +0200 Subject: [PATCH] Local state-space iteration at order 3: multi-thread 3rd-order version with and without pruning --- mex/build/libkordersim.am | 7 +- mex/build/local_state_space_iterations.am | 18 +- mex/sources/Makefile.am | 1 + mex/sources/libkordersim/partitions.f08 | 232 ++++++- mex/sources/libkordersim/pascal.f08 | 25 +- mex/sources/libkordersim/pparticle.f08 | 78 --- .../local_state_space_iteration_3.f08 | 580 ++++++++++++++++++ .../local_state_space_iteration_k.f08 | 64 +- tests/Makefile.am | 6 +- .../local_state_space_iteration_3_test.mod | 92 +++ 10 files changed, 980 insertions(+), 123 deletions(-) delete mode 100644 mex/sources/libkordersim/pparticle.f08 create mode 100644 mex/sources/local_state_space_iterations/local_state_space_iteration_3.f08 create mode 100644 tests/particle/local_state_space_iteration_3_test.mod diff --git a/mex/build/libkordersim.am b/mex/build/libkordersim.am index d9d916839c..1199cce2eb 100644 --- a/mex/build/libkordersim.am +++ b/mex/build/libkordersim.am @@ -8,8 +8,8 @@ nodist_libkordersim_a_SOURCES = \ partitions.f08 \ simulation.f08 \ struct.f08 \ - pthread.F08 \ - pparticle.f08 + pthread.F08 + BUILT_SOURCES = $(nodist_libkordersim_a_SOURCES) CLEANFILES = $(nodist_libkordersim_a_SOURCES) @@ -27,8 +27,5 @@ partitions.mod: partitions.o simulation.o: partitions.mod lapack.mod simulation.mod: simulation.o -pparticle.o: simulation.mod matlab_mex.mod -pparticle.mod: pparticle.o - %.f08: $(top_srcdir)/../../sources/libkordersim/%.f08 $(LN_S) -f $< $@ diff --git a/mex/build/local_state_space_iterations.am b/mex/build/local_state_space_iterations.am index 07e709284b..e9281345f3 100644 --- a/mex/build/local_state_space_iterations.am +++ b/mex/build/local_state_space_iterations.am @@ -1,21 +1,27 @@ -mex_PROGRAMS = local_state_space_iteration_2 local_state_space_iteration_k +mex_PROGRAMS = local_state_space_iteration_2 local_state_space_iteration_3 local_state_space_iteration_k nodist_local_state_space_iteration_2_SOURCES = local_state_space_iteration_2.cc +nodist_local_state_space_iteration_3_SOURCES = local_state_space_iteration_3.f08 nodist_local_state_space_iteration_k_SOURCES = local_state_space_iteration_k.f08 local_state_space_iteration_2_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp local_state_space_iteration_2_LDFLAGS = $(AM_LDFLAGS) $(OPENMP_LDFLAGS) +local_state_space_iteration_3_FCFLAGS = $(AM_FCFLAGS) -I../libkordersim -pthread +local_state_space_iteration_3_LDADD = ../libkordersim/libkordersim.a + local_state_space_iteration_k_FCFLAGS = $(AM_FCFLAGS) -I../libkordersim -pthread local_state_space_iteration_k_LDADD = ../libkordersim/libkordersim.a BUILT_SOURCES = $(nodist_local_state_space_iteration_2_SOURCES) \ - $(nodist_local_state_space_iteration_k_SOURCES) + $(nodist_local_state_space_iteration_3_SOURCES) \ + $(nodist_local_state_space_iteration_k_SOURCES) CLEANFILES = $(nodist_local_state_space_iteration_2_SOURCES) \ - $(nodist_local_state_space_iteration_k_SOURCES) - -%.cc: $(top_srcdir)/../../sources/local_state_space_iterations/%.cc - $(LN_S) -f $< $@ + $(nodist_local_state_space_iteration_3_SOURCES) \ + $(nodist_local_state_space_iteration_k_SOURCES) %.f08: $(top_srcdir)/../../sources/local_state_space_iterations/%.f08 $(LN_S) -f $< $@ + +%.cc: $(top_srcdir)/../../sources/local_state_space_iterations/%.cc + $(LN_S) -f $< $@ \ No newline at end of file diff --git a/mex/sources/Makefile.am b/mex/sources/Makefile.am index 9ef0e4336c..c1e6727b95 100644 --- a/mex/sources/Makefile.am +++ b/mex/sources/Makefile.am @@ -6,6 +6,7 @@ EXTRA_DIST = \ blas_lapack.F08 \ defines.F08 \ matlab_mex.F08 \ + pthread.F08 \ mjdgges \ kronecker \ bytecode \ diff --git a/mex/sources/libkordersim/partitions.f08 b/mex/sources/libkordersim/partitions.f08 index 0fb0829e40..5272d4bb98 100644 --- a/mex/sources/libkordersim/partitions.f08 +++ b/mex/sources/libkordersim/partitions.f08 @@ -22,15 +22,16 @@ module partitions use pascal use sort + use iso_fortran_env implicit none ! index represents the aforementioned (α₁,…,αₘ) objects type index - integer, dimension(:), allocatable :: ind + integer, dimension(:), allocatable :: coor end type index interface index - module procedure :: init_index + module procedure :: init_index, init_index_vec, init_index_int end interface index ! a dictionary that matches folded indices with folded offsets @@ -56,18 +57,36 @@ module partitions contains - ! Constructor for the index type - type(index) function init_index(d, ind) + ! Constructors for the index type + ! Simply allocates the index with the size provided as input + type(index) function init_index(d) integer, intent(in) :: d - integer, dimension(d), intent(in) :: ind - allocate(init_index%ind(d)) - init_index%ind = ind + allocate(init_index%coor(d)) end function init_index + ! Creates an index with the vector provided as inputs + type(index) function init_index_vec(ind) + integer, dimension(:), intent(in) :: ind + allocate(init_index_vec%coor(size(ind))) + init_index_vec%coor = ind + end function init_index_vec + + ! Creates the index with a given size + ! and fills it with a given integer + type(index) function init_index_int(d, m) + integer, intent(in) :: d, m + integer :: i + allocate(init_index_int%coor(d)) + do i=1,d + init_index_int%coor(i) = m + end do + end function init_index_int + + ! Operators for the index type ! Comparison for the index type. Returns true if the two indices are different type(logical) function diff_indices(i1,i2) type(index), intent(in) :: i1, i2 - if (size(i1%ind) /= size(i2%ind) .or. any(i1%ind /= i2%ind)) then + if (size(i1%coor) /= size(i2%coor) .or. any(i1%coor /= i2%coor)) then diff_indices = .true. else diff_indices = .false. @@ -91,7 +110,7 @@ contains integer :: i i = 1 if (d>1) then - do while ((i < d) .and. (idx%ind(i+1) == idx%ind(1))) + do while ((i < d) .and. (idx%coor(i+1) == idx%coor(1))) i = i+1 end do end if @@ -99,11 +118,10 @@ contains end function get_prefix_length ! Gets the folded index associated with an unfolded index - type(index) function u_index_to_f_index(idx, d) + type(index) function u_index_to_f_index(idx) type(index), intent(in) :: idx - integer, intent(in) :: d - u_index_to_f_index = index(d, idx%ind) - call sort_int(u_index_to_f_index%ind) + u_index_to_f_index = index(idx%coor) + call sort_int(u_index_to_f_index%coor) end function u_index_to_f_index ! Converts the offset of an unfolded tensor to the associated unfolded tensor index @@ -112,11 +130,11 @@ contains type(index) function u_offset_to_u_index(j, n, d) integer, intent(in) :: j, n, d ! offset, number of variables and dimensions respectively integer :: i, tmp, r - allocate(u_offset_to_u_index%ind(d)) + allocate(u_offset_to_u_index%coor(d)) tmp = j-1 ! We substract 1 as j ∈ {1, ..., n} so that tmp ∈ {0, ..., n-1} and our modular operations work do i=d,1,-1 r = mod(tmp, n) - u_offset_to_u_index%ind(i) = r + u_offset_to_u_index%coor(i) = r tmp = (tmp-r)/n end do end function u_offset_to_u_index @@ -136,11 +154,26 @@ contains j = 1 else prefix = get_prefix_length(idx,d) - tmp = index(d-prefix, idx%ind(prefix+1:) - idx%ind(1)) - j = get(d, n+d-1, p) - get(d, n-idx%ind(1)+d-1, p) + f_index_to_f_offset(tmp, n-idx%ind(1), d-prefix, p) + tmp = index(idx%coor(prefix+1:) - idx%coor(1)) + j = get(d, n+d-1, p) - get(d, n-idx%coor(1)+d-1, p) + f_index_to_f_offset(tmp, n-idx%coor(1), d-prefix, p) end if end function f_index_to_f_offset - + + ! Returns the unfolded tensor offset associated with an unfolded tensor index + ! Written in a recursive way, the unfolded offset off(α₁,…,αₘ) associated with the + ! index (α₁,…,αₘ) with αᵢ ∈ {1, ..., n} verifies + ! off(α₁,…,αₘ) = n*off(α₁,…,αₘ₋₁) + αₘ + integer function u_index_to_u_offset(idx, n, d) + type(index), intent(in) :: idx ! unfolded index + integer, intent(in) :: n, d ! number of variables and dimensions + integer :: j + u_index_to_u_offset = 0 + do j=1,d + u_index_to_u_offset = n*u_index_to_u_offset + idx%coor(j)-1 + end do + u_index_to_u_offset = u_index_to_u_offset + 1 + end function u_index_to_u_offset + ! Function that searches a value in an array of a given length type(integer) function find(a, v, l) integer, intent(in) :: l ! length of the array @@ -180,7 +213,7 @@ contains c = dict(n, d, p) do j=1,n**d tmp = u_offset_to_u_index(j,n,d) - tmp = u_index_to_f_index(tmp, d) + tmp = u_index_to_f_index(tmp) found = find(c%indices, tmp, c%pr) if (found == 0) then c%pr = c%pr+1 @@ -193,7 +226,128 @@ contains end do end subroutine fill_folded_indices - end module partitions + ! ! Specialized code for local_state_space_iteration_3 + ! ! Considering the folded tensor gᵥᵥ, for each folded offset, + ! ! fills (i) the corresponding index, (ii) the corresponding + ! ! unfolded offset in the corresponding unfolded tensor + ! ! and (iii) the number of equivalent unfolded indices the folded index + ! ! associated with the folded offset represents + ! subroutine index_2(indices, uoff, neq, q) + ! integer, intent(in) :: q ! size of v + ! integer, dimension(:), intent(inout) :: uoff, neq ! list of corresponding unfolded offsets and number of equivalent unfolded indices + ! type(index), dimension(:), intent(inout) :: indices ! list of folded indices + ! integer :: m, j + ! m = q*(q+1)/2 ! total number of folded indices : ⎛q+2-1⎞ + ! ! ⎝ 2 ⎠ + ! uoff(1) = 1 + ! neq(1) = 1 + ! ! offsets such that j ∈ { 2, ..., q } are associated with + ! ! indices (1, α), α ∈ { 2, ..., q } + ! do j=2,q + ! neq(j) = 2 + ! end do + ! end subroutine index_2 + + ! In order to list folded indices α = (α₁,…,αₘ) with αᵢ ∈ { 1, ..., n }, + ! at least 2 algorithms exist: a recursive one and an iterative one. + ! The recursive algorithm list_folded_indices(n,m,q) that returns + ! the list of all folded indices α = (α₁,…,αₘ) with αᵢ ∈ { 1+q, ..., n+q } works as follows: + ! if n=0, return an empty list + ! else if m=0, return the list containing the sole zero-sized index + ! otherwise, + ! return the concatenation of ([1+q, ℓ] for ℓ ∈ list_folded_indices(n,m-1,q)) + ! and list_folded_indices(n-1,m,1,q+1)] + ! A call to list_folded_indices(n,m,0) then returns the list + ! of folded indices α = (α₁,…,αₘ) with αᵢ ∈ { 1, ..., n } + ! The problem with recursive functions is that the compiler may manage poorly + ! the stack, which slows down the function's execution + ! recursive function list_folded_indices(n, m, q) result(list) + ! integer :: n, m, q + ! type(index), allocatable, dimension(:) :: list, temp + ! integer :: j + ! if (m==0) then + ! list = [index(0)] + ! elseif (n == 0) then + ! allocate(list(0)) + ! else + ! temp = list_folded_indices(n,m-1,q) + ! list = [(index([1+q,temp(j)%coor]), j=1, size(temp)), list_folded_indices(n-1,m,q+1)] + ! end if + ! end function list_folded_indices + + ! Considering the folded tensor gᵥᵐ, for each folded offset, + ! fills the lists of (i) the corresponding index, (ii) the corresponding + ! unfolded offset in the corresponding unfolded tensor + ! and (iii) the number of equivalent unfolded indices the folded index + ! (associated with the folded offset) represents + ! The algorithm to get the folded index associated with a folded offset + ! relies on the definition of the lexicographic order. + ! Considering α = (α₁,…,αₘ) with αᵢ ∈ { 1, ..., n }, + ! the next index α' is such that there exists i that verifies + ! αⱼ = αⱼ' for all j < i, αᵢ' > αᵢ. Note that all the coordinates + ! αᵢ', ... , αₘ' need to be as small as the lexicographic order allows + ! for α' to immediately follow α. + ! Suppose j is the latest incremented coordinate: + ! if αⱼ < n, then αⱼ' = αⱼ + 1 + ! otherwise αⱼ = n, set αₖ' = αⱼ₋₁ + 1 for all k ≥ j-1 + ! if αⱼ₋₁ = n, set j := j-1 + ! otherwise, set j := m + ! The algorithm to count the number of equivalent unfolded indices + ! works as follows. A folded index can be written as α = (x₁, ..., x₁, ..., xₚ, ..., xₚ) + ! such that x₁ < x₂ < ... < xₚ. Denote kᵢ the number of coordinates equal to xᵢ. + ! The number of unfolded indices equivalent to α is c(α) = ⎛ d ⎞ + ! ⎝ k₁, k₂, ..., kₚ ⎠ + ! Suppose j is the latest incremented coordinate. + ! If αⱼ < n, then αⱼ' = αⱼ + 1, k(αⱼ) := k(αⱼ)-1, k(αⱼ') := k(αⱼ')+1. + ! In this case, c(α') = c(α)*(k(αⱼ)+1)/k(αⱼ') + ! otherwise, αⱼ = n: set αₖ' = αⱼ₋₁ + 1 for all k ≥ j-1, + ! k(αⱼ₋₁) := k(αⱼ₋₁)-1, k(n) := 0, k(αⱼ₋₁') = m-(j-1)+1 + ! In this case, we compute c(α') with the multinomial formula above + ! Finally, the algorithm that returns the unfolded offset of a given folded index works + ! as follows. Suppose j is the latest incremented coordinate and off(α) is the unfolded offset + ! associated with index α: + ! if αⱼ < n, then αⱼ' = αⱼ + 1 and off(α') = off(α)+1 + ! otherwise, αⱼ = n: set αₖ' = αⱼ₋₁ + 1 for all k ≥ j-1 + ! and off(α') can be computed using the u_index_to_u_offset routine + subroutine folded_offset_loop(ind, nbeq, off, n, m, p) + type(index), dimension(:), intent(inout) :: ind ! list of indices + integer, dimension(:), intent(inout) :: nbeq, off ! lists of numbers of equivalent indices and of offsets + integer, intent(in) :: n, m + type(pascal_triangle), intent(in) :: p + integer :: j, lastinc, k(n) + ind(1) = index(m, 1) + nbeq(1) = 1 + k = 0 + k(1) = m + off(1) = 1 + j = 2 + lastinc = m + do while (j <= size(ind)) + ind(j) = index(ind(j-1)%coor) + if (ind(j-1)%coor(lastinc) == n) then + ind(j)%coor(lastinc-1:m) = ind(j-1)%coor(lastinc-1)+1 + k(ind(j-1)%coor(lastinc-1)) = k(ind(j-1)%coor(lastinc-1))-1 + k(n) = 0 + k(ind(j)%coor(lastinc-1)) = m - (lastinc-1) + 1 + nbeq(j) = multinomial(k,m,p) + off(j) = u_index_to_u_offset(ind(j), n, m) + if (ind(j)%coor(m) == n) then + lastinc = lastinc-1 + else + lastinc = m + end if + else + ind(j)%coor(lastinc) = ind(j-1)%coor(lastinc)+1 + k(ind(j)%coor(lastinc)) = k(ind(j)%coor(lastinc))+1 + nbeq(j) = nbeq(j-1)*k(ind(j-1)%coor(lastinc))/k(ind(j)%coor(lastinc)) + k(ind(j-1)%coor(lastinc)) = k(ind(j-1)%coor(lastinc))-1 + off(j) = off(j-1)+1 + end if + j = j+1 + end do + end subroutine folded_offset_loop + +end module partitions ! gfortran -o partitions partitions.f08 pascal.f08 sort.f08 ! ./partitions @@ -203,8 +357,11 @@ contains ! implicit none ! type(index) :: uidx, fidx, i1, i2 ! integer, dimension(:), allocatable :: folded -! integer :: i, uj, n, d +! integer :: i, uj, n, d, j, nb_folded_idcs ! type(pascal_triangle) :: p +! type(index), dimension(:), allocatable :: list_folded_idcs +! integer, dimension(:), allocatable :: nbeq, off + ! ! Unfolded indices and offsets ! ! 0,0,0 1 1,0,0 10 2,0,0 19 ! ! 0,0,1 2 1,0,1 11 2,0,1 20 @@ -231,15 +388,15 @@ contains ! ! u_offset_to_u_index ! uidx = u_offset_to_u_index(uj,n,d) -! print '(3i2)', (uidx%ind(i), i=1,d) ! should display 0 2 1 +! print '(3i2)', (uidx%coor(i), i=1,d) ! should display 0 2 1 ! ! f_index_to_f_offset -! fidx = u_index_to_f_index(uidx, d) +! fidx = u_index_to_f_index(uidx) ! print '(i2)', f_index_to_f_offset(fidx, n, d, p) ! should display 5 ! ! /= -! i1 = index(5, (/1,2,3,4,5/)) -! i2 = index(5, (/1,2,3,4,6/)) +! i1 = index((/1,2,3,4,5/)) +! i2 = index((/1,2,3,4,6/)) ! if (i1 /= i2) then ! print *, "Same!" ! else @@ -247,10 +404,25 @@ contains ! end if ! ! fill_folded_indices -! allocate(folded(n**d)) -! call fill_folded_indices(folded,n,d) -! print *, "Matching offsets unfolded -> folded" -! print '(1000i4)', (i, i=1,n**d) -! print '(1000i4)', (folded(i), i=1,n**d) +! ! allocate(folded(n**d)) +! ! call fill_folded_indices(folded,n,d,p) +! ! print *, "Matching offsets unfolded -> folded" +! ! print '(1000i4)', (i, i=1,n**d) +! ! print '(1000i4)', (folded(i), i=1,n**d) + +! n = 3 +! d = 3 +! p = pascal_triangle(n+d-1) +! nb_folded_idcs = get(d,n+d-1,p) +! ! recursive list_folded_indices +! ! list_folded_idcs = list_folded_indices(n, d, 0) +! ! print '(4i2)', ((list_folded_idcs(i)%coor(j), j=1,d), i=1,nb_folded_idcs) + +! ! iterative list_folded_indices +! allocate(list_folded_idcs(nb_folded_idcs), nbeq(nb_folded_idcs), off(nb_folded_idcs)) +! call folded_offset_loop(list_folded_idcs, nbeq, off, n, d, p) +! print '(3i2)', ((list_folded_idcs(i)%coor(j), j=1,d), i=1,nb_folded_idcs) +! print '(i3)', (nbeq(i), i=1,nb_folded_idcs) +! print '(i4)', (off(i), i=1,nb_folded_idcs) ! end program test \ No newline at end of file diff --git a/mex/sources/libkordersim/pascal.f08 b/mex/sources/libkordersim/pascal.f08 index bec24515ec..11978b1ad7 100644 --- a/mex/sources/libkordersim/pascal.f08 +++ b/mex/sources/libkordersim/pascal.f08 @@ -60,13 +60,28 @@ contains ! Returns ⎛n⎞ stored in pascal_triangle p ! ⎝k⎠ - - type(integer) function get(k,n,p) + integer function get(k,n,p) integer, intent(in) :: k, n type(pascal_triangle), intent(in) :: p get = p%lines(n)%coeffs(k+1) end function get + ! Returns ⎛ d ⎞ + ! ⎝ k₁, k₂, ..., kₙ ⎠ + integer function multinomial(k,d,p) + integer, intent(in) :: k(:), d + type(pascal_triangle), intent(in) :: p + integer :: s, i + s = d + multinomial = 1 + i = 1 + do while (s > 0) + multinomial = multinomial*get(k(i), s, p) + s = s-k(i) + i = i+1 + end do + end function + end module pascal ! gfortran -o pascal pascal.f08 @@ -86,4 +101,10 @@ end module pascal ! end if ! end do ! end do + +! d = 3 +! p = pascal_triangle(d) +! print '(i2)', multinomial([1,2,3], d, p) ! should print 60 +! print '(i2)', multinomial([0,0,0,3], d, p) ! should print 20 + ! end program test \ No newline at end of file diff --git a/mex/sources/libkordersim/pparticle.f08 b/mex/sources/libkordersim/pparticle.f08 deleted file mode 100644 index a74d421680..0000000000 --- a/mex/sources/libkordersim/pparticle.f08 +++ /dev/null @@ -1,78 +0,0 @@ -! Copyright © 2021-2022 Dynare Team -! -! This file is part of Dynare. -! -! Dynare is free software: you can redistribute it and/or modify -! it under the terms of the GNU General Public License as published by -! the Free Software Foundation, either version 3 of the License, or -! (at your option) any later version. -! -! Dynare is distributed in the hope that it will be useful, -! but WITHOUT ANY WARRANTY; without even the implied warranty of -! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -! GNU General Public License for more details. -! -! You should have received a copy of the GNU General Public License -! along with Dynare. If not, see <https://www.gnu.org/licenses/>. - -! Routines and data structures for multithreading over particles in local_state_space_iteration_k -module pparticle - use iso_c_binding - use simulation - use matlab_mex - - implicit none - - type tdata - integer :: nm, nys, endo_nbr, nvar, order, nrestricted, nparticles - real(real64), allocatable :: yhat(:,:), e(:,:), ynext(:,:), ys_reordered(:), restrict_var_list(:) - type(pol), dimension(:), allocatable :: udr - end type tdata - - type(tdata) :: thread_data - -contains - - subroutine thread_eval(arg) bind(c) - type(c_ptr), intent(in), value :: arg - integer, pointer :: im - integer :: i, j, start, end, q, r, ind - type(horner), dimension(:), allocatable :: h - real(real64), dimension(:), allocatable :: dyu - - ! Checking that the thread number got passed as argument - if (.not. c_associated(arg)) then - call mexErrMsgTxt("No argument was passed to thread_eval") - end if - call c_f_pointer(arg, im) - - ! Allocating local arrays - allocate(h(0:thread_data%order), dyu(thread_data%nvar)) - do i=0, thread_data%order - allocate(h(i)%c(thread_data%endo_nbr, thread_data%nvar**i)) - end do - - ! Specifying bounds for the curent thread - q = thread_data%nparticles / thread_data%nm - r = mod(thread_data%nparticles, thread_data%nm) - start = (im-1)*q+1 - if (im < thread_data%nm) then - end = start+q-1 - else - end = thread_data%nparticles - end if - - ! Using the Horner algorithm to evaluate the decision rule at the chosen yhat and epsilon - do j=start,end - dyu(1:thread_data%nys) = thread_data%yhat(:,j) - dyu(thread_data%nys+1:) = thread_data%e(:,j) - call eval(h, dyu, thread_data%udr, thread_data%endo_nbr, thread_data%nvar, thread_data%order) - do i=1,thread_data%nrestricted - ind = int(thread_data%restrict_var_list(i)) - thread_data%ynext(i,j) = h(0)%c(ind,1) + thread_data%ys_reordered(ind) - end do - end do - - end subroutine thread_eval - -end module pparticle diff --git a/mex/sources/local_state_space_iterations/local_state_space_iteration_3.f08 b/mex/sources/local_state_space_iterations/local_state_space_iteration_3.f08 new file mode 100644 index 0000000000..64a567fcbc --- /dev/null +++ b/mex/sources/local_state_space_iterations/local_state_space_iteration_3.f08 @@ -0,0 +1,580 @@ +! Copyright © 2022 Dynare Team +! +! This file is part of Dynare. +! +! Dynare is free software: you can redistribute it and/or modify +! it under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. +! +! Dynare is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU General Public License for more details. +! +! You should have received a copy of the GNU General Public License +! along with Dynare. If not, see <https://www.gnu.org/licenses/>. + +! Routines and data structures for multithreading over particles in local_state_space_iteration_3 +module pparticle_3 + use matlab_mex + use partitions + + implicit none + + type tdata_3 + integer :: n, m, s, q, numthreads, xx_size, uu_size, xxx_size, uuu_size + real(real64), pointer, contiguous :: yhat3(:,:), & + &e(:,:), ghx(:,:), ghu(:,:), constant(:), ghxu(:,:), ghxx(:,:), & + &ghuu(:,:), ghxxx(:,:), ghuuu(:,:), ghxxu(:,:), ghxuu(:,:), ghxss(:,:), & + &ghuss(:,:), ss(:), y3(:,:) + real(real64), pointer :: yhat2(:,:), yhat1(:,:), y2(:,:), y1(:,:) + type(index), pointer, contiguous :: xx_idcs(:), uu_idcs(:), & + &xxx_idcs(:), uuu_idcs(:) + end type tdata_3 + + type(tdata_3) :: td3 + +contains + + ! Fills y3 as y3 = ybar + ½ghss + ghx·ŷ+ghu·ε + ½ghxx·ŷ⊗ŷ + ½ghuu·ε⊗ε + + ! ghxu·ŷ⊗ε + (1/6)·ghxxx ŷ⊗ŷ⊗ŷ + (1/6)·ghuuu·ε⊗ε⊗ε + + ! (3/6)·ghxxu·ŷ⊗ŷ⊗ε + (3/6)·ghxuu·ŷ⊗ε⊗ε + + ! (3/6)·ghxss·ŷ + (3/6)·ghuss·ε + ! in td3 + subroutine thread_eval_3(arg) bind(c) + type(c_ptr), intent(in), value :: arg + integer, pointer :: ithread + integer :: is, im, j, k, start, end, q, r + + ! Checking that the thread number got passed as argument + if (.not. c_associated(arg)) then + call mexErrMsgTxt("No argument was passed to thread_eval_3") + end if + call c_f_pointer(arg, ithread) + + ! Specifying bounds for the curent thread + q = td3%s / td3%numthreads + r = mod(td3%s, td3%numthreads) + start = (ithread-1)*q+1 + if (ithread < td3%numthreads) then + end = start+q-1 + else + end = td3%s + end if + + do is=start,end + do im=1,td3%m + ! y3 = ybar + ½ghss + td3%y3(im,is) = td3%constant(im) + ! y3 += ghx·ŷ+(3/6)·ghxss·ŷ + first n folded indices for ½ghxx·ŷ⊗ŷ + ! + first n folded indices for (1/6)ghxxx·ŷ⊗ŷ⊗ŷ + do j=1,td3%n + td3%y3(im,is) = td3%y3(im,is)+& + &(0.5*td3%ghxss(j,im)+td3%ghx(j,im))*td3%yhat3(j,is)+& + &(0.5*td3%ghxx(j,im)+(1./6.)*td3%ghxxx(j,im)*td3%yhat3(1, is))*& + &td3%yhat3(1, is)*td3%yhat3(j,is) + ! y3 += ghxu·ŷ⊗ε + ! + first n*q folded indices of (3/6)·ghxxu·ŷ⊗ŷ⊗ε + do k=1,td3%q + td3%y3(im,is) = td3%y3(im,is) + & + &(td3%ghxu(td3%q*(j-1)+k,im)+& + &0.5*td3%ghxxu(td3%q*(j-1)+k,im)*td3%yhat3(1, is))*& + &td3%yhat3(j, is)*td3%e(k, is) + end do + end do + ! y3 += ghu·ε+(3/6)·ghuss·ε + first q folded indices of ½ghuu·ε⊗ε + ! + first q folded indices for (1/6)·ghuuu·ε⊗ε⊗ε + ! + first n*q folded indices of (3/6)·ghxuu·ŷ⊗ε⊗ε + do j=1,td3%q + td3%y3(im,is) = td3%y3(im,is) + & + &(0.5*td3%ghuss(j,im)+td3%ghu(j,im))*td3%e(j,is) + & + &(0.5*td3%ghuu(j,im)+(1./6.)*td3%ghuuu(j,im)*& + &td3%e(1, is))*td3%e(1, is)*td3%e(j, is) + do k=1,td3%n + td3%y3(im,is) = td3%y3(im,is) + & + &0.5*td3%ghxuu(td3%uu_size*(k-1)+j,im)*& + &td3%yhat3(k, is)*td3%e(1, is)*td3%e(j, is) + end do + end do + ! y3 += remaining ½ghxx·ŷ⊗ŷ terms + ! + the next terms starting from n+1 up to xx_size + ! of (1/6)ghxxx·ŷ⊗ŷ⊗ŷ + ! + remaining terms of (3/6)·ghxxu·ŷ⊗ŷ⊗ε + do j=td3%n+1,td3%xx_size + td3%y3(im,is) = td3%y3(im,is) + & + &(0.5*td3%ghxx(j,im)+(1./6.)*td3%ghxxx(j,im)*td3%yhat3(1, is))*& + &td3%yhat3(td3%xx_idcs(j)%coor(1), is)*& + &td3%yhat3(td3%xx_idcs(j)%coor(2), is) + do k=1,td3%q + td3%y3(im,is) = td3%y3(im,is)+& + &0.5*td3%ghxxu(td3%q*(j-1)+k,im)*& + &td3%yhat3(td3%xx_idcs(j)%coor(1), is)*& + &td3%yhat3(td3%xx_idcs(j)%coor(2), is)*& + &td3%e(k, is) + end do + end do + ! y3 += remaining ½ghuu·ε⊗ε terms + ! + the next uu_size terms starting from q+1 + ! of (1/6)·ghuuu·ε⊗ε⊗ε + ! + remaining terms of (3/6)·ghxuu·ŷ⊗ε⊗ε + do j=td3%q+1,td3%uu_size + td3%y3(im,is) = td3%y3(im,is) + & + &(0.5*td3%ghuu(j,im)+(1./6.)*td3%ghuuu(j,im)*td3%e(1, is))*& + &td3%e(td3%uu_idcs(j)%coor(1), is)*& + &td3%e(td3%uu_idcs(j)%coor(2), is) + do k=1,td3%n + td3%y3(im,is) = td3%y3(im,is) + & + &0.5*td3%ghxuu(td3%uu_size*(k-1)+j,im)*& + &td3%yhat3(k, is)*& + &td3%e(td3%uu_idcs(j)%coor(1), is)*& + &td3%e(td3%uu_idcs(j)%coor(2), is) + end do + end do + ! y3 += remaining (1/6)·ghxxx·ŷ⊗ŷ⊗ŷ terms + do j=td3%xx_size+1,td3%xxx_size + td3%y3(im,is) = td3%y3(im,is)+& + &(1./6.)*td3%ghxxx(j,im)*& + &td3%yhat3(td3%xxx_idcs(j)%coor(1), is)*& + &td3%yhat3(td3%xxx_idcs(j)%coor(2), is)*& + &td3%yhat3(td3%xxx_idcs(j)%coor(3), is) + end do + ! y3 += remaining (1/6)ghuuu·ε⊗ε⊗ε terms + do j=td3%uu_size+1,td3%uuu_size + td3%y3(im,is) = td3%y3(im,is) + & + &(1./6.)*td3%ghuuu(j,im)*& + &td3%e(td3%uuu_idcs(j)%coor(1), is)*& + &td3%e(td3%uuu_idcs(j)%coor(2), is)*& + &td3%e(td3%uuu_idcs(j)%coor(3), is) + end do + + end do + end do + + end subroutine thread_eval_3 + + ! Fills y1 and y2 as + ! y1 = ybar + ghx·ŷ1 + ghu·ε + ! y2 = ybar + ½ghss + ghx·ŷ2 + ghu·ε + ½ghxx·ŷ1⊗ŷ1 + ½ghuu·ε⊗ε + ghxu·ŷ1⊗ε + ! y3 = ybar + ghx·ŷ3 + ghu·ε + ghxx·ŷ1⊗ŷ2 + ghuu·ε⊗ε + ghxu·ŷ1⊗ε + ghxu·ŷ2⊗ε + ! + (1/6)·ghxxx·ŷ1⊗ŷ1⊗ŷ1 + (1/6)·ghuuu·ε⊗ε⊗ε + (3/6)·ghxxu·ŷ1⊗ŷ1⊗ε + ! + (3/6)·ghxuu·ŷ1⊗ε⊗ε + (3/6)·ghxss·ŷ1 + (3/6)·ghuss·ε + ! in td3 + subroutine thread_eval_3_pruning(arg) bind(c) + type(c_ptr), intent(in), value :: arg + integer, pointer :: ithread + integer :: is, im, j, k, start, end, q, r + real(real64) :: x, y + + ! Checking that the thread number got passed as argument + if (.not. c_associated(arg)) then + call mexErrMsgTxt("No argument was passed to thread_eval") + end if + call c_f_pointer(arg, ithread) + + ! Specifying bounds for the curent thread + q = td3%s / td3%numthreads + r = mod(td3%s, td3%numthreads) + start = (ithread-1)*q+1 + if (ithread < td3%numthreads) then + end = start+q-1 + else + end = td3%s + end if + + do is=start,end + do im=1,td3%m + ! y1 = ybar + ! y2 = ybar + ½ghss + ! y3 = ybar + td3%y1(im,is) = td3%ss(im) + td3%y2(im,is) = td3%constant(im) + td3%y3(im,is) = td3%ss(im) + ! y1 += ghx·ŷ1 + ! y2 += ghx·ŷ2 + first n folded indices for ½ghxx·ŷ1⊗ŷ1 + ! y3 += ghx·ŷ3 +(3/6)·ghxss·ŷ + ! + first n folded indices for ghxx·ŷ1⊗ŷ2 + ! + first n folded indices for (1/6)ghxxx·ŷ1⊗ŷ1⊗ŷ1 + do j=1,td3%n + td3%y1(im,is) = td3%y1(im,is) + td3%ghx(j,im)*td3%yhat1(j,is) + td3%y2(im,is) = td3%y2(im,is) + td3%ghx(j,im)*td3%yhat2(j,is) +& + &0.5*td3%ghxx(j,im)*td3%yhat1(1, is)*td3%yhat1(j, is) + td3%y3(im,is) = td3%y3(im,is) + td3%ghx(j,im)*td3%yhat3(j,is) +& + &0.5*td3%ghxss(j,im)*td3%yhat1(j,is) +& + &td3%ghxx(j,im)*td3%yhat1(1, is)*td3%yhat2(j, is)+& + &(1./6.)*td3%ghxxx(j,im)*td3%yhat1(1, is)*& + &td3%yhat1(1, is)*td3%yhat1(j,is) + ! y2 += + ghxu·ŷ1⊗ε + ! y3 += + ghxu·ŷ1⊗ε + ghxu·ŷ2⊗ε + ! + first n*q folded indices of (3/6)·ghxxu·ŷ1⊗ŷ1⊗ε + do k=1,td3%q + td3%y2(im,is) = td3%y2(im,is)+& + &td3%ghxu(td3%q*(j-1)+k,im)*& + &td3%yhat1(j, is)*td3%e(k, is) + td3%y3(im,is) = td3%y3(im,is)+& + &td3%ghxu(td3%q*(j-1)+k,im)*& + &(td3%yhat1(j, is)+td3%yhat2(j, is))*td3%e(k, is)+& + &0.5*td3%ghxxu(td3%q*(j-1)+k,im)*td3%yhat1(1, is)*& + &td3%yhat1(j, is)*td3%e(k, is) + end do + end do + ! y1 += +ghu·ε + ! y2 += +ghu·ε + first q folded indices for ½ghuu·ε⊗ε + ! y3 += +ghu·ε + first q folded indices for ghuu·ε⊗ε + ! + first n*q folded indices of (3/6)·ghxuu·ŷ1⊗ε⊗ε + ! + first n folded indices of (1/6)·ghuuu·ε⊗ε⊗ε + do j=1,td3%q + x = td3%ghu(j,im)*td3%e(j,is) + y = td3%ghuu(j,im)*td3%e(1, is)*td3%e(j, is) + td3%y1(im,is) = td3%y1(im,is) + x + td3%y2(im,is) = td3%y2(im,is) + x + 0.5*y + td3%y3(im,is) = td3%y3(im,is) + x + y +& + &td3%ghuss(j,im)*td3%e(j,is)+& + &(1./6.)*td3%ghuuu(j,im)*td3%e(1, is)*td3%e(1, is)*& + &td3%e(j, is) + do k=1,td3%n + td3%y3(im,is) = td3%y3(im,is) + & + &0.5*td3%ghxuu(td3%uu_size*(k-1)+j,im)*& + &td3%yhat1(k, is)*td3%e(1, is)*td3%e(j, is) + end do + end do + ! y2 += remaining ½ghxx·ŷ1⊗ŷ1 terms + ! y3 += remaining ghxx·ŷ1⊗ŷ2 terms + ! + the next terms starting from n+1 up to xx_size + ! of (1/6)ghxxx·ŷ1⊗ŷ1⊗ŷ1 + ! + remaining terms of (3/6)·ghxxu·ŷ1⊗ŷ1⊗ε + do j=td3%n+1,td3%xx_size + td3%y2(im,is) = td3%y2(im,is) + & + &0.5*td3%ghxx(j,im)*& + &td3%yhat1(td3%xx_idcs(j)%coor(1), is)*& + &td3%yhat1(td3%xx_idcs(j)%coor(2), is) + td3%y3(im,is) = td3%y3(im,is) + & + &td3%ghxx(j,im)*& + &td3%yhat1(td3%xx_idcs(j)%coor(1), is)*& + &td3%yhat2(td3%xx_idcs(j)%coor(2), is)+& + &(1./6.)*td3%ghxxx(j,im)*td3%yhat1(1, is)*& + &td3%yhat1(td3%xx_idcs(j)%coor(1), is)*& + &td3%yhat1(td3%xx_idcs(j)%coor(2), is) + do k=1,td3%n + td3%y3(im,is) = td3%y3(im,is) + & + &0.5*td3%ghxxu(td3%q*(j-1)+k,im)*& + &td3%yhat1(td3%xx_idcs(j)%coor(1), is)*& + &td3%yhat1(td3%xx_idcs(j)%coor(2), is)*& + &td3%e(k, is) + end do + end do + ! y2 += remaining ½ghuu·ε⊗ε terms + ! y3 += remaining ghuu·ε⊗ε terms + ! + remaining terms of (3/6)·ghxuu·ŷ⊗ε⊗ε + ! + the next uu_size terms starting from q+1 + ! of (1/6)·ghuuu·ε⊗ε⊗ε + do j=td3%q+1,td3%uu_size + x = td3%ghuu(j,im)*& + &td3%e(td3%uu_idcs(j)%coor(1), is)*& + &td3%e(td3%uu_idcs(j)%coor(2), is) + td3%y2(im,is) = td3%y2(im,is)+0.5*x + td3%y3(im,is) = td3%y3(im,is)+x+& + &(1./6.)*td3%ghuuu(j,im)*td3%e(1, is)*& + &td3%e(td3%uu_idcs(j)%coor(1), is)*& + &td3%e(td3%uu_idcs(j)%coor(2), is) + do k=1,td3%n + td3%y3(im,is) = td3%y3(im,is) + & + &0.5*td3%ghxuu(td3%uu_size*(k-1)+j,im)*& + &td3%yhat1(k, is)*& + &td3%e(td3%uu_idcs(j)%coor(1), is)*& + &td3%e(td3%uu_idcs(j)%coor(2), is) + end do + end do + ! y3 += remaining (1/6)·ghxxx·ŷ⊗ŷ⊗ŷ terms + do j=td3%xx_size+1,td3%xxx_size + td3%y3(im,is) = td3%y3(im,is)+& + &(1./6.)*td3%ghxxx(j,im)*& + &td3%yhat1(td3%xxx_idcs(j)%coor(1), is)*& + &td3%yhat1(td3%xxx_idcs(j)%coor(2), is)*& + &td3%yhat1(td3%xxx_idcs(j)%coor(3), is) + end do + ! y3 += remaining (1/6)ghuuu·ε⊗ε⊗ε terms + do j=td3%uu_size+1,td3%uuu_size + td3%y3(im,is) = td3%y3(im,is) + & + &(1./6.)*td3%ghuuu(j,im)*& + &td3%e(td3%uuu_idcs(j)%coor(1), is)*& + &td3%e(td3%uuu_idcs(j)%coor(2), is)*& + &td3%e(td3%uuu_idcs(j)%coor(3), is) + end do + end do + end do + + end subroutine thread_eval_3_pruning + +end module pparticle_3 + +! The code of the local_state_space_iteration_3 routine +! Input: +! prhs[1] yhat3 [double] n×s array, time t particles. +! prhs[2] e [double] q×s array, time t innovations. +! prhs[3] ghx [double] m×n array, first order reduced form. +! prhs[4] ghu [double] m×q array, first order reduced form. +! prhs[5] constant [double] m×1 array, deterministic steady state + +! third order correction for the union of +! the states and observed variables. +! prhs[6] ghxx [double] m×n² array, second order reduced form. +! prhs[7] ghuu [double] m×q² array, second order reduced form. +! prhs[8] ghxu [double] m×nq array, second order reduced form. +! prhs[9] ghxxx [double] m×n array, third order reduced form. +! prhs[10] ghuuu [double] m×q array, third order reduced form. +! prhs[11] ghxxu [double] m×n²q array, third order reduced form. +! prhs[12] ghxuu [double] m×nq² array, third order reduced form. +! prhs[13] ghxss [double] m×n array, third order reduced form. +! prhs[14] ghuss [double] m×q array, third order reduced form. +! prhs[15] yhat2 [double] [OPTIONAL] 2n×s array, time t particles up to 2nd order pruning additional latent variables. The first n rows concern the pruning first-order latent variables, while the last n rows concern the pruning 2nd-order latent variables +! prhs[16] ss [double] [OPTIONAL] m×1 array, steady state for the union of the states and the observed variables (needed for the pruning mode). +! prhs[15 or 17] [double] num of threads +! +! Output: +! plhs[1] y3 [double] m×s array, time t+1 particles. +! plhs[2] y2 [double] 2m×s array, time t+1 particles for the pruning latent variables up to 2nd order. The first m rows concern the pruning first-order latent variables, while the last m rows concern the pruning 2nd-order latent variables +subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction') + use iso_c_binding + use matlab_mex + use pascal + use partitions + use pthread + use pparticle_3 + implicit none + + type(c_ptr), dimension(*), intent(in), target :: prhs + type(c_ptr), dimension(*), intent(out) :: plhs + integer(c_int), intent(in), value :: nlhs, nrhs + integer :: n, m, s, q, numthreads + real(real64), pointer, contiguous :: ghx(:,:), ghu(:,:), ghxx(:,:), & + &ghuu(:,:), ghxu(:,:), ghxxx(:,:), ghuuu(:,:), ghxxu(:,:), & + &ghxuu(:,:), ghxss(:,:), ghuss(:,:), yhatlat(:,:), ylat(:,:) + integer :: i, j, k, xx_size, uu_size, xxx_size, uuu_size, rc + character(kind=c_char, len=10) :: arg_nber + type(pascal_triangle) :: p + integer, allocatable :: xx_nbeq(:), xxx_nbeq(:), & + &uu_nbeq(:), uuu_nbeq(:), xx_off(:), uu_off(:), & + &xxx_off(:), uuu_off(:) + type(c_pthread_t), allocatable :: threads(:) + integer, allocatable, target :: routines(:) + + ! 0. Checking the consistency and validity of input arguments + if (nrhs /= 15 .and. nrhs /= 17) then + call mexErrMsgTxt("Must have exactly 15 inputs or 18 inputs") + end if + + if (nlhs > 2) then + call mexErrMsgTxt("Too many output arguments.") + end if + + do i=1, max(14, nrhs-1) + if (.not. (c_associated(prhs(i)) .and. mxIsDouble(prhs(i)) .and. & + (.not. mxIsComplex(prhs(i))) .and. (.not. mxIsSparse(prhs(i))))) then + write (arg_nber,"(i2)") i + call mexErrMsgTxt("Argument " // trim(arg_nber) // " should be a real dense matrix") + end if + end do + + i = max(15,nrhs) + if (.not. (c_associated(prhs(i)) .and. mxIsScalar(prhs(i)) .and. & + mxIsNumeric(prhs(i)))) then + write (arg_nber,"(i2)") i + call mexErrMsgTxt("Argument " // trim(arg_nber) // " should be a numeric scalar") + end if + numthreads = int(mxGetScalar(prhs(i))) + if (numthreads <= 0) then + write (arg_nber,"(i2)") i + call mexErrMsgTxt("Argument " // trim(arg_nber) // " should be a positive integer") + end if + td3%numthreads = numthreads + + n = int(mxGetM(prhs(1))) ! Number of states. + s = int(mxGetN(prhs(1))) ! Number of particles. + q = int(mxGetM(prhs(2))) ! Number of innovations. + m = int(mxGetM(prhs(3))) ! Number of elements in the union of states and observed variables. + td3%n = n + td3%s = s + td3%q = q + td3%m = m + + if ((s /= mxGetN(prhs(2))) & ! Number of columns for epsilon + &.or. (n /= mxGetN(prhs(3))) & ! Number of columns for ghx + &.or. (m /= mxGetM(prhs(4))) & ! Number of rows for ghu + &.or. (q /= mxGetN(prhs(4))) & ! Number of columns for ghu + &.or. (m /= mxGetM(prhs(5))) & ! Number of rows for 2nd order constant correction + deterministic steady state + &.or. (m /= mxGetM(prhs(6))) & ! Number of rows for ghxx + &.or. (n*n /= mxGetN(prhs(6))) & ! Number of columns for ghxx + &.or. (m /= mxGetM(prhs(7))) & ! Number of rows for ghuu + &.or. (q*q /= mxGetN(prhs(7))) & ! Number of columns for ghuu + &.or. (m /= mxGetM(prhs(8))) & ! Number of rows for ghxu + &.or. (n*q /= mxGetN(prhs(8))) & ! Number of columns for ghxu + &.or. (m /= mxGetM(prhs(9))) & ! Number of rows for ghxxx + &.or. (n*n*n /= mxGetN(prhs(9))) & ! Number of columns for ghxxx + &.or. (m /= mxGetM(prhs(10))) & ! Number of rows for ghuuu + &.or. (q*q*q /= mxGetN(prhs(10))) & ! Number of columns for ghuuu + &.or. (m /= mxGetM(prhs(11))) & ! Number of rows for ghxxu + &.or. (n*n*q /= mxGetN(prhs(11))) & ! Number of columns for ghxxu + &.or. (m /= mxGetM(prhs(12))) & ! Number of rows for ghxuu + &.or. (n*q*q /= mxGetN(prhs(12))) & ! Number of columns for ghxuu + &.or. (m /= mxGetM(prhs(13))) & ! Number of rows for ghxss + &.or. (n /= mxGetN(prhs(13))) & ! Number of columns for ghxss + &.or. (m /= mxGetM(prhs(14))) & ! Number of rows for ghuss + &.or. (q /= mxGetN(prhs(14))) & ! Number of columns for ghuss + &.or. ((nrhs == 17) & ! With pruning optional inputs + &.and. ((2*n /= mxGetM(prhs(15))) & ! Number of rows for yhat2 + &.or. (s /= mxGetN(prhs(15))) & ! Number of columns for yhat2 + &.or. (m /= mxGetM(prhs(16)))))) then ! Number of rows for ss + ! &) then + call mexErrMsgTxt("Input dimension mismatch") + end if + + ! 1. Getting relevant information to take advantage of symmetries + ! There are symmetries in the ghxx, ghuu, ghxxx, ghuuu, ghxxu and ghxuu terms + ! that we may exploit to avoid unnecessarily repeating operations in matrix-vector + ! multiplications, e.g in ghxx·ŷ⊗ŷ. + ! In matrix-vector multiplications such as ghxx·ŷ⊗ŷ, we loop through all the folded offsets + ! and thus need for each one of them : + ! (i) the corresponding folded index, e.g (α₁,α₂), α₁≤α₂ for ghxx + ! (i) the corresponding offset in the unfolded matrix + ! (ii) the corresponding number of equivalent unfolded indices (1 if α₁=α₂, 2 otherwise) + ! It is better to compute these beforehand as it avoids repeating the calculation for + ! each particle. The `folded_offset_loop` routine carries out this operation. + + p = pascal_triangle(max(n,q)+3-1) + xx_size = get(2,n+2-1,p) + uu_size = get(2,q+2-1,p) + xxx_size = get(3,n+3-1,p) + uuu_size = get(3,q+3-1,p) + + td3%xx_size = xx_size + td3%uu_size = uu_size + td3%xxx_size = xxx_size + td3%uuu_size = uuu_size + + allocate(td3%xx_idcs(xx_size), td3%uu_idcs(uu_size), & + &td3%xxx_idcs(xxx_size), td3%uuu_idcs(uuu_size), & + &xx_off(xx_size), uu_off(uu_size), & + &xxx_off(xxx_size), uuu_off(uuu_size), & + &xx_nbeq(xx_size), uu_nbeq(uu_size), & + &xxx_nbeq(xxx_size), uuu_nbeq(uuu_size)) + + call folded_offset_loop(td3%xx_idcs, xx_nbeq, & + &xx_off, n, 2, p) + call folded_offset_loop(td3%uu_idcs, uu_nbeq, & + &uu_off, q, 2, p) + call folded_offset_loop(td3%xxx_idcs, xxx_nbeq, & + &xxx_off, n, 3, p) + call folded_offset_loop(td3%uuu_idcs, uuu_nbeq, & + &uuu_off, q, 3, p) + + ! 1. Storing the relevant input variables in Fortran + td3%yhat3(1:n,1:s) => mxGetPr(prhs(1)) + td3%e(1:q,1:s) => mxGetPr(prhs(2)) + ghx(1:m,1:n) => mxGetPr(prhs(3)) + ghu(1:m,1:q) => mxGetPr(prhs(4)) + td3%constant => mxGetPr(prhs(5)) + ghxx(1:m,1:n*n) => mxGetPr(prhs(6)) + ghuu(1:m,1:q*q) => mxGetPr(prhs(7)) + ghxu(1:m,1:n*q) => mxGetPr(prhs(8)) + ghxxx(1:m,1:n*n*n) => mxGetPr(prhs(9)) + ghuuu(1:m,1:q*q*q) => mxGetPr(prhs(10)) + ghxxu(1:m,1:n*n*q) => mxGetPr(prhs(11)) + ghxuu(1:m,1:n*q*q) => mxGetPr(prhs(12)) + ghxss(1:m,1:n) => mxGetPr(prhs(13)) + ghuss(1:m,1:q) => mxGetPr(prhs(14)) + if (nrhs == 17) then + yhatlat(1:2*n,1:s) => mxGetPr(prhs(15)) + td3%yhat1 => yhatlat(1:n,1:s) + td3%yhat2 => yhatlat(n+1:2*n,1:s) + td3%ss => mxGetPr(prhs(16)) + end if + + ! Getting a transposed folded copy of the unfolded tensors + ! for future loops to be more efficient + allocate(td3%ghx(n,m), td3%ghu(q,m),& + &td3%ghuu(uu_size,m), td3%ghxu(n*q,m), & + &td3%ghxx(xx_size,m), & + &td3%ghxxx(xxx_size,m), td3%ghuuu(uuu_size,m), & + &td3%ghxxu(xx_size*q,m), td3%ghxuu(n*uu_size,m), & + &td3%ghxss(n,m), td3%ghuss(q,m)) + do i=1,m + do j=1,n + td3%ghx(j,i) = ghx(i,j) + td3%ghxss(j,i) = ghxss(i,j) + td3%ghxx(j,i) = xx_nbeq(j)*ghxx(i,xx_off(j)) + td3%ghxxx(j,i) = xxx_nbeq(j)*ghxxx(i,xxx_off(j)) + do k=1,q + td3%ghxu(q*(j-1)+k,i) = ghxu(i,q*(j-1)+k) + td3%ghxxu(q*(j-1)+k,i) = xx_nbeq(j)*ghxxu(i,q*(xx_off(j)-1)+k) + end do + end do + do j=n+1,xx_size + td3%ghxx(j,i) = xx_nbeq(j)*ghxx(i,xx_off(j)) + td3%ghxxx(j,i) = xxx_nbeq(j)*ghxxx(i,xxx_off(j)) + do k=1,q + td3%ghxxu(q*(j-1)+k,i) = xx_nbeq(j)*ghxxu(i,q*(xx_off(j)-1)+k) + end do + end do + do j=xx_size+1,xxx_size + td3%ghxxx(j,i) = xxx_nbeq(j)*ghxxx(i,xxx_off(j)) + end do + do j=1,q + td3%ghu(j,i) = ghu(i,j) + td3%ghuss(j,i) = ghuss(i,j) + td3%ghuu(j,i) = uu_nbeq(j)*ghuu(i,uu_off(j)) + td3%ghuuu(j,i) = uuu_nbeq(j)*ghuuu(i,uuu_off(j)) + do k=1,n + td3%ghxuu(uu_size*(k-1)+j,i) = uu_nbeq(j)*ghxuu(i,q*q*(k-1)+uu_off(j)) + end do + end do + do j=q+1,uu_size + td3%ghuu(j,i) = uu_nbeq(j)*ghuu(i,uu_off(j)) + td3%ghuuu(j,i) = uuu_nbeq(j)*ghuuu(i,uuu_off(j)) + do k=1,n + td3%ghxuu(uu_size*(k-1)+j,i) = uu_nbeq(j)*ghxuu(i,q*q*(k-1)+uu_off(j)) + end do + end do + do j=uu_size+1,uuu_size + td3%ghuuu(j,i) = uuu_nbeq(j)*ghuuu(i,uuu_off(j)) + end do + end do + + ! 3. Implementing the calculations: + + plhs(1) = mxCreateDoubleMatrix(int(m, mwSize), int(s, mwSize), mxREAL) + td3%y3(1:m,1:s) => mxGetPr(plhs(1)) + if (nrhs == 17) then + plhs(2) = mxCreateDoubleMatrix(int(2*m, mwSize), int(s, mwSize), mxREAL) + ylat(1:2*m,1:s) => mxGetPr(plhs(2)) + td3%y1 => ylat(1:m,1:s) + td3%y2 => ylat(m+1:2*m,1:s) + end if + + allocate(threads(numthreads), routines(numthreads)) + routines = [ (i, i = 1, numthreads) ] + + if (numthreads == 1) then + if (nrhs == 17) then + call thread_eval_3_pruning(c_loc(routines(1))) + else + call thread_eval_3(c_loc(routines(1))) + end if + else + ! Creating the threads + if (nrhs == 17) then + do i = 1, numthreads + rc = c_pthread_create(threads(i), c_null_ptr, c_funloc(thread_eval_3_pruning), c_loc(routines(i))) + end do + else + do i = 1, numthreads + rc = c_pthread_create(threads(i), c_null_ptr, c_funloc(thread_eval_3), c_loc(routines(i))) + end do + end if + + ! Joining the threads + do i = 1, numthreads + rc = c_pthread_join(threads(i), c_loc(routines(i))) + end do + end if + +end subroutine mexFunction \ No newline at end of file diff --git a/mex/sources/local_state_space_iterations/local_state_space_iteration_k.f08 b/mex/sources/local_state_space_iterations/local_state_space_iteration_k.f08 index e7211ffb26..b1632da674 100644 --- a/mex/sources/local_state_space_iterations/local_state_space_iteration_k.f08 +++ b/mex/sources/local_state_space_iterations/local_state_space_iteration_k.f08 @@ -15,6 +15,69 @@ ! You should have received a copy of the GNU General Public License ! along with Dynare. If not, see <https://www.gnu.org/licenses/>. +! Routines and data structures for multithreading over particles in local_state_space_iteration_k +module pparticle + use iso_c_binding + use simulation + use matlab_mex + + implicit none + + type tdata + integer :: nm, nys, endo_nbr, nvar, order, nrestricted, nparticles + real(real64), allocatable :: yhat(:,:), e(:,:), ynext(:,:), ys_reordered(:), restrict_var_list(:) + type(pol), dimension(:), allocatable :: udr + end type tdata + + type(tdata) :: thread_data + +contains + + subroutine thread_eval(arg) bind(c) + type(c_ptr), intent(in), value :: arg + integer, pointer :: im + integer :: i, j, start, end, q, r, ind + type(horner), dimension(:), allocatable :: h + real(real64), dimension(:), allocatable :: dyu + + ! Checking that the thread number got passed as argument + if (.not. c_associated(arg)) then + call mexErrMsgTxt("No argument was passed to thread_eval") + end if + call c_f_pointer(arg, im) + + ! Allocating local arrays + allocate(h(0:thread_data%order), dyu(thread_data%nvar)) + do i=0, thread_data%order + allocate(h(i)%c(thread_data%endo_nbr, thread_data%nvar**i)) + end do + + ! Specifying bounds for the curent thread + q = thread_data%nparticles / thread_data%nm + r = mod(thread_data%nparticles, thread_data%nm) + start = (im-1)*q+1 + if (im < thread_data%nm) then + end = start+q-1 + else + end = thread_data%nparticles + end if + + ! Using the Horner algorithm to evaluate the decision rule at the chosen yhat and epsilon + do j=start,end + dyu(1:thread_data%nys) = thread_data%yhat(:,j) + dyu(thread_data%nys+1:) = thread_data%e(:,j) + call eval(h, dyu, thread_data%udr, thread_data%endo_nbr, thread_data%nvar, thread_data%order) + do i=1,thread_data%nrestricted + ind = int(thread_data%restrict_var_list(i)) + thread_data%ynext(i,j) = h(0)%c(ind,1) + thread_data%ys_reordered(ind) + end do + end do + + end subroutine thread_eval + +end module pparticle + +! The code of the local_state_space_iteration_k routine ! input: ! yhat values of endogenous variables ! epsilon values of the exogenous shock @@ -24,7 +87,6 @@ ! udr struct containing the model unfolded tensors ! output: ! ynext simulated next-period results - subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction') use iso_fortran_env use iso_c_binding diff --git a/tests/Makefile.am b/tests/Makefile.am index 552e137fe9..d2e5007dfe 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -652,7 +652,8 @@ PARTICLEFILES = \ particle/dsge_base2.mod \ particle/first_spec.mod \ particle/first_spec_MCMC.mod \ - particle/local_state_space_iteration_k_test.mod + particle/local_state_space_iteration_k_test.mod \ + particle/local_state_space_iteration_3_test.mod XFAIL_MODFILES = ramst_xfail.mod \ @@ -959,6 +960,9 @@ discretionary_policy/dennis_1_estim.o.trs: discretionary_policy/dennis_1.o.trs discretionary_policy/Gali_2015_chapter_3_nonlinear.m.trs: discretionary_policy/Gali_2015_chapter_3.m.trs discretionary_policy/Gali_2015_chapter_3_nonlinear.o.trs: discretionary_policy/Gali_2015_chapter_3.o.trs +particle/local_state_space_iteration_3_test.m.trs: particle/first_spec.m.trs +particle/local_state_space_iteration_3_test.o.trs: particle/first_spec.o.trs + particle/local_state_space_iteration_k_test.m.trs: particle/first_spec.m.trs particle/local_state_space_iteration_k_test.o.trs: particle/first_spec.o.trs diff --git a/tests/particle/local_state_space_iteration_3_test.mod b/tests/particle/local_state_space_iteration_3_test.mod new file mode 100644 index 0000000000..f6b5a20e00 --- /dev/null +++ b/tests/particle/local_state_space_iteration_3_test.mod @@ -0,0 +1,92 @@ +/* + Tests that local_state_space_iteration_3 and local_state_space_iteration_k (for k=3) return the same results + + This file must be run after first_spec.mod (both are based on the same model). +*/ + +@#include "first_spec_common.inc" + +varobs q ca; + +shocks; +var eeps = 0.04^2; +var nnu = 0.03^2; +var q = 0.01^2; +var ca = 0.01^2; +end; + +// Initialize various structures +estimation(datafile='my_data.mat',order=3,mode_compute=0,mh_replic=0,filter_algorithm=sis,nonlinear_filter_initialization=2 + ,cova_compute=0 %tell program that no covariance matrix was computed +); + +stoch_simul(order=3, periods=200, irf=0, k_order_solver); + +// Really perform the test + +nparticles = options_.particle.number_of_particles; +nsims = 1e6/nparticles; + +/* We generate particles using realistic distributions (though this is not + strictly needed) */ +nstates = M_.npred + M_.nboth; +state_idx = oo_.dr.order_var((M_.nstatic+1):(M_.nstatic+nstates)); +yhat = chol(oo_.var(state_idx,state_idx))*randn(nstates, nparticles); +epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles); + +dr = oo_.dr; + + +// “rf” stands for “Reduced Form” +rf_ghx = dr.ghx(dr.restrict_var_list, :); +rf_ghu = dr.ghu(dr.restrict_var_list, :); +rf_constant = dr.ys(dr.order_var)+0.5*dr.ghs2; +rf_constant = rf_constant(dr.restrict_var_list, :); +rf_ghxx = dr.ghxx(dr.restrict_var_list, :); +rf_ghuu = dr.ghuu(dr.restrict_var_list, :); +rf_ghxu = dr.ghxu(dr.restrict_var_list, :); +rf_ghxxx = dr.ghxxx(dr.restrict_var_list, :); +rf_ghuuu = dr.ghuuu(dr.restrict_var_list, :); +rf_ghxxu = dr.ghxxu(dr.restrict_var_list, :); +rf_ghxuu = dr.ghxuu(dr.restrict_var_list, :); +rf_ghxss = dr.ghxss(dr.restrict_var_list, :); +rf_ghuss = dr.ghuss(dr.restrict_var_list, :); + +options_.threads.local_state_space_iteration_3 = 12; +options_.threads.local_state_space_iteration_k = 12; + +% Without pruning +tStart1 = tic; for i=1:nsims, ynext1 = local_state_space_iteration_3(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, rf_ghxxx, rf_ghuuu, rf_ghxxu, rf_ghxuu, rf_ghxss, rf_ghuss, options_.threads.local_state_space_iteration_3); end, tElapsed1 = toc(tStart1); +tStart2 = tic; [udr] = folded_to_unfolded_dr(dr, M_, options_); for i=1:nsims, ynext2 = local_state_space_iteration_k(yhat, epsilon, dr, M_, options_, udr); end, tElapsed2 = toc(tStart2); + +if max(max(abs(ynext1-ynext2))) > 1e-10 + error('Without pruning: Inconsistency between local_state_space_iteration_3 and local_state_space_iteration_k') +end + +if tElapsed1<tElapsed2 + skipline() + dprintf('Without pruning: local_state_space_iteration_3 is %5.2f times faster than local_state_space_iteration_k', tElapsed2/tElapsed1) + skipline() +else + skipline() + dprintf('Without pruning: local_state_space_iteration_3 is %5.2f times slower than local_state_space_iteration_k', tElapsed1/tElapsed2) + skipline() +end + +% With pruning +rf_ss = dr.ys(dr.order_var); +rf_ss = rf_ss(dr.restrict_var_list, :); +yhat_ = chol(oo_.var(state_idx,state_idx))*randn(nstates, nparticles); +yhat__ = zeros(2*nstates,nparticles); +yhat__(1:nstates,:) = yhat_; +yhat__(nstates+1:2*nstates,:) = chol(oo_.var(state_idx,state_idx))*randn(nstates, nparticles); +nstatesandobs = size(rf_ghx,1); + +[ynext1,ynext1_lat] = local_state_space_iteration_3(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, rf_ghxxx, rf_ghuuu, rf_ghxxu, rf_ghxuu, rf_ghxss, rf_ghuss, yhat__, rf_ss, options_.threads.local_state_space_iteration_3); +[ynext2,ynext2_lat] = local_state_space_iteration_2(yhat__(nstates+1:2*nstates,:), epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, yhat_, rf_ss, options_.threads.local_state_space_iteration_2); +if max(max(abs(ynext1_lat(nstatesandobs+1:2*nstatesandobs,:)-ynext2))) > 1e-14 + error('With pruning: inconsistency between local_state_space_iteration_2 and local_state_space_iteration_3 on the 2nd-order pruned variable') +end +if max(max(abs(ynext1_lat(1:nstatesandobs,:)-ynext2_lat))) > 1e-14 + error('With pruning: inconsistency between local_state_space_iteration_2 and local_state_space_iteration_3 on the 1st-order pruned variable') +end \ No newline at end of file -- GitLab