diff --git a/.gitignore b/.gitignore
index d5aefb1d2c00323f642f0756809e3b4444b85709..8c1eedbb65f1fe1514fea5c821d98be32b02084f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,10 +90,12 @@ doc/internals/ltxpng
 /mex/build/matlab/*/*.cc
 /mex/build/matlab/*/*.f08
 /mex/build/matlab/*/*.F08
+/mex/build/matlab/*/*.S
 /mex/build/octave/*/*.c
 /mex/build/octave/*/*.cc
 /mex/build/octave/*/*.f08
 /mex/build/octave/*/*.F08
+/mex/build/octave/*/*.S
 
 # Fortran modules
 /mex/build/matlab/*/*.mod
diff --git a/mex/build/local_state_space_iterations.am b/mex/build/local_state_space_iterations.am
index e9281345f32772c79e6565f1223d300a86ccd521..8068e72b3f2a06269971d5bb093becb9d6cdffd8 100644
--- a/mex/build/local_state_space_iterations.am
+++ b/mex/build/local_state_space_iterations.am
@@ -1,9 +1,10 @@
 mex_PROGRAMS = local_state_space_iteration_2 local_state_space_iteration_3 local_state_space_iteration_k
 
-nodist_local_state_space_iteration_2_SOURCES = local_state_space_iteration_2.cc
+nodist_local_state_space_iteration_2_SOURCES = local_state_space_iteration_2.cc lssi2_avx2.S lssi2_avx512.S
 nodist_local_state_space_iteration_3_SOURCES = local_state_space_iteration_3.f08
 nodist_local_state_space_iteration_k_SOURCES = local_state_space_iteration_k.f08
 
+local_state_space_iteration_2_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/../../sources/local_state_space_iterations
 local_state_space_iteration_2_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp
 local_state_space_iteration_2_LDFLAGS = $(AM_LDFLAGS) $(OPENMP_LDFLAGS)
 
@@ -24,4 +25,7 @@ CLEANFILES = $(nodist_local_state_space_iteration_2_SOURCES) \
 	$(LN_S) -f $< $@
 
 %.cc: $(top_srcdir)/../../sources/local_state_space_iterations/%.cc
-	$(LN_S) -f $< $@
\ No newline at end of file
+	$(LN_S) -f $< $@
+
+%.S: $(top_srcdir)/../../sources/local_state_space_iterations/%.S
+	$(LN_S) -f $< $@
diff --git a/mex/build/matlab/configure.ac b/mex/build/matlab/configure.ac
index 052fd0084c317191e099c473800bc65c3c98b263..cf91e038a47866d4ef0760a8f31ed47978e5d60f 100644
--- a/mex/build/matlab/configure.ac
+++ b/mex/build/matlab/configure.ac
@@ -52,6 +52,7 @@ AC_PROG_RANLIB
 AX_PROG_LN_S
 AC_PROG_MKDIR_P
 AM_PROG_AR
+AM_PROG_AS
 
 AX_CXX11_THREAD
diff --git a/mex/build/octave/configure.ac b/mex/build/octave/configure.ac
index 16b2b34f2c6fc999edd47b6122430594e09c578e..86be71ca40629bda3709f8e5bdc406320e114e0e 100644
--- a/mex/build/octave/configure.ac
+++ b/mex/build/octave/configure.ac
@@ -50,6 +50,7 @@ AC_PROG_RANLIB
 AX_PROG_LN_S
 AC_PROG_MKDIR_P
 AM_PROG_AR
+AM_PROG_AS
 
 AX_CXX11_THREAD
diff --git a/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc b/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc
index cd0253ca5267cb4e2fcf8a6ec8544d2acdc28e4b..e81a7f3cd2f79fd52a0dfee8e92d94c8266c436b 100644
--- a/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc
+++ b/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc
@@ -26,12 +26,16 @@
 #include <algorithm>
 #include <tuple>
 #include <string>
+#include <thread>
+#include <cstdlib>
 
 #include <dynmex.h>
 #include <dynblas.h>
 
 #include <omp.h>
 
+#include "lssi2.hh"
+
 /* Uncomment the following line to use BLAS instead of loops when computing
    ghx·ŷ and ghu·ε.
@@ -136,18 +140,22 @@ ss2Iteration_pruning(double *y2, double *y1, const double *yhat2, const double *
     }
 }
 
+/* Generic kernel for the iteration without pruning.
+   More specialized kernels are implemented in assembly. */
 void
-ss2Iteration(double *y, const double *yhat, const double *epsilon,
-             const double *ghx, const double *ghu,
-             const double *constant, const double *ghxx, const double *ghuu, const double *ghxu,
-             blas_int m, blas_int n, blas_int q, blas_int s, int number_of_threads)
+lssi2_generic(double *y, const double *yhat, const double *epsilon,
+              const double *ghx, const double *ghu,
+              const double *constant, const double *ghxx, const double *ghuu, const double *ghxu,
+              blas_int m, blas_int n, blas_int q, blas_int s,
+              int number_of_threads)
 {
+  auto [ii1, ii2, ii3] = set_vector_of_indices(n, m); // vector indices for ghxx
+  auto [jj1, jj2, jj3] = set_vector_of_indices(q, m); // vector indices for ghuu
+
 #ifdef USE_BLAS_AT_FIRST_ORDER
   const double one = 1.0;
   const blas_int ONE = 1;
 #endif
-  auto [ii1, ii2, ii3] = set_vector_of_indices(n, m); // vector indices for ghxx
-  auto [jj1, jj2, jj3] = set_vector_of_indices(q, m); // vector indices for ghuu
 #pragma omp parallel for num_threads(number_of_threads)
   for (int particle = 0; particle < s; particle++)
     {
@@ -197,6 +205,65 @@ ss2Iteration(double *y, const double *yhat, const double *epsilon,
     }
 }
 
+void
+ss2Iteration(double *y, const double *yhat, const double *epsilon,
+             const double *ghx, const double *ghu,
+             const double *constant, const double *ghxx, const double *ghuu, const double *ghxu,
+             blas_int m, blas_int n, blas_int q, blas_int s, int number_of_threads)
+{
+  std::string kernel{"auto"};
+  if (char *kstr = getenv("DYNARE_LSSI2_KERNEL"); kstr && *kstr != '\0')
+    kernel = kstr;
+
+  // Runtime selection of kernel
+#if defined(__x86_64__) && defined(__LP64__)
+  if ((kernel == "auto" && __builtin_cpu_supports("avx512f"))
+      || kernel == "avx512")
+    {
+      int it_total {s/8};
+      int it_per_thread {it_total / number_of_threads};
+      std::vector<std::jthread> threads;
+      for (int i {0}; i < number_of_threads; i++)
+        {
+          int offset {i*it_per_thread*8};
+          int s2 {i == number_of_threads - 1 ? it_total*8 - offset : it_per_thread*8};
+          threads.emplace_back(lssi2_avx512, y+offset*m, yhat+offset*n, epsilon+offset*q,
+                               ghx, ghu, constant, ghxx, ghuu, ghxu,
+                               static_cast<int>(m), static_cast<int>(n), static_cast<int>(q), s2);
+        }
+      if (int rem {s % 8}; rem != 0)
+        // If s is not a multiple of 8, use the generic routine to finish the computation
+        lssi2_generic(y+(s-rem)*m, yhat+(s-rem)*n, epsilon+(s-rem)*q,
+                      ghx, ghu, constant, ghxx, ghuu, ghxu, m, n, q, rem, 1);
+    }
+  else if ((kernel == "auto" && __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma"))
+           || kernel == "avx2")
+    {
+      int it_total {s/4};
+      int it_per_thread {it_total / number_of_threads};
+      std::vector<std::jthread> threads;
+      for (int i {0}; i < number_of_threads; i++)
+        {
+          int offset {i*it_per_thread*4};
+          int s2 {i == number_of_threads - 1 ? it_total*4 - offset : it_per_thread*4};
+          threads.emplace_back(lssi2_avx2, y+offset*m, yhat+offset*n, epsilon+offset*q,
+                               ghx, ghu, constant, ghxx, ghuu, ghxu,
+                               static_cast<int>(m), static_cast<int>(n), static_cast<int>(q), s2);
+        }
+      if (int rem {s % 4}; rem != 0)
+        // If s is not a multiple of 4, use the generic routine to finish the computation
+        lssi2_generic(y+(s-rem)*m, yhat+(s-rem)*n, epsilon+(s-rem)*q,
+                      ghx, ghu, constant, ghxx, ghuu, ghxu, m, n, q, rem, 1);
+    }
+  else
+#endif
+    if (kernel == "auto" || kernel == "generic")
+      lssi2_generic(y, yhat, epsilon, ghx, ghu, constant, ghxx, ghuu, ghxu,
+                    m, n, q, s, number_of_threads);
+    else
+      mexErrMsgTxt(("Unknown kernel: " + kernel).c_str());
+}
+
 void
 mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
 {
diff --git a/mex/sources/local_state_space_iterations/lssi2.hh b/mex/sources/local_state_space_iterations/lssi2.hh
new file mode 100644
index 0000000000000000000000000000000000000000..339c637330da5f113153395f2ebdcaa45099b673
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2.hh
@@ -0,0 +1,48 @@
+// Prototypes for state space iteration at order 2 routines written in assembly
+
+/*
+ * Copyright © 2022 Dynare Team
+ *
+ * This file is part of Dynare.
+ *
+ * Dynare is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Dynare is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Dynare. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+  WARNING: s must be a multiple of 4 for the AVX2 routines.
+
+  Sizes of the vector/matrix arguments:
+  – y: m×s
+  – yhat: n×s
+  – epsilon: q×s
+  – ghx: m×n
+  – constant: m
+  – ghxx: m×n²
+  – ghuu: m×q²
+  – ghxu: m×nq
+*/
+
+#if defined(__x86_64__) && defined(__LP64__)
+
+extern "C" void lssi2_avx2(double *y, const double *yhat, const double *epsilon,
+                           const double *ghx, const double *ghu, const double *constant,
+                           const double *ghxx, const double *ghuu, const double *ghxu,
+                           int m, int n, int q, int s);
+
+extern "C" void lssi2_avx512(double *y, const double *yhat, const double *epsilon,
+                             const double *ghx, const double *ghu, const double *constant,
+                             const double *ghxx, const double *ghuu, const double *ghxu,
+                             int m, int n, int q, int s);
+
+#endif
diff --git a/mex/sources/local_state_space_iterations/lssi2_avx2.S b/mex/sources/local_state_space_iterations/lssi2_avx2.S
new file mode 100644
index 0000000000000000000000000000000000000000..9a4d521bbfa62d223475c807a42c4dc18f96b138
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2_avx2.S
@@ -0,0 +1,348 @@
+/*
+ * Local state-space iteration at order 2, without pruning.
+ * Specialized kernel in x86-64 assembly using AVX2 and FMA extensions.
+ * See the function prototype in lssi2.hh.
+ *
+ * WARNING: the number of particles must be a multiple of 4.
+ *
+ * TODO:
+ * – Implement Windows ABI as an alternative
+ */
+
+/*
+ * Copyright © 2022 Dynare Team
+ *
+ * This file is part of Dynare.
+ *
+ * Dynare is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Dynare is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Dynare. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#if defined(__x86_64__) && defined(__LP64__)
+
+        .intel_syntax noprefix
+
+        ## Enforce required CPU features, so that GNU as errors out if we use
+        ## instructions not in that feature set
+        .arch generic64
+        .arch .avx2
+        .arch .fma
+
+
+### Some useful macros
+#include "lssi2_common.S"
+
+
+### Some pre-defined vector constants
+        .section .rodata
+
+        .align 8
+double_1:
+        .double 1.
+double_0p5:
+        .double 0.5
+
+        .align 16
+v4_int_0_1_2_3:
+        .int 0, 1, 2, 3
+
+
+### Layout of the current stack frame and of the beginning of the previous
+### one (for arguments passed through the stack).
+### This defines offset symbols through which variables and arguments can
+### be conveniently referred.
+        .struct 0
+constant_offset:
+        .space 8
+ghu_offset:
+        .space 8
+ghx_offset:
+        .space 8
+saved_regs_offset:
+        .space 6*8
+return_address_offset:
+        .space 8
+        ## Beginning of the previous stack frame
+ghxx_offset:
+        .space 8
+ghuu_offset:
+        .space 8
+ghxu_offset:
+        .space 8
+m_offset:
+        .space 8
+n_offset:
+        .space 8
+q_offset:
+        .space 8
+s_offset:
+        .space 8
+
+
+### Function body
+        .text
+        .global lssi2_avx2
+lssi2_avx2:
+        .cfi_startproc
+        .set pushed_regs,0
+        push_cfi_reg rbp
+        push_cfi_reg rbx
+        push_cfi_reg r12
+        push_cfi_reg r13
+        push_cfi_reg r14
+        push_cfi_reg r15
+        .if return_address_offset-saved_regs_offset != pushed_regs*8
+        .error "Incorrect stack frame layout regarding saved registers"
+        .endif
+
+        ## Allocate space for variables in the current stack frame
+        sub rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset saved_regs_offset
+
+        ## Define more explicit names for registers that will remain associated
+        ## to input arguments
+        .set y,rdi
+        .set yhat,rsi
+        .set epsilon,rdx
+        .set m,r12d
+        .set n,r13d
+        .set q,r14d
+        .set s,r15d
+
+        ## Initialize those registers that need it
+        mov m,[rsp+m_offset]  # number of states + observed vars
+        mov n,[rsp+n_offset]  # number of states
+        mov q,[rsp+q_offset]  # number of shocks
+        mov s,[rsp+s_offset]  # number of particles
+
+        ## Conversely save some arguments to the stack
+        mov [rsp+ghx_offset],rcx
+        mov [rsp+ghu_offset],r8
+        mov [rsp+constant_offset],r9
+
+        ## Do nothing if s=0
+        test s,s
+        jz done
+
+        ## Precompute 8m (used for moving pointers by one column inside y and gh* matrices)
+        mov r11d,m
+        shl r11,3  # Use a 64-bit op, since m could theoretically be 2³¹-1
+
+        ## Pre-compute some vectorized constants
+        vmovd xmm15,n
+        vpbroadcastd xmm15,xmm15  # xmm15 = [ int32: n (×4) ]
+        vmovd xmm14,q
+        vpbroadcastd xmm14,xmm14  # xmm14 = [ int32: q (×4) ]
+        mov rax,0x8000000000000000
+        vmovq xmm13,rax
+        vpbroadcastq ymm13,xmm13  # ymm13 = [ int64: 0x8000000000000000 (×4) ]
+
+        vmovdqa xmm12,[rip+v4_int_0_1_2_3]
+        vpmulld xmm12,xmm12,xmm15  # xmm12 = base VSIB for current particles in ŷ
+        vmovdqa xmm11,[rip+v4_int_0_1_2_3]
+        vpmulld xmm11,xmm11,xmm14  # xmm11 = base VSIB for current particles in ε
+
+        vpslld xmm15,xmm15,2  # xmm15 = [ int32: 4n (×4) ]
+        vpslld xmm14,xmm14,2  # xmm14 = [ int32: 4q (×4) ]
+
+        mov eax,1
+        vmovd xmm10,eax
+        vpbroadcastd xmm10,xmm10  # xmm10 = [ int32: 1 (×4) ]
+        vbroadcastsd ymm9,[rip+double_1]  # ymm9 = [ double: 1. (×4) ]
+        vbroadcastsd ymm8,[rip+double_0p5]  # ymm8 = [ double: 0.5 (×4) ]
+
+        ## Enter the main loop
+        xor ebx,ebx  # ebx = particle counter
+
+next_4_particles:
+        xor ecx,ecx  # ecx = variable counter
+
+next_variable:
+        ## Use ymm0 to store the 4 next-period particles for this specific variable
+        ## Initialize ymm0 to the constant
+        mov r8,[rsp+constant_offset]
+        vbroadcastsd ymm0,[r8+rcx*8]
+
+        ## Add ghx·ŷ
+        mov ebp,n  # ebp = number of remaining state variables
+        vmovdqa xmm1,xmm12  # xmm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghx_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghxᵢⱼ
+        .align 16  # Recommendation p. 537 of Kusswurm (2018)
+next_state:
+        vmovdqa ymm4,ymm13  # Mask for vgatherdpd (will be cleared by the instr.)
+        vgatherdpd ymm2,[yhat+xmm1*8],ymm4  # ymm2 = current particles for ŷⱼ
+        vbroadcastsd ymm3,[r8]  # ymm3 = ghxᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm2,ymm3  # Add ghxᵢⱼ·ŷⱼ to current particles
+        add r8,r11  # Move to next column in ghx
+        vpaddd xmm1,xmm1,xmm10  # Update VSIB for next state (j=j+1)
+        sub ebp,1
+        jnz next_state
+
+        ## Add ghu·ε
+        mov ebp,q  # ebp = number of remaining shocks
+        vmovdqa xmm1,xmm11  # xmm1 = VSIB for current particles in ε
+        mov r8,[rsp+ghu_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghuᵢⱼ
+        .align 16
+next_shock:
+        vmovdqa ymm4,ymm13
+        vgatherdpd ymm2,[epsilon+xmm1*8],ymm4  # ymm2 = current particles for εⱼ
+        vbroadcastsd ymm3,[r8]  # ymm3 = ghuᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm2,ymm3  # Add ghuᵢⱼ·εⱼ to current particles
+        add r8,r11  # Move to next column in ghu
+        vpaddd xmm1,xmm1,xmm10  # Update VSIB for next shock (j=j+1)
+        sub ebp,1
+        jnz next_shock
+
+        ## Add ½ghxx·ŷ⊗ŷ
+        xor ebp,ebp  # Index of first state (j₁)
+        mov r8,[rsp+ghxx_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghxxᵢⱼ
+        vmovdqa xmm1,xmm12  # xmm1 = VSIB for current particles in ŷⱼ₁
+        .align 16
+next_state_state_1:
+        mov r10d,ebp  # Index of second state (j₂)
+        vmovdqa xmm2,xmm1  # xmm2 = VSIB for current particles in ŷⱼ₂
+        .align 16
+next_state_state_2:
+        vmovdqa ymm3,ymm13
+        vgatherdpd ymm4,[yhat+xmm1*8],ymm3  # ymm4 = current particles for ŷⱼ₁
+        vmovdqa ymm5,ymm13
+        vgatherdpd ymm6,[yhat+xmm2*8],ymm5  # ymm6 = current particles for ŷⱼ₂
+        vmulpd ymm3,ymm4,ymm6  # ymm3 = particles for ŷⱼ₁·ŷⱼ₂
+        vpcmpeqd xmm7,xmm1,xmm2
+        vinserti128 ymm7,ymm7,xmm7,1  # ymm7 = all 1s if diagonal element, all 0s otherwise
+        ## There is a spurious dependency here due to ymm4 being reused (no more free registers)
+        ## Freeing ymm8 by using a memory read in the following blend does not improve things
+        vblendvpd ymm4,ymm9,ymm8,ymm7  # ymm4 = ½ if diagonal element, 1 otherwise
+        vmulpd ymm3,ymm4,ymm3  # ymm3 = ½ŷⱼ₁·ŷⱼ₂ if diagonal, ŷⱼ₁·ŷⱼ₂ otherwise
+        vbroadcastsd ymm7,[r8]  # ymm7 = ghxxᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm7,ymm3  # Add (½?)ghxxᵢⱼ·ŷⱼ₁·ŷⱼ₂ to current particles
+        vpaddd xmm2,xmm2,xmm10  # Update VSIB for next second state (j₂=j₂+1)
+        add r8,r11  # Move to next column in ghxx
+        add r10d,1
+        cmp r10d,n
+        jl next_state_state_2
+        vpaddd xmm1,xmm1,xmm10  # Update VSIB for next first state (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax  # Jump several columns in ghxx
+        cmp ebp,n
+        jl next_state_state_1
+
+        ## Add ½ghuu·ε⊗ε
+        xor ebp,ebp  # Index of first shock (j₁)
+        mov r8,[rsp+ghuu_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghuuᵢⱼ
+        vmovdqa xmm1,xmm11  # xmm1 = VSIB for current particles in εⱼ₁
+        .align 16
+next_shock_shock_1:
+        mov r10d,ebp  # Index of second shock (j₂)
+        vmovdqa xmm2,xmm1  # xmm2 = VSIB for current particles in εⱼ₂
+        .align 16
+next_shock_shock_2:
+        vmovdqa ymm3,ymm13
+        vgatherdpd ymm4,[epsilon+xmm1*8],ymm3  # ymm4 = current particles for εⱼ₁
+        vmovdqa ymm5,ymm13
+        vgatherdpd ymm6,[epsilon+xmm2*8],ymm5  # ymm6 = current particles for εⱼ₂
+        vmulpd ymm3,ymm4,ymm6  # ymm3 = particles for εⱼ₁·εⱼ₂
+        vpcmpeqd xmm7,xmm1,xmm2
+        vinserti128 ymm7,ymm7,xmm7,1  # ymm7 = all 1s if diagonal element, all 0s otherwise
+        ## Same remark as previous section regarding ymm4
+        vblendvpd ymm4,ymm9,ymm8,ymm7  # ymm4 = ½ if diagonal element, 1 otherwise
+        vmulpd ymm3,ymm4,ymm3  # ymm3 = ½εⱼ₁εⱼ₂ if diagonal, εⱼ₁εⱼ₂ otherwise
+        vbroadcastsd ymm7,[r8]  # ymm7 = ghuuᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm7,ymm3  # Add ghuuᵢⱼ·εⱼ₁εⱼ₂ to current particles
+        vpaddd xmm2,xmm2,xmm10  # Update VSIB for next second shock (j₂=j₂+1)
+        add r8,r11  # Move to next column in ghuu
+        add r10d,1
+        cmp r10d,q
+        jl next_shock_shock_2
+        vpaddd xmm1,xmm1,xmm10  # Update VSIB for next first shock (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax  # Jump several columns in ghuu
+        cmp ebp,q
+        jl next_shock_shock_1
+
+        ## Add ghxu·ŷ⊗ε
+        mov ebp,n  # ebp = number of remaining states
+        vmovdqa xmm1,xmm12  # xmm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghxu_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghxuᵢⱼ
+        .align 16
+next_state_2:
+        mov eax,q  # eax = number of remaining shocks
+        vmovdqa xmm2,xmm11  # xmm2 = VSIB for current particles in ε
+        vmovdqa ymm3,ymm13
+        vgatherdpd ymm4,[yhat+xmm1*8],ymm3  # ymm4 = current particles for ŷⱼ₁
+        .align 16
+next_shock_2:
+        vmovdqa ymm5,ymm13
+        vgatherdpd ymm6,[epsilon+xmm2*8],ymm5  # ymm6 = current particles for εⱼ₂
+        vbroadcastsd ymm7,[r8]  # ymm7 = ghxuᵢⱼ repeated 4 times
+        vmulpd ymm6,ymm4,ymm6  # ymm6 = particles for ŷⱼ₁εⱼ₂
+        vfmadd231pd ymm0,ymm7,ymm6  # Add ghxuᵢⱼ·ŷⱼ₁εⱼ₂ to current particles
+        add r8,r11  # Move to next column in ghxu
+        vpaddd xmm2,xmm2,xmm10  # Update VSIB for next shock (j₂=j₂+1)
+        sub eax,1
+        jnz next_shock_2
+        vpaddd xmm1,xmm1,xmm10  # Update VSIB for next state (j₁=j₁+1)
+        sub ebp,1
+        jnz next_state_2
+
+        ## Save updated particles to memory
+        mov eax,ebx
+        imul eax,m
+        add eax,ecx
+        lea r8,[y+rax*8]
+        vextractf128 xmm2,ymm0,1
+        vmovsd [r8],xmm0
+        add r8,r11
+        vpsrldq xmm1,xmm0,8
+        vmovsd [r8],xmm1
+        add r8,r11
+        vmovsd [r8],xmm2
+        add r8,r11
+        vpsrldq xmm3,xmm2,8
+        vmovsd [r8],xmm3
+
+        ## Loop over variables
+        add ecx,1
+        cmp ecx,m
+        jl next_variable
+
+        ## Loop over particles
+        add ebx,4
+        vpaddd xmm12,xmm12,xmm15  # Update base VSIB for ŷ
+        vpaddd xmm11,xmm11,xmm14  # Update base VSIB for ε
+        cmp ebx,s
+        jl next_4_particles
+
+done:
+        ## Cleanup
+        add rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset -saved_regs_offset
+        pop_cfi_reg r15
+        pop_cfi_reg r14
+        pop_cfi_reg r13
+        pop_cfi_reg r12
+        pop_cfi_reg rbx
+        pop_cfi_reg rbp
+        vzeroupper
+        ret
+        .cfi_endproc
+
+#endif
diff --git a/mex/sources/local_state_space_iterations/lssi2_avx512.S b/mex/sources/local_state_space_iterations/lssi2_avx512.S
new file mode 100644
index 0000000000000000000000000000000000000000..b117b22a85554b97aefd10a021b053473f1ae9aa
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2_avx512.S
@@ -0,0 +1,334 @@
+/*
+ * Local state-space iteration at order 2, without pruning.
+ * Specialized kernel in x86-64 assembly using AVX-512F.
+ * See the function prototype in lssi2.hh.
+ *
+ * WARNING: the number of particles must be a multiple of 8.
+ *
+ * TODO:
+ * – Implement Windows ABI as an alternative
+ * – Check whether requiring AVX-512VL (and thus replacing the 3 instructions
+ *   which manipulate VSIB in zmm* by their equivalent that manipulate ymm*)
+ *   improves performance. Actually, is there any CPU that implements AVX-512F
+ *   but not AVX-512VL?
+ * – Check whether using int64 inside VSIB (instead of int32) improves
+ *   performance
+ */
+
+/*
+ * Copyright © 2022 Dynare Team
+ *
+ * This file is part of Dynare.
+ *
+ * Dynare is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Dynare is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Dynare. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#if defined(__x86_64__) && defined(__LP64__)
+
+        .intel_syntax noprefix
+
+        ## Enforce required CPU features, so that GNU as errors out if we use
+        ## instructions not in that feature set
+        .arch generic64
+        .arch .avx512f
+
+
+### Some useful macros
+#include "lssi2_common.S"
+
+
+### Some pre-defined vector constants
+        .section .rodata
+
+        .align 8
+double_0p5:
+        .double 0.5
+
+        .align 32
+v8_int_0_1__7:
+        .int 0, 1, 2, 3, 4, 5, 6, 7
+
+
+### Layout of the current stack frame and of the beginning of the previous
+### one (for arguments passed through the stack).
+### This defines offset symbols through which variables and arguments can
+### be conveniently referred.
+        .struct 0
+constant_offset:
+        .space 8
+ghu_offset:
+        .space 8
+ghx_offset:
+        .space 8
+saved_regs_offset:
+        .space 6*8
+return_address_offset:
+        .space 8
+        ## Beginning of the previous stack frame
+ghxx_offset:
+        .space 8
+ghuu_offset:
+        .space 8
+ghxu_offset:
+        .space 8
+m_offset:
+        .space 8
+n_offset:
+        .space 8
+q_offset:
+        .space 8
+s_offset:
+        .space 8
+
+
+### Function body
+        .text
+        .global lssi2_avx512
+lssi2_avx512:
+        .cfi_startproc
+        .set pushed_regs,0
+        push_cfi_reg rbp
+        push_cfi_reg rbx
+        push_cfi_reg r12
+        push_cfi_reg r13
+        push_cfi_reg r14
+        push_cfi_reg r15
+        .if return_address_offset-saved_regs_offset != pushed_regs*8
+        .error "Incorrect stack frame layout regarding saved registers"
+        .endif
+
+        ## Allocate space for variables in the current stack frame
+        sub rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset saved_regs_offset
+
+        ## Define more explicit names for registers that will remain associated
+        ## to input arguments
+        .set y,rdi
+        .set yhat,rsi
+        .set epsilon,rdx
+        .set m,r12d
+        .set n,r13d
+        .set q,r14d
+        .set s,r15d
+
+        ## Initialize those registers that need it
+        mov m,[rsp+m_offset]  # number of states + observed vars
+        mov n,[rsp+n_offset]  # number of states
+        mov q,[rsp+q_offset]  # number of shocks
+        mov s,[rsp+s_offset]  # number of particles
+
+        ## Conversely save some arguments to the stack
+        mov [rsp+ghx_offset],rcx
+        mov [rsp+ghu_offset],r8
+        mov [rsp+constant_offset],r9
+
+        ## Do nothing if s=0
+        test s,s
+        jz done
+
+        ## Precompute 8m (used for moving pointers by one column inside y and gh* matrices)
+        mov r11d,m
+        shl r11,3  # Use a 64-bit op, since m could theoretically be 2³¹-1
+
+        ## Pre-compute some vectorized constants
+        vmovd xmm15,n
+        vpbroadcastd ymm15,xmm15  # ymm15 = [ int32: n (×8) ]
+        vmovd xmm14,q
+        vpbroadcastd ymm14,xmm14  # ymm14 = [ int32: q (×8) ]
+        vmovd xmm13,m
+        vpbroadcastd ymm13,xmm13  # ymm13 = [ int32: m (×8) ]
+
+        vmovdqa ymm12,[rip+v8_int_0_1__7]
+        vpmulld ymm12,ymm12,ymm15  # ymm12 = base VSIB for current particles in ŷ
+        vmovdqa ymm11,[rip+v8_int_0_1__7]
+        vpmulld ymm11,ymm11,ymm14  # ymm11 = base VSIB for current particles in ε
+        vmovdqa ymm10,[rip+v8_int_0_1__7]
+        vpmulld ymm10,ymm10,ymm13  # ymm10 = base VSIB for particles in y
+
+        vpslld ymm15,ymm15,3  # ymm15 = [ int32: 8n (×8) ]
+        vpslld ymm14,ymm14,3  # ymm14 = [ int32: 8q (×8) ]
+        vpslld ymm13,ymm13,3  # ymm13 = [ int32: 8m (×8) ]
+
+        mov eax,1
+        ## NB: We broadcast into zmm in order to avoid requiring AVX-512VL
+        vpbroadcastd zmm8,eax  # ymm8 = [ int32: 1 (×8) ]
+        vbroadcastsd zmm16,[rip+double_0p5]  # zmm16 = [ double: 0.5 (×8) ]
+
+        ## Enter the main loop
+        xor ebx,ebx  # ebx = particle counter
+
+next_8_particles:
+        xor ecx,ecx  # ecx = variable counter
+        vmovdqa ymm9,ymm10  # ymm9 = VSIB for current particles in y
+
+next_variable:
+        ## Use ymm0 to store the 4 next-period particles for this specific variable
+        ## Initialize ymm0 to the constant
+        mov r8,[rsp+constant_offset]
+        vbroadcastsd zmm0,[r8+rcx*8]
+
+        ## Add ghx·ŷ
+        mov ebp,n  # ebp = number of remaining state variables
+        vmovdqa ymm1,ymm12  # ymm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghx_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghxᵢⱼ
+        .align 16  # Recommendation p. 537 of Kusswurm (2018)
+next_state:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm2{k1},[yhat+ymm1*8]  # zmm2 = current particles for ŷⱼ
+        vfmadd231pd zmm0,zmm2,[r8]{1to8}  # Add ghxᵢⱼ·ŷⱼ to current particles
+        add r8,r11  # Move to next column in ghx
+        vpaddd ymm1,ymm1,ymm8  # Update VSIB for next state (j=j+1)
+        sub ebp,1
+        jnz next_state
+
+        ## Add ghu·ε
+        mov ebp,q  # ebp = number of remaining shocks
+        vmovdqa ymm1,ymm11  # ymm1 = VSIB for current particles in ε
+        mov r8,[rsp+ghu_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghuᵢⱼ
+        .align 16
+next_shock:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm2{k1},[epsilon+ymm1*8]  # zmm2 = current particles for εⱼ
+        vfmadd231pd zmm0,zmm2,[r8]{1to8}  # Add ghuᵢⱼ·εⱼ to current particles
+        add r8,r11  # Move to next column in ghu
+        vpaddd ymm1,ymm1,ymm8  # Update VSIB for next shock (j=j+1)
+        sub ebp,1
+        jnz next_shock
+
+        ## Add ½ghxx·ŷ⊗ŷ
+        xor ebp,ebp  # Index of first state (j₁)
+        mov r8,[rsp+ghxx_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghxxᵢⱼ
+        vmovdqa ymm1,ymm12  # ymm1 = VSIB for current particles in ŷⱼ₁
+        .align 16
+next_state_state_1:
+        mov r10d,ebp  # Index of second state (j₂)
+        vmovdqa ymm2,ymm1  # ymm2 = VSIB for current particles in ŷⱼ₂
+        .align 16
+next_state_state_2:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm4{k1},[yhat+ymm1*8]  # zmm4 = current particles for ŷⱼ₁
+        kxnorw k2,k2,k2
+        vgatherdpd zmm6{k2},[yhat+ymm2*8]  # zmm6 = current particles for ŷⱼ₂
+        vmulpd zmm3,zmm4,zmm6  # zmm3 = particles for ŷⱼ₁·ŷⱼ₂
+        ## NB: We compare the zmm registers to avoid requiring AVX-512VL
+        vpcmpeqd k3,zmm1,zmm2  # k3[0:7]=1 if diagonal, 0 otherwise
+        vmulpd zmm3{k3},zmm3,zmm16  # zmm3 = ½ŷⱼ₁·ŷⱼ₂ if diagonal, ŷⱼ₁·ŷⱼ₂ otherwise
+        vfmadd231pd zmm0,zmm3,[r8]{1to8}  # Add (½?)ghxxᵢⱼ·ŷⱼ₁·ŷⱼ₂ to current particles
+        vpaddd ymm2,ymm2,ymm8  # Update VSIB for next second state (j₂=j₂+1)
+        add r8,r11  # Move to next column in ghxx
+        add r10d,1
+        cmp r10d,n
+        jl next_state_state_2
+        vpaddd ymm1,ymm1,ymm8  # Update VSIB for next first state (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax  # Jump several columns in ghxx
+        cmp ebp,n
+        jl next_state_state_1
+
+        ## Add ½ghuu·ε⊗ε
+        xor ebp,ebp  # Index of first shock (j₁)
+        mov r8,[rsp+ghuu_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghuuᵢⱼ
+        vmovdqa ymm1,ymm11  # ymm1 = VSIB for current particles in εⱼ₁
+        .align 16
+next_shock_shock_1:
+        mov r10d,ebp  # Index of second shock (j₂)
+        vmovdqa ymm2,ymm1  # ymm2 = VSIB for current particles in εⱼ₂
+        .align 16
+next_shock_shock_2:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm4{k1},[epsilon+ymm1*8]  # zmm4 = current particles for εⱼ₁
+        kxnorw k2,k2,k2
+        vgatherdpd zmm6{k2},[epsilon+ymm2*8]  # zmm6 = current particles for εⱼ₂
+        vmulpd zmm3,zmm4,zmm6  # zmm3 = particles for εⱼ₁·εⱼ₂
+        ## NB: We compare the zmm registers to avoid requiring AVX-512VL
+        vpcmpeqd k3,zmm1,zmm2  # k3[0:7]=1 if diagonal, 0 otherwise
+        vmulpd zmm3{k3},zmm3,zmm16  # zmm3 = ½εⱼ₁εⱼ₂ if diagonal, εⱼ₁εⱼ₂ otherwise
+        vfmadd231pd zmm0,zmm3,[r8]{1to8}  # Add ghuuᵢⱼ·εⱼ₁εⱼ₂ to current particles
+        vpaddd ymm2,ymm2,ymm8  # Update VSIB for next second shock (j₂=j₂+1)
+        add r8,r11  # Move to next column in ghuu
+        add r10d,1
+        cmp r10d,q
+        jl next_shock_shock_2
+        vpaddd ymm1,ymm1,ymm8  # Update VSIB for next first shock (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax  # Jump several columns in ghuu
+        cmp ebp,q
+        jl next_shock_shock_1
+
+        ## Add ghxu·ŷ⊗ε
+        mov ebp,n  # ebp = number of remaining states
+        vmovdqa ymm1,ymm12  # ymm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghxu_offset]
+        lea r8,[r8+rcx*8]  # r8 = pointer to ghxuᵢⱼ
+        .align 16
+next_state_2:
+        mov eax,q  # eax = number of remaining shocks
+        vmovdqa ymm2,ymm11  # ymm2 = VSIB for current particles in ε
+        kxnorw k1,k1,k1
+        vgatherdpd zmm4{k1},[yhat+ymm1*8]  # zmm4 = current particles for ŷⱼ₁
+        .align 16
+next_shock_2:
+        kxnorw k2,k2,k2
+        vgatherdpd zmm6{k2},[epsilon+ymm2*8]  # zmm6 = current particles for εⱼ₂
+        vmulpd zmm6,zmm4,zmm6  # zmm6 = particles for ŷⱼ₁εⱼ₂
+        vfmadd231pd zmm0,zmm6,[r8]{1to8}  # Add ghxuᵢⱼ·ŷⱼ₁εⱼ₂ to current particles
+        add r8,r11  # Move to next column in ghxu
+        vpaddd ymm2,ymm2,ymm8  # Update VSIB for next shock (j₂=j₂+1)
+        sub eax,1
+        jnz next_shock_2
+        vpaddd ymm1,ymm1,ymm8  # Update VSIB for next state (j₁=j₁+1)
+        sub ebp,1
+        jnz next_state_2
+
+        ## Save updated particles to memory
+        kxnorw k1,k1,k1
+        vscatterdpd [y+ymm9*8]{k1},zmm0
+
+        ## Loop over variables
+        vpaddd ymm9,ymm9,ymm8  # Update VSIB for next variable in y
+        add ecx,1
+        cmp ecx,m
+        jl next_variable
+
+        ## Loop over particles
+        add ebx,8
+        vpaddd ymm12,ymm12,ymm15  # Update base VSIB for ŷ
+        vpaddd ymm11,ymm11,ymm14  # Update base VSIB for ε
+        vpaddd ymm10,ymm10,ymm13  # Update base VSIB for y
+        cmp ebx,s
+        jl next_8_particles
+
+done:
+        ## Cleanup
+        add rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset -saved_regs_offset
+        pop_cfi_reg r15
+        pop_cfi_reg r14
+        pop_cfi_reg r13
+        pop_cfi_reg r12
+        pop_cfi_reg rbx
+        pop_cfi_reg rbp
+        vzeroupper
+        ret
+        .cfi_endproc
+
+#endif
diff --git a/mex/sources/local_state_space_iterations/lssi2_common.S b/mex/sources/local_state_space_iterations/lssi2_common.S
new file mode 100644
index 0000000000000000000000000000000000000000..3274cac3f4c3f7f3aaa8457e996ed878e277313d
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2_common.S
@@ -0,0 +1,15 @@
+        ## Push a register to the stack and adjust CFI information accordingly
+        ## Also increment a counter
+        .macro push_cfi_reg reg
+        push \reg
+        .cfi_adjust_cfa_offset 8
+        .cfi_rel_offset \reg, 0
+        .set pushed_regs,pushed_regs+1
+        .endm
+
+        ## Pop a register from the stack and adjust CFI information accordingly
+        .macro pop_cfi_reg reg
+        pop \reg
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore \reg
+        .endm
diff --git a/tests/particle/local_state_space_iteration_k_test.mod b/tests/particle/local_state_space_iteration_k_test.mod
index 1478931ad195f52f1769eaaffc0de3aa21391b75..b7de0734f95c558b5e75b0f22c1255951b6ec438 100644
--- a/tests/particle/local_state_space_iteration_k_test.mod
+++ b/tests/particle/local_state_space_iteration_k_test.mod
@@ -1,9 +1,3 @@
-/*
-  Tests that local_state_space_iteration_2 and local_state_space_iteration_k (for k=2) return the same results
-
-  This file must be run after first_spec.mod (both are based on the same model).
-*/
-
 @#include "first_spec_common.inc"
 
 varobs q ca;
@@ -16,24 +10,22 @@ var ca = 0.01^2;
 end;
 
 // Initialize various structures
+/*
 estimation(datafile='my_data.mat',order=2,mode_compute=0,mh_replic=0,filter_algorithm=sis,nonlinear_filter_initialization=2
            ,cova_compute=0 %tell program that no covariance matrix was computed
           );
+*/
 
 stoch_simul(order=2, periods=200, irf=0, k_order_solver);
 
-// Really perform the test
-
-nparticles = options_.particle.number_of_particles;
-nsims = 1e6/nparticles;
+nparticles = 100003;
+nsims = 500;
+options_.threads.local_state_space_iteration_2 = 32;
 
-/* We generate particles using realistic distributions (though this is not
-   strictly needed) */
 state_idx = oo_.dr.order_var((M_.nstatic+1):(M_.nstatic+M_.npred+M_.nboth));
-yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
-epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
 
 dr = oo_.dr;
+dr.restrict_var_list=1:6;
 
 // “rf” stands for “Reduced Form”
@@ -45,20 +37,97 @@ rf_ghxx = dr.ghxx(dr.restrict_var_list, :);
 rf_ghuu = dr.ghuu(dr.restrict_var_list, :);
 rf_ghxu = dr.ghxu(dr.restrict_var_list, :);
 
-tStart1 = tic; for i=1:nsims, ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2); end, tElapsed1 = toc(tStart1);
+// Check consistency between the kernels
+/* We generate particles using realistic distributions (though this is not
+   strictly needed) */
+yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
+epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
 
-tStart2 = tic; [udr] = folded_to_unfolded_dr(dr, M_, options_); for i=1:nsims, ynext2 = local_state_space_iteration_k(yhat, epsilon, dr, M_, options_, udr); end, tElapsed2 = toc(tStart2);
+setenv('DYNARE_LSSI2_KERNEL', 'avx512')
+ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+setenv('DYNARE_LSSI2_KERNEL', 'avx2')
+ynext2 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+setenv('DYNARE_LSSI2_KERNEL', 'generic')
+ynext3 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
 
-if max(max(abs(ynext1-ynext2))) > 1e-14
-    error('Inconsistency between local_state_space_iteration_2 and local_state_space_iteration_k')
+if max(max(abs(ynext1-ynext3))) > 1e-15
+    error('avx512 kernel is inconsistent with generic one')
 end
+if max(max(abs(ynext2-ynext3))) > 1e-15
+    error('avx2 kernel is inconsistent with generic one')
+end
+
+// Perform multi-threaded benchmarks
+// We regenerate particles before each run to make sure they are not in the CPU cache before the run
+
+tElapsed = NaN(3,nsims);
 
-if tElapsed1<tElapsed2
-    skipline()
-    dprintf('local_state_space_iteration_2 is %5.2f times faster than local_state_space_iteration_k', tElapsed2/tElapsed1)
-    skipline()
-else
-    skipline()
-    dprintf('local_state_space_iteration_2 is %5.2f times slower than local_state_space_iteration_k', tElapsed1/tElapsed2)
-    skipline()
+disp('* Precompute states and shocks for all particles')
+yhat_all = NaN(M_.npred+M_.nboth, nparticles, nsims);
+epsilon_all = NaN(M_.exo_nbr, nparticles, nsims);
+for i = 1:nsims
+  yhat_all(:,:,i) = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
+  epsilon_all(:,:,i) = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
 end
+
+disp('* Cool down processor')
+pause(10)
+
+disp('* Multi-threaded benchmarks')
+
+for i = 1:nsims
+
+yhat = yhat_all(:,:,i);
+epsilon = epsilon_all(:,:,i);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx512')
+tStart = tic;
+ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+tElapsed(1,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx2')
+tStart = tic;
+ynext2 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+tElapsed(2,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'generic')
+tStart = tic;
+ynext3 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+tElapsed(3,i) = toc(tStart);
+
+end
+
+printf('avx512 kernel: average timing = %g μs\n', round(mean(tElapsed(1,:))*1e6))
+printf('avx2 kernel: average timing = %g μs\n', round(mean(tElapsed(2,:))*1e6))
+printf('generic kernel: average timing = %g μs\n', round(mean(tElapsed(3,:))*1e6))
+
+
+disp('* Single-threaded benchmarks')
+
+for i = 1:nsims
+
+yhat = yhat_all(:,:,i);
+epsilon = epsilon_all(:,:,i);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx512')
+tStart = tic;
+ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, 1);
+tElapsed(1,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx2')
+tStart = tic;
+ynext2 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, 1);
+tElapsed(2,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'generic')
+tStart = tic;
+ynext3 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, 1);
+tElapsed(3,i) = toc(tStart);
+
+end
+
+printf('avx512 kernel: average timing = %g μs\n', round(mean(tElapsed(1,:))*1e6))
+printf('avx2 kernel: average timing = %g μs\n', round(mean(tElapsed(2,:))*1e6))
+printf('generic kernel: average timing = %g μs\n', round(mean(tElapsed(3,:))*1e6))
+
+setenv('DYNARE_LSSI2_KERNEL', 'auto')
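
Usage note: the kernel actually used by local_state_space_iteration_2 is selected at run time. With the default "auto" setting the MEX file picks the AVX-512 kernel when the CPU advertises avx512f, falls back to AVX2 when both avx2 and fma are available, and otherwise runs the generic OpenMP kernel; the DYNARE_LSSI2_KERNEL environment variable (values avx512, avx2, generic or auto) overrides that choice, as the test above does. A minimal sketch from the MATLAB/Octave prompt, reusing the reduced-form matrices defined in the test (only the kernel name is varied; all other inputs are unchanged):

    setenv('DYNARE_LSSI2_KERNEL', 'generic')   % force the portable kernel
    ynext = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, 1);
    setenv('DYNARE_LSSI2_KERNEL', 'auto')      % back to runtime CPU detection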