diff --git a/.gitignore b/.gitignore
index d5aefb1d2c00323f642f0756809e3b4444b85709..8c1eedbb65f1fe1514fea5c821d98be32b02084f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -90,10 +90,12 @@ doc/internals/ltxpng
 /mex/build/matlab/*/*.cc
 /mex/build/matlab/*/*.f08
 /mex/build/matlab/*/*.F08
+/mex/build/matlab/*/*.S
 /mex/build/octave/*/*.c
 /mex/build/octave/*/*.cc
 /mex/build/octave/*/*.f08
 /mex/build/octave/*/*.F08
+/mex/build/octave/*/*.S
 
 # Fortran modules
 /mex/build/matlab/*/*.mod
diff --git a/mex/build/local_state_space_iterations.am b/mex/build/local_state_space_iterations.am
index e9281345f32772c79e6565f1223d300a86ccd521..8068e72b3f2a06269971d5bb093becb9d6cdffd8 100644
--- a/mex/build/local_state_space_iterations.am
+++ b/mex/build/local_state_space_iterations.am
@@ -1,9 +1,10 @@
 mex_PROGRAMS = local_state_space_iteration_2 local_state_space_iteration_3 local_state_space_iteration_k
 
-nodist_local_state_space_iteration_2_SOURCES = local_state_space_iteration_2.cc
+nodist_local_state_space_iteration_2_SOURCES = local_state_space_iteration_2.cc lssi2_avx2.S lssi2_avx512.S
 nodist_local_state_space_iteration_3_SOURCES = local_state_space_iteration_3.f08
 nodist_local_state_space_iteration_k_SOURCES = local_state_space_iteration_k.f08
 
+local_state_space_iteration_2_CPPFLAGS = $(AM_CPPFLAGS) -I$(top_srcdir)/../../sources/local_state_space_iterations
 local_state_space_iteration_2_CXXFLAGS = $(AM_CXXFLAGS) -fopenmp
 local_state_space_iteration_2_LDFLAGS = $(AM_LDFLAGS) $(OPENMP_LDFLAGS)
 
@@ -24,4 +25,7 @@ CLEANFILES = $(nodist_local_state_space_iteration_2_SOURCES) \
 	$(LN_S) -f $< $@
 
 %.cc: $(top_srcdir)/../../sources/local_state_space_iterations/%.cc
-	$(LN_S) -f $< $@
\ No newline at end of file
+	$(LN_S) -f $< $@
+
+%.S: $(top_srcdir)/../../sources/local_state_space_iterations/%.S
+	$(LN_S) -f $< $@
diff --git a/mex/build/matlab/configure.ac b/mex/build/matlab/configure.ac
index 052fd0084c317191e099c473800bc65c3c98b263..cf91e038a47866d4ef0760a8f31ed47978e5d60f 100644
--- a/mex/build/matlab/configure.ac
+++ b/mex/build/matlab/configure.ac
@@ -52,6 +52,7 @@ AC_PROG_RANLIB
 AX_PROG_LN_S
 AC_PROG_MKDIR_P
 AM_PROG_AR
+AM_PROG_AS
 
 AX_CXX11_THREAD
 
diff --git a/mex/build/octave/configure.ac b/mex/build/octave/configure.ac
index 16b2b34f2c6fc999edd47b6122430594e09c578e..86be71ca40629bda3709f8e5bdc406320e114e0e 100644
--- a/mex/build/octave/configure.ac
+++ b/mex/build/octave/configure.ac
@@ -50,6 +50,7 @@ AC_PROG_RANLIB
 AX_PROG_LN_S
 AC_PROG_MKDIR_P
 AM_PROG_AR
+AM_PROG_AS
 
 AX_CXX11_THREAD
 
diff --git a/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc b/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc
index cd0253ca5267cb4e2fcf8a6ec8544d2acdc28e4b..e81a7f3cd2f79fd52a0dfee8e92d94c8266c436b 100644
--- a/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc
+++ b/mex/sources/local_state_space_iterations/local_state_space_iteration_2.cc
@@ -26,12 +26,16 @@
 #include <algorithm>
 #include <tuple>
 #include <string>
+#include <thread>
+#include <cstdlib>
 
 #include <dynmex.h>
 #include <dynblas.h>
 
 #include <omp.h>
 
+#include "lssi2.hh"
+
 /*
   Uncomment the following line to use BLAS instead of loops when computing
   ghx·ŷ and ghu·ε.
@@ -136,18 +140,22 @@ ss2Iteration_pruning(double *y2, double *y1, const double *yhat2, const double *
     }
 }
 
+/* Generic kernel for the iteration without pruning.
+   More specialized kernels are implemented in assembly. */
 void
-ss2Iteration(double *y, const double *yhat, const double *epsilon,
-             const double *ghx, const double *ghu,
-             const double *constant, const double *ghxx, const double *ghuu, const double *ghxu,
-             blas_int m, blas_int n, blas_int q, blas_int s, int number_of_threads)
+lssi2_generic(double *y, const double *yhat, const double *epsilon,
+              const double *ghx, const double *ghu,
+              const double *constant, const double *ghxx, const double *ghuu, const double *ghxu,
+              blas_int m, blas_int n, blas_int q, blas_int s,
+              int number_of_threads)
 {
+  auto [ii1, ii2, ii3] = set_vector_of_indices(n, m); // vector indices for ghxx
+  auto [jj1, jj2, jj3] = set_vector_of_indices(q, m); // vector indices for ghuu
+
 #ifdef USE_BLAS_AT_FIRST_ORDER
   const double one = 1.0;
   const blas_int ONE = 1;
 #endif
-  auto [ii1, ii2, ii3] = set_vector_of_indices(n, m); // vector indices for ghxx
-  auto [jj1, jj2, jj3] = set_vector_of_indices(q, m); // vector indices for ghuu
 #pragma omp parallel for num_threads(number_of_threads)
   for (int particle = 0; particle < s; particle++)
     {
@@ -197,6 +205,65 @@ ss2Iteration(double *y, const double *yhat, const double *epsilon,
     }
 }
 
+void
+ss2Iteration(double *y, const double *yhat, const double *epsilon,
+             const double *ghx, const double *ghu,
+             const double *constant, const double *ghxx, const double *ghuu, const double *ghxu,
+             blas_int m, blas_int n, blas_int q, blas_int s, int number_of_threads)
+{
+  std::string kernel{"auto"};
+  if (char *kstr = getenv("DYNARE_LSSI2_KERNEL"); kstr && *kstr != '\0')
+    kernel = kstr;
+
+  // Runtime selection of kernel
+#if defined(__x86_64__) && defined(__LP64__)
+  if ((kernel == "auto" && __builtin_cpu_supports("avx512f"))
+      || kernel == "avx512")
+    {
+      int it_total {s/8};
+      int it_per_thread {it_total / number_of_threads};
+      std::vector<std::jthread> threads;
+      for (int i {0}; i < number_of_threads; i++)
+        {
+          int offset {i*it_per_thread*8};
+          int s2 {i == number_of_threads - 1 ? it_total*8 - offset : it_per_thread*8};
+          threads.emplace_back(lssi2_avx512, y+offset*m, yhat+offset*n, epsilon+offset*q,
+                               ghx, ghu, constant, ghxx, ghuu, ghxu,
+                               static_cast<int>(m), static_cast<int>(n), static_cast<int>(q), s2);
+        }
+      if (int rem {s % 8}; rem != 0)
+        // If s is not a multiple of 8, use the generic routine to finish the computation
+        lssi2_generic(y+(s-rem)*m, yhat+(s-rem)*n, epsilon+(s-rem)*q,
+                      ghx, ghu, constant, ghxx, ghuu, ghxu, m, n, q, rem, 1);
+    }
+  else if ((kernel == "auto" && __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma"))
+           || kernel == "avx2")
+    {
+      int it_total {s/4};
+      int it_per_thread {it_total / number_of_threads};
+      std::vector<std::jthread> threads;
+      for (int i {0}; i < number_of_threads; i++)
+        {
+          int offset {i*it_per_thread*4};
+          int s2 {i == number_of_threads - 1 ? it_total*4 - offset : it_per_thread*4};
+          threads.emplace_back(lssi2_avx2, y+offset*m, yhat+offset*n, epsilon+offset*q,
+                               ghx, ghu, constant, ghxx, ghuu, ghxu,
+                               static_cast<int>(m), static_cast<int>(n), static_cast<int>(q), s2);
+        }
+      if (int rem {s % 4}; rem != 0)
+        // If s is not a multiple of 4, use the generic routine to finish the computation
+        lssi2_generic(y+(s-rem)*m, yhat+(s-rem)*n, epsilon+(s-rem)*q,
+                      ghx, ghu, constant, ghxx, ghuu, ghxu, m, n, q, rem, 1);
+    }
+  else
+#endif
+    if (kernel == "auto" || kernel == "generic")
+      lssi2_generic(y, yhat, epsilon, ghx, ghu, constant, ghxx, ghuu, ghxu,
+                    m, n, q, s, number_of_threads);
+    else
+      mexErrMsgTxt(("Unknown kernel: " + kernel).c_str());
+}
+
 void
 mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
 {
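[Editor's note] For context on the thread partitioning in the dispatcher added above: each branch divides the particles into SIMD-width blocks (8 for AVX-512, 4 for AVX2), assigns it_per_thread blocks to every std::jthread except the last one, which takes all remaining full blocks, and finishes any leftover particles with lssi2_generic. As a purely illustrative worked example using the values that appear in the benchmark .mod file further below (s = 100003 particles, 32 threads, AVX-512 kernel): it_total = 100003/8 = 12500, it_per_thread = 12500/32 = 390, so threads 0 to 30 each handle 390·8 = 3120 particles, thread 31 handles 100000 − 31·3120 = 3280 particles, and the final 100003 mod 8 = 3 particles are processed by the generic routine.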
diff --git a/mex/sources/local_state_space_iterations/lssi2.hh b/mex/sources/local_state_space_iterations/lssi2.hh
new file mode 100644
index 0000000000000000000000000000000000000000..339c637330da5f113153395f2ebdcaa45099b673
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2.hh
@@ -0,0 +1,48 @@
+// Prototypes for the local state-space iteration (order 2) routines written in assembly
+
+/*
+ * Copyright © 2022 Dynare Team
+ *
+ * This file is part of Dynare.
+ *
+ * Dynare is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Dynare is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Dynare.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+  WARNING: s must be a multiple of 4 for the AVX2 routine, and a multiple of 8
+  for the AVX-512 one.
+
+  Sizes of the vector/matrix arguments:
+  – y: m×s
+  – yhat: n×s
+  – epsilon: q×s
+  – ghx: m×n
+  – ghu: m×q
+  – constant: m
+  – ghxx: m×n²
+  – ghuu: m×q²
+  – ghxu: m×nq
+*/
+
+#if defined(__x86_64__) && defined(__LP64__)
+
+extern "C" void lssi2_avx2(double *y, const double *yhat, const double *epsilon,
+                           const double *ghx, const double *ghu, const double *constant,
+                           const double *ghxx, const double *ghuu, const double *ghxu,
+                           int m, int n, int q, int s);
+
+extern "C" void lssi2_avx512(double *y, const double *yhat, const double *epsilon,
+                             const double *ghx, const double *ghu, const double *constant,
+                             const double *ghxx, const double *ghuu, const double *ghxu,
+                             int m, int n, int q, int s);
+
+#endif
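[Editor's note] For readers not fluent in assembly, the quantity that both kernels below and the generic C++ routine compute for each particle is the order-2 decision-rule update documented piecewise in the assembly comments (constant, plus ghx·ŷ, ghu·ε, ½ghxx·ŷ⊗ŷ, ½ghuu·ε⊗ε and ghxu·ŷ⊗ε). A minimal scalar C++ sketch of that update for a single particle, assuming the column-major layouts listed above and mirroring the triangular loop structure of the kernels (the function and variable names are illustrative only, not part of the patch):

// Scalar reference for one particle: y, yhat and epsilon are that particle's
// columns (lengths m, n and q); all gh* matrices are column-major with m rows.
void lssi2_scalar_sketch(double *y, const double *yhat, const double *epsilon,
                         const double *ghx, const double *ghu, const double *constant,
                         const double *ghxx, const double *ghuu, const double *ghxu,
                         int m, int n, int q)
{
  for (int i = 0; i < m; i++)
    {
      double acc = constant[i];
      // First-order terms: ghx·ŷ and ghu·ε
      for (int j = 0; j < n; j++)
        acc += ghx[i + j*m] * yhat[j];
      for (int j = 0; j < q; j++)
        acc += ghu[i + j*m] * epsilon[j];
      // ½ghxx·ŷ⊗ŷ: only the upper triangle is visited, with a ½ factor on the
      // diagonal, as in the assembly kernels
      for (int j1 = 0; j1 < n; j1++)
        for (int j2 = j1; j2 < n; j2++)
          acc += (j1 == j2 ? 0.5 : 1.0) * ghxx[i + (j1*n + j2)*m] * yhat[j1] * yhat[j2];
      // ½ghuu·ε⊗ε: same triangular scheme
      for (int j1 = 0; j1 < q; j1++)
        for (int j2 = j1; j2 < q; j2++)
          acc += (j1 == j2 ? 0.5 : 1.0) * ghuu[i + (j1*q + j2)*m] * epsilon[j1] * epsilon[j2];
      // ghxu·ŷ⊗ε: full rectangle
      for (int j1 = 0; j1 < n; j1++)
        for (int j2 = 0; j2 < q; j2++)
          acc += ghxu[i + (j1*q + j2)*m] * yhat[j1] * epsilon[j2];
      y[i] = acc;
    }
}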
diff --git a/mex/sources/local_state_space_iterations/lssi2_avx2.S b/mex/sources/local_state_space_iterations/lssi2_avx2.S
new file mode 100644
index 0000000000000000000000000000000000000000..9a4d521bbfa62d223475c807a42c4dc18f96b138
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2_avx2.S
@@ -0,0 +1,348 @@
+/*
+ * Local state-space iteration at order 2, without pruning.
+ * Specialized kernel in x86-64 assembly using AVX2 and FMA extensions.
+ * See the function prototype in lssi2.hh.
+ *
+ * WARNING: the number of particles must be a multiple of 4.
+ *
+ * TODO:
+ * – Implement Windows ABI as an alternative
+ */
+
+/*
+ * Copyright © 2022 Dynare Team
+ *
+ * This file is part of Dynare.
+ *
+ * Dynare is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Dynare is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Dynare.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#if defined(__x86_64__) && defined(__LP64__)
+
+        .intel_syntax noprefix
+
+        ## Enforce required CPU features, so that GNU as errors out if we use
+        ## instructions not in that feature set
+        .arch generic64
+        .arch .avx2
+        .arch .fma
+
+
+### Some useful macros
+#include "lssi2_common.S"
+
+### Some pre-defined vector constants
+        .section .rodata
+
+        .align 8
+double_1:
+        .double 1.
+double_0p5:
+        .double 0.5
+
+        .align 16
+v4_int_0_1_2_3:
+        .int 0, 1, 2, 3
+
+
+### Layout of the current stack frame and of the beginning of the previous
+### one (for arguments passed through the stack).
+### This defines offset symbols through which variables and arguments can
+### be conveniently referred to.
+        .struct 0
+constant_offset:
+        .space 8
+ghu_offset:
+        .space 8
+ghx_offset:
+        .space 8
+saved_regs_offset:
+        .space 6*8
+return_address_offset:
+        .space 8
+        ## Beginning of the previous stack frame
+ghxx_offset:
+        .space 8
+ghuu_offset:
+        .space 8
+ghxu_offset:
+        .space 8
+m_offset:
+        .space 8
+n_offset:
+        .space 8
+q_offset:
+        .space 8
+s_offset:
+        .space 8
+
+
+### Function body
+        .text
+        .global lssi2_avx2
+lssi2_avx2:
+        .cfi_startproc
+        .set pushed_regs,0
+        push_cfi_reg rbp
+        push_cfi_reg rbx
+        push_cfi_reg r12
+        push_cfi_reg r13
+        push_cfi_reg r14
+        push_cfi_reg r15
+        .if return_address_offset-saved_regs_offset != pushed_regs*8
+        .error "Incorrect stack frame layout regarding saved registers"
+        .endif
+
+        ## Allocate space for variables in the current stack frame
+        sub rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset saved_regs_offset
+
+        ## Define more explicit names for registers that will remain associated
+        ## with input arguments
+        .set y,rdi
+        .set yhat,rsi
+        .set epsilon,rdx
+        .set m,r12d
+        .set n,r13d
+        .set q,r14d
+        .set s,r15d
+
+        ## Initialize those registers that need it
+        mov m,[rsp+m_offset]                    # number of states + observed vars
+        mov n,[rsp+n_offset]                    # number of states
+        mov q,[rsp+q_offset]                    # number of shocks
+        mov s,[rsp+s_offset]                    # number of particles
+
+        ## Conversely save some arguments to the stack
+        mov [rsp+ghx_offset],rcx
+        mov [rsp+ghu_offset],r8
+        mov [rsp+constant_offset],r9
+
+        ## Do nothing if s=0
+        test s,s
+        jz done
+
+        ## Precompute 8m (used for moving pointers by one column inside y and gh* matrices)
+        mov r11d,m
+        shl r11,3                               # Use a 64-bit op, since m could theoretically be 2³¹-1
+
+        ## Pre-compute some vectorized constants
+        vmovd xmm15,n
+        vpbroadcastd xmm15,xmm15                # xmm15 = [ int32: n (×4) ]
+        vmovd xmm14,q
+        vpbroadcastd xmm14,xmm14                # xmm14 = [ int32: q (×4) ]
+        mov rax,0x8000000000000000
+        vmovq xmm13,rax
+        vpbroadcastq ymm13,xmm13                # ymm13 = [ int64: 0x8000000000000000 (×4) ]
+
+        vmovdqa xmm12,[rip+v4_int_0_1_2_3]
+        vpmulld xmm12,xmm12,xmm15               # xmm12 = base VSIB for current particles in ŷ
+        vmovdqa xmm11,[rip+v4_int_0_1_2_3]
+        vpmulld xmm11,xmm11,xmm14               # xmm11 = base VSIB for current particles in ε
+
+        vpslld xmm15,xmm15,2                    # xmm15 = [ int32: 4n (×4) ]
+        vpslld xmm14,xmm14,2                    # xmm14 = [ int32: 4q (×4) ]
+
+        mov eax,1
+        vmovd xmm10,eax
+        vpbroadcastd xmm10,xmm10                # xmm10 = [ int32: 1 (×4) ]
+        vbroadcastsd ymm9,[rip+double_1]        # ymm9 = [ double: 1. (×4) ]
+        vbroadcastsd ymm8,[rip+double_0p5]      # ymm8 = [ double: 0.5 (×4) ]
+
+        ## Enter the main loop
+        xor ebx,ebx                             # ebx = particle counter
+
+next_4_particles:
+        xor ecx,ecx                             # ecx = variable counter
+
+next_variable:
+        ## Use ymm0 to store the 4 next-period particles for this specific variable
+        ## Initialize ymm0 to the constant
+        mov r8,[rsp+constant_offset]
+        vbroadcastsd ymm0,[r8+rcx*8]
+
+        ## Add ghx·ŷ
+        mov ebp,n                               # ebp = number of remaining state variables
+        vmovdqa xmm1,xmm12                      # xmm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghx_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghxᵢⱼ
+        .align 16                               # Recommendation p. 537 of Kusswurm (2018)
+next_state:
+        vmovdqa ymm4,ymm13                      # Mask for vgatherdpd (will be cleared by the instr.)
+        vgatherdpd ymm2,[yhat+xmm1*8],ymm4      # ymm2 = current particles for ŷⱼ
+        vbroadcastsd ymm3,[r8]                  # ymm3 = ghxᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm2,ymm3              # Add ghxᵢⱼ·ŷⱼ to current particles
+        add r8,r11                              # Move to next column in ghx
+        vpaddd xmm1,xmm1,xmm10                  # Update VSIB for next state (j=j+1)
+        sub ebp,1
+        jnz next_state
+
+        ## Add ghu·ε
+        mov ebp,q                               # ebp = number of remaining shocks
+        vmovdqa xmm1,xmm11                      # xmm1 = VSIB for current particles in ε
+        mov r8,[rsp+ghu_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghuᵢⱼ
+        .align 16
+next_shock:
+        vmovdqa ymm4,ymm13
+        vgatherdpd ymm2,[epsilon+xmm1*8],ymm4   # ymm2 = current particles for εⱼ
+        vbroadcastsd ymm3,[r8]                  # ymm3 = ghuᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm2,ymm3              # Add ghuᵢⱼ·εⱼ to current particles
+        add r8,r11                              # Move to next column in ghu
+        vpaddd xmm1,xmm1,xmm10                  # Update VSIB for next shock (j=j+1)
+        sub ebp,1
+        jnz next_shock
+
+        ## Add ½ghxx·ŷ⊗ŷ
+        xor ebp,ebp                             # Index of first state (j₁)
+        mov r8,[rsp+ghxx_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghxxᵢⱼ
+        vmovdqa xmm1,xmm12                      # xmm1 = VSIB for current particles in ŷⱼ₁
+        .align 16
+next_state_state_1:
+        mov r10d,ebp                            # Index of second state (j₂)
+        vmovdqa xmm2,xmm1                       # xmm2 = VSIB for current particles in ŷⱼ₂
+        .align 16
+next_state_state_2:
+        vmovdqa ymm3,ymm13
+        vgatherdpd ymm4,[yhat+xmm1*8],ymm3      # ymm4 = current particles for ŷⱼ₁
+        vmovdqa ymm5,ymm13
+        vgatherdpd ymm6,[yhat+xmm2*8],ymm5      # ymm6 = current particles for ŷⱼ₂
+        vmulpd ymm3,ymm4,ymm6                   # ymm3 = particles for ŷⱼ₁·ŷⱼ₂
+        vpcmpeqd xmm7,xmm1,xmm2
+        vinserti128 ymm7,ymm7,xmm7,1            # ymm7 = all 1s if diagonal element, all 0s otherwise
+        ## There is a spurious dependency here due to ymm4 being reused (no more free registers)
+        ## Freeing ymm8 by using a memory read in the following blend does not improve things
+        vblendvpd ymm4,ymm9,ymm8,ymm7           # ymm4 = ½ if diagonal element, 1 otherwise
+        vmulpd ymm3,ymm4,ymm3                   # ymm3 = ½ŷⱼ₁·ŷⱼ₂ if diagonal, ŷⱼ₁·ŷⱼ₂ otherwise
+        vbroadcastsd ymm7,[r8]                  # ymm7 = ghxxᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm7,ymm3              # Add (½?)ghxxᵢⱼ·ŷⱼ₁·ŷⱼ₂ to current particles
+        vpaddd xmm2,xmm2,xmm10                  # Update VSIB for next second state (j₂=j₂+1)
+        add r8,r11                              # Move to next column in ghxx
+        add r10d,1
+        cmp r10d,n
+        jl next_state_state_2
+        vpaddd xmm1,xmm1,xmm10                  # Update VSIB for next first state (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax                              # Jump several columns in ghxx
+        cmp ebp,n
+        jl next_state_state_1
+
+        ## Add ½ghuu·ε⊗ε
+        xor ebp,ebp                             # Index of first shock (j₁)
+        mov r8,[rsp+ghuu_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghuuᵢⱼ
+        vmovdqa xmm1,xmm11                      # xmm1 = VSIB for current particles in εⱼ₁
+        .align 16
+next_shock_shock_1:
+        mov r10d,ebp                            # Index of second shock (j₂)
+        vmovdqa xmm2,xmm1                       # xmm2 = VSIB for current particles in εⱼ₂
+        .align 16
+next_shock_shock_2:
+        vmovdqa ymm3,ymm13
+        vgatherdpd ymm4,[epsilon+xmm1*8],ymm3   # ymm4 = current particles for εⱼ₁
+        vmovdqa ymm5,ymm13
+        vgatherdpd ymm6,[epsilon+xmm2*8],ymm5   # ymm6 = current particles for εⱼ₂
+        vmulpd ymm3,ymm4,ymm6                   # ymm3 = particles for εⱼ₁·εⱼ₂
+        vpcmpeqd xmm7,xmm1,xmm2
+        vinserti128 ymm7,ymm7,xmm7,1            # ymm7 = all 1s if diagonal element, all 0s otherwise
+        ## Same remark as previous section regarding ymm4
+        vblendvpd ymm4,ymm9,ymm8,ymm7           # ymm4 = ½ if diagonal element, 1 otherwise
+        vmulpd ymm3,ymm4,ymm3                   # ymm3 = ½εⱼ₁εⱼ₂ if diagonal, εⱼ₁εⱼ₂ otherwise
+        vbroadcastsd ymm7,[r8]                  # ymm7 = ghuuᵢⱼ repeated 4 times
+        vfmadd231pd ymm0,ymm7,ymm3              # Add (½?)ghuuᵢⱼ·εⱼ₁εⱼ₂ to current particles
+        vpaddd xmm2,xmm2,xmm10                  # Update VSIB for next second shock (j₂=j₂+1)
+        add r8,r11                              # Move to next column in ghuu
+        add r10d,1
+        cmp r10d,q
+        jl next_shock_shock_2
+        vpaddd xmm1,xmm1,xmm10                  # Update VSIB for next first shock (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax                              # Jump several columns in ghuu
+        cmp ebp,q
+        jl next_shock_shock_1
+
+        ## Add ghxu·ŷ⊗ε
+        mov ebp,n                               # ebp = number of remaining states
+        vmovdqa xmm1,xmm12                      # xmm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghxu_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghxuᵢⱼ
+        .align 16
+next_state_2:
+        mov eax,q                               # eax = number of remaining shocks
+        vmovdqa xmm2,xmm11                      # xmm2 = VSIB for current particles in ε
+        vmovdqa ymm3,ymm13
+        vgatherdpd ymm4,[yhat+xmm1*8],ymm3      # ymm4 = current particles for ŷⱼ₁
+        .align 16
+next_shock_2:
+        vmovdqa ymm5,ymm13
+        vgatherdpd ymm6,[epsilon+xmm2*8],ymm5   # ymm6 = current particles for εⱼ₂
+        vbroadcastsd ymm7,[r8]                  # ymm7 = ghxuᵢⱼ repeated 4 times
+        vmulpd ymm6,ymm4,ymm6                   # ymm6 = particles for ŷⱼ₁εⱼ₂
+        vfmadd231pd ymm0,ymm7,ymm6              # Add ghxuᵢⱼ·ŷⱼ₁εⱼ₂ to current particles
+        add r8,r11                              # Move to next column in ghxu
+        vpaddd xmm2,xmm2,xmm10                  # Update VSIB for next shock (j₂=j₂+1)
+        sub eax,1
+        jnz next_shock_2
+        vpaddd xmm1,xmm1,xmm10                  # Update VSIB for next state (j₁=j₁+1)
+        sub ebp,1
+        jnz next_state_2
+
+        ## Save updated particles to memory
+        mov eax,ebx
+        imul eax,m
+        add eax,ecx
+        lea r8,[y+rax*8]
+        vextractf128 xmm2,ymm0,1
+        vmovsd [r8],xmm0
+        add r8,r11
+        vpsrldq xmm1,xmm0,8
+        vmovsd [r8],xmm1
+        add r8,r11
+        vmovsd [r8],xmm2
+        add r8,r11
+        vpsrldq xmm3,xmm2,8
+        vmovsd [r8],xmm3
+
+        ## Loop over variables
+        add ecx,1
+        cmp ecx,m
+        jl next_variable
+
+        ## Loop over particles
+        add ebx,4
+        vpaddd xmm12,xmm12,xmm15                # Update base VSIB for ŷ
+        vpaddd xmm11,xmm11,xmm14                # Update base VSIB for ε
+        cmp ebx,s
+        jl next_4_particles
+
+done:
+        ## Cleanup
+        add rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset -saved_regs_offset
+        pop_cfi_reg r15
+        pop_cfi_reg r14
+        pop_cfi_reg r13
+        pop_cfi_reg r12
+        pop_cfi_reg rbx
+        pop_cfi_reg rbp
+        vzeroupper
+        ret
+        .cfi_endproc
+
+#endif
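[Editor's note] A brief worked example of the gather addressing used throughout the kernel above (numbers purely illustrative): with n = 3 state variables, the base VSIB obtained from [0,1,2,3]·n is [0,3,6,9]; after two vpaddd increments by 1 it becomes [2,5,8,11], so vgatherdpd ymm2,[yhat+xmm1*8],ymm4 loads yhat[2], yhat[5], yhat[8] and yhat[11], i.e. the value of state j=2 for the four particles of the current block in the column-major n×s ŷ array (the ·8 scale accounts for the size of a double). Moving to the next block of four particles then adds 4n = 12 to every lane via xmm15.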
diff --git a/mex/sources/local_state_space_iterations/lssi2_avx512.S b/mex/sources/local_state_space_iterations/lssi2_avx512.S
new file mode 100644
index 0000000000000000000000000000000000000000..b117b22a85554b97aefd10a021b053473f1ae9aa
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2_avx512.S
@@ -0,0 +1,334 @@
+/*
+ * Local state-space iteration at order 2, without pruning.
+ * Specialized kernel in x86-64 assembly using AVX-512F.
+ * See the function prototype in lssi2.hh.
+ *
+ * WARNING: the number of particles must be a multiple of 8.
+ *
+ * TODO:
+ * – Implement Windows ABI as an alternative
+ * – Check whether requiring AVX-512VL (and thus replacing the 3 instructions
+ *   which manipulate VSIB in zmm* by their equivalent that manipulate ymm*)
+ *   improves performance. Actually, is there any CPU that implements AVX-512F
+ *   but not AVX-512VL?
+ * – Check whether using int64 inside VSIB (instead of int32) improves
+ *   performance
+ */
+
+/*
+ * Copyright © 2022 Dynare Team
+ *
+ * This file is part of Dynare.
+ *
+ * Dynare is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Dynare is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Dynare.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#if defined(__x86_64__) && defined(__LP64__)
+
+        .intel_syntax noprefix
+
+        ## Enforce required CPU features, so that GNU as errors out if we use
+        ## instructions not in that feature set
+        .arch generic64
+        .arch .avx512f
+
+
+### Some useful macros
+#include "lssi2_common.S"
+
+
+### Some pre-defined vector constants
+        .section .rodata
+
+        .align 8
+double_0p5:
+        .double 0.5
+
+        .align 32
+v8_int_0_1__7:
+        .int 0, 1, 2, 3, 4, 5, 6, 7
+
+
+### Layout of the current stack frame and of the beginning of the previous
+### one (for arguments passed through the stack).
+### This defines offset symbols through which variables and arguments can
+### be conveniently referred to.
+        .struct 0
+constant_offset:
+        .space 8
+ghu_offset:
+        .space 8
+ghx_offset:
+        .space 8
+saved_regs_offset:
+        .space 6*8
+return_address_offset:
+        .space 8
+        ## Beginning of the previous stack frame
+ghxx_offset:
+        .space 8
+ghuu_offset:
+        .space 8
+ghxu_offset:
+        .space 8
+m_offset:
+        .space 8
+n_offset:
+        .space 8
+q_offset:
+        .space 8
+s_offset:
+        .space 8
+
+
+### Function body
+        .text
+        .global lssi2_avx512
+lssi2_avx512:
+        .cfi_startproc
+        .set pushed_regs,0
+        push_cfi_reg rbp
+        push_cfi_reg rbx
+        push_cfi_reg r12
+        push_cfi_reg r13
+        push_cfi_reg r14
+        push_cfi_reg r15
+        .if return_address_offset-saved_regs_offset != pushed_regs*8
+        .error "Incorrect stack frame layout regarding saved registers"
+        .endif
+
+        ## Allocate space for variables in the current stack frame
+        sub rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset saved_regs_offset
+
+        ## Define more explicit names for registers that will remain associated
+        ## with input arguments
+        .set y,rdi
+        .set yhat,rsi
+        .set epsilon,rdx
+        .set m,r12d
+        .set n,r13d
+        .set q,r14d
+        .set s,r15d
+
+        ## Initialize those registers that need it
+        mov m,[rsp+m_offset]                    # number of states + observed vars
+        mov n,[rsp+n_offset]                    # number of states
+        mov q,[rsp+q_offset]                    # number of shocks
+        mov s,[rsp+s_offset]                    # number of particles
+
+        ## Conversely save some arguments to the stack
+        mov [rsp+ghx_offset],rcx
+        mov [rsp+ghu_offset],r8
+        mov [rsp+constant_offset],r9
+
+        ## Do nothing if s=0
+        test s,s
+        jz done
+
+        ## Precompute 8m (used for moving pointers by one column inside y and gh* matrices)
+        mov r11d,m
+        shl r11,3                               # Use a 64-bit op, since m could theoretically be 2³¹-1
+
+        ## Pre-compute some vectorized constants
+        vmovd xmm15,n
+        vpbroadcastd ymm15,xmm15                # ymm15 = [ int32: n (×8) ]
+        vmovd xmm14,q
+        vpbroadcastd ymm14,xmm14                # ymm14 = [ int32: q (×8) ]
+        vmovd xmm13,m
+        vpbroadcastd ymm13,xmm13                # ymm13 = [ int32: m (×8) ]
+
+        vmovdqa ymm12,[rip+v8_int_0_1__7]
+        vpmulld ymm12,ymm12,ymm15               # ymm12 = base VSIB for current particles in ŷ
+        vmovdqa ymm11,[rip+v8_int_0_1__7]
+        vpmulld ymm11,ymm11,ymm14               # ymm11 = base VSIB for current particles in ε
+        vmovdqa ymm10,[rip+v8_int_0_1__7]
+        vpmulld ymm10,ymm10,ymm13               # ymm10 = base VSIB for particles in y
+
+        vpslld ymm15,ymm15,3                    # ymm15 = [ int32: 8n (×8) ]
+        vpslld ymm14,ymm14,3                    # ymm14 = [ int32: 8q (×8) ]
+        vpslld ymm13,ymm13,3                    # ymm13 = [ int32: 8m (×8) ]
+
+        mov eax,1
+        ## NB: We broadcast into zmm in order to avoid requiring AVX-512VL
+        vpbroadcastd zmm8,eax                   # ymm8 = [ int32: 1 (×8) ]
+        vbroadcastsd zmm16,[rip+double_0p5]     # zmm16 = [ double: 0.5 (×8) ]
+
+        ## Enter the main loop
+        xor ebx,ebx                             # ebx = particle counter
+
+next_8_particles:
+        xor ecx,ecx                             # ecx = variable counter
+        vmovdqa ymm9,ymm10                      # ymm9 = VSIB for current particles in y
+
+next_variable:
+        ## Use zmm0 to store the 8 next-period particles for this specific variable
+        ## Initialize zmm0 to the constant
+        mov r8,[rsp+constant_offset]
+        vbroadcastsd zmm0,[r8+rcx*8]
+
+        ## Add ghx·ŷ
+        mov ebp,n                               # ebp = number of remaining state variables
+        vmovdqa ymm1,ymm12                      # ymm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghx_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghxᵢⱼ
+        .align 16                               # Recommendation p. 537 of Kusswurm (2018)
+next_state:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm2{k1},[yhat+ymm1*8]       # zmm2 = current particles for ŷⱼ
+        vfmadd231pd zmm0,zmm2,[r8]{1to8}        # Add ghxᵢⱼ·ŷⱼ to current particles
+        add r8,r11                              # Move to next column in ghx
+        vpaddd ymm1,ymm1,ymm8                   # Update VSIB for next state (j=j+1)
+        sub ebp,1
+        jnz next_state
+
+        ## Add ghu·ε
+        mov ebp,q                               # ebp = number of remaining shocks
+        vmovdqa ymm1,ymm11                      # ymm1 = VSIB for current particles in ε
+        mov r8,[rsp+ghu_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghuᵢⱼ
+        .align 16
+next_shock:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm2{k1},[epsilon+ymm1*8]    # zmm2 = current particles for εⱼ
+        vfmadd231pd zmm0,zmm2,[r8]{1to8}        # Add ghuᵢⱼ·εⱼ to current particles
+        add r8,r11                              # Move to next column in ghu
+        vpaddd ymm1,ymm1,ymm8                   # Update VSIB for next shock (j=j+1)
+        sub ebp,1
+        jnz next_shock
+
+        ## Add ½ghxx·ŷ⊗ŷ
+        xor ebp,ebp                             # Index of first state (j₁)
+        mov r8,[rsp+ghxx_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghxxᵢⱼ
+        vmovdqa ymm1,ymm12                      # ymm1 = VSIB for current particles in ŷⱼ₁
+        .align 16
+next_state_state_1:
+        mov r10d,ebp                            # Index of second state (j₂)
+        vmovdqa ymm2,ymm1                       # ymm2 = VSIB for current particles in ŷⱼ₂
+        .align 16
+next_state_state_2:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm4{k1},[yhat+ymm1*8]       # zmm4 = current particles for ŷⱼ₁
+        kxnorw k2,k2,k2
+        vgatherdpd zmm6{k2},[yhat+ymm2*8]       # zmm6 = current particles for ŷⱼ₂
+        vmulpd zmm3,zmm4,zmm6                   # zmm3 = particles for ŷⱼ₁·ŷⱼ₂
+        ## NB: We compare the zmm registers to avoid requiring AVX-512VL
+        vpcmpeqd k3,zmm1,zmm2                   # k3[0:7]=1 if diagonal, 0 otherwise
+        vmulpd zmm3{k3},zmm3,zmm16              # zmm3 = ½ŷⱼ₁·ŷⱼ₂ if diagonal, ŷⱼ₁·ŷⱼ₂ otherwise
+        vfmadd231pd zmm0,zmm3,[r8]{1to8}        # Add (½?)ghxxᵢⱼ·ŷⱼ₁·ŷⱼ₂ to current particles
+        vpaddd ymm2,ymm2,ymm8                   # Update VSIB for next second state (j₂=j₂+1)
+        add r8,r11                              # Move to next column in ghxx
+        add r10d,1
+        cmp r10d,n
+        jl next_state_state_2
+        vpaddd ymm1,ymm1,ymm8                   # Update VSIB for next first state (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax                              # Jump several columns in ghxx
+        cmp ebp,n
+        jl next_state_state_1
+
+        ## Add ½ghuu·ε⊗ε
+        xor ebp,ebp                             # Index of first shock (j₁)
+        mov r8,[rsp+ghuu_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghuuᵢⱼ
+        vmovdqa ymm1,ymm11                      # ymm1 = VSIB for current particles in εⱼ₁
+        .align 16
+next_shock_shock_1:
+        mov r10d,ebp                            # Index of second shock (j₂)
+        vmovdqa ymm2,ymm1                       # ymm2 = VSIB for current particles in εⱼ₂
+        .align 16
+next_shock_shock_2:
+        kxnorw k1,k1,k1
+        vgatherdpd zmm4{k1},[epsilon+ymm1*8]    # zmm4 = current particles for εⱼ₁
+        kxnorw k2,k2,k2
+        vgatherdpd zmm6{k2},[epsilon+ymm2*8]    # zmm6 = current particles for εⱼ₂
+        vmulpd zmm3,zmm4,zmm6                   # zmm3 = particles for εⱼ₁·εⱼ₂
+        ## NB: We compare the zmm registers to avoid requiring AVX-512VL
+        vpcmpeqd k3,zmm1,zmm2                   # k3[0:7]=1 if diagonal, 0 otherwise
+        vmulpd zmm3{k3},zmm3,zmm16              # zmm3 = ½εⱼ₁εⱼ₂ if diagonal, εⱼ₁εⱼ₂ otherwise
+        vfmadd231pd zmm0,zmm3,[r8]{1to8}        # Add (½?)ghuuᵢⱼ·εⱼ₁εⱼ₂ to current particles
+        vpaddd ymm2,ymm2,ymm8                   # Update VSIB for next second shock (j₂=j₂+1)
+        add r8,r11                              # Move to next column in ghuu
+        add r10d,1
+        cmp r10d,q
+        jl next_shock_shock_2
+        vpaddd ymm1,ymm1,ymm8                   # Update VSIB for next first shock (j₁=j₁+1)
+        add ebp,1
+        mov eax,ebp
+        imul rax,r11
+        add r8,rax                              # Jump several columns in ghuu
+        cmp ebp,q
+        jl next_shock_shock_1
+
+        ## Add ghxu·ŷ⊗ε
+        mov ebp,n                               # ebp = number of remaining states
+        vmovdqa ymm1,ymm12                      # ymm1 = VSIB for current particles in ŷ
+        mov r8,[rsp+ghxu_offset]
+        lea r8,[r8+rcx*8]                       # r8 = pointer to ghxuᵢⱼ
+        .align 16
+next_state_2:
+        mov eax,q                               # eax = number of remaining shocks
+        vmovdqa ymm2,ymm11                      # ymm2 = VSIB for current particles in ε
+        kxnorw k1,k1,k1
+        vgatherdpd zmm4{k1},[yhat+ymm1*8]       # zmm4 = current particles for ŷⱼ₁
+        .align 16
+next_shock_2:
+        kxnorw k2,k2,k2
+        vgatherdpd zmm6{k2},[epsilon+ymm2*8]    # zmm6 = current particles for εⱼ₂
+        vmulpd zmm6,zmm4,zmm6                   # zmm6 = particles for ŷⱼ₁εⱼ₂
+        vfmadd231pd zmm0,zmm6,[r8]{1to8}        # Add ghxuᵢⱼ·ŷⱼ₁εⱼ₂ to current particles
+        add r8,r11                              # Move to next column in ghxu
+        vpaddd ymm2,ymm2,ymm8                   # Update VSIB for next shock (j₂=j₂+1)
+        sub eax,1
+        jnz next_shock_2
+        vpaddd ymm1,ymm1,ymm8                   # Update VSIB for next state (j₁=j₁+1)
+        sub ebp,1
+        jnz next_state_2
+
+        ## Save updated particles to memory
+        kxnorw k1,k1,k1
+        vscatterdpd [y+ymm9*8]{k1},zmm0
+
+        ## Loop over variables
+        vpaddd ymm9,ymm9,ymm8                   # Update VSIB for next variable in y
+        add ecx,1
+        cmp ecx,m
+        jl next_variable
+
+        ## Loop over particles
+        add ebx,8
+        vpaddd ymm12,ymm12,ymm15                # Update base VSIB for ŷ
+        vpaddd ymm11,ymm11,ymm14                # Update base VSIB for ε
+        vpaddd ymm10,ymm10,ymm13                # Update base VSIB for y
+        cmp ebx,s
+        jl next_8_particles
+
+done:
+        ## Cleanup
+        add rsp,saved_regs_offset
+        .cfi_adjust_cfa_offset -saved_regs_offset
+        pop_cfi_reg r15
+        pop_cfi_reg r14
+        pop_cfi_reg r13
+        pop_cfi_reg r12
+        pop_cfi_reg rbx
+        pop_cfi_reg rbp
+        vzeroupper
+        ret
+        .cfi_endproc
+
+#endif
diff --git a/mex/sources/local_state_space_iterations/lssi2_common.S b/mex/sources/local_state_space_iterations/lssi2_common.S
new file mode 100644
index 0000000000000000000000000000000000000000..3274cac3f4c3f7f3aaa8457e996ed878e277313d
--- /dev/null
+++ b/mex/sources/local_state_space_iterations/lssi2_common.S
@@ -0,0 +1,15 @@
+        ## Push a register to the stack and adjust CFI information accordingly
+        ## Also increment a counter
+        .macro push_cfi_reg reg
+        push \reg
+        .cfi_adjust_cfa_offset 8
+        .cfi_rel_offset \reg, 0
+        .set pushed_regs,pushed_regs+1
+        .endm
+
+        ## Pop a register from the stack and adjust CFI information accordingly
+        .macro pop_cfi_reg reg
+        pop \reg
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore \reg
+        .endm
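[Editor's note] To make the bookkeeping explicit: a single use such as push_cfi_reg rbp expands to push rbp, .cfi_adjust_cfa_offset 8 and .cfi_rel_offset rbp, 0, and increments the pushed_regs counter; each kernel's prologue then checks that counter against the distance between saved_regs_offset and return_address_offset in its stack-frame layout, so a mismatch between the pushed registers and the .struct declarations is caught at assembly time.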
diff --git a/tests/particle/local_state_space_iteration_k_test.mod b/tests/particle/local_state_space_iteration_k_test.mod
index 1478931ad195f52f1769eaaffc0de3aa21391b75..b7de0734f95c558b5e75b0f22c1255951b6ec438 100644
--- a/tests/particle/local_state_space_iteration_k_test.mod
+++ b/tests/particle/local_state_space_iteration_k_test.mod
@@ -1,9 +1,3 @@
-/*
-  Tests that local_state_space_iteration_2 and local_state_space_iteration_k (for k=2) return the same results
-
-  This file must be run after first_spec.mod (both are based on the same model).
-*/
-
 @#include "first_spec_common.inc"
 
 varobs q ca;
@@ -16,24 +10,22 @@ var ca = 0.01^2;
 end;
 
 // Initialize various structures
+/*
 estimation(datafile='my_data.mat',order=2,mode_compute=0,mh_replic=0,filter_algorithm=sis,nonlinear_filter_initialization=2
     ,cova_compute=0 %tell program that no covariance matrix was computed
 );
+*/
 
 stoch_simul(order=2, periods=200, irf=0, k_order_solver);
 
-// Really perform the test
-
-nparticles = options_.particle.number_of_particles;
-nsims = 1e6/nparticles;
+nparticles = 100003;
+nsims = 500;
+options_.threads.local_state_space_iteration_2 = 32;
 
-/* We generate particles using realistic distributions (though this is not
-   strictly needed) */
 state_idx = oo_.dr.order_var((M_.nstatic+1):(M_.nstatic+M_.npred+M_.nboth));
-yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
-epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
 
 dr = oo_.dr;
+dr.restrict_var_list=1:6;
 
 
 // “rf” stands for “Reduced Form”
@@ -45,20 +37,97 @@ rf_ghxx = dr.ghxx(dr.restrict_var_list, :);
 rf_ghuu = dr.ghuu(dr.restrict_var_list, :);
 rf_ghxu = dr.ghxu(dr.restrict_var_list, :);
 
-tStart1 = tic; for i=1:nsims, ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2); end, tElapsed1 = toc(tStart1);
+// Check consistency between the kernels
+/* We generate particles using realistic distributions (though this is not
+   strictly needed) */
+yhat = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
+epsilon = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
 
-tStart2 = tic; [udr] = folded_to_unfolded_dr(dr, M_, options_); for i=1:nsims, ynext2 = local_state_space_iteration_k(yhat, epsilon, dr, M_, options_, udr); end, tElapsed2 = toc(tStart2);
+setenv('DYNARE_LSSI2_KERNEL', 'avx512')
+ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+setenv('DYNARE_LSSI2_KERNEL', 'avx2')
+ynext2 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+setenv('DYNARE_LSSI2_KERNEL', 'generic')
+ynext3 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
 
-if max(max(abs(ynext1-ynext2))) > 1e-14
-    error('Inconsistency between local_state_space_iteration_2 and local_state_space_iteration_k')
+if max(max(abs(ynext1-ynext3))) > 1e-15
+    error('avx512 kernel is inconsistent with generic one')
 end
+if max(max(abs(ynext2-ynext3))) > 1e-15
+    error('avx2 kernel is inconsistent with generic one')
+end
+
+// Perform multi-threaded benchmarks
+// We regenerate the particles for each run, so that they are not already in the CPU cache
+
+tElapsed = NaN(3,nsims);
 
-if tElapsed1<tElapsed2
-    skipline()
-    dprintf('local_state_space_iteration_2 is %5.2f times faster than local_state_space_iteration_k', tElapsed2/tElapsed1)
-    skipline()
-else
-    skipline()
-    dprintf('local_state_space_iteration_2 is %5.2f times slower than local_state_space_iteration_k', tElapsed1/tElapsed2)
-    skipline()
+disp('* Precompute states and shocks for all particles')
+yhat_all = NaN(M_.npred+M_.nboth, nparticles, nsims);
+epsilon_all = NaN(M_.exo_nbr, nparticles, nsims);
+for i = 1:nsims
+  yhat_all(:,:,i) = chol(oo_.var(state_idx,state_idx))*randn(M_.npred+M_.nboth, nparticles);
+  epsilon_all(:,:,i) = chol(M_.Sigma_e)*randn(M_.exo_nbr, nparticles);
 end
+
+disp('* Cool down processor')
+pause(10)
+
+disp('* Multi-threaded benchmarks')
+
+for i = 1:nsims
+
+yhat = yhat_all(:,:,i);
+epsilon = epsilon_all(:,:,i);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx512')
+tStart = tic;
+ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+tElapsed(1,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx2')
+tStart = tic;
+ynext2 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+tElapsed(2,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'generic')
+tStart = tic;
+ynext3 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, options_.threads.local_state_space_iteration_2);
+tElapsed(3,i) = toc(tStart);
+
+end
+
+printf('avx512 kernel:  average timing = %g μs\n', round(mean(tElapsed(1,:))*1e6))
+printf('avx2 kernel:    average timing = %g μs\n', round(mean(tElapsed(2,:))*1e6))
+printf('generic kernel: average timing = %g μs\n', round(mean(tElapsed(3,:))*1e6))
+
+
+disp('* Single-threaded benchmarks')
+
+for i = 1:nsims
+
+yhat = yhat_all(:,:,i);
+epsilon = epsilon_all(:,:,i);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx512')
+tStart = tic;
+ynext1 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, 1);
+tElapsed(1,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'avx2')
+tStart = tic;
+ynext2 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, 1);
+tElapsed(2,i) = toc(tStart);
+
+setenv('DYNARE_LSSI2_KERNEL', 'generic')
+tStart = tic;
+ynext3 = local_state_space_iteration_2(yhat, epsilon, rf_ghx, rf_ghu, rf_constant, rf_ghxx, rf_ghuu, rf_ghxu, 1);
+tElapsed(3,i) = toc(tStart);
+
+end
+
+printf('avx512 kernel:  average timing = %g μs\n', round(mean(tElapsed(1,:))*1e6))
+printf('avx2 kernel:    average timing = %g μs\n', round(mean(tElapsed(2,:))*1e6))
+printf('generic kernel: average timing = %g μs\n', round(mean(tElapsed(3,:))*1e6))
+
+setenv('DYNARE_LSSI2_KERNEL', 'auto')