Skip to content
Snippets Groups Projects
Verified Commit 8f895e00 authored by Sébastien Villemot's avatar Sébastien Villemot
Browse files

WIP sparse v2

Faster algorithm, but unfortunately not deterministic (because additions can be
made in a different order, depending on relative CPU core loads). The result
can therefore slightly change across runs, because changing the order of
additions in an a multi-element sum can change the result due to floating point
properties (this makes example2.mod fail randomly, because that test checks for
strict equality).

NB: The case B⊗B still needs to be rewritten

[skip ci]
parent 8f945e11
No related branches found
No related tags found
No related merge requests found
Pipeline #5485 skipped
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
mex_PROGRAMS = sparse_hessian_times_B_kronecker_C mex_PROGRAMS = sparse_hessian_times_B_kronecker_C
nodist_sparse_hessian_times_B_kronecker_C_SOURCES = sparse_hessian_times_B_kronecker_C.f08 matlab_mex.F08 nodist_sparse_hessian_times_B_kronecker_C_SOURCES = sparse_hessian_times_B_kronecker_C.f08 matlab_mex.F08 blas_lapack.F08
AM_FCFLAGS += -fopenmp AM_FCFLAGS += -fopenmp
AM_LDFLAGS += $(OPENMP_LDFLAGS) AM_LDFLAGS += $(OPENMP_LDFLAGS)
...@@ -13,7 +13,7 @@ AM_LDFLAGS += $(OPENMP_LDFLAGS) ...@@ -13,7 +13,7 @@ AM_LDFLAGS += $(OPENMP_LDFLAGS)
BUILT_SOURCES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES) BUILT_SOURCES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES)
CLEANFILES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES) CLEANFILES = $(nodist_sparse_hessian_times_B_kronecker_C_SOURCES)
sparse_hessian_times_B_kronecker_C.o : matlab_mex.mod sparse_hessian_times_B_kronecker_C.o : matlab_mex.mod blas.mod
%.f08: $(top_srcdir)/../../sources/kronecker/%.f08 %.f08: $(top_srcdir)/../../sources/kronecker/%.f08
$(LN_S) -f $< $@ $(LN_S) -f $< $@
...@@ -48,6 +48,16 @@ module blas ...@@ -48,6 +48,16 @@ module blas
real(real64), dimension(*), intent(inout) :: y real(real64), dimension(*), intent(inout) :: y
end subroutine dgemv end subroutine dgemv
end interface end interface
interface
subroutine dger(m, n, alpha, x, incx, y, incy, a, lda)
import :: blint, real64
integer(blint), intent(in) :: m, n, incx, incy, lda
real(real64), intent(in) :: alpha
real(real64), dimension(*), intent(in) :: x, y
real(real64), dimension(*), intent(inout) :: a
end subroutine dger
end interface
end module blas end module blas
module lapack module lapack
... ...
......
...@@ -24,6 +24,8 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction') ...@@ -24,6 +24,8 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
use iso_fortran_env use iso_fortran_env
use iso_c_binding use iso_c_binding
use matlab_mex use matlab_mex
use blas
use omp_lib
implicit none implicit none
type(c_ptr), dimension(*), intent(in), target :: prhs type(c_ptr), dimension(*), intent(in), target :: prhs
...@@ -96,32 +98,43 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction') ...@@ -96,32 +98,43 @@ subroutine mexFunction(nlhs, plhs, nrhs, prhs) bind(c, name='mexFunction')
end if end if
contains contains
subroutine sparse_hessian_times_B_kronecker_C subroutine sparse_hessian_times_B_kronecker_C
integer(c_size_t) :: ii,jj,jB,jC,iB,iC integer(c_size_t) :: ii,k,kk,k1,k2,iB,iC
integer(mwIndex) :: k1,k2,k,kk real(real64), dimension(:,:), allocatable :: Dt ! Transpose of D
real(real64) :: bc integer(omp_lock_kind), dimension(:), allocatable :: locks
! Loop over the columns of B⊗C (or of the result matrix D) allocate(Dt(nB*nC, mA))
!$omp parallel do num_threads(numthreads) default(none) shared(nA,nB,nC,mC,A_jc,A_ir,A_v,B,C,D) & Dt = 0._real64
!$omp private(ii,jj,jB,jC,iB,iC,k1,k2,k,kk,bc)
do jj = 1,nB*nC ! column of B⊗C index allocate(locks(mA))
jB = (jj-1)/nC+1 do ii=1,mA
jC = mod(jj-1, nC)+1 call omp_init_lock(locks(ii))
! Loop over the rows of B⊗C (column jj) end do
! Loop over columns of A (and therefore rows of B⊗C)
! Scheduling is made dynamic because the number of non-zero elements is not constant
!$omp parallel do num_threads(numthreads) default(none) shared(nA,nB,nC,mC,A_jc,A_ir,A_v,B,C,Dt,locks) &
!$omp private(ii,k,kk,k1,k2,iB,iC) schedule(dynamic)
do ii=1,nA do ii=1,nA
k1 = A_jc(ii) k1 = A_jc(ii)
k2 = A_jc(ii+1) k2 = A_jc(ii+1)
if (k1 < k2) then ! Otherwise column ii of A does not have non zero elements (and there is nothing to compute) if (k1 < k2) then ! Otherwise column ii of A does not have non zero elements (and there is nothing to compute)
iC = mod(ii-1, mC)+1 iC = mod(ii-1, mC)+1
iB = (ii-1)/mC+1 iB = (ii-1)/mC+1
bc = B(iB,jB)*C(iC,jC) ! Loop over the non-zero entries of A(:,ii)
! Loop over the non zero entries of A(:,ii)
do k=k1+1,k2 do k=k1+1,k2
kk = A_ir(k)+1 kk = A_ir(k)+1
D(kk,jj) = D(kk,jj) + A_v(k)*bc ! D(kk,:) += A(kk,ii)·vec(C(iC,:)·B(iB,:)ᵀ)
! NB: This call creates temporary contiguous copies of B(iB,:) and C(iC,:), hence incx=incy=1
call omp_set_lock(locks(kk))
call dger(int(nC, blint), int(nB, blint), A_v(k), C(iC,:), 1_blint, &
B(iB,:), 1_blint, Dt(:,kk), int(nC, blint))
call omp_unset_lock(locks(kk))
end do end do
end if end if
end do end do
end do
D = transpose(Dt)
end subroutine sparse_hessian_times_B_kronecker_C end subroutine sparse_hessian_times_B_kronecker_C
subroutine sparse_hessian_times_B_kronecker_B subroutine sparse_hessian_times_B_kronecker_B
... ...
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment