Summary: let one worker ask the master to stop all parallel jobs early.

closeSlave gains an optional third argument `partial` (default 0):
   1  saves slaveParallel_break.mat and sends it to every non-local node, then returns;
  -1  deletes slaveParallel_break.mat locally and on every non-local node, then returns;
   0  falls through to the existing close-all-slaves path.
fMessageStatus raises 'Master asked to break the job' when slaveParallel_break.mat exists.
fParallel and slaveParallel strip a CloseAllSlaves field from fOutputVar, re-save the
output file and write CloseAllSlaves into the comp_status file; both handle the
'Master asked to break the job' error separately from a genuine crash.
masterParallel raises the break flag the first time it loads CloseAllSlaves, keeps only
that worker's output, and clears the break flag once the outputs are collected.

Review changes relative to the patch as first written (hunk line counts are unchanged):
  * closeSlave: the state returned by s=warning('off') is restored with warning(s) on
    every exit path, replacing the commented-out delete before each early return and
    the final s=warning('on'), which forced all warnings on.
  * masterParallel: exist('CloseAllSlaves','var') so only workspace variables match.
  * slaveParallel: the crash branch reuses theerror instead of calling lasterror again.

Note: leading whitespace and the position of blank context lines were reconstructed
from the hunk counts; apply with whitespace-tolerant options if context does not match.
The blob ids on the index lines are those of the original commit.

diff --git a/matlab/parallel/closeSlave.m b/matlab/parallel/closeSlave.m
index ed7749f2500cf4c1e50ba2f71d551bc48af6b4ad..1e78bb9e5ef07cf8c97d098eb763d6f1f465630c 100644
--- a/matlab/parallel/closeSlave.m
+++ b/matlab/parallel/closeSlave.m
@@ -1,4 +1,4 @@
-function closeSlave(Parallel,TmpFolder),
+function closeSlave(Parallel,TmpFolder,partial),
 % PARALLEL CONTEXT
 % In parallel context, this utility closes all remote matlab instances
 % called by masterParallel when strategy (1) is active i.e. always open (which leaves
@@ -32,6 +32,32 @@ function closeSlave(Parallel,TmpFolder),
 % You should have received a copy of the GNU General Public License
 % along with Dynare. If not, see <http://www.gnu.org/licenses/>.
 
+if nargin<3,
+    partial=0;
+end
+
+s=warning('off'); % keep the previous warning state so every exit path can restore it
+
+if partial==1
+    save('slaveParallel_break','partial')
+    for indPC=1:length(Parallel),
+        if (Parallel(indPC).Local==0),
+            dynareParallelSendFiles('slaveParallel_break.mat',TmpFolder,Parallel(indPC));
+        end
+    end
+    warning(s); % restore the saved warning state before the early return
+    return
+end
+if partial==-1
+    delete('slaveParallel_break.mat')
+    for indPC=1:length(Parallel),
+        if (Parallel(indPC).Local==0),
+            dynareParallelDelete( 'slaveParallel_break.mat',TmpFolder,Parallel(indPC));
+        end
+    end
+    warning(s); % restore the saved warning state before the early return
+    return
+end
 
 for indPC=1:length(Parallel),
     if (Parallel(indPC).Local==0),
@@ -58,3 +84,5 @@ while(1)
     end
 end
 
+warning(s); % restore the saved state rather than forcing every warning on
+
diff --git a/matlab/parallel/fMessageStatus.m b/matlab/parallel/fMessageStatus.m
index f9a92d1e0ef910c3e4394382cca0e142c12ac929..53576de436669b6b2cdc725760ca655d2065c21e 100644
--- a/matlab/parallel/fMessageStatus.m
+++ b/matlab/parallel/fMessageStatus.m
@@ -43,7 +43,8 @@ catch
 end
 
 fslave = dir( ['slaveParallel_input',int2str(njob),'.mat']);
-if isempty(fslave),
+fbreak = dir( ['slaveParallel_break.mat']);
+if isempty(fslave) || ~isempty(fbreak),
     error('Master asked to break the job');
 end
 
diff --git a/matlab/parallel/fParallel.m b/matlab/parallel/fParallel.m
index 9bd09d14da39b77d2419981e94b1181d6b3770dd..3ce751af74618d4e8066d7eab4aeff64b80f5d7f 100644
--- a/matlab/parallel/fParallel.m
+++ b/matlab/parallel/fParallel.m
@@ -81,20 +81,33 @@ try,
         % Save the output result.
         save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' )
     end
+    if isfield(fOutputVar,'CloseAllSlaves'),
+        CloseAllSlaves = 1;
+        fOutputVar = rmfield(fOutputVar,'CloseAllSlaves');
+        save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' )
+        save(['comp_status_',funcName,int2str(whoiam),'.mat'],'CloseAllSlaves');
+    end
 
     disp(['fParallel ',int2str(whoiam),' completed.'])
 catch,
-    disp(['fParallel ',int2str(whoiam),' crashed.'])
-    fOutputVar.error = lasterror;
-    save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' )
-    waitbarString = fOutputVar.error.message;
-    % waitbarTitle=['Metropolis-Hastings ',options_.parallel(ThisMatlab).ComputerName];
-    if Parallel(ThisMatlab).Local,
-        waitbarTitle='Local ';
+    theerror = lasterror;
+    if strfind(theerror.message,'Master asked to break the job')
+        fOutputVar.message = theerror;
+        save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' )
+        waitbarString = theerror.message;
     else
-        waitbarTitle=[Parallel(ThisMatlab).ComputerName];
+        disp(['fParallel ',int2str(whoiam),' crashed.'])
+        fOutputVar.error = theerror;
+        save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' )
+        waitbarString = theerror.message;
+        % waitbarTitle=['Metropolis-Hastings ',options_.parallel(ThisMatlab).ComputerName];
+        if Parallel(ThisMatlab).Local,
+            waitbarTitle='Local ';
+        else
+            waitbarTitle=[Parallel(ThisMatlab).ComputerName];
+        end
+        fMessageStatus(NaN,whoiam,waitbarString, waitbarTitle, Parallel(ThisMatlab));
     end
-    fMessageStatus(NaN,whoiam,waitbarString, waitbarTitle, Parallel(ThisMatlab));
 end
 
 diary off;
diff --git a/matlab/parallel/masterParallel.m b/matlab/parallel/masterParallel.m
index f2efc139996cb4c930d4b0e993009a0f7297c38b..30a145595e66919955459d67d6682820fee5d2fc 100644
--- a/matlab/parallel/masterParallel.m
+++ b/matlab/parallel/masterParallel.m
@@ -142,6 +142,7 @@ switch Strategy
             save(['temp_input.mat'],'fInputVar')
         end
         save(['temp_input.mat'],'Parallel','-append')
+        closeSlave(Parallel,PRCDir,-1);
 end
 
 
@@ -423,6 +424,7 @@ for j=1:totCPU,
 
             if isempty(PRCDirSnapshot{indPC}),
                 PRCDirSnapshot(indPC)=dynareParallelSnapshot(PRCDir,Parallel(indPC));
+                PRCDirSnapshotInit(indPC) = PRCDirSnapshot(indPC);
             else
                 PRCDirSnapshot(indPC)=dynareParallelGetNewFiles(PRCDir,Parallel(indPC),PRCDirSnapshot(indPC));
             end
@@ -453,6 +455,7 @@ end
 
 if Strategy==0 || newInstance, % See above.
     PRCDirSnapshot=dynareParallelSnapshot(PRCDir,Parallel(1:totSlaves));
+    PRCDirSnapshotInit = PRCDirSnapshot;
 
     % Run the slaves.
     if ~ispc, %isunix || (~matlab_ver_less_than('7.4') && ismac),
@@ -587,6 +590,7 @@ NuoviFilecopiati=zeros(1,totSlaves);
 
 ForEver=1;
 statusString = '';
+flag_CloseAllSlaves=0;
 
 while (ForEver)
 
@@ -607,6 +611,12 @@ while (ForEver)
         try
             if ~isempty(['comp_status_',fname,int2str(j),'.mat'])
                 load(['comp_status_',fname,int2str(j),'.mat']);
+%                 whoCloseAllSlaves = who(['comp_status_',fname,int2str(j),'.mat','CloseAllSlaves']);
+                if exist('CloseAllSlaves','var') && flag_CloseAllSlaves==0, % 'var': match workspace variables only
+                    flag_CloseAllSlaves=1;
+                    whoiamCloseAllSlaves=j;
+                    closeSlave(Parallel(1:totSlaves),PRCDir,1);
+                end
             end
             pcerdone(j) = prtfrc;
             idCPU(j) = njob;
@@ -711,11 +721,16 @@ for j=1:totCPU,
         for jstack=1:length(fOutputVar.error.stack)
             fOutputVar.error.stack(jstack),
         end
-    else
+    elseif flag_CloseAllSlaves==0,
         fOutVar(j)=fOutputVar;
+    elseif j==whoiamCloseAllSlaves,
+        fOutVar=fOutputVar;
     end
 end
 
+if flag_CloseAllSlaves==1,
+    closeSlave(Parallel(1:totSlaves),PRCDir,-1);
+end
 
 if iscrash,
     error('Remote jobs crashed');
@@ -737,10 +752,11 @@ switch Strategy
             [A B C]=rmdir('dynareParallelLogFiles');
             mkdir('dynareParallelLogFiles');
         end
-
-        copyfile('*.log','dynareParallelLogFiles');
-        delete([fname,'*.log']);
-
+        try
+            copyfile('*.log','dynareParallelLogFiles');
+            mydelete([fname,'*.log']);
+        catch
+        end
         mydelete(['*_core*_input*.mat']);
         % if Parallel(indPC).Local == 1
         % delete(['slaveParallel_input*.mat']);
diff --git a/matlab/parallel/slaveParallel.m b/matlab/parallel/slaveParallel.m
index d54b6ac30d2bc3dfe064518a8d2a3c10aade2d84..c7b23aa7be72a2d8c461e70614368087f0e5d5ff 100644
--- a/matlab/parallel/slaveParallel.m
+++ b/matlab/parallel/slaveParallel.m
@@ -136,6 +136,13 @@ while (etime(clock,t0)<1200 && ~isempty(fslave)) || ~isempty(dir(['stayalive',in
 
             % Save the output result.
             save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' );
+%             keyboard,
+            if isfield(fOutputVar,'CloseAllSlaves'),
+                CloseAllSlaves = 1;
+                fOutputVar = rmfield(fOutputVar,'CloseAllSlaves');
+                save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' )
+                save(['comp_status_',funcName,int2str(whoiam),'.mat'],'CloseAllSlaves');
+            end
 
             % Inform the master that the job is finished, and transfer the output data
             delete(['P_',fname,'_',int2str(whoiam),'End.txt']);
@@ -143,19 +150,27 @@ while (etime(clock,t0)<1200 && ~isempty(fslave)) || ~isempty(dir(['stayalive',in
 
             disp(['Job ',fname,' on CPU ',int2str(whoiam),' completed.']);
             t0 =clock; % Re-set waiting time of 20 mins
-        catch ME
-            disp(['Job ',fname,' on CPU ',int2str(whoiam),' crashed.']);
-            fOutputVar.error = ME;
-            save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' );
-            waitbarString = fOutputVar.error.message;
-            if Parallel(ThisMatlab).Local,
-                waitbarTitle='Local ';
+        catch,
+            theerror = lasterror;
+            if strfind(theerror.message,'Master asked to break the job')
+                disp(['Job ',fname,' on CPU ',int2str(whoiam),' broken from master.']);
+                fOutputVar.message = theerror;
+                save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' )
+                delete(['P_',fname,'_',int2str(whoiam),'End.txt']);
             else
-                waitbarTitle=[Parallel(ThisMatlab).ComputerName];
+                disp(['Job ',fname,' on CPU ',int2str(whoiam),' crashed.']);
+                fOutputVar.error = theerror; % reuse the error captured at the top of the catch
+                save([ fname,'_output_',int2str(whoiam),'.mat'],'fOutputVar' );
+                waitbarString = fOutputVar.error.message;
+                if Parallel(ThisMatlab).Local,
+                    waitbarTitle='Local ';
+                else
+                    waitbarTitle=[Parallel(ThisMatlab).ComputerName];
+                end
+                fMessageStatus(NaN,whoiam,waitbarString, waitbarTitle, Parallel(ThisMatlab));
+                delete(['P_',fname,'_',int2str(whoiam),'End.txt']);
+                break
             end
-            fMessageStatus(NaN,whoiam,waitbarString, waitbarTitle, Parallel(ThisMatlab));
-            delete(['P_',fname,'_',int2str(whoiam),'End.txt']);
-            break
         end
     end
 