corrcoef.m 13.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
function [R,sig,ci1,ci2,nan_sig] = corrcoef(X,Y,varargin)
% CORRCOEF calculates the correlation matrix from pairwise correlations.
%   The input data can contain missing values encoded with NaN.
%   Missing data (NaN's) are handled by pairwise deletion [15].
%   In order to avoid possible pitfalls, use case-wise deletion or
%   or check the correlation of NaN's with your data (see below).
%   A significance test for testing the Hypothesis
%   'correlation coefficient R is significantly different to zero'
%   is included.
%
% [...] = CORRCOEF(X);
%      calculates the (auto-)correlation matrix of X
% [...] = CORRCOEF(X,Y);
%      calculates the crosscorrelation between X and Y
%
% [...] = CORRCOEF(..., Mode);
%       Mode='Pearson' or 'parametric' [default]
%               gives the correlation coefficient
%               also known as the 'product-moment coefficient of correlation'
%               or 'Pearson''s correlation' [1]
%       Mode='Spearman'     gives 'Spearman''s Rank Correlation Coefficient'
%               This replaces SPEARMAN.M
%       Mode='Rank'         gives a nonparametric Rank Correlation Coefficient
%               This is the "Spearman rank correlation with proper handling of ties"
%               This replaces RANKCORR.M
%
% [...] = CORRCOEF(..., param1, value1, param2, value2, ... );
28
29
30
31
32
33
34
35
36
%       param       value
%       'Mode'          type of correlation
%               'Pearson','parametric'
%               'Spearman'
%               'rank'
%       'rows'          how do deal with missing values encoded as NaN's.
%               'complete': remove all rows with at least one NaN
%               'pairwise': [default]
%       'alpha'         0.01    : significance level to compute confidence interval
37
38
39
%
% [R,p,ci1,ci2,nansig] = CORRCOEF(...);
%   R is the correlation matrix
40
%       R(i,j) is the correlation coefficient r between X(:,i) and Y(:,j)
41
%  p    gives the significance of R
42
%       It tests the null hypothesis that the product moment correlation coefficient is zero
43
44
45
46
%       using Student's t-test on the statistic t = r*sqrt(N-2)/sqrt(1-r^2)
%       where N is the number of samples (Statistics, M. Spiegel, Schaum series).
%  p > alpha: do not reject the Null hypothesis: 'R is zero'.
%  p < alpha: The alternative hypothesis 'R is larger than zero' is true with probability (1-alpha).
47
48
49
%  ci1  lower (1-alpha) confidence interval
%  ci2  upper (1-alpha) confidence interval
%       If no alpha is provided, the default alpha is 0.01. This can be changed with function flag_implicit_significance.
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
%  nan_sig  p-value whether H0: 'NaN''s are not correlated' could be correct
%       if nan_sig < alpha, H1 ('NaNs are correlated') is very likely.
%
% The result is only valid if the occurence of NaN's is uncorrelated. In
% order to avoid this pitfall, the correlation of NaN's should be checked
% or case-wise deletion should be applied.
%   Case-Wise deletion can be implemented
%    ix = ~any(isnan([X,Y]),2);
%    [...] = CORRCOEF(X(ix,:),Y(ix,:),...);
%
%  Correlation (non-random distribution) of NaN's can be checked with
%       [nan_R,nan_sig]=corrcoef(X,isnan(X))
%   or  [nan_R,nan_sig]=corrcoef([X,Y],isnan([X,Y]))
%   or  [R,p,ci1,ci2] = CORRCOEF(...);
%
% Further recommandation related to the correlation coefficient:
% + LOOK AT THE SCATTERPLOTS to make sure that the relationship is linear
% + Correlation is not causation because
68
%       it is not clear which parameter is 'cause' and which is 'effect' and
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
%       the observed correlation between two variables might be due to the action of other, unobserved variables.
%
% see also: SUMSKIPNAN, COVM, COV, COR, SPEARMAN, RANKCORR, RANKS,
%       PARTCORRCOEF, flag_implicit_significance
%
% REFERENCES:
% on the correlation coefficient
% [ 1] http://mathworld.wolfram.com/CorrelationCoefficient.html
% [ 2] http://www.geography.btinternet.co.uk/spearman.htm
% [ 3] Hogg, R. V. and Craig, A. T. Introduction to Mathematical Statistics, 5th ed.  New York: Macmillan, pp. 338 and 400, 1995.
% [ 4] Lehmann, E. L. and D'Abrera, H. J. M. Nonparametrics: Statistical Methods Based on Ranks, rev. ed. Englewood Cliffs, NJ: Prentice-Hall, pp. 292, 300, and 323, 1998.
% [ 5] Press, W. H.; Flannery, B. P.; Teukolsky, S. A.; and Vetterling, W. T. Numerical Recipes in FORTRAN: The Art of Scientific Computing, 2nd ed. Cambridge, England: Cambridge University Press, pp. 634-637, 1992
% [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
% on the significance test of the correlation coefficient
% [11] http://www.met.rdg.ac.uk/cag/STATS/corr.html
% [12] http://www.janda.org/c10/Lectures/topic06/L24-significanceR.htm
% [13] http://faculty.vassar.edu/lowry/ch4apx.html
% [14] http://davidmlane.com/hyperstat/B134689.html
% [15] http://www.statsoft.com/textbook/stbasic.html%Correlations
% others
% [20] http://www.tufts.edu/~gdallal/corr.htm
% [21] Fisher transformation http://en.wikipedia.org/wiki/Fisher_transformation

%       $Id: corrcoef.m 9387 2011-12-15 10:42:14Z schloegl $
%       Copyright (C) 2000-2004,2008,2009,2011 by Alois Schloegl <alois.schloegl@gmail.com>
Stéphane Adjemian's avatar
Stéphane Adjemian committed
94
%       Copyright (C) 2014-2017 Dynare Team
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
%       This function is part of the NaN-toolbox
%       http://pub.ist.ac.at/~schloegl/matlab/NaN/

%    This program is free software: you can redistribute it and/or modify
%    it under the terms of the GNU General Public License as published by
%    the Free Software Foundation, either version 3 of the License, or
%    (at your option) any later version.
%
%    This program is distributed in the hope that it will be useful,
%    but WITHOUT ANY WARRANTY; without even the implied warranty of
%    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%    GNU General Public License for more details.
%
%    You should have received a copy of the GNU General Public License
%    along with this program.  If not, see <http://www.gnu.org/licenses/>.

% Features:
% + handles missing values (encoded as NaN's)
%       + pairwise deletion of missing data
%       + checks independence of missing values (NaNs)
% + parametric and non-parametric (rank) correlation
%       + Pearson's correlation
%       + Spearman's rank correlation
%       + Rank correlation (non-parametric, Spearman rank correlation with proper handling of ties)
% + is fast, using an efficient algorithm O(n.log(n)) for calculating the ranks
% + significance test for null-hypthesis: r=0
% + confidence interval included
% - rank correlation works for cell arrays, too (no check for missing values).
% + compatible with Octave and Matlab

global FLAG_NANS_OCCURED;

127
NARG = nargout; % needed because nargout is not reentrant in Octave, and corrcoef is recursive
128
129
130
mode = [];

if nargin==1
131
132
    Y = [];
    Mode='Pearson';
133
elseif nargin==0
134
    fprintf(2,'Error CORRCOEF: Missing argument(s)\n');
135
elseif nargin>1
136
137
138
139
140
141
    if ischar(Y)
        varg = [Y,varargin];
        Y=[];
    else
        varg = varargin;
    end
142

143
144
145
146
147
148
149
150
151
152
    if length(varg)<1
        Mode = 'Pearson';
    elseif length(varg)==1
        Mode = varg{1};
    else
        for k = 2:2:length(varg)
            mode = setfield(mode,lower(varg{k-1}),varg{k});
        end
        if isfield(mode,'mode')
            Mode = mode.mode;
153
        end
154
    end
155
156
end
if isempty(Mode), Mode='pearson'; end
157
158
159
160
Mode=[Mode,'        '];



161
FLAG_WARNING = warning;         % save warning status
162
163
164
165
warning('off');

[r1,c1]=size(X);
if ~isempty(Y)
166
167
168
169
170
171
    [r2,c2]=size(Y);
    if r1~=r2
        fprintf(2,'Error CORRCOEF: X and Y must have the same number of observations (rows).\n');
        return
    end
    NN = real(~isnan(X)')*real(~isnan(Y));
172
else
173
174
    [r2,c2]=size(X);
    NN = real(~isnan(X)')*real(~isnan(X));
175
end
176
177
178

%%%%% generate combinations using indices for pairwise calculation of the correlation
YESNAN = any(isnan(X(:))) | any(isnan(Y(:)));
179
if YESNAN
180
181
182
183
184
185
186
    FLAG_NANS_OCCURED=(1==1);
    if isfield(mode,'rows')
        if strcmp(mode.rows,'complete')
            ix = ~any([X,Y],2);
            X = X(ix,:);
            if ~isempty(Y)
                Y = Y(ix,:);
187
            end
188
189
190
191
192
193
194
            YESNAN = 0;
            NN = size(X,1);
        elseif strcmp(mode.rows,'all')
            fprintf(1,'Warning: data contains NaNs, rows=pairwise is used.');
            %%NN(NN < size(X,1)) = NaN;
        elseif strcmp(mode.rows,'pairwise')
            %%% default
195
196
197
198
        end
    end
end
if isempty(Y)
199
200
201
    IX = ones(c1)-diag(ones(c1,1));
    [jx, jy ] = find(IX);
    [jxo,jyo] = find(IX);
202
203
    R = eye(c1);
else
204
205
206
    IX = sparse([],[],[],c1+c2,c1+c2,c1*c2);
    IX(1:c1,c1+(1:c2)) = 1;
    [jx,jy] = find(IX);
207

208
209
    IX = ones(c1,c2);
    [jxo,jyo] = find(IX);
210
    R = zeros(c1,c2);
211
end
212

213
if strcmp(lower(Mode(1:7)),'pearson')
214
    % see http://mathworld.wolfram.com/CorrelationCoefficient.html
215
    if ~YESNAN
216
217
218
219
220
221
222
223
        [S,N,SSQ] = sumskipnan(X,1);
        if ~isempty(Y)
            [S2,N2,SSQ2] = sumskipnan(Y,1);
            CC = X'*Y;
            M1 = S./N;
            M2 = S2./N2;
            cc = CC./NN - M1'*M2;
            R  = cc./sqrt((SSQ./N-M1.*M1)'*(SSQ2./N2-M2.*M2));
224
        else
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
            CC = X'*X;
            M  = S./N;
            cc = CC./NN - M'*M;
            v  = SSQ./N - M.*M; %max(N-1,0);
            R  = cc./sqrt(v'*v);
        end
    else
        if ~isempty(Y)
            X  = [X,Y];
        end
        for k = 1:length(jx)
            %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
            ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
            [s,n,s2] = sumskipnan(X(ik,[jx(k),jy(k)]),1);
            v  = (s2-s.*s./n)./n;
            cc = X(ik,jx(k))'*X(ik,jy(k));
            cc = cc/n(1) - prod(s./n);
            %r(k) = cc./sqrt(prod(v));
            R(jxo(k),jyo(k)) = cc./sqrt(prod(v));
        end
245
246
    end

247
elseif strcmp(lower(Mode(1:4)),'rank')
248
    % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
249
    if ~YESNAN
250
251
        if isempty(Y)
            R = corrcoef(ranks(X));
252
        else
253
254
255
256
257
258
259
260
261
262
263
264
            R = corrcoef(ranks(X),ranks(Y));
        end
    else
        if ~isempty(Y)
            X = [X,Y];
        end
        for k = 1:length(jx)
            %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
            ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
            il = ranks(X(ik,[jx(k),jy(k)]));
            R(jxo(k),jyo(k)) = corrcoef(il(:,1),il(:,2));
        end
265
        X = ranks(X);
266
    end
267

268
elseif strcmp(lower(Mode(1:8)),'spearman')
269
270
271
272
    % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
    if ~isempty(Y)
        X = [X,Y];
    end
273

274
    n = repmat(nan,c1,c2);
275

276
277
    if ~YESNAN
        iy = ranks(X);  %  calculates ranks;
278

279
280
        for k = 1:length(jx)
            [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((iy(:,jx(k)) - iy(:,jy(k))).^2);       % NN is the number of non-missing values
281
        end
282
283
284
285
286
287
288
289
290
291
292
    else
        for k = 1:length(jx)
            %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
            ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
            il = ranks(X(ik,[jx(k),jy(k)]));
            % NN is the number of non-missing values
            [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((il(:,1) - il(:,2)).^2);
        end
        X = ranks(X);
    end
    R = 1 - 6 * R ./ (n.*(n.*n-1));
293

294
elseif strcmp(lower(Mode(1:7)),'partial')
295
    fprintf(2,'Error CORRCOEF: use PARTCORRCOEF \n',Mode);
296

297
    return
298

299
elseif strcmp(lower(Mode(1:7)),'kendall')
300
    fprintf(2,'Error CORRCOEF: mode ''%s'' not implemented yet.\n',Mode);
301

302
    return
303
else
304
    fprintf(2,'Error CORRCOEF: unknown mode ''%s''\n',Mode);
305
end
306

307
if (NARG<2)
308
309
    warning(FLAG_WARNING);  % restore warning status
    return
310
end
311
312
313
314
315


% CONFIDENCE INTERVAL
if isfield(mode,'alpha')
    alpha = mode.alpha;
316
elseif exist('flag_implicit_significance','file')
317
    alpha = flag_implicit_significance;
318
319
else
    alpha = 0.01;
320
end
321
322
323
324
325
326
% fprintf(1,'CORRCOEF: confidence interval is based on alpha=%f\n',alpha);


% SIGNIFICANCE TEST
R(isnan(R))=0;
tmp = 1 - R.*R;
327
tmp(tmp<0) = 0;         % prevent tmp<0 i.e. imag(t)~=0
328
329
t   = R.*sqrt(max(NN-2,0)./tmp);

330
if exist('t_cdf','file')
331
    sig = t_cdf(t,NN-2);
332
elseif exist('tcdf','file')>1
333
    sig = tcdf(t,NN-2);
334
else
335
336
    fprintf('CORRCOEF: significance test not completed because of missing TCDF-function\n')
    sig = repmat(nan,size(R));
337
end
338
339
340
sig  = 2 * min(sig,1 - sig);


341
if NARG<3
342
    warning(FLAG_WARNING);  % restore warning status
343
    return
344
end
345
346
347


tmp = R;
348
%tmp(ix1 | ix2) = nan;          % avoid division-by-zero warning
349
z   = log((1+tmp)./(1-tmp))/2;  % Fisher transformation [21]
350
351
                                %sz = 1./sqrt(NN-3);            % standard error of z
sz  = sqrt(2)*erfinv(1-alpha)./sqrt(NN-3);      % confidence interval for alpha of z
352
353
354
355

ci1 = tanh(z-sz);
ci2 = tanh(z+sz);

356
%ci1(isnan(ci1))=R(isnan(ci1)); % in case of isnan(ci), the interval limits are exactly the R value
357
358
%ci2(isnan(ci2))=R(isnan(ci2));

359
if (NARG<5) || ~YESNAN
360
361
    nan_sig = repmat(NaN,size(R));
    warning(FLAG_WARNING);  % restore warning status
362
363
    return
end
364
365
366
367
368
369
370
371
372

%%%%% ----- check independence of NaNs (missing values) -----
[nan_R, nan_sig] = corrcoef(X,double(isnan(X)));

% remove diagonal elements, because these have not any meaning %
nan_sig(isnan(nan_R)) = nan;
% remove diagonal elements, because these have not any meaning %
nan_R(isnan(nan_R)) = 0;

373
if 0, any(nan_sig(:) < alpha)
374
375
376
377
378
379
380
    tmp = nan_sig(:);                       % Hack to skip NaN's in MIN(X)
    min_sig = min(tmp(~isnan(tmp)));    % Necessary, because Octave returns NaN rather than min(X) for min(NaN,X)
    fprintf(1,'CORRCOFF Warning: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n', min_sig);
    fprintf(1,'   Its recommended to remove all samples (i.e. rows) with any missing value (NaN).\n');
    fprintf(1,'   The null-hypotheses (NaNs are uncorrelated) is rejected for the following parameter pair(s).\n');
    [ix,iy] = find(nan_sig < alpha);
    disp([ix,iy])
381
end
382
383
384
385

%%%%% ----- end of independence check ------

warning(FLAG_WARNING);  % restore warning status