-
Notifications
You must be signed in to change notification settings - Fork 0
/
swtest.m
231 lines (214 loc) · 8.77 KB
/
swtest.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
function [H, pValue, W] = swtest(x, alpha)
%SWTEST Shapiro-Wilk parametric hypothesis test of composite normality.
% [H, pValue, SWstatistic] = SWTEST(X, ALPHA) performs the
% Shapiro-Wilk test to determine if the null hypothesis of
% composite normality is a reasonable assumption regarding the
% population distribution of a random sample X. The desired significance
% level, ALPHA, is an optional scalar input (default = 0.05).
%
% The Shapiro-Wilk and Shapiro-Francia null hypothesis is:
% "X is normal with unspecified mean and variance."
%
% This is an omnibus test, and is generally considered relatively
% powerful against a variety of alternatives.
% Shapiro-Wilk test is better than the Shapiro-Francia test for
% Platykurtic sample. Conversely, Shapiro-Francia test is better than the
% Shapiro-Wilk test for Leptokurtic samples.
%
% When the series 'X' is Leptokurtic, SWTEST performs the Shapiro-Francia
% test, else (series 'X' is Platykurtic) SWTEST performs the
% Shapiro-Wilk test.
%
% [H, pValue, SWstatistic] = SWTEST(X, ALPHA)
%
% Inputs:
% X - a vector of deviates from an unknown distribution. The observation
% number must exceed 3 and less than 5000.
%
% Optional inputs:
% ALPHA - The significance level for the test (default = 0.05).
%
% Outputs:
% SWstatistic - The test statistic (non normalized).
%
% pValue - is the p-value, or the probability of observing the given
% result by chance given that the null hypothesis is true. Small values
% of pValue cast doubt on the validity of the null hypothesis.
%
% H = 0 => Do not reject the null hypothesis at significance level ALPHA.
% H = 1 => Reject the null hypothesis at significance level ALPHA.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Copyright (c) 17 March 2009 by Ahmed Ben Saïda %
% Department of Finance, IHEC Sousse - Tunisia %
% Email: [email protected] %
% $ Revision 3.0 $ Date: 18 Juin 2014 $ %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% References:
%
% - Royston P. "Remark AS R94", Applied Statistics (1995), Vol. 44,
% No. 4, pp. 547-551.
% AS R94 -- calculates Shapiro-Wilk normality test and P-value
% for sample sizes 3 <= n <= 5000. Handles censored or uncensored data.
% Corrects AS 181, which was found to be inaccurate for n > 50.
% Subroutine can be found at: http://lib.stat.cmu.edu/apstat/R94
%
% - Royston P. "A pocket-calculator algorithm for the Shapiro-Francia test
% for non-normality: An application to medicine", Statistics in Medecine
% (1993a), Vol. 12, pp. 181-184.
%
% - Royston P. "A Toolkit for Testing Non-Normality in Complete and
% Censored Samples", Journal of the Royal Statistical Society Series D
% (1993b), Vol. 42, No. 1, pp. 37-43.
%
% - Royston P. "Approximating the Shapiro-Wilk W-test for non-normality",
% Statistics and Computing (1992), Vol. 2, pp. 117-119.
%
% - Royston P. "An Extension of Shapiro and Wilk's W Test for Normality
% to Large Samples", Journal of the Royal Statistical Society Series C
% (1982a), Vol. 31, No. 2, pp. 115-124.
%
%
% Ensure the sample data is a VECTOR.
%
if numel(x) == length(x)
x = x(:); % Ensure a column vector.
else
error(' Input sample ''X'' must be a vector.');
end
%
% Remove missing observations indicated by NaN's and check sample size.
%
x = x(~isnan(x));
if length(x) < 3
error(' Sample vector ''X'' must have at least 3 valid observations.');
end
if length(x) > 5000
warning('Shapiro-Wilk test might be inaccurate due to large sample size ( > 5000).');
end
%
% Ensure the significance level, ALPHA, is a
% scalar, and set default if necessary.
%
if (nargin >= 2) && ~isempty(alpha)
if ~isscalar(alpha)
error(' Significance level ''Alpha'' must be a scalar.');
end
if (alpha <= 0 || alpha >= 1)
error(' Significance level ''Alpha'' must be between 0 and 1.');
end
else
alpha = 0.05;
end
% First, calculate the a's for weights as a function of the m's
% See Royston (1992, p. 117) and Royston (1993b, p. 38) for details
% in the approximation.
x = sort(x); % Sort the vector X in ascending order.
n = length(x);
mtilde = norminv(((1:n)' - 3/8) / (n + 1/4));
weights = zeros(n,1); % Preallocate the weights.
if kurtosis(x) > 3
% The Shapiro-Francia test is better for leptokurtic samples.
weights = 1/sqrt(mtilde'*mtilde) * mtilde;
%
% The Shapiro-Francia statistic W' is calculated to avoid excessive
% rounding errors for W' close to 1 (a potential problem in very
% large samples).
%
W = (weights' * x)^2 / ((x - mean(x))' * (x - mean(x)));
% Royston (1993a, p. 183):
nu = log(n);
u1 = log(nu) - nu;
u2 = log(nu) + 2/nu;
mu = -1.2725 + (1.0521 * u1);
sigma = 1.0308 - (0.26758 * u2);
newSFstatistic = log(1 - W);
%
% Compute the normalized Shapiro-Francia statistic and its p-value.
%
NormalSFstatistic = (newSFstatistic - mu) / sigma;
% Computes the p-value, Royston (1993a, p. 183).
pValue = 1 - normcdf(NormalSFstatistic, 0, 1);
else
% The Shapiro-Wilk test is better for platykurtic samples.
c = 1/sqrt(mtilde'*mtilde) * mtilde;
u = 1/sqrt(n);
% Royston (1992, p. 117) and Royston (1993b, p. 38):
PolyCoef_1 = [-2.706056 , 4.434685 , -2.071190 , -0.147981 , 0.221157 , c(n)];
PolyCoef_2 = [-3.582633 , 5.682633 , -1.752461 , -0.293762 , 0.042981 , c(n-1)];
% Royston (1992, p. 118) and Royston (1993b, p. 40, Table 1)
PolyCoef_3 = [-0.0006714 , 0.0250540 , -0.39978 , 0.54400];
PolyCoef_4 = [-0.0020322 , 0.0627670 , -0.77857 , 1.38220];
PolyCoef_5 = [0.00389150 , -0.083751 , -0.31082 , -1.5861];
PolyCoef_6 = [0.00303020 , -0.082676 , -0.48030];
PolyCoef_7 = [0.459 , -2.273];
weights(n) = polyval(PolyCoef_1 , u);
weights(1) = -weights(n);
if n > 5
weights(n-1) = polyval(PolyCoef_2 , u);
weights(2) = -weights(n-1);
count = 3;
phi = (mtilde'*mtilde - 2 * mtilde(n)^2 - 2 * mtilde(n-1)^2) / ...
(1 - 2 * weights(n)^2 - 2 * weights(n-1)^2);
else
count = 2;
phi = (mtilde'*mtilde - 2 * mtilde(n)^2) / ...
(1 - 2 * weights(n)^2);
end
% Special attention when n = 3 (this is a special case).
if n == 3
% Royston (1992, p. 117)
weights(1) = 1/sqrt(2);
weights(n) = -weights(1);
phi = 1;
end
%
% The vector 'WEIGHTS' obtained next corresponds to the same coefficients
% listed by Shapiro-Wilk in their original test for small samples.
%
weights(count : n-count+1) = mtilde(count : n-count+1) / sqrt(phi);
%
% The Shapiro-Wilk statistic W is calculated to avoid excessive rounding
% errors for W close to 1 (a potential problem in very large samples).
%
W = (weights' * x) ^2 / ((x - mean(x))' * (x - mean(x)));
%
% Calculate the normalized W and its significance level (exact for
% n = 3). Royston (1992, p. 118) and Royston (1993b, p. 40, Table 1).
%
newn = log(n);
if (n >= 4) && (n <= 11)
mu = polyval(PolyCoef_3 , n);
sigma = exp(polyval(PolyCoef_4 , n));
gam = polyval(PolyCoef_7 , n);
newSWstatistic = -log(gam-log(1-W));
elseif n > 11
mu = polyval(PolyCoef_5 , newn);
sigma = exp(polyval(PolyCoef_6 , newn));
newSWstatistic = log(1 - W);
elseif n == 3
mu = 0;
sigma = 1;
newSWstatistic = 0;
end
%
% Compute the normalized Shapiro-Wilk statistic and its p-value.
%
NormalSWstatistic = (newSWstatistic - mu) / sigma;
% NormalSWstatistic is referred to the upper tail of N(0,1),
% Royston (1992, p. 119).
pValue = 1 - normcdf(NormalSWstatistic, 0, 1);
% Special attention when n = 3 (this is a special case).
if n == 3
pValue = 6/pi * (asin(sqrt(W)) - asin(sqrt(3/4)));
% Royston (1982a, p. 121)
end
end
%
% To maintain consistency with existing Statistics Toolbox hypothesis
% tests, returning 'H = 0' implies that we 'Do not reject the null
% hypothesis at the significance level of alpha' and 'H = 1' implies
% that we 'Reject the null hypothesis at significance level of alpha.'
%
H = (alpha >= pValue);