---
title: "Bayesian Autoregressions"
author:
  - name: Tomasz Woźniak
    url: https://github.com/donotdespair
    orcid: 0000-0003-2212-2378
  - name: Jonas Loopers Davidsen
    url: https://github.com/jonasld23
  - name: Filippo Dell'Andrea
    url: https://github.com/IoTiFeelo
  - name: Corwin Getty
    url: https://github.com/Wintersend
  - name: Ray Gomez
    url: https://github.com/rayccg
  - name: Xiaoman Guo
    url: https://github.com/mandyxmg
  - name: Manting Huang
    url: https://github.com/mantihuang
  - name: Nathan Huynh
    url: https://github.com/nathanh93
  - name: Eungwon Lee
    url: https://github.com/eung1
  - name: Thomas Kronholm Møller
    url: https://github.com/ThomasKronhol
  - name: Victoria Sonnemans
    url: https://github.com/vsonnemans
  - name: Yobin Timilsena
    url: https://github.com/yobin-tim
  - name: Django Trueman-Greinke
    url: https://github.com/DjangoTG
  - name: Hanwen Zhang
    url: https://github.com/hanwenzhang0317
execute:
  echo: false
citation:
  issued: 2023-05-25
  url: https://donotdespair.github.io/Bayesian-Autoregressions/
  doi: 10.26188/23255657
bibliography: references.bib
editor:
  markdown:
    wrap: 72
---
> **Abstract.** We present the basics of Bayesian estimation and
> inference for autoregressive models. The range of topics includes the
> natural conjugate analysis using normal-inverted-gamma 2 prior
> distribution and its extensions focusing on hierarchical modelling,
> conditional heteroskedasticity, and Student-t error terms. We focus on
> forecasting and sampling from the predictive density.
>
> **Keywords.** Autoregressions, Bayesian Inference, Forecasting,
> Heteroskedasticity, Hierarchical Modelling, Natural Conjugacy,
> Shrinkage Prior
# Autoregressions
Autoregressions are a popular class of linear models that are
particularly useful for analysing the persistence of a time series and
forecasting a random variable's unknown future values. The simplicity of
their formulation and estimation, and the range of applications in which
they prove useful, explain their continued popularity.
## The AR($p$) model
The model is set for a univariate time series whose observation at time
$t$ is denoted by $y_t$. It includes a $d$-vector $d_t$ of deterministic
terms and $p$ lags of the dependent variable on the right-hand side of
the model equation. It is complemented by an error term $u_t$ that, in
this note, is zero-mean normally distributed with variance $\sigma^2$.
Then the model equations are: \begin{align}
y_t &= \alpha_d' d_t + \alpha_1 y_{t-1} + \dots + \alpha_p y_{t-p} + u_t\\
u_t\mid d_t, y_{t-1}, \dots, y_{t-p} &\sim\mathcal{N}\left(0, \sigma^2\right)
\end{align} where $\alpha_d$ is a $d$-vector of coefficients on
deterministic terms, and parameters $\alpha_1,\dots,\alpha_p$ are
autoregressive slopes.
## Matrix notation for the model
To simplify the notation and the derivations, introduce matrix notation
for the model. Let $T$ be the available sample size for the variable
$y$. Define a $T$-vector of zeros, $\mathbf{0}_T$, the identity matrix
of order $T$, $\mathbf{I}_T$, and the $T\times1$ vectors: \begin{align}
\mathbf{y} = \begin{bmatrix} y_1\\ \vdots \\ y_T\end{bmatrix}, \quad
\text{ and }\quad
\mathbf{u} = \begin{bmatrix} u_1\\ \vdots \\ u_T\end{bmatrix},
\end{align} a $k\times1$ vector
$\mathbf{x}_t = \begin{bmatrix}d_t' & y_{t-1}&\dots& y_{t-p} \end{bmatrix}'$,
where $k=d+p$, and a $T\times k$ matrix collecting the explanatory
variables: \begin{align}
\mathbf{X} = \begin{bmatrix} \mathbf{x}_1'\\ \vdots \\ \mathbf{x}_T'\end{bmatrix}.
\end{align} Collect the parameters of the conditional mean equation in a
$k$-vector: \begin{align}
\boldsymbol\alpha = \begin{bmatrix} \alpha_d'& \alpha_1 & \dots & \alpha_p\end{bmatrix}'.
\end{align}
Then the model can be written in a concise notation as: \begin{align}
\mathbf{y} &= \mathbf{X} \boldsymbol\alpha + \mathbf{u}\\
\mathbf{u}\mid \mathbf{X} &\sim\mathcal{N}_T\left(\mathbf{0}_T, \sigma^2\mathbf{I}_T\right).
\end{align}
## Likelihood function
The model equations imply the predictive density of the data vector
$\mathbf{y}$. To see this, consider the model equation as a linear
transformation of a normal vector $\mathbf{u}$. Therefore, the data
vector follows a multivariate normal distribution given by:
\begin{align}
\mathbf{y}\mid \mathbf{X}, \boldsymbol\alpha, \sigma^2 &\sim\mathcal{N}_T\left(\mathbf{X} \boldsymbol\alpha, \sigma^2\mathbf{I}_T\right).
\end{align}
This distribution determines the shape of the likelihood function that
is defined as the sampling data density: \begin{align}
L(\boldsymbol\alpha,\sigma^2|\mathbf{y}, \mathbf{X})\equiv p\left(\mathbf{y}\mid \mathbf{X}, \boldsymbol\alpha, \sigma^2 \right).
\end{align}
The likelihood function, treated for the purpose of estimation as a
function of the parameters $\boldsymbol\alpha$ and $\sigma^2$ after the
data are plugged in for $\mathbf{y}$ and $\mathbf{X}$, is given by:
\begin{align}
L(\boldsymbol\alpha,\sigma^2|\mathbf{y}, \mathbf{X}) =
(2\pi)^{-\frac{T}{2}}\left(\sigma^2\right)^{-\frac{T}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}(\mathbf{y} - \mathbf{X}\boldsymbol\alpha)'(\mathbf{y} - \mathbf{X}\boldsymbol\alpha)\right\}.
\end{align}
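To make the likelihood concrete, the following short sketch, which is not part of the original derivations, evaluates this Gaussian log-likelihood for an AR(1) with a constant at given parameter values; the function name `ar.loglik` and the temporary objects `y.tmp`, `Y.tmp`, and `X.tmp` are illustrative only.
```{r}
#| echo: true
# Illustrative sketch: evaluate the Gaussian log-likelihood of an AR(1) with a constant
ar.loglik = function(alpha, sigma2, Y, X) {
  u = Y - X %*% alpha          # residuals y - X alpha
  as.numeric(-0.5 * nrow(Y) * log(2 * pi * sigma2) - 0.5 * crossprod(u) / sigma2)
}
# temporary data: a short simulated random walk
set.seed(1)
y.tmp = cumsum(rnorm(50))
Y.tmp = as.matrix(y.tmp[2:50])
X.tmp = cbind(1, y.tmp[1:49])
ar.loglik(alpha = c(0, 1), sigma2 = 1, Y = Y.tmp, X = X.tmp)
```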
# Natural-Conjugate Analysis
> **Authors:** Thomas Kronholm Møller & Jonas Loopers Davidsen
## Likelihood as normal-inverted gamma 2
In order to facilitate deriving the posterior distribution, the likelihood function can be rewritten in the form of a $\mathcal{NIG}2$ distribution kernel given by:
```{=tex}
\begin{align}
L(\boldsymbol\alpha,\sigma^2|\mathbf{y}, \mathbf{X}) &\propto \left(\sigma^2\right)^{-\frac{T-(k+2)+k+2}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}\left(\boldsymbol\alpha-\boldsymbol{\hat{\alpha}}\right)'\mathbf{X}'\mathbf{X}\left(\boldsymbol\alpha-\boldsymbol{\hat{\alpha}}\right) \right\}\\
&\qquad\times\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}\left(\mathbf{y}-\mathbf{X}\boldsymbol{\hat{\alpha}}\right)'\left(\mathbf{y}-\mathbf{X}\boldsymbol{\hat{\alpha}}\right) \right\},
\end{align}
```
where $\boldsymbol{\hat{\alpha}} = (\mathbf{X}'\mathbf{X})^{-1}\mathbf{X}'\mathbf{y}$ is the maximum likelihood estimator of $\boldsymbol\alpha$. It is now quite straightforward to identify the $\mathcal{NIG}2$ kernel. Recalling that the $\mathcal{NIG}2$ distribution is characterized by its four parameters, we obtain the following outcome:
```{=tex}
\begin{align}
L(\boldsymbol\alpha,\sigma^2|\mathbf{y}, \mathbf{X}) \propto \mathcal{NIG}2\left(\mu=\boldsymbol{\hat{\alpha}},\Sigma=\left(\mathbf{X}'\mathbf{X}\right)^{-1},s=\left(\mathbf{y}-\mathbf{X}\boldsymbol{\hat{\alpha}}\right)'\left(\mathbf{y}-\mathbf{X}\boldsymbol{\hat{\alpha}}\right),\nu=T-k-2\right)
\end{align}
```
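As an illustration of these quantities, and assuming the temporary objects `Y.tmp` and `X.tmp` from the sketch above, the following code computes the maximum likelihood estimator and the implied $\mathcal{NIG}2$ characterisation of the likelihood; it is a sketch, not part of the original analysis.
```{r}
#| echo: true
# Illustrative: likelihood-implied NIG2 parameters (mu = MLE, Sigma = (X'X)^{-1}, s = SSR, nu = T - k - 2)
alpha.hat = solve(crossprod(X.tmp), crossprod(X.tmp, Y.tmp))   # MLE of alpha
ssr       = as.numeric(crossprod(Y.tmp - X.tmp %*% alpha.hat)) # sum of squared residuals
list(mu = alpha.hat, Sigma = solve(crossprod(X.tmp)),
     s = ssr, nu = nrow(Y.tmp) - ncol(X.tmp) - 2)
```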
## Normal-inverted gamma 2 prior
The natural-conjugate prior distribution takes the form for the parameters implied by the likelihood function discussed above. The normal-inverted gamma 2 prior can thus be written as:
```{=tex}
\begin{align}
p(\boldsymbol\alpha,\sigma^2) &= p(\boldsymbol\alpha|\sigma^2)p(\sigma^2)\\
p(\boldsymbol\alpha|\sigma^2) &= \mathcal{N}(\underline{\boldsymbol\alpha},\sigma^2\underline{\mathbf{V}}_{\boldsymbol\alpha})\\
p(\sigma^2) & = \mathcal{IG}2(\underline{s},\underline{\nu})
\end{align}
```
Using the distributions above, we can write the kernel of the $\mathcal{NIG}2$ prior as:
```{=tex}
\begin{align}
\mathcal{NIG}2_{k}(\underline{\boldsymbol\alpha},\underline{\mathbf{V}}_{\boldsymbol\alpha}, \underline{s}, \underline{\nu}) \propto (\sigma^2)^{-\frac{\underline{\nu}+k+2}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}(\boldsymbol\alpha-\underline{\boldsymbol\alpha})'\underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}(\boldsymbol\alpha-\underline{\boldsymbol\alpha})\right\}\exp\left\{-\frac{1}{2}\frac{\underline{s}}{\sigma^2}\right\}
\end{align}
```
## Normal-inverted gamma 2 posterior
The product of the prior distribution and the likelihood function as introduced above gives the posterior distribution given by:
```{=tex}
\begin{align}
p(\boldsymbol\alpha,\sigma^2|\mathbf{y},\mathbf{X}) \propto L(\mathbf{y}|\mathbf{X}, \boldsymbol\alpha, \sigma^2)p(\boldsymbol\alpha,\sigma^2) = L(\mathbf{y}|\mathbf{X}, \boldsymbol\alpha, \sigma^2)p( \boldsymbol\alpha| \sigma^2)p(\sigma^2)
\end{align}
```
Now plugging in the expressions for the likelihood and the prior distribution:
```{=tex}
\begin{align}
p(\boldsymbol\alpha,\sigma^2|\mathbf{y},\mathbf{X}) &\propto \left(\sigma^2\right)^{-\frac{T-(k+2)+k+2}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}\left(\boldsymbol\alpha-\boldsymbol{\hat{\alpha}}\right)'\mathbf{X}'\mathbf{X}\left(\boldsymbol\alpha-\boldsymbol{\hat{\alpha}}\right) \right\}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}\left(\mathbf{y}-\mathbf{X}\boldsymbol{\hat{\alpha}}\right)'\left(\mathbf{y}-\mathbf{X}\boldsymbol{\hat{\alpha}}\right) \right\} \\
&\times (\sigma^2)^{-\frac{\underline{\nu}+k+2}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}(\boldsymbol\alpha-\underline{\boldsymbol\alpha})'\underline{\mathbf{V}}_{\boldsymbol\alpha}^{-1}(\boldsymbol\alpha-\underline{\boldsymbol\alpha})\right\}\exp\left\{-\frac{1}{2}\frac{\underline{s}}{\sigma^2}\right\}
\end{align}
```
By collecting all terms and simplifying the expressions, the joint posterior distribution can be derived to be:
```{=tex}
\begin{align}
p(\boldsymbol\alpha,\sigma^2|\mathbf{y},\mathbf{X}) &\propto (\sigma^2)^{-\frac{T+\underline{\nu}+k+2}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2} (\boldsymbol\alpha-\overline{\boldsymbol\alpha})'\overline{\mathbf{V}}_{\boldsymbol\alpha}^{-1}(\boldsymbol\alpha-\overline{\boldsymbol\alpha})\right\} \times \exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}\left(\underline{s}-\overline{\boldsymbol\alpha}'\overline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\overline{\boldsymbol\alpha}+\underline{\boldsymbol\alpha}'\underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\underline{\boldsymbol\alpha}+\mathbf{y}'\mathbf{y}\right)\right\}
\\&= (\sigma^2)^{-\frac{\overline{\nu}+k+2}{2}}\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2} (\boldsymbol\alpha-\overline{\boldsymbol\alpha})'\overline{\mathbf{V}}_{\boldsymbol\alpha}^{-1}(\boldsymbol\alpha-\overline{\boldsymbol\alpha})\right\} \times \exp\left\{-\frac{1}{2}\frac{\overline{s}}{\sigma^2}\right\}
\end{align}
```
This fully defines the joint posterior distribution, as it is in normal-inverted gamma 2 form with the corresponding four parameters:
```{=tex}
\begin{align}
p(\boldsymbol\alpha,\sigma^2|\mathbf{y},\mathbf{X}) &= \mathcal{NIG}2_{k}\left(\overline{\boldsymbol\alpha},\overline{\mathbf{V}}_{\boldsymbol\alpha},\overline{s},\overline{\nu}\right)\\
\overline{\boldsymbol\alpha} &= \overline{\mathbf{V}}_{\boldsymbol\alpha}( \underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\underline{\boldsymbol\alpha}+\mathbf{X}'\mathbf{y})\\
\overline{\mathbf{V}}_{\boldsymbol\alpha} &=\left(\underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}+\mathbf{X}'\mathbf{X}\right)^{-1} \\
\overline{s} &= \underline{s}-\overline{\boldsymbol\alpha}'\overline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\overline{\boldsymbol\alpha}+\underline{\boldsymbol\alpha}'\underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\underline{\boldsymbol\alpha}+\mathbf{y}'\mathbf{y}\\
\overline{\nu} &= \underline{\nu}+T
\end{align}
```
## Sampling draws from the posterior
We start by generating a random walk, which can be used to validate that our estimation is indeed correct.
```{r}
#| echo: true
#| message: false
#| warning: false
T = 100                                     # sample size
N = 1                                       # number of variables
y = apply(matrix(rnorm(T * N), ncol = N), 2, cumsum)   # simulate a random walk
y = as.matrix(y)
p = 1                                       # autoregressive lag order
d = 1                                       # number of deterministic terms (constant)
k = p + d                                   # number of conditional mean parameters
T = nrow(y)
Y = as.matrix(y[(p+1):T,])                  # left-hand-side vector y
X = cbind(rep(1, T - p), y[1:(T - p),])     # regressors: constant and first lag
```
Now, defining our priors for $\underline{\boldsymbol\alpha}$, $\underline{\mathbf{V}}_{\boldsymbol\alpha}$, $\underline{s}$ and $\underline{\nu}$:
```{r}
#| echo: true
#| message: false
#| warning: false
priors = list(
  alpha = as.matrix(rep(0, k)),  # prior mean of alpha
  Sigma = diag(2),               # prior covariance V_alpha (k x k)
  S     = 1,                     # prior scale s
  nu    = 3                      # prior degrees of freedom nu
)
```
and computing the function for the posterior parameters:
```{r}
#| echo: true
#| message: false
#| warning: false
posterior = function(y, priors){
  # NIG2 posterior parameters (uses Y and X defined above)
  Sigma.inv = t(X) %*% X + solve(priors$Sigma)    # V.bar^{-1} = V^{-1} + X'X
  Sigma     = solve(Sigma.inv)                    # V.bar
  alpha     = Sigma %*% (t(X) %*% Y + solve(priors$Sigma) %*% priors$alpha)   # alpha.bar
  S         = as.numeric(t(Y) %*% Y + priors$S + t(priors$alpha) %*% solve(priors$Sigma) %*% priors$alpha
                         - t(alpha) %*% Sigma.inv %*% alpha)                  # s.bar
  nu        = nrow(Y) + priors$nu                 # nu.bar = nu + T
  return(list(
    Sigma = Sigma,
    alpha = alpha,
    S     = S,
    nu    = nu
  ))
}
post = posterior(y = y, priors = priors)
```
We can then sample from the joint posterior of the parameters by direct Monte Carlo sampling from the $\mathcal{NIG}2$ distribution, as implemented below: draw $\sigma^2$ from its inverted gamma 2 marginal and then $\boldsymbol\alpha$ from the conditional normal distribution.
```{r}
#| echo: true
#| message: false
#| warning: false
posterior.draws = function(S, posterior){
  # direct sampling from the NIG2 posterior:
  # sigma^2 ~ IG2(s.bar, nu.bar), then alpha | sigma^2 ~ N(alpha.bar, sigma^2 V.bar)
  Sigma2.posterior = as.matrix(posterior$S / rchisq(S, posterior$nu))
  alpha.posterior  = simplify2array(
    lapply(1:S, function(i){
      mvtnorm::rmvnorm(1, mean = posterior$alpha, sigma = Sigma2.posterior[i,] * posterior$Sigma)
    })
  )
  output = cbind(t(alpha.posterior[1,,]), Sigma2.posterior)  # columns: alpha draws, sigma^2 draws
  return(output)
}
draws = posterior.draws(S = 1000, posterior = post)
```
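As a quick check, which is not part of the original text, the posterior means of the draws can be compared with the random-walk data-generating process, for which $\boldsymbol\alpha \approx (0, 1)'$ and $\sigma^2 \approx 1$:
```{r}
#| echo: true
# Illustrative check: posterior means of (constant, AR(1) slope, sigma^2)
round(colMeans(draws), 2)
```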
# Hierarchical Prior Analysis
## Estimating autoregressive prior shrinkage
### Inverted gamma 2 scale mixture of normal
Suppose the scalar scale $\kappa_{\boldsymbol\alpha}$ follows an inverted gamma 2 distribution:
\begin{align}
\kappa_{\boldsymbol\alpha} \sim \mathcal{IG}2 (\underline{s}_{\boldsymbol\alpha},\underline{\nu}_{\boldsymbol\alpha})
\end{align}
and the priors are:
\begin{align}
\boldsymbol\alpha | \sigma^2, \kappa_{\boldsymbol\alpha} &\sim \mathcal{N}_{k}(\underline{\boldsymbol\alpha}, \sigma^2 \kappa_{\boldsymbol\alpha} \underline{\mathbf{V}} _{\boldsymbol\alpha}) \\
\sigma^2 &\sim \mathcal{IG}2(\underline{s},\underline{\nu}) \\
p(\boldsymbol\alpha, \sigma^2, \kappa_{\boldsymbol\alpha}) &= p(\boldsymbol\alpha | \sigma^2, \kappa_{\boldsymbol\alpha}) p(\sigma^2) p(\kappa_{\boldsymbol\alpha})
\end{align}
Then, the joint distribution of $(\boldsymbol\alpha,\sigma^2)$ given $\kappa_{\boldsymbol\alpha}$ follows a normal-inverted gamma 2 distribution:
\begin{align*}
p\left(\boldsymbol\alpha,\sigma^2 \mid \kappa_{\boldsymbol\alpha}\right) = \mathcal{NIG}2_k(\underline{\boldsymbol\alpha},\kappa_{\boldsymbol\alpha} \underline{\mathbf{V}} _{\boldsymbol\alpha}, \underline{s},\underline{\nu})
\end{align*}
A **Gibbs Sampler** can be applied using the following steps:
Initialize $\kappa_{\boldsymbol\alpha}$ at $\kappa_{\boldsymbol\alpha}^{(0)}$.
| At each iteration s:
| Step 1. Draw $(\boldsymbol\alpha,\sigma^2)^{(s)} \sim p(\boldsymbol\alpha,\sigma^2|\mathbf{X},\mathbf{y},\kappa_{\boldsymbol\alpha}^{(s-1)}) = \mathcal{NIG}2(\overline{\boldsymbol\alpha},\overline{\mathbf{V}},\overline{s},\overline{\nu})$.
| Step 2. Draw $\kappa_{\boldsymbol\alpha}^{(s)} \sim p(\kappa_{\boldsymbol\alpha}|\mathbf{X},\mathbf{y},(\boldsymbol\alpha,\sigma^2)^{(s)}) = \mathcal{IG}2(\overline{s}_{\boldsymbol\alpha},\overline{\nu}_{\boldsymbol\alpha})$.
Repeat steps 1 and 2 $(S_1+S_2)$ times and discard the first $S_1$ draws.
The **full conditional posterior** of $\kappa_{\boldsymbol\alpha}$ is:
\begin{gather*}
p(\kappa_{\boldsymbol\alpha}|\mathbf{X},\mathbf{y},\boldsymbol\alpha,\sigma^2) \propto L(\mathbf{y}|\mathbf{X},\boldsymbol\alpha,\sigma^2) p(\kappa_{\boldsymbol\alpha}) p(\boldsymbol\alpha|\sigma^2,\kappa_{\boldsymbol\alpha}) p(\sigma^2) \\
\propto p(\kappa_{\boldsymbol\alpha}) p(\boldsymbol\alpha|\sigma^2,\kappa_{\boldsymbol\alpha}) \\
\propto (\kappa_{\boldsymbol\alpha})^{-\frac{\underline{\nu}_{\boldsymbol\alpha}+2}{2}} \exp \left\{ -\frac{1}{2}\frac{\underline{s}_{\boldsymbol\alpha}}{\kappa_{\boldsymbol\alpha}} \right\}
\times \det(\sigma^2 \kappa_{\boldsymbol\alpha} \underline{\mathbf{V}} _{\boldsymbol\alpha})^{-\frac{1}{2}} \exp \left\{ -\frac{1}{2} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})'(\sigma^2 \kappa_{\boldsymbol\alpha} \underline{\mathbf{V}} _{\boldsymbol\alpha})^{-1} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})\right\} \\
\propto (\kappa_{\boldsymbol\alpha})^{-\frac{\underline{\nu}_{\boldsymbol\alpha}+2+k}{2}} \exp \left\{ -\frac{1}{2}\frac{\underline{s}_{\boldsymbol\alpha}}{\kappa_{\boldsymbol\alpha}} \right\} \exp \left\{ -\frac{1}{2}\frac{1}{\kappa_{\boldsymbol\alpha}} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})' (\sigma^2 \underline{\mathbf{V}}_{\boldsymbol\alpha})^{-1} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})\right\}
\end{gather*}
in which we recognise the kernel of an inverted gamma 2 distribution with parameters:
\begin{align}
\overline{s}_{\boldsymbol\alpha} &= \underline{s}_{\boldsymbol\alpha} + (\boldsymbol\alpha-\underline{\boldsymbol\alpha})' \sigma^{-2} \underline{\mathbf{V}}_{\boldsymbol\alpha}^{-1} (\boldsymbol\alpha-\underline{\boldsymbol\alpha}) \\
\overline{\nu}_{\boldsymbol\alpha} &= \underline{\nu}_{\boldsymbol\alpha} + k
\end{align}
To sample from the $\mathcal{IG}2(s,\nu)$ distribution, the following code can be used:
```{r}
#| echo: true
# Set parameters of IG2 distribution
s <- 1
nu <- 1
# Draw one sample from IG2 distribution
sample <- s / rchisq(1, df = nu)
```
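The Gibbs sampler for this hierarchical prior is not written out in code above; the sketch below is one possible implementation of the stated algorithm, reusing `Y`, `X`, `k`, and `priors` from the earlier chunks. The function name `IG2.scale.sampler` and the hyper-parameters `s.kappa` and `nu.kappa` (mirroring the values `s` and `nu` above) are illustrative choices rather than part of the original text.
```{r}
#| echo: true
# Sketch of the Gibbs sampler for the IG2 scale mixture (illustrative only)
IG2.scale.sampler = function(S, priors, s.kappa = 1, nu.kappa = 1){
  kappa.draw   = 1                               # initial value kappa^(0)
  alpha.draws  = matrix(NA, k, S)
  sigma2.draws = rep(NA, S)
  kappa.draws  = rep(NA, S)
  for (s in 1:S){
    # Step 1: (alpha, sigma^2) | kappa ~ NIG2 with prior covariance kappa * V
    V.prior.inv = solve(kappa.draw * priors$Sigma)
    V.bar       = solve(t(X) %*% X + V.prior.inv)
    alpha.bar   = V.bar %*% (t(X) %*% Y + V.prior.inv %*% priors$alpha)
    s.bar       = as.numeric(t(Y) %*% Y + priors$S + t(priors$alpha) %*% V.prior.inv %*% priors$alpha
                             - t(alpha.bar) %*% solve(V.bar) %*% alpha.bar)
    nu.bar      = nrow(Y) + priors$nu
    sigma2.draws[s] = s.bar / rchisq(1, nu.bar)
    alpha.draws[,s] = mvtnorm::rmvnorm(1, mean = alpha.bar, sigma = sigma2.draws[s] * V.bar)
    # Step 2: kappa | alpha, sigma^2 ~ IG2(s.bar.alpha, nu.bar.alpha)
    quad        = t(alpha.draws[,s] - priors$alpha) %*% solve(sigma2.draws[s] * priors$Sigma) %*% (alpha.draws[,s] - priors$alpha)
    kappa.draw  = (s.kappa + as.numeric(quad)) / rchisq(1, nu.kappa + k)
    kappa.draws[s] = kappa.draw
  }
  list(alpha = alpha.draws, sigma2 = sigma2.draws, kappa = kappa.draws)
}
ig2.mixture.draws = IG2.scale.sampler(S = 1000, priors = priors)
```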
### Gamma scale mixture of normal
> Contributor: Yobin Timilsena
Alternatively, the scalar scale $\kappa_{\boldsymbol\alpha}$ that premultiplies the covariance matrix of the normal prior for vector $\boldsymbol\alpha$ can be assumed to have a hierarchical prior with a gamma distribution:
$$
\kappa_{\boldsymbol\alpha}|\underline s_\kappa, \underline a_\kappa \sim \mathcal{G}(\underline s_\kappa, \underline a_\kappa)
$$
The following code creates a list `kappa.priors` to store the prior hyper-parameters of $\kappa_{\boldsymbol\alpha}$.
```{r kappa priors for gamma scale mixture of normal, echo=TRUE}
kappa.priors = list(
a = 1,
s = .1
)
```
Given the likelihood and priors above, we can obtain the **full conditional posterior** of $\kappa_{\boldsymbol\alpha}$ as
$$
\begin{align*}
p(\kappa_{\boldsymbol\alpha} | \mathbf y, \mathbf X, \boldsymbol\alpha, \sigma^2) & \propto p(\kappa_{\boldsymbol\alpha} | \underline s_{\kappa}, \underline a_{\kappa}) \cdot p(\boldsymbol\alpha | \kappa_{\boldsymbol\alpha}, \sigma^2)\\
&\propto (\kappa_{\boldsymbol\alpha})^{\underline a_{\kappa} - 1} \exp \left\{ - \frac{\kappa_{\boldsymbol\alpha}}{\underline s_{\kappa}} \right\} \cdot \det(\sigma^2 \kappa_{\boldsymbol\alpha} \underline{\mathbf V}_{\boldsymbol\alpha})^{-\frac{1}{2}} \exp \left\{ -\frac{1}{2} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})'(\sigma^2 \kappa_{\boldsymbol\alpha} \underline{\mathbf{V}}_{\boldsymbol\alpha})^{-1} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})\right\} \\
& \propto (\kappa_{\boldsymbol\alpha})^{\underline a_{\kappa} - 1} \exp \left\{ - \frac{\kappa_{\boldsymbol\alpha}}{\underline s_{\kappa}} \right\} \cdot (\sigma^2)^{-\frac{k}{2}} (\kappa_{\boldsymbol\alpha})^{-\frac{k}{2}} \det(\underline{\mathbf V}_{\boldsymbol\alpha})^{-\frac{1}{2}} \exp \left\{ -\frac{1}{2 \kappa_{\boldsymbol\alpha}} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})' (\sigma^2 \underline{\mathbf{V}}_{\boldsymbol\alpha})^{-1} (\boldsymbol\alpha-\underline{\boldsymbol\alpha})\right\} \\
& \propto (\kappa_{\boldsymbol\alpha})^{\underline a_{\kappa} - \frac{k}{2} - 1} \exp \left\{ -\frac{1}{2} \left( (\boldsymbol\alpha-\underline{\boldsymbol\alpha})' (\sigma^2 \underline{\mathbf{V}}_{\boldsymbol\alpha})^{-1} (\boldsymbol\alpha-\underline{\boldsymbol\alpha}) \cdot \frac{1}{\kappa_{\boldsymbol\alpha}} + \frac{2}{\underline s_{\kappa}} \kappa_{\boldsymbol\alpha} \right) \right\}
\end{align*}
$$
which is the kernel of a Generalised Inverse Gaussian distribution with parameters:
$$
\begin{align*}
& \overline\lambda = \underline a_{\kappa} - \frac{k}{2}\\
& \overline\chi = (\boldsymbol\alpha-\underline{\boldsymbol\alpha})' (\sigma^2 \underline{\mathbf{V}}_{\boldsymbol\alpha})^{-1} (\boldsymbol\alpha-\underline{\boldsymbol\alpha}) \\
& \overline\Psi = \frac{2}{\underline s_{\kappa}}
\end{align*}
$$
Hence, the **full-conditional posterior** distribution of $\kappa_{\boldsymbol\alpha}$ follows a Generalised Inverse Gaussian distribution:
$$
\kappa_{\boldsymbol\alpha} | \mathbf y, \mathbf X, {\boldsymbol\alpha}, \sigma^2 \sim \mathcal{GIG}(\overline\lambda, \overline\chi, \overline\Psi)
$$
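To sample from a $\mathcal{GIG}(\lambda,\chi,\Psi)$ distribution, the `GIGrvg` package, also used in the sampler below, can be called as follows; the parameter values here are purely illustrative:
```{r}
#| echo: true
# Draw one sample from a GIG distribution with illustrative parameter values
GIGrvg::rgig(n = 1, lambda = 0.5, chi = 1, psi = 2)
```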
Given the full conditional posteriors of both $\kappa_{\boldsymbol\alpha}$ and $(\boldsymbol\alpha, \sigma^2)$, we can implement the **Gibbs Sampler** with the following steps:
- Initialise $\kappa_{\boldsymbol\alpha}^{(0)}$.
- At each iteration $s$:
    - Step 1. Draw $(\boldsymbol\alpha,\sigma^2)^{(s)} \sim p(\boldsymbol\alpha,\sigma^2|\mathbf{X},\mathbf{y},\kappa_{\boldsymbol\alpha}^{(s-1)}) = \mathcal{NIG}2(\overline{\boldsymbol\alpha},\overline{\mathbf{V}},\overline{s},\overline{\nu})$.
    - Step 2. Draw $\kappa_{\boldsymbol\alpha}^{(s)} \sim p(\kappa_{\boldsymbol\alpha}|\mathbf{X},\mathbf{y},(\boldsymbol\alpha,\sigma^2)^{(s)}) = \mathcal{GIG}(\overline\lambda, \overline\chi, \overline\Psi)$.
The `Gamma.sampler` function below implements the Gibbs Sampler using the aforementioned algorithm.
```{r sample from a gamma distributed scalar scale, echo=TRUE}
Gamma.sampler = function(S, posterior, priors, kappa.priors){
  set.seed(12345)
  S.burnin = 100
  S.total  = S + S.burnin
  # create objects to store posterior draws
  alpha.posterior  = array(NA, c(k, 1, S.total))
  Sigma2.posterior = rep(NA, S.total)
  kappa.posterior  = array(NA, c(1, S.total))
  # initial value for kappa
  kappa.posterior[1] = 10
  for (i in 1:S.total) {
    # Step 1: plug in the current kappa to update the NIG2 posterior parameters
    prior.Sigma.inv = solve(kappa.posterior[i] * priors$Sigma)
    Sigma.inv       = t(X) %*% X + prior.Sigma.inv
    Sigma           = solve(Sigma.inv)
    alpha.bar       = Sigma %*% (t(X) %*% Y + prior.Sigma.inv %*% priors$alpha)
    s.bar           = as.numeric(t(Y) %*% Y + priors$S + t(priors$alpha) %*% prior.Sigma.inv %*% priors$alpha
                                 - t(alpha.bar) %*% Sigma.inv %*% alpha.bar)
    # draw sigma^2 from its IG2 full conditional and alpha from the conditional normal
    Sigma2.posterior[i]  = s.bar / rchisq(n = 1, df = posterior$nu)
    alpha.posterior[,,i] = t(mvtnorm::rmvnorm(n = 1, mean = alpha.bar, sigma = Sigma2.posterior[i] * Sigma))
    # Step 2: update parameters for the kappa full conditional (GIG)
    lambda = kappa.priors$a - k/2
    chi    = t(alpha.posterior[,,i] - priors$alpha) %*% solve(Sigma2.posterior[i] * priors$Sigma) %*% (alpha.posterior[,,i] - priors$alpha)
    Psi    = 2 / kappa.priors$s
    # draw the next value for kappa from the GIG distribution
    if (i != S.total){
      kappa.posterior[i+1] = GIGrvg::rgig(n = 1, lambda = lambda, chi = chi, psi = Psi)
    }
  }
  # save output as a list, discarding the burn-in draws
  list(
    alpha.posterior  = alpha.posterior[,,((S.burnin+1):S.total)],
    Sigma2.posterior = Sigma2.posterior[((S.burnin+1):S.total)],
    kappa.posterior  = kappa.posterior[((S.burnin+1):S.total)]
  )
}
GammaScale.draws = Gamma.sampler(S = 1000, post, priors, kappa.priors)
```
We compute and display the posterior means of $(\boldsymbol\alpha, \sigma^2, \kappa_{\boldsymbol\alpha})$.
```{r compute and display alpha posterior mean, echo=TRUE}
alpha.posterior.mean = rowMeans(GammaScale.draws$alpha.posterior)
alpha.posterior.mean
```
```{r compute and display sigmasq posterior mean, echo=TRUE}
Sigma2.posterior.mean = mean(GammaScale.draws$Sigma2.posterior)
Sigma2.posterior.mean
```
```{r compute and display kappa posterior mean, echo=TRUE}
kappa.posterior.mean = mean(GammaScale.draws$kappa.posterior)
kappa.posterior.mean
```
## Estimating error term variance prior scale
In this section, we estimate the error term variance prior scale
$\underline{s}$, assuming it follows a Gamma distribution $\mathcal{G}(\underline{s}_s,\underline{a}_s)$ with scale $\underline{s}_s$ and shape $\underline{a}_s$, whose probability density function is:
$$p(\underline{s})=\Gamma(\underline{a}_s)^{-1}\underline{s}_s^{-\underline{a}_s}\underline{s}^{\underline{a}_s-1}\exp\left\{-\frac{\underline{s}}{\underline{s}_s}\right\}$$
In order to find the full conditional posterior of $\underline{s}$, write out its kernel as:
\begin{align}
p(\underline{s}|y,X,\alpha,\sigma^2) &\propto
L(y|X,\alpha,\sigma^2) \times p(\alpha|\sigma^2) \times p(\sigma^2|\underline{s}) \times p(\underline{s})\\
&\propto p(\sigma^2|\underline{s}) \times p(\underline{s})\\
&\propto \underline{s}^{\frac{\underline{\nu}}{2}}
\exp\left\{-\frac{1}{2}\frac{\underline{s}}{\sigma^2}\right\}
\underline{s}^{\underline{a}_s-1}\exp\left\{-\frac{\underline{s}}{\underline{s}_s}\right\}\\
&\propto
\underline{s}^{\frac{\underline{\nu}}{2}+\underline{a}_s-1}\exp\left\{-\frac{\underline{s}}{
\left[\left(2\sigma^2\right)^{-1} + \underline{s}_s^{-1}\right]^{-1}
} \right\}
\end{align}
from which we recognise a Gamma distribution $\mathcal{G}(\overline{s}_s, \overline{a}_s)$ with parameters:
\begin{align}
\overline{a}_s&=\frac{\underline{\nu}}{2}+\underline{a}_s\\
\overline{s}_s&=\left[\left(2\sigma^2\right)^{-1} + \underline{s}_s^{-1}\right]^{-1}
\end{align}
In order to obtain a sample from the posterior distribution we use a Gibbs sampler. We generate
random draws from the joint posterior distribution and we update them at
each iteration. In this case we exploit the following procedure:
Initialize $\underline{s}$ at $\underline{s}^{(0)}$.
At each iteration $s$:
1. Draw $(\boldsymbol\alpha,\sigma^2)^{(s)} \sim p\left(\boldsymbol\alpha,\sigma^2 \mid \mathbf{y},\mathbf{X}, \underline{s}^{(s-1)}\right)$
2. Draw $\underline{s}^{(s)} \sim p\left(\underline{s}\mid\mathbf{y},\mathbf{X},\boldsymbol\alpha^{(s)},\sigma^{2(s)}\right)$
Repeat steps 1 and 2 $(S_1 + S_2)$ times. Discard the first $S_1$
repetitions. Return the output as
$\left \{ \boldsymbol\alpha^{(s)}, \sigma^{2(s)}, \underline{s}^{(s)}\right \}^{S_1+S_2}_{s=S_1+1}$.
The following script illustrates sampling from the full conditional posterior distribution of $\underline{s}$.
```{r}
#| echo: true
#| message: false
#| warning: false
nu     = 1      # prior degrees of freedom of sigma^2
sigma2 = 10     # current draw of sigma^2
# define the prior hyper-parameters
s.prior.s <- 0.01
a.prior.s <- 1
# compute the posterior parameters
a.bar.s <- (nu / 2) + a.prior.s
s.bar.s <- 1 / (1/(2 * sigma2) + (1 / s.prior.s))
# sample from the gamma distribution using the rgamma function
s.sigma <- rgamma(1, shape = a.bar.s, scale = s.bar.s)
```
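A complete Gibbs sampler combining this step with the $\mathcal{NIG}2$ draws is not written out above; the sketch below is one possible implementation of the stated algorithm, reusing `Y`, `X`, `k`, and `priors` from earlier chunks. The function name `s.prior.sampler` and the hyper-parameter values are illustrative.
```{r}
#| echo: true
# Sketch of a Gibbs sampler for the hierarchical prior scale s (illustrative only)
s.prior.sampler = function(S, priors, s.prior.s = 0.01, a.prior.s = 1){
  s.draw       = priors$S                       # initial value of the prior scale
  alpha.draws  = matrix(NA, k, S)
  sigma2.draws = rep(NA, S)
  s.draws      = rep(NA, S)
  for (i in 1:S){
    # Step 1: (alpha, sigma^2) | s from the NIG2 posterior with prior scale s.draw
    V.bar     = solve(t(X) %*% X + solve(priors$Sigma))
    alpha.bar = V.bar %*% (t(X) %*% Y + solve(priors$Sigma) %*% priors$alpha)
    s.bar     = as.numeric(t(Y) %*% Y + s.draw + t(priors$alpha) %*% solve(priors$Sigma) %*% priors$alpha
                           - t(alpha.bar) %*% solve(V.bar) %*% alpha.bar)
    nu.bar    = nrow(Y) + priors$nu
    sigma2.draws[i] = s.bar / rchisq(1, nu.bar)
    alpha.draws[,i] = mvtnorm::rmvnorm(1, mean = alpha.bar, sigma = sigma2.draws[i] * V.bar)
    # Step 2: s | sigma^2 from its Gamma full conditional
    a.bar.s   = priors$nu / 2 + a.prior.s
    s.bar.s   = 1 / (1 / (2 * sigma2.draws[i]) + 1 / s.prior.s)
    s.draw    = rgamma(1, shape = a.bar.s, scale = s.bar.s)
    s.draws[i] = s.draw
  }
  list(alpha = alpha.draws, sigma2 = sigma2.draws, s = s.draws)
}
s.hier.draws = s.prior.sampler(S = 1000, priors = priors)
```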
## Dummy observation prior
In this extended model, the system is augmented using the *sum of coefficients* and *dummy initial observation* prior. The idea for this way of setting a prior is to generate artificial new rows that augment data matrices $\bf y$ and $\bf X$. It is equivalent to assuming a normal-inverted gamma 2 prior whose parameters are determined by dummy observations augmenting the system.
The *sum of coefficients* prior takes the average of the lagged values and adds them to the basic equation. This is because these values are assumed to be a good forecast of future observations. The *dummy initial observation* prior adds a single dummy observation such that all values are set equal to the averages of initial conditions, up to a scaling factor.
In order to practically generate the additional rows the following steps should be taken.
Firstly, the average of the initial observations, `y.bar`, needs to be calculated by taking the mean of the first $p$ values in the data. Next, values of the scaling factors need to be selected; values equal to 1 are typically chosen.
Once this has been done, the rows can be created. In the univariate case this is simple: two rows are added to vector $\bf y$, the first equal to `y.bar` divided by the first scaling factor, $\lambda_3$, the second equal to the same value divided by the second scaling factor, $\lambda_4$.
Adding rows to $\bf X$ is slightly more complicated. The additional rows are as follows. The values in the first row, corresponding to the sum of coefficients, are all equal to `y.bar` divided by $\lambda_3$, except the last column, which is equal to zero. The values in the second row, corresponding to the dummy initial observation, are all equal to `y.bar`, except the last column, which is equal to one, all of which are divided by $\lambda_4$.
The following **R** function `prior.ex()` generates vectors and matrices by which $\bf y$ and $\bf X$ should be extended.
```{r}
#| echo: true
prior.ex <- function(data, p = 1, lambda_3 = 1, lambda_4 = 1){
  N = 1                                      # univariate case
  y.bar = mean(data[1:p])                    # average of the p initial observations
  # two additional rows for y: sum-of-coefficients and dummy-initial-observation
  Y_star = rbind(y.bar/lambda_3, y.bar/lambda_4)
  # last column of the additional X rows: constant term (0 and 1/lambda_4)
  X_star = as.matrix(c(rep(0, N), 1/lambda_4))
  # prepend one column per lag, scaled by lambda_3 and lambda_4 respectively
  for (i in 1:p) {
    X_star = cbind(Y_star, X_star)
  }
  ext.data <- list(YN = Y_star, XN = X_star)
  return(ext.data)
}
```
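A minimal usage sketch, not part of the original text, shows how the rows returned by `prior.ex()` would be appended to the data matrices before computing the posterior. Note that `prior.ex()` places the constant in the last column, whereas the matrix `X` built earlier has the constant first, so the columns are reordered before binding.
```{r}
#| echo: true
# Illustrative: augment Y and X with the dummy-observation rows (p = 1)
ext   = prior.ex(y, p = 1, lambda_3 = 1, lambda_4 = 1)
Y.aug = rbind(Y, ext$YN)
X.aug = rbind(X, ext$XN[, c(2, 1)])   # reorder: constant first, lag second
dim(Y.aug); dim(X.aug)
```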
# Model Extensions
## Student-$t$ error term
A possible extension to the model is to relax the assumption of normally distributed errors. By assuming a t-distribution for the errors we can better account for excess kurtosis in the data, also referred to as 'fat tails'. Such a specification lends itself well to estimating models on data commonly found to be leptokurtic, such as financial time series.
$$ u_t \sim t(0,\sigma^2,\nu) $$
#### Scale Mixture of Normals Representation of the $t$ Distribution
Implementation of t-distributed errors can be achieved by representing the t-distribution as a 'scale mixture of normal distributions'.
If,
$$
\begin{align*}
x|\lambda &\sim \mathcal{N}(0, \lambda\sigma^2) \\
\lambda &\sim \mathcal{IG2}(s, \nu)
\end{align*}
$$
then
$$ x \sim t(0, s, \sigma^2, \nu) $$
This result can be derived by expressing the joint distribution of $x$ and $\lambda$ as
$$ p(x,\lambda) = p(x|\lambda) p(\lambda) $$
and then integrating with respect to $\lambda$
$$ p(x) = \int_0^\infty p(x|\lambda) p(\lambda) d\lambda $$
This integration produces a density that is proportional to the density of a Student-t distribution with $\nu$ degrees of freedom. Hence, $x\sim t_\nu$.
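The following short simulation, which is not part of the original text, illustrates this result numerically by comparing quantiles of draws generated through the scale mixture with those of a Student-t distribution.
```{r}
#| echo: true
# Illustrative simulation: the IG2 scale mixture of normals yields Student-t draws
set.seed(123)
nu.sim     = 5
lambda.sim = nu.sim / rchisq(1e5, df = nu.sim)            # lambda ~ IG2(s = nu, nu)
x.mix      = rnorm(1e5, mean = 0, sd = sqrt(lambda.sim))  # x | lambda ~ N(0, lambda)
# compare selected quantiles with the t distribution with nu degrees of freedom
round(rbind(mixture   = quantile(x.mix, c(.05, .25, .5, .75, .95)),
            student.t = qt(c(.05, .25, .5, .75, .95), df = nu.sim)), 2)
```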
#### Likelihood and Posteriors under Student-t errors
Implementing the scale mixture specification, the conditional distribution of $y$ now takes the following form
$$ \boldsymbol{y}|\boldsymbol{X},\boldsymbol{\alpha},\sigma^2,\lambda \sim \mathcal{N_T}(\boldsymbol{X}\boldsymbol{\alpha}, \sigma^2\lambda I_T) $$
The likelihood under this specification now becomes
$$ L(\boldsymbol{\alpha}, \sigma^2, \lambda | \boldsymbol{y}, \boldsymbol{X}) = (2\pi)^{-\frac{T}{2}} (\sigma^2)^{-\frac{T}{2}} \det(\lambda I_T)^{-\frac{1}{2}}
\exp\left\{-\frac{1}{2}\frac{1}{\sigma^2}(\boldsymbol{y - X\alpha})'(\lambda I_T)^{-1}(\boldsymbol{y - X\alpha})\right\}$$
As before, we can derive the full conditional posteriors of the parameters by multiplying the likelihood by the priors. The natural conjugacy of $\boldsymbol\alpha$ and $\sigma^2$ is preserved under this specification and therefore the joint posterior is given by
$$ p(\boldsymbol{\alpha}, \sigma^2|\boldsymbol{y, X}, \lambda) \propto L(\boldsymbol{\alpha}, \sigma^2, \lambda | \boldsymbol{y}, \boldsymbol{X}) p(\boldsymbol{\alpha}, \sigma^2) $$
Expanding and rearranging the resulting expression allows the posterior to be expressed in the form of a Normal Inverse Gamma 2 with the following moments
$$\begin{align*}
p(\boldsymbol{\alpha}, \sigma^2|\boldsymbol{\mathbf{y}, \mathbf{X}}, \lambda) &= \mathcal{NIG2}(\boldsymbol{\overline\alpha, \overline V_\alpha}, \overline s, \overline \nu) \\
\overline{\boldsymbol\alpha} &= \overline{\mathbf{V}}_{\boldsymbol\alpha}( \underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\underline{\boldsymbol\alpha}+\mathbf{X}'(\lambda I_T)^{-1}\mathbf{y})\\
\overline{\mathbf{V}}_{\boldsymbol\alpha} &=\left(\underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}+\mathbf{X}'(\lambda I_T)^{-1}\mathbf{X}\right)^{-1} \\
\overline{s} &= \underline{s}-\overline{\boldsymbol\alpha}'\overline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\overline{\boldsymbol\alpha}+\underline{\boldsymbol\alpha}'\underline{\mathbf{V}}^{-1}_{\boldsymbol\alpha}\underline{\boldsymbol\alpha}+\mathbf{y}'(\lambda I_T)^{-1}\mathbf{y}\\
\overline{\nu} &= \underline{\nu}+T
\end{align*}$$
The full conditional for $\lambda$ is also derived in a similar manner, with the posterior distribution taking the form of an Inverse Gamma 2
$$\begin{align*}
p(\lambda|\mathbf{y,X},\boldsymbol{\alpha},\sigma^2) &\propto L(\boldsymbol{\alpha}, \sigma^2, \lambda | \mathbf{y, X})p(\lambda) \\
&\propto (\sigma^2)^{-\frac{T}{2}}\det(\lambda I_T)^{-\frac{1}{2}}\exp(-\frac{1}{2}\frac{1}{\sigma^2}(\mathbf{y - X\alpha})'(\lambda I_T)^{-1}(\mathbf{y - X\alpha})) \\
&\times \lambda^{-\frac{\underline \nu_\lambda +2}{2}}\exp(-\frac{1}{2}\frac{\underline s_\lambda}{\lambda})
\end{align*}$$
Rearranging the above to express in the form of $\mathcal{IG2}$
$$\begin{align*}
p(\lambda|\mathbf{y,X},\boldsymbol{\alpha},\sigma^2) &= \mathcal{IG2}(\overline s_\lambda, \overline \nu_\lambda) \\
\overline s_\lambda &= \frac{1}{\sigma^2}(\mathbf{y - X\boldsymbol\alpha})'(\mathbf{y - X\boldsymbol\alpha}) + \underline s_\lambda \\
\overline \nu_\lambda &= T + \underline \nu_\lambda
\end{align*}$$
#### Gibbs Sampler Code for t-distributed errors
Now that we have the full conditionals for all of the parameters we can incorporate them into the Gibbs Sampler routine in order to draw a sample from the posterior densities.
The routine proceeds as follows:
Initialize $\lambda$ based on the priors for $\underline s_\lambda$ and $\underline \nu_\lambda$.
At each iteration $s$:
1. Draw $\sigma^{2(s)}$ from the $\mathcal{IG2}(\overline s, \overline\nu)$ distribution
2. Draw $\boldsymbol\alpha^{(s)}$ from the $\mathcal{N}(\overline{\boldsymbol\alpha}, \sigma^{2(s)} \overline{\mathbf{V}}_{\boldsymbol\alpha})$ distribution
3. Draw $\lambda^{(s)}$ from $\mathcal{IG2}(\overline s_\lambda, \overline\nu_\lambda)$
The sampling routine is implemented via the following R function, and the results of applying the sampler to the artificial random-walk data are displayed. For the purposes of this simulation a relatively large prior value for the degrees-of-freedom parameter is chosen, which concentrates $\lambda$ around one and thus communicates the prior belief that the errors of the simulated random walk are close to Gaussian.
```{r}
#| echo: true
#| message: false
#| warning: false
TDist.Gibbs.sampler = function(S, Y, X, priors){
  # A function to sample from the full conditionals of alpha, sigma^2 and lambda
  #
  # priors: a list containing the usual priors for alpha, Sigma, S and nu
  #         as well as additional priors for lambda hyper-parameters lambda_s and lambda_nu.
  A.prior        = priors$alpha
  A_V.prior      = priors$Sigma
  Sigma_s.prior  = priors$S
  Sigma_v.prior  = priors$nu
  lambda_s.prior = priors$lambda_s
  lambda_v.prior = priors$lambda_nu
  # initialize lambda with a draw from its IG2 prior
  lambda.draw    = lambda_s.prior/rchisq(1, lambda_v.prior)
  Sigma.posterior.draws  = array(NA, c(1,1,S))
  A.posterior.draws      = array(NA, c(k,1,S))
  lambda.posterior.draws = rep(NA,S)
  for (s in 1:S){
    lambda.inv.diag = diag(1/lambda.draw, nrow(Y))
    # NIG2 posterior parameters given the current lambda
    A_V.posterior     = solve(t(X)%*%lambda.inv.diag%*%X + solve(A_V.prior))
    A.posterior       = A_V.posterior%*%(t(X)%*%lambda.inv.diag%*%Y + solve(A_V.prior)%*%A.prior)
    Sigma_s.posterior = t(Y)%*%lambda.inv.diag%*%Y + t(A.prior)%*%solve(A_V.prior)%*%A.prior + Sigma_s.prior - t(A.posterior)%*%solve(A_V.posterior)%*%A.posterior
    Sigma_v.posterior = nrow(Y) + Sigma_v.prior
    # draw sigma^2 from IG2 and alpha from the conditional normal
    Sigma.posterior.draws[,,s] = Sigma_s.posterior / rchisq(1, df=Sigma_v.posterior)
    A.posterior.draws[,,s]     = mvtnorm::rmvnorm(1, mean=as.vector(A.posterior), sigma=as.numeric(Sigma.posterior.draws[,,s])*A_V.posterior)
    # draw lambda from its IG2 full conditional
    resid = Y - X%*%A.posterior.draws[,,s]
    lambda_s.posterior = as.numeric(t(resid)%*%resid) / as.numeric(Sigma.posterior.draws[,,s]) + lambda_s.prior
    lambda_v.posterior = nrow(Y) + lambda_v.prior
    lambda.draw = lambda_s.posterior / rchisq(1, lambda_v.posterior)
    lambda.posterior.draws[s] = lambda.draw
  }
  return(list(A.posterior.draws, Sigma.posterior.draws, lambda.posterior.draws))
}
tdist.res = TDist.Gibbs.sampler(1000, Y, X, c(priors, list(lambda_s=30, lambda_nu=30)))
#Alpha posterior mean:
round(apply(tdist.res[[1]], 1:2, mean),2)
#Sigma posterior mean:
round(mean(tdist.res[[2]]),2)
#Lambda posterior mean:
round(mean(tdist.res[[3]]),2)
```
## Estimating autoregressions after 2020
The 2020 COVID-19 pandemic significantly altered the global economic
landscape. It may be argued that the pre- and post-COVID periods are not
easily comparable, or that the most severe COVID-era observations would
best be discounted due to their warping effect on the overall data.
However, @lenza2022estimate disagree. In their paper *How to estimate a
vector autoregression after March 2020*, rather than discounting data,
they suggest that the effects of COVID be modelled as a temporary spike
in volatility. They find that volatility is most unpredictable in the
first three periods of the pandemic, and this finding holds regardless
of whether monthly or quarterly data are used. For higher-frequency data
the exact effect is unknown, though it should cover at least the first
three months of the pandemic. Thus, they propose the following change to
the standard autoregression formula:
$$y_t = \mathbf{x}_t'\boldsymbol\alpha + c_tu_t,$$
where $c_t$ is a standard deviation multiplier. That is, for every
period before COVID it takes a value of 1, for the first 3 periods of
COVID it takes values $\bar{c}_0$, $\bar{c}_1$, $\bar{c}_2$, and then in
all future periods a value of
$$c_{t*+j} = 1 + (\bar{c}_2 - 1)\rho^{j-2}, $$ where $\rho\in(0,1)$
captures the exponential decay in the value of the conditional standard
deviation towards the value one. This creates a vector that leaves the
error term unchanged before COVID, has a surge in volatility during
COVID, and then decays geometrically after COVID. This structure
approximates the observed shocks to volatility and facilitates
straightforward estimation.
By dividing both sides by $c_t$ the model equation can be rewritten as
$$\tilde{y}_t = \tilde{\mathbf{x}}_t'\boldsymbol\alpha + u_t$$ where
$\tilde{y}_t = y_t/c_t$ and $\tilde{\mathbf{x}}_t = \mathbf{x}_t/c_t$. These
simple transformations then lend themselves to analysis using whatever
estimation method is preferred.
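As a small illustration, not part of the original text, the rescaling can be carried out in R by dividing each row of the data by the corresponding element of $\mathbf{c}$; the vector `c.vec` below is a hypothetical placeholder, and the matrices `Y` and `X` are reused from the earlier chunks.
```{r}
#| echo: true
# Illustrative rescaling of the data by a given volatility vector c
c.vec   = rep(1, nrow(Y))   # hypothetical c: no volatility adjustment
y.tilde = Y / c.vec         # y.tilde_t = y_t / c_t
X.tilde = X / c.vec         # each row of X divided by c_t
```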
The main difficulty arises in the estimation of $c_t$ as it is an
unknown parameter in many cases.
### Methodology
To estimate the values for $c_0$, $c_1$, and $c_2$ define a $T$-vector
$\mathbf{c}$ of COVID volatility variables:
$$\mathbf{c}=\begin{bmatrix}1&\dots& c_0 & c_1 & c_2 & 1+(c_2-1)\rho & 1+(c_2-1)\rho^2&\dots\end{bmatrix}'$$
Intuitively, the volatility variable for the period before COVID is set
to unity. Heightened volatility during the first three quarters of COVID
is parameterized, and it is then assumed to decay at a geometric rate
beginning in the fourth quarter after the pandemic's onset.
These COVID volatility parameters are collected in a vector
$\boldsymbol\theta=(c_0\quad c_1\quad c_2\quad\rho)$ and are estimated
from their own marginal posterior as proposed by @lenza2022estimate:
$$p(\boldsymbol\theta\mid\mathbf{y},\mathbf{X})\propto p(\mathbf{y}\mid\mathbf{X},\boldsymbol\theta)p(\boldsymbol\theta)$$
where the likelihood function is given as:
$$\begin{align}p(\mathbf{y}\mid\mathbf{X},\boldsymbol\theta) &\propto \Bigg(\prod^T_{t=1}c_t^{-N}\Bigg)\underline{\mathbf{s}}^{\frac{\underline{\nu}}{2}}\det\left(\underline{\mathbf{V}}\right)^{-\frac{1}{2}}\det\left(\tilde{\mathbf{X}}'\tilde{\mathbf{X}}+\underline{\mathbf{V}}^{-1}\right)^{-\frac{1}{2}}\\
&\qquad\times\det\left(\underline{\mathbf{s}}+\hat{\tilde{\mathbf{u}}}'\hat{\tilde{\mathbf{u}}}+(\hat{\tilde{\boldsymbol\alpha}}-\underline{\boldsymbol\alpha})'\underline{\mathbf{V}}^{-1}(\hat{\tilde{\boldsymbol\alpha}}-\underline{\boldsymbol\alpha})\right)^{-\frac{T-p+\underline{\nu}}{2}},\end{align}$$
where $\tilde{\mathbf{X}} = \begin{bmatrix}\tilde{\mathbf{x}}_1' & \dots & \tilde{\mathbf{x}}_T' \end{bmatrix}'$, $\tilde{\mathbf{y}} = \begin{bmatrix}\tilde{y}_1 & \dots & \tilde{y}_T \end{bmatrix}'$, $\hat{\tilde{\mathbf{u}}} = \tilde{\mathbf{y}} - \tilde{\mathbf{X}}\hat{\tilde{\boldsymbol\alpha}}$, and
$\hat{\tilde{\boldsymbol\alpha}}=(\tilde{\mathbf{X}}'\tilde{\mathbf{X}}+\underline{\mathbf{V}}^{-1})^{-1}(\tilde{\mathbf{X}}'\tilde{\mathbf{y}}+\underline{\mathbf{V}}^{-1}\underline{\boldsymbol\alpha})$,
and the priors are assumed to be:
$$\begin{align}
\bar{c}_0,\bar{c}_1,\bar{c}_2 &\sim \mathcal{P}areto(1,1)\\
\rho&\sim \mathcal{B}eta(3,1.5)
\end{align}$$
### Algorithm with code
The first step is to sample draws of the vector of COVID volatility variables $\textbf{c}$, which is done by estimating the parameters in $\boldsymbol\theta=(\bar{c}_0,\bar{c}_1,\bar{c}_2,\rho)$, sampling from their marginal posterior via a Metropolis MCMC algorithm:
1. Initialize $\boldsymbol\theta$ at the posterior mode, which can be located
via numerical optimization. We execute this in `R` through the
following function, `c.posterior.mode`. This function takes in data
representing matrix $\textbf{y}$ and returns the values of $\boldsymbol\theta$
that minimize the negative log-posterior, as well as the
corresponding Hessian.
```{r COVID parameters posterior maximization, echo = TRUE}
c.posterior.mode <- function(data, p=4, k1=1, k2=100, start_date=c(1991,1)){
v.neglogPost <- function(theta){
N = ncol(data)
K = 1 + p*N
Y = ts(data[(p+1):nrow(data),], start=start_date, frequency=4)
T = nrow(Y)
X = matrix(1,T,1)
# nrow(X)
for (i in 1:p){
X = cbind(X,data[(p+1):nrow(data)-i,])
}
# Calculate MLE for prior
############################################################
A.hat = solve(t(X)%*%X)%*%t(X)%*%Y
Sigma.hat = t(Y-X%*%A.hat)%*%(Y-X%*%A.hat)/nrow(Y)
# Specify prior distribution
############################################################
kappa.1 = k1
kappa.2 = k2
kappa.3 = 1
A.prior = matrix(0,nrow(A.hat),ncol(A.hat))
A.prior[2:(N+1),] = kappa.3*diag(N)
V.prior = diag(c(kappa.2,kappa.1*((1:p)^(-2))%x%rep(1,N)))
S.prior = diag(diag(Sigma.hat))
nu.prior = N+1
vec <- theta[1:3]
for (i in 4:12){
vec <- c(vec, 1 + (theta[3]-1)*theta[4]^(i-3))
}
V <- c(ts(rep(1, nrow(Y)-12), c(1991,1), frequency = 4) , vec)
Y.tilde <- diag(1/V)%*%Y
X.tilde <- diag(1/V)%*%X
A.tilde.hat <- solve((t(X.tilde)%*%X.tilde+solve(V.prior)))%*%(t(X.tilde)%*%Y.tilde+solve(V.prior)%*%A.prior)
epsilon.tilde <-Y.tilde - X.tilde%*%A.tilde.hat
# Log-likelihood
logL <- log(prod(V^(-N)))+(-N/2)*log(det(t(X.tilde)%*%X.tilde+solve(V.prior)))+
(-(T-p+nu.prior)/2)*log(det(S.prior +t(epsilon.tilde)%*%epsilon.tilde +
t(A.tilde.hat-A.prior)%*%solve(V.prior)%*%(A.tilde.hat-A.prior)))
# Pareto(1,1) and Beta(3,1.5) priors
pareto.a=1
pareto.b=1
beta.a=3
beta.b=1.5
beta.cons <- 1/beta(beta.a,beta.b)
# Log-prior
logP <- log((pareto.a*pareto.b^pareto.a)/(theta[1]^(pareto.a+1))*
(pareto.a*pareto.b^pareto.a)/(theta[2]^(pareto.a+1))*
(pareto.a*pareto.b^pareto.a)/(theta[3]^(pareto.a+1))*
beta.cons*theta[4]^(beta.a-1)*(1-theta[4])^(beta.b-1))
# negative log-posterior
neglogPost <- -(logL+logP)
return(neglogPost)
}
  # numerically minimize the negative log-posterior
post.maximizer <- optim(par=c(50, 50, 50, 0.5), fn=v.neglogPost, method="L-BFGS-B",
lower=c(1, 1, 1, 0.0001),
upper=c(100,100,100,0.99999), hessian = TRUE)
return(list(maximizer=post.maximizer$par, hessian=post.maximizer$hessian))
}
```
2. Draw candidate $\boldsymbol\theta^{*}$ from $N_4(\boldsymbol\theta^{(s-1)},cW)$, where $W$
is the inverse Hessian of the negative log posterior of $\boldsymbol\theta$ at
the mode, which is also calculated computationally, and $c$ is a
scaling factor.
3. Set:
$$\boldsymbol\theta^{(s)}= \begin{cases}
\boldsymbol\theta^* & \text{with pr.} \quad \alpha^{(s)}
\\
\\
\boldsymbol\theta^{(s-1)} & \text{with pr.} \quad 1-\alpha^{(s)}
\end{cases}$$
$$\alpha^{(s)} =\text{min}\Big[1,\frac{p(\boldsymbol\theta^*|Y,X,\underline{\gamma})}{p(\boldsymbol\theta^{(s-1)}|Y,X,\underline{\gamma})}\Big]$$
4. Define $\textbf{c}^{(s)}$ matrix using $\boldsymbol\theta^{(s)}$:
$$\textbf{c}^{(s)}=[1\quad...\quad1 \quad \bar{c}_0^{(s)}\quad \bar{c}_1^{(s)}\quad \bar{c}_2^{(s)}\quad 1+(\bar{c}_2^{(s)}-1)\rho^{(s)}\quad 1+(\bar{c}_2^{(s)}-1)\rho^{(s)2}\quad...]'$$
Steps 2 to 4 are implemented via the function `mh.mcmc`, which takes in
data, the posterior mode of $\boldsymbol\theta$, and the inverse Hessian from
`c.posterior.mode`, and returns the draws of $\boldsymbol\theta$.
```{r Metropolis-Hastings function, echo = TRUE}
library(MASS)
library(coda)
mh.mcmc <- function(data, p=1, S.mh = 1000, c, W = diag(4), theta.init,
k1 = 1, k2 = 100, start_date = c(1991,1)){
# N = no. of variables
N = ncol(data)
# p = no. of lags
K = 1 + p*N
# forecast horizon
# h = 8
Y = ts(data[(p+1):nrow(data),], start=start_date, frequency=4)
T = nrow(Y)
X = matrix(1,T,1)
# nrow(X)
for (i in 1:p){
X = cbind(X,data[(p+1):nrow(data)-i,])
}
# Calculate MLE for prior
############################################################
A.hat = solve(t(X)%*%X)%*%t(X)%*%Y
Sigma.hat = t(Y-X%*%A.hat)%*%(Y-X%*%A.hat)/nrow(Y)
# Specify prior distribution
############################################################
kappa.1 = k1
kappa.2 = k2
kappa.3 = 1
A.prior = matrix(0,nrow(A.hat),ncol(A.hat))
A.prior[2:(N+1),] = kappa.3*diag(N)
V.prior = diag(c(kappa.2,kappa.1*((1:p)^(-2))%x%rep(1,N)))
S.prior = diag(diag(Sigma.hat))
nu.prior = N+1
# Metropolis-Hastings
###########################################################
# v0, v1, v2, rho
Theta <- matrix(NA,S.mh,4)
theta_old <- theta.init
set.seed(1)
for (s in 1:S.mh){
covid.vec <- function(theta){
vec <- theta[1:3]
for (i in 4:12){
vec <- c(vec, 1 + (theta[3]-1)*theta[4]^(i-3))
}
return(vec)
}
# Covid volatility likelihood kernel
v.logL <- function(V){
Y.tilde <- diag(1/V)%*%Y
X.tilde <- diag(1/V)%*%X
A.tilde.hat <- solve((t(X.tilde)%*%X.tilde+solve(V.prior)))%*%(t(X.tilde)%*%Y.tilde+solve(V.prior)%*%A.prior)
epsilon.tilde <-Y.tilde - X.tilde%*%A.tilde.hat
logL <- log(prod(V^(-N)))+(-N/2)*log(det(t(X.tilde)%*%X.tilde+solve(V.prior)))+
(-(T-p+nu.prior)/2)*log(det(S.prior +t(epsilon.tilde)%*%epsilon.tilde +
t(A.tilde.hat-A.prior)%*%solve(V.prior)%*%(A.tilde.hat-A.prior)))
return(logL)
}
# Covid volatility prior
v.logP <- function(theta, pareto.a=1, pareto.b=1, beta.a=3, beta.b=1.5){
beta.cons <- 1/beta(beta.a,beta.b)
logP <- log((pareto.a*pareto.b^pareto.a)/(theta[1]^(pareto.a+1))*
(pareto.a*pareto.b^pareto.a)/(theta[2]^(pareto.a+1))*
(pareto.a*pareto.b^pareto.a)/(theta[3]^(pareto.a+1))*
beta.cons*theta[4]^(beta.a-1)*(1-theta[4])^(beta.b-1))
return(logP)
}
v_ones <- ts(rep(1, nrow(Y)-12), c(1991,1), frequency = 4)
V.old <- c(v_ones, covid.vec(theta_old))
# New candidate parameters values
theta_new <- mvrnorm(1, theta_old, c*W)
V.new <- c(v_ones, covid.vec(theta_new))
# Calculate posteriors
v.logpost_old <- v.logL(V.old)+v.logP(theta_old)
v.logpost_new <- v.logL(V.new)+v.logP(theta_new)
# Posterior ratio
post.ratio <- exp(v.logpost_new-v.logpost_old)
# Acceptance/rejection alpha
alpha <- min(1, post.ratio)
u_star <- runif(1)
if (alpha > u_star){
Theta[s,] <- theta_new
} else {Theta[s,] <- theta_old}
theta_old <- Theta[s,]
}