Analysis of Financial Time Series in Stan - Chapter 8 - Multivariate Time Series Analysis and Its Applications

In this series we reproduce the models and examples from the book “Analysis of Financial Time Series, 3rd Edition”, by Ruey S. Tsay, using Stan https://mc-stan.org/ and the RStan package https://cran.r-project.org/web/packages/rstan/index.html. The main repository for the presented models and data can be found at https://github.com/marcomarconi/AFTS_with_Stan.

{
  require(tidyverse)
  require(rstan)
  rstan_options(auto_write = TRUE)
  rstan_options(javascript = FALSE)
  theme_set(theme_classic(base_size = 24))
}

8.2 VECTOR AUTOREGRESSIVE MODELS

Vector autoregressive (VAR) models are the multivariate version of AR processes; a VAR(1) is defined as follows:

\[ r_t = \phi_0 + \Phi r_{t-1} + a_{t} \] where \(\phi_0\) is a k-dimensional vector, \(\Phi\) is a k × k matrix, and {\(a_t\)} is a sequence of serially uncorrelated random vectors with mean zero and covariance matrix \(\Sigma\). In applications, the covariance matrix \(\Sigma\) is required to be positive definite. Notice that \(\Phi\) contains the lagged effect of each series on the others, while the concurrent effect between the series (that is, the simultaneous effect at lag 0) is implicitly contained in the covariance matrix \(\Sigma\). For this reason, this type of VAR model is called “reduced-form”. In time series analysis this form is preferred over the one where the concurrent relationship is stated explicitly, both for ease of estimation and because concurrent correlations cannot really be used in forecasting. Notice also that the innovations \(a_t\) are uncorrelated with past values of \(r_t\), simply because \(a_t\) is not serially correlated. The requirement for a VAR(1) process to be stationary is that all the eigenvalues of \(\Phi\) are less than 1 in modulus (for a VAR(p), the same condition applies to the companion matrix of the \(\Phi_i\)). The VAR(p) model can be naively coded in Stan as follows:

data {
  int<lower=0> N;          // number of observations
  int<lower=0> K;          // number of series
  matrix[N,K] y;
  int p;                   // AR order
}
parameters {
  vector[K] mu;            // intercept phi_0
  matrix[K,K] phi[p];      // lag coefficient matrices
  cov_matrix[K] Sigma;     // innovation covariance
}
model {
  mu ~ normal(0,1);
  Sigma ~ inv_wishart(2, diag_matrix(rep_vector(1, K)));
  for(t in (p+1):N) {
    vector[K] mu_tmp = mu;
    for(i in 1:p)
      mu_tmp += phi[i] * to_vector(y[t-i,]);
    y[t,] ~ multi_normal(mu_tmp, Sigma);
  }
}
generated quantities {
  vector[N] log_lik;
  {
    for(i in 1:p)
      log_lik[i] = multi_normal_lpdf(y[i,] | mu , Sigma);
    for (t in (p+1):(N)) {
      vector[K] mu_tmp = mu;
      for(i in 1:p)
        mu_tmp += phi[i] * to_vector(y[t-i,]);
      log_lik[t] = multi_normal_lpdf(y[t,] | mu_tmp, Sigma);
    }
  }
}
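
The stationarity condition is easy to check numerically once \(\Phi\) is estimated. A minimal sketch in R, using illustrative values for \(\Phi\) rather than draws from a fit:

Phi <- matrix(c(0.5, 0.5, 0.25, 0.5), nrow = 2, byrow = TRUE)
# a VAR(1) is stationary if all eigenvalues of Phi are less than 1 in modulus;
# for a VAR(p), apply the same check to the companion matrix of the Phi_i
Mod(eigen(Phi)$values)
## [1] 0.8535534 0.1464466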

This naive implementation is, however, very slow. We can instead take advantage of the QR decomposition described in the manual https://mc-stan.org/docs/2_27/stan-users-guide/QR-reparameterization-section.html: writing the lagged values as a design matrix \(X = Q^* R^*\), the linear predictor \(X \phi\) becomes \(Q^* \theta\) with \(\theta = R^* \phi\), so we sample \(\theta\) and recover \(\phi = R^{*-1} \theta\) in the generated quantities. This produces much faster sampling.
The VAR(p) model with QR decomposition is as follows:


data {
  int<lower=0> N;
  int<lower=0> K;
  matrix[N,K] y;
  int p;
}
transformed data {
  matrix[N, K] Q_ast;
  matrix[K, K] R_ast;
  matrix[K, K] R_ast_inverse;
  // thin and scale the QR decomposition of the lagged series
  // (a row of zeros is prepended so that row t of Q_ast corresponds to y[t-1,])
  Q_ast = qr_thin_Q(append_row(rep_row_vector(0, K), y[1:(N-1),])) * sqrt(N - 1);
  R_ast = qr_thin_R(append_row(rep_row_vector(0, K), y[1:(N-1),])) / sqrt(N - 1);
  R_ast_inverse = inverse(R_ast);
}

parameters {
  vector[K] mu;
  cov_matrix[K] Sigma;
  matrix[K,K] theta[p];    // coefficients on the Q_ast scale
}

model {
  mu ~ normal(0,1);
  Sigma ~ inv_wishart(2, diag_matrix(rep_vector(1, K)));
  for(t in (p+1):N) {
    row_vector[K] mu_tmp = to_row_vector(mu);
    for(i in 1:p)
      mu_tmp += Q_ast[t+1-i,] * theta[i]; // QR
    y[t,] ~ multi_normal(mu_tmp, Sigma);
  }
}

generated quantities {
  matrix[K,K] phi[p];
  vector[N] log_lik;

  for(i in 1:p)
    phi[i] = R_ast_inverse * theta[i]; 
    
  {
    for(i in 1:p)
      log_lik[i] = multi_normal_lpdf(y[i,] | mu , Sigma);
    for (t in (p+1):(N)) {
      row_vector[K] mu_tmp = to_row_vector(mu);
      for(i in 1:p)
        mu_tmp += Q_ast[t+1-i,] * theta[i];
      log_lik[t] = multi_normal_lpdf(y[t,] | mu_tmp, Sigma);
    }
  }
}

Example 8.4.

Take the bivariate series of monthly log returns of IBM stock and the S&P 500 index:

ibmsp2608 <- read.table("data/m-ibmsp2608.txt", header = T)
ibmsp2608$date <- ibmsp2608$date %>% as.character() %>%  as.Date(format="%Y%m%d")
ibmsp2608[,2:3] <- ibmsp2608[,2:3] * 100
par(mfrow=c(2,1), mar=c(2,5,1,4))
plot(ibmsp2608$date, ibmsp2608[,2], type="l", ylab="IBM", xlab="Date", cex.lab=2, cex.axis=1.5)
plot(ibmsp2608$date, ibmsp2608[,3], type="l", ylab="S&P 500", xlab="Date", cex.lab=2, cex.axis=1.5)

We are going to fit a VAR(5) model to the bivariate series:

VAR_QR <- stan_model("models/Chapter_8/VAR_QR.stan")
r <-  ibmsp2608[,2:3]
fit_VAR_QR <- sampling(VAR_QR, data = list(N=nrow(r), K=ncol(r), y=r, p=5), chains = 4, cores = 4, iter=500)
print(fit_VAR_QR, pars=c("mu", "Sigma", "phi"))
## Inference for Stan model: VAR_QR.
## 4 chains, each with iter=500; warmup=250; thin=1; 
## post-warmup draws per chain=250, total post-warmup draws=1000.
## 
##             mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## mu[1]       1.23    0.01 0.23  0.76  1.08  1.23  1.38  1.72  1297    1
## mu[2]       0.51    0.01 0.18  0.17  0.38  0.52  0.62  0.86  1293    1
## Sigma[1,1] 50.01    0.06 2.21 45.74 48.44 49.98 51.57 54.26  1457    1
## Sigma[1,2] 24.69    0.04 1.45 21.92 23.64 24.68 25.72 27.52  1230    1
## Sigma[2,1] 24.69    0.04 1.45 21.92 23.64 24.68 25.72 27.52  1230    1
## Sigma[2,2] 30.02    0.04 1.34 27.60 29.12 29.95 30.84 32.86  1229    1
## phi[1,1,1] -0.03    0.00 0.04 -0.11 -0.06 -0.03 -0.01  0.05  1081    1
## phi[1,1,2] -0.03    0.00 0.03 -0.09 -0.05 -0.03 -0.01  0.02  1024    1
## phi[1,2,1]  0.16    0.00 0.05  0.05  0.12  0.16  0.19  0.26   978    1
## phi[1,2,2]  0.12    0.00 0.04  0.04  0.09  0.12  0.15  0.20   994    1
## phi[2,1,1]  0.09    0.00 0.04  0.00  0.06  0.09  0.12  0.17  1161    1
## phi[2,1,2]  0.04    0.00 0.03 -0.02  0.02  0.04  0.07  0.10  1206    1
## phi[2,2,1] -0.17    0.00 0.05 -0.27 -0.20 -0.17 -0.13 -0.06  1110    1
## phi[2,2,2] -0.05    0.00 0.04 -0.13 -0.08 -0.05 -0.02  0.04  1407    1
## phi[3,1,1]  0.05    0.00 0.04 -0.03  0.02  0.05  0.07  0.13  1409    1
## phi[3,1,2]  0.02    0.00 0.03 -0.04  0.00  0.02  0.04  0.08  1228    1
## phi[3,2,1] -0.11    0.00 0.05 -0.22 -0.15 -0.11 -0.08 -0.01  1196    1
## phi[3,2,2] -0.13    0.00 0.04 -0.20 -0.15 -0.13 -0.10 -0.05  1395    1
## phi[4,1,1] -0.02    0.00 0.04 -0.11 -0.05 -0.02  0.00  0.06  1394    1
## phi[4,1,2]  0.02    0.00 0.03 -0.05  0.00  0.02  0.04  0.09  1241    1
## phi[4,2,1] -0.01    0.00 0.05 -0.11 -0.04 -0.01  0.03  0.10  1163    1
## phi[4,2,2]  0.02    0.00 0.04 -0.06 -0.01  0.02  0.05  0.10  1277    1
## phi[5,1,1] -0.05    0.00 0.04 -0.13 -0.08 -0.05 -0.03  0.03  1269    1
## phi[5,1,2] -0.07    0.00 0.03 -0.13 -0.09 -0.07 -0.05 -0.01  1227    1
## phi[5,2,1]  0.14    0.00 0.05  0.03  0.10  0.13  0.17  0.24  1371    1
## phi[5,2,2]  0.12    0.00 0.04  0.04  0.09  0.12  0.15  0.20  1313    1
## 
## Samples were drawn using NUTS(diag_e) at Sat Jun 19 16:45:31 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

The estimated parameters are very similar to Tsay’s full model of TABLE 8.4.
There is a unidirectional dynamic relationship from the monthly S&P 500 index return to the IBM return. IBM returns are affected by the past movements of the S&P 500 index. However, past movements of IBM stock returns do not significantly affect the S&P 500 index, even though the two returns have a substantial concurrent correlation.
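
We can quantify this asymmetry directly from the posterior draws. A minimal sketch, assuming the phi layout implied by the generated quantities above (phi[lag, j, k] holds the effect of lagged series j on series k, with series 1 = IBM and series 2 = the S&P 500):

phi_draws <- extract(fit_VAR_QR, pars = "phi")[[1]]  # array: draws x lag x K x K
mean(phi_draws[, 1, 2, 1] > 0)  # probability the lag-1 effect of the S&P 500 on IBM is positive
mean(phi_draws[, 1, 1, 2] > 0)  # probability the lag-1 effect of IBM on the S&P 500 is positive
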
We can plot the posterior distribution of the concurrent correlation:

Sigma <- extract(fit_VAR_QR, pars=c("Sigma"))[[1]] 
corr <- (Sigma[,1,2] / sqrt(Sigma[,1,1] * Sigma[,2,2]))
hist(corr, xlim=c(-1,1), xlab="Concurrent Correlation", cex.lab=1.5, cex.axis=1.25, main="")

We can check the adequacy of the model by testing for residual autocorrelation with the multivariate Ljung-Box test:

library(portes)
r <- ibmsp2608[,2:3]
mu_hat <- extract(fit_VAR_QR, pars=c("mu"))[[1]] %>% colMeans()
phi_hat <- extract(fit_VAR_QR, pars=c("phi"))[[1]] %>% apply(., 2, function(x) colMeans(x))
a_hat <- matrix(nrow = nrow(r), ncol = 2)   # posterior-mean residuals
for(t in 6:nrow(r)) {
  mu_tmp <- mu_hat
  for(i in 1:5)
    mu_tmp <- mu_tmp + (matrix(phi_hat[,i], ncol=2) %>% t) %*% unlist(r[t-i,])
  a_hat[t,] <- unlist(r[t,]) - mu_tmp
}
LjungBox(a_hat %>% na.omit, lags = 1:8)
##  lags statistic df   p-value
##     1  2.779259  4 0.5954183
##     2  3.732322  8 0.8804308
##     3  9.761490 12 0.6368760
##     4 12.160781 16 0.7328397
##     5 17.899709 20 0.5940149
##     6 20.262478 24 0.6817558
##     7 23.106503 28 0.7276919
##     8 32.682912 32 0.4332629

8.3 VECTOR MOVING-AVERAGE MODELS

Vector moving-average (VMA) models are the multivariate version of MA processes; a VMA(1) is defined as follows:

\[ r_t = \theta_0 + \Theta a_{t-1} + a_{t} \]

where \(\theta_0\) is a k-dimensional vector, \(\Theta\) is a k × k matrix, and {\(a_t\)} is a sequence of serially uncorrelated random vectors with mean zero and covariance matrix \(\Sigma\). It is required that the covariance matrix \(\Sigma\) exists. Since the innovations are not observed, they must be reconstructed recursively from the data as \(\epsilon_t = r_t - \theta_0 - \sum_{q} \Theta_q \epsilon_{t-q}\), which is exactly what the transformed parameters block below does. The VMA(q) model can be naively coded in Stan as follows:


data {
  int<lower=0> N;
  int<lower=0> K;
  matrix[N,K] y;
  int Q;
}

transformed data {
  vector[K] y_vector[N];
  for(i in 1:N)
    y_vector[i] = to_vector(y[i,]);
}

parameters {
  vector[K] mu;
  cov_matrix[K] Sigma;
  matrix[K,K] Theta[Q];     
}

transformed parameters {
  matrix[K,N] epsilon;       // error terms
  for (t in 1:N) {
    epsilon[,t] = y_vector[t] - mu;
    for(q in 1:min(t-1, Q))
      epsilon[,t] = epsilon[,t] -  Theta[q] * epsilon[,t-q];
  }
}
model {
  vector[K] eta[N];
  mu ~ normal(0,5);
  Sigma ~ inv_wishart(3, diag_matrix(rep_vector(2, K)));
  for(k in 1:Q)
    for(i in 1:K)
      for(j in 1:K)
        Theta[k][i,j] ~ normal(0, 1);
  for(t in 1:N) {
    eta[t] = mu;
    for(q in 1:min(t-1, Q))
      eta[t] += Theta[q] * epsilon[,t - q];  
  }
  y_vector ~ multi_normal(eta, Sigma);
}

generated quantities {
  vector[N] log_lik;
  for (t in 1:N) {
    vector[K] mu_tmp = mu;
    for(q in 1:min(t-1, Q))
      mu_tmp += Theta[q] * epsilon[,t-q];  // use the reconstructed innovations, not y
    log_lik[t] = multi_normal_lpdf(y[t,] | mu_tmp, Sigma);
  }
}

We will fit it to the same data used for the VAR model above:

VMA <- stan_model("models/Chapter_8/VMA.stan")
r <-  ibmsp2608[,2:3]
fit_VMA <- sampling(VMA, data = list(N=nrow(r), K=ncol(r), y=r, Q=5), chains = 4, cores = 4, iter=500, init=0)
print(fit_VMA, pars=c("mu", "Sigma", "Theta"))
## Inference for Stan model: VMA.
## 4 chains, each with iter=500; warmup=250; thin=1; 
## post-warmup draws per chain=250, total post-warmup draws=1000.
## 
##               mean se_mean   sd  2.5%   25%   50%   75% 97.5% n_eff Rhat
## mu[1]         1.34    0.01 0.23  0.87  1.20  1.34  1.48  1.80  1361    1
## mu[2]         0.58    0.00 0.18  0.21  0.47  0.58  0.71  0.95  1493    1
## Sigma[1,1]   49.87    0.05 2.21 45.73 48.39 49.79 51.25 54.57  1689    1
## Sigma[1,2]   24.62    0.04 1.44 22.09 23.56 24.56 25.61 27.38  1461    1
## Sigma[2,1]   24.62    0.04 1.44 22.09 23.56 24.56 25.61 27.38  1461    1
## Sigma[2,2]   29.91    0.03 1.30 27.57 29.02 29.87 30.75 32.60  1441    1
## Theta[1,1,1] -0.03    0.00 0.04 -0.11 -0.06 -0.03  0.00  0.05   906    1
## Theta[1,1,2]  0.16    0.00 0.05  0.06  0.12  0.16  0.19  0.26   922    1
## Theta[1,2,1] -0.03    0.00 0.03 -0.09 -0.05 -0.03 -0.01  0.03   801    1
## Theta[1,2,2]  0.12    0.00 0.04  0.04  0.09  0.12  0.15  0.20   883    1
## Theta[2,1,1]  0.08    0.00 0.04  0.00  0.05  0.08  0.10  0.16   855    1
## Theta[2,1,2] -0.15    0.00 0.05 -0.25 -0.18 -0.14 -0.11 -0.04   888    1
## Theta[2,2,1]  0.03    0.00 0.03 -0.03  0.01  0.03  0.05  0.10   819    1
## Theta[2,2,2] -0.03    0.00 0.04 -0.11 -0.06 -0.03  0.00  0.05   739    1
## Theta[3,1,1]  0.05    0.00 0.04 -0.04  0.02  0.05  0.08  0.13   885    1
## Theta[3,1,2] -0.12    0.00 0.06 -0.23 -0.16 -0.12 -0.08 -0.02   929    1
## Theta[3,2,1]  0.02    0.00 0.03 -0.05 -0.01  0.01  0.04  0.08   940    1
## Theta[3,2,2] -0.13    0.00 0.04 -0.21 -0.15 -0.13 -0.10 -0.05  1002    1
## Theta[4,1,1] -0.02    0.00 0.04 -0.10 -0.05 -0.02  0.01  0.06   794    1
## Theta[4,1,2] -0.04    0.00 0.05 -0.15 -0.08 -0.05 -0.01  0.06   713    1
## Theta[4,2,1]  0.03    0.00 0.03 -0.04  0.00  0.03  0.05  0.09   641    1
## Theta[4,2,2] -0.02    0.00 0.04 -0.11 -0.05 -0.02  0.01  0.06   662    1
## Theta[5,1,1] -0.04    0.00 0.04 -0.11 -0.07 -0.04 -0.01  0.04   686    1
## Theta[5,1,2]  0.13    0.00 0.05  0.03  0.10  0.13  0.17  0.24   760    1
## Theta[5,2,1] -0.05    0.00 0.03 -0.11 -0.07 -0.05 -0.03  0.01   692    1
## Theta[5,2,2]  0.13    0.00 0.04  0.05  0.10  0.13  0.16  0.21   743    1
## 
## Samples were drawn using NUTS(diag_e) at Sat Jun 26 14:21:12 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

The estimates are similar to Tsay’s in TABLE 8.6, except that the signs of all the \(\Theta\)s are inverted. The reason is the sign convention: Tsay writes the VMA model with negative moving-average matrices, \(r_t = \theta_0 + a_t - \Theta_1 a_{t-1} - \dots - \Theta_q a_{t-q}\), while our model uses plus signs, so the estimated coefficients differ by a factor of \(-1\).

8.4 VECTOR ARMA MODELS

Vector ARMA models are the multivariate version of ARMA processes; a VARMA(1,1) is defined as follows:

\[ r_t = \phi_0 + \Phi r_{t-1} + \Theta a_{t-1} + a_{t} \]

where \(\phi_0\) is a k-dimensional vector, \(\Phi\) and \(\Theta\) are k × k matrices, and {\(a_t\)} is a sequence of serially uncorrelated random vectors with mean zero and covariance matrix \(\Sigma\). It is required that the covariance matrix \(\Sigma\) exists. As in the VMA case, the innovations are reconstructed recursively, here as \(a_t = r_t - \phi_0 - \Phi r_{t-1} - \Theta a_{t-1}\). The VARMA(1,1) model can be naively coded in Stan as follows:

data {
  int<lower=1> N;          // num observations
  int K;                   // matrices size
  matrix[N,K] y;           // observed outputs
}
parameters {
  vector[K] mu;            // mean coeff
  matrix[K,K] phi;         // autoregression coeff
  matrix[K,K] theta;       // moving avg coeff
  cov_matrix[K] Sigma;
}
model {
  vector[K] err[N];
  mu ~ normal(0, 10);
  for(i in 1:K)
    for(j in 1:K) {
        phi[i,j] ~ normal(0, 1);
        theta[i,j] ~ normal(0, 1);
    }
  Sigma ~ inv_wishart(3, diag_matrix(rep_vector(2, K)));
  // initialize by approximating the pre-sample value y[0] with mu
  err[1] = to_vector(y[1,]) - (mu + phi * mu);
  for (t in 2:N)
    err[t] = to_vector(y[t]) - (mu + phi * to_vector(y[t-1]) + theta * err[t-1]);
  err ~ multi_normal(rep_vector(0, K), Sigma);
}

generated quantities {
  vector[N] log_lik;
  vector[K] err[N];
  {
      err[1] = to_vector(y[1,]) - (mu + phi * mu);
      log_lik[1] = multi_normal_lpdf(err[1] | rep_vector(0, K), Sigma);
      for (t in 2:N) {
        err[t] = to_vector(y[t]) - (mu + phi * to_vector(y[t-1]) + theta * err[t-1]);
        log_lik[t] = multi_normal_lpdf(err[t] | rep_vector(0, K), Sigma);
      }
  }
}

However, we can instead use a reparametrized form where we decompose the covariance matrix \(\Sigma\) into a diagonal matrix of the conditional standard deviations and a conditional correlation matrix:
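
\[ \Sigma = \operatorname{diag}(\tau)\, \Omega\, \operatorname{diag}(\tau) \]

where \(\tau\) is the vector of conditional standard deviations and \(\Omega\) the conditional correlation matrix. This is what quad_form_diag(Omega, tau) computes in the transformed parameters block, and it lets us place separate priors on the scales (exponential) and on the correlations (LKJ):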

data {
  int<lower=1> N;            // num observations
  int K;                  // Matrices size 
  matrix[N,K] y;                // observed outputs

}

transformed data {
  matrix[K,N] y_vector; 
  y_vector = y';
}

parameters {
  vector[K] mu;                   // mean coeff
  matrix[K,K] phi;                  // autoregression coeff
  matrix[K,K] theta;                // moving avg coeff
  corr_matrix[K] Omega;
  vector<lower=0>[K] tau;
}

transformed parameters {
  vector[K] epsilon[N];
  matrix[K,K] Sigma;
  Sigma = quad_form_diag(Omega, tau);
  epsilon[1] = y_vector[,1] - (mu + phi * mu);  // approximate the pre-sample value with mu
  for (t in 2:N) 
    epsilon[t] = y_vector[,t] - (mu + phi * y_vector[,t-1] + theta * epsilon[t-1]);
}
model {
  mu ~ normal(0, 10);
  for(i in 1:K)
    for(j in 1:K) {
        phi[i,j] ~ normal(0, 1);
        theta[i,j] ~ normal(0, 1);
    }
  tau ~ exponential(1);
  Omega ~ lkj_corr(2);
  epsilon ~ multi_normal(rep_vector(0, K), Sigma);
}

generated quantities {
  vector[N] log_lik;
  vector[K] err[N];
  {
      err[1] = y_vector[,1] - (mu + phi * mu);
      log_lik[1] = multi_normal_lpdf(err[1] | rep_vector(0, K), Sigma);
      for (t in 2:N) {
        err[t] = y_vector[,t] - (mu + phi * y_vector[,t-1] + theta * err[t-1]);
        log_lik[t] = multi_normal_lpdf(err[t] | rep_vector(0, K), Sigma);
      }
      }
  }

}

We are going to apply the VARMA model to two U.S. monthly interest rate series from April 1953 to January 2001. The first series is the 1-year Treasury constant maturity rate, and the second series is the 3-year Treasury constant maturity rate. There are 574 observations. To ensure the positivity of U.S. interest rates, we analyze the log series:

r <- read.table("data/m-gs1n3-5301.txt", header = F)
colnames(r) <- c("1-year", "3-year", "date")
par(mfrow=c(2,1), mar=c(2,5,1,4))
plot(log(r[,1]), type="l", ylab="1-year", xlab="Date", cex.lab=2, cex.axis=1.5)
plot(log(r[,2]), type="l", ylab="3-year", xlab="Date", cex.lab=2, cex.axis=1.5)

We will fit the VARMA(1,1) model:

VARMA_r <- stan_model("models/Chapter_8/VARMA_reparametrized.stan")
fit_VARMA <- sampling(VARMA_r, data = list(N=nrow(r), K=2, y=log(r[,c(1,2)])), chains = 4, cores = 4, iter=500, init=0)
print(fit_VARMA, pars=c("mu", "phi", "theta", "Sigma", "Omega", "tau"))
## Inference for Stan model: VARMA_reparametrized.
## 4 chains, each with iter=500; warmup=250; thin=1; 
## post-warmup draws per chain=250, total post-warmup draws=1000.
## 
##             mean se_mean   sd  2.5%   25%   50%  75% 97.5% n_eff Rhat
## mu[1]      -0.01    0.00 0.02 -0.06 -0.03 -0.01 0.00  0.03   242 1.01
## mu[2]       0.01    0.00 0.02 -0.03  0.00  0.01 0.02  0.04   215 1.02
## phi[1,1]    0.90    0.00 0.05  0.81  0.87  0.89 0.93  1.00   187 1.03
## phi[1,2]    0.11    0.00 0.05 -0.01  0.07  0.11 0.14  0.21   180 1.03
## phi[2,1]   -0.03    0.00 0.04 -0.10 -0.06 -0.03 0.00  0.05   181 1.03
## phi[2,2]    1.02    0.00 0.05  0.93  0.99  1.02 1.05  1.11   170 1.03
## theta[1,1]  0.39    0.01 0.10  0.18  0.32  0.39 0.45  0.58   313 1.01
## theta[1,2] -0.04    0.01 0.12 -0.26 -0.12 -0.04 0.03  0.20   317 1.00
## theta[2,1]  0.18    0.00 0.09  0.00  0.12  0.17 0.23  0.34   332 1.01
## theta[2,2]  0.10    0.01 0.10 -0.09  0.03  0.10 0.17  0.31   335 1.01
## Sigma[1,1]  0.01    0.00 0.00  0.00  0.00  0.01 0.01  0.01   451 1.00
## Sigma[1,2]  0.00    0.00 0.00  0.00  0.00  0.00 0.00  0.00   423 1.00
## Sigma[2,1]  0.00    0.00 0.00  0.00  0.00  0.00 0.00  0.00   423 1.00
## Sigma[2,2]  0.00    0.00 0.00  0.00  0.00  0.00 0.00  0.00   472 1.00
## Omega[1,1]  1.00     NaN 0.00  1.00  1.00  1.00 1.00  1.00   NaN  NaN
## Omega[1,2]  0.92    0.00 0.01  0.90  0.91  0.92 0.92  0.93   514 1.00
## Omega[2,1]  0.92    0.00 0.01  0.90  0.91  0.92 0.92  0.93   514 1.00
## Omega[2,2]  1.00    0.00 0.00  1.00  1.00  1.00 1.00  1.00   125 1.00
## tau[1]      0.07    0.00 0.00  0.07  0.07  0.07 0.07  0.08   452 1.00
## tau[2]      0.06    0.00 0.00  0.06  0.06  0.06 0.06  0.07   472 1.00
## 
## Samples were drawn using NUTS(diag_e) at Sat Jun 26 15:57:25 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

We can check the adequacy of the fit by looking at the residuals:

err <- extract(fit_VARMA, pars=c("err"))[[1]] %>% colMeans()
colnames(err) <- c("1-year", "3-year")
plot.ts(err[-1,], main="Residuals", cex.axis=1.5, cex.lab=1.5, xlab="")

The interest rate series are highly contemporaneously correlated. The concurrent correlation coefficient is:

Omega <- extract(fit_VARMA, pars=c("Omega"))[[1]] 
hist(Omega[,1,2], xlim=c(-1,1), xlab="Concurrent Correlation", cex.lab=1.5, cex.axis=1.25, main="")

8.5 UNIT-ROOT NONSTATIONARITY AND COINTEGRATION

Cointegration is a popular econometric concept that could deserve its own dissertation. Two or more unit-root nonstationary series are cointegrated if some linear combination of them is stationary. Cointegration is needed to represent long-term relationships between two or more time series. In fact, you might be tempted to use a typical VAR model, as seen before, to produce two long-term related time series:

library(mvtnorm)
set.seed(1986)
N <- 200
Phi <- matrix(c(0.5, 0.5, 0.25, 0.5), nrow=2, byrow=TRUE)
Sigma <- matrix(c(3.58,2.5,2.5,2.19), nrow=2)
a <- rmvnorm(N, c(0,0), Sigma)
r <- matrix(nrow = N, ncol = 2)
r[1,] <- 0
for(t in 2:N)
  r[t,] <- Phi %*% r[t-1,] + a[t,]
par(mfrow=c(2,1))
matplot(r, type="l", ylab="")
matplot(apply(r, 2, cumsum), type="l", ylab="")

library(forecast)   # provides Acf
Acf(apply(r, 2, cumsum))

You can see, however, that despite a clear correlation between the differences of the time series, the corresponding unit-root nonstationary time series diverge pretty quickly. Cointegration ensures a long-term relationship between the two time series along a common trend.
Engle and Granger (1987) discuss an error correction representation for a cointegrated system. In this example we are going to implement a simple vector error correction model (VECM), using the same formulation as the VECM function of the tsDyn package. It can be coded in Stan as follows:
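
A minimal sketch of such a model, written to match the tsDyn VECM formulation

\[ \Delta y_t = c_0 + \alpha \, (y_{1,t-1} - \beta\, y_{2,t-1}) + \sum_{i=1}^{P} \Phi_i\, \Delta y_{t-i} + a_t \]

with cointegrating vector \((1, -\beta)\) and \(P\) lagged differences (the priors here are illustrative choices, not prescriptions):

data {
  int<lower=1> N;
  int<lower=1> K;           // number of series (2 in this example)
  matrix[N,K] y;
  int<lower=0> P;           // number of lagged differences
}
transformed data {
  matrix[N-1,K] dy;         // first differences: dy[t-1,] = y[t,] - y[t-1,]
  for (t in 2:N)
    dy[t-1,] = y[t,] - y[t-1,];
}
parameters {
  vector[K] c0;             // intercepts
  vector[K] alpha;          // loadings on the error correction term
  real beta;                // cointegrating vector is (1, -beta)
  matrix[K,K] phi[P];       // short-run dynamics
  cov_matrix[K] Sigma;
}
model {
  c0 ~ normal(0, 1);
  alpha ~ normal(0, 1);
  beta ~ normal(1, 1);
  Sigma ~ inv_wishart(K + 1, diag_matrix(rep_vector(1, K)));
  for (t in (P+2):N) {
    // bivariate error correction term, as in the example
    real ect = y[t-1,1] - beta * y[t-1,2];
    vector[K] mu_tmp = c0 + alpha * ect;
    for (i in 1:P)
      mu_tmp += phi[i] * to_vector(dy[t-1-i,]);
    dy[t-1,] ~ multi_normal(mu_tmp, Sigma);
  }
}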

ECM <- stan_model("models/Chapter_8/ECM.stan")   # path follows the repository layout used above
fit_ECM_treasury <- sampling(ECM, data = list(N=nrow(r), K=ncol(r), y=r, P=2), chains = 4, cores = 4, init=0)

Now, we consider two weekly U.S. short-term interest rates. The series are the 3-month Treasury bill (TB) rate and 6-month Treasury bill rate from December 12, 1958, to August 6, 2004, for 2383 observations.

We will fit the VECM to this data, and compare it with the result obtained with the tsDyn package:

print(fit_ECM_treasury)
## Inference for Stan model: ECM.
## 4 chains, each with iter=2000; warmup=1000; thin=1; 
## post-warmup draws per chain=1000, total post-warmup draws=4000.
## 
##               mean se_mean   sd    2.5%     25%     50%     75%   97.5% n_eff
## alpha[1]     -0.09    0.00 0.02   -0.13   -0.11   -0.09   -0.08   -0.05  2357
## alpha[2]     -0.02    0.00 0.02   -0.06   -0.03   -0.02   -0.01    0.01  2396
## beta          1.01    0.00 0.01    0.99    1.01    1.01    1.02    1.03  3551
## c0[1]        -0.02    0.00 0.01   -0.04   -0.03   -0.02   -0.02   -0.01  2565
## c0[2]         0.00    0.00 0.01   -0.02   -0.01    0.00    0.00    0.01  2658
## phi[1,1,1]    0.05    0.00 0.05   -0.04    0.01    0.05    0.08    0.14  1786
## phi[1,1,2]    0.27    0.00 0.05    0.16    0.23    0.27    0.30    0.37  1809
## phi[1,2,1]   -0.04    0.00 0.04   -0.12   -0.07   -0.04   -0.01    0.04  1789
## phi[1,2,2]    0.32    0.00 0.05    0.22    0.28    0.32    0.35    0.41  1749
## phi[2,1,1]   -0.21    0.00 0.05   -0.30   -0.24   -0.21   -0.17   -0.11  1627
## phi[2,1,2]    0.25    0.00 0.06    0.14    0.22    0.25    0.29    0.36  1637
## phi[2,2,1]   -0.03    0.00 0.04   -0.12   -0.06   -0.03    0.00    0.05  1686
## phi[2,2,2]    0.10    0.00 0.05    0.00    0.07    0.10    0.13    0.20  1666
## Sigma[1,1]    0.04    0.00 0.00    0.04    0.04    0.04    0.04    0.04  3762
## Sigma[1,2]    0.03    0.00 0.00    0.03    0.03    0.03    0.03    0.04  3696
## Sigma[2,1]    0.03    0.00 0.00    0.03    0.03    0.03    0.03    0.04  3696
## Sigma[2,2]    0.03    0.00 0.00    0.03    0.03    0.03    0.03    0.03  3671
## lp__       7569.47    0.07 2.90 7562.71 7567.79 7569.80 7571.59 7574.08  1599
##            Rhat
## alpha[1]      1
## alpha[2]      1
## beta          1
## c0[1]         1
## c0[2]         1
## phi[1,1,1]    1
## phi[1,1,2]    1
## phi[1,2,1]    1
## phi[1,2,2]    1
## phi[2,1,1]    1
## phi[2,1,2]    1
## phi[2,2,1]    1
## phi[2,2,2]    1
## Sigma[1,1]    1
## Sigma[1,2]    1
## Sigma[2,1]    1
## Sigma[2,2]    1
## lp__          1
## 
## Samples were drawn using NUTS(diag_e) at Tue Jun 29 23:23:59 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
library(tsDyn)
summary(VECM(r, 2, estim="ML"))
## #############
## ###Model VECM 
## #############
## Full sample size: 2383   End sample size: 2380
## Number of variables: 2   Number of estimated slope parameters 12
## AIC -19907.66    BIC -19832.58   SSR 173.3685
## Cointegrating vector (estimated by ML):
##    X3m       X6m
## r1   1 -1.012439
## 
## 
##              ECT                 Intercept           X3m -1             
## Equation X3m -0.0949(0.0199)***  -0.0217(0.0061)***  0.0466(0.0480)     
## Equation X6m -0.0211(0.0179)     -0.0051(0.0055)     -0.0419(0.0432)    
##              X6m -1            X3m -2              X6m -2           
## Equation X3m 0.2650(0.0538)*** -0.2067(0.0481)***  0.2547(0.0543)***
## Equation X6m 0.3164(0.0484)*** -0.0346(0.0433)     0.0994(0.0488)*

The results are very similar. We can also compute and inspect the residuals:

N <- nrow(r)
r <- as.matrix(r)
pars <- extract(fit_ECM_treasury)
c0 <- colMeans(pars$c0)
alpha <- colMeans(pars$alpha)
beta <- mean(pars$beta)
phi <- apply(pars$phi, c(2,3,4), mean)
residuals <- matrix(nrow = N, ncol = 2)
residuals[1:3,] <- 0
for(t in 4:N)  # residual = observed change minus fitted change
  residuals[t,] <- (r[t,] - r[t-1,]) - (c0 + (alpha %*% (c(1,-beta) %*% r[t-1,])) + phi[,,1] %*% (r[t-1,] - r[t-2,]) + phi[,,2] %*% (r[t-2,] - r[t-3,]))
plot.ts(residuals, cex.axis=1.5, cex.lab=1.5, xlab="")

A further analysis of the cointegrating residuals reveals how the distance between the two series evolves over time:

r_hat <- matrix(nrow = N, ncol = 2)
r_hat[1:3,] <- 0
for(t in 4:N)
  r_hat[t,] <- r[t-1,] + c0 + (alpha %*% (c(1,-beta) %*% r[t-1,])) + phi[,,1] %*% (r[t-1,] - r[t-2,]) + phi[,,2] %*% (r[t-2,] - r[t-3,]) 
plot(r_hat[,1] - r_hat[,2] , type="l", cex.axis=1.5, cex.lab=1.5, xlab="", ylab="Cointegrating residuals")

8.8 PAIRS TRADING

We are going to use cointegration to show a simple example of pairs trading. This kind of strategy involves assets that tend to move along a common trend; it exploits divergences along the path by selling the overpriced asset and buying the underpriced one. The difference between the two observed prices is called the spread. For pairs trading, the greater the spread, the larger the magnitude of mispricing and the greater the profit potential. See Vidyamurthy (2004), Pole (2007), Chan (2013) and https://hudsonthames.org/arbitragelab/ for more info.
To demonstrate pairs trading, Tsay considers two stocks traded on the New York Stock Exchange. The two companies are BHP Billiton Ltd. of Australia and Vale S.A. of Brazil, with stock symbols BHP and VALE, respectively:

BHP <- read.table("data/d-bhp0206.txt", header=T)
VALE <- read.table("data/d-vale0206.txt", header=T)
Stocks <- data.frame(BHP=BHP$adjclose %>% log, VALE=VALE$adjclose %>% log)
matplot(Stocks, ylab="Log price", type="l", col=c("black", "red"), cex.axis=1.5, cex.lab=1.5, xlab=""); 
legend('topleft',legend = colnames(Stocks), col = c('black','red'), lty=1:2, cex=1.5)

A simple linear regression already suggests a strong relationship between the two stocks:

library(tseries)
fit <- lm(BHP ~ VALE, Stocks)
summary(fit)
## 
## Call:
## lm(formula = BHP ~ VALE, data = Stocks)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.151818 -0.028265  0.003121  0.029803  0.147105 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.822648   0.003662   497.7   <2e-16 ***
## VALE        0.716664   0.002354   304.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04421 on 944 degrees of freedom
## Multiple R-squared:  0.9899, Adjusted R-squared:  0.9899 
## F-statistic: 9.266e+04 on 1 and 944 DF,  p-value: < 2.2e-16
plot(fit$residuals, type="l", cex.axis=1.5, cex.lab=1.5, xlab="")

Acf(fit$residuals)

adf.test(fit$residuals, k=2)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  fit$residuals
## Dickey-Fuller = -6.0306, Lag order = 2, p-value = 0.01
## alternative hypothesis: stationary

We are going to fit the VECM model and plot the implied spread, together with the 90% compatibility intervals:

fit_ECM_pairs <- readRDS("models/Chapter_8/saved_RDS/fit_ECM_pairs.rds")
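
A minimal sketch of how the spread and its intervals can be computed and plotted, assuming the pairs model stores the cointegrating coefficient as beta, as in the treasury example:

beta_draws <- extract(fit_ECM_pairs, pars = "beta")[[1]]
# implied spread BHP - beta * VALE for each posterior draw of beta
spread <- sapply(beta_draws, function(b) Stocks$BHP - b * Stocks$VALE)
qs <- apply(spread, 1, quantile, probs = c(0.05, 0.5, 0.95))  # 90% band
plot(qs[2,], type="l", ylab="Spread", xlab="", cex.axis=1.5, cex.lab=1.5)
lines(qs[1,], col="grey"); lines(qs[3,], col="grey")
abline(h = mean(qs[2,]) + c(-1,1) * sd(qs[2,]), lty=2, col="red")  # +-1 sd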

The horizontal lines indicate ±1 standard deviation of the spread, which can be used as thresholds for entering trades.
