Analysis of Financial Time Series in Stan - Chapter 4 - Nonlinear Models


In this series we are trying to reproduce the models and examples listed in the book “Analysis of Financial Time Series, 3rd Edition”, by Ruey S. Tsay, using Stan https://mc-stan.org/ and the package RStan https://cran.r-project.org/web/packages/rstan/index.html. The main repository for the presented models and data can be found at https://github.com/marcomarconi/AFTS_with_Stan.

{
  library(tidyverse)
  library(rstan)
  library(bayesplot)
  library(loo)
  library(LaplacesDemon)
  library(xts)
  library(quantmod)
  library(FKF)
  library(MARSS)
  library(forecast)
  rstan_options(auto_write = TRUE)
  rstan_options(javascript = FALSE)
  theme_set(theme_classic(base_size = 24))
}

The Threshold Autoregressive (TAR) model

In a TAR model, separate AR models are estimated over two or more regimes defined by lagged values of the dependent variable. The switch between the AR models is governed by a threshold value (or some other condition): in general, a different AR model applies depending on whether the variable of interest exceeds the predefined threshold. These AR models may or may not be of the same order.
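To make the idea concrete, here is a minimal sketch (ours, not from the book) that simulates a two-regime TAR(1) process with threshold zero; the AR coefficient switches according to the sign of the previous observation:

set.seed(42)
N <- 500
y_sim <- numeric(N)
for (t in 2:N) {
  # the AR coefficient depends on whether the previous value is above or below the threshold (0 here)
  phi <- if (y_sim[t-1] >= 0) 0.5 else -0.8
  y_sim[t] <- phi * y_sim[t-1] + rnorm(1)
}
plot.ts(y_sim, ylab="Simulated TAR(1)", xlab="Time")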

Example 4.3

We are going to use the daily log returns, in percentage and including dividends, of IBM stock from July 3, 1962, to December 31, 2003, for 10,446 observations.

d_ibmvwewsp6203 <- read.table("data/d-ibmvwewsp6203.txt", header=F)
y <- ts(d_ibmvwewsp6203$V2 * 100)
plot.ts(y, ylab="Log return", xlab="Time")

We are going to fit three models to the data: AR(2)–GARCH(1,1), TGARCH, and AR(2)-TAR-GARCH(1,1).

The AR(2)-TAR-GARCH(1,1) can be coded in Stan as follows:

data {
  int<lower=0> N;
  vector[N] y;
  int K;
  real threshold;
  real sigma1;
  real mu1;
}
parameters {
  real<lower=0> alpha0;
  real<lower=0,upper=1> alpha1;
  real<lower=0,upper=(1-alpha1)> beta1;
  real<lower=-1,upper=1> phi_shock;                 
  real<lower=-1,upper=1> phi_sigma;                  
  real ar0;
  vector<lower=0,upper=1>[K] ar;
  
}
transformed parameters {
  real<lower=0> sigma[N];
  real mu[N];
  for (n in 1:K) {
    sigma[n] = sigma1;
    mu[n] = mu1;
  }
  for (t in (K+1):N) {
    mu[t] = ar0;
    for (k in 1:K) 
      mu[t] += ar[k] * y[t-k];
    sigma[t] = sqrt(alpha0
              + alpha1 * pow(y[t-1] - mu[t-1], 2)
              + beta1 * pow(sigma[t-1], 2)
              // the TAR terms contribute only when the previous shock is below the threshold
              + (phi_shock * pow(y[t-1] - mu[t-1], 2) + phi_sigma * pow(sigma[t-1], 2)) * ((y[t-1] - mu[t-1]) < threshold)
              );
  }
}
model {
  alpha0 ~ normal(0, 1);
  alpha1 ~ normal(0, 1);
  beta1 ~ normal(0, 1);
  phi_shock ~ normal(0, 2);
  phi_sigma ~ normal(0, 2);
  ar0 ~ normal(0, 1);
  ar ~ normal(0, 1);
  y ~ normal(mu, sigma);    
  
}
generated quantities {
  vector[N] log_lik;
  {
    real log_lik_mu = 0;
    for (t in (K+1):N) {
      log_lik[t] = normal_lpdf(y[t] | mu[t], sigma[t]);  
      log_lik_mu += log_lik[t];
    }       
    log_lik_mu /= N - K; // there are N - K terms in the sum
    for (t in 1:K) 
      log_lik[t] = log_lik_mu;    
  }
}

For simplicity, the AR(2)-TAR-GARCH(1,1) will be fitted with the variational inference algorithm, since full MCMC sampling is slow on this series (we also load pre-fitted models, as refitting everything would make knitting this document very slow):

#TAR <- stan_model(file = "models/Chapter_4/AR-TAR-GARCH.stan")
#fit_TAR_ibmvwewsp6203 <- vb(TAR, data = list(N = length(y), y = y, K=2,threshold=0, sigma1=sd(y), mu1=mean(y)), iter=500)
#fit_ARGARCH_ibmvwewsp6203 <- stan(file = "models/Chapter_3/AR-GARCH.stan", data = list(N = length(y), y = y, K=2), chains = 1, iter=500)
#fit_TGARCH_ibmvwewsp6203 <- stan(file = "models/Chapter_3/TGARCH.stan", data = list(N = length(y), y = y), chains = 1, iter=500)
fit_TAR_ibmvwewsp6203 <- readRDS("models/Chapter_4/saved_RDS/fit_TAR_ibmvwewsp6203.rds")
fit_ARGARCH_ibmvwewsp6203 <- readRDS("models/Chapter_4/saved_RDS/fit_ARGARCH_ibmvwewsp6203.rds")
fit_TGARCH_ibmvwewsp6203 <- readRDS("models/Chapter_4/saved_RDS/fit_TGARCH_ibmvwewsp6203.rds")

In the book, Tsay notes that the AR(2)-TAR-GARCH(1,1) produces an unconditional mean much closer to the sample mean of 0.039 than the other two models, meaning that this nonlinear model captures the asymmetric behavior of daily IBM stock volatility while still retaining the mean of the returns. For the two AR(2) models, the unconditional mean is ar0 / (1 - ar[1] - ar[2]):

pars <- rstan::extract(fit_ARGARCH_ibmvwewsp6203)
mean_ARGARCH <- pars$ar0 / (1 - pars$ar[,1] - pars$ar[,2])
pars <- rstan::extract(fit_TGARCH_ibmvwewsp6203)
mean_TGARCH <- pars$mu
pars <- rstan::extract(fit_TAR_ibmvwewsp6203)
mean_TAR <- pars$ar0 / (1 - pars$ar[,1] - pars$ar[,2])
plot(density(mean_ARGARCH), xlim=c(0, 0.15), ylim=c(0,35), main="", xlab="", ylab="")
lines(density(mean_TGARCH), col="blue")
lines(density(mean_TAR), col="red")
abline(v=0.039, lty=2)
legend("topright", legend=c("AR-GARCH", "TGARCH", "AR-TAR-GARCH"), col=c("black", "blue", "red"), lty=rep(1,3))

Smooth Transition AR (STAR) model

The main problem with the TAR model is that the sharp discontinuity between regimes is sometimes hard to justify in practice. In response, the STAR model was proposed, which models the transition with a smooth sigmoid function (usually logistic; see formula (4.15) in the book). A possible implementation of the STAR model in Stan could look as follows. Notice the requirement for a mean and scale prior for the scale parameter of the logistic function (gamma2). This parameter is usually hard to identify, even with a large amount of data:

// PSIS-LOO-LFO fails here
data {
  int<lower=0> N;
  vector[N] y;
  int K;
  real sigma1;
  real gamma2_mean;
  real gamma2_sd;
}
parameters {
  real mu;
  real<lower=0> beta0;
  real<lower=0,upper=1> beta1;
  real<lower=0,upper=1> beta2;
  real gamma0;
  real gamma1;
  real<lower=0> gamma2;
}
transformed parameters {
  vector<lower=0>[N] sigma;
  for(i in 1:K)
    sigma[i] = sigma1; //beta0 + (gamma0/(1 + exp(0)));
  for (t in (K+1):N)
    sigma[t] = sqrt(
                    beta0 
                    + beta1*(y[t-1]-mu)^2 
                    + beta2*(y[t-2]-mu)^2 
                    + (gamma0 + gamma1*(y[t-1]-mu)^2)/(1 + exp(-gamma2*(y[t-1]-mu)))
                    );
}
model {
  mu ~ normal(0, 1);
  beta0 ~ normal(0.5, 0.5);
  beta1 ~ normal(0.5, 0.5);
  // beta2 keeps the implicit uniform(0,1) prior implied by its constraint
  gamma0 ~ normal(0, 0.01);
  gamma1 ~ normal(0, 0.1);
  gamma2 ~ normal(gamma2_mean, gamma2_sd);
  y ~ normal(mu, sigma);
}
generated quantities {
  vector[N] log_lik;
  for (t in 1:N)
    log_lik[t] = normal_lpdf(y[t] | mu, sigma[t]);
}
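Before fitting, a small sketch (not from the book) shows why gamma2 is weakly identified: beyond a certain steepness, very different values of the scale parameter produce almost the same transition curve over the range of observed shocks:

x <- seq(-0.01, 0.01, length.out = 200)
g <- c(100, 500, 1000, 5000)   # hypothetical values of the logistic scale gamma2
plot(NA, xlim=range(x), ylim=c(0,1), xlab="Previous shock", ylab="Transition weight")
for (i in seq_along(g))
  lines(x, 1/(1 + exp(-g[i]*x)), lty=i)
legend("topleft", legend=paste("gamma2 =", g), lty=seq_along(g))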

Example 4.4

Let’s fit the STAR model to the monthly simple stock returns of the Minnesota Mining and Manufacturing (3M) Company from February 1946 to December 2008, together with a regular ARCH(2) model:

ARCH <- stan_model("models/Chapter_3/ARCH.stan")
STAR <- stan_model("models/Chapter_4/STAR.stan")
m_3m4608 <- read.table("data/m-3m4608.txt", header=T)
fit_m_3m4608_ARCH <- sampling(ARCH, data = list(N = length(m_3m4608$rtn), y = m_3m4608$rtn, Kar=2, family=0), iter=1000, chains = 4, cores = 4, refresh=0)
fit_m_3m4608_STAR <- sampling(STAR, data = list(N = length(m_3m4608$rtn), y = m_3m4608$rtn, K=2, sigma1=sd(m_3m4608$rtn), gamma2_mean=1000, gamma2_sd=500), iter=1000, chains = 4, cores = 4, refresh=0)
print(fit_m_3m4608_ARCH, pars=names(fit_m_3m4608_ARCH)[1:4])
## Inference for Stan model: ARCH.
## 4 chains, each with iter=1000; warmup=500; thin=1; 
## post-warmup draws per chain=500, total post-warmup draws=2000.
## 
##          mean se_mean   sd 2.5%  25%  50%  75% 97.5% n_eff Rhat
## mu       0.01       0 0.00 0.01 0.01 0.01 0.01  0.02  2408 1.00
## alpha0   0.00       0 0.00 0.00 0.00 0.00 0.00  0.00  1021 1.00
## alpha[1] 0.10       0 0.05 0.02 0.07 0.10 0.13  0.21   941 1.01
## alpha[2] 0.12       0 0.05 0.04 0.08 0.12 0.15  0.24   983 1.00
## 
## Samples were drawn using NUTS(diag_e) at Sat May 29 15:27:34 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
print(fit_m_3m4608_STAR, pars=names(fit_m_3m4608_STAR)[1:7])
## Inference for Stan model: STAR.
## 4 chains, each with iter=1000; warmup=500; thin=1; 
## post-warmup draws per chain=500, total post-warmup draws=2000.
## 
##           mean se_mean     sd   2.5%    25%     50%     75%   97.5% n_eff Rhat
## mu        0.01    0.00   0.00   0.01   0.01    0.01    0.02    0.02  1918    1
## beta0     0.00    0.00   0.00   0.00   0.00    0.00    0.00    0.00   902    1
## beta1     0.18    0.00   0.06   0.07   0.14    0.18    0.22    0.30   677    1
## beta2     0.11    0.00   0.05   0.03   0.08    0.11    0.14    0.22   924    1
## gamma0    0.00    0.00   0.00   0.00   0.00    0.00    0.00    0.00  1253    1
## gamma1   -0.16    0.00   0.07  -0.28  -0.20   -0.16   -0.12   -0.03   547    1
## gamma2 1031.60   14.66 439.89 243.41 710.97 1026.14 1343.04 1924.12   901    1
## 
## Samples were drawn using NUTS(diag_e) at Sat May 29 15:28:25 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

Notice how the gamma2 posterior is practically unaffected by the data. The logistic parameters appear significantly different from zero, suggesting a substantial asymmetry in the volatility response to positive and negative shocks. Tsay suggests that the ARCH effects in the STAR model are weak for large positive shocks and stronger for large negative ones.
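We can make the first point explicit by overlaying the prior and the posterior for gamma2 (a quick sketch; the prior is the normal(1000, 500) we passed as data, ignoring the truncation at zero):

pars <- extract(fit_m_3m4608_STAR)
xg <- seq(0, 3000, by = 10)
plot(density(pars$gamma2), main="", xlab="gamma2")
lines(xg, dnorm(xg, 1000, 500), lty=2)   # the prior supplied via gamma2_mean / gamma2_sd
legend("topright", legend=c("posterior", "prior"), lty=1:2)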

We can appreciate the consequence of the uncertainty in parameter estimation by plotting the implied curve for the logistic function:

pars <- extract(fit_m_3m4608_STAR)
x <- seq(-0.01, 0.01, 0.0001)
num <- sapply(seq_along(pars$gamma0), function(i) pars$gamma0[i] + pars$gamma1[i]*x^2)
den <- sapply(seq_along(pars$gamma2), function(i) pars$gamma2[i]*x)
res <- num / (1 + exp(-den))
q <- apply(res, 1, function(i) quantile(i, probs = c(0.05, 0.5, 0.95)))
plot(NA, xlim=c(-0.01, 0.01), ylim=c(0, 0.2e-2), ylab="STAR effect", xlab="Previous Shock")
polygon(c(x, rev(x)), c(q[1,], rev(q[3,])), lty=2, col="gray")
lines(x, q[2,])

A comparison between ARCH and STAR seems to favor the latter:

loo_compare(loo(fit_m_3m4608_ARCH), loo(fit_m_3m4608_STAR))
##        elpd_diff se_diff
## model2  0.0       0.0   
## model1 -3.7       2.4

Markov Switching Autoregressive models

We can also model the transition between different regimes by assuming the existence of two underlying discrete hidden states, with the observed data arising from probabilistic transitions between them. The idea is that of a hidden Markov model and was proposed by Hamilton (1989). This kind of model has already been implemented by Jim Savage http://modernstatisticalworkflow.blogspot.com/2018/02/regime-switching-models-in-stan.html so we are going to use a modified version of his code. To build some intuition first, here is a minimal sketch (ours, not from the book or the blog post) that simulates a two-state Markov-switching AR(1) process:
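set.seed(1)
N <- 300
p_stay <- c(0.9, 0.7)   # hypothetical probabilities of staying in state 1 / state 2
c_k <- c(1, -0.5); phi_k <- c(0.3, 0.4); sigma_k <- c(0.5, 1)
s <- integer(N); y_sim <- numeric(N); s[1] <- 1
for (t in 2:N) {
  # draw the next state from the two-state Markov chain, then generate the observation
  s[t] <- if (runif(1) < p_stay[s[t-1]]) s[t-1] else 3 - s[t-1]
  y_sim[t] <- c_k[s[t]] + phi_k[s[t]] * y_sim[t-1] + rnorm(1, 0, sigma_k[s[t]])
}
plot.ts(y_sim, ylab="Simulated switching AR(1)", xlab="Time")

The Stan model below marginalizes the hidden state analytically, since Stan cannot sample discrete parameters: the xi recursion in the transformed parameters block is the Hamilton filter, and f collects the per-observation likelihood contributions: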

data {
  int N;
  vector[N] y;
  int Kar ;
}
parameters {
  vector<lower = -1, upper = 1>[Kar] phi[2];
  real<lower=0> c1;
  real<upper=0> c2;
  vector<lower = 0>[2] sigma;
  vector<lower = 0, upper = 1>[2] p;
  real<lower = 0, upper = 1> xi1_init; 
}
transformed parameters {
  vector[2] c;
  matrix[N, 2] eta;
  matrix[N, 2] xi;
  vector[N] f;
  c[1] = c1;
  c[2] = c2;
  // fill in etas: the likelihood of y[t] under each regime,
  // with a scalar AR mean obtained via dot_product over the Kar lags
  for(t in 1:N) {
    if(t > Kar) {
      for(k in 1:2)
        eta[t,k] = exp(normal_lpdf(y[t] | c[k] + dot_product(phi[k], y[(t-Kar):(t-1)]), sigma[k]));
    } else {
      for(k in 1:2)
        eta[t,k] = exp(normal_lpdf(y[t] | c[k], sigma[k]));
    }
  }
  
  // work out likelihood contributions
  
  for(t in 1:N) {
    // for the first observation
    if(t==1) {
      f[t] = p[1]*xi1_init*eta[t,1] + // stay in state 1
             (1 - p[1])*xi1_init*eta[t,2] + // transition from 1 to 2
             p[2]*(1 - xi1_init)*eta[t,2] + // stay in state 2 
             (1 - p[2])*(1 - xi1_init)*eta[t,1]; // transition from 2 to 1
      
      xi[t,1] = (p[1]*xi1_init*eta[t,1] +(1 - p[2])*(1 - xi1_init)*eta[t,1])/f[t];
      xi[t,2] = 1.0 - xi[t,1];
    
    } else {
    // and for the rest
      
      f[t] = p[1]*xi[t-1,1]*eta[t,1] + // stay in state 1
             (1 - p[1])*xi[t-1,1]*eta[t,2] + // transition from 1 to 2
             p[2]*xi[t-1,2]*eta[t,2] + // stay in state 2 
             (1 - p[2])*xi[t-1,2]*eta[t,1]; // transition from 2 to 1
      
      // work out xi
      
      xi[t,1] = (p[1]*xi[t-1,1]*eta[t,1] +(1 - p[2])*xi[t-1,2]*eta[t,1])/f[t];
      
      // there are only two states so the probability of the other state is 1 - prob of the first
      xi[t,2] = 1.0 - xi[t,1];
    }
  }
  
}
model {
  // priors
  p ~ beta(2, 2);
  [c1, c2] ~ cauchy(0, 1);
  for(k in 1:2)
    phi[k] ~ normal(0,1);
  sigma ~ cauchy(0, 1);
  xi1_init ~ beta(2, 2);

  target += sum(log(f));
}

Fitting this model to the data results in parameter estimates vaguely similar to Tsay's, even if the autoregressive coefficients are quite different. Notice that the probability of staying in the positive-growth state is higher than the probability of staying in the negative-growth state, indicating that it is more likely for the U.S. GNP to exit a contraction period than to fall into one:

q_gnp4791 <- scan("data/q-gnp4791.txt" )*100
MSA <- stan_model("models/Chapter_4/MSA.stan")
fit_q_gnp4791 <- sampling(MSA, data = list(N = length(q_gnp4791), y = q_gnp4791, Kar = 4), chains = 4, cores = 4, iter=1000, init=0, refresh=0)
plot(fit_q_gnp4791, pars=names(fit_q_gnp4791)[1:14])

We can look at the mean growth rates of the two states. Notice that the state corresponding to negative growth displays larger uncertainty, due to fewer observations (the unconditional mean of each state's AR(4) is c[k] divided by one minus the sum of its AR coefficients):

pars <- extract(fit_q_gnp4791)
m1 <- pars$c[,1] / (1 - rowSums(pars$phi[,1,]))
m2 <- pars$c[,2] / (1 - rowSums(pars$phi[,2,]))
plot(density(m1), xlim=c(-7,7), col="blue")
lines(density(m2), col="red")

The expected durations of the two states can be compared with Tsay's estimates (11.31 quarters for the positive-growth state and 3.69 quarters for the negative-growth state). Since the time spent in a state is geometric with staying probability p, the expected duration is 1/(1 - p):

w1 <- 1/(1 - pars$p[,1])   # expected duration of the positive-growth state
w2 <- 1/(1 - pars$p[,2])   # expected duration of the negative-growth state
plot(density(w1), xlim=range(c(w1, w2)), col="blue")
lines(density(w2), col="red")

Finally, we can also plot the filtered probability of each state over the whole sample period:

xi <- pars$xi %>% colMeans()
matplot(xi, type="l", col=c("blue", "red"))

colMeans(xi)
## [1] 0.5929067 0.4070933

Comments