Dynamic Linear Models applied to Pairs Trading

SSM_beta.utf8

In this post we want to replicate the example shown at page 76 of Machine Tranding, by Ernest Chan. Chan shows how to apply a state spàce model to two (possibly) conintegrating time series (the ETFs EWC and EWA, in the example). The kind of SSM used by Chan is also known as dynamic linear regression in “Dynamic Linear Models with R” by Petris et al. See also https://nwfsc-timeseries.github.io/atsa-labs/sec-dlm-overview.html for an approach with the MARSS package.
The rationale of this type of stragety arises by observing the relationship between the two time series:

require(tidyverse)
require(rstan)
library(xts)
library(quantmod)
library(FKF)
library(MARSS)
rstan_options(auto_write = TRUE)
rstan_options(javascript = FALSE)

getSymbols("EWC", from="2006-04-26", to="2012-04-09")

## [1] "EWC"

getSymbols("EWA", from="2006-04-26", to="2012-04-09")

## [1] "EWA"

EWC_price <- EWC$EWC.Adjusted %>% as.vector
EWA_price <- EWA$EWA.Adjusted %>% as.vector
plot(EWC_price, EWA_price)

It is clear that the two prices display a strong linear relationship. We can therefore a regression line and have a look at the residuals:

fit_lm <- lm(EWC_price ~ EWA_price)
fit_lm

## 
## Call:
## lm(formula = EWC_price ~ EWA_price)
## 
## Coefficients:
## (Intercept)    EWA_price  
##       3.418        1.319

plot(fit_lm$residuals)

The residuals display the same behaviour of a common stationary time series. We could simply apply one of the Box-Jenkins models to exploit this behaviour, but in this example we are instead try to fit a dynamic linear model (DLM) to the data.
The model used by Chan is the following:

\[[EWC] = [EWA,1] * [hegderatio\quad offset]' + noise \]

which can be represented as DLM:

\[ y_t = [x_t,1] * [beta_0 \quad beta_1]_t' + \epsilon_t \\ [beta_0 \quad beta_1]_t' = [beta_0 \quad beta_1]_{t-1}' + \eta_t \] Note that beta is simply a time-varying vector containing the hedge ratio and the constant offset between the two time series.
The model can be coded in Stan as follow:


data {
  int<lower=1> N;
  vector[N] y;
  vector[N] x;
}

parameters {
  real<lower=0> sigma_y;
  cov_matrix[2] Sigma;
  vector[2] beta_init;
  vector[2] err_beta[N];
}
transformed parameters {
  matrix[2,N] beta;
  vector[N] mu;
  
  beta[,1] = beta_init + err_beta[1];
  mu[1] = beta[,1]' * [1, x[1]]';
  for(t in 2:N) {
    beta[,t] = beta[,t-1] + err_beta[t];
    mu[t] =  beta[,t]' * [1, x[t]]';
  }
}

model {
  beta_init ~ cauchy(0, 5);
  sigma_y ~ student_t(4, 0, 1);
  Sigma ~ inv_wishart(2, diag_matrix(rep_vector(1,2)));

  err_beta ~ multi_normal([0,0]', Sigma);
  y ~ normal(mu, sigma_y);
}

Let’s fit it to the data to the trainset (the first 1250 observations):

SSM_beta <- stan_model("models/EpChan/SSM_beta.stan")
trainset <- 1:1250
testset <- 1251:1500
#SSM_beta_chan <- sampling(SSM_beta, data= list(N = length(trainset), y = EWC_price[trainset],  x = EWA_price[trainset]), iter = 500, chains=1)
SSM_beta_chan <- readRDS("models/EpChan/SSM_beta_chan.rds")

Stan will have a quite hard time to fit this model, as these kind of model have strong identifibility issues. Still, estimations are close to Chan’s:

print(SSM_beta_chan, pars=c("sigma_y", "Sigma"),  digits=5)

## Inference for Stan model: SSM_beta.
## 1 chains, each with iter=1000; warmup=500; thin=1; 
## post-warmup draws per chain=500, total post-warmup draws=500.
## 
##                mean se_mean      sd     2.5%      25%      50%      75%
## sigma_y     0.07408 0.00962 0.01582  0.05255  0.06197  0.06651  0.08668
## Sigma[1,1]  1.32915 0.04084 0.07457  1.21223  1.26336  1.32791  1.38095
## Sigma[1,2] -0.10461 0.00334 0.00609 -0.11752 -0.10879 -0.10470 -0.09961
## Sigma[2,1] -0.10461 0.00334 0.00609 -0.11752 -0.10879 -0.10470 -0.09961
## Sigma[2,2]  0.00984 0.00029 0.00055  0.00893  0.00943  0.00984  0.01025
##               97.5% n_eff    Rhat
## sigma_y     0.10226     3 2.38213
## Sigma[1,1]  1.49018     3 2.06112
## Sigma[1,2] -0.09482     3 2.11920
## Sigma[2,1] -0.09482     3 2.11920
## Sigma[2,2]  0.01094     4 2.01948
## 
## Samples were drawn using NUTS(diag_e) at Sat May  1 14:10:04 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

We can now apply the Kalman Filter to the whole dataset to get filtered estimates for the hedge ratio and offset. We can use the FKF package to do the job (or you can write the Kalman filter yourself):

# Retrive the estimates for the noise deviations
sigma_y <- extract(SSM_beta_chan, pars="sigma_y")[[1]] %>% mean
Sigma <- extract(SSM_beta_chan, pars="Sigma")[[1]] %>% apply(., c(2, 3), mean)
# the initial values for the filter
a0=c(1,1);P0=matrix(c(0,0,0,0), byrow = T, ncol=2)
# Now fill all the matrices according to the package documentation https://cran.r-project.org/web/packages/FKF/FKF.pdf
N <- length(EWC_price)
y <- EWC_price
Zt <- array(data = t(cbind(1, EWA_price)), dim = c(1, 2, N))
Tt <- array(data = c(1,0,0,1), dim = c(2, 2, N))
ct <- array(0, dim = c(1, N))
dt <- array(0, dim = c(2, N))
HHt = array(data = Sigma, dim = c(2, 2, N))
GGt = array(data = matrix(sigma_y^2), dim = c(1, 1, N))
fkf.obj <- fkf(a0=a0, P0=P0, ct = ct, dt = dt, Tt = Tt, Zt = Zt, HHt = HHt, GGt = GGt, yt = rbind(y))
par(mfrow=c(1,2))
plot(index(EWC), fkf.obj$att[1,], col="blue", type="l", ylab="Offset", xlab="Date")
plot(index(EWC), fkf.obj$att[2,], col="blue", type="l", ylab="Hedge Ratio", xlab="Date")

The fitted lines are very similar to the ones obtained by Chan.

We can improve fitting by re-implementing the previous naive version by including the Kalman Filter inside the Stan model:

data {
  int<lower=0> N;
  vector[N] y;
  vector[N] x;
  vector[2] m0;
  matrix[2,2] P0;
}
transformed data {
  matrix[2,2] F;
  row_vector[2] H[N]; 
  matrix[2,2] I;
  F = [[1,0],
        [0,1]];
  for(t in 1:N)      
    H[t] = [1, x[t]];      
}
parameters {
  cov_matrix[2] Q;
  real<lower=0> R;
}
transformed parameters {
  vector[2] m[N];
  real S[N];
  real mu[N];
  vector[2] m_pred[N]; 
  matrix[2,2] P[N]; 
  matrix[2,2] P_pred[N];
  real v[N];
  vector[2] K[N]; 
  {
    m_pred[1] = m0;
    P_pred[1] = P0;
    for (t in 1:N) {
      if (t>1) {
        m_pred[t] = F * m[t-1];
        P_pred[t] = F * P[t-1] * F' + Q;
      }
      v[t] = y[t] - H[t] * m_pred[t];
      S[t] = (H[t] * P_pred[t] * H[t]' + R);
      K[t] = (P_pred[t] * H[t]') / S[t];
      m[t] = m_pred[t] + K[t] * v[t];   
      P[t] = P_pred[t] - P_pred[t] * H[t]' * K[t]';
      mu[t] = H[t] * m_pred[t];
    }
  }
}
model {
  Q ~ inv_wishart(2, diag_matrix(rep_vector(1,2)));
  R ~ student_t(4, 0, 1);
  y ~ normal(mu, sqrt(S));
}
generated quantities {

}

Sampling from this model is faster and usually does not results in any divergence or maximum depth warnings:

SSM_beta_KF <- stan_model("models/EpChan/SSM_beta_KF.stan")
trainset <- 1:1250
testset <- 1251:1500
#SSM_beta_KF_chan <- sampling(SSM_beta_KF, data= list(N = length(trainset), y = EWC_price[trainset],  x = EWA_price[trainset], m0=c(1,1), P0=matrix(0, nrow=2, ncol=2)), iter = 500, chains=1)
SSM_beta_KF_chan <- readRDS("models/EpChan/SSM_beta_KF_chan.rds")
print(SSM_beta_KF_chan, pars=c("R", "Q"),  digits=5)

## Inference for Stan model: SSM_beta_KF.
## 1 chains, each with iter=500; warmup=250; thin=1; 
## post-warmup draws per chain=250, total post-warmup draws=250.
## 
##            mean se_mean      sd     2.5%      25%      50%      75%    97.5%
## R       0.10822 0.00042 0.00609  0.09734  0.10398  0.10784  0.11278  0.11964
## Q[1,1]  0.08685 0.00168 0.02333  0.05219  0.06992  0.08249  0.09865  0.14741
## Q[1,2] -0.00630 0.00012 0.00174 -0.01054 -0.00737 -0.00610 -0.00503 -0.00365
## Q[2,1] -0.00630 0.00012 0.00174 -0.01054 -0.00737 -0.00610 -0.00503 -0.00365
## Q[2,2]  0.00218 0.00001 0.00016  0.00193  0.00208  0.00216  0.00228  0.00252
##        n_eff    Rhat
## R        214 0.99814
## Q[1,1]   193 0.99600
## Q[1,2]   197 0.99601
## Q[2,1]   197 0.99601
## Q[2,2]   217 0.99599
## 
## Samples were drawn using NUTS(diag_e) at Mon May  3 02:29:27 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).

Nonetheless, as you can see I was unable to get estimates similar to Chan’s (any suggestion how to improve this?).
Therefore, we will instead proceed by fitting the data with the great MARSS package https://cran.r-project.org/web/packages/MARSS/vignettes/UserGuide.pdf:

trainset <- 1:1250
TT <- length(trainset)
m <- 2
y <- EWC_price[trainset] 
x <- EWA_price[trainset] 
B <- diag(m)  
U <- matrix(0, nrow = m, ncol = 1) 
Q <- matrix("c", m, m)  
diag(Q) <- c("q.alpha", "q.beta")  
Z <- array(NA, c(1, m, TT)) 
Z[1, 1, ] <- rep(1, TT)  
Z[1, 2, ] <- x 
A <- matrix(0)  
R <- matrix("r") 
inits_list <- list(x0 = matrix(c(0, 0), nrow = m))
mod_list <- list(B = B, U = U, Q = Q, Z = Z, A = A, R = R)
dlm_chan <- MARSS(y, inits = inits_list, model = mod_list)

## Success! abstol and log-log tests passed at 197 iterations.
## Alert: conv.test.slope.tol is 0.5.
## Test with smaller values (<0.1) to ensure convergence.
## 
## MARSS fit is
## Estimation method: kem 
## Convergence test: conv.test.slope.tol = 0.5, abstol = 0.001
## Estimation converged in 197 iterations. 
## Log-likelihood: 170.1968 
## AIC: -328.3937   AICc: -328.3261   
##  
##            Estimate
## R.r        0.003932
## Q.q.alpha  0.174022
## Q.c       -0.010655
## Q.q.beta   0.000791
## x0.X1      6.250310
## x0.X2      1.104986
## Initial states (x0) defined at t=0
## 
## Standard errors have not been calculated. 
## Use MARSSparamCIs to compute CIs and bias estimates.

As it can be seen, now the estimates are much closer to Chan’s.
Let’s filter the time series again with the new estimates:

# Retrive the estimates for the noise deviations
sigma_y <- sqrt(dlm_chan$par$R)
Sigma <- matrix(c(dlm_chan$par$Q[1], dlm_chan$par$Q[2], dlm_chan$par$Q[2], dlm_chan$par$Q[3]), byrow=TRUE, nrow=2, ncol=2)
# the initial values for the filter
a0=c(1,1);P0=matrix(c(0,0,0,0), byrow = T, ncol=2)
# Now fill all the matrices according to the package documentation https://cran.r-project.org/web/packages/FKF/FKF.pdf
N <- length(EWC_price)
y <- EWC_price
Zt <- array(data = t(cbind(1, EWA_price)), dim = c(1, 2, N))
Tt <- array(data = c(1,0,0,1), dim = c(2, 2, N))
ct <- array(0, dim = c(1, N))
dt <- array(0, dim = c(2, N))
HHt = array(data = Sigma, dim = c(2, 2, N))
GGt = array(data = matrix(sigma_y^2), dim = c(1, 1, N))
fkf.obj <- fkf(a0=a0, P0=P0, ct = ct, dt = dt, Tt = Tt, Zt = Zt, HHt = HHt, GGt = GGt, yt = rbind(y))
par(mfrow=c(1,2))
plot(index(EWC), fkf.obj$att[1,], col="blue", type="l", ylab="Offset", xlab="Date")
plot(index(EWC), fkf.obj$att[2,], col="blue", type="l", ylab="Hedge Ratio", xlab="Date")

Again, very similar to the ones showed in the book.
The trading stategy applied by Chan is a typical statistical arbitrage mean-reversion: “buy EWC(y) if we find that the observed value of y is smaller than the forecasted value by more than the forecasted standard deviation of the observations, while simultaneously shorting EWA”.
We can notice by plotting to forecast errors against the forecasted standard deviation:

plot(index(EWC)[-(1:3)], fkf.obj$vt[1,-(1:3)], ylab="Forecast error", xlab="Date" )
lines(index(EWC)[-(1:3)], sqrt(fkf.obj$Ft[1,1,-(1:3)]), col="blue")
lines(index(EWC)[-(1:3)], -sqrt(fkf.obj$Ft[1,1,-(1:3)]), col="blue")

The determination of the actual positions of EWC and EWA are the same as in Chan (2013), and in the current code I am following the MATLAB codes can be downloaded from https://www.epchan.com/book3/ as SSM_beta_EWA_EWC.m.

v <- fkf.obj$vt
S <- fkf.obj$Ft[1,1,]
y2 <- cbind(EWA_price, EWC_price)
shortsExit <- v < sqrt(S);
shortsEntry <- v > sqrt(S);
longsExit <- v > -sqrt(S);
longsEntry <- v < -sqrt(S);
numUnitsLong=vector(length = nrow(EWC));
numUnitsShort=vector(length = nrow(EWC));
numUnitsLong[1]=0;
numUnitsLong[longsEntry]=1;
numUnitsLong[longsExit]=0;
numUnitsShort[1]=0;
numUnitsShort[shortsEntry]=-1;
numUnitsShort[shortsExit]=0;
numUnits=numUnitsLong+numUnitsShort;
positions <- numUnits * cbind(-fkf.obj$att[1,], 1) * y2
pnl <- rowSums(lag(positions, 1) * (y2-lag(y2, 1)) / lag(y2, 1), na.rm = TRUE)
ret <- pnl / rowSums(abs(lag(positions, 1)), na.rm = TRUE);
ret[is.infinite(ret)] <- 0
ret[is.nan(ret)] <- 0
par(mfrow=c(1,2))
plot(index(EWC)[trainset], (cumprod(1+ret[trainset])-1),ylab="Cumulative Returns", xlab="Date", type="l")
plot(index(EWC)[testset], (cumprod(1+ret[testset])-1), ylab="Cumulative Returns", xlab="Date", type="l")

As it can be observed, the strategy does not perform well out-of-sample. According to Chan, that this could be attributes to overfitting the noise covariance matrix.

Search This Blog

No Time for Bayes

Dynamic Linear Models applied to Pairs Trading

Comments

Post a Comment

Popular posts from this blog

Dynamic Linear Models with Switching in Stan

Analysis of Financial Time Series in Stan - Chapter 10 - Multivariate Volatility Models and Their Applications

Analysis of Financial Time Series in Stan - Chapter 9 - Principal Component Analysis and Factor Models