### --- replication of Table 2 of Lamperti (2018, ECOSTA) for L>1 --- ###
### author: F. Lamperti
### date: 18-04-18
### Note: this piece of code allows to replicate the Subtracted L values for given l, with and without correction. One then needs to aggregate them according to formula (9) in the paper and the chosen weights.


#### functions ####
str_count_ovlap <- function(apat, atxt) { 
  stopifnot(length(apat) == 1, length(atxt) == 1) 
  pos <- c()  # positions of matches 
  i <- 1; n <- nchar(atxt) 
  found <- regexpr(apat, substr(atxt, i, n), perl=TRUE) 
  while (found > 0) { 
    pos <- c(pos, i + found - 1) 
    i <- i + found 
    found <- regexpr(apat, substr(atxt, i, n), perl=TRUE) 
  } 
  return(length(pos)) 
} 

#### settings ####
l   <- 2 # length of words  ---> you need to change value for every word length you want to test 
bx  <- 3 # precision 
N   <- 6 # length of the series
M   <- 1 # number of replications
b <- c(1:bx)
v <- as.matrix(expand.grid(b,b)) # here you need to put as many "b" as the length of words you are considering


# series
m1_out <- list()
m2_out <- list()

x <- c(1,1,3,2,2,1)             # x
m1_out[[1]] <- c(1,1,3,2,2,2)   # x_a
m2_out[[1]] <- c(1,1,2,3,2,1)   # x_b


#### SL values for  L > 1 ####
card <- bx^l  # cardinality of the alphabet --> use it as the base for the logarithm in order to normalize maxent to 1
Nl <- N-l+1   # number of observations

factorx <- factor(cut(x, breaks=bx, labels=F, include.lowest=T, right= T))
xstr <- paste(as.character(factorx),collapse="") #the symbolized time series as a string
ftab <- data.frame(v)
for (i in 1:nrow(v)){
  ftab$symbol[i] <- paste(v[i,],collapse="") # creation of the alphabet
  ftab$freqx[i] <- str_count_ovlap(ftab$symbol[i],xstr)/Nl # frequencies of DGP on the alphabet
}
ftab$infox <- apply(as.array(ftab$freqx),1,FUN=function(x){x*log(x, card)}) 
ftab$infox[is.na(ftab$infox)] <- 0
h_x <- (-1)*sum(ftab$infox, na.rm=T) # entropy x
Bx_b2 <- sum(ftab$freqx>0)

## vectors for entropies and states
h_a <- rep(NA, M)   # entropy of model A, block 2
h_ta <- rep(NA, M)  # vector for trasformed entropy: (p+q)/2
freqa <- list()
freqta <- list()

h_b <- rep(NA, M)   # entropy of model B, block 2
h_tb <- rep(NA, M)  # vector for trasformed entropy: (p+q)/2
freqb <- list()
freqtb <- list()

Bb <- rep(NA,M)
Btb <- rep(NA,M)
Bta <- rep(NA,M)
Ba <- rep(NA,M)
Ba_mat <- matrix(,nrow=card,ncol=M)
Bb_mat <- matrix(,nrow=card,ncol=M)
Bta_mat <- matrix(,nrow=card,ncol=M)
Btb_mat <- matrix(,nrow=card,ncol=M)


for (i in 1:M){
  #model 1 
  a_series <- m1_out[[i]]
  za <- factor(cut(a_series, breaks=bx, labels=F, include.lowest=T, right= T))
  astr <- paste(as.character(za),collapse="")
  #model 2 
  b_series <- m2_out[[i]]
  zb <- factor(cut(b_series, breaks=bx, labels=F, include.lowest=T, right= T))
  bstr <- paste(as.character(zb),collapse="")
  
  # computation of frequencies for models output
  
  for (j in 1:nrow(v)){
    ftab$freqa[j] <- str_count_ovlap(ftab$symbol[j], astr)/Nl 
    ftab$freqb[j] <- str_count_ovlap(ftab$symbol[j], bstr)/Nl
    ftab$freqta[j] <- (ftab$freqa[j]+ftab$freqx[j])/2
    ftab$freqtb[j] <- (ftab$freqb[j]+ftab$freqx[j])/2
  }
  
  ftab$infoa <- apply(as.array(ftab$freqa),1,FUN=function(x){x*log(x, card)}) 
  ftab$infoa[is.na(ftab$infoa)] <- 0
  h_a[i] <- (-1)*sum(ftab$infoa, na.rm=T) 
  freqa[[i]] <- ftab$freqa
  Ba[i] <- sum(ftab$freqa>0)
  Ba_mat[,i] <- ftab$freqa
  
  ftab$infota <- apply(as.array(ftab$freqta),1,FUN=function(x){x*log(x, card)}) 
  ftab$infota[is.na(ftab$infota)] <- 0
  h_ta[i] <- (-1)*sum(ftab$infota, na.rm=T) 
  freqta[[i]] <- ftab$freqta
  Bta[i] <- sum(ftab$freqta>0)
  Bta_mat[,i] <- ftab$freqta
  
  # entropy serie modello 2 (b) e mean (b+x)/2
  ftab$infob <- apply(as.array(ftab$freqb),1,FUN=function(x){x*log(x, card)}) 
  ftab$infob[is.na(ftab$infob)] <- 0
  h_b[i] <- (-1)*sum(ftab$infob, na.rm=T) 
  freqb[[i]] <- ftab$freqb
  Bb[i] <- sum(ftab$freqb>0)
  Bb_mat[,i] <- ftab$freqb
  
  ftab$infotb <- apply(as.array(ftab$freqtb),1,FUN=function(x){x*log(x, card)}) 
  ftab$infotb[is.na(ftab$infotb)] <- 0
  h_tb[i] <- (-1)*sum(ftab$infotb, na.rm=T) 
  freqtb[[i]] <- ftab$freqtb
  Btb[i] <- sum(ftab$freqtb>0)
  Btb_mat[,i] <- ftab$freqtb
}

### computation of the L-subtracted divergence 
SLdiv_a_NOCOR <- 2*mean(h_ta) - mean(h_a)  #model A
SLdiv_b_NOCOR <- 2*mean(h_tb) - mean(h_b)  #model B

### computation of errors and correction terms
Ba_ok <- sum(.rowMeans(Ba_mat,card,M)>0)
Bb_ok <- sum(.rowMeans(Bb_mat,card,M)>0)  
Bta_ok <- sum(.rowMeans(Bta_mat,card,M)>0)  
Btb_ok <- sum(.rowMeans(Btb_mat,card,M)>0)

COR_a<- 2*(Bta_ok-1)/(2*2*N) - (Ba_ok-1)/(2*N) #correction term model A
COR_b<- 2*(Btb_ok-1)/(2*2*N) - (Bb_ok-1)/(2*N) #correction term model B

SLdiv_a <- SLdiv_a_NOCOR + COR_a
SLdiv_b <- SLdiv_b_NOCOR + COR_b