### --- replication of Table 2 of Lamperti (2018, ECOSTA) for L>1 --- ### ### author: F. Lamperti ### date: 18-04-18 ### Note: this piece of code allows to replicate the Subtracted L values for given l, with and without correction. One then needs to aggregate them according to formula (9) in the paper and the chosen weights. #### functions #### str_count_ovlap <- function(apat, atxt) { stopifnot(length(apat) == 1, length(atxt) == 1) pos <- c() # positions of matches i <- 1; n <- nchar(atxt) found <- regexpr(apat, substr(atxt, i, n), perl=TRUE) while (found > 0) { pos <- c(pos, i + found - 1) i <- i + found found <- regexpr(apat, substr(atxt, i, n), perl=TRUE) } return(length(pos)) } #### settings #### l <- 2 # length of words ---> you need to change value for every word length you want to test bx <- 3 # precision N <- 6 # length of the series M <- 1 # number of replications b <- c(1:bx) v <- as.matrix(expand.grid(b,b)) # here you need to put as many "b" as the length of words you are considering # series m1_out <- list() m2_out <- list() x <- c(1,1,3,2,2,1) # x m1_out[[1]] <- c(1,1,3,2,2,2) # x_a m2_out[[1]] <- c(1,1,2,3,2,1) # x_b #### SL values for L > 1 #### card <- bx^l # cardinality of the alphabet --> use it as the base for the logarithm in order to normalize maxent to 1 Nl <- N-l+1 # number of observations factorx <- factor(cut(x, breaks=bx, labels=F, include.lowest=T, right= T)) xstr <- paste(as.character(factorx),collapse="") #the symbolized time series as a string ftab <- data.frame(v) for (i in 1:nrow(v)){ ftab$symbol[i] <- paste(v[i,],collapse="") # creation of the alphabet ftab$freqx[i] <- str_count_ovlap(ftab$symbol[i],xstr)/Nl # frequencies of DGP on the alphabet } ftab$infox <- apply(as.array(ftab$freqx),1,FUN=function(x){x*log(x, card)}) ftab$infox[is.na(ftab$infox)] <- 0 h_x <- (-1)*sum(ftab$infox, na.rm=T) # entropy x Bx_b2 <- sum(ftab$freqx>0) ## vectors for entropies and states h_a <- rep(NA, M) # entropy of model A, block 2 h_ta <- rep(NA, M) # vector for trasformed entropy: (p+q)/2 freqa <- list() freqta <- list() h_b <- rep(NA, M) # entropy of model B, block 2 h_tb <- rep(NA, M) # vector for trasformed entropy: (p+q)/2 freqb <- list() freqtb <- list() Bb <- rep(NA,M) Btb <- rep(NA,M) Bta <- rep(NA,M) Ba <- rep(NA,M) Ba_mat <- matrix(,nrow=card,ncol=M) Bb_mat <- matrix(,nrow=card,ncol=M) Bta_mat <- matrix(,nrow=card,ncol=M) Btb_mat <- matrix(,nrow=card,ncol=M) for (i in 1:M){ #model 1 a_series <- m1_out[[i]] za <- factor(cut(a_series, breaks=bx, labels=F, include.lowest=T, right= T)) astr <- paste(as.character(za),collapse="") #model 2 b_series <- m2_out[[i]] zb <- factor(cut(b_series, breaks=bx, labels=F, include.lowest=T, right= T)) bstr <- paste(as.character(zb),collapse="") # computation of frequencies for models output for (j in 1:nrow(v)){ ftab$freqa[j] <- str_count_ovlap(ftab$symbol[j], astr)/Nl ftab$freqb[j] <- str_count_ovlap(ftab$symbol[j], bstr)/Nl ftab$freqta[j] <- (ftab$freqa[j]+ftab$freqx[j])/2 ftab$freqtb[j] <- (ftab$freqb[j]+ftab$freqx[j])/2 } ftab$infoa <- apply(as.array(ftab$freqa),1,FUN=function(x){x*log(x, card)}) ftab$infoa[is.na(ftab$infoa)] <- 0 h_a[i] <- (-1)*sum(ftab$infoa, na.rm=T) freqa[[i]] <- ftab$freqa Ba[i] <- sum(ftab$freqa>0) Ba_mat[,i] <- ftab$freqa ftab$infota <- apply(as.array(ftab$freqta),1,FUN=function(x){x*log(x, card)}) ftab$infota[is.na(ftab$infota)] <- 0 h_ta[i] <- (-1)*sum(ftab$infota, na.rm=T) freqta[[i]] <- ftab$freqta Bta[i] <- sum(ftab$freqta>0) Bta_mat[,i] <- ftab$freqta # entropy serie modello 2 (b) e mean (b+x)/2 ftab$infob <- apply(as.array(ftab$freqb),1,FUN=function(x){x*log(x, card)}) ftab$infob[is.na(ftab$infob)] <- 0 h_b[i] <- (-1)*sum(ftab$infob, na.rm=T) freqb[[i]] <- ftab$freqb Bb[i] <- sum(ftab$freqb>0) Bb_mat[,i] <- ftab$freqb ftab$infotb <- apply(as.array(ftab$freqtb),1,FUN=function(x){x*log(x, card)}) ftab$infotb[is.na(ftab$infotb)] <- 0 h_tb[i] <- (-1)*sum(ftab$infotb, na.rm=T) freqtb[[i]] <- ftab$freqtb Btb[i] <- sum(ftab$freqtb>0) Btb_mat[,i] <- ftab$freqtb } ### computation of the L-subtracted divergence SLdiv_a_NOCOR <- 2*mean(h_ta) - mean(h_a) #model A SLdiv_b_NOCOR <- 2*mean(h_tb) - mean(h_b) #model B ### computation of errors and correction terms Ba_ok <- sum(.rowMeans(Ba_mat,card,M)>0) Bb_ok <- sum(.rowMeans(Bb_mat,card,M)>0) Bta_ok <- sum(.rowMeans(Bta_mat,card,M)>0) Btb_ok <- sum(.rowMeans(Btb_mat,card,M)>0) COR_a<- 2*(Bta_ok-1)/(2*2*N) - (Ba_ok-1)/(2*N) #correction term model A COR_b<- 2*(Btb_ok-1)/(2*2*N) - (Bb_ok-1)/(2*N) #correction term model B SLdiv_a <- SLdiv_a_NOCOR + COR_a SLdiv_b <- SLdiv_b_NOCOR + COR_b