entropy <- function(x){
  n <- length(x)
  unique_states <- unique(x)
  # initialize the prob of each state to 0
  # two elements for p(x=0) and p(x=1)
  p <- rep(0, length(unique_states))
  # update the elements of p
  # curr_state = 0 or 1
  i <- 1 # R starts counting arrays at 1!
  for (curr_state in unique_states){
    p[i] <- sum(x == curr_state)/n # prob of a unique state
    i <- i + 1
  }
  H <- sum(-p*log2(p))
  return(H)
}

mutualInformation <- function(X, Y){
  # X and Y are vectors of states
  n <- length(X)
  X.unique <- unique(X) # unique states in X
  Y.unique <- unique(Y) # unique states in Y
  # create joint states between the X and Y vectors
  # This needs to be for all pairs in the X and Y vecs
  # (this is where I messed up):
  # xy.joint <- interaction(X.states, Y.states, sep=":")
  xy.joint <- interaction(X, Y, sep=":")
  mu <- 0 # initialize mutual information to 0
  for (x in X.unique){
    for (y in Y.unique){
      # sum over all pairs of unique x and y states
      # (for example, 4 pairs for two binary variables)
      px <- sum(x == X)/n # marginal prob of state x
      py <- sum(y == Y)/n # marginal prob of state y
      xy <- paste(x, y, sep=":") # the observed pair value
      pxy <- sum(xy == xy.joint)/n # joint prob of the pair
      if (pxy > 0){
        mu <- mu + pxy*log2(pxy/(px*py))
      }
    }
  }
  return(mu)
}

normalizedMI <- function(X, Y){
  mu <- mutualInformation(X, Y)
  Hx <- entropy(X)
  Hy <- entropy(Y)
  nmi <- 2*mu/(Hx + Hy)
  return(nmi)
}

# Example X and Y vectors
n <- 10   # number of observations
p0 <- .5  # prob of the first state

# Roughly 50:50 1's and 0's
x1 <- sample(c(0,1), size=n, replace=TRUE, prob=c(p0, 1-p0))
# c(0,1)          -- possible state values (could also be c("a","b"))
# size=n          -- number of random samples of the state levels,
#                    i.e. the number of observations / length of output
# replace=TRUE    -- sample with replacement,
#                    so we don't run out of states while sampling
# prob=c(p0,1-p0) -- probability of sampling each state

y1 <- rep(1, n) # all 1's

interaction(x1, y1, sep=":")
unique(y1)

entropy(x1)
mutualInformation(x1, y1)
mutualInformation(x1, x1)
normalizedMI(x1, y1) # 0 (or very close to 0), since y1 has no entropy
normalizedMI(x1, x1) # should be 1 because it is normalized
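
# Optional sanity checks (a quick sketch; the seed and sample size below
# are arbitrary choices for illustration, not part of the example above):
# - for a long fair-coin sequence, entropy should approach 1 bit
# - I(X;X) = H(X), so normalizedMI(x, x) should come out as 1
# - if both inputs are constant, Hx + Hy is 0 and normalizedMI returns
#   NaN (0/0), which is worth keeping in mind
set.seed(1)
x.check <- sample(c(0,1), size=10000, replace=TRUE)
entropy(x.check)                    # close to 1 bit
mutualInformation(x.check, x.check) # equals entropy(x.check)
normalizedMI(x.check, x.check)      # 1 (up to floating point)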