library(plyr)

Prove that the joint entropy of two variables X and Y is equal to the conditional entropy of X given Y plus the entropy of Y (the chain rule for entropy):

Prove: H(X,Y) = H(X|Y) + H(Y)

The identity follows from p(x,y) = p(x|y) * p(y): taking -log2 of both sides and averaging over the joint distribution splits H(X,Y) into H(X|Y) + H(Y). Here we verify it numerically on simulated data.

Define the Shannon entropy function:

shannon_entropy <- function(x){
  df <- plyr::count(x)            # frequency of each distinct value
  n <- length(x)
  df$probs <- df$freq / n         # empirical probability of each value
  se <- -sum(df$probs * log(df$probs, base = 2))   # H(X) = -sum p * log2(p)
  return(se)
}
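
A quick sanity check of the function (toy data assumed here): a balanced binary variable should come out at exactly 1 bit.

# a fair coin: two outcomes with probability 0.5 each, so H = 1 bit
shannon_entropy(rep(c("heads", "tails"), 50))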

Create two independent, normally distributed datasets (rounded to whole numbers):

data1<-round(rnorm(100, 15, 1), 0)
data2<-round(rnorm(100, 50, 1), 0)

df <- data.frame(data1,data2)
colnames(df)<-c("X","Y")
head(df)
##    X  Y
## 1 16 50
## 2 14 51
## 3 13 50
## 4 16 50
## 5 16 50
## 6 16 51

Calculate the entropy of the Y variable:

entpy_Y<-shannon_entropy(df$Y)

entpy_Y
## [1] 2.174335

Calculate the joint probability table of our data:

prob_df <- prop.table(table(df))
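
A couple of quick checks on the joint table (a sketch using base R only): its cells should sum to 1, and its column sums are the marginal probabilities of Y, so H(Y) can also be recovered directly from it.

sum(prob_df)            # the joint probabilities sum to 1
p_y <- colSums(prob_df) # marginal probabilities P(Y = y)
-sum(p_y * log2(p_y))   # should match entpy_Y up to floating-point rounding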

1-) Calculate the conditional probabilities P(X = x | Y = y) by dividing each cell of the joint table by its column sum, the marginal probability of Y:

cond <- data.frame()
for (i in colnames(prob_df)){
  for (j in row.names(prob_df)){
    # P(X = j | Y = i) = P(X = j, Y = i) / P(Y = i)
    cond[j, i] <- prob_df[j, i] / sum(prob_df[, i])
  }
}

cond
##    47  48   49         50         51        52 53
## 13  0 0.0 0.00 0.08571429 0.03846154 0.0000000  0
## 14  1 0.2 0.36 0.31428571 0.23076923 0.3333333  0
## 15  0 0.4 0.36 0.31428571 0.50000000 0.5000000  1
## 16  0 0.2 0.16 0.25714286 0.19230769 0.0000000  0
## 17  0 0.2 0.08 0.02857143 0.03846154 0.1666667  0
## 18  0 0.0 0.04 0.00000000 0.00000000 0.0000000  0
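
The same normalization can be done without loops; a minimal sketch using base R's sweep(), which should reproduce cond:

cond_vec <- sweep(prob_df, 2, colSums(prob_df), "/")  # divide each column by its column sum P(Y = y)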

2-) Calculate the conditional entropy, H(X|Y) = -sum over (x,y) of p(x,y) * log2 p(x|y):

sx <- data.frame()
for (i in colnames(cond)){
  for (j in row.names(cond)){
    # term p(x, y) * log2 p(x|y); zero-probability cells come out as NaN
    sx[j, i] <- prob_df[j, i] * log(cond[j, i], base = 2)
  }
}

sx
##     47          48          49          50         51          52  53
## 13 NaN         NaN         NaN -0.10632962 -0.0470044         NaN NaN
## 14   0 -0.02321928 -0.13265381 -0.18368365 -0.1269286 -0.03169925 NaN
## 15 NaN -0.02643856 -0.13265381 -0.18368365 -0.1300000 -0.03000000   0
## 16 NaN -0.02321928 -0.10575425 -0.17634222 -0.1189256         NaN NaN
## 17 NaN -0.02321928 -0.07287712 -0.05129283 -0.0470044 -0.02584963 NaN
## 18 NaN         NaN -0.04643856         NaN        NaN         NaN NaN
conditional_entropy <- -sum(sx, na.rm = TRUE)
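
The same quantity can be computed in one vectorized line (a sketch; zero-probability cells become NaN through log2(0) and are dropped by na.rm = TRUE, just as in the loop above):

-sum(as.matrix(prob_df) * log2(as.matrix(cond)), na.rm = TRUE)  # should match conditional_entropy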

3-) Calculate the joint entropy, H(X,Y) = -sum over (x,y) of p(x,y) * log2 p(x,y):

sx_prob <- data.frame()
for (i in colnames(prob_df)){
  for (j in row.names(prob_df)){
    # term p(x, y) * log2 p(x, y); zero-probability cells come out as NaN
    sx_prob[j, i] <- prob_df[j, i] * log(prob_df[j, i], base = 2)
  }
}

sx_prob
##             47          48          49          50          51          52
## 13         NaN         NaN         NaN -0.15176681 -0.06643856         NaN
## 14 -0.06643856 -0.06643856 -0.31265381 -0.35028670 -0.24353362 -0.11287712
## 15         NaN -0.11287712 -0.31265381 -0.35028670 -0.38264414 -0.15176681
## 16         NaN -0.06643856 -0.18575425 -0.31265381 -0.21609640         NaN
## 17         NaN -0.06643856 -0.11287712 -0.06643856 -0.06643856 -0.06643856
## 18         NaN         NaN -0.06643856         NaN         NaN         NaN
##            53
## 13        NaN
## 14        NaN
## 15 -0.1128771
## 16        NaN
## 17        NaN
## 18        NaN
joint_entropy <- -sum(sx_prob, na.rm = TRUE)
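
Two quick cross-checks (sketches on the same data): the vectorized form of the double sum, and reusing shannon_entropy() with each (X, Y) combination treated as a single symbol.

-sum(prob_df * log2(prob_df), na.rm = TRUE)  # vectorized H(X,Y)
shannon_entropy(paste(df$X, df$Y))           # entropy of the (X, Y) pairs; should match joint_entropy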

Check whether the joint entropy is equal to the conditional entropy plus the entropy of Y:
H(X,Y) = H(X|Y) + H(Y)

conditional_entropy + entpy_Y
## [1] 4.019552
joint_entropy
## [1] 4.019552
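
Since both sides are sums of floating-point terms, all.equal() is a safer equality test than ==:

all.equal(joint_entropy, conditional_entropy + entpy_Y)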

Yes, that's it :)