From 249af3aa58a2340c58c357c356ac72034783de08 Mon Sep 17 00:00:00 2001
From: minhsphuc12 <phucnm.ec@gmail.com>
Date: Sat, 27 Jul 2019 23:08:15 +0700
Subject: [PATCH 1/6] add code, helper function, tests for mutual info score

---
 R/clustering.r                  | 59 ++++++++++++++++++++++++++++++
 R/helper-functions.r            | 65 ++++++++++++++++++++++++++++++++-
 inst/tinytest/test-clustering.r | 23 ++++++++++++
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 R/clustering.r
 create mode 100644 inst/tinytest/test-clustering.r

diff --git a/R/clustering.r b/R/clustering.r
new file mode 100644
index 0000000..4380947
--- /dev/null
+++ b/R/clustering.r
@@ -0,0 +1,59 @@
+##' @title
+##' Clustering Metrics Parameters
+##'
+##' @description
+##' Documentation for shared parameters of functions that compute clustering
+##' metrics.
+##'
+##' @param actual \code{[numeric]} The ground truth numeric vector.
+##' @param predicted \code{[numeric]} The predicted numeric vector, where each
+##'     element in the vector is a prediction of the corresponding elements in
+##'     \code{actual}.
+##' @name clustering_params
+##' @include helper-functions.r
+NULL
+
+
+##' @title
+##' Adjusted Mutual Information Score / Mututal Information Score
+##'
+##'
+##' @description
+##'
+##' \code{mtr_mutual_info_score} measures the similarity, or mutual dependence 
+##' between two variable. The worst possible score is 0, higher values are 
+##' better.
+##' 
+##' 
+##' @inheritParams clustering_params
+##' @importFrom stats var
+##' @seealso \code{\link{mtr_r2}}
+##' @return A numeric scalar output
+##' @author Phuc Nguyen
+##' @examples
+##'
+##' act <- sample(1:10, 100, replace = T)
+##' pred <- sample(1:10, 100, replace = T)
+##' mtr_mutual_info_score(act, pred)
+##'
+##' act <- rep(c('a', 'b', 'c'), times = 4)
+##' pred <- rep(c('a', 'b', 'c'), each = 4)
+##' mtr_mutual_info_score(act, pred)
+##'
+##' @export
+mtr_mutual_info_score <- function(actual, predicted) {
+    chec_empty_vec(actual)
+    check_equal_length(actual, predicted)
+    entropy(actual) + entropy(predicted) - joint_entropy(vec_1 = actual, 
+                                                         vec_2 = predicted)
+}
+
+mtr_normalized_mutual_info_score <- function(actual, predicted) {
+    mtr_mutual_info_score(actual = actual, predicted = predicted) / 
+        mean(c(entropy(vec = actual), entropy(vec = predicted)))
+}
+
+mtr_adjusted_mutual_info_score <- function(actual, predicted) {
+    (mtr_mutual_info_score(actual, predicted) - expected_mutual_info(actual, predicted)) / 
+        (mean(c(entropy(actual), entropy(predicted))) - expected_mutual_info(actual, predicted))
+}
diff --git a/R/helper-functions.r b/R/helper-functions.r
index 12d9585..67125ed 100644
--- a/R/helper-functions.r
+++ b/R/helper-functions.r
@@ -1,5 +1,11 @@
 
-
+chec_empty_vec <- function(vec) {
+    if (length(vec) == 0) {
+        stop("vector must have positive length.", call. = FALSE)
+    }
+    
+    invisible()
+}
 
 check_equal_length <- function(actual, predicted) {
 
@@ -60,3 +66,60 @@ trapezoid <- function(x, y) {
 
     sum(dx * height)
 }
+
+class_prob <- function(vec, class) {
+    chec_empty_vec(vec)
+    length(which(vec == class)) / length(vec)
+}
+
+entropy <- function(vec) {
+    chec_empty_vec(vec)
+    li = c()
+    for (cl in unique(vec)) {
+        m = class_prob(vec = vec, class = cl)
+        li = c(li, -1 * m * log(m))
+    }
+    etp = sum(li, na.rm = TRUE)
+    etp
+}
+
+joint_class_prob <- function(vec_1, vec_2, class_1, class_2) {
+    chec_empty_vec(vec_1)
+    check_equal_length(vec_1, vec_2)
+    length(which(vec_1 == class_1 & vec_2 == class_2)) / length(vec_1)
+}
+
+joint_entropy <- function(vec_1, vec_2) {
+    check_equal_length(vec_1, vec_2)
+    li = c()
+    for(cl_1 in unique(vec_1)) {
+        for(cl_2 in unique(vec_2)) {
+            m = joint_class_prob(vec_1 = vec_1, vec_2 = vec_2, 
+                                 class_1 = cl_1, class_2 = cl_2)
+            li = c(li, - 1 * m * log(m))
+        }
+    }
+    joint_etp = sum(li, na.rm = TRUE)
+    joint_etp
+}
+
+expected_mutual_info <- function(vec_1, vec_2) {
+    check_equal_length(vec_1, vec_2)
+    N = length(vec_1)
+    li = c()
+    for (i in unique(vec_1)) {
+        a = length(which(vec_1 == i))
+        for (j in unique(vec_2)) {
+            b = length(which(vec_2 == j))
+            for (nij in max(a + b - N, 0, na.rm = TRUE): min(a, b, na.rm = TRUE)) {
+                li = c(li, (nij / N) * 
+                           log((N * nij) / (a * b)) * 
+                           (factorial(a) * factorial(b) * factorial(N - a) * factorial(N - b)) /
+                           (factorial(N) * factorial(nij) * factorial(a - nij) * factorial(b - nij) * factorial(N - a - b + nij)))
+            }
+        }
+    }
+    emi = sum(li, na.rm = TRUE)
+    emi
+}
+
diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r
new file mode 100644
index 0000000..998e093
--- /dev/null
+++ b/inst/tinytest/test-clustering.r
@@ -0,0 +1,23 @@
+
+## test correctness ------------------------------------------------------------
+
+vec_a = c(0, 1, 2, 0, 3, 4, 5, 1)
+vec_b = c(1, 1, 0, 0, 2, 2, 2, 2)
+
+tinytest::expect_equal(
+    mtr_mutual_info_score(vec_a, vec_b),
+    target = 0.693147180559945,
+    tol = 1e-7
+)
+
+tinytest::expect_equal(
+    mtr_normalized_mutual_info_score(vec_a, vec_b),
+    target = 0.5163977794943221,
+    tol = 1e-7
+)
+
+tinytest::expect_equal(
+    mtr_adjusted_mutual_info_score(vec_a, vec_b),
+    target = -0.10526315789473674,
+    tol = 1e-7
+)

From ff217fa8e069435622033becb7927eef9f464a8a Mon Sep 17 00:00:00 2001
From: minhsphuc12 <phucnm.ec@gmail.com>
Date: Sun, 28 Jul 2019 15:09:36 +0700
Subject: [PATCH 2/6] add code, test, export for adjusted rand score

---
 NAMESPACE                       |  2 ++
 R/clustering.r                  | 39 ++++++++++++++++++++++++++++++++-
 R/helper-functions.r            |  2 +-
 TODO.org                        |  2 +-
 inst/tinytest/test-clustering.r |  6 +++++
 5 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 8442b94..844af39 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -67,6 +67,8 @@ export(mtr_tpr)
 export(mtr_true_negative_rate)
 export(mtr_true_positive_rate)
 export(mtr_youden_index)
+export(mtr_mutual_info_score)
+export(mtr_adjusted_rand_score)
 importFrom(Rcpp,evalCpp)
 importFrom(stats,complete.cases)
 importFrom(stats,median)
diff --git a/R/clustering.r b/R/clustering.r
index 4380947..c9f8f3e 100644
--- a/R/clustering.r
+++ b/R/clustering.r
@@ -27,7 +27,7 @@ NULL
 ##' 
 ##' @inheritParams clustering_params
 ##' @importFrom stats var
-##' @seealso \code{\link{mtr_r2}}
+##' @seealso \code{\link{mtr_adjusted_rand_score}}
 ##' @return A numeric scalar output
 ##' @author Phuc Nguyen
 ##' @examples
@@ -57,3 +57,40 @@ mtr_adjusted_mutual_info_score <- function(actual, predicted) {
     (mtr_mutual_info_score(actual, predicted) - expected_mutual_info(actual, predicted)) / 
         (mean(c(entropy(actual), entropy(predicted))) - expected_mutual_info(actual, predicted))
 }
+
+##' @title
+##' Adjusted Rand Score
+##'
+##'
+##' @description
+##'
+##' \code{mtr_adjusted_rand_score} measures the similarity, or mutual dependence 
+##' between two variable. Perfect score is 1. Score between total random vectors
+##' is close to 0. Score can be negative.
+##' 
+##' 
+##' @inheritParams clustering_params
+##' @importFrom base choose
+##' @seealso \code{\link{mtr_mutual_info_score}}
+##' @return A numeric scalar output
+##' @author Phuc Nguyen
+##' @examples
+##'
+##' act <- sample(1:10, 100, replace = T)
+##' pred <- sample(1:10, 100, replace = T)
+##' mtr_adjusted_rand_score(act, pred)
+##'
+##' act <- rep(c('a', 'b', 'c'), times = 4)
+##' pred <- rep(c('a', 'b', 'c'), each = 4)
+##' mtr_adjusted_rand_score(act, pred)
+##'
+##' @export
+
+mtr_adjusted_rand_score <- function(actual, predicted) {
+    check_equal_length(actual, predicted)
+    N = length(actual)
+    a = sum(choose(table(actual, predicted), 2))
+    b = sum(choose(table(actual), 2)) * sum(choose(table(predicted), 2)) / choose(N, 2)
+    c = 1/2 * (sum(choose(table(actual), 2)) + sum(choose(table(predicted), 2)))
+    (a - b) / (c - b)
+}
diff --git a/R/helper-functions.r b/R/helper-functions.r
index 67125ed..00fc92a 100644
--- a/R/helper-functions.r
+++ b/R/helper-functions.r
@@ -69,7 +69,7 @@ trapezoid <- function(x, y) {
 
 class_prob <- function(vec, class) {
     chec_empty_vec(vec)
-    length(which(vec == class)) / length(vec)
+    length(which(vec == class)) / length(vec = vec)
 }
 
 entropy <- function(vec) {
diff --git a/TODO.org b/TODO.org
index c29993d..1610a53 100644
--- a/TODO.org
+++ b/TODO.org
@@ -109,7 +109,7 @@ Proper scoring rule:
 
 - [ ] Adjusted Mututal Information Score / Mutual Information Score
 
-- [ ] Adjusted Rand Score
+- [X] Adjusted Rand Score
 
 - [ ] Calinski-Harabasz Score
 
diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r
index 998e093..607a40f 100644
--- a/inst/tinytest/test-clustering.r
+++ b/inst/tinytest/test-clustering.r
@@ -21,3 +21,9 @@ tinytest::expect_equal(
     target = -0.10526315789473674,
     tol = 1e-7
 )
+
+tinytest::expect_equal(
+    mtr_adjusted_rand_score(vec_a, vec_b),
+    target = -0.12903225806451613,
+    tol = 1e-7
+)

From 3bdebb7efb41e113fabe0c749b0681887c41c938 Mon Sep 17 00:00:00 2001
From: minhsphuc12 <phucnm.ec@gmail.com>
Date: Sun, 28 Jul 2019 21:45:46 +0700
Subject: [PATCH 3/6] add code, test, export for homogeneity, completeness and
 v measure

---
 NAMESPACE                       |  3 +++
 R/clustering.r                  | 48 +++++++++++++++++++++++++++++++++
 R/helper-functions.r            | 16 ++++++++++-
 inst/tinytest/test-clustering.r | 21 +++++++++++++++
 4 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/NAMESPACE b/NAMESPACE
index 844af39..6fcdf35 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -69,6 +69,9 @@ export(mtr_true_positive_rate)
 export(mtr_youden_index)
 export(mtr_mutual_info_score)
 export(mtr_adjusted_rand_score)
+export(mtr_homogeneity)
+export(mtr_completeness)
+export(mtr_v_measure)
 importFrom(Rcpp,evalCpp)
 importFrom(stats,complete.cases)
 importFrom(stats,median)
diff --git a/R/clustering.r b/R/clustering.r
index c9f8f3e..f1199e5 100644
--- a/R/clustering.r
+++ b/R/clustering.r
@@ -94,3 +94,51 @@ mtr_adjusted_rand_score <- function(actual, predicted) {
     c = 1/2 * (sum(choose(table(actual), 2)) + sum(choose(table(predicted), 2)))
     (a - b) / (c - b)
 }
+
+##' @title
+##' Homogeneity, Completeness, V-measure
+##'
+##'
+##' @description
+##'
+##' \code{mtr_homogeneity} and \code{mtr_completeness} measures an aspect of the 
+##' quality of clustering algorithm, as the former measures how similar the 
+##' elements within each cluster to each other, and the later measures 
+##' the degree a cluster has cover elements of same labels. 
+##' Both scores are in range of 0 to 1, as worst to best.
+##' \code{mtr_v_measure} is harmonic mean of homogeneity score and completeness
+##' score.
+##' 
+##' @inheritParams clustering_params
+##' @importFrom base choose
+##' @return A numeric scalar output
+##' @author Phuc Nguyen
+##' @examples
+##'
+##' act <- sample(1:10, 100, replace = T)
+##' pred <- sample(1:10, 100, replace = T)
+##' mtr_homogeneity(act, pred)
+##'
+##' act <- sample(1:10, 100, replace = T)
+##' pred <- sample(1:10, 100, replace = T)
+##' mtr_completeness(act, pred)
+##'
+##' act <- sample(1:10, 100, replace = T)
+##' pred <- sample(1:10, 100, replace = T)
+##' mtr_v_measure(act, pred)
+##'
+##' @export
+
+mtr_homogeneity <- function(actual, predicted) {
+    1 - conditional_entropy(actual, predicted) / entropy(actual)
+}
+
+mtr_completeness <- function(actual, predicted) {
+    1 - conditional_entropy(predicted, actual) / entropy(predicted)
+}
+
+mtr_v_measure <- function(actual, predicted) {
+    h = mtr_homogeneity(actual, predicted)
+    c = mtr_completeness(actual, predicted)
+    2 * h * c / (h + c)
+}
diff --git a/R/helper-functions.r b/R/helper-functions.r
index 00fc92a..d366341 100644
--- a/R/helper-functions.r
+++ b/R/helper-functions.r
@@ -69,7 +69,7 @@ trapezoid <- function(x, y) {
 
 class_prob <- function(vec, class) {
     chec_empty_vec(vec)
-    length(which(vec == class)) / length(vec = vec)
+    length(which(vec == class)) / length(vec)
 }
 
 entropy <- function(vec) {
@@ -123,3 +123,17 @@ expected_mutual_info <- function(vec_1, vec_2) {
     emi
 }
 
+conditional_entropy <- function(vec_1, vec_2) {
+    check_equal_length(vec_1, vec_2)
+    N = length(vec_1)
+    li = c()
+    for (i in unique(vec_1)) {
+        for (j in unique(vec_2)) {
+            b = length(which(vec_2 == j))
+            nij = length(which(vec_1 == i & vec_2 == j)) 
+            li = c(li, - nij / N * log(nij / b))
+        }
+    }
+    cond_entropy = sum(li, na.rm = TRUE)
+    cond_entropy
+}
diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r
index 607a40f..0ce4d78 100644
--- a/inst/tinytest/test-clustering.r
+++ b/inst/tinytest/test-clustering.r
@@ -27,3 +27,24 @@ tinytest::expect_equal(
     target = -0.12903225806451613,
     tol = 1e-7
 )
+
+tinytest::expect_equal(
+    mtr_homogeneity(vec_a, vec_b),
+    target = 0.3999999999999998,
+    tol = 1e-7
+)
+
+tinytest::expect_equal(
+    mtr_completeness(vec_a, vec_b),
+    target = 0.6666666666666665,
+    tol = 1e-7
+)
+
+tinytest::expect_equal(
+    mtr_v_measure(vec_a, vec_b),
+    target = 0.4999999999999998,
+    tol = 1e-7
+)
+
+
+

From 6efe5f33722209955b6418498e7fa04c83311d36 Mon Sep 17 00:00:00 2001
From: minhsphuc12 <phucnm.ec@gmail.com>
Date: Sun, 28 Jul 2019 21:50:20 +0700
Subject: [PATCH 4/6] update todo list for homogeneity, completeness, v_measure

---
 TODO.org | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/TODO.org b/TODO.org
index 1610a53..69a01f4 100644
--- a/TODO.org
+++ b/TODO.org
@@ -115,10 +115,10 @@ Proper scoring rule:
 
 - [ ] Davies-Bouldin Score
 
-- [ ] Completeness Metric
+- [X] Completeness Metric
 
-- [ ] V-Measure Score
+- [X] V-Measure Score
 
-- [ ] Homogeneity Score
+- [X] Homogeneity Score
 
 - [ ] Mean Silhouette Coefficient / Silhouette Coefficient

From ae03069055865ba61a26e262bbfbe02053efde61 Mon Sep 17 00:00:00 2001
From: minhsphuc12 <phucnm.ec@gmail.com>
Date: Mon, 29 Jul 2019 01:29:23 +0700
Subject: [PATCH 5/6] add code, test, export for Calinski-Harabasz Score

---
 NAMESPACE                       |  1 +
 R/clustering.r                  | 49 ++++++++++++++++++++++++++++++++-
 R/helper-functions.r            |  9 ++++++
 TODO.org                        |  2 +-
 inst/tinytest/test-clustering.r | 17 ++++++++++++
 5 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 6fcdf35..8b6f174 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -72,6 +72,7 @@ export(mtr_adjusted_rand_score)
 export(mtr_homogeneity)
 export(mtr_completeness)
 export(mtr_v_measure)
+export(mtr_calinski_harabasz)
 importFrom(Rcpp,evalCpp)
 importFrom(stats,complete.cases)
 importFrom(stats,median)
diff --git a/R/clustering.r b/R/clustering.r
index f1199e5..7ed030d 100644
--- a/R/clustering.r
+++ b/R/clustering.r
@@ -110,7 +110,6 @@ mtr_adjusted_rand_score <- function(actual, predicted) {
 ##' score.
 ##' 
 ##' @inheritParams clustering_params
-##' @importFrom base choose
 ##' @return A numeric scalar output
 ##' @author Phuc Nguyen
 ##' @examples
@@ -142,3 +141,51 @@ mtr_v_measure <- function(actual, predicted) {
     c = mtr_completeness(actual, predicted)
     2 * h * c / (h + c)
 }
+
+##' @title
+##' Calinski-Harabasz Score (Variance Ratio Criterion)
+##'
+##'
+##' @description
+##'
+##' \code{mtr_calinski_harabasz} measure the 'goodness' of clustering model 
+##' output, in case ground truth is unknown. Higher score mean cluster are dense
+##' and well separated.
+##' 
+##' @inheritParams clustering_params
+##' @return A numeric scalar output
+##' @author Phuc Nguyen
+##' @examples
+##' dt <- iris[,-5]
+##' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+##' 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+##' 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
+##' 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
+##' 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)
+##' mtr_calinski_harabasz(dt, pred)
+##'
+##' @export
+
+mtr_calinski_harabasz <- function(matrix_feature, predicted) {
+    dt_center = apply(matrix_feature, 2, FUN = mean)
+    N = length(predicted)
+    num_cluster = length(unique(predicted))
+    
+    check_number_of_labels(n_labels = num_cluster, n_samples = N)
+    
+    dispersion_between_cluster = 0
+    dispersion_within_cluster = 0
+    
+    for (cluster_val in unique(predicted)) {
+        size_cluster = length(which(predicted == cluster_val))
+        dt_cluster = matrix_feature[which(predicted == cluster_val),]
+        cluster_center = apply(dt_cluster, 2, FUN = mean)
+        
+        dispersion_between_cluster = dispersion_between_cluster + size_cluster * sum((t(cluster_center) - dt_center) ^ 2)
+        dispersion_within_cluster = dispersion_within_cluster + sum((t(dt_cluster) - cluster_center) ^ 2)
+    }
+    
+    (dispersion_between_cluster / dispersion_within_cluster) * ((N - num_cluster) / (num_cluster - 1))
+}
diff --git a/R/helper-functions.r b/R/helper-functions.r
index d366341..6b1a94e 100644
--- a/R/helper-functions.r
+++ b/R/helper-functions.r
@@ -34,6 +34,15 @@ check_binary <- function(actual) {
     invisible()
 }
 
+check_number_of_labels <- function(n_labels, n_samples) {
+    
+    if (! (1 < n_labels & n_labels < n_samples)) {
+        stop("Number of labels is invalid. Valid value are 2 to n_samples - 1", call. = FALSE)
+    }
+    
+    invisible()
+}
+
 clip <- function(x, mi, ma) {
     clip_(x, mi, ma)
 }
diff --git a/TODO.org b/TODO.org
index 69a01f4..871286c 100644
--- a/TODO.org
+++ b/TODO.org
@@ -111,7 +111,7 @@ Proper scoring rule:
 
 - [X] Adjusted Rand Score
 
-- [ ] Calinski-Harabasz Score
+- [X] Calinski-Harabasz Score
 
 - [ ] Davies-Bouldin Score
 
diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r
index 0ce4d78..49f5e5a 100644
--- a/inst/tinytest/test-clustering.r
+++ b/inst/tinytest/test-clustering.r
@@ -3,6 +3,7 @@
 
 vec_a = c(0, 1, 2, 0, 3, 4, 5, 1)
 vec_b = c(1, 1, 0, 0, 2, 2, 2, 2)
+data(iris)
 
 tinytest::expect_equal(
     mtr_mutual_info_score(vec_a, vec_b),
@@ -46,5 +47,21 @@ tinytest::expect_equal(
     tol = 1e-7
 )
 
+tinytest::expect_equal(
+    mtr_calinski_harabasz(iris[,-5], 
+        predicted = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                      1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
+                      2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
+                      2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)),
+    target = 560.3999242466402,
+    tol = 1e-2 
+    # numerical calculation outcome between python and R return large small 
+    # difference, hence relatively large tolerance level
+)
+
+
 
 

From 400281f19fce529a16bca4bfe4d9f44bc8465fe1 Mon Sep 17 00:00:00 2001
From: minhsphuc12 <phucnm.ec@gmail.com>
Date: Sat, 3 Aug 2019 03:10:10 +0700
Subject: [PATCH 6/6] add code, test, export for davies-boulding score

---
 NAMESPACE                       |  1 +
 R/clustering.r                  | 51 +++++++++++++++++++++++++++++++++
 R/helper-functions.r            | 29 +++++++++++++++++++
 TODO.org                        |  2 +-
 inst/tinytest/test-clustering.r | 12 ++++++++
 5 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/NAMESPACE b/NAMESPACE
index 8b6f174..abd3e4e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -73,6 +73,7 @@ export(mtr_homogeneity)
 export(mtr_completeness)
 export(mtr_v_measure)
 export(mtr_calinski_harabasz)
+export(mtr_davies_boulding_score)
 importFrom(Rcpp,evalCpp)
 importFrom(stats,complete.cases)
 importFrom(stats,median)
diff --git a/R/clustering.r b/R/clustering.r
index 7ed030d..891f871 100644
--- a/R/clustering.r
+++ b/R/clustering.r
@@ -153,6 +153,7 @@ mtr_v_measure <- function(actual, predicted) {
 ##' and well separated.
 ##' 
 ##' @inheritParams clustering_params
+##' @seealso \code{\link{mtr_davies_boulding_score}}
 ##' @return A numeric scalar output
 ##' @author Phuc Nguyen
 ##' @examples
@@ -189,3 +190,53 @@ mtr_calinski_harabasz <- function(matrix_feature, predicted) {
     
     (dispersion_between_cluster / dispersion_within_cluster) * ((N - num_cluster) / (num_cluster - 1))
 }
+
+##' @title
+##' Davies-Bouldin Score
+##'
+##'
+##' @description
+##'
+##' \code{mtr_davies_boulding_score} measure the 'goodness' of clustering model 
+##' output, in case ground truth is unknown. Lowest and best score is 0,
+##' and lower score mean clusters are dense and separated
+##' 
+##' @inheritParams clustering_params
+##' @seealso \code{\link{mtr_calinski_harabasz}}
+##' @return A numeric scalar output
+##' @author Phuc Nguyen
+##' @examples
+##' dt <- iris[,-5]
+##' pred <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+##' 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+##' 1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+##' 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
+##' 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
+##' 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)
+##' mtr_calinski_harabasz(dt, pred)
+##'
+##' @export
+
+mtr_davies_boulding_score <- function(matrix_feature, predicted) {
+    check_equal_cluster_length(matrix_feature, predicted)
+    similarity_sum = 0
+    for (cluster_val in unique(predicted)) {
+        dt_cluster = matrix_feature[which(predicted == cluster_val),]
+        cluster_center = apply(dt_cluster, 2, FUN = mean)
+        cluster_diameter = mean_distance(dt_cluster, cluster_center)
+        max_similarity = 0
+        
+        for(other_cluster_val in unique(predicted)[unique(predicted) != cluster_val]) {
+            dt_other_cluster = matrix_feature[which(predicted == other_cluster_val),]
+            other_cluster_center = apply(dt_other_cluster, 2, FUN = mean)
+            other_cluster_diameter = mean_distance(dt_other_cluster, other_cluster_center)
+            between_distance = pairwise_distance(cluster_center, other_cluster_center)
+            similarity = (cluster_diameter + other_cluster_diameter) / between_distance
+            max_similarity = max(max_similarity, similarity)
+        }
+        similarity_sum = similarity_sum + max_similarity
+    }
+    
+    similarity_sum/length(unique(predicted))
+}
diff --git a/R/helper-functions.r b/R/helper-functions.r
index 6b1a94e..b561ec1 100644
--- a/R/helper-functions.r
+++ b/R/helper-functions.r
@@ -16,6 +16,16 @@ check_equal_length <- function(actual, predicted) {
     invisible()
 }
 
+check_equal_cluster_length <- function(dt, predicted) {
+    
+    if (nrow(dt) != length(predicted)) {
+        stop("`data` and `cluster` must have equal length.", call. = FALSE)
+    }
+    
+    invisible()
+}
+
+
 check_cutoff_range <- function(cutoff) {
 
     if (cutoff > 1 || cutoff < 0) {
@@ -146,3 +156,22 @@ conditional_entropy <- function(vec_1, vec_2) {
     cond_entropy = sum(li, na.rm = TRUE)
     cond_entropy
 }
+
+pairwise_distance <- function(vec_1, vec_2) {
+    # calculate euclidean distance between two points
+    check_equal_length(vec_1, vec_2)
+    sqrt(sum((vec_1 - vec_2) ^ 2))
+}
+
+mean_distance <- function(set, pt) {
+    # calculate euclidean distance between a set of point, represented by 
+    # data frame, to a single point
+    total_dist = 0
+    check_equal_length(set, pt)
+    for (i in 1:nrow(set)) {
+        ve = set[i,]
+        total_dist = total_dist + pairwise_distance(ve, pt)
+    }
+    total_dist / nrow(set)
+}
+
diff --git a/TODO.org b/TODO.org
index 871286c..6d659a1 100644
--- a/TODO.org
+++ b/TODO.org
@@ -113,7 +113,7 @@ Proper scoring rule:
 
 - [X] Calinski-Harabasz Score
 
-- [ ] Davies-Bouldin Score
+- [X] Davies-Bouldin Score
 
 - [X] Completeness Metric
 
diff --git a/inst/tinytest/test-clustering.r b/inst/tinytest/test-clustering.r
index 49f5e5a..64fc1b7 100644
--- a/inst/tinytest/test-clustering.r
+++ b/inst/tinytest/test-clustering.r
@@ -62,6 +62,18 @@ tinytest::expect_equal(
     # difference, hence relatively large tolerance level
 )
 
+tinytest::expect_equal(
+    mtr_davies_boulding_score(iris[,-5], 
+                          predicted = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                        1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
+                                        2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
+                                        2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0)),
+    target = 0.6619715,
+    tol = 1e-5
+)